;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

const_float_abs_mask:   times 8 dd 0x7fffffff
const_align_abs_edge:   times 8 dd 0

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000
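
; Byte offsets of each dword lane within a SIMD vector: 0,4,8,...,28.
; They are OR-ed into max_idx after the distortion search so each lane
; carries the offset of its own element (see PULSES_SEARCH below).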
const_int32_offsets:
%rep 8
        dd $-const_int32_offsets
%endrep

SECTION .text
;
; Set up a high register to be used
; for holding memory constants
;
; %1 - mov op to use (movaps/movdqa)
; %2 - the register to be used, assumes it is >= m8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define  mm_%3   %2
    %{1}     %2, [%3]           ; movaps m8, [const_name]
%else
    %define  mm_%3   [%3]
%endif
%endmacro
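
;
; Usage sketch (this mirrors the actual use in PVQ_FAST_SEARCH below):
;     SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
;     ...
;     addps    m6, mm_const_float_0_5
; which expands to "addps m6, m8" when more than 8 SIMD registers are
; available (x86_64), and to "addps m6, [const_float_0_5]" otherwise.
;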
;
; Set up a register holding the base address of a constant,
; for position independent code (PIC)
;
; %1 - the lea op to use
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name + r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled, or
; "movaps m0, [constant_name + r4]" if text relocations are used.
%macro SET_PIC_BASE 3 ; lea_op, reg, const_label
%ifdef PIC
    %{1}     %2, [%3]           ; lea r5, [rip+const]
    %define  pic_base_%3 %2
%else
    %define  pic_base_%3 %3
%endif
%endmacro
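
;
; Usage sketch (this mirrors the actual use in PVQ_FAST_SEARCH below):
;     SET_PIC_BASE lea, r5, const_align_abs_edge
;     movups       m2, [pic_base_const_align_abs_edge + r4 - mmsize]
; With PIC the load goes through the rip-relative base kept in r5,
; otherwise it addresses the constant label directly.
;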
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps    m6, mm_const_float_0_5    ; Syy_norm += 1.0/2
    pxor     m1, m1                    ; max_idx
    xorps    m3, m3                    ; p_max
    xor      r4d, r4d
align 16
%%distortion_search:
    movd     xm2, dword r4d            ; movd zero extends
%ifidn %1,add
    movaps   m4, [tmpY + r4]           ; y[i]
    movaps   m5, [tmpX + r4]           ; X[i]

  %if USE_APPROXIMATION == 1
    xorps    m0, m0
    cmpps    m0, m0, m5, 4             ; m0 = (X[i] != 0.0)
  %endif

    addps    m4, m6                    ; m4 = Syy_new = y[i] + Syy_norm
    addps    m5, m7                    ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
    andps    m5, m0                    ; if (X[i] == 0) Sxy_new = 0; prevents the approximation error from setting pulses in the array padding.
  %endif

%else
    movaps   m5, [tmpY + r4]           ; m5 = y[i]

    xorps    m0, m0                    ; m0 = 0
    cmpps    m0, m0, m5, 1             ; m0 = (0 < y[i])

    subps    m4, m6, m5                ; m4 = Syy_new = Syy_norm - y[i]
    subps    m5, m7, [tmpX + r4]       ; m5 = Sxy_new = Sxy_norm - X[i]
    andps    m5, m0                    ; (0 < y[i]) ? m5 : 0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps  m4, m4
    mulps    m5, m4                    ; m5 = p = Sxy_new*approx(1/sqrt(Syy_new))
%else
    mulps    m5, m5
    divps    m5, m4                    ; m5 = p = Sxy_new*Sxy_new/Syy_new
%endif

    VPBROADCASTD m2, xm2               ; m2 = i (all lanes get the same value; the per-lane offset is added later)
    cmpps    m0, m3, m5, 1             ; m0 = (m3 < m5) = (p_max < p)
    maxps    m3, m5                    ; m3 = max(p_max, p)
                                       ; maxps here is faster than blendvps, despite blend having lower latency.
    pand     m2, m0                    ; This version seems faster than sse41 pblendvb.
    pmaxsw   m1, m2                    ; SSE2 signed word max, so it works for N < 32768/4.

    add      r4d, mmsize
    cmp      r4d, Nd
    jb       %%distortion_search
    por      m1, mm_const_int32_offsets ; add the per-lane offsets to max_idx (skipped in the inner loop)
    movdqa   m4, m1                    ; needed for the aligned y[max_idx] += 1.0 processing below

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)
    vextractf128 xm5, ym3, 1           ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps    xm0, xm3, xm5, 1          ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128 xm2, ym1, 1           ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS xm3, xm5, xm0             ; p       = m0 ? p[1x128]       : p[0x128]
    PBLENDVB xm1, xm2, xm0             ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
    movhlps  xm5, xm3                  ; m3 = p[3,2,1,0]; m5 = p[x,x,3,2]
    cmpps    xm0, xm3, xm5, 1          ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd   xm2, xm1, q3232
    BLENDVPS xm3, xm5, xm0             ; p       = m0 ? p[3,2]       : p[1,0]
    PBLENDVB xm1, xm2, xm0             ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]

; Merge parallel maximums final round (1 vs 1)
    shufps   xm0, xm3, xm3, q1111      ; m0 = m3[1] = p[1]
    cmpss    xm0, xm3, 5               ; m0 = !(m0 < m3) = ( p[1] >= p[0] )

    pshufd   xm2, xm1, q1111
    PBLENDVB xm1, xm2, xm0             ; max_idx = m0 ? max_idx[1] : max_idx[0]

    movd     dword r4d, xm1            ; zero extends to the rest of r4q

    VBROADCASTSS m3, [tmpX + r4]
    %{1}ps   m7, m3                    ; Sxy += X[max_idx]

    VBROADCASTSS m5, [tmpY + r4]
    %{1}ps   m6, m5                    ; Syy += y[max_idx]

; We have to update a single element of Y[].
; However, writing 4 bytes and then doing a 16 byte load in the inner loop
; could cause a stall due to breaking store-to-load forwarding.
    VPBROADCASTD m1, xm1
    pcmpeqd  m1, m1, m4                ; exactly one element matches max_idx and this finds it
    and      r4d, ~(mmsize-1)          ; align the address down, so the value pointed to by max_idx is inside a mmsize load
    movaps   m5, [tmpY + r4]           ; m5 = Y[y3...ym...y0]
    andps    m1, mm_const_float_1      ; m1 = [ 0...1.0...0]
    %{1}ps   m5, m1                    ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps   [tmpY + r4], m5           ; Y[max_idx] +-= 1.0
%endmacro
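
;
; Rough scalar C equivalent of one PULSES_SEARCH(add) iteration, for
; reference only (variable names are illustrative, not taken from the C code):
;
;   Syy_norm += 0.5f;
;   int   max_idx = 0;
;   float p_max   = 0.0f;
;   for (i = 0; i < N; i++) {
;       float Syy_new = Syy_norm + y[i];
;       float Sxy_new = Sxy_norm + X[i];
;       float p = USE_APPROXIMATION ? Sxy_new / sqrtf(Syy_new)
;                                   : Sxy_new * Sxy_new / Syy_new;
;       if (p_max < p) { p_max = p; max_idx = i; }
;   }
;   Sxy_norm   += X[max_idx];
;   Syy_norm   += y[max_idx];
;   y[max_idx] += 1.0f;
;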
;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal.
;
%ifdef PIC
  %define num_pic_regs 1
%else
  %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; float * inX   - Unaligned (SIMD) access; it will be over-read,
;                 but the extra data is masked away.
; int32 * outY  - Should be an aligned and padded buffer.
;                 It is also used as a temp buffer.
; uint32 K      - Number of pulses to have after quantization.
; uint32 N      - Number of vector elements. Must be 0 < N < 256.
;
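;
; The entry points below roughly correspond to a C prototype along the
; lines of (illustrative; see the Opus encoder code for the exact one):
;
;   float pvq_search(float *X, int *y, int K, int N);
;
; On return outY holds the quantized vector with sum(abs(y[i])) == K,
; and the function returns Syy = sum(y[i]*y[i]) as a float.
;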
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps   m0, [const_float_abs_mask]
    shl      Nd, 2                     ; N *= sizeof(float); the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    mov      r4d, Nd

    neg      r4d
    and      r4d, mmsize-1

    SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
    movups   m2, [pic_base_const_align_abs_edge + r4 - mmsize]

    add      Nd, r4d                   ; N = align(N, mmsize)

    lea      r4d, [Nd - mmsize]        ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups   m1, [inXq + r4]
    andps    m1, m2
    movaps   [tmpX + r4], m1           ; tmpX[last chunk] = abs(X[..]), masked; also the initial value of the Sx sum

align 16
%%loop_abs_sum:
    sub      r4d, mmsize
    jc       %%end_loop_abs_sum

    movups   m2, [inXq + r4]
    andps    m2, m0

    movaps   [tmpX + r4], m2           ; tmpX[i] = abs(X[i])
    addps    m1, m2                    ; Sx += abs(X[i])
    jmp      %%loop_abs_sum

align 16
%%end_loop_abs_sum:
    HSUMPS   m1, m2                    ; m1 = Sx

    xorps    m0, m0
    comiss   xm0, xm1
    jz       %%zero_input              ; if (Sx == 0) goto zero_input

    cvtsi2ss xm0, dword Kd             ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss    xm1, xm1                  ; m1 = approx(1/Sx)
    mulss    xm0, xm1                  ; m0 = K*(1/Sx)
%else
    divss    xm0, xm1                  ; b = K/Sx
%endif
    VBROADCASTSS m0, xm0
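
; Initial guess: y[i] = lrintf(b * X[i]) with b = K/Sx, so that the total
; pulse count sum(y[i]) lands close to K. The loop below also accumulates
; Sy (pulses used), Sxy and Syy for the correction passes that follow.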
    lea      r4d, [Nd - mmsize]
    pxor     m5, m5                    ; Sy  (sum of abs(y[i]))
    xorps    m6, m6                    ; Syy (sum of y[i]*y[i])
    xorps    m7, m7                    ; Sxy (sum of X[i]*y[i])

align 16
%%loop_guess:
    movaps   m1, [tmpX + r4]           ; m1 = X[i]
    mulps    m2, m0, m1                ; m2 = b*X[i]
    cvtps2dq m2, m2                    ; yt = (int)lrintf( b*X[i] )
    paddd    m5, m2                    ; Sy += yt
    cvtdq2ps m2, m2                    ; yt = (float)yt
    mulps    m1, m2                    ; m1 = X[i]*yt
    movaps   [tmpY + r4], m2           ; y[i] = m2
    addps    m7, m1                    ; Sxy += m1
    mulps    m2, m2                    ; m2 = yt*yt
    addps    m6, m2                    ; Syy += m2

    sub      r4d, mmsize
    jnc      %%loop_guess

    HSUMPS   m6, m1                    ; Syy_norm
    HADDD    m5, m4                    ; pulses

    movd     dword r4d, xm5            ; zero extends to the rest of r4q

    sub      Kd, r4d                   ; K -= pulses; the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    jz       %%finish                  ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in the distortion parameter calculations.
; It saves the pre- and post-calculation needed to correct the Y[] values.
; Same precision, since the float mantissa is normalized.
; The SQRT approximation does differ.
    HSUMPS   m7, m0                    ; Sxy_norm
    mulps    m6, mm_const_float_0_5

    jc       %%remove_pulses_loop      ; K - pulses < 0

align 16                               ; K - pulses > 0
%%add_pulses_loop:
    PULSES_SEARCH add                  ; m6 Syy_norm ; m7 Sxy_norm

    sub      Kd, 1
    jnz      %%add_pulses_loop

    addps    m6, m6                    ; Syy *= 2

    jmp      %%finish

align 16
%%remove_pulses_loop:
    PULSES_SEARCH sub                  ; m6 Syy_norm ; m7 Sxy_norm

    add      Kd, 1
    jnz      %%remove_pulses_loop

    addps    m6, m6                    ; Syy *= 2

align 16
%%finish:
    lea      r4d, [Nd - mmsize]
    movaps   m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps   m0, [tmpY + r4]           ; m0 = Y[i]
    movups   m1, [inXq + r4]           ; m1 = X[i]
    andps    m1, m2                    ; m1 = sign(X[i])
    orps     m0, m1                    ; m0 = Y[i]*sign(X[i])
    cvtps2dq m3, m0                    ; m3 = (int)m0
    movaps   [outYq + r4], m3

    sub      r4d, mmsize
    jnc      %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0    ; sbrdsp
    movss    r0m, xm6                  ; return (float)Syy_norm
    fld      dword r0m
%else
    movaps   m0, m6                    ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea      r4d, [Nd - mmsize]
    xorps    m0, m0
%%zero_loop:
    movaps   [outYq + r4], m0
    sub      r4d, mmsize
    jnc      %%zero_loop

    movaps   m6, [const_float_1]
    jmp      %%return
%endmacro
; If 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full precision code only about 2% slower there.
; Opus also uses the rsqrt approximation in its intrinsics code.
%define USE_APPROXIMATION 1
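
;
; The per-candidate distortion parameter maximized in PULSES_SEARCH is
;   exact  (USE_APPROXIMATION == 0): p = Sxy_new*Sxy_new / Syy_new
;   approx (USE_APPROXIMATION == 1): p = Sxy_new * rsqrtps(Syy_new)
; For non-negative Sxy_new both are increasing in Sxy/sqrt(Syy), so they
; normally pick the same maximum, up to the rsqrtps precision loss.
;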
INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact