;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask    times 2 dd 1<<31, 0
ps_mask2   times 2 dd 0, 1<<31
ps_mask3   dd 0, 0, 0, 1<<31
ps_noise0  times 2 dd  1.0, 0.0
ps_noise2  times 2 dd -1.0, 0.0
ps_noise13 dd  0.0,  1.0, 0.0, -1.0
           dd  0.0, -1.0, 0.0,  1.0
           dd  0.0,  1.0, 0.0, -1.0

cextern sbr_noise_table
cextern ps_neg

SECTION .text

INIT_XMM sse
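; float ff_sbr_sum_square(float (*x)[2], int n)
; Roughly equivalent C (a sketch of the scalar fallback, for reference):
;     float sum = 0.0f;
;     for (int i = 0; i < n; i++)
;         sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
;     return sum;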
cglobal sbr_sum_square, 2, 3, 6
    mov         r2d, r1d
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m, m0
    fld         dword r0m
%endif
    RET

%define STEP 40*4*2
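; void ff_sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
;                       const float *g_filt, int m_max, intptr_t ixh)
; Roughly equivalent C (a sketch; STEP above is the byte stride of one
; X_high row of 40 complex floats):
;     for (int m = 0; m < m_max; m++) {
;         Y[m][0] = X_high[m][ixh][0] * g_filt[m];
;         Y[m][1] = X_high[m][ixh][1] * g_filt[m];
;     }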
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; elements [0] and [1] of each pair are computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
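; Roughly equivalent C (a sketch of the scalar version, using the a0..a3
; naming from the comments below):
;     float a0 = alpha1[0] * bw * bw, a1 = alpha1[1] * bw * bw;
;     float a2 = alpha0[0] * bw,      a3 = alpha0[1] * bw;
;     for (int i = start; i < end; i++) {
;         X_high[i][0] = X_low[i-2][0]*a0 - X_low[i-2][1]*a1
;                      + X_low[i-1][0]*a2 - X_low[i-1][1]*a3 + X_low[i][0];
;         X_high[i][1] = X_low[i-2][1]*a0 + X_low[i-2][0]*a1
;                      + X_low[i-1][1]*a2 + X_low[i-1][0]*a3 + X_low[i][1];
;     }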
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss       bw, BWm
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0
    mulps       m2, bw ; (a1[0] a1[1])*bw
    mulps       m1, bw ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps       m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2
    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end are the 6th and 7th args, passed on the stack
    mov         r2d, Sm
    mov         r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
    ; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd      startq, startd
    movsxd      endq, endd
%endif
    sub         startq, endq ; neg num of loops
    lea         X_highq, [X_highq + endq*2*4]
    lea         X_lowq,  [X_lowq  + endq*2*4 - 2*2*4]
    shl         startq, 3 ; offset from num loops
    mova        m0, [X_lowq + startq]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + startq + 8] ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301 ; aAbB
    shufps      m7, m7, q2301 ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + startq + 16] ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova        [X_highq + startq], m7
    add         startq, 16
    jnz         .loop2
    RET
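
; void ff_sbr_sum64x5(float *z)
; Roughly equivalent C (a sketch; the byte offsets below are these indices
; scaled by sizeof(float)):
;     for (int k = 0; k < 64; k++)
;         z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];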
cglobal sbr_sum64x5, 1,2,4,z
    lea         r1q, [zq + 256]
.loop:
    mova        m0, [zq +   0]
    mova        m2, [zq +  16]
    mova        m1, [zq + 256]
    mova        m3, [zq + 272]
    addps       m0, [zq + 512]
    addps       m2, [zq + 528]
    addps       m1, [zq + 768]
    addps       m3, [zq + 784]
    addps       m0, [zq +1024]
    addps       m2, [zq +1040]
    addps       m0, m1
    addps       m2, m3
    mova        [zq   ], m0
    mova        [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne         .loop
    REP_RET

INIT_XMM sse
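; void ff_sbr_qmf_post_shuffle(float W[32][2], const float *z)
; Roughly equivalent C (a sketch):
;     for (int k = 0; k < 32; k++) {
;         W[k][0] = -z[63 - k];
;         W[k][1] =  z[k];
;     }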
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea         r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]
    shufps      m0, m0, m0, q0123
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova        [Wq +  0], m2
    mova        [Wq + 16], m0
    add         Wq, 32
    sub         r2q, 16
    add         zq, 16
    cmp         zq, r2q
    jl          .loop
    REP_RET

INIT_XMM sse
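; void ff_sbr_neg_odd_64(float *x)
; Roughly equivalent C (a sketch): flips the sign bit of every odd-indexed
; float (ps_mask2 selects lanes 1 and 3 of each group of four):
;     for (int i = 1; i < 64; i += 2)
;         x[i] = -x[i];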
cglobal sbr_neg_odd_64, 1,2,4,z
    lea         r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova        [zq+ 0], m0
    mova        [zq+16], m1
    mova        [zq+32], m2
    mova        [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
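; Roughly equivalent C (a sketch of the scalar version):
;     for (int i = 0; i < 64; i++) {
;         v[      i] = src0[i] - src1[63 - i];
;         v[127 - i] = src0[i] + src1[63 - i];
;     }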
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize
    lea         vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd      m2, m0, q0123
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
%else
    shufps      m2, m0, m0, q0123
    shufps      m3, m1, m1, q0123
    shufps      m6, m4, m4, q0123
    shufps      m7, m5, m5, q0123
%endif
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova        [vrevq], m1
    mova        [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova        [vq+cq+mmsize], m4
    add         src1q, 2*mmsize
    add         vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
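; void ff_sbr_qmf_pre_shuffle(float *z)
; Roughly equivalent C (a sketch, derived from the loads/stores below):
;     z[64] = z[0];
;     z[65] = z[1];
;     for (int k = 1; k < 32; k++) {
;         z[64 + 2*k    ] = -z[64 - k];
;         z[64 + 2*k + 1] =  z[k + 1];
;     }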
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
    mov         r3q, OFFSET
    lea         r1q, [zq + (32+1)*4]
    lea         r2q, [zq + 64*4]
    mova        m5, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]
    pxor        m2, m5
    pxor        m0, m5
    pshufd      m2, m2, q0123
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 4
    SBUTTERFLY  dq, 0, 1, 4
    mova        [r2q + 2*r3q + 0*mmsize], m2
    mova        [r2q + 2*r3q + 1*mmsize], m3
    mova        [r2q + 2*r3q + 2*mmsize], m0
    mova        [r2q + 2*r3q + 3*mmsize], m1
    add         r1q, 2*mmsize
    sub         r3q, 2*mmsize
    jge         .loop
    movq        m2, [zq]
    movq        [r2q], m2
    REP_RET

%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST 1
%ifdef PIC
    lea         NOISE_TABLE, [%1]
    mova        m0, [kxq + NOISE_TABLE]
%else
    mova        m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
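; The four sbr_hf_apply_noise_* variants below differ only in the phase sign
; pattern loaded into m0. Roughly equivalent C for all of them (a sketch;
; phi_sign is the per-variant sign pattern, which for variants 1 and 3
; alternates from one m to the next):
;     for (int m = 0; m < m_max; m++) {
;         noise = (noise + 1) & 0x1ff;
;         if (s_m[m]) {
;             Y[m][0] += s_m[m] * phi_sign[0];
;             Y[m][1] += s_m[m] * phi_sign[1];
;         } else {
;             Y[m][0] += q_filt[m] * sbr_noise_table[noise][0];
;             Y[m][1] += q_filt[m] * sbr_noise_table[noise][1];
;         }
;     }
; The SIMD code is branchless: the noise term is masked out where s_m[m] != 0,
; and the s_m[m] * phi_sign term is zero where s_m[m] == 0, so both are added.
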
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]
    jmp         apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13
    jmp         apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]
    jmp         apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov         kxd, m_maxm
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn noiseq, noised
    dec         noiseq
    shl         countd, 2
%ifdef PIC
    lea         NOISE_TABLE, [sbr_noise_table]
%endif
    lea         Yq, [Yq + 2*countq]
    add         s_mq, countq
    add         q_filtq, countq
    shl         noiseq, 3
    pxor        m5, m5
    neg         countq
.loop:
    mova        m1, [q_filtq + countq]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add         noiseq, 2*mmsize
    and         noiseq, 0x1ff<<3
    punpckhdq   m2, m1, m1
    punpckldq   m1, m1
    mulps       m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5 ; m6 = (s_m[m] == 0) ? ~0 : 0
    pcmpeqd     m7, m4, m5 ; m7 = (s_m[m] == 0) ? ~0 : 0
    mulps       m3, m0 ; s_m[m] * phi_sign
    mulps       m4, m0 ; s_m[m] * phi_sign
    pand        m1, m6
    pand        m2, m7
    movu        m6, [Yq + 2*countq]
    movu        m7, [Yq + 2*countq + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu        [Yq + 2*countq], m6
    movu        [Yq + 2*countq + mmsize], m7
    add         countq, mmsize
    jl          .loop
    RET

INIT_XMM sse
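; void ff_sbr_qmf_deint_neg(float *v, const float *src)
; Roughly equivalent C (a sketch): deinterleaves src, reversing the even
; samples (negated) into the upper half and the odd samples into the lower:
;     for (int i = 0; i < 32; i++) {
;         v[     i] =  src[63 - 2*i];
;         v[63 - i] = -src[62 - 2*i];
;     }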
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov         cq, -COUNT
    lea         vrevq, [vq + OFFSET + COUNT]
    add         vq, OFFSET-mmsize
    add         srcq, 2*COUNT
    mova        m3, [ps_neg]
.loop:
    mova        m0, [srcq + 2*cq + 0*mmsize]
    mova        m1, [srcq + 2*cq + 1*mmsize]
    shufps      m2, m0, m1, q2020
    shufps      m1, m0, q1313
    xorps       m2, m3
    mova        [vq], m1
    mova        [vrevq + cq], m2
    sub         vq, mmsize
    add         cq, mmsize
    jl          .loop
    REP_RET
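
; void ff_sbr_autocorrelate(const float x[40][2], float phi[3][2][2])
; Roughly equivalent C (a sketch): complex autocorrelation of x at lags 0-2,
; accumulated over i = 1..37:
;     real_sum[lag] += x[i][0] * x[i + lag][0] + x[i][1] * x[i + lag][1];
;     imag_sum[lag] += x[i][0] * x[i + lag][1] - x[i][1] * x[i + lag][0];
; with the results, plus the i = 0 and i = 38 boundary terms, stored into phi.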
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov         cntq, 37*8
    add         xq, cntq
    neg         cntq
%if cpuflag(sse3)
%define MOVH movsd
    movddup     m5, [xq+cntq]
%else
%define MOVH movlps
    movlps      m5, [xq+cntq]
    movlhps     m5, m5
%endif
    MOVH        m7, [xq+cntq+8 ]
    MOVH        m1, [xq+cntq+16]
    shufps      m7, m7, q0110
    shufps      m1, m1, q0110
    mulps       m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps       m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1]
    mulps       m5, m1     ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps      [rsp   ], m3
    movaps      [rsp+16], m4
    add         cntq, 8
    MOVH        m2, [xq+cntq+16]
    movlhps     m7, m7
    shufps      m2, m2, q0110
    mulps       m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 = x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps       m4, m7, m2
    mulps       m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1]
    addps       m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
align 16
.loop:
    add         cntq, 8
    MOVH        m0, [xq+cntq+16]
    movlhps     m1, m1
    shufps      m0, m0, q0110
    mulps       m3, m1, m2
    mulps       m4, m1, m0
    mulps       m1, m1
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]
    addps       m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]
    add         cntq, 8
    MOVH        m1, [xq+cntq+16]
    movlhps     m2, m2
    shufps      m1, m1, q0110
    mulps       m3, m2, m0
    mulps       m4, m2, m1
    mulps       m2, m2
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]
    addps       m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]
    add         cntq, 8
    MOVH        m2, [xq+cntq+16]
    movlhps     m0, m0
    shufps      m2, m2, q0110
    mulps       m3, m0, m1
    mulps       m4, m0, m2
    mulps       m0, m0
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]
    addps       m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]
    jl          .loop
    movlhps     m1, m1
    mulps       m2, m1
    mulps       m1, m1
    addps       m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0]
    addps       m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1]
    addps       m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0]
    addps       m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1]
    xorps       m2, [ps_mask3]
    xorps       m5, [ps_mask3]
    xorps       m6, [ps_mask3]
    HADDPS      m2, m5, m3
    HADDPS      m7, m6, m4
%if cpuflag(sse3)
    movshdup    m0, m1
%else
    movss       m0, m1
    shufps      m1, m1, q0001
%endif
    addss       m1, m0
    movaps      [phiq       ], m2
    movhps      [phiq + 0x18], m7
    movss       [phiq + 0x28], m7
    movss       [phiq + 0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE