;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea    lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova   m0, [src0q + lenq + (a+0)*mmsize]
    mova   m1, [src0q + lenq + (a+1)*mmsize]
    mulps  m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps  m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova   [dstq + lenq + (a+0)*mmsize], m0
    mova   [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub    lenq, 64
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
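
; Scalar sketch of what vector_fmul computes (illustrative only; the _ref name
; is not part of FFmpeg).  Each .loop iteration covers 64 bytes (16 floats), so
; len is assumed to be a positive multiple of 16, and mova requires the
; buffers to be SIMD-aligned:
;
;   static void vector_fmul_ref(float *dst, const float *src0,
;                               const float *src1, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] = src0[i] * src1[i];
;   }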

;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    lea    lenq, [lend*8 - mmsize*4]
ALIGN 16
.loop:
    movaps m0, [src0q + lenq + 0*mmsize]
    movaps m1, [src0q + lenq + 1*mmsize]
    movaps m2, [src0q + lenq + 2*mmsize]
    movaps m3, [src0q + lenq + 3*mmsize]
    mulpd  m0, m0, [src1q + lenq + 0*mmsize]
    mulpd  m1, m1, [src1q + lenq + 1*mmsize]
    mulpd  m2, m2, [src1q + lenq + 2*mmsize]
    mulpd  m3, m3, [src1q + lenq + 3*mmsize]
    movaps [dstq + lenq + 0*mmsize], m0
    movaps [dstq + lenq + 1*mmsize], m1
    movaps [dstq + lenq + 2*mmsize], m2
    movaps [dstq + lenq + 3*mmsize], m3
    sub    lenq, mmsize*4
    jge    .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
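
; Scalar sketch of what vector_dmul computes (illustrative only; the _ref name
; is not part of FFmpeg).  Each iteration covers mmsize*4 bytes, so len is
; assumed to be a positive multiple of the unroll width in doubles:
;
;   static void vector_dmul_ref(double *dst, const double *src0,
;                               const double *src1, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] = src0[i] * src1[i];
;   }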

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif
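
; Scalar sketch of vector_fmac_scalar (illustrative only): multiply-accumulate
; of a broadcast constant into dst.  The loop consumes 64 bytes per iteration,
; so len is assumed to be a positive multiple of 16:
;
;   static void vector_fmac_scalar_ref(float *dst, const float *src,
;                                      float mul, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] += src[i] * mul;
;   }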

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR
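
; Scalar sketch of vector_fmul_scalar (illustrative only).  One XMM register
; (4 floats) is processed per iteration, so len is assumed to be a positive
; multiple of 4:
;
;   static void vector_fmul_scalar_ref(float *dst, const float *src,
;                                      float mul, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] = src[i] * mul;
;   }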

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov    lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps xm0, xm0
%if cpuflag(avx)
    vinsertf128 m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+1*mmsize], m2
    movaps [dstq+lenq+2*mmsize], m3
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif
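
; Scalar sketch of vector_dmac_scalar (illustrative only): the double-precision
; multiply-accumulate counterpart, with len assumed to be a positive multiple
; of the mmsize*4-byte unroll:
;
;   static void vector_dmac_scalar_ref(double *dst, const double *src,
;                                      double mul, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] += src[i] * mul;
;   }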

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov    lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps xm0, xm0
%if cpuflag(avx)
    vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-2*mmsize]
.loop:
    mulpd    m1, m0, [srcq+lenq       ]
    mulpd    m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub    lenq, 2*mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
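
; Scalar sketch of vector_dmul_scalar (illustrative only); len is assumed to
; be a positive multiple of the 2*mmsize-byte unroll:
;
;   static void vector_dmul_scalar_ref(double *dst, const double *src,
;                                      double mul, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] = src[i] * mul;
;   }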

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova     m0, [winq  + lenq]
    mova     m4, [src0q + lenq]
%if cpuflag(sse)
    mova     m1, [winq  + len1q]
    mova     m5, [src1q + len1q]
    shufps   m1, m1, 0x1b
    shufps   m5, m5, 0x1b
    mova     m2, m0
    mova     m3, m1
    mulps    m2, m4
    mulps    m3, m5
    mulps    m1, m4
    mulps    m0, m5
    addps    m2, m3
    subps    m1, m0
    shufps   m2, m2, 0x1b
%else
    pswapd   m1, [winq  + len1q]
    pswapd   m5, [src1q + len1q]
    mova     m2, m0
    mova     m3, m1
    pfmul    m2, m4
    pfmul    m3, m5
    pfmul    m1, m4
    pfmul    m0, m5
    pfadd    m2, m3
    pfsub    m1, m0
    pswapd   m2, m2
%endif
    mova  [dstq + lenq],  m1
    mova  [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl     .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW
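
; Scalar sketch of vector_fmul_window (illustrative only): a windowed overlap
; of two half-buffers that writes 2*len outputs around dst[len], reading len
; floats each from src0 and src1 and 2*len floats from win.  len is assumed
; to be a positive multiple of the vector width:
;
;   static void vector_fmul_window_ref(float *dst, const float *src0,
;                                      const float *src1, const float *win,
;                                      int len)
;   {
;       dst += len; win += len; src0 += len;
;       for (int i = -len, j = len - 1; i < 0; i++, j--) {
;           float s0 = src0[i], s1 = src1[j];
;           float wi = win[i],  wj = win[j];
;           dst[i] = s0 * wj - s1 * wi;
;           dst[j] = s0 * wi + s1 * wj;
;       }
;   }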

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea    lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova     m0, [src0q + lenq]
    mova     m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova     m2, [src2q + lenq]
    mova     m3, [src2q + lenq + mmsize]
    fmaddps  m0, m0, [src1q + lenq], m2
    fmaddps  m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps    m0, m0, [src1q + lenq]
    mulps    m1, m1, [src1q + lenq + mmsize]
    addps    m0, m0, [src2q + lenq]
    addps    m1, m1, [src2q + lenq + mmsize]
%endif
    mova  [dstq + lenq], m0
    mova  [dstq + lenq + mmsize], m1
    sub    lenq, 2*mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif
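
; Scalar sketch of vector_fmul_add (illustrative only): multiply two vectors
; and add a third.  Two registers (2*mmsize bytes) are handled per iteration,
; so len is assumed to be a positive multiple of the unroll width:
;
;   static void vector_fmul_add_ref(float *dst, const float *src0,
;                                   const float *src1, const float *src2,
;                                   int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] = src0[i] * src1[i] + src2[i];
;   }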

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea    lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add    src1q, 2*mmsize
    sub    lenq,  2*mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif
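
; Scalar sketch of vector_fmul_reverse (illustrative only): src1 is read
; backwards while src0 and dst are walked forwards:
;
;   static void vector_fmul_reverse_ref(float *dst, const float *src0,
;                                       const float *src1, int len)
;   {
;       for (int i = 0; i < len; i++)
;           dst[i] = src0[i] * src1[len - 1 - i];
;   }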

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl    offsetd, 2
    add    v1q, offsetq
    add    v2q, offsetq
    neg    offsetq
    xorps  xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps  xmm1, [v2q+offsetq]
    addps  xmm0, xmm1
    add    offsetq, 16
    js     .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    movss  r0m,  xmm0
    fld    dword r0m
%endif
    RET
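
; Scalar sketch of scalarproduct_float (illustrative only): the dot product of
; v1 and v2 over len floats, with len assumed to be a positive multiple of 4:
;
;   static float scalarproduct_float_ref(const float *v1, const float *v2,
;                                        int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }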

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl    lend, 2
    add   src0q, lenq
    add   src1q, lenq
    neg    lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    mova  [src1q + lenq], m2
    mova  [src0q + lenq], m0
    add    lenq, mmsize
    jl     .loop
    REP_RET
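
; Scalar sketch of butterflies_float (illustrative only): an in-place
; sum/difference butterfly over the two buffers, with len assumed to be a
; positive multiple of 4:
;
;   static void butterflies_float_ref(float *src0, float *src1, int len)
;   {
;       for (int i = 0; i < len; i++) {
;           float t  = src0[i] - src1[i];
;           src0[i] += src1[i];
;           src1[i]  = t;
;       }
;   }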