fft.asm 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085
  1. ;******************************************************************************
  2. ;* FFT transform with SSE/3DNow optimizations
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2011 Vitor Sessak
  5. ;*
  6. ;* This algorithm (though not any of the implementation details) is
  7. ;* based on libdjbfft by D. J. Bernstein.
  8. ;*
  9. ;* This file is part of FFmpeg.
  10. ;*
  11. ;* FFmpeg is free software; you can redistribute it and/or
  12. ;* modify it under the terms of the GNU Lesser General Public
  13. ;* License as published by the Free Software Foundation; either
  14. ;* version 2.1 of the License, or (at your option) any later version.
  15. ;*
  16. ;* FFmpeg is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. ;* Lesser General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU Lesser General Public
  22. ;* License along with FFmpeg; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. ;******************************************************************************
  25. ; These functions are not individually interchangeable with the C versions.
  26. ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
  27. ; in blocks as convenient to the vector size.
  28. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
  29. %include "libavutil/x86/x86util.asm"
  ; `pointer` = reservation directive for one pointer-sized struc field:
  ; resq when ARCH_X86_64 is set, resd otherwise.
  30. %if ARCH_X86_64
  31. %define pointer resq
  32. %else
  33. %define pointer resd
  34. %endif
  ; Field offsets of FFTContext as used by the code in this file
  ; (accessed as FFTContext.<field>).
  ; NOTE(review): presumably has to match the C-side FFTContext layout field
  ; for field - confirm whenever that struct changes.
  35. struc FFTContext
  ; .nbits is read by fft_calc, fft_permute and imdct_half.
  36. .nbits: resd 1
  37. .reverse: resd 1
  ; .revtab entries are read as 16-bit words (movzx ..., word [...]).
  38. .revtab: pointer 1
  ; .tmpbuf is the buffer fft_permute scatters into before copying back.
  39. .tmpbuf: pointer 1
  ; .mdctsize is read by imdct_calc and imdct_half.
  40. .mdctsize: resd 1
  41. .mdctbits: resd 1
  ; .tcos / .tsin are read by imdct_half.
  42. .tcos: pointer 1
  43. .tsin: pointer 1
  44. .fftperm: pointer 1
  45. .fftcalc: pointer 1
  46. .imdctcalc:pointer 1
  ; .imdcthalf is loaded and called indirectly by imdct_calc.
  47. .imdcthalf:pointer 1
  48. endstruc
  ; Read-only constant tables for the kernels below.
  49. SECTION_RODATA 32
  ; Numeric values: 1/sqrt(2), cos(pi/8) and cos(3*pi/8).
  50. %define M_SQRT1_2 0.70710678118654752440
  51. %define M_COS_PI_1_8 0.923879532511287
  52. %define M_COS_PI_3_8 0.38268343236509
  ; Twiddle factors multiplied in by fft16_avx.
  53. ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
  54. ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
  ; 1/sqrt(2) multipliers used by T8_SSE and the 3DNow fft8.
  55. ps_root2: times 8 dd M_SQRT1_2
  56. ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
  ; Sign-flip mask applied with xorps in T4_SSE: 1<<31 is the sign bit of a
  ; 32-bit float, so only elements 2 and 6 are negated.
  57. ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
  ; Per-element selectors for the two vpermilps instructions in T8_AVX.
  58. perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
  59. perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
  ; Multiplier applied to the difference vector in T8_AVX (float values, not a mask).
  60. ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
  ; Sign-flip mask applied with vxorps in T8_AVX: negates every element
  ; except elements 2 and 4.
  61. ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
  ; 64-bit sign-flip mask for the 3DNow code: negates the low float only.
  62. ps_m1p1: dd 1<<31, 0
  ; External symbols defined outside this file.
  ; ps_neg is used as an xor mask by imdct_calc and imdct_half.
  63. cextern ps_neg
  ; Declare cos_16, cos_32, ... cos_131072: 14 names, doubling each time.
  64. %assign i 16
  65. %rep 14
  66. cextern cos_ %+ i
  67. %assign i i<<1
  68. %endrep
  ; Redefine `pointer` from a reservation directive (resq/resd, used by the
  ; struc above) to a data-emitting one (dq/dd); DECL_FFT uses it to emit
  ; the dispatch tables.
  69. %if ARCH_X86_64
  70. %define pointer dq
  71. %else
  72. %define pointer dd
  73. %endif
  ; Conditional-emit helpers, invoked as IF%1 from PASS_SMALL and PASS_BIG:
  ; IF0 discards its argument (emits nothing), IF1 emits it unchanged.
  74. %macro IF0 1+
  75. %endmacro
  76. %macro IF1 1+
  77. %1
  78. %endmacro
  79. SECTION .text
  ; Radix-2 butterfly for the 3DNow code path:
  ;   %1 = %3 + %4
  ;   %2 = %3 - %4
  ; %1/%2 are destination registers; %3/%4 are the source operands (the
  ; callers in this file pass memory operands).
  80. %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
  81. mova %1, %3
  82. mova %2, %1
  83. pfadd %1, %4
  84. pfsub %2, %4
  85. %endmacro
  ; Combine step of a 4-point transform for the 3DNow code path.
  ; in/out: %1..%4 (z0..z3); %5 and %6 are temporaries.
  ; Side effect: the qword at [r0+8] is used as scratch to swap the two
  ; dwords of %3 (its low dword is stored to [r0+12], then read back as the
  ; high dword of the memory operand of punpckhdq).
  86. %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
  87. mova %5, %3
  88. pfsub %3, %4
  89. pfadd %5, %4 ; {t6,t5}
  90. pxor %3, [ps_m1p1] ; {t8,t7}
  91. mova %6, %1
  92. movd [r0+12], %3
  93. punpckhdq %3, [r0+8]
  94. pfadd %1, %5 ; {r0,i0}
  95. pfsub %6, %5 ; {r2,i2}
  96. mova %4, %2
  97. pfadd %2, %3 ; {r1,i1}
  98. pfsub %4, %3 ; {r3,i3}
  ; NOTE(review): SWAP comes from the included x86util/x86inc; presumably an
  ; assemble-time exchange of the two register names so the {r2,i2} result
  ; ends up under the name %3 - confirm.
  99. SWAP %3, %6
  100. %endmacro
  ; 8-point transform on two ymm registers (used by fft8_avx and fft16_avx).
  101. ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
  102. ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
  103. ; %3, %4, %5 tmp
  104. ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
  105. ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
  106. %macro T8_AVX 5
  107. vsubps %5, %1, %2 ; v = %1 - %2
  108. vaddps %3, %1, %2 ; w = %1 + %2
  109. vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
  110. vpermilps %2, %2, [perm1]
  111. vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
  112. vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
  113. vsubps %4, %5, %1 ; s = r - q
  114. vaddps %1, %5, %1 ; u = r + q
  115. vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
  116. vshufps %5, %4, %1, 0xbb
  117. vshufps %3, %4, %1, 0xee
  118. vperm2f128 %3, %3, %5, 0x13
  119. vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1} (sign mask as laid out in the table)
  120. vshufps %2, %1, %4, 0xdd
  121. vshufps %1, %1, %4, 0x88
  122. vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
  123. vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
  124. vsubps %5, %1, %3
  125. vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
  126. vsubps %2, %4, %1 ; %2 = v - w
  127. vaddps %1, %4, %1 ; %1 = v + w
  128. %endmacro
  129. ; In SSE mode do one fft4 transform
  130. ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
  131. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
  132. ;
  133. ; In AVX mode do two fft4 transforms
  134. ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
  135. ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
  ; %3 is a temporary (written before it is read).
  ; NOTE(review): the three-operand subps/addps/xorps/shufps forms used here
  ; are presumably provided by the included x86util/x86inc for both SSE and
  ; AVX builds - confirm.
  136. %macro T4_SSE 3
  137. subps %3, %1, %2 ; {t3,t4,-t8,t7}
  138. addps %1, %1, %2 ; {t1,t2,t6,t5}
  139. xorps %3, %3, [ps_p1p1m1p1]
  140. shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
  141. shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
  142. subps %3, %1, %2 ; {r2,i2,r3,i3}
  143. addps %1, %1, %2 ; {r0,i0,r1,i1}
  144. shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
  145. shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
  146. %endmacro
  147. ; In SSE mode do one FFT8
  148. ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
  149. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
  150. ;
  151. ; In AVX mode do two FFT8
  152. ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
  153. ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
  154. ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
  155. ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
  ; %5 and %6 are temporaries (both are written before they are read).
  156. %macro T8_SSE 6
  157. addps %6, %3, %4 ; {t1,t2,t3,t4}
  158. subps %3, %3, %4 ; {r5,i5,r7,i7}
  159. shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
  160. mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
  161. mulps %4, %4, [ps_root2]
  162. addps %3, %3, %4 ; {t8,t7,ta,t9}
  163. shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
  164. shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
  165. subps %3, %6, %4 ; {t6,t5,tc,tb}
  166. addps %6, %6, %4 ; {t1,t2,t9,ta}
  167. shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
  168. shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
  169. subps %3, %1, %6 ; {r4,r5,r6,r7}
  170. addps %1, %1, %6 ; {r0,r1,r2,r3}
  171. subps %4, %2, %5 ; {i4,i5,i6,i7}
  172. addps %2, %2, %5 ; {i0,i1,i2,i3}
  173. %endmacro
  ; Interleave the elements of %2 with those of %1 (unpack low/high of
  ; %2,%1) and store the result through an address macro.
  ;   %1, %2 = source registers (%2 is overwritten)
  ;   %3     = temporary register (overwritten)
  ;   %4     = name of the address macro to store through (Z or Z2)
  ;   %5     = index passed to that macro
  ; AVX branch: four 128-bit stores to %4(%5), %4H(%5), %4(%5+1), %4H(%5+1).
  ; SSE/3DNow branch: two full-register stores to %4(%5) and %4(%5+1).
  174. %macro INTERL 5
  175. %if cpuflag(avx)
  176. vunpckhps %3, %2, %1
  177. vunpcklps %2, %2, %1
  178. vextractf128 %4(%5), %2, 0
  179. vextractf128 %4 %+ H(%5), %3, 0
  180. vextractf128 %4(%5 + 1), %2, 1
  181. vextractf128 %4 %+ H(%5 + 1), %3, 1
  182. %elif cpuflag(sse) || cpuflag(3dnow)
  183. mova %3, %2
  184. unpcklps %2, %1
  185. unpckhps %3, %1
  186. mova %4(%5), %2
  187. mova %4(%5+1), %3
  188. %endif
  189. %endmacro
  ; One combine pass over the vectors addressed by Z(0)..Z(5), Z2(6), Z2(7).
  ;   %1 = 1: load m4..m7 from Z(4), Z(5), Z2(6), Z2(7) (lines prefixed IF%1)
  ;   %1 = 0: those loads are omitted, so m4..m7 must already hold the data
  ;   %2 = operand holding wre, %3 = operand holding wim (twiddle factors)
  ; All eight slots are written back; m0..m7 are clobbered.
  ; The instruction order is deliberate (see the scheduling note below) -
  ; do not reorder without measuring.
  190. ; scheduled for cpu-bound sizes
  191. %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
  192. IF%1 mova m4, Z(4)
  193. IF%1 mova m5, Z(5)
  194. mova m0, %2 ; wre
  195. mova m1, %3 ; wim
  196. mulps m2, m4, m0 ; r2*wre
  197. IF%1 mova m6, Z2(6)
  198. mulps m3, m5, m1 ; i2*wim
  199. IF%1 mova m7, Z2(7)
  200. mulps m4, m4, m1 ; r2*wim
  201. mulps m5, m5, m0 ; i2*wre
  202. addps m2, m2, m3 ; r2*wre + i2*wim
  203. mulps m3, m1, m7 ; i3*wim
  204. subps m5, m5, m4 ; i2*wre - r2*wim
  205. mulps m1, m1, m6 ; r3*wim
  206. mulps m4, m0, m6 ; r3*wre
  207. mulps m0, m0, m7 ; i3*wre
  208. subps m4, m4, m3 ; r3*wre - i3*wim
  209. mova m3, Z(0)
  210. addps m0, m0, m1 ; i3*wre + r3*wim
  211. subps m1, m4, m2 ; t3
  212. addps m4, m4, m2 ; t5
  213. subps m3, m3, m4 ; r2
  214. addps m4, m4, Z(0) ; r0
  215. mova m6, Z(2)
  216. mova Z(4), m3
  217. mova Z(0), m4
  218. subps m3, m5, m0 ; t4
  219. subps m4, m6, m3 ; r3
  220. addps m3, m3, m6 ; r1
  221. mova Z2(6), m4
  222. mova Z(2), m3
  223. mova m2, Z(3)
  224. addps m3, m5, m0 ; t6
  225. subps m2, m2, m1 ; i3
  226. mova m7, Z(1)
  227. addps m1, m1, Z(3) ; i1
  228. mova Z2(7), m2
  229. mova Z(3), m1
  230. subps m4, m7, m3 ; i2
  231. addps m3, m3, m7 ; i0
  232. mova Z(5), m4
  233. mova Z(1), m3
  234. %endmacro
  ; One combine pass, same arithmetic as PASS_SMALL but always loading its
  ; inputs from memory and reading the twiddles from [wq] / [wq+o1q]
  ; (wq and o1q are set up by DECL_PASS).
  ;   %1 = 1: results are stored in place through Z()/Z2()
  ;   %1 = 0: the IF%1 stores are omitted and the results are interleaved
  ;           with INTERL before being stored; Z(0) and Z(4), which are
  ;           stored unconditionally earlier, are reloaded for that.
  ; m0..m7 are clobbered. The instruction order is deliberate (see the
  ; scheduling note below) - do not reorder without measuring.
  235. ; scheduled to avoid store->load aliasing
  236. %macro PASS_BIG 1 ; (!interleave)
  237. mova m4, Z(4) ; r2
  238. mova m5, Z(5) ; i2
  239. mova m0, [wq] ; wre
  240. mova m1, [wq+o1q] ; wim
  241. mulps m2, m4, m0 ; r2*wre
  242. mova m6, Z2(6) ; r3
  243. mulps m3, m5, m1 ; i2*wim
  244. mova m7, Z2(7) ; i3
  245. mulps m4, m4, m1 ; r2*wim
  246. mulps m5, m5, m0 ; i2*wre
  247. addps m2, m2, m3 ; r2*wre + i2*wim
  248. mulps m3, m1, m7 ; i3*wim
  249. mulps m1, m1, m6 ; r3*wim
  250. subps m5, m5, m4 ; i2*wre - r2*wim
  251. mulps m4, m0, m6 ; r3*wre
  252. mulps m0, m0, m7 ; i3*wre
  253. subps m4, m4, m3 ; r3*wre - i3*wim
  254. mova m3, Z(0)
  255. addps m0, m0, m1 ; i3*wre + r3*wim
  256. subps m1, m4, m2 ; t3
  257. addps m4, m4, m2 ; t5
  258. subps m3, m3, m4 ; r2
  259. addps m4, m4, Z(0) ; r0
  260. mova m6, Z(2)
  261. mova Z(4), m3
  262. mova Z(0), m4
  263. subps m3, m5, m0 ; t4
  264. subps m4, m6, m3 ; r3
  265. addps m3, m3, m6 ; r1
  266. IF%1 mova Z2(6), m4
  267. IF%1 mova Z(2), m3
  268. mova m2, Z(3)
  269. addps m5, m5, m0 ; t6
  270. subps m2, m2, m1 ; i3
  271. mova m7, Z(1)
  272. addps m1, m1, Z(3) ; i1
  273. IF%1 mova Z2(7), m2
  274. IF%1 mova Z(3), m1
  275. subps m6, m7, m5 ; i2
  276. addps m5, m5, m7 ; i0
  277. IF%1 mova Z(5), m6
  278. IF%1 mova Z(1), m5
  279. %if %1==0
  ; Interleaving variant: pair up (m3,m1), (m4,m2), (Z(0),m5), (Z(4),m6)
  ; and store the interleaved vectors.
  280. INTERL m1, m3, m7, Z, 2
  281. INTERL m2, m4, m0, Z2, 6
  282. mova m1, Z(0)
  283. mova m2, Z(4)
  284. INTERL m5, m1, m3, Z, 0
  285. INTERL m6, m2, m7, Z, 4
  286. %endif
  287. %endmacro
  ; Dword interleave of %1 with %2 into two registers:
  ;   %1 = punpckldq(%1, %2)
  ;   %3 = punpckhdq(original %1, %2)
  ; %2 may be a register or a memory operand (both are used in this file).
  288. %macro PUNPCK 3
  289. mova %3, %1
  290. punpckldq %1, %2
  291. punpckhdq %3, %2
  292. %endmacro
  ; Address macros for the fixed-size kernels below: vector slot x of a
  ; contiguous buffer at r0. Z2 is identical to Z in this layout; ZH is the
  ; upper half (mmsize/2 bytes in) of slot x. They are redefined with a
  ; strided layout further down, before the DECL_PASS macro.
  293. %define Z(x) [r0+mmsize*x]
  294. %define Z2(x) [r0+mmsize*x]
  295. %define ZH(x) [r0+mmsize*x+mmsize/2]
  296. INIT_YMM avx
  297. %if HAVE_AVX_EXTERNAL
  ; fft8_avx: in-place 8-point transform of the two vectors at Z(0), Z(1).
  ; In:    r0 = data pointer
  ; Clobb: m0..m4
  ; Plain label + ret: reached through the dispatch tables / direct calls
  ; from other kernels in this file.
  298. align 16
  299. fft8_avx:
  300. mova m0, Z(0)
  301. mova m1, Z(1)
  302. T8_AVX m0, m1, m2, m3, m4
  303. mova Z(0), m0
  304. mova Z(1), m1
  305. ret
  ; fft16_avx: in-place 16-point transform of the four vectors Z(0)..Z(3).
  ; In:    r0 = data pointer
  ; Clobb: m0..m7
  306. align 16
  307. fft16_avx:
  ; Two fft4 transforms on Z(2), Z(3) (T4_SSE handles two at once in AVX mode).
  308. mova m2, Z(2)
  309. mova m3, Z(3)
  310. T4_SSE m2, m3, m7
  ; One 8-point transform on Z(0), Z(1).
  311. mova m0, Z(0)
  312. mova m1, Z(1)
  313. T8_AVX m0, m1, m4, m5, m7
  ; Multiply the fft4 results by the ps_cos16_1 / ps_cos16_2 twiddles:
  ;   m7 = m3*cos16_2 + m2*cos16_1
  ;   m3 = m3*cos16_1 - m2*cos16_2
  314. mova m4, [ps_cos16_1]
  315. mova m5, [ps_cos16_2]
  316. vmulps m6, m2, m4
  317. vmulps m7, m3, m5
  318. vaddps m7, m7, m6
  319. vmulps m2, m2, m5
  320. vmulps m3, m3, m4
  321. vsubps m3, m3, m2
  ; Regroup 128-bit lanes, then do the final add/sub butterflies against m0/m1.
  322. vblendps m2, m7, m3, 0xf0
  323. vperm2f128 m3, m7, m3, 0x21
  324. vaddps m4, m2, m3
  325. vsubps m2, m3, m2
  326. vperm2f128 m2, m2, m2, 0x01
  327. vsubps m3, m1, m2
  328. vaddps m1, m1, m2
  329. vsubps m5, m0, m4
  330. vaddps m0, m0, m4
  ; Store as 128-bit halves: Z(n) takes a half of m0/m5 and ZH(n) the
  ; matching half of m1/m3.
  331. vextractf128 Z(0), m0, 0
  332. vextractf128 ZH(0), m1, 0
  333. vextractf128 Z(1), m0, 1
  334. vextractf128 ZH(1), m1, 1
  335. vextractf128 Z(2), m5, 0
  336. vextractf128 ZH(2), m3, 0
  337. vextractf128 Z(3), m5, 1
  338. vextractf128 ZH(3), m3, 1
  339. ret
  ; fft32_avx: in-place 32-point transform of the eight vectors Z(0)..Z(7).
  ; In:    r0 = data pointer
  ; Clobb: m0..m7
  340. align 16
  341. fft32_avx:
  ; First four vectors: 16-point transform.
  342. call fft16_avx
  ; Last four vectors: two 8-point transforms (T4_SSE + T8_SSE in AVX mode).
  343. mova m0, Z(4)
  344. mova m1, Z(5)
  345. T4_SSE m0, m1, m4
  346. mova m2, Z(6)
  347. mova m3, Z(7)
  348. T8_SSE m0, m1, m2, m3, m4, m6
  349. ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
  350. ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
  ; Regroup lanes so that m4/m5 hold the low 128-bit lanes and m6/m7 the
  ; high lanes of (m0,m2) and (m1,m3): these are the m4..m7 inputs that
  ; PASS_SMALL 0 expects to find in registers.
  351. vperm2f128 m4, m0, m2, 0x20
  352. vperm2f128 m5, m1, m3, 0x20
  353. vperm2f128 m6, m0, m2, 0x31
  354. vperm2f128 m7, m1, m3, 0x31
  355. PASS_SMALL 0, [cos_32], [cos_32+32]
  356. ret
  ; fft32_interleave_avx: fft32_avx followed by an in-place pass that
  ; interleaves each pair of vectors (vunpcklps/vunpckhps of Z(0), Z(1)).
  ; In:    r0 = data pointer
  ; Out:   r0 is advanced by mmsize*2 per loop iteration and NOT restored.
  ; Clobb: r2, m0..m7
  357. fft32_interleave_avx:
  358. call fft32_avx
  ; r2d counts down from 32 in steps of mmsize/4.
  359. mov r2d, 32
  360. .deint_loop:
  361. mova m2, Z(0)
  362. mova m3, Z(1)
  363. vunpcklps m0, m2, m3
  364. vunpckhps m1, m2, m3
  365. vextractf128 Z(0), m0, 0
  366. vextractf128 ZH(0), m1, 0
  367. vextractf128 Z(1), m0, 1
  368. vextractf128 ZH(1), m1, 1
  369. add r0, mmsize*2
  370. sub r2d, mmsize/4
  371. jg .deint_loop
  372. ret
  373. %endif
  374. INIT_XMM sse
  ; fft4_sse / fft4_avx: in-place 4-point transform of Z(0), Z(1).
  ; Both labels name the same code: DECL_FFT builds its dispatch table from
  ; fft4 %+ SUFFIX, so the AVX table needs an fft4_avx symbol as well.
  ; In:    r0 = data pointer
  ; Clobb: m0..m2
  375. align 16
  376. fft4_avx:
  377. fft4_sse:
  378. mova m0, Z(0)
  379. mova m1, Z(1)
  380. T4_SSE m0, m1, m2
  381. mova Z(0), m0
  382. mova Z(1), m1
  383. ret
  ; fft8_sse: in-place 8-point transform of Z(0)..Z(3).
  ; T4_SSE transforms the first two vectors, T8_SSE folds in Z(2), Z(3).
  ; In:    r0 = data pointer
  ; Clobb: m0..m5
  384. align 16
  385. fft8_sse:
  386. mova m0, Z(0)
  387. mova m1, Z(1)
  388. T4_SSE m0, m1, m2
  389. mova m2, Z(2)
  390. mova m3, Z(3)
  391. T8_SSE m0, m1, m2, m3, m4, m5
  392. mova Z(0), m0
  393. mova Z(1), m1
  394. mova Z(2), m2
  395. mova Z(3), m3
  396. ret
  ; fft16_sse: in-place 16-point transform of Z(0)..Z(5), Z2(6), Z2(7).
  ; In:    r0 = data pointer
  ; Clobb: m0..m7
  397. align 16
  398. fft16_sse:
  ; 8-point transform of the first four vectors (same sequence as fft8_sse).
  399. mova m0, Z(0)
  400. mova m1, Z(1)
  401. T4_SSE m0, m1, m2
  402. mova m2, Z(2)
  403. mova m3, Z(3)
  404. T8_SSE m0, m1, m2, m3, m4, m5
  ; Start loading the next inputs before the results are stored.
  405. mova m4, Z(4)
  406. mova m5, Z(5)
  407. mova Z(0), m0
  408. mova Z(1), m1
  409. mova Z(2), m2
  410. mova Z(3), m3
  ; Two 4-point transforms, results left in m4..m7 for PASS_SMALL 0, which
  ; expects them in registers instead of loading them.
  411. T4_SSE m4, m5, m6
  412. mova m6, Z2(6)
  413. mova m7, Z2(7)
  414. T4_SSE m6, m7, m0
  415. PASS_SMALL 0, [cos_16], [cos_16+16]
  416. ret
  ; Emits the two smallest kernels for the 3DNow code paths, named
  ; fft4<SUFFIX> and fft8<SUFFIX>. Both work in place on the buffer at r0
  ; and return with a plain ret.
  ; NOTE(review): T4_3DNOW writes scratch data to [r0+8..r0+15] while it
  ; runs; the sequences below always store the final Z(1) afterwards.
  417. %macro FFT48_3DNOW 0
  418. align 16
  ; fft4: 4-point transform of Z(0)..Z(3). Clobbers m0..m5.
  419. fft4 %+ SUFFIX:
  420. T2_3DNOW m0, m1, Z(0), Z(1)
  421. mova m2, Z(2)
  422. mova m3, Z(3)
  423. T4_3DNOW m0, m1, m2, m3, m4, m5
  424. PUNPCK m0, m1, m4
  425. PUNPCK m2, m3, m5
  426. mova Z(0), m0
  427. mova Z(1), m4
  428. mova Z(2), m2
  429. mova Z(3), m5
  430. ret
  431. align 16
  ; fft8: 8-point transform of Z(0)..Z(5), Z2(6), Z2(7). Clobbers m0..m7.
  432. fft8 %+ SUFFIX:
  ; 4-point transform of the first four slots; two results are parked in
  ; memory (Z(0), Z(2)) because all eight registers are needed below.
  433. T2_3DNOW m0, m1, Z(0), Z(1)
  434. mova m2, Z(2)
  435. mova m3, Z(3)
  436. T4_3DNOW m0, m1, m2, m3, m4, m5
  437. mova Z(0), m0
  438. mova Z(2), m2
  ; Butterflies on the last four slots, then the 1/sqrt(2) scaling.
  439. T2_3DNOW m4, m5, Z(4), Z(5)
  440. T2_3DNOW m6, m7, Z2(6), Z2(7)
  441. PSWAPD m0, m5
  442. PSWAPD m2, m7
  443. pxor m0, [ps_m1p1]
  444. pxor m2, [ps_m1p1]
  445. pfsub m5, m0
  446. pfadd m7, m2
  447. pfmul m5, [ps_root2]
  448. pfmul m7, [ps_root2]
  449. T4_3DNOW m1, m3, m5, m7, m0, m2
  ; Park two more results, reload the parked Z(0)/Z(2) and combine.
  450. mova Z(5), m5
  451. mova Z2(7), m7
  452. mova m0, Z(0)
  453. mova m2, Z(2)
  454. T4_3DNOW m0, m2, m4, m6, m5, m7
  ; Interleave and store all eight slots.
  455. PUNPCK m0, m1, m5
  456. PUNPCK m2, m3, m7
  457. mova Z(0), m0
  458. mova Z(1), m5
  459. mova Z(2), m2
  460. mova Z(3), m7
  461. PUNPCK m4, Z(5), m5
  462. PUNPCK m6, Z2(7), m7
  463. mova Z(4), m4
  464. mova Z(5), m5
  465. mova Z2(6), m6
  466. mova Z2(7), m7
  467. ret
  468. %endmacro
  ; The 3DNow kernels are only assembled for 32-bit x86 builds. The macro is
  ; expanded twice, once per instruction-set selection.
  ; NOTE(review): SUFFIX (used in the emitted label names) is presumably set
  ; by INIT_MMX from the included x86inc - confirm.
  469. %if ARCH_X86_32
  470. INIT_MMX 3dnowext
  471. FFT48_3DNOW
  472. INIT_MMX 3dnow
  473. FFT48_3DNOW
  474. %endif
  ; Address macros redefined for the pass functions (DECL_PASS), which use
  ; the named registers zcq, o1q and o3q instead of r0:
  ;   Z(x)  = zcq + o1q*(x&6) + mmsize*(x&1)   -- even/odd slots are adjacent,
  ;                                               slot pairs are o1q*2 apart
  ;   Z2(x) = zcq + o3q       + mmsize*(x&1)   -- used for slots 6 and 7 only
  ;   ZH / Z2H = the same addresses plus mmsize/2 (upper half of the slot)
  ; NOTE(review): Z2 presumably exists because o1q*6 is not an encodable
  ; index scale - confirm.
  475. %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
  476. %define Z2(x) [zcq + o3q + mmsize*(x&1)]
  477. %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
  478. %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
  ; Emit a pass function named %1 whose loop body is the payload %2
  ; (a PASS_BIG or PASS_SMALL invocation).
  ; Register names inside the function: zc (data), w (twiddles), n (count),
  ; o1 and o3 (offsets used by the Z()/Z2() address macros).
  ;   o1 = n*8
  ;   o3 = n*3 << 4 = n*48
  ; Each iteration advances zc by mmsize*2 and w by mmsize and subtracts
  ; mmsize/8 from n; the loop runs while n stays positive. zc is left
  ; advanced on return.
  ; The functions are entered with `jmp` from the kernels generated by
  ; DECL_FFT, so the final ret returns to that kernel's caller.
  479. %macro DECL_PASS 2+ ; name, payload
  480. align 16
  481. %1:
  482. DEFINE_ARGS zc, w, n, o1, o3
  483. lea o3q, [nq*3]
  484. lea o1q, [nq*8]
  485. shl o3q, 4
  486. .loop:
  487. %2
  488. add zcq, mmsize*2
  489. add wq, mmsize
  490. sub nd, mmsize/8
  491. jg .loop
  492. rep ret
  493. %endmacro
  ; Call the kernel for a given transform size through a dispatch table.
  ;   %1 = suffix of the table to use (dispatch_tab%1, emitted by DECL_FFT)
  ;   %2 = register holding nbits; table index is nbits-2, so entry 0 is fft4
  ; With PIC defined the table entries are section-relative (see SECTION_REL),
  ; so the section start ($$) is added back before the call.
  ; Uses r2 (and r3 when PIC is defined) itself; the called kernels clobber more.
  494. %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
  495. lea r2, [dispatch_tab%1]
  496. mov r2, [r2 + (%2q-2)*gprsize]
  497. %ifdef PIC
  498. lea r3, [$$]
  499. add r2, r3
  500. %endif
  501. call r2
  502. %endmacro ; FFT_DISPATCH
  ; AVX build: the two pass functions plus the fft_calc entry point.
  503. INIT_YMM avx
  504. %if HAVE_AVX_EXTERNAL
  ; pass_avx stores results in place; pass_interleave_avx interleaves them.
  505. DECL_PASS pass_avx, PASS_BIG 1
  506. DECL_PASS pass_interleave_avx, PASS_BIG 0
  ; fft_calc: r0 = FFTContext pointer, r1 = data pointer.
  ; Loads nbits, moves the data pointer into r0 and nbits into r1, then
  ; dispatches through the interleaving table. Unlike FFT_CALC_FUNC there is
  ; no post-processing loop after the dispatch.
  507. cglobal fft_calc, 2,5,8
  508. mov r3d, [r0 + FFTContext.nbits]
  509. mov r0, r1
  510. mov r1, r3
  511. FFT_DISPATCH _interleave %+ SUFFIX, r1
  512. REP_RET
  513. %endif
  ; SSE build of the two pass functions: pass_sse stores results in place,
  ; pass_interleave_sse interleaves them (PASS_BIG with %1 = 1 / 0).
  514. INIT_XMM sse
  515. DECL_PASS pass_sse, PASS_BIG 1
  516. DECL_PASS pass_interleave_sse, PASS_BIG 0
  ; Emit the fft_calc entry point for the current instruction set.
  ; In: r0 = FFTContext pointer, r1 = data pointer.
  ; The data pointer and nbits are saved on the stack across the dispatch,
  ; because the dispatched kernels clobber the low GPRs.
  ; After the dispatch, transforms with nbits <= 3+(mmsize/16) get an extra
  ; in-place interleaving loop here: the smallest entries of the dispatch
  ; table are the plain (non-interleaving) fft4/fft8/fft16 kernels.
  517. %macro FFT_CALC_FUNC 0
  518. cglobal fft_calc, 2,5,8
  519. mov r3d, [r0 + FFTContext.nbits]
  520. PUSH r1
  521. PUSH r3
  522. mov r0, r1
  523. mov r1, r3
  524. FFT_DISPATCH _interleave %+ SUFFIX, r1
  ; rcx = nbits, r4 = data pointer (popped in reverse push order).
  525. POP rcx
  526. POP r4
  527. cmp rcx, 3+(mmsize/16)
  528. jg .end
  ; r2 = -1 << (nbits+3) = -(8 << nbits); r4 is moved to the end of that
  ; range so that [r4 + r2] walks it with a negative index counting up to 0.
  529. mov r2, -1
  530. add rcx, 3
  531. shl r2, cl
  532. sub r4, r2
  533. .loop:
  534. %if mmsize == 8
  ; MMX-sized build: rewrite the qword at offset 4 of each 16-byte group
  ; through PSWAPD.
  ; NOTE(review): PSWAPD comes from x86util; presumably it swaps the two
  ; dwords of its operand - confirm.
  535. PSWAPD m0, [r4 + r2 + 4]
  536. mova [r4 + r2 + 4], m0
  537. %else
  ; Interleave two adjacent 16-byte vectors element by element.
  538. movaps xmm0, [r4 + r2]
  539. movaps xmm1, xmm0
  540. unpcklps xmm0, [r4 + r2 + 16]
  541. unpckhps xmm1, [r4 + r2 + 16]
  542. movaps [r4 + r2], xmm0
  543. movaps [r4 + r2 + 16], xmm1
  544. %endif
  545. add r2, mmsize*2
  546. jl .loop
  547. .end:
  ; The 3DNow builds leave MMX state with femms before returning.
  548. %if cpuflag(3dnow)
  549. femms
  550. RET
  551. %else
  552. REP_RET
  553. %endif
  554. %endmacro
  ; Instantiate fft_calc: the two 3DNow variants only on 32-bit x86 builds,
  ; the SSE variant always.
  555. %if ARCH_X86_32
  556. INIT_MMX 3dnow
  557. FFT_CALC_FUNC
  558. INIT_MMX 3dnowext
  559. FFT_CALC_FUNC
  560. %endif
  561. INIT_XMM sse
  562. FFT_CALC_FUNC
  ; fft_permute: reorder the data buffer according to revtab.
  ; In: r0 = FFTContext pointer, r1 = data pointer.
  ; Phase 1 scatters each 8-byte element i of the data to
  ; tmpbuf[revtab[i]]; phase 2 copies tmpbuf back over the data.
  ; The statement order in the setup matters: ecx is written explicitly
  ; while r0..r6 are also live.
  ; NOTE(review): presumably ecx aliases one of the r-registers on some
  ; ABIs, which is why r1 is reloaded from r1m on x86_32 - confirm against
  ; the x86inc register mapping before reordering anything here.
  563. cglobal fft_permute, 2,7,1
  564. mov r4, [r0 + FFTContext.revtab]
  565. mov r5, [r0 + FFTContext.tmpbuf]
  566. mov ecx, [r0 + FFTContext.nbits]
  ; r2 = 1 << nbits = number of elements; r0 becomes the element index.
  567. mov r2, 1
  568. shl r2, cl
  569. xor r0, r0
  570. %if ARCH_X86_32
  571. mov r1, r1m
  572. %endif
  ; Scatter loop: two elements (16 bytes) per iteration.
  573. .loop:
  574. movaps xmm0, [r1 + 8*r0]
  575. movzx r6, word [r4 + 2*r0]
  576. movzx r3, word [r4 + 2*r0 + 2]
  577. movlps [r5 + 8*r6], xmm0
  578. movhps [r5 + 8*r3], xmm0
  579. add r0, 2
  580. cmp r0, r2
  581. jl .loop
  ; r2 = buffer size in bytes; point r1/r5 at the ends of the two buffers
  ; and walk them with a negative offset counting up to zero.
  582. shl r2, 3
  583. add r1, r2
  584. add r5, r2
  585. neg r2
  586. ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
  ; Copy-back loop: 32 bytes per iteration.
  587. .loopcopy:
  588. movaps xmm0, [r5 + r2]
  589. movaps xmm1, [r5 + r2 + 16]
  590. movaps [r1 + r2], xmm0
  591. movaps [r1 + r2 + 16], xmm1
  592. add r2, 32
  593. jl .loopcopy
  594. REP_RET
  ; Emit the imdct_calc entry point for the current instruction set.
  ; In: r0 = FFTContext pointer, r1 = output pointer, r2 = input pointer.
  ; Step 1: call the function stored in FFTContext.imdcthalf with
  ;         (r0, r1 + mdctsize, r2).
  ; Step 2: fill the rest of the output by copying vectors with their
  ;         element order reversed; one of the two copies is also xor-ed
  ;         with ps_neg.
  595. %macro IMDCT_CALC_FUNC 0
  596. cglobal imdct_calc, 3,5,3
  597. mov r3d, [r0 + FFTContext.mdctsize]
  598. mov r4, [r0 + FFTContext.imdcthalf]
  599. add r1, r3
  ; Save mdctsize and the adjusted output pointer across the call.
  600. PUSH r3
  601. PUSH r1
  602. %if ARCH_X86_32
  ; 32-bit: the three arguments are passed on the stack.
  603. push r2
  604. push r1
  605. push r0
  606. %else
  ; 64-bit: arguments stay in r0..r2; only stack space is reserved here.
  607. sub rsp, 8+32*WIN64 ; allocate win64 shadow space
  608. %endif
  609. call r4
  610. %if ARCH_X86_32
  611. add esp, 12
  612. %else
  613. add rsp, 8+32*WIN64
  614. %endif
  615. POP r1
  616. POP r3
  ; r0 = r1 + 2*mdctsize; r2 = -mdctsize (counts up to 0);
  ; r3 = mdctsize - mmsize (counts down).
  617. lea r0, [r1 + 2*r3]
  618. mov r2, r3
  619. sub r3, mmsize
  620. neg r2
  621. mova m2, [ps_neg]
  622. .loop:
  623. %if mmsize == 8
  624. PSWAPD m0, [r1 + r3]
  625. PSWAPD m1, [r0 + r2]
  626. pxor m0, m2
  627. %else
  ; shufps with 0x1b reverses the four floats of a vector.
  628. mova m0, [r1 + r3]
  629. mova m1, [r0 + r2]
  630. shufps m0, m0, 0x1b
  631. shufps m1, m1, 0x1b
  632. xorps m0, m2
  633. %endif
  ; Each reversed vector is stored at the other one's mirrored offset.
  634. mova [r0 + r3], m1
  635. mova [r1 + r2], m0
  636. sub r3, mmsize
  637. add r2, mmsize
  638. jl .loop
  ; The 3DNow builds leave MMX state with femms before returning.
  639. %if cpuflag(3dnow)
  640. femms
  641. RET
  642. %else
  643. REP_RET
  644. %endif
  645. %endmacro
  ; Instantiate imdct_calc: the two 3DNow variants only on 32-bit x86
  ; builds, the SSE variant always.
  646. %if ARCH_X86_32
  647. INIT_MMX 3dnow
  648. IMDCT_CALC_FUNC
  649. INIT_MMX 3dnowext
  650. IMDCT_CALC_FUNC
  651. %endif
  652. INIT_XMM sse
  653. IMDCT_CALC_FUNC
  ; 3DNow pass functions (32-bit x86 builds only).
  ; PASS_SMALL, PASS_BIG and INTERL are written with SSE mnemonics; the
  ; defines below map those mnemonics onto their 3DNow/MMX counterparts so
  ; the same macro bodies can be reused. The defines stay active until the
  ; %undef lines that follow the DECL_FFT instantiations.
  654. %if ARCH_X86_32
  655. INIT_MMX 3dnow
  656. %define mulps pfmul
  657. %define addps pfadd
  658. %define subps pfsub
  659. %define unpcklps punpckldq
  660. %define unpckhps punpckhdq
  ; The non-interleaving pass uses PASS_SMALL (loading m4..m7 itself) with
  ; the same twiddle operands PASS_BIG uses.
  661. DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
  662. DECL_PASS pass_interleave_3dnow, PASS_BIG 0
  ; The 3dnowext names are aliases of the 3dnow pass functions.
  663. %define pass_3dnowext pass_3dnow
  664. %define pass_interleave_3dnowext pass_interleave_3dnow
  665. %endif
  ; SECTION_REL is appended to every dispatch-table entry in DECL_FFT.
  ; With PIC defined it expands to "- $$", making the entries offsets from
  ; the section start (FFT_DISPATCH adds $$ back); otherwise it is empty and
  ; the entries are plain addresses.
  666. %ifdef PIC
  667. %define SECTION_REL - $$
  668. %else
  669. %define SECTION_REL
  670. %endif
  ; Generate the larger transform kernels and the dispatch table for the
  ; current instruction set.
  ;   %1 = log2 of the smallest size to generate (sizes below 1<<%1 are the
  ;        hand-written kernels above)
  ;   %2 = optional extra suffix (e.g. _interleave) for the generated
  ;        kernels, the pass function they jump to and the table name
  ; For each n = 1<<%1, 2<<%1, ... (18-%1 sizes) a function fft<n><fullsuffix>
  ; is emitted that calls fft<n/2> once and fft<n/4> twice on consecutive
  ; parts of the buffer (always the plain SUFFIX variants), then tail-jumps
  ; to pass<fullsuffix> with r1 = cos_<n> and r2d = n/8.
  ; dispatch_tab<fullsuffix> lists the kernels from fft4 upward, in the
  ; order FFT_DISPATCH indexes them (nbits-2).
  671. %macro DECL_FFT 1-2 ; nbits, suffix
  ; %0 is the number of macro arguments actually passed.
  672. %ifidn %0, 1
  673. %xdefine fullsuffix SUFFIX
  674. %else
  675. %xdefine fullsuffix %2 %+ SUFFIX
  676. %endif
  ; Fixed-size entries: fft4, fft8 and (if below the generated range) fft16
  ; always use the plain kernels; fft32 uses the fullsuffix variant.
  677. %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
  678. %if %1>=5
  679. %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
  680. %endif
  681. %if %1>=6
  682. %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
  683. %endif
  684. %assign n 1<<%1
  685. %rep 18-%1
  686. %assign n2 n/2
  687. %assign n4 n/4
  688. %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
  689. align 16
  690. fft %+ n %+ fullsuffix:
  691. call fft %+ n2 %+ SUFFIX
  ; The (x & (-2<<%1)) terms are zero for the smallest generated size and
  ; equal to x for all larger ones.
  ; NOTE(review): presumably they compensate for r0 having been advanced by
  ; the pass function of a generated sub-transform (DECL_PASS leaves zc
  ; advanced), which the hand-written kernels do not do - confirm.
  692. add r0, n*4 - (n&(-2<<%1))
  693. call fft %+ n4 %+ SUFFIX
  694. add r0, n*2 - (n2&(-2<<%1))
  695. call fft %+ n4 %+ SUFFIX
  696. sub r0, n*6 + (n2&(-2<<%1))
  697. lea r1, [cos_ %+ n]
  698. mov r2d, n4/2
  ; Tail call: the pass function's ret returns to this kernel's caller.
  699. jmp pass %+ fullsuffix
  700. %assign n n*2
  701. %endrep
  702. %undef n
  703. align 8
  ; `pointer` is dq or dd here (redefined near the top of the file).
  704. dispatch_tab %+ fullsuffix: pointer list_of_fft
  705. %endmacro ; DECL_FFT
  ; Instantiate the generated kernels and dispatch tables, once plain and
  ; once with the _interleave suffix, per instruction set:
  ;   AVX   generates from 64 points up   (DECL_FFT 6)
  ;   SSE   generates from 32 points up   (DECL_FFT 5)
  ;   3DNow generates from 16 points up   (DECL_FFT 4, 32-bit x86 only)
  706. %if HAVE_AVX_EXTERNAL
  707. INIT_YMM avx
  708. DECL_FFT 6
  709. DECL_FFT 6, _interleave
  710. %endif
  711. INIT_XMM sse
  712. DECL_FFT 5
  713. DECL_FFT 5, _interleave
  714. %if ARCH_X86_32
  715. INIT_MMX 3dnow
  716. DECL_FFT 4
  717. DECL_FFT 4, _interleave
  718. INIT_MMX 3dnowext
  719. DECL_FFT 4
  720. DECL_FFT 4, _interleave
  721. %endif
  722. INIT_XMM sse
  ; Remove the SSE-to-3DNow mnemonic aliases set up for the 3DNow pass
  ; functions so the IMDCT code below gets the real instructions.
  723. %undef mulps
  724. %undef addps
  725. %undef subps
  726. %undef unpcklps
  727. %undef unpckhps
  ; Pre-rotation step of imdct_half: load input samples from both ends of
  ; the range, pair them up and multiply them by tcos/tsin entries.
  ;   %1, %2 = index registers (the two comments on the lines below give
  ;            their meaning for each branch)
  ;   %3 = input pointer, %4 = tcos pointer, %5 = tsin pointer
  ; Results: m0 and m2 in the MMX branch, xmm0 and xmm1 otherwise; the
  ; caller stores them through revtab.
  ; The non-3dnowext MMX branch expects m7 to hold the sign mask that
  ; DECL_IMDCT loads from ps_neg before the loop.
  728. %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
  729. %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
  730. PSWAPD m0, [%3+%2*4]
  731. movq m2, [%3+%1*4-8]
  732. movq m3, m0
  733. punpckldq m0, m2
  734. punpckhdq m2, m3
  735. movd m1, [%4+%1*2-4] ; tcos[j]
  736. movd m3, [%4+%2*2] ; tcos[n4-j-1]
  737. punpckldq m1, [%5+%1*2-4] ; tsin[j]
  738. punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
  739. mova m4, m0
  740. PSWAPD m5, m1
  741. pfmul m0, m1
  742. pfmul m4, m5
  743. mova m6, m2
  744. PSWAPD m5, m3
  745. pfmul m2, m3
  746. pfmul m6, m5
  747. %if cpuflag(3dnowext)
  748. pfpnacc m0, m4
  749. pfpnacc m2, m6
  750. %else
  ; Plain 3DNow has no pfpnacc: transpose, flip signs with m7, then add.
  751. SBUTTERFLY dq, 0, 4, 1
  752. SBUTTERFLY dq, 2, 6, 3
  753. pxor m4, m7
  754. pxor m6, m7
  755. pfadd m0, m4
  756. pfadd m2, m6
  757. %endif
  758. %else
  ; SSE branch: 4 floats from each end, split by shufps into two vectors
  ; (0x88 picks elements 0,2 of each source; 0x77 picks elements 3,1).
  759. movaps xmm0, [%3+%2*4]
  760. movaps xmm1, [%3+%1*4-0x10]
  761. movaps xmm2, xmm0
  762. shufps xmm0, xmm1, 0x88
  763. shufps xmm1, xmm2, 0x77
  ; xmm4 = table entries from %4, xmm5 = table entries from %5
  ; (low half indexed by %2, high half by %1).
  764. movlps xmm4, [%4+%2*2]
  765. movlps xmm5, [%5+%2*2+0x0]
  766. movhps xmm4, [%4+%1*2-0x8]
  767. movhps xmm5, [%5+%1*2-0x8]
  ; xmm1 = xmm1*xmm4 - xmm0*xmm5 ; xmm2 = xmm0*xmm4 + xmm1*xmm5
  768. movaps xmm2, xmm0
  769. movaps xmm3, xmm1
  770. mulps xmm0, xmm5
  771. mulps xmm1, xmm4
  772. mulps xmm2, xmm4
  773. mulps xmm3, xmm5
  774. subps xmm1, xmm0
  775. addps xmm2, xmm3
  ; Interleave the two result vectors into xmm1 (low pairs) / xmm0 (high pairs).
  776. movaps xmm0, xmm1
  777. unpcklps xmm1, xmm2
  778. unpckhps xmm0, xmm2
  779. %endif
  780. %endmacro
  ; Multiply-and-combine helper for POSROTATESHUF. The argument meaning
  ; differs between the two branches. Both branches clobber m6 and m7.
  ;
  ; SSE/AVX branch (%4 is unused):
  ;   %1 = byte offset, %2/%3 = data registers (in/out), %5/%6 = table bases
  ;   %2 = %2*[%6+%1] - %3*[%5+%1]
  ;   %3 = %3*[%6+%1] + %2*[%5+%1]      (using the original %2)
  ;
  ; 3DNow branch:
  ;   %1 = data base, %2 = index register, %3/%4 = result registers,
  ;   %5/%6 = table bases; with lo = [%1+%2*2] and hi = [%1+%2*2+8]:
  ;   %3 = hi*[%6+%2] - lo*[%5+%2]
  ;   %4 = lo*[%6+%2] + hi*[%5+%2]
  781. %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
  782. %if cpuflag(sse)
  783. mulps m6, %3, [%5+%1]
  784. mulps m7, %2, [%5+%1]
  785. mulps %2, %2, [%6+%1]
  786. mulps %3, %3, [%6+%1]
  787. subps %2, %2, m6
  788. addps %3, %3, m7
  789. %elif cpuflag(3dnow)
  790. mova m6, [%1+%2*2]
  791. mova %3, [%1+%2*2+8]
  792. mova %4, m6
  793. mova m7, %3
  794. pfmul m6, [%5+%2]
  795. pfmul %3, [%6+%2]
  796. pfmul %4, [%6+%2]
  797. pfmul m7, [%5+%2]
  798. pfsub %3, m6
  799. pfadd %4, m7
  800. %endif
  801. %endmacro
  ; Post-rotation and reordering loop of imdct_half.
  ;   %1 = index register counting up towards 0 (the loop ends when it
  ;        stops being negative)
  ;   %2 = index register counting down in step with %1
  ;   %3 = data base pointer, %4/%5 = table pointers passed on to CMUL
  ; Each iteration applies CMUL to the block at [%3+%1*2] and to the
  ; mirrored block at [%3+%2*2], reverses one vector of each block and
  ; stores the two blocks back interleaved with each other.
  ; The macro defines the local label .post, so it can be expanded only
  ; once per function.
  802. %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
  803. .post:
  804. %if cpuflag(avx)
  805. vmovaps ymm1, [%3+%1*2]
  806. vmovaps ymm0, [%3+%1*2+0x20]
  807. vmovaps ymm3, [%3+%2*2]
  808. vmovaps ymm2, [%3+%2*2+0x20]
  809. CMUL %1, ymm0, ymm1, %3, %4, %5
  810. CMUL %2, ymm2, ymm3, %3, %4, %5
  ; Full 8-element reversal: reverse within each lane, then swap the lanes.
  811. vshufps ymm1, ymm1, ymm1, 0x1b
  812. vshufps ymm3, ymm3, ymm3, 0x1b
  813. vperm2f128 ymm1, ymm1, ymm1, 0x01
  814. vperm2f128 ymm3, ymm3, ymm3, 0x01
  815. vunpcklps ymm6, ymm2, ymm1
  816. vunpckhps ymm4, ymm2, ymm1
  817. vunpcklps ymm7, ymm0, ymm3
  818. vunpckhps ymm5, ymm0, ymm3
  819. vextractf128 [%3+%1*2], ymm7, 0
  820. vextractf128 [%3+%1*2+0x10], ymm5, 0
  821. vextractf128 [%3+%1*2+0x20], ymm7, 1
  822. vextractf128 [%3+%1*2+0x30], ymm5, 1
  823. vextractf128 [%3+%2*2], ymm6, 0
  824. vextractf128 [%3+%2*2+0x10], ymm4, 0
  825. vextractf128 [%3+%2*2+0x20], ymm6, 1
  826. vextractf128 [%3+%2*2+0x30], ymm4, 1
  827. sub %2, 0x20
  828. add %1, 0x20
  829. jl .post
  830. %elif cpuflag(sse)
  831. movaps xmm1, [%3+%1*2]
  832. movaps xmm0, [%3+%1*2+0x10]
  833. CMUL %1, xmm0, xmm1, %3, %4, %5
  834. movaps xmm5, [%3+%2*2]
  835. movaps xmm4, [%3+%2*2+0x10]
  836. CMUL %2, xmm4, xmm5, %3, %4, %5
  ; shufps with 0x1b reverses the four floats of a vector.
  837. shufps xmm1, xmm1, 0x1b
  838. shufps xmm5, xmm5, 0x1b
  839. movaps xmm6, xmm4
  840. unpckhps xmm4, xmm1
  841. unpcklps xmm6, xmm1
  842. movaps xmm2, xmm0
  843. unpcklps xmm0, xmm5
  844. unpckhps xmm2, xmm5
  845. movaps [%3+%2*2], xmm6
  846. movaps [%3+%2*2+0x10], xmm4
  847. movaps [%3+%1*2], xmm0
  848. movaps [%3+%1*2+0x10], xmm2
  849. sub %2, 0x10
  850. add %1, 0x10
  851. jl .post
  852. %elif cpuflag(3dnow)
  ; 3DNow: CMUL loads the data itself; the four results are written out one
  ; dword at a time, low dwords first, then the high dwords after psrlq.
  853. CMUL %3, %1, m0, m1, %4, %5
  854. CMUL %3, %2, m2, m3, %4, %5
  855. movd [%3+%1*2+ 0], m0
  856. movd [%3+%2*2+12], m1
  857. movd [%3+%2*2+ 0], m2
  858. movd [%3+%1*2+12], m3
  859. psrlq m0, 32
  860. psrlq m1, 32
  861. psrlq m2, 32
  862. psrlq m3, 32
  863. movd [%3+%1*2+ 8], m0
  864. movd [%3+%2*2+ 4], m1
  865. movd [%3+%2*2+ 8], m2
  866. movd [%3+%1*2+ 4], m3
  867. sub %2, 8
  868. add %1, 8
  869. jl .post
  870. %endif
  871. %endmacro
  ; Emit the imdct_half entry point for the current instruction set.
  ; In: r0 = FFTContext pointer, r1 = output pointer, r2 = input pointer.
  ; Three stages:
  ;   1. .pre loop: PREROTATER on the input, results scattered into the
  ;      output buffer at the positions given by revtab;
  ;   2. in-place transform of the output buffer via FFT_DISPATCH
  ;      (plain, non-interleaving table);
  ;   3. POSROTATESHUF over the output buffer.
  ; On 32-bit builds rrevtab and rtsin share r6, so the three table
  ; pointers are kept on the stack: [esp] = revtab, [esp+4] = tsin,
  ; [esp+8] = tcos (all already offset as computed below).
  872. %macro DECL_IMDCT 0
  873. cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
  874. %if ARCH_X86_64
  875. %define rrevtab r7
  876. %define rtcos r8
  877. %define rtsin r9
  878. %else
  879. %define rrevtab r6
  880. %define rtsin r6
  881. %define rtcos r5
  882. %endif
  ; r3 = mdctsize; input += r3; tcos/tsin += r3/2; revtab += r3/4.
  883. mov r3d, [r0+FFTContext.mdctsize]
  884. add r2, r3
  885. shr r3, 1
  886. mov rtcos, [r0+FFTContext.tcos]
  887. mov rtsin, [r0+FFTContext.tsin]
  888. add rtcos, r3
  889. add rtsin, r3
  890. %if ARCH_X86_64 == 0
  891. push rtcos
  892. push rtsin
  893. %endif
  894. shr r3, 1
  895. mov rrevtab, [r0+FFTContext.revtab]
  896. add rrevtab, r3
  897. %if ARCH_X86_64 == 0
  898. push rrevtab
  899. %endif
  ; r3 = downward-counting index (starts one step below mdctsize/4);
  ; r4 = -r3 is the matching upward-counting index.
  900. %if mmsize == 8
  901. sub r3, 2
  902. %else
  903. sub r3, 4
  904. %endif
  905. %if ARCH_X86_64 || mmsize == 8
  906. xor r4, r4
  907. sub r4, r3
  908. %endif
  ; Sign mask consumed by the non-3dnowext MMX branch of PREROTATER.
  909. %if notcpuflag(3dnowext) && mmsize == 8
  910. movd m7, [ps_neg]
  911. %endif
  912. .pre:
  913. %if ARCH_X86_64 == 0
  914. ;unspill
  ; 32-bit SSE build: r4 is reused as a temporary at the end of the loop
  ; body, so it is recomputed from r3 on every iteration.
  915. %if mmsize != 8
  916. xor r4, r4
  917. sub r4, r3
  918. %endif
  919. mov rtcos, [esp+8]
  920. mov rtsin, [esp+4]
  921. %endif
  922. PREROTATER r4, r3, r2, rtcos, rtsin
  923. %if mmsize == 8
  924. mov r6, [esp] ; rrevtab = ptr+n8
  925. movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
  926. movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
  927. mova [r1+r5*8], m0
  928. mova [r1+r6*8], m2
  929. add r4, 2
  930. sub r3, 2
  931. %else
  932. %if ARCH_X86_64
  ; Four 16-bit revtab entries select the four 8-byte output slots.
  933. movzx r5, word [rrevtab+r4-4]
  934. movzx r6, word [rrevtab+r4-2]
  935. movzx r10, word [rrevtab+r3]
  936. movzx r11, word [rrevtab+r3+2]
  937. movlps [r1+r5 *8], xmm0
  938. movhps [r1+r6 *8], xmm0
  939. movlps [r1+r10*8], xmm1
  940. movhps [r1+r11*8], xmm1
  941. add r4, 4
  942. %else
  ; Same stores with fewer registers: r4 and r5 are reused as temporaries.
  943. mov r6, [esp]
  944. movzx r5, word [r6+r4-4]
  945. movzx r4, word [r6+r4-2]
  946. movlps [r1+r5*8], xmm0
  947. movhps [r1+r4*8], xmm0
  948. movzx r5, word [r6+r3]
  949. movzx r4, word [r6+r3+2]
  950. movlps [r1+r5*8], xmm1
  951. movhps [r1+r4*8], xmm1
  952. %endif
  953. sub r3, 4
  954. %endif
  ; The flags tested here come from the `sub r3, ...` just above: loop while
  ; r3 has not gone negative.
  955. jns .pre
  ; Keep the context (r5) and output pointer (r6) in registers that
  ; FFT_DISPATCH does not use, then transform the output buffer in place.
  956. mov r5, r0
  957. mov r6, r1
  958. mov r0, r1
  959. mov r1d, [r5+FFTContext.nbits]
  960. FFT_DISPATCH SUFFIX, r1
  ; r6 = output + mdctsize; r0 = -(mdctsize/2);
  ; r1 = -mmsize - r0 = mdctsize/2 - mmsize.
  961. mov r0d, [r5+FFTContext.mdctsize]
  962. add r6, r0
  963. shr r0, 1
  964. %if ARCH_X86_64 == 0
  ; 32-bit: reload the table pointers from the stack into r2/r3.
  965. %define rtcos r2
  966. %define rtsin r3
  967. mov rtcos, [esp+8]
  968. mov rtsin, [esp+4]
  969. %endif
  970. neg r0
  971. mov r1, -mmsize
  972. sub r1, r0
  973. POSROTATESHUF r0, r1, r6, rtcos, rtsin
  ; Drop the three spilled table pointers.
  974. %if ARCH_X86_64 == 0
  975. add esp, 12
  976. %endif
  977. %if mmsize == 8
  978. femms
  979. %endif
  980. RET
  981. %endmacro
  ; Instantiate imdct_half. The first expansion uses the instruction set
  ; selected by the `INIT_XMM sse` that precedes the mnemonic %undef lines;
  ; the 3DNow variants are built only on 32-bit x86, the AVX variant only
  ; when HAVE_AVX_EXTERNAL is set.
  982. DECL_IMDCT
  983. %if ARCH_X86_32
  984. INIT_MMX 3dnow
  985. DECL_IMDCT
  986. INIT_MMX 3dnowext
  987. DECL_IMDCT
  988. %endif
  989. INIT_YMM avx
  990. %if HAVE_AVX_EXTERNAL
  991. DECL_IMDCT
  992. %endif