2
0

resample.asm 23 KB


  1. ;******************************************************************************
  2. ;* Copyright (c) 2012 Michael Niedermayer
  3. ;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
  4. ;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. %if ARCH_X86_64 ; 'pointer' reserves one pointer-sized field: resq (8 bytes) here
  24. %define pointer resq
  25. %else ; x86-32: resd (4 bytes)
  26. %define pointer resd
  27. %endif
  ; Field offsets of the resampler context, used below as
  ; [ctxq+ResampleContext.<field>]. Only the leading fields are declared.
  ; NOTE(review): presumably this has to match the layout of the C-side
  ; struct of the same name -- confirm against the C declaration.
  28. struc ResampleContext
  29. .av_class: pointer 1 ; not referenced in this file
  30. .filter_bank: pointer 1 ; base address of the filter coefficient table
  31. .filter_length: resd 1 ; filter length in samples (scaled by log2_bps to bytes by the code)
  32. .filter_alloc: resd 1 ; stride between consecutive filters, in samples (multiplied by index)
  33. .ideal_dst_incr: resd 1 ; not referenced in this file
  34. .dst_incr: resd 1 ; not referenced in this file
  35. .dst_incr_div: resd 1 ; added to index after every output sample
  36. .dst_incr_mod: resd 1 ; added to frac after every output sample
  37. .index: resd 1 ; loaded on entry; written back when update_ctx is non-zero
  38. .frac: resd 1 ; loaded on entry; written back when update_ctx is non-zero
  39. .src_incr: resd 1 ; frac is reduced by this (and index incremented) once frac >= src_incr
  40. .compensation_distance: resd 1 ; not referenced in this file
  41. .phase_count: resd 1 ; index is reduced by this (and src advanced one sample) while index >= phase_count
  42. ; there's a few more here but we only care about the first few
  43. endstruc
  44. SECTION_RODATA
  ; Constants referenced by the RESAMPLE_FNS macro below.
  45. pf_1: dd 1.0 ; single-precision 1.0; passed as the '1.0 constant' argument for the float variants
  46. pdbl_1: dq 1.0 ; double-precision 1.0; passed as the '1.0 constant' argument for the double variants
  47. pd_0x4000: dd 0x4000 ; 1 << 14: start value of the int16 accumulators, i.e. the rounding term for the later 'psrad 15'
  48. SECTION .text
  49. ; FIXME remove unneeded variables (index_incr, phase_mask)
  50. %macro RESAMPLE_FNS 3-5 ; format [float, double or int16], bps (bytes per sample), log2_bps, float op suffix [s or d], 1.0 constant
  51. ; int resample_common_$format(ResampleContext *ctx, $format *dst,
  52. ; const $format *src, int size, int update_ctx)
  ;
  ; Produces 'size' output samples. For every output sample the filter at
  ; filter_bank + index * filter_alloc samples is multiplied element-wise with
  ; the source samples starting at the current src position, the products are
  ; summed and the sum is stored to dst. Afterwards index/frac are stepped by
  ; dst_incr_div/dst_incr_mod, frac is wrapped at src_incr (carrying into
  ; index), and index is wrapped at phase_count (advancing src by one sample
  ; per wrap).
  ; If update_ctx is non-zero, index and frac are written back to ctx and the
  ; return value is the number of source samples consumed; otherwise the
  ; return register is left as is.
  ; NOTE(review): the main loop is a do-while that ends when dst == dst_end,
  ; so it assumes size > 0 -- confirm the callers guarantee that.
  53. %if ARCH_X86_64 ; unix64 and win64
  54. cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_count, index, frac, \
  55. dst_incr_mod, size, min_filter_count_x4, \
  56. min_filter_len_x4, dst_incr_div, src_incr, \
  57. phase_mask, dst_end, filter_bank
  58. ; use red-zone for variable storage
  ; NOTE(review): the [rsp-N] slots for ctx and src are used by the WIN64
  ; build as well; the Microsoft x64 ABI does not define a red zone, so
  ; confirm that storing below rsp is acceptable there.
  59. %define ctx_stackq [rsp-0x8]
  60. %define src_stackq [rsp-0x10]
  61. %if WIN64
  62. %define update_context_stackd r4m
  63. %else ; unix64
  64. %define update_context_stackd [rsp-0x14]
  65. %endif
  66. ; load as many variables in registers as possible; for the rest, store
  67. ; on stack so that we have 'ctx' available as one extra register
  68. mov sized, r3d ; 4th argument (size); r3 itself is reused for another value below
  69. %if UNIX64
  70. mov update_context_stackd, r4d ; 5th argument (update_ctx); r4 is used for index from here on
  71. %endif
  72. mov indexd, [ctxq+ResampleContext.index]
  73. mov fracd, [ctxq+ResampleContext.frac]
  74. mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
  75. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  76. mov src_incrd, [ctxq+ResampleContext.src_incr]
  77. mov ctx_stackq, ctxq ; keep ctx for the final write-back; its register is overwritten below
  78. mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
  79. mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
  80. shl min_filter_len_x4d, %3 ; filter length: samples -> bytes
  81. lea dst_endq, [dstq+sizeq*%2] ; dst_end = dst + size * bps
  82. %if UNIX64
  83. mov ecx, [ctxq+ResampleContext.phase_count] ; ecx is named phase_count by the DEFINE_ARGS below
  84. mov edi, [ctxq+ResampleContext.filter_alloc] ; edi held ctx until now; named filter_alloc below
  ; re-label the registers: ctx -> filter_alloc, size -> filter
  85. DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
  86. filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  87. src_incr, phase_mask, dst_end, filter_bank
  88. %elif WIN64
  89. mov R9d, [ctxq+ResampleContext.filter_alloc] ; R9d is named filter_alloc by the DEFINE_ARGS below
  90. mov ecx, [ctxq+ResampleContext.phase_count] ; ecx held ctx until now; named phase_count below
  ; re-label the registers: ctx -> phase_count, 4th register -> filter_alloc, size -> filter
  91. DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
  92. filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  93. src_incr, phase_mask, dst_end, filter_bank
  94. %endif
  ; Bias both base pointers by the filter byte length and keep the length
  ; negated: the inner loop then counts a negative offset up towards zero.
  95. neg min_filter_len_x4q
  96. sub filter_bankq, min_filter_len_x4q ; filter_bank += filter byte length
  97. sub srcq, min_filter_len_x4q ; src += filter byte length
  98. mov src_stackq, srcq ; biased start position, reference for the consumed-sample count
  99. %else ; x86-32
  100. cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
  101. index, min_filter_length_x4, filter_bank
  102. ; push temp variables to stack
  ; ctx, src and update_ctx are accessed through their argument slots (rNm)
  103. %define ctx_stackq r0mp
  104. %define src_stackq r2mp
  105. %define update_context_stackd r4m
  106. mov dstq, r1mp
  107. mov r3, r3mp ; r3 = size
  108. lea r3, [dstq+r3*%2] ; r3 = dst + size * bps (dst_end)
  ; Eight dwords are pushed in total; their final rsp-relative offsets are
  ; given by the defines that follow the DEFINE_ARGS below.
  109. PUSH dword [ctxq+ResampleContext.dst_incr_div]
  110. PUSH dword [ctxq+ResampleContext.dst_incr_mod]
  111. PUSH dword [ctxq+ResampleContext.filter_alloc]
  112. PUSH r3
  113. PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement for phase_mask
  114. PUSH dword [ctxq+ResampleContext.src_incr]
  115. mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
  116. mov indexd, [ctxq+ResampleContext.index]
  117. shl min_filter_length_x4d, %3 ; filter length: samples -> bytes
  118. mov fracd, [ctxq+ResampleContext.frac]
  119. neg min_filter_length_x4q ; keep the byte length negated (see inner loop)
  120. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  121. sub r2mp, min_filter_length_x4q ; bias the stored src argument by the filter byte length
  122. sub filter_bankq, min_filter_length_x4q ; filter_bank += filter byte length
  123. PUSH min_filter_length_x4q
  124. PUSH filter_bankq
  125. mov phase_countd, [ctxq+ResampleContext.phase_count]
  ; re-label the registers for the loop; ctx's register now carries src
  126. DEFINE_ARGS src, phase_count, dst, frac, index, min_filter_count_x4, filter
  ; stack slots of the values pushed above (last push at rsp+0x0)
  127. %define filter_bankq dword [rsp+0x0]
  128. %define min_filter_length_x4q dword [rsp+0x4]
  129. %define src_incrd dword [rsp+0x8]
  130. %define phase_maskd dword [rsp+0xc]
  131. %define dst_endq dword [rsp+0x10]
  132. %define filter_allocd dword [rsp+0x14]
  133. %define dst_incr_modd dword [rsp+0x18]
  134. %define dst_incr_divd dword [rsp+0x1c]
  135. mov srcq, r2mp ; src = biased source pointer
  136. %endif
  ; ---- one iteration per output sample ----
  137. .loop:
  138. mov filterd, filter_allocd
  139. imul filterd, indexd ; filter = index * filter_alloc (in samples)
  140. %if ARCH_X86_64
  141. mov min_filter_count_x4q, min_filter_len_x4q ; restart the negative byte counter
  142. lea filterq, [filter_bankq+filterq*%2] ; filter = filter_bank + filter * bps
  143. %else ; x86-32
  144. mov min_filter_count_x4q, filter_bankq ; the counter register doubles as a temporary for the stacked filter_bank
  145. lea filterq, [min_filter_count_x4q+filterq*%2] ; filter = filter_bank + filter * bps
  146. mov min_filter_count_x4q, min_filter_length_x4q ; restart the negative byte counter
  147. %endif
  148. %ifidn %1, int16
  149. movd m0, [pd_0x4000] ; accumulator starts at the rounding term
  150. %else ; float/double
  151. xorps m0, m0, m0 ; accumulator = 0
  152. %endif
  153. align 16
  ; Multiply-accumulate mmsize bytes of source and filter per iteration until
  ; the byte counter is no longer negative.
  ; NOTE(review): this presumably relies on the filter length being padded to
  ; a multiple of mmsize and, for the non-VEX memory operands, on suitably
  ; aligned filters -- confirm where filter_bank is set up.
  154. .inner_loop:
  155. movu m1, [srcq+min_filter_count_x4q*1] ; unaligned load of source samples
  156. %ifidn %1, int16
  157. %if cpuflag(xop)
  158. vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
  159. %else
  160. pmaddwd m1, [filterq+min_filter_count_x4q*1]
  161. paddd m0, m1
  162. %endif
  163. %else ; float/double
  164. %if cpuflag(fma4) || cpuflag(fma3)
  165. fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
  166. %else
  167. mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
  168. addp%4 m0, m0, m1
  169. %endif ; cpuflag
  170. %endif
  171. add min_filter_count_x4q, mmsize
  172. js .inner_loop ; continue while the counter is still negative
  ; The index/frac updates are interleaved with the horizontal sum and store.
  173. %ifidn %1, int16
  174. HADDD m0, m1 ; horizontal sum of the dword lanes (x86util macro, m1 as second operand)
  175. psrad m0, 15 ; arithmetic shift right by 15
  176. add fracd, dst_incr_modd
  177. packssdw m0, m0 ; saturate to signed 16 bit
  178. add indexd, dst_incr_divd
  179. movd [dstq], m0 ; NOTE(review): movd stores 32 bits while dst advances by bps -- confirm dst tolerates the extra bytes
  180. %else ; float/double
  181. ; horizontal sum & store
  182. %if mmsize == 32
  183. vextractf128 xm1, m0, 0x1 ; fold the upper 128 bits onto the lower ones
  184. addp%4 xm0, xm1
  185. %endif
  186. movhlps xm1, xm0 ; xm1 low half = xm0 high half
  187. %ifidn %1, float
  188. addps xm0, xm1 ; two partial sums remain in lanes 0 and 1
  189. shufps xm1, xm0, xm0, q0001 ; bring lane 1 down to lane 0 of xm1
  190. %endif
  191. add fracd, dst_incr_modd
  192. addp%4 xm0, xm1 ; lane 0 = complete sum
  193. add indexd, dst_incr_divd
  194. movs%4 [dstq], xm0 ; store one sample
  195. %endif
  196. cmp fracd, src_incrd
  197. jl .skip ; signed compare: frac < src_incr, no carry
  198. sub fracd, src_incrd ; frac wrapped: carry one step into index
  199. inc indexd
  ; The DEFINE_ARGS below only rename registers at assembly time
  ; ('filter' becomes 'index_incr'); no instructions are emitted for them.
  200. %if UNIX64
  201. DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
  202. index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  203. src_incr, phase_mask, dst_end, filter_bank
  204. %elif WIN64
  205. DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
  206. index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
  207. src_incr, phase_mask, dst_end, filter_bank
  208. %else ; x86-32
  209. DEFINE_ARGS src, phase_count, dst, frac, index, index_incr
  210. %endif
  211. .skip:
  212. add dstq, %2 ; dst advances by one sample
  213. cmp indexd, phase_countd
  214. jb .index_skip ; unsigned compare: index < phase_count, nothing to wrap
  215. .index_while:
  216. sub indexd, phase_countd ; wrap index ...
  217. lea srcq, [srcq+%2] ; ... and advance src by one sample per wrap
  218. cmp indexd, phase_countd
  219. jnb .index_while
  220. .index_skip:
  221. cmp dstq, dst_endq
  222. jne .loop
  ; ---- epilogue: optional context write-back and return value ----
  223. %if ARCH_X86_64
  224. DEFINE_ARGS ctx, dst, src, phase_count, index, frac
  225. %else ; x86-32
  226. DEFINE_ARGS src, ctx, update_context, frac, index
  227. %endif
  228. cmp dword update_context_stackd, 0
  229. jz .skip_store
  230. ; strictly speaking, the function should always return the number of
  231. ; consumed source samples; however, we only use the value if update_context
  232. ; is true, so let's just leave it uninitialized otherwise
  233. mov ctxq, ctx_stackq ; reload the saved ctx pointer
  234. movifnidn rax, srcq ; rax = src (x86inc macro: no move is emitted if src already is rax)
  235. mov [ctxq+ResampleContext.frac ], fracd
  236. sub rax, src_stackq ; bytes advanced since the (biased) start position
  237. mov [ctxq+ResampleContext.index], indexd
  238. shr rax, %3 ; bytes -> samples
  239. .skip_store:
  240. %if ARCH_X86_32
  241. ADD rsp, 0x20 ; drop the eight dwords pushed in the prologue
  242. %endif
  243. RET
  244. ; int resample_linear_$format(ResampleContext *ctx, $format *dst,
  245. ; const $format *src, int size, int update_ctx)
  ;
  ; Same stepping of dst/src/index/frac as resample_common, but every output
  ; sample is computed from two filters: filter1 at filter_bank + index *
  ; filter_alloc samples and filter2 one filter_alloc further. With val and
  ; v2 being the two dot products, the stored result is
  ;     val + (v2 - val) * frac / src_incr
  ; If update_ctx is non-zero, index and frac are written back to ctx and the
  ; return value is the number of source samples consumed; otherwise the
  ; return register is left as is.
  ; NOTE(review): the main loop is a do-while that ends when dst == dst_end,
  ; so it assumes size > 0 -- confirm the callers guarantee that.
  246. %if ARCH_X86_64 ; unix64 and win64
  247. %if UNIX64
  248. cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_count, index, frac, \
  249. size, dst_incr_mod, min_filter_count_x4, \
  250. min_filter_len_x4, dst_incr_div, src_incr, \
  251. src, dst_end, filter_bank
  252. mov srcq, r2mp ; move the src argument out of r2, which is named filter2 later on
  253. %else ; win64
  254. cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_count, index, frac, \
  255. size, dst_incr_mod, min_filter_count_x4, \
  256. min_filter_len_x4, dst_incr_div, src_incr, \
  257. dst, dst_end, filter_bank
  258. mov dstq, r1mp ; move the dst argument out of r1, which is named filter2 later on
  259. %endif
  260. ; use red-zone for variable storage
  ; NOTE(review): the [rsp-N] slots for ctx and src are used by the WIN64
  ; build as well; the Microsoft x64 ABI does not define a red zone, so
  ; confirm that storing below rsp is acceptable there.
  ; phase_mask_stackd is defined but not referenced anywhere in this function.
  261. %define ctx_stackq [rsp-0x8]
  262. %define src_stackq [rsp-0x10]
  263. %define phase_mask_stackd [rsp-0x14]
  264. %if WIN64
  265. %define update_context_stackd r4m
  266. %else ; unix64
  267. %define update_context_stackd [rsp-0x18]
  268. %endif
  269. ; load as many variables in registers as possible; for the rest, store
  270. ; on stack so that we have 'ctx' available as one extra register
  271. mov sized, r3d ; 4th argument (size); r3 itself is reused for another value below
  272. %if UNIX64
  273. mov update_context_stackd, r4d ; 5th argument (update_ctx); r4 is used for index from here on
  274. %endif
  275. mov indexd, [ctxq+ResampleContext.index]
  276. mov fracd, [ctxq+ResampleContext.frac]
  277. mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
  278. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  279. mov src_incrd, [ctxq+ResampleContext.src_incr]
  280. mov ctx_stackq, ctxq ; keep ctx for the final write-back; its register is overwritten below
  281. mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
  282. %ifidn %1, int16
  283. movd m4, [pd_0x4000] ; m4 = rounding term, copied into both accumulators per sample
  284. %else ; float/double
  285. cvtsi2s%4 xm0, src_incrd
  286. movs%4 xm4, [%5]
  287. divs%4 xm4, xm0 ; xm4 = 1.0 / src_incr, kept for the whole call
  288. %endif
  289. mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
  290. shl min_filter_len_x4d, %3 ; filter length: samples -> bytes
  291. lea dst_endq, [dstq+sizeq*%2] ; dst_end = dst + size * bps
  292. %if UNIX64
  293. mov ecx, [ctxq+ResampleContext.phase_count] ; ecx is named phase_count by the DEFINE_ARGS below
  294. mov edi, [ctxq+ResampleContext.filter_alloc] ; edi held ctx until now; named filter_alloc below
  ; re-label the registers: ctx -> filter_alloc, phase_mask -> filter2, size -> filter1
  295. DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, filter1, \
  296. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  297. dst_incr_div, src_incr, src, dst_end, filter_bank
  298. %elif WIN64
  299. mov R9d, [ctxq+ResampleContext.filter_alloc] ; R9d is named filter_alloc by the DEFINE_ARGS below
  300. mov ecx, [ctxq+ResampleContext.phase_count] ; ecx held ctx until now; named phase_count below
  ; re-label the registers: ctx -> phase_count, phase_mask -> filter2, size -> filter1
  301. DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, filter1, \
  302. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  303. dst_incr_div, src_incr, dst, dst_end, filter_bank
  304. %endif
  ; Bias both base pointers by the filter byte length and keep the length
  ; negated: the inner loop then counts a negative offset up towards zero.
  305. neg min_filter_len_x4q
  306. sub filter_bankq, min_filter_len_x4q ; filter_bank += filter byte length
  307. sub srcq, min_filter_len_x4q ; src += filter byte length
  308. mov src_stackq, srcq ; biased start position, reference for the consumed-sample count
  309. %else ; x86-32
  310. cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
  311. frac, index, dst, filter_bank
  312. ; push temp variables to stack
  ; ctx, src and update_ctx are accessed through their argument slots (rNm)
  313. %define ctx_stackq r0mp
  314. %define src_stackq r2mp
  315. %define update_context_stackd r4m
  316. mov dstq, r1mp
  317. mov r3, r3mp ; r3 = size
  318. lea r3, [dstq+r3*%2] ; r3 = dst + size * bps (dst_end)
  ; Ten dwords are pushed in total; their final rsp-relative offsets are
  ; given by the defines that follow the DEFINE_ARGS below.
  319. PUSH dword [ctxq+ResampleContext.dst_incr_div]
  320. PUSH r3
  321. mov r3, dword [ctxq+ResampleContext.filter_alloc]
  322. PUSH dword [ctxq+ResampleContext.dst_incr_mod]
  323. PUSH r3 ; filter_alloc in samples
  324. shl r3, %3
  325. PUSH r3 ; filter_alloc in bytes (filter_alloc_x4q)
  326. mov r3, dword [ctxq+ResampleContext.src_incr]
  327. PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement of phase_mask
  328. PUSH r3d
  329. %ifidn %1, int16
  330. movd m4, [pd_0x4000] ; m4 = rounding term, copied into both accumulators per sample
  331. %else ; float/double
  332. cvtsi2s%4 xm0, r3d ; r3d = src_incr
  333. movs%4 xm4, [%5]
  334. divs%4 xm4, xm0 ; xm4 = 1.0 / src_incr, kept for the whole call
  335. %endif
  336. mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
  337. mov indexd, [ctxq+ResampleContext.index]
  338. shl min_filter_length_x4d, %3 ; filter length: samples -> bytes
  339. mov fracd, [ctxq+ResampleContext.frac]
  340. neg min_filter_length_x4q ; keep the byte length negated (see inner loop)
  341. mov filter_bankq, [ctxq+ResampleContext.filter_bank]
  342. sub r2mp, min_filter_length_x4q ; bias the stored src argument by the filter byte length
  343. sub filter_bankq, min_filter_length_x4q ; filter_bank += filter byte length
  344. PUSH min_filter_length_x4q
  345. PUSH filter_bankq
  346. PUSH dword [ctxq+ResampleContext.phase_count]
  ; re-label the registers for the loop
  347. DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
  ; stack slots of the values pushed above (last push at rsp+0x0)
  348. %define phase_count_stackd dword [rsp+0x0]
  349. %define filter_bankq dword [rsp+0x4]
  350. %define min_filter_length_x4q dword [rsp+0x8]
  351. %define src_incrd dword [rsp+0xc]
  352. %define phase_mask_stackd dword [rsp+0x10]
  353. %define filter_alloc_x4q dword [rsp+0x14]
  354. %define filter_allocd dword [rsp+0x18]
  355. %define dst_incr_modd dword [rsp+0x1c]
  356. %define dst_endq dword [rsp+0x20]
  357. %define dst_incr_divd dword [rsp+0x24]
  358. mov srcq, r2mp ; src = biased source pointer
  359. %endif
  ; ---- one iteration per output sample ----
  360. .loop:
  361. mov filter1d, filter_allocd
  362. imul filter1d, indexd ; filter1 = index * filter_alloc (in samples)
  363. %if ARCH_X86_64
  364. mov min_filter_count_x4q, min_filter_len_x4q ; restart the negative byte counter
  365. lea filter1q, [filter_bankq+filter1q*%2] ; filter1 = filter_bank + filter1 * bps
  366. lea filter2q, [filter1q+filter_allocq*%2] ; filter2 = the filter following filter1
  367. %else ; x86-32
  368. mov min_filter_count_x4q, filter_bankq ; the counter register doubles as a temporary for the stacked filter_bank
  369. lea filter1q, [min_filter_count_x4q+filter1q*%2] ; filter1 = filter_bank + filter1 * bps
  370. mov min_filter_count_x4q, min_filter_length_x4q ; restart the negative byte counter
  371. mov filter2q, filter1q
  372. add filter2q, filter_alloc_x4q ; filter2 = filter1 + filter_alloc in bytes
  373. %endif
  ; m0 accumulates the filter1 products, m2 the filter2 products
  374. %ifidn %1, int16
  375. mova m0, m4
  376. mova m2, m4
  377. %else ; float/double
  378. xorps m0, m0, m0
  379. xorps m2, m2, m2
  380. %endif
  381. align 16
  ; Multiply-accumulate mmsize bytes of source with both filters per
  ; iteration until the byte counter is no longer negative.
  ; NOTE(review): this presumably relies on the filter length being padded to
  ; a multiple of mmsize and, for the non-VEX memory operands, on suitably
  ; aligned filters -- confirm where filter_bank is set up.
  382. .inner_loop:
  383. movu m1, [srcq+min_filter_count_x4q*1] ; unaligned load of source samples
  384. %ifidn %1, int16
  385. %if cpuflag(xop)
  386. vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
  387. vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
  388. %else
  389. pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
  390. pmaddwd m1, [filter1q+min_filter_count_x4q*1]
  391. paddd m2, m3
  392. paddd m0, m1
  393. %endif ; cpuflag
  394. %else ; float/double
  395. %if cpuflag(fma4) || cpuflag(fma3)
  396. fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
  397. fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
  398. %else
  399. mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
  400. mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
  401. addp%4 m2, m2, m3
  402. addp%4 m0, m0, m1
  403. %endif ; cpuflag
  404. %endif
  405. add min_filter_count_x4q, mmsize
  406. js .inner_loop ; continue while the counter is still negative
  407. %ifidn %1, int16
  ; horizontal sums of m2 and m0, performed in parallel
  408. %if mmsize == 16
  409. %if cpuflag(xop)
  410. vphadddq m2, m2
  411. vphadddq m0, m0
  412. %endif
  413. pshufd m3, m2, q0032
  414. pshufd m1, m0, q0032
  415. paddd m2, m3
  416. paddd m0, m1
  417. %endif
  418. %if notcpuflag(xop)
  419. PSHUFLW m3, m2, q0032
  420. PSHUFLW m1, m0, q0032
  421. paddd m2, m3
  422. paddd m0, m1
  423. %endif
  424. psubd m2, m0 ; m2 = v2 - val (both accumulators started from the same rounding term)
  425. ; This is probably a really bad idea on atom and other machines with a
  426. ; long transfer latency between GPRs and XMMs (atom). However, it does
  427. ; make the clip a lot simpler...
  428. movd eax, m2 ; eax = v2 - val
  429. add indexd, dst_incr_divd
  430. imul fracd ; edx:eax = (v2 - val) * frac, signed
  431. idiv src_incrd ; eax = edx:eax / src_incr, signed
  432. movd m1, eax
  433. add fracd, dst_incr_modd
  434. paddd m0, m1 ; val += (v2 - val) * frac / src_incr
  435. psrad m0, 15 ; arithmetic shift right by 15
  436. packssdw m0, m0 ; saturate to signed 16 bit
  437. movd [dstq], m0 ; NOTE(review): movd stores 32 bits while dst advances by bps -- confirm dst tolerates the extra bytes
  438. ; note that imul/idiv implicitly use eax/edx, so the two filter pointers
  ; (recomputed at the top of every .loop iteration) are placed in them:
  439. ; - 32bit: eax=r0[filter1], edx=r2[filter2]
  440. ; - win64: eax=r6[filter1], edx=r1[filter2]
  441. ; - unix64: eax=r6[filter1], edx=r2[filter2]
  442. %else ; float/double
  443. ; val += (v2 - val) * (FELEML) frac / c->src_incr;
  444. %if mmsize == 32
  445. vextractf128 xm1, m0, 0x1 ; fold the upper 128 bits of both accumulators onto the lower ones
  446. vextractf128 xm3, m2, 0x1
  447. addp%4 xm0, xm1
  448. addp%4 xm2, xm3
  449. %endif
  450. cvtsi2s%4 xm1, fracd ; frac as read before this sample's frac update
  451. subp%4 xm2, xm0 ; xm2 = v2 - val, per lane
  452. mulp%4 xm1, xm4 ; frac * (1.0 / src_incr)
  453. shufp%4 xm1, xm1, q0000 ; replicate lane 0 across xm1
  454. %if cpuflag(fma4) || cpuflag(fma3)
  455. fmaddp%4 xm0, xm2, xm1, xm0
  456. %else
  457. mulp%4 xm2, xm1
  458. addp%4 xm0, xm2
  459. %endif ; cpuflag
  460. ; horizontal sum & store
  461. movhlps xm1, xm0 ; xm1 low half = xm0 high half
  462. %ifidn %1, float
  463. addps xm0, xm1 ; two partial sums remain in lanes 0 and 1
  464. shufps xm1, xm0, xm0, q0001 ; bring lane 1 down to lane 0 of xm1
  465. %endif
  466. add fracd, dst_incr_modd
  467. addp%4 xm0, xm1 ; lane 0 = complete sum
  468. add indexd, dst_incr_divd
  469. movs%4 [dstq], xm0 ; store one sample
  470. %endif
  471. cmp fracd, src_incrd
  472. jl .skip ; signed compare: frac < src_incr, no carry
  473. sub fracd, src_incrd ; frac wrapped: carry one step into index
  474. inc indexd
  ; The DEFINE_ARGS below only rename registers at assembly time; no
  ; instructions are emitted for them.
  475. %if UNIX64
  476. DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, index_incr, \
  477. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  478. dst_incr_div, src_incr, src, dst_end, filter_bank
  479. %elif WIN64
  480. DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, index_incr, \
  481. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  482. dst_incr_div, src_incr, dst, dst_end, filter_bank
  483. %else ; x86-32
  484. DEFINE_ARGS filter1, phase_count, index_incr, frac, index, dst, src
  485. %endif
  486. .skip:
  487. %if ARCH_X86_32
  488. mov phase_countd, phase_count_stackd ; phase_count lives on the stack here; reload it into the free counter register
  489. %endif
  490. add dstq, %2 ; dst advances by one sample
  491. cmp indexd, phase_countd
  492. jb .index_skip ; unsigned compare: index < phase_count, nothing to wrap
  493. .index_while:
  494. sub indexd, phase_countd ; wrap index ...
  495. lea srcq, [srcq+%2] ; ... and advance src by one sample per wrap
  496. cmp indexd, phase_countd
  497. jnb .index_while
  498. .index_skip:
  499. cmp dstq, dst_endq
  500. jne .loop
  ; ---- epilogue: optional context write-back and return value ----
  501. %if UNIX64
  502. DEFINE_ARGS ctx, dst, filter2, phase_count, index, frac, index_incr, \
  503. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  504. dst_incr_div, src_incr, src, dst_end, filter_bank
  505. %elif WIN64
  506. DEFINE_ARGS ctx, filter2, src, phase_count, index, frac, index_incr, \
  507. dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
  508. dst_incr_div, src_incr, dst, dst_end, filter_bank
  509. %else ; x86-32
  510. DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
  511. %endif
  512. cmp dword update_context_stackd, 0
  513. jz .skip_store
  514. ; strictly speaking, the function should always return the number of
  515. ; consumed source samples; however, we only use the value if update_context
  516. ; is true, so let's just leave it uninitialized otherwise
  517. mov ctxq, ctx_stackq ; reload the saved ctx pointer
  518. movifnidn rax, srcq ; rax = src (x86inc macro: no move is emitted if src already is rax)
  519. mov [ctxq+ResampleContext.frac ], fracd
  520. sub rax, src_stackq ; bytes advanced since the (biased) start position
  521. mov [ctxq+ResampleContext.index], indexd
  522. shr rax, %3 ; bytes -> samples
  523. .skip_store:
  524. %if ARCH_X86_32
  525. ADD rsp, 0x28 ; drop the ten dwords pushed in the prologue
  526. %endif
  527. RET
  528. %endmacro
  529. INIT_XMM sse
  ; Each RESAMPLE_FNS use below emits resample_common_<format> and
  ; resample_linear_<format> for the instruction set selected by the
  ; preceding INIT_* line. Arguments: format, bytes per sample, log2 of
  ; bytes per sample, float op suffix, 1.0 constant.
  ; float: sse always; avx / fma3 / fma4 only when the matching HAVE_*_EXTERNAL is set
  530. RESAMPLE_FNS float, 4, 2, s, pf_1
  531. %if HAVE_AVX_EXTERNAL
  532. INIT_YMM avx
  533. RESAMPLE_FNS float, 4, 2, s, pf_1
  534. %endif
  535. %if HAVE_FMA3_EXTERNAL
  536. INIT_YMM fma3
  537. RESAMPLE_FNS float, 4, 2, s, pf_1
  538. %endif
  539. %if HAVE_FMA4_EXTERNAL
  540. INIT_XMM fma4
  541. RESAMPLE_FNS float, 4, 2, s, pf_1
  542. %endif
  ; int16: mmxext on x86-32 only, sse2 always, xop when HAVE_XOP_EXTERNAL is set
  ; (the last two macro arguments are not used by the int16 code paths)
  543. %if ARCH_X86_32
  544. INIT_MMX mmxext
  545. RESAMPLE_FNS int16, 2, 1
  546. %endif
  547. INIT_XMM sse2
  548. RESAMPLE_FNS int16, 2, 1
  549. %if HAVE_XOP_EXTERNAL
  550. INIT_XMM xop
  551. RESAMPLE_FNS int16, 2, 1
  552. %endif
  ; double: sse2 always; avx / fma3 only when the matching HAVE_*_EXTERNAL is set
  553. INIT_XMM sse2
  554. RESAMPLE_FNS double, 8, 3, d, pdbl_1
  555. %if HAVE_AVX_EXTERNAL
  556. INIT_YMM avx
  557. RESAMPLE_FNS double, 8, 3, d, pdbl_1
  558. %endif
  559. %if HAVE_FMA3_EXTERNAL
  560. INIT_YMM fma3
  561. RESAMPLE_FNS double, 8, 3, d, pdbl_1
  562. %endif