; vf_w3fdif.asm
  1. ;*****************************************************************************
  2. ;* x86-optimized functions for w3fdif filter
  3. ;*
  4. ;* Copyright (c) 2015 Paul B Mahol
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm" ; only include: supplies the macros used below that this file does not define (cglobal, DEFINE_ARGS, SPLATW, SPLATD, SBUTTERFLY, REP_RET)
  23. SECTION .text ; everything that follows is code
  24. INIT_XMM sse2 ; NOTE(review): presumably selects 16-byte XMM registers and SSE2 for all functions below - confirm against x86inc
  ;------------------------------------------------------------------------------
  ; w3fdif_scale(out_pixel, work_pixel, linesize)
  ;
  ; Narrows 32-bit work values to bytes.  Each dword is shifted right
  ; arithmetically by 15, packed to words with signed saturation and then to
  ; bytes with unsigned saturation.
  ;
  ;   out_pixel  - destination bytes, mmsize/2 of them stored per iteration
  ;   work_pixel - source dwords, two full vectors read per iteration with
  ;                aligned loads (mova)
  ;   linesize   - number of output bytes; consumed in steps of mmsize/2.  The
  ;                loop body always runs at least once and a count that is not a
  ;                multiple of mmsize/2 is effectively rounded up.
  ;------------------------------------------------------------------------------
  cglobal w3fdif_scale, 3, 3, 2, 0, out_pixel, work_pixel, linesize
  .loop:
      mova        m0, [work_pixelq]
      psrad       m0, 15
      mova        m1, [work_pixelq+mmsize]
      psrad       m1, 15
      add         work_pixelq, mmsize*2
      packssdw    m0, m1                      ; dwords -> words, signed saturation
      packuswb    m0, m0                      ; words -> bytes, unsigned saturation
      movh        [out_pixelq], m0            ; only the low half holds fresh data
      add         out_pixelq, mmsize/2
      sub         linesized, mmsize/2         ; keep last: jg consumes these flags
      jg .loop
      REP_RET
  ;------------------------------------------------------------------------------
  ; w3fdif_simple_low(work_line, in_lines_cur0, coef, linesize)
  ;
  ; Two-tap vertical filter step:
  ;     work_line[i] = cur0[i] * coef[0] + cur1[i] * coef[1]
  ;
  ;   work_line     - output dwords, written with aligned stores (mova)
  ;   in_lines_cur0 - array of two row pointers (cur0, cur1); rows hold bytes
  ;   coef          - two 16-bit coefficients, read with a single movd
  ;   linesize      - pixel count; consumed in steps of mmsize/2, the loop body
  ;                   always runs at least once
  ;
  ; The two rows are widened to words and interleaved into (cur0, cur1) pairs,
  ; so pmaddwd needs the pair (coef[0], coef[1]) in every dword lane.  Feeding
  ; it one register of all-coef[0] and one of all-coef[1] instead produces
  ; coef[0] * (cur0 + cur1) for the low half of each group of pixels and
  ; coef[1] * (cur0 + cur1) for the high half, which is only correct when both
  ; coefficients are equal.  Splatting the dword pair gives the same output in
  ; that case and the right output in every other one.
  ;------------------------------------------------------------------------------
  cglobal w3fdif_simple_low, 4, 5, 6, 0, work_line, in_lines_cur0, coef, linesize, offset
      movd        m0, [coefq]                 ; low dword = (coef[0], coef[1])
      DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset
      SPLATD      m0                          ; every dword lane = (coef[0], coef[1])
      pxor        m4, m4                      ; zero, used to widen bytes to words
      xor         offsetd, offsetd            ; flags are not live here
      mov         in_lines_cur1q, [in_lines_cur0q + gprsize]
      mov         in_lines_cur0q, [in_lines_cur0q]    ; base overwritten last
  .loop:
      movh        m2, [in_lines_cur0q+offsetq]
      movh        m3, [in_lines_cur1q+offsetq]
      punpcklbw   m2, m4
      punpcklbw   m3, m4
      SBUTTERFLY  wd, 2, 3, 5                 ; m2/m3 = low/high (cur0, cur1) word pairs
      pmaddwd     m2, m0                      ; cur0*coef[0] + cur1*coef[1]
      pmaddwd     m3, m0
      mova        [work_lineq+offsetq*4], m2
      mova        [work_lineq+offsetq*4+mmsize], m3
      add         offsetq, mmsize/2
      sub         linesized, mmsize/2         ; keep last: jg consumes these flags
      jg .loop
      REP_RET
  ;------------------------------------------------------------------------------
  ; w3fdif_complex_low(work_line, in_lines_cur0, coef, linesize)
  ;
  ; Four-tap vertical filter step:
  ;     work_line[i] = cur0[i] * coef[0] + cur1[i] * coef[1]
  ;                  + cur2[i] * coef[2] + cur3[i] * coef[3]
  ;
  ;   work_line     - output dwords, written with aligned stores (mova)
  ;   in_lines_cur0 - array of four row pointers (cur0..cur3); rows hold bytes
  ;   coef          - four 16-bit coefficients, read with a single movq
  ;   linesize      - pixel count; consumed in steps of mmsize/2, the loop body
  ;                   always runs at least once
  ;
  ; Rows are widened to words and interleaved pairwise so that one pmaddwd
  ; applies two coefficients at once.
  ;------------------------------------------------------------------------------
  cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
      movq        m0, [coefq]
      DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_cur3
      pshufd      m2, m0, q1111               ; every dword lane = (coef[2], coef[3])
      SPLATD      m0                          ; every dword lane = (coef[0], coef[1])
      pxor        m1, m1                      ; zero, used to widen bytes to words
      xor         offsetd, offsetd            ; flags are not live here
      mov         in_lines_cur3q, [in_lines_cur0q+gprsize*3]
      mov         in_lines_cur2q, [in_lines_cur0q+gprsize*2]
      mov         in_lines_cur1q, [in_lines_cur0q+gprsize]
      mov         in_lines_cur0q, [in_lines_cur0q]    ; base overwritten last
  .loop:
      ; fetch mmsize/2 bytes from each of the four rows and widen them to words
      movh        m4, [in_lines_cur0q+offsetq]
      movh        m5, [in_lines_cur1q+offsetq]
      movh        m6, [in_lines_cur2q+offsetq]
      movh        m3, [in_lines_cur3q+offsetq]
      punpcklbw   m4, m1
      punpcklbw   m5, m1
      punpcklbw   m6, m1
      punpcklbw   m3, m1
      ; m7 is scratch for both interleaves
      SBUTTERFLY  wd, 4, 5, 7                 ; m4/m5 = low/high (cur0, cur1) word pairs
      SBUTTERFLY  wd, 6, 3, 7                 ; m6/m3 = low/high (cur2, cur3) word pairs
      pmaddwd     m4, m0
      pmaddwd     m5, m0
      pmaddwd     m6, m2
      pmaddwd     m3, m2
      paddd       m4, m6
      paddd       m5, m3
      mova        [work_lineq+offsetq*4], m4
      mova        [work_lineq+offsetq*4+mmsize], m5
      add         offsetq, mmsize/2
      sub         linesized, mmsize/2         ; keep last: jg consumes these flags
      jg .loop
      REP_RET
  ;------------------------------------------------------------------------------
  ; w3fdif_simple_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
  ;
  ; Three-tap step accumulated on top of the existing work line:
  ;     work_line[i] += (cur0[i] + adj0[i]) * coef[0]
  ;                   + (cur1[i] + adj1[i]) * coef[1]
  ;                   + (cur2[i] + adj2[i]) * coef[2]
  ;
  ;   work_line     - dwords, read-modify-written (paddd from memory, then mova)
  ;   in_lines_cur0 - array of three row pointers (cur0..cur2); rows hold bytes
  ;   in_lines_adj0 - array of three row pointers (adj0..adj2); rows hold bytes
  ;   coef          - 16-bit coefficients; movq reads four words, three are used
  ;   linesize      - pixel count; consumed in steps of mmsize/2, the loop body
  ;                   always runs at least once
  ;
  ; x86-32: work_line plus six row pointers occupy all seven general-purpose
  ; registers requested from cglobal.  linesize is therefore counted down
  ; through r4mp rather than a register, and cur0 doubles as the running
  ; index: the other five row pointers are stored as differences from cur0, so
  ; [row + cur0] addresses the current position in that row.
  ;------------------------------------------------------------------------------
  %if ARCH_X86_64
  cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
  %else
  cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
  %endif
      movq        m2, [coefq]
  %if ARCH_X86_64
      DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
      xor         offsetq, offsetq
  %else
      DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
      %define linesized r4mp
  %endif
      pshufd      m0, m2, q0000               ; every dword lane = (coef[0], coef[1])
      SPLATW      m2, m2, 2                   ; every word lane  = coef[2]
      pxor        m7, m7                      ; zero, used to widen bytes to words
      ; unpack both pointer arrays; each base register is overwritten last
      mov         in_lines_cur2q, [in_lines_cur0q+gprsize*2]
      mov         in_lines_cur1q, [in_lines_cur0q+gprsize]
      mov         in_lines_cur0q, [in_lines_cur0q]
      mov         in_lines_adj2q, [in_lines_adj0q+gprsize*2]
      mov         in_lines_adj1q, [in_lines_adj0q+gprsize]
      mov         in_lines_adj0q, [in_lines_adj0q]
  %if ARCH_X86_32
      ; rebase every other row on cur0 so that cur0 can serve as the index
      sub         in_lines_cur1q, in_lines_cur0q
      sub         in_lines_cur2q, in_lines_cur0q
      sub         in_lines_adj0q, in_lines_cur0q
      sub         in_lines_adj1q, in_lines_cur0q
      sub         in_lines_adj2q, in_lines_cur0q
      %define offsetq in_lines_cur0q
  %endif
  .loop:
      ; (cur0, cur1) pairs * (coef[0], coef[1])
  %if ARCH_X86_64
      movh        m3, [in_lines_cur0q+offsetq]
  %else
      movh        m3, [in_lines_cur0q]
  %endif
      punpcklbw   m3, m7
      movh        m4, [in_lines_cur1q+offsetq]
      punpcklbw   m4, m7
      SBUTTERFLY  wd, 3, 4, 1                 ; m1 is scratch
      pmaddwd     m3, m0
      pmaddwd     m4, m0
      ; (adj0, adj1) pairs * (coef[0], coef[1])
      movh        m5, [in_lines_adj0q+offsetq]
      punpcklbw   m5, m7
      movh        m6, [in_lines_adj1q+offsetq]
      punpcklbw   m6, m7
      SBUTTERFLY  wd, 5, 6, 1
      pmaddwd     m5, m0
      pmaddwd     m6, m0
      paddd       m3, m5
      paddd       m4, m6
      ; (cur2, adj2) pairs * (coef[2], coef[2])
      movh        m5, [in_lines_cur2q+offsetq]
      punpcklbw   m5, m7
      movh        m6, [in_lines_adj2q+offsetq]
      punpcklbw   m6, m7
      SBUTTERFLY  wd, 5, 6, 1
      pmaddwd     m5, m2
      pmaddwd     m6, m2
      paddd       m3, m5
      paddd       m4, m6
      ; accumulate into the work line
  %if ARCH_X86_64
      paddd       m3, [work_lineq+offsetq*4]
      paddd       m4, [work_lineq+offsetq*4+mmsize]
      mova        [work_lineq+offsetq*4], m3
      mova        [work_lineq+offsetq*4+mmsize], m4
  %else
      paddd       m3, [work_lineq]
      paddd       m4, [work_lineq+mmsize]
      mova        [work_lineq], m3
      mova        [work_lineq+mmsize], m4
      add         work_lineq, mmsize*2        ; no index register: advance the pointer itself
  %endif
      add         offsetq, mmsize/2
      sub         linesized, mmsize/2         ; keep last: jg consumes these flags
      jg .loop
      REP_RET
  ;------------------------------------------------------------------------------
  ; w3fdif_complex_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
  ;
  ; Five-tap step accumulated on top of the existing work line:
  ;     work_line[i] += sum over k = 0..4 of (cur_k[i] + adj_k[i]) * coef[k]
  ;
  ;   work_line     - dwords, read-modify-written (paddd from memory, then mova)
  ;   in_lines_cur0 - array of five row pointers (cur0..cur4); rows hold bytes
  ;   in_lines_adj0 - array of five row pointers (adj0..adj4); rows hold bytes
  ;   coef          - 16-bit coefficients; coef[0..3] come from one movq and
  ;                   coef[4] from a movd at byte offset 8 (that movd reads four
  ;                   bytes, only the low word is used)
  ;   linesize      - pixel count; consumed in steps of mmsize/2, the loop body
  ;                   always runs at least once
  ;
  ; Only assembled for x86-64: the function asks cglobal for 13 general-purpose
  ; registers and uses vector registers up to m9.
  ;------------------------------------------------------------------------------
  %if ARCH_X86_64
  cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
      movq        m0, [coefq+0]
      movd        m4, [coefq+8]
      DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_cur3, in_lines_cur4, in_lines_adj1, in_lines_adj2, in_lines_adj3, in_lines_adj4
      pshufd      m1, m0, q1111               ; every dword lane = (coef[2], coef[3])
      SPLATD      m0                          ; every dword lane = (coef[0], coef[1])
      SPLATW      m4, m4                      ; every word lane  = coef[4]
      pxor        m3, m3                      ; zero, used to widen bytes to words
      xor         offsetd, offsetd            ; flags are not live here
      ; unpack both pointer arrays; each base register is overwritten last
      mov         in_lines_cur4q, [in_lines_cur0q+gprsize*4]
      mov         in_lines_cur3q, [in_lines_cur0q+gprsize*3]
      mov         in_lines_cur2q, [in_lines_cur0q+gprsize*2]
      mov         in_lines_cur1q, [in_lines_cur0q+gprsize]
      mov         in_lines_cur0q, [in_lines_cur0q]
      mov         in_lines_adj4q, [in_lines_adj0q+gprsize*4]
      mov         in_lines_adj3q, [in_lines_adj0q+gprsize*3]
      mov         in_lines_adj2q, [in_lines_adj0q+gprsize*2]
      mov         in_lines_adj1q, [in_lines_adj0q+gprsize]
      mov         in_lines_adj0q, [in_lines_adj0q]
  .loop:
      ; (cur0, cur1) pairs * (coef[0], coef[1])
      movh        m5, [in_lines_cur0q+offsetq]
      punpcklbw   m5, m3
      movh        m6, [in_lines_cur1q+offsetq]
      punpcklbw   m6, m3
      SBUTTERFLY  wd, 5, 6, 2                 ; m2 is scratch throughout the loop
      pmaddwd     m5, m0
      pmaddwd     m6, m0
      ; (cur2, cur3) pairs * (coef[2], coef[3])
      movh        m8, [in_lines_cur2q+offsetq]
      punpcklbw   m8, m3
      movh        m9, [in_lines_cur3q+offsetq]
      punpcklbw   m9, m3
      SBUTTERFLY  wd, 8, 9, 2
      pmaddwd     m8, m1
      pmaddwd     m9, m1
      paddd       m5, m8
      paddd       m6, m9
      ; (adj0, adj1) pairs * (coef[0], coef[1])
      movh        m8, [in_lines_adj0q+offsetq]
      punpcklbw   m8, m3
      movh        m9, [in_lines_adj1q+offsetq]
      punpcklbw   m9, m3
      SBUTTERFLY  wd, 8, 9, 2
      pmaddwd     m8, m0
      pmaddwd     m9, m0
      paddd       m5, m8
      paddd       m6, m9
      ; (adj2, adj3) pairs * (coef[2], coef[3])
      movh        m8, [in_lines_adj2q+offsetq]
      punpcklbw   m8, m3
      movh        m9, [in_lines_adj3q+offsetq]
      punpcklbw   m9, m3
      SBUTTERFLY  wd, 8, 9, 2
      pmaddwd     m8, m1
      pmaddwd     m9, m1
      paddd       m5, m8
      paddd       m6, m9
      ; (cur4, adj4) pairs * (coef[4], coef[4])
      movh        m8, [in_lines_cur4q+offsetq]
      punpcklbw   m8, m3
      movh        m9, [in_lines_adj4q+offsetq]
      punpcklbw   m9, m3
      SBUTTERFLY  wd, 8, 9, 2
      pmaddwd     m8, m4
      pmaddwd     m9, m4
      paddd       m5, m8
      paddd       m6, m9
      ; accumulate into the work line
      paddd       m5, [work_lineq+offsetq*4]
      paddd       m6, [work_lineq+offsetq*4+mmsize]
      mova        [work_lineq+offsetq*4], m5
      mova        [work_lineq+offsetq*4+mmsize], m6
      add         offsetq, mmsize/2
      sub         linesized, mmsize/2         ; keep last: jg consumes these flags
      jg .loop
      REP_RET
  %endif