vpx_convolve_copy_sse2.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
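
; convolve_fn %1[, %2]: emits a block copy ("copy") or copy-with-rounding-
; average ("avg") kernel.  The optional second argument "highbd" builds the
; 16-bit high-bitdepth variant, which works in bytes (width and strides are
; doubled) and averages with pavgw instead of pavgb.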
%macro convolve_fn 1-2
%ifidn %1, avg
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              f, fxo, fxs, fyo, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           f, fxo, fxs, fyo, fys, w, h
%endif
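; Only src, src_stride, dst and dst_stride are kept in registers; the
; remaining arguments (filter pointer/offsets, w, h and, for highbd, bd)
; are read from memory via their wm/hm homes as needed.  Load the width
; and branch to the matching per-width loop; for highbd, convert width
; and strides from pixels to bytes first.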
  mov r4d, dword wm
%ifidn %2, highbd
  shl r4d, 1
  shl src_strideq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

%ifidn %2, highbd
  cmp r4d, 64
  je .w64
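
; Fall-through (highbd only): 128 bytes per row, i.e. highbd w == 64.  Each
; row is copied with eight unaligned 16-byte loads and aligned 16-byte
; stores; the avg variant rounding-averages with the destination first.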
  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  dec r4d
  jnz .loop128
  RET
%endif
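
; 64 bytes per row (w == 64 for 8-bit, w == 32 for highbd), one row per
; iteration.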
.w64:
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  dec r4d
  jnz .loop64
  RET
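
; 32 bytes per row, two rows per iteration.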
.w32:
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+dst_strideq]
  pavg m3, [dstq+dst_strideq+16]
%endif
  mova [dstq], m0
  mova [dstq+16], m1
  mova [dstq+dst_strideq], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2
  jnz .loop32
  RET
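
; 16 bytes per row, four rows per iteration; r5/r6 hold 3*stride so all four
; rows can be addressed before the pointers are advanced.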
.w16:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+dst_strideq]
  pavg m2, [dstq+dst_strideq*2]
  pavg m3, [dstq+r6q]
%endif
  mova [dstq], m0
  mova [dstq+dst_strideq], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop16
  RET
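
; 8 bytes per row, four rows per iteration, using 64-bit movh loads/stores.
; The avg variant needs m4-m7 as scratch, which is why AUX_XMM_REGS is 4.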
.w8:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh m4, [dstq]
  movh m5, [dstq+dst_strideq]
  movh m6, [dstq+dst_strideq*2]
  movh m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movh [dstq], m0
  movh [dstq+dst_strideq], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET

%ifnidn %2, highbd
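; 4 bytes per row, four rows per iteration (8-bit only: a highbd 4-pixel row
; is 8 bytes and is handled by .w8).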
.w4:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movd m0, [srcq]
  movd m1, [srcq+src_strideq]
  movd m2, [srcq+src_strideq*2]
  movd m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd m4, [dstq]
  movd m5, [dstq+dst_strideq]
  movd m6, [dstq+dst_strideq*2]
  movd m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movd [dstq], m0
  movd [dstq+dst_strideq], m1
  movd [dstq+dst_strideq*2], m2
  movd [dstq+r6q], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endif
%endmacro
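
; Instantiate the SSE2 copy and avg kernels; the high-bitdepth variants are
; built only when CONFIG_VP9_HIGHBITDEPTH is enabled.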
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif