;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
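
; SAD_FN %1 = block width, %2 = block height, %3 = number of GPRs to
; request from cglobal (5, or 7 when the stride*3 temporaries are needed),
; %4 = 0 for plain SAD, 1 for the _avg variant that averages ref with
; second_pred. The macro emits the function prologue, sign-extends both
; strides and, when %3 == 7, precomputes src_stride*3 and ref_stride*3.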
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn  src_strideq, src_strided
  movsxdifnidn  ref_strideq, ref_strided
%if %3 == 7
  lea           src_stride3q, [src_strideq*3]
  lea           ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
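; Each .loop iteration covers one 64-byte row: four unaligned 16-byte
; loads from ref (averaged with second_pred in the _avg variant), psadbw
; against the corresponding src bytes, and the partial sums accumulate
; into m0.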
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov           n_rowsd, %1
  pxor          m0, m0

.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+32]
  psadbw        m4, [srcq+48]
  paddd         m1, m2
  paddd         m3, m4
  add           refq, ref_strideq
  paddd         m0, m1
  add           srcq, src_strideq
  paddd         m0, m3
  dec           n_rowsd
  jg .loop
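  ; psadbw leaves one partial sum per 64-bit half of m0; fold the high
  ; half onto the low half and return the 32-bit total in eax.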
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
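; Each .loop iteration covers two 32-byte rows (n_rows = height/2):
; 16+16 bytes of the current row plus 16+16 bytes of the next row,
; addressed through ref_strideq/src_strideq.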
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov           n_rowsd, %1/2
  pxor          m0, m0

.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+ref_strideq]
  movu          m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+src_strideq]
  psadbw        m4, [srcq+src_strideq+16]
  paddd         m1, m2
  paddd         m3, m4
  lea           refq, [refq+ref_strideq*2]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*2]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vpx_sad16x{8, 16, 32}_sse2(uint8_t *src, int src_stride,
;                                         uint8_t *ref, int ref_stride);
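; Uses 7 GPRs so src_stride3/ref_stride3 are available; each .loop
; iteration covers four 16-byte rows (n_rows = height/4).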
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movu          m1, [refq]
  movu          m2, [refq+ref_strideq]
  movu          m3, [refq+ref_strideq*2]
  movu          m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+src_strideq]
  psadbw        m3, [srcq+src_strideq*2]
  psadbw        m4, [srcq+src_stride3q]
  paddd         m1, m2
  paddd         m3, m4
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vpx_sad8x{4, 8, 16}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
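; Packs two 8-byte rows into each XMM register with movh/movhps, so one
; .loop iteration covers four rows using only two psadbw instructions.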
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movh          m1, [refq]
  movhps        m1, [refq+ref_strideq]
  movh          m2, [refq+ref_strideq*2]
  movhps        m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  lea           second_predq, [second_predq+mmsize*2]
%endif
  movh          m3, [srcq]
  movhps        m3, [srcq+src_strideq]
  movh          m4, [srcq+src_strideq*2]
  movhps        m4, [srcq+src_stride3q]
  psadbw        m1, m3
  psadbw        m2, m4
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m2
  dec           n_rowsd
  jg .loop
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
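; Gathers four 4-byte rows into a single XMM register with
; movd/punpckldq/movlhps, so each .loop iteration needs only one psadbw.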
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movd          m1, [refq]
  movd          m2, [refq+ref_strideq]
  movd          m3, [refq+ref_strideq*2]
  movd          m4, [refq+ref_stride3q]
  punpckldq     m1, m2
  punpckldq     m3, m4
  movlhps       m1, m3
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  lea           second_predq, [second_predq+mmsize*1]
%endif
  movd          m2, [srcq]
  movd          m5, [srcq+src_strideq]
  movd          m4, [srcq+src_strideq*2]
  movd          m3, [srcq+src_stride3q]
  punpckldq     m2, m5
  punpckldq     m4, m3
  movlhps       m2, m4
  psadbw        m1, m2
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  dec           n_rowsd
  jg .loop
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN 8 ; sad4x8_sse2
SAD4XN 4 ; sad4x4_sse2
SAD4XN 8, 1 ; sad4x8_avg_sse2
SAD4XN 4, 1 ; sad4x4_avg_sse2