sad4d_sse2.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"

SECTION .text
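
; Each PROCESS_Wx2x4 macro below computes the SADs of two rows of a W-wide
; source block against the same rows of four reference blocks.  The first
; invocation (first == 1) initializes the accumulators; later invocations add
; to them.  For W >= 8 the running sums live in m4..m7, one register per
; reference; for W == 4 the ref1/ref2 sums share m6 and the ref3/ref4 sums
; share m7.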
; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  movd                  m1, [srcq +%4]
  movd                  m2, [ref1q+%5]
  punpckldq             m0, m1
  punpckldq             m6, m2
  movd                  m1, [ref2q+%5]
  movd                  m2, [ref3q+%5]
  movd                  m3, [ref4q+%5]
  punpckldq             m4, m1
  punpckldq             m7, m2
  punpckldq             m5, m3
  movlhps               m0, m0
  movlhps               m6, m4
  movlhps               m7, m5
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movd                  m1, [ref1q+%3]
  movd                  m5, [ref1q+%5]
  movd                  m2, [ref2q+%3]
  movd                  m4, [ref2q+%5]
  punpckldq             m1, m5
  punpckldq             m2, m4
  movd                  m3, [ref3q+%3]
  movd                  m5, [ref3q+%5]
  punpckldq             m3, m5
  movd                  m4, [ref4q+%3]
  movd                  m5, [ref4q+%5]
  punpckldq             m4, m5
  movd                  m5, [srcq +%4]
  punpckldq             m0, m5
  movlhps               m0, m0
  movlhps               m1, m2
  movlhps               m3, m4
  psadbw                m1, m0
  psadbw                m3, m0
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro
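
; For the 8-wide case, m0 carries two 8-byte source rows (low and high qword)
; and each reference register is packed the same way, so a single psadbw per
; reference produces both row SADs as two quadword partial sums.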
; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro
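
; The 16-wide case loads source rows with mova, so the source pointer is
; expected to be 16-byte aligned; reference rows come from arbitrary motion
; vector positions and are therefore loaded with movu.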
; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro
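
; The 32- and 64-wide cases are composed of two 16- (respectively 32-) wide
; passes per row pair; only the second pass advances the row pointers.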
; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro
; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 and 4x4
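;
; Illustrative caller sketch (not part of this file; the candidate pointers
; and output array names are placeholders):
;
;   const uint8_t *ref[4] = { cand0, cand1, cand2, cand3 };
;   uint32_t sad[4];
;   vpx_sad16x16x4d_sse2(src, src_stride, ref, ref_stride, sad);
;   // sad[i] is the 16x16 SAD of src against ref[i]
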
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if %1 > 4
  ; combine the two per-reference partial sums and store res[0..3]
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
  ; 4-wide: m6 holds the ref1/ref2 sums, m7 the ref3/ref4 sums
  movifnidn             r4, r4mp
  pshufd                m6, m6, 0x08
  pshufd                m7, m7, 0x08
  movq               [r4+0], m6
  movq               [r4+8], m7
  RET
%endif
%endmacro
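
; INIT_XMM sse2 selects 128-bit xmm registers and the _sse2 name suffix for
; the symbols emitted by cglobal; each SADNXN4D line below instantiates one
; supported block size.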
INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4
SADNXN4D  4,  8
SADNXN4D  4,  4