;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

SECTION .text
;void vpx_highbd_calc16x16var_sse2
;(
;    uint16_t *src_ptr,
;    int src_stride,
;    uint16_t *ref_ptr,
;    int ref_stride,
;    unsigned int *SSE,
;    int *Sum
;)
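; Computes, over a 16x16 block of 16-bit samples, the sum of differences
; (*Sum) and the sum of squared differences (*SSE) between src and ref.
; Caller-side sketch (not part of this file; assumes 16x16 = 256 samples):
;     vpx_highbd_calc16x16var_sse2(src, src_stride, ref, ref_stride, &sse, &sum);
;     variance = sse - (unsigned int)(((int64_t)sum * sum) >> 8);   ; sum^2 / 256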
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog
    mov rsi, arg(0)               ; src_ptr
    mov rdi, arg(2)               ; ref_ptr
    movsxd rax, DWORD PTR arg(1)  ; src_stride
    movsxd rdx, DWORD PTR arg(3)  ; ref_stride
    add rax, rax                  ; source stride in bytes (strides are given in 16-bit samples)
    add rdx, rdx                  ; recon stride in bytes
    ; Prefetch data
    prefetcht0 [rsi]
    prefetcht0 [rsi+16]
    prefetcht0 [rsi+rax]
    prefetcht0 [rsi+rax+16]
    lea rbx, [rsi+rax*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+16]
    prefetcht0 [rbx+rax]
    prefetcht0 [rbx+rax+16]

    prefetcht0 [rdi]
    prefetcht0 [rdi+16]
    prefetcht0 [rdi+rdx]
    prefetcht0 [rdi+rdx+16]
    lea rbx, [rdi+rdx*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+16]
    prefetcht0 [rbx+rdx]
    prefetcht0 [rbx+rdx+16]
    pxor xmm0, xmm0               ; clear xmm0 for unpack
    pxor xmm7, xmm7               ; clear xmm7 for accumulating diffs
    pxor xmm6, xmm6               ; clear xmm6 for accumulating sse
    mov rcx, 16                   ; row counter: two rows per iteration
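; Each .var16loop iteration processes two rows of 16 samples (two 8-sample
; xmm loads per row): the 16-bit differences are accumulated in xmm5 and the
; squared differences in xmm6 (via pmaddwd), then xmm5 is sign-extended and
; folded into the 32-bit sum accumulator xmm7.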
.var16loop:
    movdqu xmm1, XMMWORD PTR [rsi]
    movdqu xmm2, XMMWORD PTR [rdi]

    ; Prefetch two rows ahead
    lea rbx, [rsi+rax*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+16]
    prefetcht0 [rbx+rax]
    prefetcht0 [rbx+rax+16]
    lea rbx, [rdi+rdx*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+16]
    prefetcht0 [rbx+rdx]
    prefetcht0 [rbx+rdx+16]

    pxor xmm5, xmm5               ; per-iteration 16-bit diff accumulator

    psubw xmm1, xmm2              ; row 0, samples 0-7: diff
    movdqu xmm3, XMMWORD PTR [rsi+16]
    paddw xmm5, xmm1
    pmaddwd xmm1, xmm1            ; square and pairwise-add to dwords
    movdqu xmm2, XMMWORD PTR [rdi+16]
    paddd xmm6, xmm1

    psubw xmm3, xmm2              ; row 0, samples 8-15: diff
    movdqu xmm1, XMMWORD PTR [rsi+rax]
    paddw xmm5, xmm3
    pmaddwd xmm3, xmm3
    movdqu xmm2, XMMWORD PTR [rdi+rdx]
    paddd xmm6, xmm3

    psubw xmm1, xmm2              ; row 1, samples 0-7: diff
    movdqu xmm3, XMMWORD PTR [rsi+rax+16]
    paddw xmm5, xmm1
    pmaddwd xmm1, xmm1
    movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
    paddd xmm6, xmm1

    psubw xmm3, xmm2              ; row 1, samples 8-15: diff
    paddw xmm5, xmm3
    pmaddwd xmm3, xmm3
    paddd xmm6, xmm3
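    ; Build a per-word sign mask of xmm5 (all ones where the 16-bit sum is
    ; negative), then interleave it with xmm5 so the eight difference sums are
    ; sign-extended to 32 bits before being added into xmm7.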
    movdqa xmm1, xmm5
    movdqa xmm2, xmm5
    pcmpgtw xmm1, xmm0            ; xmm5 > 0
    pcmpeqw xmm2, xmm0            ; xmm5 == 0
    por xmm1, xmm2                ; xmm5 >= 0
    pcmpeqw xmm1, xmm0            ; invert: all ones where xmm5 < 0
    movdqa xmm2, xmm5
    punpcklwd xmm5, xmm1          ; sign-extend low four sums to dwords
    punpckhwd xmm2, xmm1          ; sign-extend high four sums to dwords
    paddd xmm7, xmm5
    paddd xmm7, xmm2

    lea rsi, [rsi + 2*rax]        ; advance src by two rows
    lea rdi, [rdi + 2*rdx]        ; advance ref by two rows
    sub rcx, 2
    jnz .var16loop
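    ; Fold the four 32-bit lanes of xmm6 (SSE) and xmm7 (Sum) down to a single
    ; dword in lane 0, then store the results through the SSE and Sum pointers.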
    movdqa xmm4, xmm6
    punpckldq xmm6, xmm0
    punpckhdq xmm4, xmm0
    movdqa xmm5, xmm7
    paddd xmm6, xmm4
    punpckldq xmm7, xmm0
    punpckhdq xmm5, xmm0
    paddd xmm7, xmm5

    movdqa xmm4, xmm6
    movdqa xmm5, xmm7
    psrldq xmm4, 8
    psrldq xmm5, 8
    paddd xmm6, xmm4
    paddd xmm7, xmm5

    mov rdi, arg(4)               ; [SSE]
    mov rax, arg(5)               ; [Sum]
    movd DWORD PTR [rdi], xmm6
    movd DWORD PTR [rax], xmm7

    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

;void vpx_highbd_calc8x8var_sse2
;(
;    uint16_t *src_ptr,
;    int src_stride,
;    uint16_t *ref_ptr,
;    int ref_stride,
;    unsigned int *SSE,
;    int *Sum
;)
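; Same as vpx_highbd_calc16x16var_sse2 above, but over an 8x8 block.
; Caller-side sketch (not part of this file; assumes 8x8 = 64 samples):
;     variance = sse - (unsigned int)(((int64_t)sum * sum) >> 6);   ; sum^2 / 64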
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog
    mov rsi, arg(0)               ; src_ptr
    mov rdi, arg(2)               ; ref_ptr
    movsxd rax, DWORD PTR arg(1)  ; src_stride
    movsxd rdx, DWORD PTR arg(3)  ; ref_stride
    add rax, rax                  ; source stride in bytes (strides are given in 16-bit samples)
    add rdx, rdx                  ; recon stride in bytes
    ; Prefetch data
    prefetcht0 [rsi]
    prefetcht0 [rsi+rax]
    lea rbx, [rsi+rax*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+rax]

    prefetcht0 [rdi]
    prefetcht0 [rdi+rdx]
    lea rbx, [rdi+rdx*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+rdx]
    pxor xmm0, xmm0               ; clear xmm0 for unpack
    pxor xmm7, xmm7               ; clear xmm7 for accumulating diffs
    pxor xmm6, xmm6               ; clear xmm6 for accumulating sse
    mov rcx, 8                    ; row counter: four rows per iteration
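; Each .var8loop iteration processes four rows of 8 samples (one xmm load per
; row), accumulating 16-bit differences in xmm5 and squared differences in
; xmm6, then folding xmm5 into the 32-bit sum accumulator xmm7 as in
; calc16x16var above.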
.var8loop:
    movdqu xmm1, XMMWORD PTR [rsi]
    movdqu xmm2, XMMWORD PTR [rdi]

    ; Prefetch four rows ahead
    lea rbx, [rsi+rax*4]
    prefetcht0 [rbx]
    prefetcht0 [rbx+rax]
    lea rbx, [rbx+rax*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+rax]
    lea rbx, [rdi+rdx*4]
    prefetcht0 [rbx]
    prefetcht0 [rbx+rdx]
    lea rbx, [rbx+rdx*2]
    prefetcht0 [rbx]
    prefetcht0 [rbx+rdx]

    pxor xmm5, xmm5               ; per-iteration 16-bit diff accumulator

    psubw xmm1, xmm2              ; row 0: diff
    movdqu xmm3, XMMWORD PTR [rsi+rax]
    paddw xmm5, xmm1
    pmaddwd xmm1, xmm1            ; square and pairwise-add to dwords
    movdqu xmm2, XMMWORD PTR [rdi+rdx]
    paddd xmm6, xmm1

    lea rsi, [rsi + 2*rax]        ; advance src by two rows
    lea rdi, [rdi + 2*rdx]        ; advance ref by two rows

    psubw xmm3, xmm2              ; row 1: diff
    movdqu xmm1, XMMWORD PTR [rsi]
    paddw xmm5, xmm3
    pmaddwd xmm3, xmm3
    movdqu xmm2, XMMWORD PTR [rdi]
    paddd xmm6, xmm3

    psubw xmm1, xmm2              ; row 2: diff
    movdqu xmm3, XMMWORD PTR [rsi+rax]
    paddw xmm5, xmm1
    pmaddwd xmm1, xmm1
    movdqu xmm2, XMMWORD PTR [rdi+rdx]
    paddd xmm6, xmm1

    psubw xmm3, xmm2              ; row 3: diff
    paddw xmm5, xmm3
    pmaddwd xmm3, xmm3
    paddd xmm6, xmm3
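    ; Sign-extend the eight 16-bit difference sums in xmm5 to 32 bits and add
    ; them into xmm7, exactly as in .var16loop above.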
    movdqa xmm1, xmm5
    movdqa xmm2, xmm5
    pcmpgtw xmm1, xmm0            ; xmm5 > 0
    pcmpeqw xmm2, xmm0            ; xmm5 == 0
    por xmm1, xmm2                ; xmm5 >= 0
    pcmpeqw xmm1, xmm0            ; invert: all ones where xmm5 < 0
    movdqa xmm2, xmm5
    punpcklwd xmm5, xmm1          ; sign-extend low four sums to dwords
    punpckhwd xmm2, xmm1          ; sign-extend high four sums to dwords
    paddd xmm7, xmm5
    paddd xmm7, xmm2

    lea rsi, [rsi + 2*rax]        ; advance src by two more rows
    lea rdi, [rdi + 2*rdx]        ; advance ref by two more rows
    sub rcx, 4
    jnz .var8loop
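    ; Fold the four 32-bit lanes of xmm6 (SSE) and xmm7 (Sum) into lane 0 and
    ; store the results, as in calc16x16var above.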
    movdqa xmm4, xmm6
    punpckldq xmm6, xmm0
    punpckhdq xmm4, xmm0
    movdqa xmm5, xmm7
    paddd xmm6, xmm4
    punpckldq xmm7, xmm0
    punpckhdq xmm5, xmm0
    paddd xmm7, xmm5

    movdqa xmm4, xmm6
    movdqa xmm5, xmm7
    psrldq xmm4, 8
    psrldq xmm5, 8
    paddd xmm6, xmm4
    paddd xmm7, xmm5

    mov rdi, arg(4)               ; [SSE]
    mov rax, arg(5)               ; [Sum]
    movd DWORD PTR [rdi], xmm6
    movd DWORD PTR [rax], xmm7

    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret