;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero, for byte->word unpacking
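
    ; The loop below blends one 16-pixel row per iteration. In rough C
    ; terms (MFQE_PRECISION is 4, so tMFQE is 16 and tMFQE_round is 8):
    ;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4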
.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero, for byte->word unpacking
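
    ; Same blend as the 16x16 version, one 8-pixel row per iteration:
    ;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4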
.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
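    ; pack against zero (xmm4); only the low 8 bytes are stored below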
    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
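
    ; Each row contributes to three running totals:
    ;   xmm3 += SAD(src1 row, src2 row)     (psadbw)
    ;   xmm4 += sum of src2 row             (psadbw against zero)
    ;   xmm5 += sum of squared src2 row     (zero-extend + pmaddwd)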
.accumulate:
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2
    ; pmaddubsw would be ideal if it took two unsigned values, but it
    ; expects one signed and one unsigned operand. Zero-extend the bytes
    ; to words and use pmaddwd instead.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate
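
    ; psadbw leaves one partial sum in the low word of each qword, so the
    ; two halves of the SAD and sum accumulators still have to be combined.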
    ; phaddd (SSSE3) only sums adjacent double words, so combine the two
    ; qword halves with a shift and add instead.
    ; Finalize SAD: average over the 256 pixels, (sad + 128) >> 8, and store.
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)
    movd        [rax], xmm0
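
    ; The remaining work computes the per-pixel variance of src2, roughly
    ;   variance = (sum(src2^2) - sum(src2)^2 / 256 + 128) >> 8
    ; i.e. E[x^2] - E[x]^2 over the 256 pixels, rounded.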
    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4

    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8
    ; phaddd could be used to sum adjacent double words, but we want all
    ; four summed. promote the double words to quad words, accumulate,
    ; shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5

    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)
    movd        [rax], xmm1
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16

t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif

align 16
tMFQE:                                      ; 1 << MFQE_PRECISION
    times 8 dw 0x10

align 16
tMFQE_round:                                ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08