; mfqe_sse2.asm

;
; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
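;
; A scalar C sketch of the blend this kernel performs (illustrative only,
; not part of the build; MFQE_PRECISION is 4, so the two weights sum to 16):
;
;    for (r = 0; r < 16; r++) {
;        for (c = 0; c < 16; c++)
;            dst[c] = (src[c] * src_weight +
;                      dst[c] * (16 - src_weight) + 8) >> 4;
;        src += src_stride;
;        dst += dst_stride;
;    }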
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight
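    ; tMFQE holds eight words of 16 (1 << MFQE_PRECISION), so every word of
    ; xmm1 is now 16 - src_weight: the source and destination weights always
    ; sum to 16.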

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

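    ; Each row is processed as two 8-pixel halves: punpck{l,h}bw against the
    ; zero register widens bytes to words, so the worst-case product below
    ; (255 * 16 = 4080) fits easily in pmullw's 16-bit lanes.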
.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
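;
; Same blend as the 16x16 kernel above, but each row is only 8 bytes wide,
; so one punpcklbw/pmullw pair covers a whole row and only the volatile
; registers xmm0-xmm4 are touched (hence no SAVE_XMM in the prolog).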
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
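;
; A rough scalar equivalent (sketch only, not part of the build), where the
; block holds 256 pixels:
;
;    sad = 0; sum = 0; sse = 0;
;    for (each of the 256 pixel pairs) {
;        sad += abs(src1[i] - src2[i]);
;        sum += src2[i];
;        sse += src2[i] * src2[i];
;    }
;    *sad      = (sad + 128) >> 8;                       // rounded mean SAD
;    *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;  // rounded variance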
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate:
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2
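    ; psadbw against an all-zero register computes |x - 0| = x per byte and
    ; horizontally adds each 8-byte group, leaving the row's byte sums in
    ; the low word of each 64-bit half.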

    ; pmaddubsw would be ideal if it took two unsigned values. Instead,
    ; it expects one signed and one unsigned value, so we zero-extend
    ; the bytes and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8
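    ; Dividing by 256 (the pixel count) with rounding turns the block SAD
    ; into the average absolute difference per pixel.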
    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4

    ; Square the sum of src2. The product in the upper quadword is ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could be used to sum adjacent values but we want
    ; all the values summed. Promote to quad words, accumulate,
    ; shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5

    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0
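    ; xmm1 now holds sum(src2^2) - (sum(src2)^2 >> 8), which is 256 times
    ; the variance of src2 over the block.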

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)
    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16

t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08