; block_error_sse2.asm
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. SECTION .text
  ;-----------------------------------------------------------------------
  ; int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr)
  ;
  ; Sum of squared differences between the 16 signed 16-bit values at
  ; coeff_ptr and the 16 values at dcoef_ptr.
  ;
  ; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr. Both are read with
  ;        movdqa, so each buffer must be 16-byte aligned.
  ; Out:   rax = sum; it is accumulated in 32-bit lanes, so the upper
  ;        32 bits of rax are zero.
  ; Uses:  xmm0-xmm3, xmm5; rsi and rdi are pushed and popped here.
  ;        NOTE(review): whatever SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS
  ;        touch is defined in the included ABI-support file.
  ;-----------------------------------------------------------------------
  global sym(vp8_block_error_sse2) PRIVATE
  sym(vp8_block_error_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 2
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)             ; rsi = coeff_ptr
          mov         rdi, arg(1)             ; rdi = dcoef_ptr

          ; values 0..7: subtract, then square and pair-sum into 4 dwords
          movdqa      xmm0, [rsi]
          movdqa      xmm1, [rdi]
          psubw       xmm0, xmm1
          pmaddwd     xmm0, xmm0

          ; values 8..15: same treatment
          movdqa      xmm2, [rsi+16]
          movdqa      xmm3, [rdi+16]
          psubw       xmm2, xmm3
          pmaddwd     xmm2, xmm2

          paddd       xmm0, xmm2              ; xmm0 = four partial sums

          ; horizontal add of the four dwords in xmm0
          pxor        xmm5, xmm5              ; zero, used to widen dwords
          movdqa      xmm1, xmm0
          punpckldq   xmm0, xmm5              ; [s0, 0, s1, 0]
          punpckhdq   xmm1, xmm5              ; [s2, 0, s3, 0]
          paddd       xmm0, xmm1              ; [s0+s2, 0, s1+s3, 0]

          movdqa      xmm1, xmm0
          psrldq      xmm0, 8                 ; bring s1+s3 down to lane 0
          paddd       xmm0, xmm1              ; lane 0 = s0+s1+s2+s3

          movq        rax, xmm0               ; lane 1 is zero, so rax = total

          pop         rdi
          pop         rsi
          ; begin epilog
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;-----------------------------------------------------------------------
  ; int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
  ;
  ; Sum of squared differences over 16 consecutive groups of 16 signed
  ; 16-bit values (32 bytes per group) from coeff_ptr and dcoef_ptr.
  ;
  ; The differences of the first 8 values of every group are ANDed with a
  ; mask derived from dc: a 16-bit lane of the mask is all-ones when the
  ; matching 16-bit word of dc (zero-extended to 128 bits) is zero, and
  ; all-zeros otherwise. Hence dc == 0 keeps every value, while dc == 1
  ; removes the first value of each group from the sum.
  ;
  ; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr (both read with movdqa,
  ;        so 16-byte alignment is required), arg(2) = dc
  ; Out:   rax = sum; accumulated in 32-bit lanes, upper 32 bits are zero
  ; Uses:  rcx, flags, xmm0-xmm6; rsi and rdi are pushed and popped here.
  ;        xmm6 is bracketed by SAVE_XMM 6 / RESTORE_XMM.
  ;        NOTE(review): those macros live in the included ABI-support
  ;        file; presumably they preserve xmm6 where the ABI requires it.
  ;-----------------------------------------------------------------------
  global sym(vp8_mbblock_error_sse2_impl) PRIVATE
  sym(vp8_mbblock_error_sse2_impl):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 3
          SAVE_XMM 6
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)             ; rsi = coeff_ptr
          mov         rdi, arg(1)             ; rdi = dcoef_ptr

          pxor        xmm6, xmm6              ; xmm6 = constant zero
          pxor        xmm4, xmm4              ; xmm4 = running sum (4 dwords)

          ; build the per-word mask from dc
          movd        xmm5, dword ptr arg(2)  ; dc in the low dword, rest zero
          por         xmm5, xmm4              ; xmm4 is zero here: no change
          pcmpeqw     xmm5, xmm6              ; 0xFFFF where the word is zero

          mov         rcx, 16                 ; groups remaining
  .next_group:
          ; first 8 values of the group: masked difference, squared
          movdqa      xmm0, [rsi]
          movdqa      xmm1, [rdi]
          psubw       xmm0, xmm1
          pand        xmm0, xmm5
          pmaddwd     xmm0, xmm0

          ; last 8 values of the group: unmasked
          movdqa      xmm2, [rsi+16]
          movdqa      xmm3, [rdi+16]
          psubw       xmm2, xmm3
          pmaddwd     xmm2, xmm2

          paddd       xmm4, xmm2
          paddd       xmm4, xmm0

          add         rsi, 32
          add         rdi, 32
          sub         rcx, 1
          jnz         .next_group

          ; horizontal add of the four dwords in xmm4
          movdqa      xmm0, xmm4
          punpckldq   xmm0, xmm6              ; [s0, 0, s1, 0]
          punpckhdq   xmm4, xmm6              ; [s2, 0, s3, 0]
          paddd       xmm0, xmm4              ; [s0+s2, 0, s1+s3, 0]

          movdqa      xmm1, xmm0
          psrldq      xmm0, 8                 ; bring s1+s3 down to lane 0
          paddd       xmm0, xmm1              ; lane 0 = s0+s1+s2+s3

          movq        rax, xmm0               ; lane 1 is zero, so rax = total

          pop         rdi
          pop         rsi
          ; begin epilog
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;-----------------------------------------------------------------------
  ; int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
  ;
  ; Sum of squared differences between 128 signed 16-bit values at s_ptr
  ; and 128 at d_ptr, processed as 16 steps of 8 values (16 bytes) each.
  ;
  ; In:    arg(0) = s_ptr, arg(1) = d_ptr. Both are read with movdqa, so
  ;        each buffer must be 16-byte aligned.
  ; Out:   rax = sum; accumulated in 32-bit lanes, upper 32 bits are zero
  ; Uses:  rcx, flags, xmm0-xmm3; rsi and rdi are pushed and popped here.
  ;        NOTE(review): whatever SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS
  ;        touch is defined in the included ABI-support file.
  ;-----------------------------------------------------------------------
  global sym(vp8_mbuverror_sse2_impl) PRIVATE
  sym(vp8_mbuverror_sse2_impl):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 2
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)             ; rsi = s_ptr
          mov         rdi, arg(1)             ; rdi = d_ptr

          pxor        xmm3, xmm3              ; xmm3 = running sum (4 dwords)
          mov         rcx, 16                 ; 16-byte steps remaining
  .next_step:
          movdqa      xmm1, [rsi]
          add         rsi, 16
          movdqa      xmm2, [rdi]
          add         rdi, 16

          psubw       xmm1, xmm2              ; 8 word differences
          pmaddwd     xmm1, xmm1              ; squared, pair-summed to dwords
          paddd       xmm3, xmm1

          dec         rcx
          jnz         .next_step

          ; horizontal add of the four dwords in xmm3
          pxor        xmm0, xmm0              ; zero, used to widen dwords
          movdqa      xmm1, xmm3
          movdqa      xmm2, xmm3
          punpckldq   xmm1, xmm0              ; [s0, 0, s1, 0]
          punpckhdq   xmm2, xmm0              ; [s2, 0, s3, 0]
          paddd       xmm1, xmm2              ; [s0+s2, 0, s1+s3, 0]

          movdqa      xmm2, xmm1
          psrldq      xmm1, 8                 ; bring s1+s3 down to lane 0
          paddd       xmm1, xmm2              ; lane 0 = s0+s1+s2+s3

          movq        rax, xmm1               ; lane 1 is zero, so rax = total

          pop         rdi
          pop         rsi
          ; begin epilog
          UNSHADOW_ARGS
          pop         rbp
          ret