vp9_error_sse2.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"
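
; x86inc.asm supplies the cglobal/INIT_XMM macro layer used below: ABI-aware
; prologues, named arguments and cross-platform register name remapping.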

SECTION .text

; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
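;
; Roughly equivalent scalar logic (illustrative sketch only, using the argument
; names from the prototype above, not any particular C implementation):
;   int64_t sse = 0, sqc = 0;
;   for (i = 0; i < block_size; i++) {
;     const int diff = dqcoeff[i] - coeff[i];
;     sse += diff * diff;
;     sqc += coeff[i] * coeff[i];
;   }
;   *ssz = sqc;
;   return sse;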

INIT_XMM sse2
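; cglobal operands below: function name, argument count loaded into GPRs,
; GPRs used, XMM registers used, then the argument names (uqc = coeff,
; dqc = dqcoeff)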
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator
  pxor      m6, m6                 ; ssz accumulator
  pxor      m5, m5                 ; dedicated zero register
  lea       uqcq, [uqcq+sizeq*2]
  lea       dqcq, [dqcq+sizeq*2]
  neg       sizeq
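  ; both pointers now point at the end of their buffers and sizeq is negative,
  ; so the loop index counts up from -block_size towards 0; a single add+jl
  ; then serves as both index update and loop test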
.loop:
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
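  ; (pmaddwd squares each signed word and adds adjacent products, leaving
  ; four 32bit partial sums per register)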
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  pmaddwd   m2, m2
  pmaddwd   m3, m3
  ; accumulate in 64bit
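  ; interleaving each dword with the zero register (m5) zero-extends the
  ; non-negative 32bit partial sums into 64bit lanes, which paddq can then
  ; accumulate without overflow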
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m7
  punpckldq m7, m2, m5
  paddq     m4, m1
  punpckhdq m2, m5
  paddq     m6, m7
  punpckldq m7, m3, m5
  paddq     m6, m2
  punpckhdq m3, m5
  paddq     m6, m7
  paddq     m6, m3
  add       sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
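  ; (movhlps copies the upper qword of each accumulator into the low half of
  ; a scratch register; one paddq then folds the two 64bit lanes into one total)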
  movhlps   m5, m4
  movhlps   m7, m6
  paddq     m4, m5
  paddq     m6, m7
%if ARCH_X86_64
  movq      rax, m4
  movq      [sszq], m6
%else
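  ; 32bit ABI: the 64bit return value goes back in edx:eax, and ssz is only
  ; reachable through its stack slot (sszm)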
  mov       eax, sszm
  pshufd    m5, m4, 0x1
  movq      [eax], m6
  movd      eax, m4
  movd      edx, m5
%endif
  RET

; Compute the sum of squared difference between two int16_t vectors.
; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
;                            intptr_t block_size)
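;
; Roughly equivalent scalar logic (illustrative sketch only):
;   int64_t sse = 0;
;   for (i = 0; i < block_size; i++) {
;     const int diff = dqcoeff[i] - coeff[i];
;     sse += diff * diff;
;   }
;   return sse;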

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
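  ; same structure as block_error above, minus the ssz path, so only one
  ; 64bit accumulator (m4) is needed and 6 XMM registers suffice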
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
  lea       uqcq, [uqcq+sizeq*2]
  lea       dqcq, [dqcq+sizeq*2]
  neg       sizeq
.loop:
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; accumulate in 64bit
  punpckldq m3, m0, m5
  punpckhdq m0, m5
  paddq     m4, m3
  punpckldq m3, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m3
  paddq     m4, m1
  add       sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  paddq     m4, m5
%if ARCH_X86_64
  movq      rax, m4
%else
  pshufd    m5, m4, 0x1
  movd      eax, m4
  movd      edx, m5
%endif
  RET