vp9_error_sse2.asm 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %define private_prefix vp9
  11. %include "third_party/x86inc/x86inc.asm"
  12. %include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
  13. SECTION .text
  ; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff,
  ;                         intptr_t block_size, int64_t *ssz)
  ;
  ; Walks both coefficient arrays 16 elements per iteration and produces two
  ; 64-bit sums:
  ;   return value = sum of (dqcoeff[i] - coeff[i])^2
  ;   *ssz         = sum of coeff[i]^2
  ; The loop is bottom-tested: one 16-element batch is always processed, and
  ; it repeats while the remaining count is still above zero (signed test).
  ;
  ; NOTE(review): the prototype above says int16_t, but every load goes through
  ; LOAD_TRAN_LOW (bitdepth_conversion_sse2.asm) - presumably the element type
  ; is really tran_low_t; confirm against the C declaration.
  ;
  ; Register roles:
  ;   m0, m1  dqcoeff halves -> differences -> pairwise sums of squares
  ;   m2, m3  coeff halves   -> pairwise sums of squares
  ;   m4      sse accumulator (two 64-bit lanes)
  ;   m5      all-zero, interleaved with dwords to widen them to qwords
  ;   m6      ssz accumulator (two 64-bit lanes)
  ;   m7      scratch
  ; Only three arguments are requested as registers from cglobal; ssz is
  ; reached through sszq on x86-64 and re-read from sszm on x86-32.
  INIT_XMM sse2
  cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
      pxor      m5, m5                  ; constant zero for the dword->qword widening
      pxor      m4, m4                  ; sse = 0
      pxor      m6, m6                  ; ssz = 0
  .loop:
      LOAD_TRAN_LOW 2, uqcq, 0          ; m2 = coeff, first half of the batch
      LOAD_TRAN_LOW 0, dqcq, 0          ; m0 = dqcoeff, first half of the batch
      LOAD_TRAN_LOW 3, uqcq, 8          ; m3 = coeff, second half of the batch
      LOAD_TRAN_LOW 1, dqcq, 8          ; m1 = dqcoeff, second half of the batch
      INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
      INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16

      ; ---- sse: squared differences --------------------------------------
      psubw     m0, m2                  ; diff = dqcoeff - coeff
      psubw     m1, m3
      ; Each difference is taken to be at most 15 bits plus sign, so a square
      ; is 30 bits and the pair-sum produced by pmaddwd stays within 31 bits.
      pmaddwd   m0, m0
      pmaddwd   m1, m1
      ; Adding two such 31-bit values still fits an unsigned 32-bit lane.
      paddd     m0, m1
      ; Zero-extend the four dword sums to qwords and accumulate.
      punpckldq m7, m0, m5
      punpckhdq m0, m5
      paddq     m4, m7
      paddq     m4, m0

      ; ---- ssz: squared source coefficients ------------------------------
      pmaddwd   m2, m2
      pmaddwd   m3, m3
      paddd     m2, m3
      punpckldq m7, m2, m5
      punpckhdq m2, m5
      paddq     m6, m7
      paddq     m6, m2

      sub       sizeq, 16               ; one batch consumed; flags feed the jg below
      jg .loop

      ; Fold the two 64-bit lanes of each accumulator into its low lane.
      movhlps   m5, m4
      paddq     m4, m5
      movhlps   m7, m6
      paddq     m6, m7
  %if ARCH_X86_64
      movq      [sszq], m6              ; *ssz = ssz total
      movq      rax, m4                 ; return sse
  %else
      mov       eax, sszm               ; ssz pointer was never loaded; fetch it now
      movq      [eax], m6               ; *ssz = ssz total
      pshufd    m5, m4, 0x1             ; bring bits 63:32 of sse down to dword 0
      movd      eax, m4                 ; return value, low half
      movd      edx, m5                 ; return value, high half
  %endif
      RET
  ; Compute the sum of squared differences between two tran_low_t vectors.
  ; Vectors are converted (if necessary) to int16_t for calculations.
  ; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
  ;                            intptr_t block_size)
  ;
  ; Returns sum of (dqcoeff[i] - coeff[i])^2 as a 64-bit value: in rax on
  ; x86-64, in edx:eax on x86-32.
  ; Elements are consumed 16 per iteration and the loop is bottom-tested, so
  ; one batch is always processed. The exit test is a signed "greater than
  ; zero" (jg), the same test vp9_block_error uses, so a block_size that is
  ; zero or not a multiple of 16 still terminates instead of stepping past
  ; zero and running on. For positive multiples of 16 the iteration count is
  ; exactly block_size / 16.
  ;
  ; Register roles:
  ;   m0, m1  dqcoeff halves -> differences -> pairwise sums of squares
  ;   m2, m3  coeff halves (m3 is reused as scratch once the diff is taken)
  ;   m4      sse accumulator (two 64-bit lanes)
  ;   m5      all-zero, interleaved with dwords to widen them to qwords
  INIT_XMM sse2
  cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
      pxor      m4, m4                  ; sse accumulator
      pxor      m5, m5                  ; dedicated zero register
  .loop:
      LOAD_TRAN_LOW 2, uqcq, 0          ; m2 = coeff, first half of the batch
      LOAD_TRAN_LOW 0, dqcq, 0          ; m0 = dqcoeff, first half of the batch
      LOAD_TRAN_LOW 3, uqcq, 8          ; m3 = coeff, second half of the batch
      LOAD_TRAN_LOW 1, dqcq, 8          ; m1 = dqcoeff, second half of the batch
      INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
      INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
      psubw     m0, m2                  ; diff = dqcoeff - coeff
      psubw     m1, m3
      ; individual errors are max. 15bit+sign, so squares are 30bit, and
      ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
      pmaddwd   m0, m0
      pmaddwd   m1, m1
      ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
      paddd     m0, m1
      ; accumulate in 64bit: zero-extend the four dword sums to qwords
      punpckldq m3, m0, m5
      punpckhdq m0, m5
      paddq     m4, m3
      paddq     m4, m0
      ; Only SSE instructions separated the original sub from its branch, and
      ; they leave EFLAGS alone, so the counter update sits next to its test.
      sub       sizeq, 16
      jg .loop                          ; was jnz: only stopped on an exact zero
      ; accumulate horizontally and store in return value
      movhlps   m5, m4
      paddq     m4, m5
  %if ARCH_X86_64
      movq      rax, m4
  %else
      pshufd    m5, m4, 0x1             ; bring bits 63:32 of the sum down to dword 0
      movd      eax, m4                 ; return value, low half
      movd      edx, m5                 ; return value, high half
  %endif
      RET
  106. RET