; vp9_highbd_error_sse2.asm
  ;
  ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ;
  ; Use of this source code is governed by a BSD-style license
  ; that can be found in the LICENSE file in the root of the source
  ; tree. An additional intellectual property rights grant can be found
  ; in the file PATENTS. All contributing project authors may
  ; be found in the AUTHORS file in the root of the source tree.
  ;
  %define private_prefix vp9

  %include "third_party/x86inc/x86inc.asm"

  SECTION .text
  ALIGN 16

  ;---------------------------------------------------------------------------
  ; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
  ;                                     intptr_t block_size, int64_t *ssz)
  ;
  ; Returns sum((dqcoeff[i] - coeff[i])^2) over the block and stores
  ; sum(coeff[i]^2) to *ssz. Both sums are accumulated in 64 bits.
  ;
  ; Preconditions visible in the code:
  ;   - Each iteration consumes four vector registers' worth of 32-bit
  ;     coefficients from each array, and the loop only exits when the index
  ;     lands exactly on zero, so block_size has to be a positive multiple of
  ;     the per-iteration count (16 coefficients for XMM registers).
  ;   - Coefficients are narrowed to 16 bits with signed saturation before any
  ;     arithmetic, so results are exact only for inputs that fit in int16.
  ;   - Loads use mova and memory-operand packssdw; presumably both arrays are
  ;     expected to be 16-byte aligned -- confirm against the callers.
  ;
  ; Register roles:
  ;   m4  squared-error accumulator        (two 64-bit lanes)
  ;   m6  squared-coefficient accumulator  (two 64-bit lanes)
  ;   m5  all zeros inside the loop, used to widen dwords to qwords
  ;   m7  scratch
  ;
  ; NOTE(review): cglobal declares only three loaded arguments; the x86-64
  ; path uses sszq directly (presumably because the 4th argument is already
  ; in a register there), while the x86-32 path fetches it from sszm.
  ;---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
      pxor      m5, m5                          ; constant zero
      pxor      m4, m4                          ; error sum   = 0
      pxor      m6, m6                          ; coeff^2 sum = 0

      ; Point both pointers one past the end of their arrays and walk a
      ; negative index up towards zero; the loop ends when it reaches zero.
      lea       dqcq, [dqcq+sizeq*4]
      lea       uqcq, [uqcq+sizeq*4]
      neg       sizeq

  ALIGN 16
  .loop:
      ; First half of this iteration: two vectors of dwords -> one of words.
      mova      m2, [uqcq+sizeq*4]
      packssdw  m2, [uqcq+sizeq*4+mmsize]
      mova      m0, [dqcq+sizeq*4]
      packssdw  m0, [dqcq+sizeq*4+mmsize]

      ; Second half of this iteration.
      mova      m3, [uqcq+sizeq*4+mmsize*2]
      packssdw  m3, [uqcq+sizeq*4+mmsize*3]
      mova      m1, [dqcq+sizeq*4+mmsize*2]
      packssdw  m1, [dqcq+sizeq*4+mmsize*3]

      ; Individual errors are expected to be at most 15 bits plus sign, so
      ; each square needs 30 bits and the pairwise sum produced by pmaddwd
      ; fits in 31 bits (leaving the sign bit unused).
      psubw     m0, m2                          ; m0 = dqcoeff - coeff
      psubw     m1, m3                          ; m1 = dqcoeff - coeff
      pmaddwd   m0, m0                          ; pairwise sum of error^2
      pmaddwd   m1, m1
      pmaddwd   m2, m2                          ; pairwise sum of coeff^2
      pmaddwd   m3, m3

      ; Zero-extend every dword sum to a qword (interleave with m5 == 0)
      ; and add it to the matching 64-bit accumulator.
      punpckldq m7, m0, m5
      punpckhdq m0, m5
      paddq     m4, m7
      paddq     m4, m0

      punpckldq m7, m1, m5
      punpckhdq m1, m5
      paddq     m4, m7
      paddq     m4, m1

      punpckldq m7, m2, m5
      punpckhdq m2, m5
      paddq     m6, m7
      paddq     m6, m2

      punpckldq m7, m3, m5
      punpckhdq m3, m5
      paddq     m6, m7
      paddq     m6, m3

      ; The index counts coefficients and is scaled by 4 in the addresses, so
      ; advancing it by mmsize steps over the 4*mmsize bytes just consumed.
      add       sizeq, mmsize
      jnz       .loop

      ; Fold the upper 64-bit lane of each accumulator into the lower one.
      ; m5 is no longer needed as zero past this point.
      movhlps   m5, m4
      movhlps   m7, m6
      paddq     m4, m5
      paddq     m6, m7

  %if ARCH_X86_64
      movq      [sszq], m6                      ; *ssz = sum of coeff^2
      movq      rax, m4                         ; return sum of error^2
  %else
      pshufd    m5, m4, 0x1                     ; m5[31:0] = bits 63:32 of the sum
      mov       eax, sszm                       ; eax = ssz pointer (from the stack)
      movq      [eax], m6                       ; *ssz = sum of coeff^2
      movd      edx, m5                         ; 64-bit result is returned in edx:eax
      movd      eax, m4
  %endif
      RET