vp9_highbd_error_avx.asm

;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"

SECTION .text
ALIGN 16

;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;
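;
; For orientation, a rough C sketch of what this routine computes (inferred
; from the assembly below, not taken from the library's C reference): the
; return value is the sum of squared differences between coeff and dqcoeff,
; and *ssz receives the sum of squared coeff values. sat16() is a placeholder
; for the signed 16-bit saturation performed by packssdw.
;
;   int64_t error = 0, sqcoeff = 0;
;   for (intptr_t i = 0; i < block_size; i++) {
;     const int c = sat16(coeff[i]);
;     const int d = sat16(dqcoeff[i]);
;     const int diff = c - d;
;     error   += (int64_t)diff * diff;  // returned in rax (edx:eax on x86-32)
;     sqcoeff += (int64_t)c * c;        // stored through *ssz
;   }
;   *ssz = sqcoeff;
;   return error;
;
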
INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
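  ; cglobal (from x86inc.asm) declares the function with 4 arguments, up to 5
  ; general-purpose and 8 XMM registers; uqc and dqc correspond to the coeff
  ; and dqcoeff pointers of the prototype above.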
  vzeroupper

  ; If only one iteration is required, then handle this as a special case.
  ; It is the most frequent case, so we can have a significant gain here
  ; by not setting up a loop and accumulators.
  cmp       sizeq, 16
  jne       .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Common case of size == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; Load input vectors
  mova      xm0, [dqcq]
  packssdw  xm0, [dqcq+16]
  mova      xm2, [uqcq]
  packssdw  xm2, [uqcq+16]
  mova      xm1, [dqcq+32]
  packssdw  xm1, [dqcq+48]
  mova      xm3, [uqcq+32]
  packssdw  xm3, [uqcq+48]

  ; Compute the errors.
  psubw     xm0, xm2
  psubw     xm1, xm3

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
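  ; (Worked out: |e| <= 2^15 - 1, so e*e < 2^30; pmaddwd adds two adjacent
  ; products per lane, giving sums strictly below 2^31.)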
  pmaddwd   xm2, xm2
  pmaddwd   xm3, xm3
  pmaddwd   xm0, xm0
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32 bits.
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; Accumulate horizontally in 64 bits; there is no chance of overflow here.
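  ; pblendw with mask 0x33 keeps words 0-1 and 4-5 from the source and zeroes
  ; the rest, so the even dword of each 64-bit lane is zero-extended to a
  ; qword; the psrlq by 32 does the same for the odd dwords.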
  pxor      xm5, xm5
  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits
  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits
  paddq     xm2, xm3
  paddq     xm0, xm1
  psrldq    xm3, xm2, 8
  psrldq    xm1, xm0, 8
  paddq     xm2, xm3
  paddq     xm0, xm1

  ; Store the return value
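  ; (On 32-bit targets the int64_t result is returned in edx:eax, hence the
  ; movd/pextrd pair below.)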
%if ARCH_X86_64
  movq      rax, xm0
  movq      [sszq], xm2
%else
  movd      eax, xm0
  pextrd    edx, xm0, 1
  movq      [sszd], xm2
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, speculative low precision
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
.generic:
  pxor      xm4, xm4 ; sse accumulator
  pxor      xm5, xm5 ; overflow detection register for xm4
  pxor      xm6, xm6 ; ssz accumulator
  pxor      xm7, xm7 ; overflow detection register for xm6
  lea       uqcq, [uqcq+sizeq*4]
  lea       dqcq, [dqcq+sizeq*4]
  neg       sizeq
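
  ; The pointers now address one past the end of the buffers and sizeq is
  ; negative, so [uqcq+sizeq*4] starts at the first element; the loop counts
  ; sizeq up towards zero.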
  ; Push the negative size as the high precision code might need it
  push      sizeq

.loop:
  ; Load input vectors
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]
  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]
  add       sizeq, 16
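  ; This add also sets the flags consumed by the jnz at the bottom of the
  ; loop; the SSE/AVX instructions in between do not modify EFLAGS.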

  ; Compute the squared errors.
  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0
  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32 bits.
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; We accumulate using 32 bit arithmetic, but detect potential overflow
  ; by checking whether the MSB of either accumulator has ever been set.
  ; If it has, we redo the whole computation at the end in higher precision,
  ; but this happens extremely rarely, so we still achieve a net gain.
  paddd     xm4, xm0
  paddd     xm6, xm2
  por       xm5, xm4 ; OR in the accumulator for overflow detection
  por       xm7, xm6 ; OR in the accumulator for overflow detection

  jnz       .loop

  ; Add pairs horizontally (still only on 32 bits).
  phaddd    xm4, xm4
  por       xm5, xm4 ; OR in the accumulator for overflow detection
  phaddd    xm6, xm6
  por       xm7, xm6 ; OR in the accumulator for overflow detection

  ; Check for possible overflow by testing whether bit 31 (the MSB) of any
  ; dword lane has ever been set. If it has not, there was no overflow and
  ; the final sum fits in 32 bits. If overflow happened, we redo the whole
  ; computation in higher precision.
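  ; pmovmskb collects the top bit of every byte into a 16-bit mask, so
  ; testing against 0x8888 picks out the highest byte of each dword, i.e.
  ; bit 31 of each 32-bit lane.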
  por       xm7, xm5
  pmovmskb  r4, xm7
  test      r4, 0x8888
  jnz       .highprec
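
  ; No overflow: finish the horizontal add in 32 bits and zero-extend the
  ; final sums to 64 bits for the return values.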
  phaddd    xm4, xm4
  phaddd    xm6, xm6
  pmovzxdq  xm4, xm4
  pmovzxdq  xm6, xm6

  ; Restore stack
  pop       sizeq

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq      [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq      [sszd], xm6
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, high precision case
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
  pxor      xm4, xm4 ; sse accumulator
  pxor      xm5, xm5 ; dedicated zero register
  pxor      xm6, xm6 ; ssz accumulator
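  ; Reload the negative size that was pushed before the speculative pass so
  ; the loop restarts from the beginning of the buffers.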
  pop       sizeq

.loophp:
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]
  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]
  add       sizeq, 16

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0
  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Accumulate in 64 bits.
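  ; Interleaving each vector of 32-bit sums with the zero register widens the
  ; unsigned values to 64 bits (punpckldq gives the low two lanes, punpckhdq
  ; the high two), so they can be added with paddq without overflow.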
  punpckldq xm7, xm0, xm5
  punpckhdq xm0, xm5
  paddq     xm4, xm7
  punpckldq xm7, xm2, xm5
  punpckhdq xm2, xm5
  paddq     xm6, xm7
  punpckldq xm7, xm1, xm5
  punpckhdq xm1, xm5
  paddq     xm4, xm7
  punpckldq xm7, xm3, xm5
  punpckhdq xm3, xm5
  paddq     xm6, xm7
  paddq     xm4, xm0
  paddq     xm4, xm1
  paddq     xm6, xm2
  paddq     xm6, xm3

  jnz       .loophp

  ; Accumulate horizontally.
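  ; movhlps copies the upper 64-bit half of each accumulator into the low
  ; half of a scratch register, so a single paddq leaves the full total in
  ; the low qword.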
  movhlps   xm5, xm4
  movhlps   xm7, xm6
  paddq     xm4, xm5
  paddq     xm6, xm7

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq      [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq      [sszd], xm6
%endif
  RET
END