vp9_quantize_ssse3_x86_64.asm 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %define private_prefix vp9
  11. %include "third_party/x86inc/x86inc.asm"
  12. %include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
  13. SECTION_RODATA
  ; pw_1: eight 16-bit words, each holding the value 1.
  ; NOTE(review): nothing in the code visible in this file references pw_1
  ; (the fp_32x32 path builds its vector of ones with pcmpeqw/psrlw instead);
  ; confirm whether an included file uses it before removing.
  14. pw_1: times 8 dw 1
  15. SECTION .text
  16. %macro QUANTIZE_FP 2
  ;---------------------------------------------------------------------------
  ; QUANTIZE_FP <variant>, <gpr count>
  ;   %1  name suffix of the generated function (quantize_%1); the value
  ;       fp_32x32 additionally enables the %ifidn-guarded code paths.
  ;   %2  forwarded as the third cglobal operand (presumably the number of
  ;       general-purpose registers to reserve, per x86inc -- confirm).
  ;
  ; Per group of 16 coefficients (two vectors of 8 words) the body computes:
  ;   qcoeff  = sign(c) * (((abs(c) + round) * quant) >> 16)   (paddsw/pmulhw)
  ;   dqcoeff = low 16 bits of qcoeff * dequant
  ;             (fp_32x32: sign(c) * ((abs(qcoeff) * dequant) >> 1))
  ;   eob     = max over lanes with a non-zero result of (iscan entry + 1)
  ; In the fp_32x32 variant round is first replaced by (round + 1) >> 1 and
  ; quant by quant << 1.
  ;
  ; Register roles as used below:
  ;   coeffq / r3 / r4  coeff, qcoeff, dqcoeff pointers, advanced past the end
  ;   r5                iscan pointer, advanced past the end
  ;   ncoeffq           negative index counting up to 0
  ;   r2                dequant pointer, then pmovmskb scratch, then eob pointer
  ;   r6                pmovmskb / pextrw scratch
  ;   m0                skip threshold (dequant >> 1, or >> 2 for fp_32x32)
  ;   m1 / m2 / m3      round / quant / dequant
  ;   m5                zero
  ;   m7                all ones (-1 per word); reused as a compare mask in the loop
  ;   m8                running per-lane eob maximum
  ;
  ; The arguments named skip and scan are declared but never read in this macro;
  ; the "scan[i]" comments below refer to data loaded through the iscan pointer.
  ;---------------------------------------------------------------------------
  17. cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
  18. qcoeff, dqcoeff, dequant, \
  19. eob, scan, iscan
  20. ; actual quantize loop - setup pointers, rounders, etc.
  21. movifnidn coeffq, coeffmp ; fetch coeff argument unless it is already in coeffq
  22. movifnidn ncoeffq, ncoeffmp ; likewise for ncoeff
  23. mov r2, dequantmp ; r2 = dequant pointer (r2 is reused as scratch later)
  24. movifnidn roundq, roundmp
  25. movifnidn quantq, quantmp
  26. mova m1, [roundq] ; m1 = round (aligned 16-byte load)
  27. mova m2, [quantq] ; m2 = quant (aligned 16-byte load)
  28. %ifidn %1, fp_32x32
  29. pcmpeqw m5, m5 ; m5 = all ones
  30. psrlw m5, 15 ; m5 = 1 in every word
  31. paddw m1, m5 ; m1 = round + 1
  32. psrlw m1, 1 ; m1 = (m1 + 1) / 2
  33. %endif
  34. mova m3, [r2q] ; m3 = dequant
  35. mov r3, qcoeffmp ; r3 = qcoeff pointer
  36. mov r4, dqcoeffmp ; r4 = dqcoeff pointer
  37. mov r5, iscanmp ; r5 = iscan pointer
  38. %ifidn %1, fp_32x32
  39. psllw m2, 1 ; m2 = quant * 2
  40. %endif
  41. pxor m5, m5 ; m5 = dedicated zero
  ; Move every pointer past its last element and negate the count, so that
  ; [ptr + index] with a negative index walks forward and the loop can end on
  ; the flags set by "add ncoeffq, mmsize".
  ; INCREMENT_ELEMENTS_TRAN_LOW comes from an included file; presumably it
  ; advances the pointer by ncoeff coefficient-sized elements -- confirm.
  42. INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
  43. lea r5q, [r5q+ncoeffq*2] ; r5 += ncoeff * 2 bytes (entries are read as 16-bit words)
  44. INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq
  45. INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq
  46. neg ncoeffq ; index = -ncoeff
  47. ; get DC and first 15 AC coeffs
  ; This first group uses round/quant/dequant as loaded for the first vector
  ; and their high 64 bits, replicated by punpckhqdq, for the second vector and
  ; for every later group. Presumably lane 0 carries the DC value and the
  ; remaining lanes the AC value -- confirm against the caller's tables.
  48. LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i]
  49. LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i]
  50. pabsw m6, m9 ; m6 = abs(m9)
  51. pabsw m11, m10 ; m11 = abs(m10)
  52. pcmpeqw m7, m7 ; m7 = -1 in every word; "psubw x, m7" below adds 1
  53. paddsw m6, m1 ; m6 += round
  54. punpckhqdq m1, m1 ; m1 = high 64 bits of round in both halves
  55. paddsw m11, m1 ; m11 += round
  56. pmulhw m8, m6, m2 ; m8 = m6*q>>16
  57. punpckhqdq m2, m2 ; m2 = high 64 bits of quant in both halves
  58. pmulhw m13, m11, m2 ; m13 = m11*q>>16
  59. psignw m8, m9 ; m8 = reinsert sign
  60. psignw m13, m10 ; m13 = reinsert sign
  ; Store the quantized values through r3 (qcoeff). The trailing 6, 11, 12
  ; are presumably scratch register numbers for the macro -- confirm; m6/m11
  ; are reloaded before their next use either way.
  61. STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12
  62. STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
  63. %ifidn %1, fp_32x32
  64. pabsw m8, m8 ; work on magnitudes so the shift by 1 below is unsigned
  65. pabsw m13, m13
  66. %endif
  67. pmullw m8, m3 ; m8 = qcoeff * dequant (low 16 bits); stored via r4 below
  68. punpckhqdq m3, m3 ; m3 = high 64 bits of dequant in both halves
  69. pmullw m13, m3 ; m13 = qcoeff * dequant (low 16 bits); stored via r4 below
  70. %ifidn %1, fp_32x32
  71. psrlw m8, 1 ; halve the dequantized magnitude
  72. psrlw m13, 1
  73. psignw m8, m9 ; restore the sign of the input coefficient
  74. psignw m13, m10
  75. psrlw m0, m3, 2 ; m0 = dequant >> 2: skip threshold used in the loop
  76. %else
  77. psrlw m0, m3, 1 ; m0 = dequant >> 1: skip threshold used in the loop
  78. %endif
  79. STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12
  80. STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
  ; The compares below test the value just stored through r4 (assuming
  ; STORE_TRAN_LOW leaves its source register intact -- confirm).
  81. pcmpeqw m8, m5 ; m8 = mask of lanes whose value == 0
  82. pcmpeqw m13, m5 ; m13 = mask of lanes whose value == 0
  83. mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = 8 words read through the iscan pointer
  84. mova m11, [ r5q+ncoeffq*2+16] ; m11 = next 8 words
  85. psubw m6, m7 ; m6 = scan[i] + 1
  86. psubw m11, m7 ; m11 = scan[i] + 1
  87. pandn m8, m6 ; m8 = (value != 0) ? scan[i] + 1 : 0
  88. pandn m13, m11 ; m13 = (value != 0) ? scan[i] + 1 : 0
  89. pmaxsw m8, m13 ; m8 = per-lane running eob maximum
  90. add ncoeffq, mmsize ; next group; mmsize is expected to equal the 16 coefficients handled per pass
  91. jz .accumulate_eob ; index reached 0: nothing left beyond the first group
  92. .ac_only_loop:
  ; Same computation as above for each remaining group of 16 coefficients,
  ; preceded by a cheap test: if no abs(c[i]) exceeds the threshold in m0,
  ; the whole group is written as zeros in .skip_iter.
  93. LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i]
  94. LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i]
  95. pabsw m6, m9 ; m6 = abs(m9)
  96. pabsw m11, m10 ; m11 = abs(m10)
  97. pcmpgtw m7, m6, m0 ; m7 = abs(c[i]) > threshold (overwrites the all-ones constant)
  98. pcmpgtw m12, m11, m0 ; m12 = abs(c[i]) > threshold
  99. pmovmskb r6d, m7 ; gather the compare results into integer bit masks
  100. pmovmskb r2d, m12
  101. or r6, r2 ; ZF set when no lane in either vector exceeds the threshold
  102. jz .skip_iter
  103. pcmpeqw m7, m7 ; restore m7 = all ones for the "+ 1" below
  104. paddsw m6, m1 ; m6 += round
  105. paddsw m11, m1 ; m11 += round
  106. pmulhw m14, m6, m2 ; m14 = m6*q>>16
  107. pmulhw m13, m11, m2 ; m13 = m11*q>>16
  108. psignw m14, m9 ; m14 = reinsert sign
  109. psignw m13, m10 ; m13 = reinsert sign
  110. STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12
  111. STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
  112. %ifidn %1, fp_32x32
  113. pabsw m14, m14 ; work on magnitudes so the shift by 1 below is unsigned
  114. pabsw m13, m13
  115. %endif
  116. pmullw m14, m3 ; m14 = qcoeff * dequant (low 16 bits); stored via r4 below
  117. pmullw m13, m3 ; m13 = qcoeff * dequant (low 16 bits); stored via r4 below
  118. %ifidn %1, fp_32x32
  119. psrlw m14, 1 ; halve the dequantized magnitude
  120. psrlw m13, 1
  121. psignw m14, m9 ; restore the sign of the input coefficient
  122. psignw m13, m10
  123. %endif
  124. STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12
  125. STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
  126. pcmpeqw m14, m5 ; m14 = mask of lanes whose value == 0
  127. pcmpeqw m13, m5 ; m13 = mask of lanes whose value == 0
  128. mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = 8 words read through the iscan pointer
  129. mova m11, [ r5q+ncoeffq*2+16] ; m11 = next 8 words
  130. psubw m6, m7 ; m6 = scan[i] + 1
  131. psubw m11, m7 ; m11 = scan[i] + 1
  132. pandn m14, m6 ; m14 = (value != 0) ? scan[i] + 1 : 0
  133. pandn m13, m11 ; m13 = (value != 0) ? scan[i] + 1 : 0
  134. pmaxsw m8, m14 ; fold both vectors into the running maximum
  135. pmaxsw m8, m13
  136. add ncoeffq, mmsize ; next group of coefficients
  137. jl .ac_only_loop ; index still negative: more groups remain
  138. jmp .accumulate_eob
  139. .skip_iter:
  ; No coefficient in this group exceeded the threshold: write zeros (m5) to
  ; both outputs and leave the running eob maximum in m8 untouched.
  140. STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
  141. STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
  142. STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
  143. STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
  144. add ncoeffq, mmsize ; next group of coefficients
  145. jl .ac_only_loop ; otherwise fall through to the reduction
  146. .accumulate_eob:
  147. ; horizontally accumulate/max eobs and write into [eob] memory pointer
  148. mov r2, eobmp ; r2 = eob output pointer
  149. pshufd m7, m8, 0xe ; bring dwords 2-3 down to dwords 0-1
  150. pmaxsw m8, m7 ; words 0-3 = max of low and high halves
  151. pshuflw m7, m8, 0xe ; bring words 2-3 down to words 0-1
  152. pmaxsw m8, m7 ; words 0-1 = max of four candidates each
  153. pshuflw m7, m8, 0x1 ; bring word 1 down to word 0
  154. pmaxsw m8, m7 ; word 0 = maximum over all eight lanes
  155. pextrw r6, m8, 0 ; r6 = word 0 of m8
  156. mov [r2], r6w ; store the result as a 16-bit value through the eob pointer
  157. RET
  158. %endmacro
  159. INIT_XMM ssse3 ; select XMM registers / ssse3 suffix for the expansions below
  160. QUANTIZE_FP fp, 7 ; emits quantize_fp, with 7 passed as the macro's second parameter
  161. QUANTIZE_FP fp_32x32, 7 ; emits quantize_fp_32x32; enables the fp_32x32-only paths