; vp9_quantize_ssse3_x86_64.asm
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  ; private_prefix has to be defined before x86inc is included.
  ; NOTE(review): x86inc presumably prepends it to every cglobal symbol name;
  ; confirm against the x86inc version in this tree.
%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

  ; Eight 16-bit words, each equal to 1 (16 bytes in total).
  ; Not referenced by any code visible in this file.
pw_1:   dw 1, 1, 1, 1, 1, 1, 1, 1

SECTION .text
  ;-----------------------------------------------------------------------
  ; QUANTIZE_FP suffix, gprs
  ;
  ; Expands to one x86inc function named quantize_<suffix>, declared with
  ; 0 auto-loaded arguments, <gprs> general-purpose registers and 15 vector
  ; registers.  Named arguments, in order:
  ;   coeff, ncoeff, skip, zbin, round, quant, shift,
  ;   qcoeff, dqcoeff, dequant, eob, scan, iscan
  ; zbin, shift and scan are never read by the code below.
  ;
  ; All data is handled as 16-bit words, 16 words per iteration, using
  ; aligned vector loads/stores (mova).
  ; NOTE(review): assumes ncoeff is a positive multiple of 16 and that every
  ; buffer is 16-byte aligned; the first 16 coefficients are processed before
  ; the count is checked.  Confirm against callers.
  ;
  ; skip != 0:
  ;   qcoeff[0..ncoeff) = 0, dqcoeff[0..ncoeff) = 0, *eob = 0 (16-bit store).
  ;
  ; skip == 0, per coefficient c:
  ;   suffix fp:
  ;     q  = sign(c) * ((sat16(|c| + round) * quant) >> 16)
  ;     dq = low16(q * dequant)
  ;   suffix fp_32x32:
  ;     round is first replaced by (round + 1) >> 1 and quant by quant << 1,
  ;     q  = as above
  ;     dq = sign(c) * (low16(|q| * dequant) >> 1)
  ;   *eob = max(iscan[i] + 1) over all i with dq != 0, or 0 if there is
  ;          none (16-bit store).
  ;
  ; The first vector of 8 uses round/quant/dequant exactly as loaded; from
  ; the second vector on, the high four words of each table are replicated
  ; into all lanes (the original comments call lane 0 the DC entry).
  ; After the first 16 coefficients, a group of 16 in which every |c| is
  ; <= (dequant >> 1), or <= (dequant >> 2) for fp_32x32, is written out as
  ; zeros without being quantized.
  ;
  ; Register use on the quantize path after setup:
  ;   r0 = end of coeff     r3 = end of qcoeff    r4 = end of dqcoeff
  ;   r5 = end of iscan     r1 = negative count of remaining coefficients
  ;   r2, r6 = scratch (r2 finally holds the eob pointer)
  ;   m0 = skip threshold   m1 = round   m2 = quant   m3 = dequant
  ;   m5 = zero             m7 = all ones (or scratch)
  ;   m8 = running per-lane eob maximum
  ;-----------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; actual quantize loop - set up pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp               ; skip has been tested; reuse r2
  movifnidn   zbinq, zbinmp               ; value is never used afterwards
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m1, [roundq]                ; m1 = round
  mova        m2, [quantq]                ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                      ; m5 = 1 in every word
  paddw       m1, m5
  psrlw       m1, 1                       ; m1 = (round + 1) >> 1
%endif
  mova        m3, [r2q]                   ; m3 = dequant
  mov         r3, qcoeffmp                ; round/quant are in m1/m2 now, so
  mov         r4, dqcoeffmp               ; r3-r5 are free for the output and
  mov         r5, iscanmp                 ; iscan pointers
%ifidn %1, fp_32x32
  psllw       m2, 1                       ; m2 = quant << 1
%endif
  pxor        m5, m5                      ; m5 = dedicated zero

  ; Point every buffer at its end and index with a negative word count that
  ; runs up to zero, so the loop condition is just the flags from the add.
  lea         coeffq, [coeffq+ncoeffq*2]
  lea         r5q, [r5q+ncoeffq*2]
  lea         r3q, [r3q+ncoeffq*2]
  lea         r4q, [r4q+ncoeffq*2]
  neg         ncoeffq

  ; first 16 coefficients (DC and first 15 AC): always quantized
  mova        m9, [coeffq+ncoeffq*2+ 0]   ; m9  = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i+8]
  pabsw       m6, m9                      ; m6  = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpeqw     m7, m7                      ; m7  = all ones (-1 per word)
  paddsw      m6, m1                      ; m6 += round (saturating)
  punpckhqdq  m1, m1                      ; replicate high 4 round words
  paddsw      m11, m1                     ; m11 += round (saturating)
  pmulhw      m8, m6, m2                  ; m8  = (m6 * quant) >> 16
  punpckhqdq  m2, m2                      ; replicate high 4 quant words
  pmulhw      m13, m11, m2                ; m13 = (m11 * quant) >> 16
  psignw      m8, m9                      ; reinsert sign of c
  psignw      m13, m10                    ; reinsert sign of c
  mova        [r3q+ncoeffq*2+ 0], m8      ; store qcoeff
  mova        [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw       m8, m8                      ; work on |q| so the logical shift
  pabsw       m13, m13                    ; below is a plain halving
%endif
  pmullw      m8, m3                      ; dq = q * dequant (low 16 bits)
  punpckhqdq  m3, m3                      ; replicate high 4 dequant words
  pmullw      m13, m3                     ; dq = q * dequant (low 16 bits)
%ifidn %1, fp_32x32
  psrlw       m8, 1                       ; dq >>= 1
  psrlw       m13, 1
  psignw      m8, m9                      ; reinsert sign of c
  psignw      m13, m10
  psrlw       m0, m3, 2                   ; m0 = skip threshold = dequant >> 2
%else
  psrlw       m0, m3, 1                   ; m0 = skip threshold = dequant >> 1
%endif
  mova        [r4q+ncoeffq*2+ 0], m8      ; store dqcoeff
  mova        [r4q+ncoeffq*2+16], m13
  pcmpeqw     m8, m5                      ; m8  = mask of lanes with dq == 0
  pcmpeqw     m13, m5                     ; m13 = mask of lanes with dq == 0
  mova        m6, [r5q+ncoeffq*2+ 0]      ; m6  = iscan[i]
  mova        m11, [r5q+ncoeffq*2+16]     ; m11 = iscan[i+8]
  psubw       m6, m7                      ; m6  = iscan + 1 (subtract -1)
  psubw       m11, m7                     ; m11 = iscan + 1
  pandn       m8, m6                      ; keep iscan + 1 where dq != 0
  pandn       m13, m11                    ; keep iscan + 1 where dq != 0
  pmaxsw      m8, m13                     ; m8 = running per-lane eob maximum
  add         ncoeffq, mmsize             ; 16 words consumed (mmsize is 16)
  jz          .accumulate_eob

.ac_only_loop:
  mova        m9, [coeffq+ncoeffq*2+ 0]   ; m9  = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i+8]
  pabsw       m6, m9                      ; m6  = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                  ; lanes with |c| above threshold
  pcmpgtw     m12, m11, m0
  pmovmskb    r6d, m7
  pmovmskb    r2d, m12
  or          r6, r2                      ; zero iff no lane is above it
  jz          .skip_iter
  pcmpeqw     m7, m7                      ; m7 = all ones again
  paddsw      m6, m1                      ; m6 += round (saturating)
  paddsw      m11, m1                     ; m11 += round (saturating)
  pmulhw      m14, m6, m2                 ; m14 = (m6 * quant) >> 16
  pmulhw      m13, m11, m2                ; m13 = (m11 * quant) >> 16
  psignw      m14, m9                     ; reinsert sign of c
  psignw      m13, m10                    ; reinsert sign of c
  mova        [r3q+ncoeffq*2+ 0], m14     ; store qcoeff
  mova        [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                     ; dq = q * dequant (low 16 bits)
  pmullw      m13, m3                     ; dq = q * dequant (low 16 bits)
%ifidn %1, fp_32x32
  psrlw       m14, 1                      ; dq >>= 1
  psrlw       m13, 1
  psignw      m14, m9                     ; reinsert sign of c
  psignw      m13, m10
%endif
  mova        [r4q+ncoeffq*2+ 0], m14     ; store dqcoeff
  mova        [r4q+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                     ; m14 = mask of lanes with dq == 0
  pcmpeqw     m13, m5                     ; m13 = mask of lanes with dq == 0
  mova        m6, [r5q+ncoeffq*2+ 0]      ; m6  = iscan[i]
  mova        m11, [r5q+ncoeffq*2+16]     ; m11 = iscan[i+8]
  psubw       m6, m7                      ; m6  = iscan + 1
  psubw       m11, m7                     ; m11 = iscan + 1
  pandn       m14, m6                     ; keep iscan + 1 where dq != 0
  pandn       m13, m11                    ; keep iscan + 1 where dq != 0
  pmaxsw      m8, m14                     ; fold into the running maximum
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop
  jmp         .accumulate_eob

.skip_iter:
  ; every |c| in this group is at or below the threshold: emit zeros
  mova        [r3q+ncoeffq*2+ 0], m5
  mova        [r3q+ncoeffq*2+16], m5
  mova        [r4q+ncoeffq*2+ 0], m5
  mova        [r4q+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop

.accumulate_eob:
  ; horizontally max the 8 lanes of m8 into word 0 and store it through eob
  mov         r2, eobmp
  pshufd      m7, m8, 0xe                 ; high qword -> low qword
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe                 ; words 2,3 -> words 0,1
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1                 ; word 1 -> word 0
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  ; Store the low 16 bits only.  The skip path writes this output with a
  ; word store; a full-register store here would write 8 bytes through the
  ; same pointer and overwrite whatever follows the 16-bit slot.
  mov         [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp
  lea         r0q, [r0q+ncoeffq*2]        ; end of dqcoeff
  lea         r2q, [r2q+ncoeffq*2]        ; end of qcoeff
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
  mova        [r0q+ncoeffq*2+ 0], m7
  mova        [r0q+ncoeffq*2+16], m7
  mova        [r2q+ncoeffq*2+ 0], m7
  mova        [r2q+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [r3q], 0               ; *eob = 0
  RET
%endmacro

  ; Instantiate both variants for SSSE3 with XMM registers; each requests
  ; 7 general-purpose registers (r0-r6 are used above).
INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7