vp9_quantize_sse2.c 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include <emmintrin.h>
  12. #include <xmmintrin.h>
  13. #include "./vp9_rtcd.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_dsp/vpx_dsp_common.h"
  16. #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
  17. void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
  18. int skip_block, const int16_t *round_ptr,
  19. const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
  20. tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
  21. uint16_t *eob_ptr, const int16_t *scan,
  22. const int16_t *iscan) {
  23. __m128i zero;
  24. __m128i thr;
  25. int16_t nzflag;
  26. __m128i eob;
  27. __m128i round, quant, dequant;
  28. (void)scan;
  29. (void)skip_block;
  30. assert(!skip_block);
  31. coeff_ptr += n_coeffs;
  32. iscan += n_coeffs;
  33. qcoeff_ptr += n_coeffs;
  34. dqcoeff_ptr += n_coeffs;
  35. n_coeffs = -n_coeffs;
  36. zero = _mm_setzero_si128();
  37. {
  38. __m128i coeff0, coeff1;
  39. // Setup global values
  40. {
  41. round = _mm_load_si128((const __m128i *)round_ptr);
  42. quant = _mm_load_si128((const __m128i *)quant_ptr);
  43. dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  44. }
  45. {
  46. __m128i coeff0_sign, coeff1_sign;
  47. __m128i qcoeff0, qcoeff1;
  48. __m128i qtmp0, qtmp1;
  49. // Do DC and first 15 AC
  50. coeff0 = load_tran_low(coeff_ptr + n_coeffs);
  51. coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
  52. // Poor man's sign extract
  53. coeff0_sign = _mm_srai_epi16(coeff0, 15);
  54. coeff1_sign = _mm_srai_epi16(coeff1, 15);
  55. qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
  56. qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
  57. qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
  58. qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
  59. qcoeff0 = _mm_adds_epi16(qcoeff0, round);
  60. round = _mm_unpackhi_epi64(round, round);
  61. qcoeff1 = _mm_adds_epi16(qcoeff1, round);
  62. qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
  63. quant = _mm_unpackhi_epi64(quant, quant);
  64. qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
  65. // Reinsert signs
  66. qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
  67. qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
  68. qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
  69. qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
  70. store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
  71. store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
  72. coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
  73. dequant = _mm_unpackhi_epi64(dequant, dequant);
  74. coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
  75. store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
  76. store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
  77. }
  78. {
  79. // Scan for eob
  80. __m128i zero_coeff0, zero_coeff1;
  81. __m128i nzero_coeff0, nzero_coeff1;
  82. __m128i iscan0, iscan1;
  83. __m128i eob1;
  84. zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
  85. zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
  86. nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
  87. nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
  88. iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
  89. iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
  90. // Add one to convert from indices to counts
  91. iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
  92. iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
  93. eob = _mm_and_si128(iscan0, nzero_coeff0);
  94. eob1 = _mm_and_si128(iscan1, nzero_coeff1);
  95. eob = _mm_max_epi16(eob, eob1);
  96. }
  97. n_coeffs += 8 * 2;
  98. }
  99. thr = _mm_srai_epi16(dequant, 1);
  100. // AC only loop
  101. while (n_coeffs < 0) {
  102. __m128i coeff0, coeff1;
  103. {
  104. __m128i coeff0_sign, coeff1_sign;
  105. __m128i qcoeff0, qcoeff1;
  106. __m128i qtmp0, qtmp1;
  107. coeff0 = load_tran_low(coeff_ptr + n_coeffs);
  108. coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
  109. // Poor man's sign extract
  110. coeff0_sign = _mm_srai_epi16(coeff0, 15);
  111. coeff1_sign = _mm_srai_epi16(coeff1, 15);
  112. qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
  113. qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
  114. qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
  115. qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
  116. nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
  117. _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
  118. if (nzflag) {
  119. qcoeff0 = _mm_adds_epi16(qcoeff0, round);
  120. qcoeff1 = _mm_adds_epi16(qcoeff1, round);
  121. qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
  122. qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
  123. // Reinsert signs
  124. qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
  125. qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
  126. qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
  127. qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
  128. store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
  129. store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
  130. coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
  131. coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
  132. store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
  133. store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
  134. } else {
  135. store_zero_tran_low(qcoeff_ptr + n_coeffs);
  136. store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
  137. store_zero_tran_low(dqcoeff_ptr + n_coeffs);
  138. store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
  139. }
  140. }
  141. if (nzflag) {
  142. // Scan for eob
  143. __m128i zero_coeff0, zero_coeff1;
  144. __m128i nzero_coeff0, nzero_coeff1;
  145. __m128i iscan0, iscan1;
  146. __m128i eob0, eob1;
  147. zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
  148. zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
  149. nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
  150. nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
  151. iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
  152. iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
  153. // Add one to convert from indices to counts
  154. iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
  155. iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
  156. eob0 = _mm_and_si128(iscan0, nzero_coeff0);
  157. eob1 = _mm_and_si128(iscan1, nzero_coeff1);
  158. eob0 = _mm_max_epi16(eob0, eob1);
  159. eob = _mm_max_epi16(eob, eob0);
  160. }
  161. n_coeffs += 8 * 2;
  162. }
  163. // Accumulate EOB
  164. {
  165. __m128i eob_shuffled;
  166. eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
  167. eob = _mm_max_epi16(eob, eob_shuffled);
  168. eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
  169. eob = _mm_max_epi16(eob, eob_shuffled);
  170. eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
  171. eob = _mm_max_epi16(eob, eob_shuffled);
  172. *eob_ptr = _mm_extract_epi16(eob, 1);
  173. }
  174. }