highbd_quantize_intrin_sse2.c 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include <emmintrin.h>
  12. #include "./vpx_dsp_rtcd.h"
  13. #include "vpx_dsp/vpx_dsp_common.h"
  14. #include "vpx_mem/vpx_mem.h"
  15. #include "vpx_ports/mem.h"
  16. #if CONFIG_VP9_HIGHBITDEPTH
  17. void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
  18. int skip_block, const int16_t *zbin_ptr,
  19. const int16_t *round_ptr,
  20. const int16_t *quant_ptr,
  21. const int16_t *quant_shift_ptr,
  22. tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
  23. const int16_t *dequant_ptr, uint16_t *eob_ptr,
  24. const int16_t *scan, const int16_t *iscan) {
  25. int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  26. __m128i zbins[2];
  27. __m128i nzbins[2];
  28. zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
  29. (int)zbin_ptr[0]);
  30. zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
  31. nzbins[0] = _mm_setzero_si128();
  32. nzbins[1] = _mm_setzero_si128();
  33. nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  34. nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
  35. (void)scan;
  36. (void)skip_block;
  37. assert(!skip_block);
  38. memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  39. memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
  40. // Pre-scan pass
  41. for (i = ((int)count / 4) - 1; i >= 0; i--) {
  42. __m128i coeffs, cmp1, cmp2;
  43. int test;
  44. coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
  45. cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
  46. cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
  47. cmp1 = _mm_and_si128(cmp1, cmp2);
  48. test = _mm_movemask_epi8(cmp1);
  49. if (test == 0xffff)
  50. non_zero_regs--;
  51. else
  52. break;
  53. }
  54. // Quantization pass:
  55. for (i = 0; i < non_zero_regs; i++) {
  56. __m128i coeffs, coeffs_sign, tmp1, tmp2;
  57. int test;
  58. int abs_coeff[4];
  59. int coeff_sign[4];
  60. coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
  61. coeffs_sign = _mm_srai_epi32(coeffs, 31);
  62. coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
  63. tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
  64. tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
  65. tmp1 = _mm_or_si128(tmp1, tmp2);
  66. test = _mm_movemask_epi8(tmp1);
  67. _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
  68. _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
  69. for (j = 0; j < 4; j++) {
  70. if (test & (1 << (4 * j))) {
  71. int k = 4 * i + j;
  72. const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
  73. const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
  74. const uint32_t abs_qcoeff =
  75. (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
  76. qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
  77. dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
  78. if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
  79. }
  80. }
  81. }
  82. *eob_ptr = eob_i + 1;
  83. }
  84. void vpx_highbd_quantize_b_32x32_sse2(
  85. const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
  86. const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
  87. const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
  88. tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
  89. const int16_t *scan, const int16_t *iscan) {
  90. __m128i zbins[2];
  91. __m128i nzbins[2];
  92. int idx = 0;
  93. int idx_arr[1024];
  94. int i, eob = -1;
  95. const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
  96. const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
  97. (void)scan;
  98. (void)skip_block;
  99. assert(!skip_block);
  100. zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
  101. zbins[1] = _mm_set1_epi32(zbin1_tmp);
  102. nzbins[0] = _mm_setzero_si128();
  103. nzbins[1] = _mm_setzero_si128();
  104. nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  105. nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
  106. memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  107. memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  108. // Pre-scan pass
  109. for (i = 0; i < n_coeffs / 4; i++) {
  110. __m128i coeffs, cmp1, cmp2;
  111. int test;
  112. coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
  113. cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
  114. cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
  115. cmp1 = _mm_and_si128(cmp1, cmp2);
  116. test = _mm_movemask_epi8(cmp1);
  117. if (!(test & 0xf)) idx_arr[idx++] = i * 4;
  118. if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
  119. if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
  120. if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
  121. }
  122. // Quantization pass: only process the coefficients selected in
  123. // pre-scan pass. Note: idx can be zero.
  124. for (i = 0; i < idx; i++) {
  125. const int rc = idx_arr[i];
  126. const int coeff = coeff_ptr[rc];
  127. const int coeff_sign = (coeff >> 31);
  128. const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  129. const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
  130. const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
  131. const uint32_t abs_qcoeff =
  132. (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
  133. qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
  134. dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
  135. if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
  136. }
  137. *eob_ptr = eob + 1;
  138. }
  139. #endif