vp8_quantize_sse2.c

/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <string.h>    /* memset */
#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */
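
/* SELECT_EOB(i, z) handles the coefficient at raster position z, which is
 * the i-th coefficient in zig-zag order. x[z] holds |coeff| - (zbin +
 * zbin_extra), so a coefficient survives only if that margin reaches the
 * current zero-run boost and the precomputed quantized value y[z] is
 * nonzero. A surviving coefficient is written to qcoeff_ptr, eob advances
 * to i, and the boost pointer resets; a skipped coefficient leaves
 * qcoeff_ptr[z] at zero and moves the boost pointer forward, so coefficients
 * following a run of zeros need a larger magnitude to survive. */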
#define SELECT_EOB(i, z)                    \
  do {                                      \
    short boost = *zbin_boost_ptr;          \
    int cmp = (x[z] < boost) | (y[z] == 0); \
    zbin_boost_ptr++;                       \
    if (cmp) break;                         \
    qcoeff_ptr[z] = y[z];                   \
    eob = i;                                \
    zbin_boost_ptr = b->zrun_zbin_boost;    \
  } while (0)

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  char eob = 0;
  short *zbin_boost_ptr;
  short *qcoeff_ptr = d->qcoeff;
  DECLARE_ALIGNED(16, short, x[16]);
  DECLARE_ALIGNED(16, short, y[16]);

  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
  __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
  __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
  __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
  __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
  __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
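
  /* Clear all 16 qcoeff entries (16 shorts = 32 bytes) up front;
   * SELECT_EOB below only writes back the coefficients that survive. */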
  memset(qcoeff_ptr, 0, 32);

  /* Duplicate to all lanes. */
  zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
  zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

  /* Sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);
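
  /* sz is 0 for z >= 0 and -1 for z < 0, so (z ^ sz) - sz yields either z
   * unchanged or (~z) + 1 = -z: a branchless two's-complement abs(). */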

  /* zbin[] + zbin_extra */
  zbin0 = _mm_add_epi16(zbin0, zbin_extra);
  zbin1 = _mm_add_epi16(zbin1, zbin_extra);

  /* In C, x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
   * the equation because boost is the only value which can change:
   * x - (zbin[] + extra) >= boost */
  x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
  x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

  _mm_store_si128((__m128i *)(x), x_minus_zbin0);
  _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

  /* All the remaining calculations are valid whether they are done now with
   * SIMD or later inside the loop one at a time. */
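
  /* (The zbin/boost test only decides whether a quantized value is kept,
   * not what that value is, so y[] can be computed for all 16 positions
   * unconditionally.) */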
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  y0 = _mm_mulhi_epi16(x0, quant0);
  y1 = _mm_mulhi_epi16(x1, quant1);

  y0 = _mm_add_epi16(y0, x0);
  y1 = _mm_add_epi16(y1, x1);

  /* Instead of shifting each value independently we convert the scaling
   * factor with 1 << (16 - shift) so we can use multiply/return high half. */
  y0 = _mm_mulhi_epi16(y0, quant_shift0);
  y1 = _mm_mulhi_epi16(y1, quant_shift1);
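
  /* That works because (v * (1 << (16 - shift))) >> 16 == v >> shift for
   * the value ranges the quantizer produces, turning a per-lane variable
   * shift into a single _mm_mulhi_epi16. */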

  /* Restore the sign of z: (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  y0 = _mm_sub_epi16(y0, sz0);
  y1 = _mm_sub_epi16(y1, sz1);

  _mm_store_si128((__m128i *)(y), y0);
  _mm_store_si128((__m128i *)(y + 8), y1);

  zbin_boost_ptr = b->zrun_zbin_boost;

  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
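  /* The second argument to each SELECT_EOB below is the zig-zag scan order
   * (vp8_default_zig_zag1d) written out by hand: the raster position of the
   * 1st, 2nd, ... 16th coefficient in scan order. */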
  SELECT_EOB(1, 0);
  SELECT_EOB(2, 1);
  SELECT_EOB(3, 4);
  SELECT_EOB(4, 8);
  SELECT_EOB(5, 5);
  SELECT_EOB(6, 2);
  SELECT_EOB(7, 3);
  SELECT_EOB(8, 6);
  SELECT_EOB(9, 9);
  SELECT_EOB(10, 12);
  SELECT_EOB(11, 13);
  SELECT_EOB(12, 10);
  SELECT_EOB(13, 7);
  SELECT_EOB(14, 11);
  SELECT_EOB(15, 14);
  SELECT_EOB(16, 15);

  y0 = _mm_load_si128((__m128i *)(d->qcoeff));
  y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

  /* dqcoeff = qcoeff * dequant */
  y0 = _mm_mullo_epi16(y0, dequant0);
  y1 = _mm_mullo_epi16(y1, dequant1);

  _mm_store_si128((__m128i *)(d->dqcoeff), y0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
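
  /* eob is the 1-based zig-zag index of the last coefficient kept, or 0 if
   * the whole block quantized to zero. */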
  *d->eob = eob;
}
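
/* The fast path drops the zbin dead zone and the zero-run boost entirely:
 * every coefficient is rounded and quantized with a single high-half
 * multiply, and the end-of-block index is found with a SIMD max over
 * inverse zig-zag positions instead of a scalar scan. */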
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
  __m128i inv_zig_zag0 =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
  __m128i inv_zig_zag1 =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

  /* Sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);

  /* x += round */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  /* y = (x * quant_fast) >> 16 */
  y0 = _mm_mulhi_epi16(x0, quant_fast0);
  y1 = _mm_mulhi_epi16(x1, quant_fast1);
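
  /* Unlike the regular path there is no +x correction or second multiply
   * here; the quant_fast table is expected to embed the full scale factor
   * in a single 16-bit multiplier. */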

  /* Put the sign of z back on y: x = (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  x0 = _mm_sub_epi16(y0, sz0);
  x1 = _mm_sub_epi16(y1, sz1);

  /* qcoeff = x */
  _mm_store_si128((__m128i *)(d->qcoeff), x0);
  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

  /* x * dequant */
  xdq0 = _mm_mullo_epi16(x0, dequant0);
  xdq1 = _mm_mullo_epi16(x1, dequant1);

  /* dqcoeff = x * dequant */
  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

  /* build a mask for the zig zag */
  zeros = _mm_setzero_si128();

  x0 = _mm_cmpeq_epi16(x0, zeros);
  x1 = _mm_cmpeq_epi16(x1, zeros);

  ones = _mm_cmpeq_epi16(zeros, zeros);

  x0 = _mm_xor_si128(x0, ones);
  x1 = _mm_xor_si128(x1, ones);

  x0 = _mm_and_si128(x0, inv_zig_zag0);
  x1 = _mm_and_si128(x1, inv_zig_zag1);
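
  /* Each lane now holds its 1-based zig-zag position when the quantized
   * coefficient is nonzero and 0 otherwise, so the horizontal max of all
   * 16 lanes is exactly the end-of-block index. */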

  x0 = _mm_max_epi16(x0, x1);

  /* now down to 8 */
  x1 = _mm_shuffle_epi32(x0, 0xE);  // 0b00001110
  x0 = _mm_max_epi16(x0, x1);

  /* only 4 left */
  x1 = _mm_shufflelo_epi16(x0, 0xE);  // 0b00001110
  x0 = _mm_max_epi16(x0, x1);

  /* okay, just 2! */
  x1 = _mm_shufflelo_epi16(x0, 0x1);  // 0b00000001
  x0 = _mm_max_epi16(x0, x1);

  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}