vp9_dct_ssse3.c

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <tmmintrin.h>  // SSSE3

#include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

void vp9_fdct8x8_quant_ssse3(
    const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
    int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
    const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
    int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
    uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  __m128i zero;
  int pass;
  // Constants
  // In one case all eight lanes use the same constant. In every other case a
  // pair of constants is repeated four times, which is done by building the
  // 32-bit constant corresponding to that pair.
  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
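  // The pair constants above pack two 14-bit fixed-point cosines into each
  // 32-bit lane so that _mm_madd_epi16 computes a * c0 + b * c1 per lane in
  // one instruction; the 32-bit results are rounded back to 16 bits with
  // k__DCT_CONST_ROUNDING and an arithmetic shift by DCT_CONST_BITS.
  // k__dual_p16_p16 holds 23170 (2 * cospi_16_64) and is used with
  // _mm_mulhrs_epi16 as a cheaper equivalent of that round-and-shift; see the
  // note in the second butterfly block below.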
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  __m128i *in[8];
  int index = 0;

  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)coeff_ptr;

  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);
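  // Scaling note: the left shift by two multiplies the input by 4 so that
  // precision is preserved through the two transform passes; the result is
  // divided by 2 again in the post-condition block after the passes.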
  in[0] = &in0;
  in[1] = &in1;
  in[2] = &in2;
  in[3] = &in3;
  in[4] = &in4;
  in[5] = &in5;
  in[6] = &in6;
  in[7] = &in7;

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
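      // res0/res4/res2/res6 are the even outputs of the 1-D transform for
      // this pass, i.e. rows 0, 4, 2 and 6.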
    }
    // Work on next four results
    {
      // The two cospi_16_64 multiplies are done with a single mulhrs each
      // (see the note below), so no interleave into 32 bits is needed here.
      const __m128i d0 = _mm_sub_epi16(q6, q5);
      const __m128i d1 = _mm_add_epi16(q6, q5);
      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
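      // _mm_mulhrs_epi16(x, 23170) computes (x * 23170 + (1 << 14)) >> 15,
      // which equals (x * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
      // since 23170 = 2 * cospi_16_64, so the interleave/madd/round sequence
      // used elsewhere is not needed for these two multiplies.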
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
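      // res1/res7/res5/res3 are the odd outputs of the 1-D transform for
      // this pass, i.e. rows 1, 7, 5 and 3.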
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    // Division by two of 16-bit signed numbers using shifts:
    // n / 2 = (n - (n >> 15)) >> 1
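    // For example, n = -3: -3 - (-3 >> 15) = -3 - (-1) = -2, and -2 >> 1 = -1,
    // which matches C's truncating -3 / 2, whereas a plain arithmetic shift
    // alone would give -2.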
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
  }

  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
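  // The coefficient and iscan pointers were advanced to one past the end and
  // n_coeffs negated, so the blocks below index with a negative offset that
  // counts up toward zero and stop when n_coeffs reaches 0.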
  zero = _mm_setzero_si128();
  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant, thr;
    int16_t nzflag;
    {
      __m128i coeff0, coeff1;
      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = *in[0];
        coeff1 = *in[1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
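        // Fixed-point quantization: qtmp = ((|coeff| + round) * quant) >> 16,
        // since _mm_mulhi_epi16 keeps the high 16 bits of the product; the
        // sign is folded back in below and dqcoeff = qcoeff * dequant. The
        // unpackhi_epi64 calls broadcast the AC entries of round, quant and
        // dequant to all lanes once the DC lane has been handled.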
        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
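      // Each lane of eob now holds (scan position + 1) for a nonzero
      // coefficient and 0 otherwise, so the running maximum over all lanes is
      // the end-of-block count.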
      n_coeffs += 8 * 2;
    }

    // AC only loop
    index = 2;
    thr = _mm_srai_epi16(dequant, 1);
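    // Skip heuristic: with the fast quantizer quant is roughly
    // (1 << 16) / dequant, so a coefficient whose magnitude does not exceed
    // dequant / 2 quantizes to zero; if no lane of either vector is above the
    // threshold the whole pair of rows is written out as zeros.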
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
        coeff0 = *in[index];
        coeff1 = *in[index + 1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

        if (nzflag) {
          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

          // Reinsert signs
          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
        } else {
          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
        }
      }

      if (nzflag) {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
      index += 2;
    }
    // Accumulate EOB
    {
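      // Fold the eight 16-bit lanes of eob down to their horizontal maximum:
      // the shuffles pair lanes off so that three max operations suffice,
      // leaving the overall maximum in lane 1, which is extracted below.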
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}