fwd_txfm_impl_sse2.h 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <emmintrin.h> // SSE2
  11. #include "./vpx_dsp_rtcd.h"
  12. #include "vpx_dsp/txfm_common.h"
  13. #include "vpx_dsp/x86/fwd_txfm_sse2.h"
  14. #include "vpx_dsp/x86/txfm_common_sse2.h"
  15. #include "vpx_ports/mem.h"
  16. // TODO(jingning) The high bit-depth functions need rework for performance.
  17. // After we properly fix the high bit-depth function implementations, this
  18. // file's dependency should be substantially simplified.
  19. #if DCT_HIGH_BIT_DEPTH
  // High bit-depth builds select the saturating 16-bit add/subtract
  // intrinsics, so a lane that would overflow is clamped to the int16_t
  // limits instead of wrapping around.
  // NOTE(review): presumably the check_epi16_overflow_* helpers used below
  // rely on that clamping to detect overflow - confirm against their
  // definition.
  20. #define ADD_EPI16 _mm_adds_epi16
  21. #define SUB_EPI16 _mm_subs_epi16
  22. #else
  // Regular builds use the plain (wrapping) 16-bit add/subtract intrinsics.
  23. #define ADD_EPI16 _mm_add_epi16
  24. #define SUB_EPI16 _mm_sub_epi16
  25. #endif
  26. void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
  // 4x4 2D forward DCT.
  //   input  - top-left sample of a 4x4 block; row k is read from
  //            input + k * stride (stride counted in int16_t elements).
  //   output - receives the 16 coefficients in raster order, written with
  //            two storeu_output() calls at output + 0 and output + 8.
  //   stride - distance between consecutive input rows, in elements.
  // When DCT_HIGH_BIT_DEPTH is set, the function hands the whole block to
  // vpx_highbd_fdct4x4_c() and returns early if an input sample is outside
  // [-0x400, 0x3ff] or if check_epi16_overflow_x2() reports an overflow in
  // an intermediate result.
  27. // This 2D transform implements 4 vertical 1D transforms followed
  28. // by 4 horizontal 1D transforms. The multiplies and adds are as given
  29. // by Chen, Smith and Fralick ('77). The commands for moving the data
  30. // around have been minimized by hand.
  31. // For the purposes of the comments, the 16 inputs are referred to as i0
  32. // through iF (in raster order), intermediate variables are a0, b0, c0
  33. // through f, and correspond to the in-place computations mapped to input
  34. // locations. The outputs, o0 through oF are labeled according to the
  35. // output locations.
  36. // Constants
  37. // These are the coefficients used for the multiplies.
  38. // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
  39. // where cospi_N_64 = cos(N pi /64)
  40. const __m128i k__cospi_A =
  41. octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
  42. cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
  43. const __m128i k__cospi_B =
  44. octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
  45. cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
  46. const __m128i k__cospi_C =
  47. octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
  48. cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
  49. const __m128i k__cospi_D =
  50. octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
  51. cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
  52. const __m128i k__cospi_E =
  53. octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
  54. cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
  55. const __m128i k__cospi_F =
  56. octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
  57. cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
  58. const __m128i k__cospi_G =
  59. octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
  60. -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
  61. const __m128i k__cospi_H =
  62. octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
  63. -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
  64. const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  65. // This second rounding constant saves doing some extra adds at the end
  66. const __m128i k__DCT_CONST_ROUNDING2 =
  67. _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
  68. const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
  // Lane patterns used by the "add 1 to a non-zero first sample" step below:
  // bias_a is compared against in0, bias_b adds 1 to lane 0 only.
  69. const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  70. const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  71. __m128i in0, in1;
  72. #if DCT_HIGH_BIT_DEPTH
  73. __m128i cmp0, cmp1;
  74. int test, overflow;
  75. #endif
  76. // Load inputs.
  // Each row is 4 samples (64 bits); rows 0 and 3 share in0, rows 1 and 2
  // share in1.
  77. in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  78. in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  79. in1 = _mm_unpacklo_epi64(
  80. in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  81. in0 = _mm_unpacklo_epi64(
  82. in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
  83. // in0 = [i0 i1 i2 i3 iC iD iE iF]
  84. // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
  85. #if DCT_HIGH_BIT_DEPTH
  86. // Check inputs small enough to use optimised code
  // A lane is flagged when it is greater than 0x3ff or less than
  // (int16_t)0xfc00 (-0x400); any flagged lane sends the whole block to the
  // C implementation.
  87. cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
  88. _mm_cmplt_epi16(in0, _mm_set1_epi16((int16_t)0xfc00)));
  89. cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
  90. _mm_cmplt_epi16(in1, _mm_set1_epi16((int16_t)0xfc00)));
  91. test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
  92. if (test) {
  93. vpx_highbd_fdct4x4_c(input, output, stride);
  94. return;
  95. }
  96. #endif // DCT_HIGH_BIT_DEPTH
  97. // multiply by 16 to give some extra precision
  98. in0 = _mm_slli_epi16(in0, 4);
  99. in1 = _mm_slli_epi16(in1, 4);
  100. // if (i == 0 && input[0]) input[0] += 1;
  101. // add 1 to the upper left pixel if it is non-zero, which helps reduce
  102. // the round-trip error
  103. {
  104. // The mask will only contain whether the first value is zero, all
  105. // other comparison will fail as something shifted by 4 (above << 4)
  106. // can never be equal to one. To increment in the non-zero case, we
  107. // add the mask and one for the first element:
  108. // - if zero, mask = -1, v = v - 1 + 1 = v
  109. // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
  110. __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
  111. in0 = _mm_add_epi16(in0, mask);
  112. in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
  113. }
  114. // There are 4 total stages, alternating between an add/subtract stage
  115. // followed by a multiply-and-add stage.
  116. {
  117. // Stage 1: Add/subtract
  118. // in0 = [i0 i1 i2 i3 iC iD iE iF]
  119. // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
  120. const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
  121. const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
  122. // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
  123. // r1 = [iC i8 iD i9 iE iA iF iB]
  124. const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
  125. const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
  126. // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
  127. // r3 = [iC i8 iD i9 iF iB iE iA]
  128. const __m128i t0 = _mm_add_epi16(r2, r3);
  129. const __m128i t1 = _mm_sub_epi16(r2, r3);
  130. // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
  131. // t1 = [aC a8 aD a9 aF aB aE aA]
  132. // Stage 2: multiply by constants (which gets us into 32 bits).
  133. // The constants needed here are:
  134. // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
  135. // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
  136. // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
  137. // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
  138. const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
  139. const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
  140. const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
  141. const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
  142. // Then add and right-shift to get back to 16-bit range
  143. const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  144. const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  145. const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  146. const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  147. const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  148. const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  149. const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  150. const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  151. // w0 = [b0 b1 b7 b6]
  152. // w1 = [b8 b9 bF bE]
  153. // w2 = [b4 b5 b3 b2]
  154. // w3 = [bC bD bB bA]
  // Pack the 32-bit results back to 16 bits with signed saturation.
  155. const __m128i x0 = _mm_packs_epi32(w0, w1);
  156. const __m128i x1 = _mm_packs_epi32(w2, w3);
  157. #if DCT_HIGH_BIT_DEPTH
  // NOTE(review): check_epi16_overflow_x2 is defined elsewhere; presumably it
  // reports lanes that hit the int16_t limits - confirm.
  158. overflow = check_epi16_overflow_x2(&x0, &x1);
  159. if (overflow) {
  160. vpx_highbd_fdct4x4_c(input, output, stride);
  161. return;
  162. }
  163. #endif // DCT_HIGH_BIT_DEPTH
  164. // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
  165. // x1 = [b4 b5 b3 b2 bC bD bB bA]
  166. in0 = _mm_shuffle_epi32(x0, 0xD8);
  167. in1 = _mm_shuffle_epi32(x1, 0x8D);
  168. // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
  169. // in1 = [b3 b2 bB bA b4 b5 bC bD]
  170. }
  171. {
  172. // vertical DCTs finished. Now we do the horizontal DCTs.
  173. // Stage 3: Add/subtract
  174. const __m128i t0 = ADD_EPI16(in0, in1);
  175. const __m128i t1 = SUB_EPI16(in0, in1);
  176. // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
  177. // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
  178. #if DCT_HIGH_BIT_DEPTH
  179. overflow = check_epi16_overflow_x2(&t0, &t1);
  180. if (overflow) {
  181. vpx_highbd_fdct4x4_c(input, output, stride);
  182. return;
  183. }
  184. #endif // DCT_HIGH_BIT_DEPTH
  185. // Stage 4: multiply by constants (which gets us into 32 bits).
  186. {
  187. // The constants needed here are:
  188. // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
  189. // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
  190. // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
  191. // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
  192. const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
  193. const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
  194. const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
  195. const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
  196. // Then add and right-shift to get back to 16-bit range
  197. // but this combines the final right-shift as well to save operations
  198. // This unusual rounding operation is used to maintain bit-accurate
  199. // compatibility with the c version of this function which has two
  200. // rounding steps in a row.
  201. const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
  202. const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
  203. const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
  204. const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
  205. const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
  206. const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
  207. const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
  208. const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
  209. // w0 = [o0 o4 o8 oC]
  210. // w1 = [o2 o6 oA oE]
  211. // w2 = [o1 o5 o9 oD]
  212. // w3 = [o3 o7 oB oF]
  213. // remember the o's are numbered according to the correct output location
  214. const __m128i x0 = _mm_packs_epi32(w0, w1);
  215. const __m128i x1 = _mm_packs_epi32(w2, w3);
  216. #if DCT_HIGH_BIT_DEPTH
  217. overflow = check_epi16_overflow_x2(&x0, &x1);
  218. if (overflow) {
  219. vpx_highbd_fdct4x4_c(input, output, stride);
  220. return;
  221. }
  222. #endif // DCT_HIGH_BIT_DEPTH
  223. {
  224. // x0 = [o0 o4 o8 oC o2 o6 oA oE]
  225. // x1 = [o1 o5 o9 oD o3 o7 oB oF]
  226. const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
  227. const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
  228. // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
  229. // y1 = [o2 o3 o6 o7 oA oB oE oF]
  230. in0 = _mm_unpacklo_epi32(y0, y1);
  231. // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
  232. in1 = _mm_unpackhi_epi32(y0, y1);
  233. // in1 = [o8 o9 oA oB oC oD oE oF]
  234. }
  235. }
  236. }
  237. // Post-condition (v + 1) >> 2 is now incorporated into previous
  238. // add and right-shift commands. Only 2 store instructions needed
  239. // because we are using the fact that 1/3 are stored just after 0/2.
  240. storeu_output(&in0, output + 0 * 4);
  241. storeu_output(&in1, output + 2 * 4);
  242. }
  243. void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
  // 8x8 2D forward DCT.
  //   input  - top-left sample of an 8x8 block; row k is read from
  //            input + k * stride (stride counted in int16_t elements).
  //   output - receives eight rows of eight coefficients, row k written by
  //            store_output() at output + k * 8.
  //   stride - distance between consecutive input rows, in elements.
  // The input is pre-scaled by 4 (<< 2), run through two identical
  // column-transform passes that each end with an 8x8 transpose, and finally
  // every result is divided by two before being stored.
  // When DCT_HIGH_BIT_DEPTH is set, the function hands the whole block to
  // vpx_highbd_fdct8x8_c() and returns early whenever one of the
  // check_epi16_overflow_* helpers reports an overflow.
  244. int pass;
  245. // Constants
  246. // When we use them, in one case, they are all the same. In all others
  247. // it's a pair of them that we need to repeat four times. This is done
  248. // by constructing the 32 bit constant corresponding to that pair.
  249. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  250. const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  251. const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  252. const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  253. const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  254. const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  255. const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  256. const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  257. const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  258. #if DCT_HIGH_BIT_DEPTH
  259. int overflow;
  260. #endif
  261. // Load input
  // _mm_load_si128 is the aligned 128-bit load, so every row address
  // input + k * stride must be 16-byte aligned.
  262. __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  263. __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  264. __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  265. __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  266. __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  267. __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  268. __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  269. __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  270. // Pre-condition input (shift by two)
  271. in0 = _mm_slli_epi16(in0, 2);
  272. in1 = _mm_slli_epi16(in1, 2);
  273. in2 = _mm_slli_epi16(in2, 2);
  274. in3 = _mm_slli_epi16(in3, 2);
  275. in4 = _mm_slli_epi16(in4, 2);
  276. in5 = _mm_slli_epi16(in5, 2);
  277. in6 = _mm_slli_epi16(in6, 2);
  278. in7 = _mm_slli_epi16(in7, 2);
  279. // We do two passes, first the columns, then the rows. The results of the
  280. // first pass are transposed so that the same column code can be reused. The
  281. // results of the second pass are also transposed so that the rows (processed
  282. // as columns) are put back in row positions.
  283. for (pass = 0; pass < 2; pass++) {
  284. // To store results of each pass before the transpose.
  285. __m128i res0, res1, res2, res3, res4, res5, res6, res7;
  286. // Add/subtract
  287. const __m128i q0 = ADD_EPI16(in0, in7);
  288. const __m128i q1 = ADD_EPI16(in1, in6);
  289. const __m128i q2 = ADD_EPI16(in2, in5);
  290. const __m128i q3 = ADD_EPI16(in3, in4);
  291. const __m128i q4 = SUB_EPI16(in3, in4);
  292. const __m128i q5 = SUB_EPI16(in2, in5);
  293. const __m128i q6 = SUB_EPI16(in1, in6);
  294. const __m128i q7 = SUB_EPI16(in0, in7);
  295. #if DCT_HIGH_BIT_DEPTH
  // NOTE(review): q0..q7 are only checked on the second pass; presumably the
  // pre-scaled first-pass inputs are assumed too small to overflow here -
  // confirm.
  296. if (pass == 1) {
  297. overflow =
  298. check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
  299. if (overflow) {
  300. vpx_highbd_fdct8x8_c(input, output, stride);
  301. return;
  302. }
  303. }
  304. #endif // DCT_HIGH_BIT_DEPTH
  305. // Work on first four results
  // (produces the even-indexed outputs res0, res2, res4 and res6)
  306. {
  307. // Add/subtract
  308. const __m128i r0 = ADD_EPI16(q0, q3);
  309. const __m128i r1 = ADD_EPI16(q1, q2);
  310. const __m128i r2 = SUB_EPI16(q1, q2);
  311. const __m128i r3 = SUB_EPI16(q0, q3);
  312. #if DCT_HIGH_BIT_DEPTH
  313. overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
  314. if (overflow) {
  315. vpx_highbd_fdct8x8_c(input, output, stride);
  316. return;
  317. }
  318. #endif // DCT_HIGH_BIT_DEPTH
  319. // Interleave to do the multiply by constants which gets us into 32bits
  320. {
  321. const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
  322. const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
  323. const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
  324. const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
  325. const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
  326. const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
  327. const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
  328. const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
  329. const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
  330. const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
  331. const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
  332. const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
  333. // dct_const_round_shift
  334. const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  335. const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  336. const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  337. const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  338. const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  339. const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  340. const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  341. const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
  342. const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  343. const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  344. const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  345. const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  346. const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  347. const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  348. const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  349. const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  350. // Combine
  351. res0 = _mm_packs_epi32(w0, w1);
  352. res4 = _mm_packs_epi32(w2, w3);
  353. res2 = _mm_packs_epi32(w4, w5);
  354. res6 = _mm_packs_epi32(w6, w7);
  355. #if DCT_HIGH_BIT_DEPTH
  356. overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
  357. if (overflow) {
  358. vpx_highbd_fdct8x8_c(input, output, stride);
  359. return;
  360. }
  361. #endif // DCT_HIGH_BIT_DEPTH
  362. }
  363. }
  364. // Work on next four results
  // (produces the odd-indexed outputs res1, res3, res5 and res7)
  365. {
  366. // Interleave to do the multiply by constants which gets us into 32bits
  367. const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
  368. const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
  369. const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
  370. const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
  371. const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
  372. const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
  373. // dct_const_round_shift
  374. const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
  375. const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
  376. const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
  377. const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
  378. const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
  379. const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
  380. const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
  381. const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
  382. // Combine
  383. const __m128i r0 = _mm_packs_epi32(s0, s1);
  384. const __m128i r1 = _mm_packs_epi32(s2, s3);
  385. #if DCT_HIGH_BIT_DEPTH
  386. overflow = check_epi16_overflow_x2(&r0, &r1);
  387. if (overflow) {
  388. vpx_highbd_fdct8x8_c(input, output, stride);
  389. return;
  390. }
  391. #endif // DCT_HIGH_BIT_DEPTH
  392. {
  393. // Add/subtract
  394. const __m128i x0 = ADD_EPI16(q4, r0);
  395. const __m128i x1 = SUB_EPI16(q4, r0);
  396. const __m128i x2 = SUB_EPI16(q7, r1);
  397. const __m128i x3 = ADD_EPI16(q7, r1);
  398. #if DCT_HIGH_BIT_DEPTH
  399. overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
  400. if (overflow) {
  401. vpx_highbd_fdct8x8_c(input, output, stride);
  402. return;
  403. }
  404. #endif // DCT_HIGH_BIT_DEPTH
  405. // Interleave to do the multiply by constants which gets us into 32bits
  406. {
  407. const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
  408. const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
  409. const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
  410. const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
  411. const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
  412. const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
  413. const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
  414. const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
  415. const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
  416. const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
  417. const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
  418. const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
  419. // dct_const_round_shift
  420. const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  421. const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  422. const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  423. const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  424. const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  425. const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  426. const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  427. const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
  428. const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  429. const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  430. const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  431. const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  432. const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  433. const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  434. const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  435. const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  436. // Combine
  437. res1 = _mm_packs_epi32(w0, w1);
  438. res7 = _mm_packs_epi32(w2, w3);
  439. res5 = _mm_packs_epi32(w4, w5);
  440. res3 = _mm_packs_epi32(w6, w7);
  441. #if DCT_HIGH_BIT_DEPTH
  442. overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
  443. if (overflow) {
  444. vpx_highbd_fdct8x8_c(input, output, stride);
  445. return;
  446. }
  447. #endif // DCT_HIGH_BIT_DEPTH
  448. }
  449. }
  450. }
  451. // Transpose the 8x8.
  // In the layout comments below, "rc" is element c of resr.
  452. {
  453. // 00 01 02 03 04 05 06 07
  454. // 10 11 12 13 14 15 16 17
  455. // 20 21 22 23 24 25 26 27
  456. // 30 31 32 33 34 35 36 37
  457. // 40 41 42 43 44 45 46 47
  458. // 50 51 52 53 54 55 56 57
  459. // 60 61 62 63 64 65 66 67
  460. // 70 71 72 73 74 75 76 77
  461. const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
  462. const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
  463. const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
  464. const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
  465. const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
  466. const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
  467. const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
  468. const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
  469. // 00 10 01 11 02 12 03 13
  470. // 20 30 21 31 22 32 23 33
  471. // 04 14 05 15 06 16 07 17
  472. // 24 34 25 35 26 36 27 37
  473. // 40 50 41 51 42 52 43 53
  474. // 60 70 61 71 62 72 63 73
  475. // 44 54 45 55 46 56 47 57
  476. // 64 74 65 75 66 76 67 77
  477. const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  478. const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  479. const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  480. const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  481. const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  482. const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  483. const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  484. const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // Rows below are listed in the order the 64-bit unpacks consume them:
  // tr1_0, tr1_4, tr1_2, tr1_6, tr1_1, tr1_5, tr1_3, tr1_7.
  485. // 00 10 20 30 01 11 21 31
  486. // 40 50 60 70 41 51 61 71
  487. // 02 12 22 32 03 13 23 33
  488. // 42 52 62 72 43 53 63 73
  489. // 04 14 24 34 05 15 25 35
  490. // 44 54 64 74 45 55 65 75
  491. // 06 16 26 36 07 17 27 37
  492. // 46 56 66 76 47 57 67 77
  493. in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
  494. in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
  495. in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
  496. in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
  497. in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
  498. in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
  499. in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
  500. in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
  501. // 00 10 20 30 40 50 60 70
  502. // 01 11 21 31 41 51 61 71
  503. // 02 12 22 32 42 52 62 72
  504. // 03 13 23 33 43 53 63 73
  505. // 04 14 24 34 44 54 64 74
  506. // 05 15 25 35 45 55 65 75
  507. // 06 16 26 36 46 56 66 76
  508. // 07 17 27 37 47 57 67 77
  509. }
  510. }
  511. // Post-condition output and store it
  512. {
  513. // Post-condition (division by two)
  514. // division of a signed 16-bit number by two using shifts, rounding
  515. // toward zero: n / 2 = (n - (n >> 15)) >> 1
  // (n >> 15 is -1 for negative n and 0 otherwise, so negative values get
  // +1 before the arithmetic shift.)
  516. const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
  517. const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
  518. const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
  519. const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
  520. const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
  521. const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
  522. const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
  523. const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
  524. in0 = _mm_sub_epi16(in0, sign_in0);
  525. in1 = _mm_sub_epi16(in1, sign_in1);
  526. in2 = _mm_sub_epi16(in2, sign_in2);
  527. in3 = _mm_sub_epi16(in3, sign_in3);
  528. in4 = _mm_sub_epi16(in4, sign_in4);
  529. in5 = _mm_sub_epi16(in5, sign_in5);
  530. in6 = _mm_sub_epi16(in6, sign_in6);
  531. in7 = _mm_sub_epi16(in7, sign_in7);
  532. in0 = _mm_srai_epi16(in0, 1);
  533. in1 = _mm_srai_epi16(in1, 1);
  534. in2 = _mm_srai_epi16(in2, 1);
  535. in3 = _mm_srai_epi16(in3, 1);
  536. in4 = _mm_srai_epi16(in4, 1);
  537. in5 = _mm_srai_epi16(in5, 1);
  538. in6 = _mm_srai_epi16(in6, 1);
  539. in7 = _mm_srai_epi16(in7, 1);
  540. // store results
  // NOTE(review): store_output is defined elsewhere; confirm whether it
  // requires output to be 16-byte aligned.
  541. store_output(&in0, (output + 0 * 8));
  542. store_output(&in1, (output + 1 * 8));
  543. store_output(&in2, (output + 2 * 8));
  544. store_output(&in3, (output + 3 * 8));
  545. store_output(&in4, (output + 4 * 8));
  546. store_output(&in5, (output + 5 * 8));
  547. store_output(&in6, (output + 6 * 8));
  548. store_output(&in7, (output + 7 * 8));
  549. }
  550. }
  551. void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
  552. // The 2D transform is done with two passes which are actually pretty
  553. // similar. In the first one, we transform the columns and transpose
  554. // the results. In the second one, we transform the rows. To achieve that,
  555. // as the first pass results are transposed, we transpose the columns (that
  556. // is the transposed rows) and transpose the results (so that it goes back
  557. // in normal/row positions).
  558. int pass;
  559. // We need an intermediate buffer between passes.
  560. DECLARE_ALIGNED(16, int16_t, intermediate[256]);
  561. const int16_t *in = input;
  562. int16_t *out0 = intermediate;
  563. tran_low_t *out1 = output;
  564. // Constants
  565. // When we use them, in one case, they are all the same. In all others
  566. // it's a pair of them that we need to repeat four times. This is done
  567. // by constructing the 32 bit constant corresponding to that pair.
  568. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  569. const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  570. const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  571. const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  572. const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  573. const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  574. const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  575. const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  576. const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  577. const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  578. const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  579. const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  580. const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  581. const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  582. const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  583. const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  584. const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  585. const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  586. const __m128i kOne = _mm_set1_epi16(1);
  587. // Do the two transform/transpose passes
  588. for (pass = 0; pass < 2; ++pass) {
  589. // We process eight columns (transposed rows in second pass) at a time.
  590. int column_start;
  591. #if DCT_HIGH_BIT_DEPTH
  592. int overflow;
  593. #endif
  594. for (column_start = 0; column_start < 16; column_start += 8) {
  595. __m128i in00, in01, in02, in03, in04, in05, in06, in07;
  596. __m128i in08, in09, in10, in11, in12, in13, in14, in15;
  597. __m128i input0, input1, input2, input3, input4, input5, input6, input7;
  598. __m128i step1_0, step1_1, step1_2, step1_3;
  599. __m128i step1_4, step1_5, step1_6, step1_7;
  600. __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
  601. __m128i step3_0, step3_1, step3_2, step3_3;
  602. __m128i step3_4, step3_5, step3_6, step3_7;
  603. __m128i res00, res01, res02, res03, res04, res05, res06, res07;
  604. __m128i res08, res09, res10, res11, res12, res13, res14, res15;
  605. // Load and pre-condition input.
  606. if (0 == pass) {
  607. in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
  608. in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
  609. in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
  610. in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
  611. in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
  612. in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
  613. in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
  614. in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
  615. in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
  616. in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
  617. in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
  618. in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
  619. in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
  620. in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
  621. in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
  622. in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
  623. // x = x << 2
  624. in00 = _mm_slli_epi16(in00, 2);
  625. in01 = _mm_slli_epi16(in01, 2);
  626. in02 = _mm_slli_epi16(in02, 2);
  627. in03 = _mm_slli_epi16(in03, 2);
  628. in04 = _mm_slli_epi16(in04, 2);
  629. in05 = _mm_slli_epi16(in05, 2);
  630. in06 = _mm_slli_epi16(in06, 2);
  631. in07 = _mm_slli_epi16(in07, 2);
  632. in08 = _mm_slli_epi16(in08, 2);
  633. in09 = _mm_slli_epi16(in09, 2);
  634. in10 = _mm_slli_epi16(in10, 2);
  635. in11 = _mm_slli_epi16(in11, 2);
  636. in12 = _mm_slli_epi16(in12, 2);
  637. in13 = _mm_slli_epi16(in13, 2);
  638. in14 = _mm_slli_epi16(in14, 2);
  639. in15 = _mm_slli_epi16(in15, 2);
  640. } else {
  641. in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
  642. in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
  643. in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
  644. in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
  645. in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
  646. in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
  647. in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
  648. in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
  649. in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
  650. in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
  651. in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
  652. in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
  653. in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
  654. in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
  655. in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
  656. in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
  657. // x = (x + 1) >> 2
  658. in00 = _mm_add_epi16(in00, kOne);
  659. in01 = _mm_add_epi16(in01, kOne);
  660. in02 = _mm_add_epi16(in02, kOne);
  661. in03 = _mm_add_epi16(in03, kOne);
  662. in04 = _mm_add_epi16(in04, kOne);
  663. in05 = _mm_add_epi16(in05, kOne);
  664. in06 = _mm_add_epi16(in06, kOne);
  665. in07 = _mm_add_epi16(in07, kOne);
  666. in08 = _mm_add_epi16(in08, kOne);
  667. in09 = _mm_add_epi16(in09, kOne);
  668. in10 = _mm_add_epi16(in10, kOne);
  669. in11 = _mm_add_epi16(in11, kOne);
  670. in12 = _mm_add_epi16(in12, kOne);
  671. in13 = _mm_add_epi16(in13, kOne);
  672. in14 = _mm_add_epi16(in14, kOne);
  673. in15 = _mm_add_epi16(in15, kOne);
  674. in00 = _mm_srai_epi16(in00, 2);
  675. in01 = _mm_srai_epi16(in01, 2);
  676. in02 = _mm_srai_epi16(in02, 2);
  677. in03 = _mm_srai_epi16(in03, 2);
  678. in04 = _mm_srai_epi16(in04, 2);
  679. in05 = _mm_srai_epi16(in05, 2);
  680. in06 = _mm_srai_epi16(in06, 2);
  681. in07 = _mm_srai_epi16(in07, 2);
  682. in08 = _mm_srai_epi16(in08, 2);
  683. in09 = _mm_srai_epi16(in09, 2);
  684. in10 = _mm_srai_epi16(in10, 2);
  685. in11 = _mm_srai_epi16(in11, 2);
  686. in12 = _mm_srai_epi16(in12, 2);
  687. in13 = _mm_srai_epi16(in13, 2);
  688. in14 = _mm_srai_epi16(in14, 2);
  689. in15 = _mm_srai_epi16(in15, 2);
  690. }
  691. in += 8;
  692. // Calculate input for the first 8 results.
  693. {
  694. input0 = ADD_EPI16(in00, in15);
  695. input1 = ADD_EPI16(in01, in14);
  696. input2 = ADD_EPI16(in02, in13);
  697. input3 = ADD_EPI16(in03, in12);
  698. input4 = ADD_EPI16(in04, in11);
  699. input5 = ADD_EPI16(in05, in10);
  700. input6 = ADD_EPI16(in06, in09);
  701. input7 = ADD_EPI16(in07, in08);
  702. #if DCT_HIGH_BIT_DEPTH
  703. overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
  704. &input4, &input5, &input6, &input7);
  705. if (overflow) {
  706. vpx_highbd_fdct16x16_c(input, output, stride);
  707. return;
  708. }
  709. #endif // DCT_HIGH_BIT_DEPTH
  710. }
  711. // Calculate input for the next 8 results.
  712. {
  713. step1_0 = SUB_EPI16(in07, in08);
  714. step1_1 = SUB_EPI16(in06, in09);
  715. step1_2 = SUB_EPI16(in05, in10);
  716. step1_3 = SUB_EPI16(in04, in11);
  717. step1_4 = SUB_EPI16(in03, in12);
  718. step1_5 = SUB_EPI16(in02, in13);
  719. step1_6 = SUB_EPI16(in01, in14);
  720. step1_7 = SUB_EPI16(in00, in15);
  721. #if DCT_HIGH_BIT_DEPTH
  722. overflow =
  723. check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
  724. &step1_4, &step1_5, &step1_6, &step1_7);
  725. if (overflow) {
  726. vpx_highbd_fdct16x16_c(input, output, stride);
  727. return;
  728. }
  729. #endif // DCT_HIGH_BIT_DEPTH
  730. }
  731. // Work on the first eight values; fdct8(input, even_results);
  732. {
  733. // Add/subtract
  734. const __m128i q0 = ADD_EPI16(input0, input7);
  735. const __m128i q1 = ADD_EPI16(input1, input6);
  736. const __m128i q2 = ADD_EPI16(input2, input5);
  737. const __m128i q3 = ADD_EPI16(input3, input4);
  738. const __m128i q4 = SUB_EPI16(input3, input4);
  739. const __m128i q5 = SUB_EPI16(input2, input5);
  740. const __m128i q6 = SUB_EPI16(input1, input6);
  741. const __m128i q7 = SUB_EPI16(input0, input7);
  742. #if DCT_HIGH_BIT_DEPTH
  743. overflow =
  744. check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
  745. if (overflow) {
  746. vpx_highbd_fdct16x16_c(input, output, stride);
  747. return;
  748. }
  749. #endif // DCT_HIGH_BIT_DEPTH
  750. // Work on first four results
  751. {
  752. // Add/subtract
  753. const __m128i r0 = ADD_EPI16(q0, q3);
  754. const __m128i r1 = ADD_EPI16(q1, q2);
  755. const __m128i r2 = SUB_EPI16(q1, q2);
  756. const __m128i r3 = SUB_EPI16(q0, q3);
  757. #if DCT_HIGH_BIT_DEPTH
  758. overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
  759. if (overflow) {
  760. vpx_highbd_fdct16x16_c(input, output, stride);
  761. return;
  762. }
  763. #endif // DCT_HIGH_BIT_DEPTH
  764. // Interleave to do the multiply by constants which gets us
  765. // into 32 bits.
  766. {
  767. const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
  768. const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
  769. const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
  770. const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
  771. res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
  772. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  773. res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
  774. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  775. res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
  776. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  777. res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
  778. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  779. #if DCT_HIGH_BIT_DEPTH
  780. overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
  781. if (overflow) {
  782. vpx_highbd_fdct16x16_c(input, output, stride);
  783. return;
  784. }
  785. #endif // DCT_HIGH_BIT_DEPTH
  786. }
  787. }
  788. // Work on next four results
  789. {
  790. // Interleave to do the multiply by constants which gets us
  791. // into 32 bits.
  792. const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
  793. const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
  794. const __m128i r0 =
  795. mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
  796. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  797. const __m128i r1 =
  798. mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
  799. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  800. #if DCT_HIGH_BIT_DEPTH
  801. overflow = check_epi16_overflow_x2(&r0, &r1);
  802. if (overflow) {
  803. vpx_highbd_fdct16x16_c(input, output, stride);
  804. return;
  805. }
  806. #endif // DCT_HIGH_BIT_DEPTH
  807. {
  808. // Add/subtract
  809. const __m128i x0 = ADD_EPI16(q4, r0);
  810. const __m128i x1 = SUB_EPI16(q4, r0);
  811. const __m128i x2 = SUB_EPI16(q7, r1);
  812. const __m128i x3 = ADD_EPI16(q7, r1);
  813. #if DCT_HIGH_BIT_DEPTH
  814. overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
  815. if (overflow) {
  816. vpx_highbd_fdct16x16_c(input, output, stride);
  817. return;
  818. }
  819. #endif // DCT_HIGH_BIT_DEPTH
  820. // Interleave to do the multiply by constants which gets us
  821. // into 32 bits.
  822. {
  823. const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
  824. const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
  825. const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
  826. const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
  827. res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
  828. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  829. res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
  830. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  831. res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
  832. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  833. res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
  834. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  835. #if DCT_HIGH_BIT_DEPTH
  836. overflow =
  837. check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
  838. if (overflow) {
  839. vpx_highbd_fdct16x16_c(input, output, stride);
  840. return;
  841. }
  842. #endif // DCT_HIGH_BIT_DEPTH
  843. }
  844. }
  845. }
  846. }
  847. // Work on the next eight values; step1 -> odd_results
  848. {
  849. // step 2
  850. {
  851. const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
  852. const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
  853. const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
  854. const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
  855. step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
  856. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  857. step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
  858. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  859. step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
  860. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  861. step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
  862. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  863. #if DCT_HIGH_BIT_DEPTH
  864. overflow =
  865. check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
  866. if (overflow) {
  867. vpx_highbd_fdct16x16_c(input, output, stride);
  868. return;
  869. }
  870. #endif // DCT_HIGH_BIT_DEPTH
  871. }
  872. // step 3
  873. {
  874. step3_0 = ADD_EPI16(step1_0, step2_3);
  875. step3_1 = ADD_EPI16(step1_1, step2_2);
  876. step3_2 = SUB_EPI16(step1_1, step2_2);
  877. step3_3 = SUB_EPI16(step1_0, step2_3);
  878. step3_4 = SUB_EPI16(step1_7, step2_4);
  879. step3_5 = SUB_EPI16(step1_6, step2_5);
  880. step3_6 = ADD_EPI16(step1_6, step2_5);
  881. step3_7 = ADD_EPI16(step1_7, step2_4);
  882. #if DCT_HIGH_BIT_DEPTH
  883. overflow =
  884. check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
  885. &step3_4, &step3_5, &step3_6, &step3_7);
  886. if (overflow) {
  887. vpx_highbd_fdct16x16_c(input, output, stride);
  888. return;
  889. }
  890. #endif // DCT_HIGH_BIT_DEPTH
  891. }
  892. // step 4
  893. {
  894. const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
  895. const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
  896. const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
  897. const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
  898. step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
  899. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  900. step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
  901. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  902. step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
  903. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  904. step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
  905. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  906. #if DCT_HIGH_BIT_DEPTH
  907. overflow =
  908. check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
  909. if (overflow) {
  910. vpx_highbd_fdct16x16_c(input, output, stride);
  911. return;
  912. }
  913. #endif // DCT_HIGH_BIT_DEPTH
  914. }
  915. // step 5
  916. {
  917. step1_0 = ADD_EPI16(step3_0, step2_1);
  918. step1_1 = SUB_EPI16(step3_0, step2_1);
  919. step1_2 = ADD_EPI16(step3_3, step2_2);
  920. step1_3 = SUB_EPI16(step3_3, step2_2);
  921. step1_4 = SUB_EPI16(step3_4, step2_5);
  922. step1_5 = ADD_EPI16(step3_4, step2_5);
  923. step1_6 = SUB_EPI16(step3_7, step2_6);
  924. step1_7 = ADD_EPI16(step3_7, step2_6);
  925. #if DCT_HIGH_BIT_DEPTH
  926. overflow =
  927. check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
  928. &step1_4, &step1_5, &step1_6, &step1_7);
  929. if (overflow) {
  930. vpx_highbd_fdct16x16_c(input, output, stride);
  931. return;
  932. }
  933. #endif // DCT_HIGH_BIT_DEPTH
  934. }
  935. // step 6
  936. {
  937. const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
  938. const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
  939. const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
  940. const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
  941. res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
  942. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  943. res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
  944. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  945. res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
  946. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  947. res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
  948. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  949. #if DCT_HIGH_BIT_DEPTH
  950. overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
  951. if (overflow) {
  952. vpx_highbd_fdct16x16_c(input, output, stride);
  953. return;
  954. }
  955. #endif // DCT_HIGH_BIT_DEPTH
  956. }
  957. {
  958. const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
  959. const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
  960. const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
  961. const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
  962. res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
  963. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  964. res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
  965. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  966. res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
  967. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  968. res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
  969. &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
  970. #if DCT_HIGH_BIT_DEPTH
  971. overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
  972. if (overflow) {
  973. vpx_highbd_fdct16x16_c(input, output, stride);
  974. return;
  975. }
  976. #endif // DCT_HIGH_BIT_DEPTH
  977. }
  978. }
  979. // Transpose the results, do it as two 8x8 transposes.
  980. transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
  981. &res06, &res07, pass, out0, out1);
  982. transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
  983. &res14, &res15, pass, out0 + 8, out1 + 8);
  984. if (pass == 0) {
  985. out0 += 8 * 16;
  986. } else {
  987. out1 += 8 * 16;
  988. }
  989. }
  990. // Setup in/out for next pass.
  991. in = intermediate;
  992. }
  993. }
  994. #undef ADD_EPI16
  995. #undef SUB_EPI16