fwd_txfm_sse2.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <emmintrin.h> // SSE2
  11. #include "./vpx_config.h"
  12. #include "./vpx_dsp_rtcd.h"
  13. #include "vpx_dsp/vpx_dsp_common.h"
  14. #include "vpx_dsp/x86/fwd_txfm_sse2.h"
  15. void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  16. __m128i in0, in1;
  17. __m128i tmp;
  18. const __m128i zero = _mm_setzero_si128();
  19. in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  20. in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  21. in1 = _mm_unpacklo_epi64(
  22. in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  23. in0 = _mm_unpacklo_epi64(
  24. in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
  25. tmp = _mm_add_epi16(in0, in1);
  26. in0 = _mm_unpacklo_epi16(zero, tmp);
  27. in1 = _mm_unpackhi_epi16(zero, tmp);
  28. in0 = _mm_srai_epi32(in0, 16);
  29. in1 = _mm_srai_epi32(in1, 16);
  30. tmp = _mm_add_epi32(in0, in1);
  31. in0 = _mm_unpacklo_epi32(tmp, zero);
  32. in1 = _mm_unpackhi_epi32(tmp, zero);
  33. tmp = _mm_add_epi32(in0, in1);
  34. in0 = _mm_srli_si128(tmp, 8);
  35. in1 = _mm_add_epi32(tmp, in0);
  36. in0 = _mm_slli_epi32(in1, 1);
  37. output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
  38. }
  39. void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  40. __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  41. __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  42. __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  43. __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  44. __m128i u0, u1, sum;
  45. u0 = _mm_add_epi16(in0, in1);
  46. u1 = _mm_add_epi16(in2, in3);
  47. in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  48. in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  49. in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  50. in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  51. sum = _mm_add_epi16(u0, u1);
  52. in0 = _mm_add_epi16(in0, in1);
  53. in2 = _mm_add_epi16(in2, in3);
  54. sum = _mm_add_epi16(sum, in0);
  55. u0 = _mm_setzero_si128();
  56. sum = _mm_add_epi16(sum, in2);
  57. in0 = _mm_unpacklo_epi16(u0, sum);
  58. in1 = _mm_unpackhi_epi16(u0, sum);
  59. in0 = _mm_srai_epi32(in0, 16);
  60. in1 = _mm_srai_epi32(in1, 16);
  61. sum = _mm_add_epi32(in0, in1);
  62. in0 = _mm_unpacklo_epi32(sum, u0);
  63. in1 = _mm_unpackhi_epi32(sum, u0);
  64. sum = _mm_add_epi32(in0, in1);
  65. in0 = _mm_srli_si128(sum, 8);
  66. in1 = _mm_add_epi32(sum, in0);
  67. output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
  68. }
  69. void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
  70. int stride) {
  71. __m128i in0, in1, in2, in3;
  72. __m128i u0, u1;
  73. __m128i sum = _mm_setzero_si128();
  74. int i;
  75. for (i = 0; i < 2; ++i) {
  76. in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
  77. in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
  78. in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
  79. in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
  80. u0 = _mm_add_epi16(in0, in1);
  81. u1 = _mm_add_epi16(in2, in3);
  82. sum = _mm_add_epi16(sum, u0);
  83. in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
  84. in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
  85. in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
  86. in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
  87. sum = _mm_add_epi16(sum, u1);
  88. u0 = _mm_add_epi16(in0, in1);
  89. u1 = _mm_add_epi16(in2, in3);
  90. sum = _mm_add_epi16(sum, u0);
  91. in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
  92. in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
  93. in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
  94. in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
  95. sum = _mm_add_epi16(sum, u1);
  96. u0 = _mm_add_epi16(in0, in1);
  97. u1 = _mm_add_epi16(in2, in3);
  98. sum = _mm_add_epi16(sum, u0);
  99. in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
  100. in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
  101. in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
  102. in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
  103. sum = _mm_add_epi16(sum, u1);
  104. u0 = _mm_add_epi16(in0, in1);
  105. u1 = _mm_add_epi16(in2, in3);
  106. sum = _mm_add_epi16(sum, u0);
  107. sum = _mm_add_epi16(sum, u1);
  108. input += 8 * stride;
  109. }
  110. u0 = _mm_setzero_si128();
  111. in0 = _mm_unpacklo_epi16(u0, sum);
  112. in1 = _mm_unpackhi_epi16(u0, sum);
  113. in0 = _mm_srai_epi32(in0, 16);
  114. in1 = _mm_srai_epi32(in1, 16);
  115. sum = _mm_add_epi32(in0, in1);
  116. in0 = _mm_unpacklo_epi32(sum, u0);
  117. in1 = _mm_unpackhi_epi32(sum, u0);
  118. sum = _mm_add_epi32(in0, in1);
  119. in0 = _mm_srli_si128(sum, 8);
  120. in1 = _mm_add_epi32(sum, in0);
  121. in1 = _mm_srai_epi32(in1, 1);
  122. output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
  123. }
  124. void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
  125. int stride) {
  126. __m128i in0, in1, in2, in3;
  127. __m128i u0, u1;
  128. __m128i sum = _mm_setzero_si128();
  129. int i;
  130. for (i = 0; i < 8; ++i) {
  131. in0 = _mm_load_si128((const __m128i *)(input + 0));
  132. in1 = _mm_load_si128((const __m128i *)(input + 8));
  133. in2 = _mm_load_si128((const __m128i *)(input + 16));
  134. in3 = _mm_load_si128((const __m128i *)(input + 24));
  135. input += stride;
  136. u0 = _mm_add_epi16(in0, in1);
  137. u1 = _mm_add_epi16(in2, in3);
  138. sum = _mm_add_epi16(sum, u0);
  139. in0 = _mm_load_si128((const __m128i *)(input + 0));
  140. in1 = _mm_load_si128((const __m128i *)(input + 8));
  141. in2 = _mm_load_si128((const __m128i *)(input + 16));
  142. in3 = _mm_load_si128((const __m128i *)(input + 24));
  143. input += stride;
  144. sum = _mm_add_epi16(sum, u1);
  145. u0 = _mm_add_epi16(in0, in1);
  146. u1 = _mm_add_epi16(in2, in3);
  147. sum = _mm_add_epi16(sum, u0);
  148. in0 = _mm_load_si128((const __m128i *)(input + 0));
  149. in1 = _mm_load_si128((const __m128i *)(input + 8));
  150. in2 = _mm_load_si128((const __m128i *)(input + 16));
  151. in3 = _mm_load_si128((const __m128i *)(input + 24));
  152. input += stride;
  153. sum = _mm_add_epi16(sum, u1);
  154. u0 = _mm_add_epi16(in0, in1);
  155. u1 = _mm_add_epi16(in2, in3);
  156. sum = _mm_add_epi16(sum, u0);
  157. in0 = _mm_load_si128((const __m128i *)(input + 0));
  158. in1 = _mm_load_si128((const __m128i *)(input + 8));
  159. in2 = _mm_load_si128((const __m128i *)(input + 16));
  160. in3 = _mm_load_si128((const __m128i *)(input + 24));
  161. input += stride;
  162. sum = _mm_add_epi16(sum, u1);
  163. u0 = _mm_add_epi16(in0, in1);
  164. u1 = _mm_add_epi16(in2, in3);
  165. sum = _mm_add_epi16(sum, u0);
  166. sum = _mm_add_epi16(sum, u1);
  167. }
  168. u0 = _mm_setzero_si128();
  169. in0 = _mm_unpacklo_epi16(u0, sum);
  170. in1 = _mm_unpackhi_epi16(u0, sum);
  171. in0 = _mm_srai_epi32(in0, 16);
  172. in1 = _mm_srai_epi32(in1, 16);
  173. sum = _mm_add_epi32(in0, in1);
  174. in0 = _mm_unpacklo_epi32(sum, u0);
  175. in1 = _mm_unpackhi_epi32(sum, u0);
  176. sum = _mm_add_epi32(in0, in1);
  177. in0 = _mm_srli_si128(sum, 8);
  178. in1 = _mm_add_epi32(sum, in0);
  179. in1 = _mm_srai_epi32(in1, 3);
  180. output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
  181. }
// Instantiations with DCT_HIGH_BIT_DEPTH set to 0.
// Each group below defines its configuration macros, includes an
// implementation header, and then undefines the macros again so the next
// group starts clean. The headers are not visible from this file; presumably
// they emit functions named by the FDCT*_2D macros -- confirm against the
// headers themselves.
#define DCT_HIGH_BIT_DEPTH 0
// Names handed to fwd_txfm_impl_sse2.h for the 4x4, 8x8 and 16x16 variants.
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D
// First pass over the 32x32 header: name vpx_fdct32x32_rd_sse2 with
// FDCT32x32_HIGH_PRECISION set to 0.
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
// Second pass over the same header: name vpx_fdct32x32_sse2 with
// FDCT32x32_HIGH_PRECISION set to 1.
#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
// Instantiations with DCT_HIGH_BIT_DEPTH set to 1, compiled only when
// CONFIG_VP9_HIGHBITDEPTH is non-zero. The define / include / undef pattern
// is repeated per group, this time with vpx_highbd_* names. The headers are
// not visible from this file; presumably they emit functions named by the
// FDCT*_2D macros -- confirm against the headers themselves.
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
// Names handed to fwd_txfm_impl_sse2.h for the 4x4, 8x8 and 16x16 variants.
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D
// First pass over the 32x32 header: name vpx_highbd_fdct32x32_rd_sse2 with
// FDCT32x32_HIGH_PRECISION set to 0.
#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
// Second pass over the same header: name vpx_highbd_fdct32x32_sse2 with
// FDCT32x32_HIGH_PRECISION set to 1.
#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif // CONFIG_VP9_HIGHBITDEPTH