inv_txfm_ssse3.h 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. /*
  2. * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
  11. #define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
  12. #include <tmmintrin.h>
  13. #include "./vpx_dsp_rtcd.h"
  14. #include "vpx_dsp/x86/inv_txfm_sse2.h"
  15. #include "vpx_dsp/x86/transpose_sse2.h"
  16. #include "vpx_dsp/x86/txfm_common_sse2.h"
  // In-place two-pass inverse-DCT kernel for the "8x8_12" case.
  //
  // io: array of eight __m128i vectors of 16-bit lanes (see the io[8] note in
  //     the signature). After the initial 4x4 transpose, pass 1 reads only
  //     io[0] and io[1]. On return io[0]..io[7] hold the pass-2 outputs.
  //
  // The same four-stage butterfly network is evaluated twice:
  //   - pass 1 packs two 4-lane values into each register; a trailing comment
  //     such as "step1 4&7" names the two values held by that register
  //     (NOTE(review): presumably low 64 bits first -- this matches the
  //     unpack/add/sub lane flow below, but depends on how dual_set_epi16
  //     orders its arguments; confirm against its definition);
  //   - pass 2 uses one full register per value.
  // Multiplications use _mm_mulhrs_epi16, which returns the rounded value of
  // (a * b) >> 15 per lane, against constants built from 2 * cospi_*.
  // NOTE(review): presumably cospi_* are 14-bit fixed-point constants, so the
  // doubling turns the >>15 into a rounded >>14 -- confirm where they are
  // defined.
  // NOTE(review): the "_12" in the name presumably means only the first 12
  // coefficients (all inside the top-left 4x4) can be non-zero, which would
  // explain why the odd/high inputs never appear below -- confirm with callers.
  // Nothing in this block applies a final rounding shift or adds the result to
  // pixels; presumably the caller does both -- TODO confirm.
  17. static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
  // Pass-1 constants: each register carries two different multipliers so one
  // multiply produces two packed results (helpers are defined elsewhere).
  18. const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
  19. const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
  20. const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
  // Un-doubled cospi_16_64 pairs, consumed only by idct_calc_wraplow_sse2 in
  // pass 1 stage 3.
  21. const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
  22. const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  // Doubled constants broadcast to all eight lanes. cospi_16_64d is used in
  // both passes; the remaining six are used only in pass 2.
  23. const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
  24. const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
  25. const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
  26. const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
  27. const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
  28. const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
  29. const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
  // step1/step2 hold alternating butterfly stages; tmp is scratch whose
  // meaning changes from stage to stage. In pass 1 only some step entries are
  // used because values travel in pairs.
  30. __m128i step1[8], step2[8], tmp[4];
  31. // pass 1
  // Transpose in place (helper defined elsewhere). The layout comments below
  // describe io[0] and io[1] afterwards.
  32. transpose_16bit_4x4(io, io);
  33. // io[0]: 00 10 20 30 01 11 21 31
  34. // io[1]: 02 12 22 32 03 13 23 33
  35. // stage 1
  // Copy each 64-bit half of io[0]/io[1] into both halves of its own
  // register, so a single multiply by a two-constant register yields two
  // results from the same input.
  36. tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
  37. tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
  38. tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
  39. tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
  40. step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
  41. step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
  42. // stage 2
  43. step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
  44. step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
  45. step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
  46. step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
  // Replicate the upper half of the 5&6 register so value 6 also sits in the
  // low lanes, ready to be interleaved with value 5.
  47. step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
  48. // stage 3
  // Interleave the low four 16-bit lanes of values 6 and 5 into (6,5) pairs.
  // NOTE(review): idct_calc_wraplow_sse2 is defined elsewhere; presumably it
  // multiplies the pairs by (c16,-c16) and (c16,c16), rounds, and packs the
  // two results back into one register -- confirm.
  49. tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
  50. step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
  51. tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
  52. tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
  // Regroup the halves so each register lines up with the odd-part register
  // it is combined with in stage 4 (2&1 with 5&6, 3&0 with 4&7).
  53. step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
  54. step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
  55. // stage 4
  56. tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
  57. tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
  58. tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
  59. tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
  60. // pass 2
  // Transpose the four packed pass-1 results into io (helper defined
  // elsewhere). The stages below read io[0]..io[3] as their inputs.
  61. idct8x8_12_transpose_16bit_4x8(tmp, io);
  62. // stage 1
  63. step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
  64. step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
  65. step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
  66. step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
  67. // stage 2
  // step2[1] is never stored: stage 3 uses step2[0] wherever step2[1] would
  // appear.
  68. step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0]
  69. step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
  70. step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
  71. step2[4] = _mm_add_epi16(step1[4], step1[5]);
  72. step2[5] = _mm_sub_epi16(step1[4], step1[5]);
  73. step2[6] = _mm_sub_epi16(step1[7], step1[6]);
  74. step2[7] = _mm_add_epi16(step1[7], step1[6]);
  75. // stage 3
  76. step1[0] = _mm_add_epi16(step2[0], step2[3]);
  77. step1[1] = _mm_add_epi16(step2[0], step2[2]);
  78. step1[2] = _mm_sub_epi16(step2[0], step2[2]);
  79. step1[3] = _mm_sub_epi16(step2[0], step2[3]);
  // butterfly is defined elsewhere; it writes step1[5] and step1[6] through
  // the two pointers. NOTE(review): presumably the cospi_16_64-scaled
  // difference and sum of step2[6] and step2[5] -- confirm.
  80. butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
  81. // stage 4
  // Final mirror-image combine: io[k] and io[7 - k] are the sum and difference
  // of the same pair of stage-3 values.
  82. io[0] = _mm_add_epi16(step1[0], step2[7]);
  83. io[1] = _mm_add_epi16(step1[1], step1[6]);
  84. io[2] = _mm_add_epi16(step1[2], step1[5]);
  85. io[3] = _mm_add_epi16(step1[3], step2[4]);
  86. io[4] = _mm_sub_epi16(step1[3], step2[4]);
  87. io[5] = _mm_sub_epi16(step1[2], step1[5]);
  88. io[6] = _mm_sub_epi16(step1[1], step1[6]);
  89. io[7] = _mm_sub_epi16(step1[0], step2[7]);
  90. }
  // Declaration only; the definition is not part of this header.
  // in is a pointer to const vectors; out is the writable vector array.
  // NOTE(review): judging by the name, presumably a 32-point inverse DCT over
  // an 8x32 strip for blocks with at most 135 non-zero coefficients, and
  // presumably out receives the result. The number of vectors expected in
  // each array is not visible here -- confirm against the definition.
  91. void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);
  92. #endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_