fwd_txfm_msa.c

/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/fwd_txfm_msa.h"

void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w;

  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
  ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
  ADD2(in0, in2, in4, in6, in0, in4);
  vec_w = __msa_hadd_s_w(in0, in0);
  vec_w += __msa_hadd_s_w(in4, in4);
  out[0] = HADD_SW_S32(vec_w);
  out[1] = 0;
}
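
/* For reference, a scalar sketch of what the vector routine above computes:
 * the DC-only 8x8 forward transform is simply the sum of all 64 residual
 * samples (mirroring vpx_fdct8x8_1_c in vpx_dsp/fwd_txfm.c). The helper
 * below is hypothetical, for illustration only, and is not part of libvpx.
 */
static int32_t fdct8x8_dc_sum_ref(const int16_t *input, int32_t stride) {
  int32_t sum = 0;
  int32_t r, c;

  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) sum += input[r * stride + c];
  }
  return sum; /* equals HADD_SW_S32(vec_w) above */
}
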
#if !CONFIG_VP9_HIGHBITDEPTH
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
  v8i16 coeff2 = {
    -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
  };

  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
          in10, in11, in12, in13, in14, in15);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);

  tmp_ptr += 16;

  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);

  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);

  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);

  /* stp 2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);

  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);

  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);

  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);

  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);

  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);

  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  /* stp 4 */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);
  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);

  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);

  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);

  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}
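
/* The column pass above follows the classic even/odd decomposition of a
 * 16-point DCT: after the inputs are scaled by 4 (the SLLI_4V calls), the
 * ADD4 pairs form the even half e[i] = in[i] + in[15 - i], which goes to
 * FDCT8x16_EVEN, while the SUB4 pairs form the odd half
 * o[i] = in[i] - in[15 - i], which the stp* stages then work through.
 * A scalar sketch of that split (hypothetical helper, illustration only):
 */
static void fdct16_even_odd_split_ref(const int16_t *in, int16_t *even,
                                      int16_t *odd) {
  int32_t i;

  for (i = 0; i < 8; ++i) {
    even[i] = in[i] + in[15 - i]; /* feeds the even 8-point transform */
    odd[i] = in[i] - in[15 - i];  /* feeds the odd stp* stages */
  }
}
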
void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;

  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
  SRA_4V(in0, in1, in2, in3, 2);
  SRA_4V(in4, in5, in6, in7, 2);
  SRA_4V(in8, in9, in10, in11, 2);
  SRA_4V(in12, in13, in14, in15, 2);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2,
               in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}
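
/* Between the two passes, fdct16x8_1d_row rounds the intermediate
 * coefficients: the ADD4(..., 1, ...) calls add 1 to every lane and the
 * SRA_4V(..., 2) calls arithmetic-shift each lane right by 2, i.e. per
 * coefficient (hypothetical helper, for illustration only):
 */
static int16_t fdct16_row_round_ref(int16_t x) {
  return (int16_t)((x + 1) >> 2);
}
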
void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3;

  LD_SH4(input, src_stride, in0, in1, in2, in3);

  /* fdct4 pre-process */
  {
    v8i16 vec, mask;
    v16i8 zero = { 0 };
    v16i8 one = __msa_ldi_b(1);

    mask = (v8i16)__msa_sldi_b(zero, one, 15);
    SLLI_4V(in0, in1, in2, in3, 4);
    vec = __msa_ceqi_h(in0, 0);
    vec = vec ^ 255;
    vec = mask & vec;
    in0 += vec;
  }

  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}
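
/* The masked add in the pre-process block above mirrors the scalar
 * vpx_fdct4x4_c pre-processing: every input sample is scaled by 16
 * (SLLI_4V by 4), and the very first sample additionally gets +1 when it
 * is non-zero. A scalar sketch of the same step (hypothetical helper,
 * illustration only, operating on a flattened 4x4 block):
 */
static void fdct4_preprocess_ref(int16_t *in /* 16 samples, row-major */) {
  int32_t i;

  for (i = 0; i < 16; ++i) in[i] *= 16;
  if (in[0]) ++in[0];
}
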
void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
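
/* The SRLI_AVE_S_4V_H step above performs the final down-scaling of the
 * 8x8 transform. The scalar reference (vpx_fdct8x8_c) divides each output
 * coefficient by 2 with truncation toward zero, which per int16 lane
 * amounts to (hypothetical helper, for illustration only):
 */
static int16_t fdct8x8_final_scale_ref(int16_t x) {
  /* adding the sign bit before the arithmetic shift makes >> 1 behave
   * like C integer division by 2 for negative values as well */
  return (int16_t)((x + (x < 0)) >> 1);
}
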
void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);

  /* column transform */
  for (i = 0; i < 2; ++i) {
    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
  }

  /* row transform */
  for (i = 0; i < 2; ++i) {
    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
  }
}
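
/* Note the slab decomposition above: the column pass runs fdct8x16_1d_column
 * twice, once per 8-column half of the 16x16 block (input + 8 * i), writing
 * into the aligned tmp_buf with a stride of 16 int16_t elements. The row
 * pass then runs fdct16x8_1d_row twice, once per 8-row half; each half is
 * 8 * 16 = 128 int16_t elements, hence the 128 * i offsets. */
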
void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  int sum, i;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w = { 0 };

  for (i = 0; i < 4; ++i) {
    LD_SH2(input, 8, in0, in1);
    input += stride;
    LD_SH2(input, 8, in2, in3);
    input += stride;
    LD_SH2(input, 8, in4, in5);
    input += stride;
    LD_SH2(input, 8, in6, in7);
    input += stride;
    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
    ADD2(in0, in2, in4, in6, in0, in4);
    vec_w += __msa_hadd_s_w(in0, in0);
    vec_w += __msa_hadd_s_w(in4, in4);
  }

  sum = HADD_SW_S32(vec_w);
  out[0] = (int16_t)(sum >> 1);
}
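
/* As with the 8x8 case, a scalar sketch of the DC-only 16x16 transform
 * above (hypothetical helper, mirroring vpx_fdct16x16_1_c): accumulate all
 * 256 residual samples, four rows per loop iteration above, then halve the
 * sum to match the scaling of the full 16x16 transform.
 */
static int16_t fdct16x16_dc_ref(const int16_t *input, int32_t stride) {
  int32_t sum = 0;
  int32_t r, c;

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) sum += input[r * stride + c];
  }
  return (int16_t)(sum >> 1);
}
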
#endif  // !CONFIG_VP9_HIGHBITDEPTH