dct_msa.c 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  /*
   *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
   *
   *  Use of this source code is governed by a BSD-style license
   *  that can be found in the LICENSE file in the root of the source
   *  tree. An additional intellectual property rights grant can be found
   *  in the file PATENTS.  All contributing project authors may
   *  be found in the AUTHORS file in the root of the source tree.
   */
  10. #include "./vp8_rtcd.h"
  11. #include "vp8/common/mips/msa/vp8_macros_msa.h"
  /* Transposes 16-bit elements held in four v8i16 vectors.
   *
   * The low ("right") halves of in0..in3 are interleaved into tp0_m/tp1_m and
   * the high ("left") halves into tp2_m/tp3_m; the even/odd doublewords of
   * those temporaries are then packed into out0..out3.  Assuming the helper
   * macros from vp8_macros_msa.h map onto the like-named MSA instructions, the
   * net effect is a transpose of two 4x4 blocks that sit side by side in the
   * inputs: the low four lanes of outN hold column N of the first block and
   * the high four lanes hold column N of the second block.
   *
   * All inputs are consumed before any output is written, so the outputs may
   * alias the inputs (the callers in this file transpose in place).
   */
  #define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
    {                                                                \
      v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m;                  \
                                                                     \
      ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                    \
      ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m);                         \
      ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                    \
      ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m);                         \
      PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2);           \
      PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3);           \
    }
  /* Builds the two multiplier vectors used by the dot-product steps below.
   *
   * Lanes val0, val1 and val2 of `coeff` are splatted into tmp0_m, const1 and
   * const2 respectively, then interleaved pairwise.  Assuming ILVEV_H2_SH
   * places its first operand of each pair in the even lanes, the results are:
   *   const1 = { coeff[val0], coeff[val1], ... } repeated across the vector
   *   const2 = { coeff[val2], coeff[val0], ... } repeated across the vector
   *
   * const1 and const2 are outputs; `coeff` itself is not modified.
   */
  #define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)   \
    {                                                                \
      v8i16 tmp0_m;                                                  \
                                                                     \
      SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \
      ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2);   \
    }
  /* Evaluates to a v8i16 whose lanes are 1 where the matching lane of `in0`
   * is non-zero and 0 where it is zero (GNU statement expression).
   *
   * The compare-equal result is XORed with 255, which flips its low eight
   * bits; only bit 0 survives the final AND with 1, so this acts as a logical
   * NOT of the "== 0" mask (taking the mask to be all-ones on equality).
   */
  #define RET_1_IF_NZERO_H(in0)      \
    ({                               \
      v8i16 tmp0_m;                  \
      v8i16 one_m = __msa_ldi_h(1);  \
                                     \
      tmp0_m = __msa_ceqi_h(in0, 0); \
      tmp0_m = tmp0_m ^ 255;         \
      tmp0_m = one_m & tmp0_m;       \
                                     \
      tmp0_m;                        \
    })
  /* 32-bit counterpart of the halfword non-zero test: evaluates to a v4i32
   * whose lanes are 1 where the matching lane of `in0` is non-zero and 0
   * where it is zero (GNU statement expression).
   *
   * The compare-equal result is XORed with 255 and ANDed with 1, so only
   * bit 0 of the inverted "== 0" mask is kept.
   */
  #define RET_1_IF_NZERO_W(in0)      \
    ({                               \
      v4i32 tmp0_m;                  \
      v4i32 one_m = __msa_ldi_w(1);  \
                                     \
      tmp0_m = __msa_ceqi_w(in0, 0); \
      tmp0_m = tmp0_m ^ 255;         \
      tmp0_m = one_m & tmp0_m;       \
                                     \
      tmp0_m;                        \
    })
  /* Evaluates to a v4i32 whose lanes are 1 where the matching lane of `in0`
   * is negative (signed compare against 0) and 0 otherwise (GNU statement
   * expression).  The compare mask is reduced to 0/1 by ANDing it with 1.
   */
  #define RET_1_IF_NEG_W(in0)          \
    ({                                 \
      v4i32 tmp0_m;                    \
                                       \
      v4i32 one_m = __msa_ldi_w(1);    \
      tmp0_m = __msa_clti_s_w(in0, 0); \
      tmp0_m = one_m & tmp0_m;         \
                                       \
      tmp0_m;                          \
    })
  62. void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  63. v8i16 in0, in1, in2, in3;
  64. v8i16 temp0, temp1;
  65. v8i16 const0, const1;
  66. v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  67. v4i32 out0, out1, out2, out3;
  68. v8i16 zero = { 0 };
  69. LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  70. TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  71. BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  72. SLLI_4V(temp0, temp1, in1, in3, 3);
  73. in0 = temp0 + temp1;
  74. in2 = temp0 - temp1;
  75. SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  76. temp0 = __msa_ilvr_h(in3, in1);
  77. in1 = __msa_splati_h(coeff, 3);
  78. out0 = (v4i32)__msa_ilvev_h(zero, in1);
  79. coeff = __msa_ilvl_h(zero, coeff);
  80. out1 = __msa_splati_w((v4i32)coeff, 0);
  81. DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  82. out0 >>= 12;
  83. out1 >>= 12;
  84. PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  85. TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  86. BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  87. in0 = temp0 + temp1 + 7;
  88. in2 = temp0 - temp1 + 7;
  89. in0 >>= 4;
  90. in2 >>= 4;
  91. ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  92. temp1 = RET_1_IF_NZERO_H(in3);
  93. ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  94. SPLATI_W2_SW(coeff, 2, out3, out1);
  95. out3 += out1;
  96. out1 = __msa_splati_w((v4i32)coeff, 1);
  97. DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  98. out1 >>= 16;
  99. out3 >>= 16;
  100. out1 += (v4i32)temp1;
  101. PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  102. ST_SH2(in0, in2, output, 8);
  103. }
  104. void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  105. v8i16 in0, in1, in2, in3;
  106. v8i16 temp0, temp1, tmp0, tmp1;
  107. v8i16 const0, const1, const2;
  108. v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  109. v8i16 zero = { 0 };
  110. v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
  111. LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  112. TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
  113. BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  114. SLLI_4V(temp0, temp1, in1, in3, 3);
  115. in0 = temp0 + temp1;
  116. in2 = temp0 - temp1;
  117. SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
  118. temp0 = __msa_splati_h(coeff, 3);
  119. vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
  120. coeff = __msa_ilvl_h(zero, coeff);
  121. vec3_w = __msa_splati_w((v4i32)coeff, 0);
  122. ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  123. vec0_w = vec1_w;
  124. vec2_w = vec3_w;
  125. DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
  126. vec1_w, vec2_w, vec3_w);
  127. SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
  128. PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  129. TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
  130. BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  131. in0 = temp0 + temp1 + 7;
  132. in2 = temp0 - temp1 + 7;
  133. in0 >>= 4;
  134. in2 >>= 4;
  135. SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
  136. vec3_w += vec1_w;
  137. vec1_w = __msa_splati_w((v4i32)coeff, 1);
  138. const0 = RET_1_IF_NZERO_H(in3);
  139. ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  140. vec0_w = vec1_w;
  141. vec2_w = vec3_w;
  142. DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
  143. vec1_w, vec2_w, vec3_w);
  144. SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
  145. PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  146. in1 += const0;
  147. PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
  148. ST_SH2(temp0, temp1, output, 8);
  149. PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
  150. ST_SH2(in0, in2, output + 16, 8);
  151. }
  152. void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  153. v8i16 in0_h, in1_h, in2_h, in3_h;
  154. v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
  155. LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
  156. TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
  157. UNPCK_R_SH_SW(in0_h, in0_w);
  158. UNPCK_R_SH_SW(in1_h, in1_w);
  159. UNPCK_R_SH_SW(in2_h, in2_w);
  160. UNPCK_R_SH_SW(in3_h, in3_w);
  161. BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  162. SLLI_4V(temp0, temp1, temp2, temp3, 2);
  163. BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  164. temp0 = RET_1_IF_NZERO_W(temp0);
  165. in0_w += temp0;
  166. TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
  167. BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  168. BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  169. in0_w += RET_1_IF_NEG_W(in0_w);
  170. in1_w += RET_1_IF_NEG_W(in1_w);
  171. in2_w += RET_1_IF_NEG_W(in2_w);
  172. in3_w += RET_1_IF_NEG_W(in3_w);
  173. ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
  174. SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
  175. PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
  176. ST_SH2(in0_h, in1_h, output, 8);
  177. }