inv_txfm_msa.h 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
  11. #define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
  12. #include "vpx_dsp/mips/macros_msa.h"
  13. #include "vpx_dsp/mips/txfm_macros_msa.h"
  14. #include "vpx_dsp/txfm_common.h"
  /*
   * VP9_ADST8(in0..in7, out0..out7)
   *
   * Statement macro: expands to a brace block that declares its own *_m
   * temporaries and writes the eight results into out0..out7.
   * NOTE(review): judging by the name and the cospi_* tables this is one
   * 8-point inverse ADST pass on v8i16 vectors - confirm against the scalar
   * reference implementation.
   *
   * Caller-visible effects that can be read off the body:
   *   - in0..in7 are also passed in the trailing (apparently output) argument
   *     positions of DOT_ADD_SUB_SRARI_PCK and BUTTERFLY_4, so they should be
   *     treated as clobbered after the macro - TODO confirm against those
   *     helpers' definitions.
   *   - out0 and out7 are assigned from the first butterfly (out7 negated).
   *   - out1, out3 and out5 are negated in place by the last three statements,
   *     so every out argument must be an assignable lvalue.
   *
   * The helper macros used here (SPLATI_*, ILVEV_*, ILVRL_*,
   * DOT_ADD_SUB_SRARI_PCK, DOT_SHIFT_RIGHT_PCK_H, BUTTERFLY_4) are defined in
   * the included MSA macro headers, not in this file.
   */
  15. #define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
  16. out3, out4, out5, out6, out7) \
  17. { \
  18. v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
  19. v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
  20. v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
  21. cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
  22. v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
  23. cospi_24_64, -cospi_24_64, 0, 0 }; \
  24. \
  25. SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
  26. cnst2_m = -cnst0_m; \
  27. ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
  28. SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
  29. cnst4_m = -cnst2_m; \
  30. ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
  31. \
  32. ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
  33. ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
  34. DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
  35. cnst2_m, cnst3_m, in7, in0, in4, in3); \
  36. \
  37. SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
  38. cnst2_m = -cnst0_m; \
  39. ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
  40. SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
  41. cnst4_m = -cnst2_m; \
  42. ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
  43. \
  44. ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
  45. ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
  46. \
  47. DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
  48. cnst2_m, cnst3_m, in5, in2, in6, in1); \
  49. BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
  50. out7 = -s0_m; \
  51. out0 = s1_m; \
  52. \
  53. SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
  54. \
  55. ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
  56. cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
  57. cnst1_m = cnst0_m; \
  58. \
  59. ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
  60. ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
  61. DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
  62. cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
  63. \
  64. SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
  65. cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
  66. \
  67. ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
  68. ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
  69. out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
  70. out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
  71. out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
  72. out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
  73. \
  74. out1 = -out1; \
  75. out3 = -out3; \
  76. out5 = -out5; \
  77. }
  /*
   * VP9_SET_COSPI_PAIR(c0_h, c1_h)
   *
   * Expression macro (GNU statement expression) that yields a v8i16 built
   * from two scalar constants: each constant is broadcast with __msa_fill_h
   * and the two broadcasts are combined with __msa_ilvev_h(r1_m, r0_m).
   * NOTE(review): by the MSA definition of ilvev.h the result should hold
   * c0_h in the even halfword lanes and c1_h in the odd lanes, i.e. the
   * pattern { c0, c1, c0, c1, ... } - confirm against the MSA manual.
   *
   * Each argument is expanded exactly once.
   */
  78. #define VP9_SET_COSPI_PAIR(c0_h, c1_h) \
  79. ({ \
  80. v8i16 out0_m, r0_m, r1_m; \
  81. \
  82. r0_m = __msa_fill_h(c0_h); \
  83. r1_m = __msa_fill_h(c1_h); \
  84. out0_m = __msa_ilvev_h(r1_m, r0_m); \
  85. \
  86. out0_m; \
  87. })
  /*
   * VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)
   *
   * Statement macro that updates four rows of the destination buffer:
   *   1. loads four rows starting at dst (LD_UB4),
   *   2. interleaves them with a zero vector into halfword vectors
   *      (ILVR_B4_SH) - presumably a zero-extension of the low 8 bytes of
   *      each row; confirm against the helper's definition,
   *   3. adds in0..in3 to those rows (ADD4),
   *   4. clamps with CLIP_SH4_0_255, packs with PCKEV_B2_SB and writes the
   *      result back through ST8x4_UB.
   *
   * dst is evaluated once (it is captured in dst_m after a cast to
   * uint8_t *); dst_stride appears in both the load and the store, so pass an
   * expression without side effects. in0..in3 are only read.
   */
  88. #define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
  89. { \
  90. uint8_t *dst_m = (uint8_t *)(dst); \
  91. v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
  92. v16i8 tmp0_m, tmp1_m; \
  93. v16i8 zero_m = { 0 }; \
  94. v8i16 res0_m, res1_m, res2_m, res3_m; \
  95. \
  96. LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
  97. ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
  98. res0_m, res1_m, res2_m, res3_m); \
  99. ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
  100. res2_m, res3_m); \
  101. CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
  102. PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
  103. ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
  104. }
  /*
   * VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)
   *
   * Statement macro: reads in0..in3 and writes out0..out3 through the final
   * BUTTERFLY_4. The inputs are only read.
   * NOTE(review): the name suggests one 1-D pass of the 4x4 inverse DCT -
   * confirm against the scalar reference.
   *
   * Steps visible in the body:
   *   - in0/in2 are interleaved with __msa_ilvr_h and dotted against the
   *     (cospi_16_64, +/-cospi_16_64) pairs;
   *   - in1/in3 are interleaved the same way and dotted against the
   *     cospi_24_64 / cospi_8_64 pairs;
   *   - the four 32-bit products are rounded with SRARI_W4_SW by
   *     DCT_CONST_BITS, packed to halfwords, shifted with SLDI_B2_0_SW and
   *     combined by BUTTERFLY_4.
   * Only __msa_ilvr_h is used on the inputs, so presumably only the right
   * (low) four halfwords of each input contribute - TODO confirm.
   */
  105. #define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
  106. { \
  107. v8i16 c0_m, c1_m, c2_m, c3_m; \
  108. v8i16 step0_m, step1_m; \
  109. v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  110. \
  111. c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
  112. c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
  113. step0_m = __msa_ilvr_h(in2, in0); \
  114. DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
  115. \
  116. c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
  117. c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
  118. step1_m = __msa_ilvr_h(in3, in1); \
  119. DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
  120. SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
  121. \
  122. PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
  123. SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
  124. BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
  125. out0, out1, out2, out3); \
  126. }
  /*
   * VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)
   *
   * Statement macro: reads in0..in3 and writes out0..out3 through the two
   * trailing PCKEV_H2_SH calls. The inputs are only read.
   * NOTE(review): the name and the sinpi_*_9 table suggest one 1-D pass of the
   * 4x4 inverse ADST - confirm against the scalar reference.
   *
   * Four 32-bit accumulators (int0_m..int3_m) are built from dot products of
   * interleaved inputs with constant pairs splatted out of mask_m, then
   * rounded together with SRARI_W4_SW by DCT_CONST_BITS and packed.
   *
   * Order dependencies worth knowing before editing:
   *   - tmp2_m from the first DOTP_SH2_SW is consumed later when int3_m is
   *     seeded (int3_m = tmp2_m + tmp0_m) and only afterwards reassigned.
   *   - c0_m is re-splatted from mask_m lane 6 a second time after the
   *     ILVL_H2_SH call, which receives c0_m in an apparent output position.
   *   - k1_m..k4_m are set once and reused by later constant constructions.
   */
  127. #define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
  128. { \
  129. v8i16 res0_m, res1_m, c0_m, c1_m; \
  130. v8i16 k1_m, k2_m, k3_m, k4_m; \
  131. v8i16 zero_m = { 0 }; \
  132. v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  133. v4i32 int0_m, int1_m, int2_m, int3_m; \
  134. v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
  135. -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
  136. \
  137. SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
  138. ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
  139. ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
  140. DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
  141. int0_m = tmp2_m + tmp1_m; \
  142. \
  143. SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
  144. ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
  145. DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
  146. int1_m = tmp0_m + tmp1_m; \
  147. \
  148. c0_m = __msa_splati_h(mask_m, 6); \
  149. ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
  150. ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
  151. DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
  152. int2_m = tmp0_m + tmp1_m; \
  153. \
  154. c0_m = __msa_splati_h(mask_m, 6); \
  155. c0_m = __msa_ilvev_h(c0_m, k1_m); \
  156. \
  157. res0_m = __msa_ilvr_h((in1), (in3)); \
  158. tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
  159. int3_m = tmp2_m + tmp0_m; \
  160. \
  161. res0_m = __msa_ilvr_h((in2), (in3)); \
  162. c1_m = __msa_ilvev_h(k4_m, k3_m); \
  163. \
  164. tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
  165. res1_m = __msa_ilvr_h((in0), (in2)); \
  166. c1_m = __msa_ilvev_h(k1_m, zero_m); \
  167. \
  168. tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
  169. int3_m += tmp2_m; \
  170. int3_m += tmp3_m; \
  171. \
  172. SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
  173. PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
  174. PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
  175. }
  /*
   * VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)
   *
   * Expression macro (GNU statement expression) that yields a v8i16 built
   * from two lanes of the constant vector mask_h: SPLATI_H2_SH extracts lanes
   * idx1_h and idx2_h into c0_m and c1_m, which are then combined with
   * __msa_ilvev_h(c1_m, c0_m).
   * NOTE(review): assuming SPLATI_H2_SH broadcasts the selected lane, the
   * result should be { mask[idx1], mask[idx2], mask[idx1], mask[idx2], ... }
   * - confirm against the helper's definition. The indices are presumably
   * required to be compile-time constants by the underlying intrinsic.
   */
  176. #define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
  177. ({ \
  178. v8i16 c0_m, c1_m; \
  179. \
  180. SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
  181. c0_m = __msa_ilvev_h(c1_m, c0_m); \
  182. \
  183. c0_m; \
  184. })
  185. /* multiply and add macro */
  /*
   * VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,
   *          out0, out1, out2, out3)
   *
   * Statement macro computing four rounded dot-product results:
   *   - inp1/inp0 are interleaved (ILVRL_H2_SH) and dotted against cst0 and
   *     cst1, giving out0 and out1;
   *   - inp3/inp2 are interleaved and dotted against cst2 and cst3, giving
   *     out2 and out3.
   * Each 32-bit product is rounded with SRARI_W4_SW by DCT_CONST_BITS before
   * being packed back to halfwords with PCKEV_H2_SH.
   *
   * Both interleaves are performed before any output is written, so the
   * outputs may name the same variables as the inputs (VP9_IDCT8x8_1D in this
   * file calls it that way).
   */
  186. #define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
  187. out2, out3) \
  188. { \
  189. v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
  190. v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \
  191. \
  192. ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
  193. ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
  194. DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
  195. cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
  196. SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
  197. PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \
  198. DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
  199. cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
  200. SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
  201. PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \
  202. }
  203. /* idct 8x8 macro */
  /*
   * VP9_IDCT8x8_1D(in0..in7, out0..out7)
   *
   * Statement macro: writes out0..out7 through the final BUTTERFLY_8.
   * NOTE(review): per the name this is one 1-D pass of the 8x8 inverse DCT -
   * confirm against the scalar reference.
   *
   * WARNING: in0..in7 are clobbered. Both VP9_MADD calls pass the same
   * variables as inputs and outputs (odd inputs in1, in7, in3, in5 first,
   * even inputs in0, in4, in2, in6 second), and the BUTTERFLY_4 that follows
   * reads the rewritten even inputs.
   *
   * Outline of the body:
   *   - odd half: VP9_MADD on in1/in7/in3/in5 with constant pairs taken from
   *     mask_m, then SUB2 plus a cospi_16_64 dot product give tp5_m/tp6_m,
   *     while tp4_m and tp7_m are plain sums;
   *   - even half: VP9_MADD on in0/in4/in2/in6 followed by BUTTERFLY_4 gives
   *     tp0_m..tp3_m;
   *   - BUTTERFLY_8 merges tp0_m..tp7_m into the outputs.
   */
  204. #define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
  205. out2, out3, out4, out5, out6, out7) \
  206. { \
  207. v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
  208. v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
  209. v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  210. v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
  211. cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
  212. \
  213. k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
  214. k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
  215. k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
  216. k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
  217. VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
  218. SUB2(in1, in3, in7, in5, res0_m, res1_m); \
  219. k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
  220. k1_m = __msa_splati_h(mask_m, 4); \
  221. \
  222. ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
  223. DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
  224. tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
  225. SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
  226. tp4_m = in1 + in3; \
  227. PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
  228. tp7_m = in7 + in5; \
  229. k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
  230. k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
  231. VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
  232. BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
  233. BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
  234. out1, out2, out3, out4, out5, out6, out7); \
  235. }
  /*
   * VP9_IADST8x8_1D(in0..in7, out0..out7)
   *
   * Statement macro producing out0..out7 from in0..in7.
   * NOTE(review): per the name this is one 1-D pass of the 8x8 inverse ADST -
   * confirm against the scalar reference.
   *
   * WARNING: several inputs double as scratch. in1, in2, in3, in4, in5 and
   * in7 appear in what look like output positions of BUTTERFLY_4 and
   * PCKEV_H2_SH, and the last four statements derive out1, out3, out5 and
   * out7 by negating in1, in3, in5 and in7. Treat all inputs as clobbered and
   * pass assignable lvalues - TODO confirm the helpers' argument order.
   *
   * Outline of the body:
   *   - input pairs (in1,in0), (in5,in4), (in3,in2), (in7,in6) are interleaved
   *     and dotted against constant pairs from mask1_m/mask2_m; sums and
   *     differences of the 32-bit products are rounded by DCT_CONST_BITS and
   *     packed;
   *   - a BUTTERFLY_4 on the packed sums yields out0 plus intermediates;
   *   - the packed differences go through a second round of dot products
   *     using mask2_m/mask3_m constants, yielding out6 plus intermediates;
   *   - a final round with the cospi_16_64 pairs from mask3_m yields out4 and
   *     out2 plus the values that are negated into out3 and out5.
   * r2_m/r3_m and m0_m..m3_m are reused for several unrelated intermediates,
   * so statement order matters.
   */
  236. #define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
  237. out2, out3, out4, out5, out6, out7) \
  238. { \
  239. v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
  240. v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
  241. v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
  242. v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
  243. cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
  244. v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
  245. -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
  246. v8i16 mask3_m = { \
  247. -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
  248. }; \
  249. \
  250. k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
  251. k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
  252. ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
  253. DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
  254. r1_m, r2_m, r3_m); \
  255. k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
  256. k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
  257. ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
  258. DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
  259. r5_m, r6_m, r7_m); \
  260. ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
  261. m3_m); \
  262. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  263. PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
  264. SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
  265. m3_m); \
  266. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  267. PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
  268. k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
  269. k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
  270. ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
  271. DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
  272. r1_m, r2_m, r3_m); \
  273. k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
  274. k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
  275. ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
  276. DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
  277. r5_m, r6_m, r7_m); \
  278. ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
  279. m3_m); \
  280. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  281. PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
  282. SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
  283. m3_m); \
  284. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  285. PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
  286. ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
  287. BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
  288. k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
  289. k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
  290. ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
  291. DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
  292. r1_m, r2_m, r3_m); \
  293. k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
  294. DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
  295. r6_m, r7_m); \
  296. ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
  297. m3_m); \
  298. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  299. PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
  300. SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
  301. m3_m); \
  302. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  303. PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
  304. k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
  305. k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
  306. ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
  307. DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
  308. m1_m, m2_m, m3_m); \
  309. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  310. PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
  311. ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
  312. DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
  313. m2_m, m3_m); \
  314. SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
  315. PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
  316. \
  317. out1 = -in1; \
  318. out3 = -in3; \
  319. out5 = -in5; \
  320. out7 = -in7; \
  321. }
  /*
   * VP9_IADST8x16_1D(r0..r15, out0..out15)
   *
   * Statement macro taking sixteen input vectors and producing sixteen output
   * vectors, organised as the four stages marked in the body.
   * NOTE(review): per the name this is one 1-D pass of a 16-point inverse
   * ADST applied to 8-lane vectors - confirm against the scalar reference.
   *
   *   stage 1: four MADD_BF calls combine input pairs with odd-index cospi
   *            pairs (cospi_1_64 .. cospi_31_64) into g0_m..g15_m.
   *   stage 2: two MADD_BF calls on the odd g terms give h0_m..h7_m; a
   *            BUTTERFLY_4 writes out8..out11 and a BUTTERFLY_8 on the even g
   *            terms refills h8_m..h11_m and h6_m, h4_m, h2_m, h0_m.
   *   stage 3: BUTTERFLY_4 writes out0/out1, and two MADD_BF calls with the
   *            cospi_8_64 / cospi_24_64 pairs write out4..out7 and
   *            out12..out15.
   *   stage 4: MADD_SHORT with +/-cospi_16_64 pairs writes out2/out3 and
   *            rewrites out6/out7, out10/out11 and out14/out15 in place.
   *
   * Things to keep in mind when editing:
   *   - h0_m, h2_m, h4_m and h6_m are consumed by the stage 2 BUTTERFLY_4
   *     before the BUTTERFLY_8 on the next line reuses them, so those two
   *     calls must stay in that order.
   *   - Stage 4 reads and writes the same out variables, so all out arguments
   *     must be assignable lvalues that are distinct from each other.
   *   - r0..r15 appear only in the leading argument positions of MADD_BF;
   *     presumably they are read-only - TODO confirm against MADD_BF.
   *   - MADD_BF, MADD_SHORT, BUTTERFLY_4 and BUTTERFLY_8 are defined in the
   *     included headers, not in this file.
   */
  322. #define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
  323. r12, r13, r14, r15, out0, out1, out2, out3, out4, \
  324. out5, out6, out7, out8, out9, out10, out11, out12, \
  325. out13, out14, out15) \
  326. { \
  327. v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
  328. v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
  329. v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
  330. v8i16 h8_m, h9_m, h10_m, h11_m; \
  331. v8i16 k0_m, k1_m, k2_m, k3_m; \
  332. \
  333. /* stage 1 */ \
  334. k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
  335. k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
  336. k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
  337. k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
  338. MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
  339. k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
  340. k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
  341. k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
  342. k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
  343. MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
  344. k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
  345. k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
  346. k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
  347. k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
  348. MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
  349. g11_m); \
  350. k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
  351. k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
  352. k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
  353. k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
  354. MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
  355. g15_m); \
  356. \
  357. /* stage 2 */ \
  358. k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
  359. k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
  360. k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
  361. MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
  362. h3_m); \
  363. k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
  364. k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
  365. k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
  366. MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
  367. h6_m, h7_m); \
  368. BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
  369. BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
  370. h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
  371. \
  372. /* stage 3 */ \
  373. BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
  374. k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
  375. k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
  376. k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
  377. MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
  378. out7); \
  379. MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
  380. out13, out15); \
  381. \
  382. /* stage 4 */ \
  383. k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
  384. k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
  385. k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
  386. k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
  387. MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
  388. MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
  389. MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
  390. MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
  391. }
  /*
   * Prototypes for the 16-point 1-D transform passes; the definitions are not
   * in this header.
   *
   * NOTE(review): the descriptions below are inferred from the names and
   * signatures only - confirm against the implementations.
   *   - *_rows_msa(input, output): presumably the row pass, reading
   *     coefficients from the const input buffer and writing intermediate
   *     results to output.
   *   - *_columns_addblk_msa(input, dst, dst_stride): presumably the column
   *     pass, adding the reconstructed residual into the 8-bit picture buffer
   *     dst whose rows are dst_stride bytes apart. input is not
   *     const-qualified, so the callee may modify it.
   */
  392. void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
  393. int32_t dst_stride);
  394. void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
  395. void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
  396. int32_t dst_stride);
  397. void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
  398. #endif // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_