/* vpx_convolve_msa.h */
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
  10. #ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
  11. #define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
  12. #include "vpx_dsp/mips/macros_msa.h"
  13. #include "vpx_dsp/vpx_filter.h"
  14. extern const uint8_t mc_filt_mask_arr[16 * 3];
  15. #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
  16. filt3) \
  17. ({ \
  18. v8i16 tmp_dpadd_0, tmp_dpadd_1; \
  19. \
  20. tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
  21. tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
  22. tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
  23. tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
  24. tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
  25. \
  26. tmp_dpadd_0; \
  27. })
  28. #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \
  29. filt_h1, filt_h2, filt_h3) \
  30. ({ \
  31. v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
  32. v8i16 hz_out_m; \
  33. \
  34. VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
  35. vec3_m); \
  36. hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \
  37. filt_h1, filt_h2, filt_h3); \
  38. \
  39. hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
  40. hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
  41. \
  42. hz_out_m; \
  43. })
  44. #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
  45. mask2, mask3, filt0, filt1, filt2, filt3, \
  46. out0, out1) \
  47. { \
  48. v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
  49. v8i16 res0_m, res1_m, res2_m, res3_m; \
  50. \
  51. VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
  52. DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
  53. VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
  54. DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
  55. VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
  56. DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
  57. VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
  58. DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
  59. ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
  60. }
  61. #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
  62. mask2, mask3, filt0, filt1, filt2, filt3, \
  63. out0, out1, out2, out3) \
  64. { \
  65. v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
  66. v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
  67. \
  68. VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
  69. VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
  70. DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
  71. res0_m, res1_m, res2_m, res3_m); \
  72. VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
  73. VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
  74. DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
  75. res4_m, res5_m, res6_m, res7_m); \
  76. VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
  77. VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
  78. DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
  79. res0_m, res1_m, res2_m, res3_m); \
  80. VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
  81. VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
  82. DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
  83. res4_m, res5_m, res6_m, res7_m); \
  84. ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
  85. res7_m, out0, out1, out2, out3); \
  86. }
  87. #define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
  88. { \
  89. v16u8 tmp_m; \
  90. \
  91. tmp_m = PCKEV_XORI128_UB(in1, in0); \
  92. tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
  93. ST_UB(tmp_m, (pdst)); \
  94. }
  95. #define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
  96. { \
  97. v16u8 tmp_m; \
  98. \
  99. tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
  100. tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
  101. ST_UB(tmp_m, (pdst)); \
  102. }
  103. #define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
  104. { \
  105. v16u8 tmp0_m, tmp1_m; \
  106. uint8_t *pdst_m = (uint8_t *)(pdst); \
  107. \
  108. PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
  109. AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
  110. ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
  111. }
  112. #endif // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_