/* mfqe_msa.c */
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "./vp8_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
/* Weighted blend of an 8x8 block (MFQE): for each pixel,
 *   dst = round((src * src_weight + dst * dst_weight) >> MFQE_PRECISION)
 * where dst_weight = (1 << MFQE_PRECISION) - src_weight, so the two
 * weights always sum to one full unit and the result is a weighted average.
 *
 * src_ptr/src_stride : source block (read-only here)
 * dst_ptr/dst_stride : destination block, blended in place
 * src_weight         : source weight in MFQE_PRECISION fixed point;
 *                      assumed 0 <= src_weight <= (1 << MFQE_PRECISION)
 *                      -- TODO confirm against callers.
 *
 * MSA-vectorized: rows are 8 bytes wide, so two rows are packed per
 * 128-bit vector and the loop processes four rows per iteration.
 */
static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  uint64_t src0_d, src1_d, dst0_d, dst1_d;
  v16i8 src0 = { 0 };
  v16i8 src1 = { 0 };
  v16i8 dst0 = { 0 };
  v16i8 dst1 = { 0 };
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Splat the scalar weights across all eight halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  for (row = 2; row--;) { /* 2 iterations x 4 rows = 8 rows total */
    /* Load rows 0-1 of src and dst (8 bytes each) and pack each pair
       into one vector register. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src0);
    INSERT_D2_SB(dst0_d, dst1_d, dst0);

    /* Load rows 2-3; dst rows 2-3 are fetched before the first store
       below overwrites anything near them. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src1);
    INSERT_D2_SB(dst0_d, dst1_d, dst1);

    /* Rows 0-1: widen bytes to halfwords (right/left halves), blend with
       the weights, then round-shift by MFQE_PRECISION. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    /* Narrow the halfword results back to bytes and store rows 0-1. */
    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst0, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);

    /* Rows 2-3: identical blend/round/store sequence. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst1, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
  }
}
/* Weighted blend of a 16x16 block (MFQE): same per-pixel formula as the
 * 8x8 variant,
 *   dst = round((src * src_weight + dst * dst_weight) >> MFQE_PRECISION),
 * with dst_weight = (1 << MFQE_PRECISION) - src_weight.
 *
 * Each row is exactly one 16-byte vector, so the loop loads four full
 * rows of src and dst per iteration and the blend is unrolled 4x
 * (conventional for SIMD kernels; each unrolled step is one row).
 */
static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                      uint8_t *dst_ptr, int32_t dst_stride,
                                      int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3;
  v16i8 dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt;
  v8i16 res_h_r, res_h_l;
  v8i16 src_r, src_l, dst_r, dst_l;

  /* Splat the scalar weights across all eight halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  for (row = 4; row--;) { /* 4 iterations x 4 rows = 16 rows total */
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0: widen to halfwords (right/left halves), blend, round-shift,
       then pack back to bytes and store in one step. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 1. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 2. */
    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 3. */
    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
}
/* Public entry point (dispatched via vp8_rtcd.h) for the 16x16 MFQE
 * weighted blend; forwards directly to the static implementation above. */
void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                   uint8_t *dst_ptr, int32_t dst_stride,
                                   int32_t src_weight) {
  filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride,
                            src_weight);
}
/* Public entry point (dispatched via vp8_rtcd.h) for the 8x8 MFQE
 * weighted blend; forwards directly to the static implementation above. */
void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                 uint8_t *dst_ptr, int32_t dst_stride,
                                 int32_t src_weight) {
  filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride, src_weight);
}