vpx_convolve8_avg_msa.c 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include "./vpx_dsp_rtcd.h"
  12. #include "vpx_dsp/mips/vpx_convolve_msa.h"
/* 8-tap horizontal + 8-tap vertical filter with destination averaging for
 * 4-pixel-wide columns. Emits 4 output rows per loop iteration, so height
 * is assumed to be a multiple of 4 (loop count is height >> 2). */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  /* Step back 3 columns and 3 rows: the 8-tap kernel needs 3 samples of
   * context before the filtered position in each direction. */
  src -= (3 + 3 * src_stride);

  /* rearranging filter: splat the four 16-bit horizontal tap pairs. */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Prologue: horizontally filter the first 7 rows to prime the vertical
   * filter's context window. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  /* XOR with 128 converts unsigned pixels to signed for the dot products. */
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  /* Derive the odd-row outputs by shifting the paired even-row results. */
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Interleave adjacent horizontal outputs into vertical-filter operands. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Bring in 4 new source rows per iteration. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Load 4 destination rows (4 bytes each) for the final averaging. */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* Round, saturate to 8-bit range, pack, then average with dst. */
    SRARI_H2_SH(res0, res1, FILTER_BITS);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    res = (v16u8)__msa_aver_u_b(res, dst0);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Rotate the vertical-filter pipeline state for the next 4 rows. */
    hz_out5 = hz_out9;
    vec0 = vec2;
    vec1 = vec3;
    vec2 = vec4;
  }
}
/* 8-tap horizontal + 8-tap vertical filter with destination averaging for
 * 8-pixel-wide columns. Emits 4 output rows per loop iteration, so height
 * is assumed to be a multiple of 4 (loop count is height >> 2). */
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  uint64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  /* Step back 3 columns and 3 rows for the 8-tap kernel's context. */
  src -= (3 + 3 * src_stride);

  /* rearranging filter: splat the four 16-bit horizontal tap pairs. */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Prologue: horizontally filter the first 7 rows to prime the vertical
   * filter's context window. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);
  /* XOR with 128 converts unsigned pixels to signed for the dot products. */
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Interleave adjacent horizontal outputs into vertical-filter operands. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Bring in 4 new source rows per iteration. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Load 4 destination rows (8 bytes each) for the final averaging. */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* Round, saturate to 8-bit range, then pack + average + store 4 rows. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst,
                            dst_stride);
    dst += (4 * dst_stride);

    /* Rotate the vertical-filter pipeline state for the next 4 rows. */
    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}
  161. static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
  162. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  163. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  164. int32_t multiple8_cnt;
  165. for (multiple8_cnt = 2; multiple8_cnt--;) {
  166. common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
  167. filter_horiz, filter_vert, height);
  168. src += 8;
  169. dst += 8;
  170. }
  171. }
  172. static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
  173. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  174. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  175. int32_t multiple8_cnt;
  176. for (multiple8_cnt = 4; multiple8_cnt--;) {
  177. common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
  178. filter_horiz, filter_vert, height);
  179. src += 8;
  180. dst += 8;
  181. }
  182. }
  183. static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
  184. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  185. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  186. int32_t multiple8_cnt;
  187. for (multiple8_cnt = 8; multiple8_cnt--;) {
  188. common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
  189. filter_horiz, filter_vert, height);
  190. src += 8;
  191. dst += 8;
  192. }
  193. }
/* 2-tap (bilinear) horizontal + vertical filter with destination averaging
 * for a 4x4 block. */
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 dst0 = { 0 }, out;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter: splat the single 16-bit tap pair of each filter. */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* 5 input rows produce 4 vertically filtered output rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  /* Derive the odd-row outputs from the paired even-row results. */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

  /* Load 4 destination rows (4 bytes each) for the final averaging. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  /* Round, pack to bytes, average with dst, and store 4 rows. */
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  out = __msa_aver_u_b(out, dst0);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
/* 2-tap (bilinear) horizontal + vertical filter with destination averaging
 * for a 4x8 block. */
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
  v16u8 dst0 = { 0 }, dst1 = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter: splat the single 16-bit tap pair of each filter. */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* 9 input rows produce 8 vertically filtered output rows. */
  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  /* Derive the odd-row outputs from the paired even-row results. */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  /* Load 8 destination rows (4 bytes each) for the final averaging. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
              tmp1, tmp2, tmp3);
  /* Round, pack to bytes, average with dst, and store 8 rows. */
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
  ST4x8_UB(res0, res1, dst, dst_stride);
}
  263. static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
  264. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  265. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  266. if (4 == height) {
  267. common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
  268. filter_horiz, filter_vert);
  269. } else if (8 == height) {
  270. common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
  271. filter_horiz, filter_vert);
  272. }
  273. }
/* 2-tap (bilinear) horizontal + vertical filter with destination averaging
 * for an 8x4 block. */
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  uint64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter: splat the single 16-bit tap pair of each filter. */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* 5 input rows produce 4 vertically filtered output rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);
  /* Load 4 destination rows (8 bytes each) for the final averaging. */
  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_D2_UB(tp0, tp1, dst0);
  INSERT_D2_UB(tp2, tp3, dst1);
  /* hz_out0/hz_out1 ping-pong as the two-row vertical filter window. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  /* Round, then pack + average + store 4 rows in one macro. */
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}
/* 2-tap (bilinear) horizontal + vertical filter with destination averaging
 * for 8-wide blocks whose height is a multiple of 4 (loop count is
 * height >> 2, 4 rows per iteration). */
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  uint64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter: splat the single 16-bit tap pair of each filter. */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prologue: horizontally filter the first row to prime the vertical
   * filter's two-row window. */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* hz_out0/hz_out1 ping-pong as the vertical filter window advances. */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    /* Load 4 destination rows (8 bytes each), then pack + average + store. */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
  351. static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
  352. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  353. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  354. if (4 == height) {
  355. common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
  356. filter_horiz, filter_vert);
  357. } else {
  358. common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
  359. src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
  360. }
  361. }
/* 2-tap (bilinear) horizontal + vertical filter with destination averaging
 * for 16-wide blocks. Each row is handled as two 8-byte halves (src + 0 and
 * src + 8); 4 rows are produced per loop iteration, so height is assumed to
 * be a multiple of 4. */
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter: splat the single 16-bit tap pair of each filter. */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prologue: horizontally filter the first row (both 8-byte halves) to
   * prime the vertical filter's two-row window. */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Even-indexed src regs hold the left half, odd-indexed the right. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    /* Load 4 full destination rows for the final averaging. */
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 1: hz_out0/1 (left) and hz_out2/3 (right) ping-pong as the
     * vertical window; pack + average + store one 16-byte row each step. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
    dst += dst_stride;

    /* Row 2 */
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
    dst += dst_stride;

    /* Row 3 */
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
    dst += dst_stride;

    /* Row 4 */
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
    dst += dst_stride;
  }
}
  415. static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
  416. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  417. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  418. int32_t multiple8_cnt;
  419. for (multiple8_cnt = 2; multiple8_cnt--;) {
  420. common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
  421. filter_horiz, filter_vert, height);
  422. src += 16;
  423. dst += 16;
  424. }
  425. }
  426. static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
  427. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  428. int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  429. int32_t multiple8_cnt;
  430. for (multiple8_cnt = 4; multiple8_cnt--;) {
  431. common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
  432. filter_horiz, filter_vert, height);
  433. src += 16;
  434. dst += 16;
  435. }
  436. }
  437. void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
  438. uint8_t *dst, ptrdiff_t dst_stride,
  439. const InterpKernel *filter, int x0_q4, int x_step_q4,
  440. int y0_q4, int y_step_q4, int w, int h) {
  441. const int16_t *const filter_x = filter[x0_q4];
  442. const int16_t *const filter_y = filter[y0_q4];
  443. int8_t cnt, filt_hor[8], filt_ver[8];
  444. assert(x_step_q4 == 16);
  445. assert(y_step_q4 == 16);
  446. assert(((const int32_t *)filter_x)[1] != 0x800000);
  447. assert(((const int32_t *)filter_y)[1] != 0x800000);
  448. for (cnt = 0; cnt < 8; ++cnt) {
  449. filt_hor[cnt] = filter_x[cnt];
  450. filt_ver[cnt] = filter_y[cnt];
  451. }
  452. if (vpx_get_filter_taps(filter_x) == 2 &&
  453. vpx_get_filter_taps(filter_y) == 2) {
  454. switch (w) {
  455. case 4:
  456. common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
  457. (int32_t)dst_stride, &filt_hor[3],
  458. &filt_ver[3], h);
  459. break;
  460. case 8:
  461. common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
  462. (int32_t)dst_stride, &filt_hor[3],
  463. &filt_ver[3], h);
  464. break;
  465. case 16:
  466. common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
  467. (int32_t)dst_stride,
  468. &filt_hor[3], &filt_ver[3], h);
  469. break;
  470. case 32:
  471. common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
  472. (int32_t)dst_stride,
  473. &filt_hor[3], &filt_ver[3], h);
  474. break;
  475. case 64:
  476. common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
  477. (int32_t)dst_stride,
  478. &filt_hor[3], &filt_ver[3], h);
  479. break;
  480. default:
  481. vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
  482. x_step_q4, y0_q4, y_step_q4, w, h);
  483. break;
  484. }
  485. } else if (vpx_get_filter_taps(filter_x) == 2 ||
  486. vpx_get_filter_taps(filter_y) == 2) {
  487. vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
  488. x_step_q4, y0_q4, y_step_q4, w, h);
  489. } else {
  490. switch (w) {
  491. case 4:
  492. common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
  493. (int32_t)dst_stride, filt_hor,
  494. filt_ver, h);
  495. break;
  496. case 8:
  497. common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
  498. (int32_t)dst_stride, filt_hor,
  499. filt_ver, h);
  500. break;
  501. case 16:
  502. common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
  503. (int32_t)dst_stride, filt_hor,
  504. filt_ver, h);
  505. break;
  506. case 32:
  507. common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
  508. (int32_t)dst_stride, filt_hor,
  509. filt_ver, h);
  510. break;
  511. case 64:
  512. common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
  513. (int32_t)dst_stride, filt_hor,
  514. filt_ver, h);
  515. break;
  516. default:
  517. vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
  518. x_step_q4, y0_q4, y_step_q4, w, h);
  519. break;
  520. }
  521. }
  522. }