vpx_convolve8_avg_vert_msa.c 27 KB


  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include "./vpx_dsp_rtcd.h"
  12. #include "vpx_dsp/mips/vpx_convolve_msa.h"
  /* Vertical 8-tap convolution of a 4-pixel-wide column whose result is
   * averaged with the pixels already in dst (__msa_aver_u_b) before storing.
   *
   * src, src_stride: source pointer and row pitch. src is stepped back three
   *     rows on entry, so rows above the first output row are read.
   * dst, dst_stride: destination pointer and row pitch; dst is both read
   *     (LW4) and written (ST4x4_UB).
   * filter: tap vector read with LD_SH; halfword lanes 0..3 are splatted
   *     into filt0..filt3.
   * height: row count. Work is done in groups of four rows
   *     (height >> 2 iterations), so a remainder of 1..3 rows is not written.
   */
  13. static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
  14. int32_t src_stride, uint8_t *dst,
  15. int32_t dst_stride, int8_t *filter,
  16. int32_t height) {
  17. uint32_t loop_cnt;
  18. uint32_t tp0, tp1, tp2, tp3;
  19. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  20. v16u8 dst0 = { 0 }, out;
  21. v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  22. v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
  23. v16i8 src10998, filt0, filt1, filt2, filt3;
  24. v8i16 filt, out10, out32;
  /* Rewind to the first row the filter window touches. */
  25. src -= (3 * src_stride);
  26. filt = LD_SH(filter);
  27. SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  /* Preload the 7 rows that precede the first group of new rows. */
  28. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  29. src += (7 * src_stride);
  30. ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
  31. src54_r, src21_r);
  32. ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  33. ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
  34. src4332, src6554);
  /* XOR with 128 per the macro name; presumably re-biases the unsigned pixels
   * for the signed multiply-accumulate -- TODO confirm against the macro. */
  35. XORI_B3_128_SB(src2110, src4332, src6554);
  36. for (loop_cnt = (height >> 2); loop_cnt--;) {
  /* Four new source rows plus the four dst rows to blend with. */
  37. LD_SB4(src, src_stride, src7, src8, src9, src10);
  38. src += (4 * src_stride);
  39. LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  40. INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  41. ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
  42. src87_r, src98_r, src109_r);
  43. ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
  44. XORI_B2_128_SB(src8776, src10998);
  45. out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
  46. filt1, filt2, filt3);
  47. out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
  48. filt1, filt2, filt3);
  49. SRARI_H2_SH(out10, out32, FILTER_BITS);
  50. SAT_SH2_SH(out10, out32, 7);
  51. out = PCKEV_XORI128_UB(out10, out32);
  /* Blend the filtered pixels with the pixels previously read from dst. */
  52. out = __msa_aver_u_b(out, dst0);
  53. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  54. dst += (4 * dst_stride);
  /* Slide the window down four rows: the newest interleaved rows become the
   * oldest inputs of the next iteration. */
  55. src2110 = src6554;
  56. src4332 = src8776;
  57. src6554 = src10998;
  58. src6 = src10;
  59. }
  60. }
  /* Vertical 8-tap convolution of an 8-pixel-wide column; the filtered rows
   * are combined with the existing dst rows by CONVERT_UB_AVG_ST8x4_UB and
   * stored back to dst.
   *
   * src, src_stride: source pointer and row pitch; src is stepped back three
   *     rows on entry.
   * dst, dst_stride: destination pointer and row pitch; four 64-bit rows are
   *     read per iteration (LD4) and then overwritten.
   * filter: tap vector read with LD_SH; halfword lanes 0..3 are splatted
   *     into filt0..filt3.
   * height: row count, consumed four rows per iteration (height >> 2
   *     iterations); a remainder of 1..3 rows is not written.
   */
  61. static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
  62. int32_t src_stride, uint8_t *dst,
  63. int32_t dst_stride, int8_t *filter,
  64. int32_t height) {
  65. uint32_t loop_cnt;
  66. uint64_t tp0, tp1, tp2, tp3;
  67. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  68. v16u8 dst0 = { 0 }, dst1 = { 0 };
  69. v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  70. v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
  71. v8i16 filt, out0, out1, out2, out3;
  /* Rewind to the first row the filter window touches. */
  72. src -= (3 * src_stride);
  73. filt = LD_SH(filter);
  74. SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  /* Preload the 7 rows that precede the first group of new rows. */
  75. LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  76. src += (7 * src_stride);
  /* XOR with 128 per the macro name; presumably converts the unsigned pixels
   * to a signed range for the signed dot product -- TODO confirm. */
  77. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  78. ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
  79. src54_r, src21_r);
  80. ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  81. for (loop_cnt = (height >> 2); loop_cnt--;) {
  /* Four new source rows plus the four dst rows to blend with. */
  82. LD_SB4(src, src_stride, src7, src8, src9, src10);
  83. src += (4 * src_stride);
  84. LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  85. INSERT_D2_UB(tp0, tp1, dst0);
  86. INSERT_D2_UB(tp2, tp3, dst1);
  87. XORI_B4_128_SB(src7, src8, src9, src10);
  88. ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
  89. src87_r, src98_r, src109_r);
  /* One accumulator per output row; each uses four interleaved row pairs. */
  90. out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
  91. filt2, filt3);
  92. out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
  93. filt2, filt3);
  94. out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
  95. filt2, filt3);
  96. out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
  97. filt1, filt2, filt3);
  98. SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  99. SAT_SH4_SH(out0, out1, out2, out3, 7);
  100. CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
  101. dst_stride);
  102. dst += (4 * dst_stride);
  /* Slide the window down four rows for the next iteration. */
  103. src10_r = src54_r;
  104. src32_r = src76_r;
  105. src54_r = src98_r;
  106. src21_r = src65_r;
  107. src43_r = src87_r;
  108. src65_r = src109_r;
  109. src6 = src10;
  110. }
  111. }
  /* Vertical 8-tap convolution for blocks whose width is a multiple of 16,
   * with the result averaged into dst (AVER_UB4_UB) before storing.
   *
   * The block is walked as 16-pixel-wide column strips (width >> 4 outer
   * iterations). Each strip restarts from the top through src_tmp/dst_tmp and
   * is processed four rows at a time (height >> 2 inner iterations); leftover
   * columns or rows beyond those multiples are not written.
   *
   * src, src_stride: source pointer and row pitch; src is stepped back three
   *     rows on entry.
   * dst, dst_stride: destination pointer and row pitch; read (LD_UB4) and
   *     then overwritten (ST_UB4).
   * filter: tap vector read with LD_SH; halfword lanes 0..3 are splatted
   *     into filt0..filt3.
   * height, width: block dimensions in pixels.
   */
  112. static void common_vt_8t_and_aver_dst_16w_mult_msa(
  113. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  114. int8_t *filter, int32_t height, int32_t width) {
  115. const uint8_t *src_tmp;
  116. uint8_t *dst_tmp;
  117. uint32_t loop_cnt, cnt;
  118. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  119. v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  120. v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
  121. v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
  122. v16i8 filt0, filt1, filt2, filt3;
  123. v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
  124. v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
  /* Rewind to the first row the filter window touches. */
  125. src -= (3 * src_stride);
  126. filt = LD_SH(filter);
  127. SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  /* One pass per 16-pixel-wide strip. */
  128. for (cnt = (width >> 4); cnt--;) {
  129. src_tmp = src;
  130. dst_tmp = dst;
  /* Preload the 7 rows that precede the first group of new rows. */
  131. LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
  132. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  133. src_tmp += (7 * src_stride);
  /* The _r and _l vectors come from the ILVR/ILVL macro pairs; presumably the
   * right and left halves of each 16-byte row -- TODO confirm. */
  134. ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
  135. src54_r, src21_r);
  136. ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  137. ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
  138. src54_l, src21_l);
  139. ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
  140. for (loop_cnt = (height >> 2); loop_cnt--;) {
  /* Four new source rows plus the four dst rows to blend with. */
  141. LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
  142. src_tmp += (4 * src_stride);
  143. LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
  144. XORI_B4_128_SB(src7, src8, src9, src10);
  145. ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
  146. src87_r, src98_r, src109_r);
  147. ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
  148. src87_l, src98_l, src109_l);
  149. out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
  150. filt1, filt2, filt3);
  151. out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
  152. filt1, filt2, filt3);
  153. out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
  154. filt1, filt2, filt3);
  155. out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
  156. filt1, filt2, filt3);
  157. out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
  158. filt1, filt2, filt3);
  159. out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
  160. filt1, filt2, filt3);
  161. out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
  162. filt1, filt2, filt3);
  163. out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
  164. filt1, filt2, filt3);
  165. SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
  166. SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
  167. SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
  168. SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
  /* Recombine the two halves of each row into one byte vector. */
  169. PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
  170. out3_r, tmp0, tmp1, tmp2, tmp3);
  171. XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
  /* Blend with the dst rows loaded above; the results land back in
   * dst0..dst3, which are then stored. */
  172. AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
  173. dst2, dst3);
  174. ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
  175. dst_tmp += (4 * dst_stride);
  /* Slide the window down four rows for the next iteration. */
  176. src10_r = src54_r;
  177. src32_r = src76_r;
  178. src54_r = src98_r;
  179. src21_r = src65_r;
  180. src43_r = src87_r;
  181. src65_r = src109_r;
  182. src10_l = src54_l;
  183. src32_l = src76_l;
  184. src54_l = src98_l;
  185. src21_l = src65_l;
  186. src43_l = src87_l;
  187. src65_l = src109_l;
  188. src6 = src10;
  189. }
  /* Advance to the next 16-pixel-wide strip. */
  190. src += 16;
  191. dst += 16;
  192. }
  193. }
  /* 8-tap vertical filter + average for a 16-pixel-wide block.
   * Forwards to the multiple-of-16 implementation with the width fixed at 16. */
  static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int8_t *filter,
                                                int32_t height) {
    const int32_t block_width = 16;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, block_width);
  }
  /* 8-tap vertical filter + average for a 32-pixel-wide block.
   * Forwards to the multiple-of-16 implementation with the width fixed at 32. */
  static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int8_t *filter,
                                                int32_t height) {
    const int32_t block_width = 32;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, block_width);
  }
  /* 8-tap vertical filter + average for a 64-pixel-wide block.
   * Forwards to the multiple-of-16 implementation with the width fixed at 64. */
  static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int8_t *filter,
                                                int32_t height) {
    const int32_t block_width = 64;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, block_width);
  }
  /* Two-tap vertical filter for a 4x4 block, averaged with the existing dst
   * pixels (__msa_aver_u_b) before storing.
   *
   * src, src_stride: source pointer and row pitch; five rows are read starting
   *     at src (no rewind).
   * dst, dst_stride: destination pointer and row pitch; four 32-bit rows are
   *     read (LW4) and then overwritten (ST4x4_UB).
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   */
  215. static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
  216. int32_t src_stride, uint8_t *dst,
  217. int32_t dst_stride,
  218. int8_t *filter) {
  219. uint32_t tp0, tp1, tp2, tp3;
  220. v16i8 src0, src1, src2, src3, src4;
  221. v16u8 dst0 = { 0 }, out, filt0, src2110, src4332;
  222. v16i8 src10_r, src32_r, src21_r, src43_r;
  223. v8i16 filt;
  224. v8u16 tmp0, tmp1;
  225. filt = LD_SH(filter);
  226. filt0 = (v16u8)__msa_splati_h(filt, 0);
  /* Rows 0..3, then the extra fifth row needed for the last output row. */
  227. LD_SB4(src, src_stride, src0, src1, src2, src3);
  228. src += (4 * src_stride);
  229. src4 = LD_SB(src);
  /* src is not read again after this increment. */
  230. src += src_stride;
  231. LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  232. INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  /* Pair each row with the row below it, then pack two pairs per vector. */
  233. ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
  234. src32_r, src43_r);
  235. ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  /* DOTP/SRARI: presumably an unsigned dot product with the tap pair followed
   * by a rounding shift by FILTER_BITS -- TODO confirm against the macros. */
  236. DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  237. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  238. out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  /* Blend with the pixels previously read from dst. */
  239. out = __msa_aver_u_b(out, dst0);
  240. ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  241. }
  /* Two-tap vertical filter for a 4x8 block, averaged with the existing dst
   * pixels (AVER_UB2_UB) before storing.
   *
   * src, src_stride: source pointer and row pitch; nine rows are read starting
   *     at src (no rewind).
   * dst, dst_stride: destination pointer and row pitch; eight 32-bit rows are
   *     read (two LW4 groups) and then overwritten (ST4x8_UB).
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   */
  242. static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
  243. int32_t src_stride, uint8_t *dst,
  244. int32_t dst_stride,
  245. int8_t *filter) {
  246. uint32_t tp0, tp1, tp2, tp3;
  247. v16u8 dst0 = { 0 }, dst1 = { 0 };
  248. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
  249. v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
  250. v16u8 src2110, src4332, src6554, src8776, filt0;
  251. v8u16 tmp0, tmp1, tmp2, tmp3;
  252. v8i16 filt;
  253. filt = LD_SH(filter);
  254. filt0 = (v16u8)__msa_splati_h(filt, 0);
  /* Rows 0..7, then the extra ninth row needed for the last output row. */
  255. LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  256. src += (8 * src_stride);
  257. src8 = LD_SB(src);
  /* dst rows 0..3 go to dst0, rows 4..7 to dst1. */
  258. LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  259. INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  260. LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  261. INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  /* Pair each row with the row below it, then pack two pairs per vector. */
  262. ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
  263. src32_r, src43_r);
  264. ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
  265. src76_r, src87_r);
  266. ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
  267. src76_r, src2110, src4332, src6554, src8776);
  268. DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
  269. tmp0, tmp1, tmp2, tmp3);
  270. SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  /* src2110/src4332 are reused from here on to hold the packed output. */
  271. PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  272. AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
  273. ST4x8_UB(src2110, src4332, dst, dst_stride);
  274. }
  /* Two-tap vertical filter + average for 4-pixel-wide blocks.
   * Selects the fixed-size kernel that matches `height`; only heights 4 and 8
   * are handled, and any other height leaves dst untouched. */
  static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter, int32_t height) {
    switch (height) {
      case 4:
        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
        break;
      case 8:
        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                          filter);
        break;
      default:
        /* No kernel for this height: nothing is written. */
        break;
    }
  }
  /* Two-tap vertical filter for an 8x4 block; the filtered rows are combined
   * with the existing dst rows by PCKEV_AVG_ST8x4_UB and stored back to dst.
   *
   * src, src_stride: source pointer and row pitch; five rows are read starting
   *     at src (no rewind).
   * dst, dst_stride: destination pointer and row pitch; four 64-bit rows are
   *     read (LD4) and then overwritten.
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   */
  285. static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
  286. int32_t src_stride, uint8_t *dst,
  287. int32_t dst_stride,
  288. int8_t *filter) {
  289. int64_t tp0, tp1, tp2, tp3;
  290. v16u8 src0, src1, src2, src3, src4;
  291. v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0;
  292. v8u16 tmp0, tmp1, tmp2, tmp3;
  293. v8i16 filt;
  294. /* rearranging filter_y */
  295. filt = LD_SH(filter);
  296. filt0 = (v16u8)__msa_splati_h(filt, 0);
  297. LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  /* dst rows 0..1 go to dst0, rows 2..3 to dst1. */
  298. LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  299. INSERT_D2_UB(tp0, tp1, dst0);
  300. INSERT_D2_UB(tp2, tp3, dst1);
  /* Pair each row with the row below it: one vector per output row. */
  301. ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  302. ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  303. DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
  304. tmp2, tmp3);
  305. SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  306. PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
  307. }
  /* Two-tap vertical filter for 8-pixel-wide blocks, handled eight rows per
   * iteration; the filtered rows are combined with the existing dst rows by
   * PCKEV_AVG_ST8x4_UB and stored back to dst.
   *
   * src, src_stride: source pointer and row pitch; one row is read up front
   *     and eight more per iteration (no rewind).
   * dst, dst_stride: destination pointer and row pitch; eight 64-bit rows are
   *     read per iteration (two LD4 groups) and then overwritten.
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   * height: row count; height >> 3 iterations run, so rows beyond a multiple
   *     of eight are not written.
   */
  308. static void common_vt_2t_and_aver_dst_8x8mult_msa(
  309. const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
  310. int8_t *filter, int32_t height) {
  311. uint32_t loop_cnt;
  312. int64_t tp0, tp1, tp2, tp3;
  313. v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  314. v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
  315. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  316. v8u16 tmp0, tmp1, tmp2, tmp3;
  317. v8i16 filt;
  318. /* rearranging filter_y */
  319. filt = LD_SH(filter);
  320. filt0 = (v16u8)__msa_splati_h(filt, 0);
  /* First row; later iterations reuse the last row of the previous group. */
  321. src0 = LD_UB(src);
  322. src += src_stride;
  323. for (loop_cnt = (height >> 3); loop_cnt--;) {
  324. LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
  325. src += (8 * src_stride);
  /* dst rows 0..3 of this group go to dst0/dst1, rows 4..7 to dst2/dst3. */
  326. LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  327. INSERT_D2_UB(tp0, tp1, dst0);
  328. INSERT_D2_UB(tp2, tp3, dst1);
  329. LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  330. INSERT_D2_UB(tp0, tp1, dst2);
  331. INSERT_D2_UB(tp2, tp3, dst3);
  /* Pair each row with the row below it: one vector per output row. */
  332. ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
  333. vec3);
  334. ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
  335. vec7);
  /* Upper four output rows of the group. */
  336. DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
  337. tmp2, tmp3);
  338. SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  339. PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
  340. dst += (4 * dst_stride);
  /* Lower four output rows of the group. */
  341. DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
  342. tmp2, tmp3);
  343. SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  344. PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
  345. dst += (4 * dst_stride);
  /* Carry the last loaded row into the next iteration. */
  346. src0 = src8;
  347. }
  348. }
  /* Two-tap vertical filter + average for 8-pixel-wide blocks.
   * A height of exactly 4 uses the dedicated 8x4 kernel; every other height
   * is handed to the kernel that works in groups of eight rows. */
  static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter, int32_t height) {
    if (height != 4) {
      common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                            filter, height);
      return;
    }

    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
  }
  /* Two-tap vertical filter for 16-pixel-wide blocks, four rows per iteration;
   * each filtered row is combined with the matching dst row by PCKEV_AVG_ST_UB
   * and stored back to dst.
   *
   * src, src_stride: source pointer and row pitch; one row is read up front
   *     and four more per iteration (no rewind).
   * dst, dst_stride: destination pointer and row pitch; read (LD_UB4) and then
   *     overwritten one row at a time.
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   * height: row count; height >> 2 iterations run, so rows beyond a multiple
   *     of four are not written.
   */
  360. static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
  361. int32_t src_stride, uint8_t *dst,
  362. int32_t dst_stride,
  363. int8_t *filter, int32_t height) {
  364. uint32_t loop_cnt;
  365. v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
  366. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  367. v8u16 tmp0, tmp1, tmp2, tmp3, filt;
  368. /* rearranging filter_y */
  369. filt = LD_UH(filter);
  370. filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
  /* First row; later iterations reuse the last row of the previous group. */
  371. src0 = LD_UB(src);
  372. src += src_stride;
  373. for (loop_cnt = (height >> 2); loop_cnt--;) {
  374. LD_UB4(src, src_stride, src1, src2, src3, src4);
  375. src += (4 * src_stride);
  376. LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  /* vec0/vec1 feed output row 0 and vec2/vec3 feed output row 1, via the
   * ILVR and ILVL interleaves of each adjacent row pair. */
  377. ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
  378. ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
  379. DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
  380. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  381. PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
  382. dst += dst_stride;
  /* vec4/vec5 feed output row 2 and vec6/vec7 feed output row 3. */
  383. ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
  384. ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
  385. DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
  386. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  387. PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
  388. dst += dst_stride;
  389. DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
  390. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  391. PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
  392. dst += dst_stride;
  393. DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
  394. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  395. PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
  396. dst += dst_stride;
  /* Carry the last loaded row into the next iteration. */
  397. src0 = src4;
  398. }
  399. }
  /* Two-tap vertical filter for 32-pixel-wide blocks, four rows per iteration.
   * Each row is handled as two 16-byte halves (offsets 0 and 16); every
   * filtered half-row is combined with the matching dst half-row by
   * PCKEV_AVG_ST_UB and stored back to dst.
   *
   * src, src_stride: source pointer and row pitch; one row is read up front
   *     and four more per iteration (no rewind).
   * dst, dst_stride: destination pointer and row pitch; read (LD_UB4) and then
   *     overwritten.
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   * height: row count; height >> 2 iterations run, so rows beyond a multiple
   *     of four are not written.
   */
  400. static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
  401. int32_t src_stride, uint8_t *dst,
  402. int32_t dst_stride,
  403. int8_t *filter, int32_t height) {
  404. uint32_t loop_cnt;
  405. v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  406. v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  407. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  408. v8u16 tmp0, tmp1, tmp2, tmp3, filt;
  409. /* rearranging filter_y */
  410. filt = LD_UH(filter);
  411. filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
  /* First row: src0 is the half at offset 0, src5 the half at offset 16. */
  412. LD_UB2(src, 16, src0, src5);
  413. src += src_stride;
  414. for (loop_cnt = (height >> 2); loop_cnt--;) {
  /* Half at offset 0: four new source rows and four dst rows. */
  415. LD_UB4(src, src_stride, src1, src2, src3, src4);
  416. LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  417. ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
  418. ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
  /* Half at offset 16: four new source rows and four dst rows. */
  419. LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
  420. LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
  421. src += (4 * src_stride);
  422. DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
  423. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  424. PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
  425. DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
  426. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  427. PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
  428. ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
  429. ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
  430. DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
  431. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  432. PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
  433. DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
  434. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  435. PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
  /* Same four output rows for the half at offset 16; vec0..vec7 are reused. */
  436. ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
  437. ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
  438. DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
  439. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  440. PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
  441. DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
  442. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  443. PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
  444. ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
  445. ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
  446. DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
  447. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  448. PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
  449. DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
  450. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  451. PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
  452. dst += (4 * dst_stride);
  /* Carry the last loaded row of each half into the next iteration. */
  453. src0 = src4;
  454. src5 = src9;
  455. }
  456. }
  /* Two-tap vertical filter for 64-pixel-wide blocks, two rows per iteration.
   * Each row is handled as four 16-byte segments (offsets 0, 16, 32, 48);
   * every filtered segment is combined with the matching dst segment by
   * PCKEV_AVG_ST_UB and stored back to dst.
   *
   * src, src_stride: source pointer and row pitch; one row is read up front
   *     and two more per iteration (no rewind).
   * dst, dst_stride: destination pointer and row pitch; read (LD_UB2) and then
   *     overwritten.
   * filter: halfword lane 0 of the loaded vector is splatted into filt0. The
   *     caller in this file passes &filt_ver[3], so that lane holds taps 3
   *     and 4 of the kernel.
   * height: row count; height >> 1 iterations run, so an odd final row is not
   *     written.
   */
  457. static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
  458. int32_t src_stride, uint8_t *dst,
  459. int32_t dst_stride,
  460. int8_t *filter, int32_t height) {
  461. uint32_t loop_cnt;
  462. v16u8 src0, src1, src2, src3, src4, src5;
  463. v16u8 src6, src7, src8, src9, src10, src11, filt0;
  464. v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  465. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  466. v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  467. v8u16 filt;
  468. /* rearranging filter_y */
  469. filt = LD_UH(filter);
  470. filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
  /* First row: src0, src3, src6, src9 are the segments at offsets 0..48. */
  471. LD_UB4(src, 16, src0, src3, src6, src9);
  472. src += src_stride;
  473. for (loop_cnt = (height >> 1); loop_cnt--;) {
  /* Two new source rows and two dst rows for each of the four segments. */
  474. LD_UB2(src, src_stride, src1, src2);
  475. LD_UB2(dst, dst_stride, dst0, dst1);
  476. LD_UB2(src + 16, src_stride, src4, src5);
  477. LD_UB2(dst + 16, dst_stride, dst2, dst3);
  478. LD_UB2(src + 32, src_stride, src7, src8);
  479. LD_UB2(dst + 32, dst_stride, dst4, dst5);
  480. LD_UB2(src + 48, src_stride, src10, src11);
  481. LD_UB2(dst + 48, dst_stride, dst6, dst7);
  482. src += (2 * src_stride);
  /* Segment at offset 0. */
  483. ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
  484. ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
  485. DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
  486. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  487. PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
  488. DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
  489. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  490. PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
  /* Segment at offset 16. */
  491. ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
  492. ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
  493. DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
  494. SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
  495. PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
  496. DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
  497. SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
  498. PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
  /* Segment at offset 32. */
  499. ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
  500. ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
  501. DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
  502. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  503. PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
  504. DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
  505. SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
  506. PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
  /* Segment at offset 48. */
  507. ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
  508. ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
  509. DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
  510. SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
  511. PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
  512. DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
  513. SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
  514. PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
  515. dst += (2 * dst_stride);
  /* Carry the last loaded row of each segment into the next iteration. */
  516. src0 = src2;
  517. src3 = src5;
  518. src6 = src8;
  519. src9 = src11;
  520. }
  521. }
  522. void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
  523. uint8_t *dst, ptrdiff_t dst_stride,
  524. const InterpKernel *filter, int x0_q4,
  525. int x_step_q4, int y0_q4, int y_step_q4, int w,
  526. int h) {
  527. const int16_t *const filter_y = filter[y0_q4];
  528. int8_t cnt, filt_ver[8];
  529. assert(y_step_q4 == 16);
  530. assert(((const int32_t *)filter_y)[1] != 0x800000);
  531. for (cnt = 0; cnt < 8; ++cnt) {
  532. filt_ver[cnt] = filter_y[cnt];
  533. }
  534. if (vpx_get_filter_taps(filter_y) == 2) {
  535. switch (w) {
  536. case 4:
  537. common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
  538. (int32_t)dst_stride, &filt_ver[3], h);
  539. break;
  540. case 8:
  541. common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
  542. (int32_t)dst_stride, &filt_ver[3], h);
  543. break;
  544. case 16:
  545. common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
  546. (int32_t)dst_stride, &filt_ver[3], h);
  547. break;
  548. case 32:
  549. common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
  550. (int32_t)dst_stride, &filt_ver[3], h);
  551. break;
  552. case 64:
  553. common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
  554. (int32_t)dst_stride, &filt_ver[3], h);
  555. break;
  556. default:
  557. vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
  558. x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  559. break;
  560. }
  561. } else {
  562. switch (w) {
  563. case 4:
  564. common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
  565. (int32_t)dst_stride, filt_ver, h);
  566. break;
  567. case 8:
  568. common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
  569. (int32_t)dst_stride, filt_ver, h);
  570. break;
  571. case 16:
  572. common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
  573. (int32_t)dst_stride, filt_ver, h);
  574. break;
  575. case 32:
  576. common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
  577. (int32_t)dst_stride, filt_ver, h);
  578. break;
  579. case 64:
  580. common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
  581. (int32_t)dst_stride, filt_ver, h);
  582. break;
  583. default:
  584. vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
  585. x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  586. break;
  587. }
  588. }
  589. }