/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"
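
/* Horizontal 8-tap filtering of a 4-wide, 4-row block. Source bytes are
 * XORed with 128 so the unsigned pixels can feed the signed dot-product
 * macros; PCKEV_XORI128_UB flips the bias back when packing the result. */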
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, out0, out1);
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
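
/* Horizontal 8-tap filtering of a 4-wide, 8-row block: two 4-row passes
 * sharing one filter setup. */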
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, out0, out1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
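
/* Dispatch for 4-wide 8-tap filtering; only heights 4 and 8 occur here. */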
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}
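
/* Horizontal 8-tap filtering of an 8-wide, 4-row block. */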
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, out0, out1,
                             out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}
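
/* Horizontal 8-tap filtering of an 8-wide block, four rows per loop
 * iteration (height a multiple of 4). */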
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
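
/* Dispatch for 8-wide 8-tap filtering. */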
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}
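
/* Horizontal 8-tap filtering, 16 pixels wide: each row is loaded as two
 * overlapping 16-byte vectors (offsets 0 and 8), two rows per iteration. */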
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}
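
/* Horizontal 8-tap filtering, 32 pixels wide: three loads plus an
 * __msa_sldi_b shift reconstruct the source vector at byte offset 8. */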
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}
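
/* Horizontal 8-tap filtering, 64 pixels wide: each row is handled as two
 * 32-wide halves, one row per iteration. */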
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}
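
/* Horizontal 2-tap (bilinear) filtering of a 4-wide, 4-row block. The
 * bilinear taps are non-negative, so an unsigned dot product is used and
 * no sign-bias XOR is required. */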
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
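
/* Horizontal 2-tap filtering of a 4-wide, 8-row block. */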
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
              res2, res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
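
/* Dispatch for 4-wide 2-tap filtering; only heights 4 and 8 occur here. */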
static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}
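
/* Horizontal 2-tap filtering of an 8-wide, 4-row block. */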
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}
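
/* Horizontal 2-tap filtering of an 8-wide block, fully unrolled for
 * height 8, with an extra unrolled pass when height is 16. */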
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}
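
/* Dispatch for 8-wide 2-tap filtering. */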
static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}
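
/* Horizontal 2-tap filtering, 16 pixels wide: the first four rows are
 * peeled ahead of the loop, hence loop_cnt = (height >> 2) - 1. */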
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4,
                out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}
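
/* Horizontal 2-tap filtering, 32 pixels wide, two rows per iteration. */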
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4,
                out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}
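
/* Horizontal 2-tap filtering, 64 pixels wide: SLDI_B3_SB builds the three
 * byte-offset-8 vectors from adjacent loads, one row per iteration. */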
static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4,
                out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}
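
/* Top-level dispatch: narrow the selected 16-bit kernel to int8 (the second
 * assert rules out a kernel the narrowing cannot represent, e.g. a tap of
 * 128), then select a 2-tap or 8-tap path by block width. The 2-tap paths
 * read the two center taps at &filt_hor[3]; unsupported widths fall back to
 * the C implementation. */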
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2) {
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}