/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"
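
/* Shuffle masks for the horizontal filters. Each 16-byte row feeds VSHF.B
 * so that a single shuffle gathers the overlapping pixel pairs one pair of
 * filter taps needs: the first row serves 8-wide blocks; the other two
 * serve 4-wide blocks, with indices >= 16 reaching into the second source
 * vector so two rows can be filtered in one register. */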
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
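
/* 4-wide 8-tap horizontal + 8-tap vertical pass. Seven priming rows are
 * filtered horizontally, then each loop iteration filters four more rows
 * and slides the vertical 8-tap window over the horizontal outputs,
 * emitting one 4x4 block per iteration. */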
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out5 = hz_out9;
    out0 = out2;
    out1 = out3;
    out2 = out4;
  }
}
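
/* 8-wide variant of the same two-stage pass: each source row is filtered
 * horizontally on its own, and the vertical stage interleaves whole rows
 * rather than the packed 4-pixel pairs used by the 4-wide kernel. */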
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    XORI_B4_128_SB(src7, src8, src9, src10);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}
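
/* The 16-, 32- and 64-wide 8-tap cases simply slice the block into
 * 8-pixel-wide columns and rerun the 8-wide kernel on each slice. */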
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;

  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}

static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;

  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}

static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;

  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}
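
/* 2-tap (bilinear) horizontal + vertical kernels follow; these handle the
 * filters whose outer six taps are zero, using a single dot product per
 * stage instead of four. */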
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}
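
/* 8-wide bilinear kernels: the 4-row case below, plus an unrolled loop for
 * heights that are a multiple of 8. */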
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          int8_t *filter_horiz,
                                          int8_t *filter_vert,
                                          int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
  }
}

static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;
  }
}

static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;

  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert, height);
    src += 16;
    dst += 16;
  }
}

static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;

  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert, height);
    src += 16;
    dst += 16;
  }
}
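
/* Top-level 2-D dispatcher. The selected x/y kernels are copied into int8
 * arrays, then the call is routed by width: bilinear kernels when both
 * filters have 2 taps (passing the middle tap pair, &filt_hor[3]), 8-tap
 * kernels when both have 8, and the C fallback for mixed tap counts or
 * unsupported widths. */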
void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int32_t x_step_q4, int y0_q4,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
    filt_ver[cnt] = filter_y[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2 &&
      vpx_get_filter_taps(filter_y) == 2) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (vpx_get_filter_taps(filter_x) == 2 ||
             vpx_get_filter_taps(filter_y) == 2) {
    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                    y0_q4, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}
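
/* Everything below implements the scaled path used by vpx_scaled_2d_msa.
 * The horizontal filters operate on transposed data: each call gathers one
 * output column from 4/8/16 consecutive rows, filters it, and writes it as
 * a row of a temp tile that the caller transposes back to raster order. */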
static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  uint32_t res;
  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
  v16i8 out0, out1;
  v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
  v16i8 shf2 = shf1 + 2;
  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
  v16i8 filt_shf1 = filt_shf0 + 2;
  v16i8 filt_shf2 = filt_shf0 + 4;
  v16i8 filt_shf3 = filt_shf0 + 6;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;

  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
  XORI_B2_128_SB(out0, out1);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);

  filt = LD_SH(x_filter);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);

  src0_h *= filt0;
  src0_h += src1_h * filt1;
  src0_h += src2_h * filt2;
  src0_h += src3_h * filt3;

  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
  src0_h = __msa_adds_s_h(src0_h, src1_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  res = __msa_copy_u_w((v4i32)dst0, 0);
  SW(res, dst);
}
static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
  v16i8 out0, out1, out2, out3;
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src2);
  INSERT_D2_UB(srcd2, srcd3, src3);

  filt = LD_SH(x_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  // transpose
  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
  XORI_B4_128_SB(out0, out1, out2, out3);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  UNPCK_SB_SH(out2, src4_h, src5_h);
  UNPCK_SB_SH(out3, src6_h, src7_h);

  src0_h *= filt0;
  src4_h *= filt4;
  src0_h += src1_h * filt1;
  src4_h += src5_h * filt5;
  src0_h += src2_h * filt2;
  src4_h += src6_h * filt6;
  src0_h += src3_h * filt3;
  src4_h += src7_h * filt7;

  src0_h = __msa_adds_s_h(src0_h, src4_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  ST8x1_UB(dst0, dst);
}
static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
  v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
  v8i16 dst0_h, dst1_h, dst2_h, dst3_h;

  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src2);
  INSERT_D2_UB(srcd2, srcd3, src3);
  LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src4);
  INSERT_D2_UB(srcd2, srcd3, src5);
  LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src6);
  INSERT_D2_UB(srcd2, srcd3, src7);

  filt = LD_SH(x_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  // transpose
  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
  XORI_B4_128_SB(out0, out1, out2, out3);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  UNPCK_SB_SH(out2, src4_h, src5_h);
  UNPCK_SB_SH(out3, src6_h, src7_h);

  VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out4, out5);
  ILVRL_W2_SB(tmp3, tmp1, out6, out7);
  XORI_B4_128_SB(out4, out5, out6, out7);

  dst0_h = src0_h * filt0;
  dst1_h = src4_h * filt4;
  dst0_h += src1_h * filt1;
  dst1_h += src5_h * filt5;
  dst0_h += src2_h * filt2;
  dst1_h += src6_h * filt6;
  dst0_h += src3_h * filt3;
  dst1_h += src7_h * filt7;

  UNPCK_SB_SH(out4, src0_h, src1_h);
  UNPCK_SB_SH(out5, src2_h, src3_h);
  UNPCK_SB_SH(out6, src4_h, src5_h);
  UNPCK_SB_SH(out7, src6_h, src7_h);

  dst2_h = src0_h * filt0;
  dst3_h = src4_h * filt4;
  dst2_h += src1_h * filt1;
  dst3_h += src5_h * filt5;
  dst2_h += src2_h * filt2;
  dst3_h += src6_h * filt6;
  dst2_h += src3_h * filt3;
  dst3_h += src7_h * filt7;

  ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
  SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
  SAT_SH2_SH(dst0_h, dst2_h, 7);
  dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
  ST_UB(dst0, dst);
}
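
/* Transpose helpers that move the column-ordered temp tiles produced above
 * back into raster order in dst. */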
static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
                                ptrdiff_t dst_stride) {
  v16u8 in0;
  v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

  in0 = LD_UB(src);
  out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
}

static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
                                ptrdiff_t dst_stride) {
  v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;

  LD_UB4(src, 16, in0, in1, in2, in3);
  VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_UB(tmp2, tmp0, out0, out1);
  ILVRL_W2_UB(tmp3, tmp1, out2, out3);
  ST8x4_UB(out0, out1, dst, dst_stride);
  ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
}

static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
                                  ptrdiff_t dst_stride) {
  v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
  v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
  v16u8 out9, out10, out11, out12, out13, out14, out15;

  LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                      in11, in12, in13, in14, in15, out0, out1, out2, out3,
                      out4, out5, out6, out7);
  ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
  dst += 8 * dst_stride;
  SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
  SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
  SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
  SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                      in11, in12, in13, in14, in15, out8, out9, out10, out11,
                      out12, out13, out14, out15);
  ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride);
}
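
/* Horizontal scaling passes: for each line of a tile, run the phase filter
 * when x_q4 has a fractional part, otherwise copy the center tap (offset 3
 * == SUBPEL_TAPS / 2 - 1), then transpose the finished tile into dst. */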
static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int y, z, i;

  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;

    for (z = 0; z < 4; ++z) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

      if (x_q4 & SUBPEL_MASK) {
        filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
      } else {
        for (i = 0; i < 4; ++i) {
          temp[z * 4 + i] = src_x[i * src_stride + 3];
        }
      }

      x_q4 += x_step_q4;
    }

    transpose4x4_to_dst(temp, dst, dst_stride);
    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}
static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int y, z, i;

  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;

    for (z = 0; z < 8; ++z) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

      if (x_q4 & SUBPEL_MASK) {
        filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
      } else {
        for (i = 0; i < 8; ++i) {
          temp[z * 8 + i] = src_x[3 + i * src_stride];
        }
      }

      x_q4 += x_step_q4;
    }

    transpose8x8_to_dst(temp, dst, dst_stride);
    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
  int x, y, z, i;

  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 16x16 areas. The intermediate height is not always
  // a multiple of 16, so force it to be a multiple of 16 here.
  y = h + (16 - (h & 0xF));

  do {
    int x_q4 = x0_q4;

    for (x = 0; x < w; x += 16) {
      for (z = 0; z < 16; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
        } else {
          for (i = 0; i < 16; ++i) {
            temp[z * 16 + i] = src_x[3 + i * src_stride];
          }
        }

        x_q4 += x_step_q4;
      }

      transpose16x16_to_dst(temp, dst + x, dst_stride);
    }

    src += src_stride * 16;
    dst += dst_stride * 16;
  } while (y -= 16);
}
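
/* Vertical counterparts of the filters above: each call produces one
 * output row from eight source rows using the selected 8-tap phase. */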
static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                               uint8_t *dst, const int16_t *y_filter) {
  uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
  uint32_t res;
  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
  v16i8 out0, out1;
  v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
  v16i8 shf2 = shf1 + 8;
  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
  v16i8 filt_shf1 = filt_shf0 + 2;
  v16i8 filt_shf2 = filt_shf0 + 4;
  v16i8 filt_shf3 = filt_shf0 + 6;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h;
  v8i16 filt0, filt1, filt2, filt3;

  LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
  LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
  INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
  INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
  XORI_B2_128_SB(out0, out1);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);

  filt = LD_SH(y_filter);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);

  src0_h *= filt0;
  src0_h += src1_h * filt1;
  src0_h += src2_h * filt2;
  src0_h += src3_h * filt3;

  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
  src0_h = __msa_adds_s_h(src0_h, src1_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  res = __msa_copy_u_w((v4i32)dst0, 0);
  SW(res, dst);
}
static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                               uint8_t *dst, const int16_t *y_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 dst0;
  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_SB(srcd0, srcd1, src0);
  INSERT_D2_SB(srcd2, srcd3, src1);
  LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_SB(srcd0, srcd1, src2);
  INSERT_D2_SB(srcd2, srcd3, src3);

  filt = LD_SH(y_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  XORI_B4_128_SB(src0, src1, src2, src3);
  UNPCK_SB_SH(src0, src0_h, src1_h);
  UNPCK_SB_SH(src1, src2_h, src3_h);
  UNPCK_SB_SH(src2, src4_h, src5_h);
  UNPCK_SB_SH(src3, src6_h, src7_h);

  src0_h *= filt0;
  src4_h *= filt4;
  src0_h += src1_h * filt1;
  src4_h += src5_h * filt5;
  src0_h += src2_h * filt2;
  src4_h += src6_h * filt6;
  src0_h += src3_h * filt3;
  src4_h += src7_h * filt7;

  src0_h = __msa_adds_s_h(src0_h, src4_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  ST8x1_UB(dst0, dst);
}
static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                                    uint8_t *dst, const int16_t *y_filter,
                                    int w) {
  int x;
  v16u8 dst0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  filt = LD_SH(y_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  for (x = 0; x < w; x += 16) {
    LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
    src_y += 16;

    XORI_B4_128_SB(src0, src1, src2, src3);
    XORI_B4_128_SB(src4, src5, src6, src7);
    UNPCK_SB_SH(src0, src0_h, src1_h);
    UNPCK_SB_SH(src1, src2_h, src3_h);
    UNPCK_SB_SH(src2, src4_h, src5_h);
    UNPCK_SB_SH(src3, src6_h, src7_h);
    UNPCK_SB_SH(src4, src8_h, src9_h);
    UNPCK_SB_SH(src5, src10_h, src11_h);
    UNPCK_SB_SH(src6, src12_h, src13_h);
    UNPCK_SB_SH(src7, src14_h, src15_h);

    src0_h *= filt0;
    src1_h *= filt0;
    src8_h *= filt4;
    src9_h *= filt4;
    src0_h += src2_h * filt1;
    src1_h += src3_h * filt1;
    src8_h += src10_h * filt5;
    src9_h += src11_h * filt5;
    src0_h += src4_h * filt2;
    src1_h += src5_h * filt2;
    src8_h += src12_h * filt6;
    src9_h += src13_h * filt6;
    src0_h += src6_h * filt3;
    src1_h += src7_h * filt3;
    src8_h += src14_h * filt7;
    src9_h += src15_h * filt7;

    ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
    SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
    SAT_SH2_SH(src0_h, src1_h, 7);
    dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
    ST_UB(dst0, dst);
    dst += 16;
  }
}
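
/* Vertical scaling passes: per output row, apply the phase filter when
 * y_q4 has a fractional part, otherwise copy the center source row. */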
static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (y = 0; y < h; ++y) {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      uint32_t srcd = LW(src_y + 3 * src_stride);
      SW(srcd, dst + y * dst_stride);
    }

    y_q4 += y_step_q4;
  }
}

static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (y = 0; y < h; ++y) {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      uint64_t srcd = LD(src_y + 3 * src_stride);
      SD(srcd, dst + y * dst_stride);
    }

    y_q4 += y_step_q4;
  }
}

static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
  int x, y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (y = 0; y < h; ++y) {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride],
                              y_filter, w);
    } else {
      for (x = 0; x < w; ++x) {
        dst[x + y * dst_stride] = src_y[x + 3 * src_stride];
      }
    }

    y_q4 += y_step_q4;
  }
}
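
/* Scaled 2-D predictor: filter horizontally into a 64-column temp buffer,
 * then vertically from temp into dst. The unscaled case (step == 16,
 * phase 0) degenerates to a straight copy. */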
void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) {
    vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4,
                          x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    if (w >= 16) {
      scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                                 src_stride, temp, 64, filter, x0_q4,
                                 x_step_q4, w, intermediate_height);
    } else if (w == 8) {
      scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
                              intermediate_height);
    } else {
      scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
                              intermediate_height);
    }

    if (w >= 16) {
      scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                                dst_stride, filter, y0_q4, y_step_q4, w, h);
    } else if (w == 8) {
      scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                             dst_stride, filter, y0_q4, y_step_q4, h);
    } else {
      scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                             dst_stride, filter, y0_q4, y_step_q4, h);
    }
  }
}