/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

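/* MSA (MIPS SIMD Architecture) implementations of the vpx_dsp intra
 * predictors: vertical (V), horizontal (H), DC (plus its TOP-only,
 * LEFT-only and constant-128 variants) and TM (TrueMotion), each for
 * 4x4, 8x8, 16x16 and 32x32 blocks. */
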
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

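/* Saturating (floor-at-zero) subtraction of the top-left pixel from two
 * vectors of 16-bit sums. Used by the TM predictors below to form
 * left + above - top_left without wrapping below zero. */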
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
  {                                             \
    out0 = __msa_subs_u_h(out0, in0);           \
    out1 = __msa_subs_u_h(out1, in1);           \
  }

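/* Vertical prediction: every row of the block is a copy of the row of
 * reference pixels directly above it, so the row is loaded once and
 * stored block-height times.
 * Scalar equivalent (illustrative only):
 *   for (r = 0; r < h; ++r) memcpy(dst + r * dst_stride, src, w);
 */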
static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t src_data;

  src_data = LW(src);

  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
}

static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  uint32_t src_data1, src_data2;

  src_data1 = LW(src);
  src_data2 = LW(src + 4);

  for (row = 8; row--;) {
    SW(src_data1, dst);
    SW(src_data2, (dst + 4));
    dst += dst_stride;
  }
}

static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src0;

  src0 = LD_UB(src);

  for (row = 16; row--;) {
    ST_UB(src0, dst);
    dst += dst_stride;
  }
}

static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src1, src2;

  src1 = LD_UB(src);
  src2 = LD_UB(src + 16);

  for (row = 32; row--;) {
    ST_UB2(src1, src2, dst, 16);
    dst += dst_stride;
  }
}

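/* Horizontal prediction: row r of the block is filled with left pixel r.
 * Multiplying a byte by 0x01010101 (or 0x0101010101010101 for 64 bits)
 * broadcasts it across every byte lane of the word; the wider blocks use
 * __msa_fill_b to splat one byte across a whole vector instead.
 * Scalar equivalent (illustrative only):
 *   for (r = 0; r < h; ++r) memset(dst + r * dst_stride, src[r], w);
 */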
static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t out0, out1, out2, out3;

  out0 = src[0] * 0x01010101;
  out1 = src[1] * 0x01010101;
  out2 = src[2] * 0x01010101;
  out3 = src[3] * 0x01010101;

  SW4(out0, out1, out2, out3, dst, dst_stride);
}

static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

  out0 = src[0] * 0x0101010101010101ull;
  out1 = src[1] * 0x0101010101010101ull;
  out2 = src[2] * 0x0101010101010101ull;
  out3 = src[3] * 0x0101010101010101ull;
  out4 = src[4] * 0x0101010101010101ull;
  out5 = src[5] * 0x0101010101010101ull;
  out6 = src[6] * 0x0101010101010101ull;
  out7 = src[7] * 0x0101010101010101ull;

  SD4(out0, out1, out2, out3, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out4, out5, out6, out7, dst, dst_stride);
}

static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 4; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 8; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB2(src0, src0, dst, 16);
    dst += dst_stride;
    ST_UB2(src1, src1, dst, 16);
    dst += dst_stride;
    ST_UB2(src2, src2, dst, 16);
    dst += dst_stride;
    ST_UB2(src3, src3, dst, 16);
    dst += dst_stride;
  }
}

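/* DC prediction: the whole block is filled with the rounded average of
 * the reference pixels above and to the left of it. __msa_srari_w is a
 * rounding right shift, (x + (1 << (shift - 1))) >> shift, so for an
 * NxN block the value is (sum of the 2*N edge pixels + N) >> log2(2*N):
 * shift 3 for 4x4, 4 for 8x8, 5 for 16x16 and 6 for 32x32.
 * Scalar equivalent (illustrative only):
 *   dc = (sum(above, N) + sum(left, N) + N) >> (log2(N) + 1);
 */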
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val0, val1;
  v16i8 store, src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LW(src_top);
  val1 = LW(src_left);
  INSERT_W2_SB(val0, val1, src);
  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}

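/* Single-edge DC: same idea but averaging only one row of N reference
 * pixels (rounding shift log2(N)), so one helper serves both the
 * TOP-only and LEFT-only predictors; see the vpx_dc_top_* and
 * vpx_dc_left_* wrappers at the end of the file. */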
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t val0;
  v16i8 store, data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;

  val0 = LW(src);
  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}

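/* When neither edge is available, the block is filled with the mid-gray
 * value 128. */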
static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
  uint32_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_w((v4i32)store, 0);

  SW4(out, out, out, out, dst, dst_stride);
}

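/* From 8x8 upward the byte sums no longer fit in one 64-bit lane of the
 * accumulator: __msa_hadd_u_d leaves two partial sums in separate
 * lanes, so __msa_pckev_w packs their low words side by side and one
 * more horizontal add folds them into a single total in lane 0. */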
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}

static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}

static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
  uint64_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_d((v2i64)store, 0);

  SD4(out, out, out, out, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out, out, out, out, dst, dst_stride);
}

static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  top = LD_UB(src_top);
  left = LD_UB(src_left);
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  data = LD_UB(src);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
  const v16u8 out = (v16u8)__msa_ldi_b(128);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  v16u8 top0, top1, left0, left1, out;
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src_top, 16, top0, top1);
  LD_UB2(src_left, 16, left0, left1);
  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
  sum_h = sum_top0 + sum_top1;
  sum_h += sum_left0 + sum_left1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}

static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 data0, data1, out;
  v8u16 sum_h, sum_data0, sum_data1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src, 16, data0, data1);
  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
  sum_h = sum_data0 + sum_data1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}

static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
  uint32_t row;
  const v16u8 out = (v16u8)__msa_ldi_b(128);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}

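/* TM (TrueMotion) prediction: each output pixel is
 * clip_0_255(left[r] + above[c] - top_left). The interleave + HADD_UB
 * pair forms left + above as 16-bit sums, IPRED_SUBS_UH2_UH subtracts
 * top_left with a floor at 0, and the SAT_UH* macros (immediate 7)
 * clamp the result to 255 before the bytes are packed back together.
 * Scalar equivalent (illustrative only):
 *   for (r = 0; r < bs; ++r)
 *     for (c = 0; c < bs; ++c)
 *       dst[r * dst_stride + c] =
 *           clip_0_255(left[r] + above[c] - above[-1]);
 */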
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1];
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}

static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 4; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}

static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  LD_SB2(src_top, 16, src_top0, src_top1);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}

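/* Exported vpx_dsp entry points. These match the vpx predictor
 * prototypes; edge pointers a mode does not use are cast to void.
 * The DC TOP and LEFT variants share the intra_predict_dc_tl_*
 * helpers, differing only in which edge they pass. */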
void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_vert_4x4_msa(above, dst, y_stride);
}

void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_vert_8x8_msa(above, dst, y_stride);
}

void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_vert_16x16_msa(above, dst, y_stride);
}

void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_vert_32x32_msa(above, dst, y_stride);
}

void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;
  intra_predict_horiz_4x4_msa(left, dst, y_stride);
}

void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;
  intra_predict_horiz_8x8_msa(left, dst, y_stride);
}

void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  intra_predict_horiz_16x16_msa(left, dst, y_stride);
}

void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  intra_predict_horiz_32x32_msa(left, dst, y_stride);
}

void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
}

void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
}

void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
}

void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  intra_predict_128dc_4x4_msa(dst, y_stride);
}

void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  intra_predict_128dc_8x8_msa(dst, y_stride);
}

void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  intra_predict_128dc_16x16_msa(dst, y_stride);
}

void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  intra_predict_128dc_32x32_msa(dst, y_stride);
}

void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
}