row_msa.cc 138 KB


  1. /*
  2. * Copyright 2016 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <string.h>
  11. #include "libyuv/row.h"
  12. // This module is for GCC MSA
  13. #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
  14. #include "libyuv/macros_msa.h"
  15. #ifdef __cplusplus
  16. namespace libyuv {
  17. extern "C" {
  18. #endif
  19. #define ALPHA_VAL (-1)
  20. // Fill YUV -> RGB conversion constants into vectors
  21. #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
  22. { \
  23. ub = __msa_fill_w(yuvconst->kUVToB[0]); \
  24. vr = __msa_fill_w(yuvconst->kUVToR[1]); \
  25. ug = __msa_fill_w(yuvconst->kUVToG[0]); \
  26. vg = __msa_fill_w(yuvconst->kUVToG[1]); \
  27. bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
  28. bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
  29. br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
  30. yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
  31. }
  32. // Load YUV 422 pixel data
  33. #define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  34. { \
  35. uint64_t y_m; \
  36. uint32_t u_m, v_m; \
  37. v4i32 zero_m = {0}; \
  38. y_m = LD(psrc_y); \
  39. u_m = LW(psrc_u); \
  40. v_m = LW(psrc_v); \
  41. out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
  42. out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \
  43. out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \
  44. }
  45. // Clip input vector elements between 0 to 255
  46. #define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
  47. { \
  48. v4i32 max_m = __msa_ldi_w(0xFF); \
  49. \
  50. in0 = __msa_maxi_s_w(in0, 0); \
  51. in1 = __msa_maxi_s_w(in1, 0); \
  52. in2 = __msa_maxi_s_w(in2, 0); \
  53. in3 = __msa_maxi_s_w(in3, 0); \
  54. in4 = __msa_maxi_s_w(in4, 0); \
  55. in5 = __msa_maxi_s_w(in5, 0); \
  56. in0 = __msa_min_s_w(max_m, in0); \
  57. in1 = __msa_min_s_w(max_m, in1); \
  58. in2 = __msa_min_s_w(max_m, in2); \
  59. in3 = __msa_min_s_w(max_m, in3); \
  60. in4 = __msa_min_s_w(max_m, in4); \
  61. in5 = __msa_min_s_w(max_m, in5); \
  62. }
  63. // Convert 8 pixels of YUV 420 to RGB.
  64. #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
  65. { \
  66. v8i16 vec0_m, vec1_m; \
  67. v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
  68. v4i32 reg5_m, reg6_m, reg7_m; \
  69. v16i8 zero_m = {0}; \
  70. \
  71. vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
  72. vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
  73. reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
  74. reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
  75. reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
  76. reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
  77. reg0_m *= yg; \
  78. reg1_m *= yg; \
  79. reg2_m *= ubvr; \
  80. reg3_m *= ubvr; \
  81. reg0_m = __msa_srai_w(reg0_m, 16); \
  82. reg1_m = __msa_srai_w(reg1_m, 16); \
  83. reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
  84. reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
  85. reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
  86. reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
  87. reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
  88. reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
  89. reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
  90. reg5_m = reg0_m - reg5_m; \
  91. reg6_m = reg1_m - reg6_m; \
  92. reg2_m = reg0_m - reg2_m; \
  93. reg3_m = reg1_m - reg3_m; \
  94. reg7_m = reg0_m - reg7_m; \
  95. reg4_m = reg1_m - reg4_m; \
  96. reg5_m += bb; \
  97. reg6_m += bb; \
  98. reg7_m += bg; \
  99. reg4_m += bg; \
  100. reg2_m += br; \
  101. reg3_m += br; \
  102. reg5_m = __msa_srai_w(reg5_m, 6); \
  103. reg6_m = __msa_srai_w(reg6_m, 6); \
  104. reg7_m = __msa_srai_w(reg7_m, 6); \
  105. reg4_m = __msa_srai_w(reg4_m, 6); \
  106. reg2_m = __msa_srai_w(reg2_m, 6); \
  107. reg3_m = __msa_srai_w(reg3_m, 6); \
  108. CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
  109. out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
  110. out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
  111. out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
  112. }
  113. // Pack and Store 8 ARGB values.
  114. #define STOREARGB(in0, in1, in2, in3, pdst_argb) \
  115. { \
  116. v8i16 vec0_m, vec1_m; \
  117. v16u8 dst0_m, dst1_m; \
  118. vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
  119. vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
  120. dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
  121. dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
  122. ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
  123. }
  124. // Takes ARGB input and calculates Y.
  125. #define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
  126. y_out) \
  127. { \
  128. v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \
  129. v8u16 reg0_m, reg1_m; \
  130. \
  131. vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \
  132. vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \
  133. vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \
  134. vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \
  135. reg0_m = __msa_dotp_u_h(vec0_m, const0); \
  136. reg1_m = __msa_dotp_u_h(vec1_m, const0); \
  137. reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \
  138. reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \
  139. reg0_m += const2; \
  140. reg1_m += const2; \
  141. reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \
  142. reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \
  143. y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
  144. }
  145. // Loads current and next row of ARGB input and averages it to calculate U and V
  146. #define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
  147. { \
  148. v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
  149. v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
  150. v16u8 vec8_m, vec9_m; \
  151. v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
  152. v8u16 reg8_m, reg9_m; \
  153. \
  154. src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
  155. src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
  156. src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
  157. src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
  158. src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
  159. src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
  160. src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
  161. src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
  162. vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
  163. vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
  164. vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
  165. vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
  166. vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
  167. vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
  168. vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
  169. vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
  170. reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \
  171. reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \
  172. reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \
  173. reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \
  174. reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \
  175. reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \
  176. reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \
  177. reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \
  178. reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
  179. reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
  180. reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
  181. reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
  182. reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
  183. reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
  184. reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
  185. reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
  186. reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
  187. reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
  188. reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
  189. reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
  190. argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
  191. argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
  192. src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
  193. src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
  194. src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
  195. src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
  196. src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
  197. src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
  198. src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
  199. src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
  200. vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
  201. vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
  202. vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
  203. vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
  204. vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
  205. vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
  206. vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
  207. vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
  208. reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
  209. reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
  210. reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
  211. reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
  212. reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
  213. reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
  214. reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
  215. reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
  216. reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
  217. reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
  218. reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
  219. reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
  220. reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
  221. reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
  222. reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
  223. reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
  224. reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
  225. reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
  226. reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
  227. reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
  228. argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
  229. argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
  230. }
  231. // Takes ARGB input and calculates U and V.
  232. #define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
  233. shf0, shf1, shf2, shf3, v_out, u_out) \
  234. { \
  235. v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
  236. v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
  237. \
  238. vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
  239. vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
  240. vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
  241. vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
  242. vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
  243. vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
  244. vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
  245. vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
  246. reg0_m = __msa_dotp_u_h(vec0_m, const1); \
  247. reg1_m = __msa_dotp_u_h(vec1_m, const1); \
  248. reg2_m = __msa_dotp_u_h(vec4_m, const1); \
  249. reg3_m = __msa_dotp_u_h(vec5_m, const1); \
  250. reg0_m += const3; \
  251. reg1_m += const3; \
  252. reg2_m += const3; \
  253. reg3_m += const3; \
  254. reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
  255. reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
  256. reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
  257. reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
  258. v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
  259. u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
  260. }
  261. // Load I444 pixel data
  262. #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  263. { \
  264. uint64_t y_m, u_m, v_m; \
  265. v2i64 zero_m = {0}; \
  266. y_m = LD(psrc_y); \
  267. u_m = LD(psrc_u); \
  268. v_m = LD(psrc_v); \
  269. out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \
  270. out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \
  271. out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
  272. }
  273. void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
  274. int x;
  275. v16u8 src0, src1, src2, src3;
  276. v16u8 dst0, dst1, dst2, dst3;
  277. v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  278. src += width - 64;
  279. for (x = 0; x < width; x += 64) {
  280. LD_UB4(src, 16, src3, src2, src1, src0);
  281. VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
  282. VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
  283. ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
  284. dst += 64;
  285. src -= 64;
  286. }
  287. }
  288. void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
  289. int x;
  290. v16u8 src0, src1, src2, src3;
  291. v16u8 dst0, dst1, dst2, dst3;
  292. v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
  293. src += width * 4 - 64;
  294. for (x = 0; x < width; x += 16) {
  295. LD_UB4(src, 16, src3, src2, src1, src0);
  296. VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
  297. VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
  298. ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
  299. dst += 64;
  300. src -= 64;
  301. }
  302. }
  303. void I422ToYUY2Row_MSA(const uint8_t* src_y,
  304. const uint8_t* src_u,
  305. const uint8_t* src_v,
  306. uint8_t* dst_yuy2,
  307. int width) {
  308. int x;
  309. v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  310. v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
  311. for (x = 0; x < width; x += 32) {
  312. src_u0 = LD_UB(src_u);
  313. src_v0 = LD_UB(src_v);
  314. LD_UB2(src_y, 16, src_y0, src_y1);
  315. ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
  316. ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
  317. ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
  318. ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
  319. src_u += 16;
  320. src_v += 16;
  321. src_y += 32;
  322. dst_yuy2 += 64;
  323. }
  324. }
  325. void I422ToUYVYRow_MSA(const uint8_t* src_y,
  326. const uint8_t* src_u,
  327. const uint8_t* src_v,
  328. uint8_t* dst_uyvy,
  329. int width) {
  330. int x;
  331. v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  332. v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
  333. for (x = 0; x < width; x += 32) {
  334. src_u0 = LD_UB(src_u);
  335. src_v0 = LD_UB(src_v);
  336. LD_UB2(src_y, 16, src_y0, src_y1);
  337. ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
  338. ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
  339. ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
  340. ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
  341. src_u += 16;
  342. src_v += 16;
  343. src_y += 32;
  344. dst_uyvy += 64;
  345. }
  346. }
  347. void I422ToARGBRow_MSA(const uint8_t* src_y,
  348. const uint8_t* src_u,
  349. const uint8_t* src_v,
  350. uint8_t* dst_argb,
  351. const struct YuvConstants* yuvconstants,
  352. int width) {
  353. int x;
  354. v16u8 src0, src1, src2;
  355. v8i16 vec0, vec1, vec2;
  356. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  357. v4i32 vec_ubvr, vec_ugvg;
  358. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  359. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  360. vec_br, vec_yg);
  361. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  362. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  363. for (x = 0; x < width; x += 8) {
  364. READYUV422(src_y, src_u, src_v, src0, src1, src2);
  365. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  366. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  367. vec0, vec1, vec2);
  368. STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
  369. src_y += 8;
  370. src_u += 4;
  371. src_v += 4;
  372. dst_argb += 32;
  373. }
  374. }
  375. void I422ToRGBARow_MSA(const uint8_t* src_y,
  376. const uint8_t* src_u,
  377. const uint8_t* src_v,
  378. uint8_t* dst_argb,
  379. const struct YuvConstants* yuvconstants,
  380. int width) {
  381. int x;
  382. v16u8 src0, src1, src2;
  383. v8i16 vec0, vec1, vec2;
  384. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  385. v4i32 vec_ubvr, vec_ugvg;
  386. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  387. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  388. vec_br, vec_yg);
  389. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  390. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  391. for (x = 0; x < width; x += 8) {
  392. READYUV422(src_y, src_u, src_v, src0, src1, src2);
  393. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  394. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  395. vec0, vec1, vec2);
  396. STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
  397. src_y += 8;
  398. src_u += 4;
  399. src_v += 4;
  400. dst_argb += 32;
  401. }
  402. }
  403. void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
  404. const uint8_t* src_u,
  405. const uint8_t* src_v,
  406. const uint8_t* src_a,
  407. uint8_t* dst_argb,
  408. const struct YuvConstants* yuvconstants,
  409. int width) {
  410. int x;
  411. int64_t data_a;
  412. v16u8 src0, src1, src2, src3;
  413. v8i16 vec0, vec1, vec2;
  414. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  415. v4i32 vec_ubvr, vec_ugvg;
  416. v4i32 zero = {0};
  417. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  418. vec_br, vec_yg);
  419. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  420. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  421. for (x = 0; x < width; x += 8) {
  422. data_a = LD(src_a);
  423. READYUV422(src_y, src_u, src_v, src0, src1, src2);
  424. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  425. src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
  426. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  427. vec0, vec1, vec2);
  428. src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
  429. STOREARGB(vec0, vec1, vec2, src3, dst_argb);
  430. src_y += 8;
  431. src_u += 4;
  432. src_v += 4;
  433. src_a += 8;
  434. dst_argb += 32;
  435. }
  436. }
  437. void I422ToRGB24Row_MSA(const uint8_t* src_y,
  438. const uint8_t* src_u,
  439. const uint8_t* src_v,
  440. uint8_t* dst_argb,
  441. const struct YuvConstants* yuvconstants,
  442. int32_t width) {
  443. int x;
  444. int64_t data_u, data_v;
  445. v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  446. v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  447. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  448. v4i32 vec_ubvr, vec_ugvg;
  449. v16u8 reg0, reg1, reg2, reg3;
  450. v2i64 zero = {0};
  451. v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  452. v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
  453. v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
  454. 11, 29, 12, 13, 30, 14, 15, 31};
  455. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  456. vec_br, vec_yg);
  457. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  458. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  459. for (x = 0; x < width; x += 16) {
  460. src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
  461. data_u = LD(src_u);
  462. data_v = LD(src_v);
  463. src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
  464. src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
  465. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  466. src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
  467. src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
  468. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  469. vec0, vec1, vec2);
  470. YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  471. vec3, vec4, vec5);
  472. reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
  473. reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
  474. reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
  475. reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
  476. dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
  477. dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
  478. dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
  479. ST_UB2(dst0, dst1, dst_argb, 16);
  480. ST_UB(dst2, (dst_argb + 32));
  481. src_y += 16;
  482. src_u += 8;
  483. src_v += 8;
  484. dst_argb += 48;
  485. }
  486. }
  487. // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
  488. void I422ToRGB565Row_MSA(const uint8_t* src_y,
  489. const uint8_t* src_u,
  490. const uint8_t* src_v,
  491. uint8_t* dst_rgb565,
  492. const struct YuvConstants* yuvconstants,
  493. int width) {
  494. int x;
  495. v16u8 src0, src1, src2, dst0;
  496. v8i16 vec0, vec1, vec2;
  497. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  498. v4i32 vec_ubvr, vec_ugvg;
  499. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  500. vec_br, vec_yg);
  501. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  502. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  503. for (x = 0; x < width; x += 8) {
  504. READYUV422(src_y, src_u, src_v, src0, src1, src2);
  505. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  506. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  507. vec0, vec2, vec1);
  508. vec0 = __msa_srai_h(vec0, 3);
  509. vec1 = __msa_srai_h(vec1, 3);
  510. vec2 = __msa_srai_h(vec2, 2);
  511. vec1 = __msa_slli_h(vec1, 11);
  512. vec2 = __msa_slli_h(vec2, 5);
  513. vec0 |= vec1;
  514. dst0 = (v16u8)(vec2 | vec0);
  515. ST_UB(dst0, dst_rgb565);
  516. src_y += 8;
  517. src_u += 4;
  518. src_v += 4;
  519. dst_rgb565 += 16;
  520. }
  521. }
  522. // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
  523. void I422ToARGB4444Row_MSA(const uint8_t* src_y,
  524. const uint8_t* src_u,
  525. const uint8_t* src_v,
  526. uint8_t* dst_argb4444,
  527. const struct YuvConstants* yuvconstants,
  528. int width) {
  529. int x;
  530. v16u8 src0, src1, src2, dst0;
  531. v8i16 vec0, vec1, vec2;
  532. v8u16 reg0, reg1, reg2;
  533. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  534. v4i32 vec_ubvr, vec_ugvg;
  535. v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
  536. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  537. vec_br, vec_yg);
  538. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  539. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  540. for (x = 0; x < width; x += 8) {
  541. READYUV422(src_y, src_u, src_v, src0, src1, src2);
  542. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  543. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  544. vec0, vec1, vec2);
  545. reg0 = (v8u16)__msa_srai_h(vec0, 4);
  546. reg1 = (v8u16)__msa_srai_h(vec1, 4);
  547. reg2 = (v8u16)__msa_srai_h(vec2, 4);
  548. reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
  549. reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
  550. reg1 |= const_0xF000;
  551. reg0 |= reg2;
  552. dst0 = (v16u8)(reg1 | reg0);
  553. ST_UB(dst0, dst_argb4444);
  554. src_y += 8;
  555. src_u += 4;
  556. src_v += 4;
  557. dst_argb4444 += 16;
  558. }
  559. }
  560. void I422ToARGB1555Row_MSA(const uint8_t* src_y,
  561. const uint8_t* src_u,
  562. const uint8_t* src_v,
  563. uint8_t* dst_argb1555,
  564. const struct YuvConstants* yuvconstants,
  565. int width) {
  566. int x;
  567. v16u8 src0, src1, src2, dst0;
  568. v8i16 vec0, vec1, vec2;
  569. v8u16 reg0, reg1, reg2;
  570. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  571. v4i32 vec_ubvr, vec_ugvg;
  572. v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
  573. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  574. vec_br, vec_yg);
  575. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  576. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  577. for (x = 0; x < width; x += 8) {
  578. READYUV422(src_y, src_u, src_v, src0, src1, src2);
  579. src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
  580. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  581. vec0, vec1, vec2);
  582. reg0 = (v8u16)__msa_srai_h(vec0, 3);
  583. reg1 = (v8u16)__msa_srai_h(vec1, 3);
  584. reg2 = (v8u16)__msa_srai_h(vec2, 3);
  585. reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
  586. reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
  587. reg1 |= const_0x8000;
  588. reg0 |= reg2;
  589. dst0 = (v16u8)(reg1 | reg0);
  590. ST_UB(dst0, dst_argb1555);
  591. src_y += 8;
  592. src_u += 4;
  593. src_v += 4;
  594. dst_argb1555 += 16;
  595. }
  596. }
  597. void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  598. int x;
  599. v16u8 src0, src1, src2, src3, dst0, dst1;
  600. for (x = 0; x < width; x += 32) {
  601. LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
  602. dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  603. dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  604. ST_UB2(dst0, dst1, dst_y, 16);
  605. src_yuy2 += 64;
  606. dst_y += 32;
  607. }
  608. }
  609. void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
  610. int src_stride_yuy2,
  611. uint8_t* dst_u,
  612. uint8_t* dst_v,
  613. int width) {
  614. const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  615. int x;
  616. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  617. v16u8 vec0, vec1, dst0, dst1;
  618. for (x = 0; x < width; x += 32) {
  619. LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
  620. LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
  621. src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  622. src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  623. src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
  624. src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
  625. vec0 = __msa_aver_u_b(src0, src2);
  626. vec1 = __msa_aver_u_b(src1, src3);
  627. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  628. dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
  629. ST_UB(dst0, dst_u);
  630. ST_UB(dst1, dst_v);
  631. src_yuy2 += 64;
  632. src_yuy2_next += 64;
  633. dst_u += 16;
  634. dst_v += 16;
  635. }
  636. }
  637. void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
  638. uint8_t* dst_u,
  639. uint8_t* dst_v,
  640. int width) {
  641. int x;
  642. v16u8 src0, src1, src2, src3, dst0, dst1;
  643. for (x = 0; x < width; x += 32) {
  644. LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
  645. src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  646. src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  647. dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  648. dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  649. ST_UB(dst0, dst_u);
  650. ST_UB(dst1, dst_v);
  651. src_yuy2 += 64;
  652. dst_u += 16;
  653. dst_v += 16;
  654. }
  655. }
  656. void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  657. int x;
  658. v16u8 src0, src1, src2, src3, dst0, dst1;
  659. for (x = 0; x < width; x += 32) {
  660. LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
  661. dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  662. dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  663. ST_UB2(dst0, dst1, dst_y, 16);
  664. src_uyvy += 64;
  665. dst_y += 32;
  666. }
  667. }
  668. void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
  669. int src_stride_uyvy,
  670. uint8_t* dst_u,
  671. uint8_t* dst_v,
  672. int width) {
  673. const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
  674. int x;
  675. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  676. v16u8 vec0, vec1, dst0, dst1;
  677. for (x = 0; x < width; x += 32) {
  678. LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
  679. LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
  680. src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  681. src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  682. src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
  683. src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
  684. vec0 = __msa_aver_u_b(src0, src2);
  685. vec1 = __msa_aver_u_b(src1, src3);
  686. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  687. dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
  688. ST_UB(dst0, dst_u);
  689. ST_UB(dst1, dst_v);
  690. src_uyvy += 64;
  691. src_uyvy_next += 64;
  692. dst_u += 16;
  693. dst_v += 16;
  694. }
  695. }
  696. void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
  697. uint8_t* dst_u,
  698. uint8_t* dst_v,
  699. int width) {
  700. int x;
  701. v16u8 src0, src1, src2, src3, dst0, dst1;
  702. for (x = 0; x < width; x += 32) {
  703. LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
  704. src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  705. src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  706. dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  707. dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  708. ST_UB(dst0, dst_u);
  709. ST_UB(dst1, dst_v);
  710. src_uyvy += 64;
  711. dst_u += 16;
  712. dst_v += 16;
  713. }
  714. }
  715. void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  716. int x;
  717. v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  718. v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  719. v16i8 zero = {0};
  720. v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  721. v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  722. v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  723. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  724. for (x = 0; x < width; x += 16) {
  725. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
  726. src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
  727. src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
  728. src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
  729. vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  730. vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  731. vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  732. vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  733. reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
  734. reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
  735. reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
  736. reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
  737. reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
  738. reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
  739. reg0 *= const_0x19;
  740. reg1 *= const_0x19;
  741. reg2 *= const_0x81;
  742. reg3 *= const_0x81;
  743. reg4 *= const_0x42;
  744. reg5 *= const_0x42;
  745. reg0 += reg2;
  746. reg1 += reg3;
  747. reg0 += reg4;
  748. reg1 += reg5;
  749. reg0 += const_0x1080;
  750. reg1 += const_0x1080;
  751. reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
  752. reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
  753. dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
  754. ST_UB(dst0, dst_y);
  755. src_argb0 += 64;
  756. dst_y += 16;
  757. }
  758. }
  759. void ARGBToUVRow_MSA(const uint8_t* src_argb0,
  760. int src_stride_argb,
  761. uint8_t* dst_u,
  762. uint8_t* dst_v,
  763. int width) {
  764. int x;
  765. const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
  766. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  767. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  768. v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  769. v16u8 dst0, dst1;
  770. v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  771. v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  772. v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  773. v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  774. v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  775. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  776. for (x = 0; x < width; x += 32) {
  777. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
  778. src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
  779. src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
  780. src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
  781. src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
  782. src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
  783. src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
  784. src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
  785. vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  786. vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  787. vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
  788. vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
  789. vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  790. vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  791. vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
  792. vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
  793. vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  794. vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  795. vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
  796. vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
  797. vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
  798. vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
  799. reg0 = __msa_hadd_u_h(vec8, vec8);
  800. reg1 = __msa_hadd_u_h(vec9, vec9);
  801. reg2 = __msa_hadd_u_h(vec4, vec4);
  802. reg3 = __msa_hadd_u_h(vec5, vec5);
  803. reg4 = __msa_hadd_u_h(vec0, vec0);
  804. reg5 = __msa_hadd_u_h(vec1, vec1);
  805. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
  806. src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
  807. src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
  808. src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
  809. src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
  810. src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
  811. src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
  812. src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
  813. vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  814. vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  815. vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
  816. vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
  817. vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  818. vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  819. vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
  820. vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
  821. vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  822. vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  823. vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
  824. vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
  825. vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
  826. vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
  827. reg0 += __msa_hadd_u_h(vec8, vec8);
  828. reg1 += __msa_hadd_u_h(vec9, vec9);
  829. reg2 += __msa_hadd_u_h(vec4, vec4);
  830. reg3 += __msa_hadd_u_h(vec5, vec5);
  831. reg4 += __msa_hadd_u_h(vec0, vec0);
  832. reg5 += __msa_hadd_u_h(vec1, vec1);
  833. reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
  834. reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
  835. reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
  836. reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
  837. reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
  838. reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
  839. reg6 = reg0 * const_0x70;
  840. reg7 = reg1 * const_0x70;
  841. reg8 = reg2 * const_0x4A;
  842. reg9 = reg3 * const_0x4A;
  843. reg6 += const_0x8080;
  844. reg7 += const_0x8080;
  845. reg8 += reg4 * const_0x26;
  846. reg9 += reg5 * const_0x26;
  847. reg0 *= const_0x12;
  848. reg1 *= const_0x12;
  849. reg2 *= const_0x5E;
  850. reg3 *= const_0x5E;
  851. reg4 *= const_0x70;
  852. reg5 *= const_0x70;
  853. reg2 += reg0;
  854. reg3 += reg1;
  855. reg4 += const_0x8080;
  856. reg5 += const_0x8080;
  857. reg6 -= reg8;
  858. reg7 -= reg9;
  859. reg4 -= reg2;
  860. reg5 -= reg3;
  861. reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
  862. reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
  863. reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
  864. reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
  865. dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
  866. dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
  867. ST_UB(dst0, dst_u);
  868. ST_UB(dst1, dst_v);
  869. src_argb0 += 128;
  870. src_argb0_next += 128;
  871. dst_u += 16;
  872. dst_v += 16;
  873. }
  874. }
  875. void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  876. int x;
  877. v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  878. v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
  879. v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
  880. 16, 17, 18, 20, 21, 22, 24, 25};
  881. v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
  882. 21, 22, 24, 25, 26, 28, 29, 30};
  883. for (x = 0; x < width; x += 16) {
  884. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  885. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  886. src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
  887. src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
  888. dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
  889. dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
  890. dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
  891. ST_UB2(dst0, dst1, dst_rgb, 16);
  892. ST_UB(dst2, (dst_rgb + 32));
  893. src_argb += 64;
  894. dst_rgb += 48;
  895. }
  896. }
  897. void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  898. int x;
  899. v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  900. v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
  901. v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
  902. 18, 17, 16, 22, 21, 20, 26, 25};
  903. v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
  904. 21, 20, 26, 25, 24, 30, 29, 28};
  905. for (x = 0; x < width; x += 16) {
  906. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  907. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  908. src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
  909. src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
  910. dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
  911. dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
  912. dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
  913. ST_UB2(dst0, dst1, dst_rgb, 16);
  914. ST_UB(dst2, (dst_rgb + 32));
  915. src_argb += 64;
  916. dst_rgb += 48;
  917. }
  918. }
  919. void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  920. int x;
  921. v16u8 src0, src1, dst0;
  922. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  923. v16i8 zero = {0};
  924. for (x = 0; x < width; x += 8) {
  925. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  926. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  927. vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
  928. vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
  929. vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
  930. vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
  931. vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
  932. vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
  933. vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
  934. vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
  935. vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
  936. vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
  937. vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
  938. vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
  939. vec0 = __msa_binsli_b(vec0, vec1, 2);
  940. vec1 = __msa_binsli_b(vec2, vec3, 4);
  941. vec4 = __msa_binsli_b(vec4, vec5, 2);
  942. vec5 = __msa_binsli_b(vec6, vec7, 4);
  943. vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
  944. vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
  945. dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
  946. ST_UB(dst0, dst_rgb);
  947. src_argb += 32;
  948. dst_rgb += 16;
  949. }
  950. }
  951. void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
  952. uint8_t* dst_rgb,
  953. int width) {
  954. int x;
  955. v16u8 src0, src1, dst0;
  956. v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  957. v16i8 zero = {0};
  958. for (x = 0; x < width; x += 8) {
  959. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  960. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  961. vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
  962. vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
  963. vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
  964. vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
  965. vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
  966. vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
  967. vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
  968. vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
  969. vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
  970. vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
  971. vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
  972. vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
  973. vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
  974. vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
  975. vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
  976. vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
  977. vec0 = __msa_binsli_b(vec0, vec1, 2);
  978. vec5 = __msa_binsli_b(vec5, vec6, 2);
  979. vec1 = __msa_binsli_b(vec2, vec3, 5);
  980. vec6 = __msa_binsli_b(vec7, vec8, 5);
  981. vec1 = __msa_binsli_b(vec1, vec4, 0);
  982. vec6 = __msa_binsli_b(vec6, vec9, 0);
  983. vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
  984. vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
  985. dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
  986. ST_UB(dst0, dst_rgb);
  987. src_argb += 32;
  988. dst_rgb += 16;
  989. }
  990. }
  991. void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
  992. uint8_t* dst_rgb,
  993. int width) {
  994. int x;
  995. v16u8 src0, src1;
  996. v16u8 vec0, vec1;
  997. v16u8 dst0;
  998. v16i8 zero = {0};
  999. for (x = 0; x < width; x += 8) {
  1000. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  1001. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  1002. vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
  1003. vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
  1004. src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
  1005. src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
  1006. vec0 = __msa_binsli_b(vec0, src0, 3);
  1007. vec1 = __msa_binsli_b(vec1, src1, 3);
  1008. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1009. ST_UB(dst0, dst_rgb);
  1010. src_argb += 32;
  1011. dst_rgb += 16;
  1012. }
  1013. }
  1014. void ARGBToUV444Row_MSA(const uint8_t* src_argb,
  1015. uint8_t* dst_u,
  1016. uint8_t* dst_v,
  1017. int32_t width) {
  1018. int32_t x;
  1019. v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
  1020. v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  1021. v8u16 vec8, vec9, vec10, vec11;
  1022. v8u16 const_112 = (v8u16)__msa_ldi_h(112);
  1023. v8u16 const_74 = (v8u16)__msa_ldi_h(74);
  1024. v8u16 const_38 = (v8u16)__msa_ldi_h(38);
  1025. v8u16 const_94 = (v8u16)__msa_ldi_h(94);
  1026. v8u16 const_18 = (v8u16)__msa_ldi_h(18);
  1027. v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
  1028. v16i8 zero = {0};
  1029. for (x = width; x > 0; x -= 16) {
  1030. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  1031. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  1032. src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
  1033. src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
  1034. reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  1035. reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  1036. reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  1037. reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  1038. src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
  1039. src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
  1040. src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
  1041. vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
  1042. vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
  1043. vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
  1044. vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
  1045. vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
  1046. vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
  1047. vec10 = vec0 * const_18;
  1048. vec11 = vec1 * const_18;
  1049. vec8 = vec2 * const_94;
  1050. vec9 = vec3 * const_94;
  1051. vec6 = vec4 * const_112;
  1052. vec7 = vec5 * const_112;
  1053. vec0 *= const_112;
  1054. vec1 *= const_112;
  1055. vec2 *= const_74;
  1056. vec3 *= const_74;
  1057. vec4 *= const_38;
  1058. vec5 *= const_38;
  1059. vec8 += vec10;
  1060. vec9 += vec11;
  1061. vec6 += const_32896;
  1062. vec7 += const_32896;
  1063. vec0 += const_32896;
  1064. vec1 += const_32896;
  1065. vec2 += vec4;
  1066. vec3 += vec5;
  1067. vec0 -= vec2;
  1068. vec1 -= vec3;
  1069. vec6 -= vec8;
  1070. vec7 -= vec9;
  1071. vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
  1072. vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
  1073. vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
  1074. vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
  1075. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1076. dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
  1077. ST_UB(dst0, dst_u);
  1078. ST_UB(dst1, dst_v);
  1079. src_argb += 64;
  1080. dst_u += 16;
  1081. dst_v += 16;
  1082. }
  1083. }
  1084. void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
  1085. const uint8_t* src_argb1,
  1086. uint8_t* dst_argb,
  1087. int width) {
  1088. int x;
  1089. v16u8 src0, src1, dst0;
  1090. v8u16 vec0, vec1, vec2, vec3;
  1091. v4u32 reg0, reg1, reg2, reg3;
  1092. v8i16 zero = {0};
  1093. for (x = 0; x < width; x += 4) {
  1094. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  1095. src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
  1096. vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
  1097. vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
  1098. vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
  1099. vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
  1100. reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
  1101. reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
  1102. reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
  1103. reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
  1104. reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
  1105. reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
  1106. reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
  1107. reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
  1108. reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
  1109. reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
  1110. reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
  1111. reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
  1112. vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  1113. vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  1114. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1115. ST_UB(dst0, dst_argb);
  1116. src_argb0 += 16;
  1117. src_argb1 += 16;
  1118. dst_argb += 16;
  1119. }
  1120. }
  1121. void ARGBAddRow_MSA(const uint8_t* src_argb0,
  1122. const uint8_t* src_argb1,
  1123. uint8_t* dst_argb,
  1124. int width) {
  1125. int x;
  1126. v16u8 src0, src1, src2, src3, dst0, dst1;
  1127. for (x = 0; x < width; x += 8) {
  1128. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  1129. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  1130. src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
  1131. src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
  1132. dst0 = __msa_adds_u_b(src0, src2);
  1133. dst1 = __msa_adds_u_b(src1, src3);
  1134. ST_UB2(dst0, dst1, dst_argb, 16);
  1135. src_argb0 += 32;
  1136. src_argb1 += 32;
  1137. dst_argb += 32;
  1138. }
  1139. }
  1140. void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
  1141. const uint8_t* src_argb1,
  1142. uint8_t* dst_argb,
  1143. int width) {
  1144. int x;
  1145. v16u8 src0, src1, src2, src3, dst0, dst1;
  1146. for (x = 0; x < width; x += 8) {
  1147. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  1148. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  1149. src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
  1150. src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
  1151. dst0 = __msa_subs_u_b(src0, src2);
  1152. dst1 = __msa_subs_u_b(src1, src3);
  1153. ST_UB2(dst0, dst1, dst_argb, 16);
  1154. src_argb0 += 32;
  1155. src_argb1 += 32;
  1156. dst_argb += 32;
  1157. }
  1158. }
  1159. void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
  1160. uint8_t* dst_argb,
  1161. int width) {
  1162. int x;
  1163. v16u8 src0, src1, dst0, dst1;
  1164. v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  1165. v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  1166. v8i16 zero = {0};
  1167. v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
  1168. for (x = 0; x < width; x += 8) {
  1169. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  1170. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  1171. vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
  1172. vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
  1173. vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
  1174. vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
  1175. vec4 = (v8u16)__msa_fill_h(vec0[3]);
  1176. vec5 = (v8u16)__msa_fill_h(vec0[7]);
  1177. vec6 = (v8u16)__msa_fill_h(vec1[3]);
  1178. vec7 = (v8u16)__msa_fill_h(vec1[7]);
  1179. vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
  1180. vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
  1181. vec6 = (v8u16)__msa_fill_h(vec2[3]);
  1182. vec7 = (v8u16)__msa_fill_h(vec2[7]);
  1183. vec8 = (v8u16)__msa_fill_h(vec3[3]);
  1184. vec9 = (v8u16)__msa_fill_h(vec3[7]);
  1185. vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
  1186. vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
  1187. reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
  1188. reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
  1189. reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
  1190. reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
  1191. reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
  1192. reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
  1193. reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
  1194. reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
  1195. reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
  1196. reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
  1197. reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
  1198. reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
  1199. reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
  1200. reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
  1201. reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
  1202. reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
  1203. reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
  1204. reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
  1205. reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
  1206. reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
  1207. reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
  1208. reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
  1209. reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
  1210. reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
  1211. vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  1212. vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  1213. vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
  1214. vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
  1215. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1216. dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  1217. dst0 = __msa_bmnz_v(dst0, src0, mask);
  1218. dst1 = __msa_bmnz_v(dst1, src1, mask);
  1219. ST_UB2(dst0, dst1, dst_argb, 16);
  1220. src_argb += 32;
  1221. dst_argb += 32;
  1222. }
  1223. }
  1224. void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
  1225. uint8_t* dst_rgb,
  1226. uint32_t dither4,
  1227. int width) {
  1228. int x;
  1229. v16u8 src0, src1, dst0, vec0, vec1;
  1230. v8i16 vec_d0;
  1231. v8i16 reg0, reg1, reg2;
  1232. v16i8 zero = {0};
  1233. v8i16 max = __msa_ldi_h(0xFF);
  1234. vec_d0 = (v8i16)__msa_fill_w(dither4);
  1235. vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
  1236. for (x = 0; x < width; x += 8) {
  1237. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  1238. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  1239. vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  1240. vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  1241. reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
  1242. reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
  1243. reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
  1244. reg0 += vec_d0;
  1245. reg1 += vec_d0;
  1246. reg2 += vec_d0;
  1247. reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
  1248. reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
  1249. reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
  1250. reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
  1251. reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
  1252. reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
  1253. reg0 = __msa_srai_h(reg0, 3);
  1254. reg2 = __msa_srai_h(reg2, 3);
  1255. reg1 = __msa_srai_h(reg1, 2);
  1256. reg2 = __msa_slli_h(reg2, 11);
  1257. reg1 = __msa_slli_h(reg1, 5);
  1258. reg0 |= reg1;
  1259. dst0 = (v16u8)(reg0 | reg2);
  1260. ST_UB(dst0, dst_rgb);
  1261. src_argb += 32;
  1262. dst_rgb += 16;
  1263. }
  1264. }
  1265. void ARGBShuffleRow_MSA(const uint8_t* src_argb,
  1266. uint8_t* dst_argb,
  1267. const uint8_t* shuffler,
  1268. int width) {
  1269. int x;
  1270. v16u8 src0, src1, dst0, dst1;
  1271. v16i8 vec0;
  1272. v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  1273. int32_t val = LW((int32_t*)shuffler);
  1274. vec0 = (v16i8)__msa_fill_w(val);
  1275. shuffler_vec += vec0;
  1276. for (x = 0; x < width; x += 8) {
  1277. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
  1278. src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
  1279. dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
  1280. dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
  1281. ST_UB2(dst0, dst1, dst_argb, 16);
  1282. src_argb += 32;
  1283. dst_argb += 32;
  1284. }
  1285. }
  1286. void ARGBShadeRow_MSA(const uint8_t* src_argb,
  1287. uint8_t* dst_argb,
  1288. int width,
  1289. uint32_t value) {
  1290. int x;
  1291. v16u8 src0, dst0;
  1292. v8u16 vec0, vec1;
  1293. v4u32 reg0, reg1, reg2, reg3, rgba_scale;
  1294. v8i16 zero = {0};
  1295. rgba_scale[0] = value;
  1296. rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
  1297. rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
  1298. for (x = 0; x < width; x += 4) {
  1299. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
  1300. vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
  1301. vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
  1302. reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
  1303. reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
  1304. reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
  1305. reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
  1306. reg0 *= rgba_scale;
  1307. reg1 *= rgba_scale;
  1308. reg2 *= rgba_scale;
  1309. reg3 *= rgba_scale;
  1310. reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
  1311. reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
  1312. reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
  1313. reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
  1314. vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  1315. vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  1316. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1317. ST_UB(dst0, dst_argb);
  1318. src_argb += 16;
  1319. dst_argb += 16;
  1320. }
  1321. }
  1322. void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  1323. int x;
  1324. v16u8 src0, src1, vec0, vec1, dst0, dst1;
  1325. v8u16 reg0;
  1326. v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
  1327. v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
  1328. for (x = 0; x < width; x += 8) {
  1329. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
  1330. src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
  1331. vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
  1332. vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
  1333. reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
  1334. reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
  1335. reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
  1336. vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
  1337. vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
  1338. dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
  1339. dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
  1340. ST_UB2(dst0, dst1, dst_argb, 16);
  1341. src_argb += 32;
  1342. dst_argb += 32;
  1343. }
  1344. }
  1345. void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) {
  1346. int x;
  1347. v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
  1348. v8u16 reg0, reg1, reg2;
  1349. v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
  1350. v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
  1351. v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
  1352. v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
  1353. v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
  1354. v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
  1355. v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
  1356. for (x = 0; x < width; x += 8) {
  1357. src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
  1358. src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
  1359. vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
  1360. vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
  1361. vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
  1362. reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
  1363. reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
  1364. reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
  1365. reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
  1366. reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
  1367. reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
  1368. reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
  1369. reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
  1370. reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
  1371. reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
  1372. reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
  1373. vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
  1374. vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
  1375. vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
  1376. vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
  1377. vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
  1378. dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
  1379. dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
  1380. ST_UB2(dst0, dst1, dst_argb, 16);
  1381. dst_argb += 32;
  1382. }
  1383. }
  1384. void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
  1385. uint8_t* dst_argb,
  1386. int width) {
  1387. int x;
  1388. v16u8 src0, src1;
  1389. v8u16 vec0, vec1, vec2, vec3;
  1390. v16u8 dst0, dst1, dst2, dst3;
  1391. for (x = 0; x < width; x += 16) {
  1392. src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
  1393. src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
  1394. vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
  1395. vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
  1396. vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
  1397. vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
  1398. vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
  1399. vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
  1400. vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
  1401. vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
  1402. dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
  1403. dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
  1404. dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
  1405. dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
  1406. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  1407. src_argb4444 += 32;
  1408. dst_argb += 64;
  1409. }
  1410. }
  1411. void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
  1412. uint8_t* dst_argb,
  1413. int width) {
  1414. int x;
  1415. v8u16 src0, src1;
  1416. v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  1417. v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
  1418. v16u8 dst0, dst1, dst2, dst3;
  1419. v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  1420. for (x = 0; x < width; x += 16) {
  1421. src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0);
  1422. src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16);
  1423. vec0 = src0 & const_0x1F;
  1424. vec1 = src1 & const_0x1F;
  1425. src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
  1426. src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
  1427. vec2 = src0 & const_0x1F;
  1428. vec3 = src1 & const_0x1F;
  1429. src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
  1430. src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
  1431. vec4 = src0 & const_0x1F;
  1432. vec5 = src1 & const_0x1F;
  1433. src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
  1434. src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
  1435. reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1436. reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  1437. reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
  1438. reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  1439. reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
  1440. reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
  1441. reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
  1442. reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
  1443. reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
  1444. reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
  1445. reg3 = -reg3;
  1446. reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
  1447. reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
  1448. reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
  1449. reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
  1450. dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
  1451. dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
  1452. dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
  1453. dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
  1454. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  1455. src_argb1555 += 32;
  1456. dst_argb += 64;
  1457. }
  1458. }
  1459. void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
  1460. uint8_t* dst_argb,
  1461. int width) {
  1462. int x;
  1463. v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
  1464. v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  1465. v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
  1466. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  1467. v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  1468. v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
  1469. v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
  1470. for (x = 0; x < width; x += 16) {
  1471. src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0);
  1472. src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16);
  1473. vec0 = src0 & const_0x1F;
  1474. vec1 = src0 & const_0x7E0;
  1475. vec2 = src0 & const_0xF800;
  1476. vec3 = src1 & const_0x1F;
  1477. vec4 = src1 & const_0x7E0;
  1478. vec5 = src1 & const_0xF800;
  1479. reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
  1480. reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
  1481. reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
  1482. reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
  1483. reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
  1484. reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
  1485. reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
  1486. reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
  1487. reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
  1488. reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
  1489. reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
  1490. reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
  1491. res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
  1492. res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
  1493. res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
  1494. res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
  1495. dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
  1496. dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
  1497. dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
  1498. dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
  1499. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  1500. src_rgb565 += 32;
  1501. dst_argb += 64;
  1502. }
  1503. }
  1504. void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24,
  1505. uint8_t* dst_argb,
  1506. int width) {
  1507. int x;
  1508. v16u8 src0, src1, src2;
  1509. v16u8 vec0, vec1, vec2;
  1510. v16u8 dst0, dst1, dst2, dst3;
  1511. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  1512. v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  1513. for (x = 0; x < width; x += 16) {
  1514. src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0);
  1515. src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16);
  1516. src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32);
  1517. vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
  1518. vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
  1519. vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
  1520. dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
  1521. dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
  1522. dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
  1523. dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
  1524. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  1525. src_rgb24 += 48;
  1526. dst_argb += 64;
  1527. }
  1528. }
  1529. void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  1530. int x;
  1531. v16u8 src0, src1, src2;
  1532. v16u8 vec0, vec1, vec2;
  1533. v16u8 dst0, dst1, dst2, dst3;
  1534. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  1535. v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
  1536. for (x = 0; x < width; x += 16) {
  1537. src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
  1538. src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
  1539. src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
  1540. vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
  1541. vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
  1542. vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
  1543. dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
  1544. dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
  1545. dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
  1546. dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
  1547. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  1548. src_raw += 48;
  1549. dst_argb += 64;
  1550. }
  1551. }
  1552. void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
  1553. uint8_t* dst_y,
  1554. int width) {
  1555. int x;
  1556. v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
  1557. v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  1558. v16u8 dst0;
  1559. v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  1560. v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  1561. v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  1562. v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  1563. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  1564. for (x = 0; x < width; x += 16) {
  1565. src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
  1566. src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
  1567. vec0 = src0 & const_0x1F;
  1568. vec1 = src1 & const_0x1F;
  1569. src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
  1570. src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
  1571. vec2 = src0 & const_0x1F;
  1572. vec3 = src1 & const_0x1F;
  1573. src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
  1574. src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
  1575. vec4 = src0 & const_0x1F;
  1576. vec5 = src1 & const_0x1F;
  1577. reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
  1578. reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
  1579. reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
  1580. reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
  1581. reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
  1582. reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
  1583. reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
  1584. reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
  1585. reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
  1586. reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
  1587. reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
  1588. reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
  1589. reg0 *= const_0x19;
  1590. reg1 *= const_0x19;
  1591. reg2 *= const_0x81;
  1592. reg3 *= const_0x81;
  1593. reg4 *= const_0x42;
  1594. reg5 *= const_0x42;
  1595. reg0 += reg2;
  1596. reg1 += reg3;
  1597. reg0 += reg4;
  1598. reg1 += reg5;
  1599. reg0 += const_0x1080;
  1600. reg1 += const_0x1080;
  1601. reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
  1602. reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
  1603. dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
  1604. ST_UB(dst0, dst_y);
  1605. src_argb1555 += 32;
  1606. dst_y += 16;
  1607. }
  1608. }
  // Computes one 8-bit output per 16-bit RGB565 pixel and stores it to dst_y.
  // Each loop pass loads 32 source bytes (16 pixels) and stores 16 bytes; the
  // loop repeats while x < width with x stepping by 16.
  // NOTE(review): the net per-pixel result appears to be
  //   (0x19 * f0 + 0x81 * f1 + 0x42 * f2 + 0x1080) >> 8
  // where f0/f1/f2 are the 8-bit expansions of the 0x1F / 0x7E0 / 0xF800
  // fields - confirm against the C reference row function.
  1609. void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  1610. int x;
  1611. v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  1612. v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  1613. v4u32 res0, res1, res2, res3;
  1614. v16u8 dst0;
  // Each 32-bit lane packs two 16-bit weights: 0x0081:0x0019 and
  // 0x0001:0x0042. They are reinterpreted as halfword vectors for the dot
  // products below.
  1615. v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
  1616. v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
  // Bias term; it is interleaved with the third field so that the 0x0001
  // weight above adds it inside the dot product.
  1617. v8i16 const_0x1080 = __msa_fill_h(0x1080);
  // Bit masks for the three fields of a 5:6:5 pixel.
  1618. v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  1619. v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
  1620. v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
  1621. for (x = 0; x < width; x += 16) {
  // Two vectors of 8 pixels each.
  1622. src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
  1623. src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
  // Isolate the fields in place (no shifting yet).
  1624. vec0 = src0 & const_0x1F;
  1625. vec1 = src0 & const_0x7E0;
  1626. vec2 = src0 & const_0xF800;
  1627. vec3 = src1 & const_0x1F;
  1628. vec4 = src1 & const_0x7E0;
  1629. vec5 = src1 & const_0xF800;
  // Move each field so its top bit lands on bit 7 ...
  1630. reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
  1631. reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
  1632. reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
  1633. reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
  1634. reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
  1635. reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
  // ... then OR in the field's top bits (3, 2 and 3 bits respectively) to
  // fill the low bits of the 8-bit value.
  1636. reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
  1637. reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
  1638. reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
  1639. reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
  1640. reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
  1641. reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
  // Interleave field 0 with field 1, and field 2 with the bias constant, so
  // every 32-bit lane holds one halfword pair per pixel.
  1642. vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
  1643. vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
  1644. vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
  1645. vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
  1646. vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
  1647. vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
  1648. vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
  1649. vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
  // 32-bit dot products of the (field0, field1) pairs with the first weight
  // pair ...
  1650. res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
  1651. res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
  1652. res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
  1653. res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
  // ... accumulated with the (field2, bias) pairs times the second weight
  // pair.
  1654. res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
  1655. res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
  1656. res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
  1657. res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
  // Drop the 8 fractional bits.
  1658. res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
  1659. res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
  1660. res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
  1661. res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
  // Narrow 32 -> 16 -> 8 bits by keeping the even (low) elements.
  1662. vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
  1663. vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
  1664. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1665. ST_UB(dst0, dst_y);
  // Advance by 16 pixels: 32 source bytes, 16 output bytes.
  1666. src_rgb565 += 32;
  1667. dst_y += 16;
  1668. }
  1669. }
  1670. void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  1671. int x;
  1672. v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
  1673. v8u16 vec0, vec1, vec2, vec3;
  1674. v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
  1675. v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
  1676. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  1677. v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
  1678. v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
  1679. 18, 19, 20, 21, 21, 22, 23, 24};
  1680. v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
  1681. v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
  1682. v16i8 zero = {0};
  1683. for (x = 0; x < width; x += 16) {
  1684. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  1685. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  1686. src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
  1687. reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
  1688. reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
  1689. reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
  1690. reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
  1691. vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  1692. vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  1693. vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
  1694. vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
  1695. vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
  1696. vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
  1697. vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
  1698. vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
  1699. vec0 += const_0x1080;
  1700. vec1 += const_0x1080;
  1701. vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
  1702. vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
  1703. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1704. ST_UB(dst0, dst_y);
  1705. src_argb0 += 48;
  1706. dst_y += 16;
  1707. }
  1708. }
  1709. void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  1710. int x;
  1711. v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
  1712. v8u16 vec0, vec1, vec2, vec3;
  1713. v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
  1714. v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
  1715. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  1716. v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
  1717. v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
  1718. 18, 19, 20, 21, 21, 22, 23, 24};
  1719. v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
  1720. v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
  1721. v16i8 zero = {0};
  1722. for (x = 0; x < width; x += 16) {
  1723. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  1724. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  1725. src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
  1726. reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
  1727. reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
  1728. reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
  1729. reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
  1730. vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  1731. vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  1732. vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
  1733. vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
  1734. vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
  1735. vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
  1736. vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
  1737. vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
  1738. vec0 += const_0x1080;
  1739. vec1 += const_0x1080;
  1740. vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
  1741. vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
  1742. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  1743. ST_UB(dst0, dst_y);
  1744. src_argb0 += 48;
  1745. dst_y += 16;
  1746. }
  1747. }
  // Computes two 8-bit outputs (written to dst_u and dst_v) for every 2x2
  // group of ARGB1555 pixels taken from two rows.
  //   src_argb1555:        first source row.
  //   src_stride_argb1555: byte offset from the first row to the second row.
  //   dst_u, dst_v:        output rows; 8 bytes are stored to each per pass.
  //   width:               pixel count; the loop repeats while x < width with
  //                        x stepping by 16.
  1748. void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
  1749. int src_stride_argb1555,
  1750. uint8_t* dst_u,
  1751. uint8_t* dst_v,
  1752. int width) {
  1753. int x;
  // s and t address the two source rows as 16-bit pixels.
  1754. const uint16_t* s = (const uint16_t*)src_argb1555;
  1755. const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
  1756. int64_t res0, res1;
  1757. v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
  1758. v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
  1759. v16u8 dst0;
  // Weights and bias used by the two weighted sums below.
  1760. v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  1761. v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  1762. v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  1763. v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  1764. v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  1765. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  // Mask for one 5-bit field.
  1766. v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  1767. for (x = 0; x < width; x += 16) {
  // 16 pixels from each of the two rows.
  1768. src0 = (v8u16)__msa_ld_b((void*)s, 0);
  1769. src1 = (v8u16)__msa_ld_b((void*)s, 16);
  1770. src2 = (v8u16)__msa_ld_b((void*)t, 0);
  1771. src3 = (v8u16)__msa_ld_b((void*)t, 16);
  // Field at bits 4..0: add the two rows, then narrow the 16 sums to bytes.
  1772. vec0 = src0 & const_0x1F;
  1773. vec1 = src1 & const_0x1F;
  1774. vec0 += src2 & const_0x1F;
  1775. vec1 += src3 & const_0x1F;
  1776. vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  // Shift the next 5-bit field down and repeat (result in vec2).
  1777. src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
  1778. src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
  1779. src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
  1780. src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
  1781. vec2 = src0 & const_0x1F;
  1782. vec3 = src1 & const_0x1F;
  1783. vec2 += src2 & const_0x1F;
  1784. vec3 += src3 & const_0x1F;
  1785. vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  // Third 5-bit field (result in vec4).
  1786. src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
  1787. src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
  1788. src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
  1789. src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
  1790. vec4 = src0 & const_0x1F;
  1791. vec5 = src1 & const_0x1F;
  1792. vec4 += src2 & const_0x1F;
  1793. vec5 += src3 & const_0x1F;
  1794. vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
  // Add horizontally adjacent byte sums, giving one 16-bit sum per 2x2 block
  // for each field.
  1795. vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
  1796. vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
  1797. vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
  // Rescale each sum as (v << 1) | (v >> 6). From here on the fields live in
  // vec6 (first field), vec0 (second field) and vec2 (third field).
  1798. vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
  1799. vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
  1800. vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
  1801. vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
  1802. vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
  1803. vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
  // reg0 = 0x70*vec6 + 0x8080 - (0x4A*vec0 + 0x26*vec2)
  // reg2 = 0x70*vec2 + 0x8080 - (0x5E*vec0 + 0x12*vec6)
  1804. reg0 = vec6 * const_0x70;
  1805. reg1 = vec0 * const_0x4A;
  1806. reg2 = vec2 * const_0x70;
  1807. reg3 = vec0 * const_0x5E;
  1808. reg0 += const_0x8080;
  1809. reg1 += vec2 * const_0x26;
  1810. reg2 += const_0x8080;
  1811. reg3 += vec6 * const_0x12;
  1812. reg0 -= reg1;
  1813. reg2 -= reg3;
  // Drop the 8 fractional bits and pack both results into one vector.
  1814. reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
  1815. reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
  1816. dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
  // Doubleword 0 goes to dst_u and doubleword 1 to dst_v.
  // NOTE(review): presumably these are the packed reg0 and reg2 results
  // respectively - confirm against the pckev_b operand order.
  1817. res0 = __msa_copy_u_d((v2i64)dst0, 0);
  1818. res1 = __msa_copy_u_d((v2i64)dst0, 1);
  1819. SD(res0, dst_u);
  1820. SD(res1, dst_v);
  // s and t are uint16_t pointers, so += 16 advances 16 pixels (32 bytes).
  1821. s += 16;
  1822. t += 16;
  1823. dst_u += 8;
  1824. dst_v += 8;
  1825. }
  1826. }
  // Computes two 8-bit outputs (written to dst_u and dst_v) for every 2x2
  // group of RGB565 pixels taken from two rows.
  //   src_rgb565:        first source row.
  //   src_stride_rgb565: byte offset from the first row to the second row.
  //   dst_u, dst_v:      output rows; 8 bytes are stored to each per pass.
  //   width:             pixel count; the loop repeats while x < width with
  //                      x stepping by 16.
  1827. void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
  1828. int src_stride_rgb565,
  1829. uint8_t* dst_u,
  1830. uint8_t* dst_v,
  1831. int width) {
  1832. int x;
  // s and t address the two source rows as 16-bit pixels.
  1833. const uint16_t* s = (const uint16_t*)src_rgb565;
  1834. const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
  1835. int64_t res0, res1;
  1836. v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
  1837. v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  1838. v16u8 dst0;
  // Weights and bias used by the two weighted sums below (const_32896 holds
  // 0x8080 in every halfword).
  1839. v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  1840. v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  1841. v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  1842. v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  1843. v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  1844. v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
  // Masks for the 5-bit and 6-bit fields.
  1845. v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  1846. v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
  1847. for (x = 0; x < width; x += 16) {
  // 16 pixels from each of the two rows.
  1848. src0 = (v8u16)__msa_ld_b((void*)s, 0);
  1849. src1 = (v8u16)__msa_ld_b((void*)s, 16);
  1850. src2 = (v8u16)__msa_ld_b((void*)t, 0);
  1851. src3 = (v8u16)__msa_ld_b((void*)t, 16);
  // 5-bit field at bits 4..0: add the two rows, narrow the sums to bytes.
  1852. vec0 = src0 & const_0x1F;
  1853. vec1 = src1 & const_0x1F;
  1854. vec0 += src2 & const_0x1F;
  1855. vec1 += src3 & const_0x1F;
  1856. vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  // 6-bit middle field (result in vec1).
  1857. src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
  1858. src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
  1859. src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
  1860. src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
  1861. vec2 = src0 & const_0x3F;
  1862. vec3 = src1 & const_0x3F;
  1863. vec2 += src2 & const_0x3F;
  1864. vec3 += src3 & const_0x3F;
  1865. vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  // 5-bit top field (result in vec2).
  1866. src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
  1867. src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
  1868. src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
  1869. src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
  1870. vec4 = src0 & const_0x1F;
  1871. vec5 = src1 & const_0x1F;
  1872. vec4 += src2 & const_0x1F;
  1873. vec5 += src3 & const_0x1F;
  1874. vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
  // Add horizontally adjacent byte sums, giving one 16-bit sum per 2x2 block
  // for each field.
  1875. vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
  1876. vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
  1877. vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
  // Only the two 5-bit sums are rescaled as (v << 1) | (v >> 6); the 6-bit
  // sum in vec1 is used as is.
  1878. vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
  1879. vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
  1880. vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
  1881. vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
  // reg0 = 0x70*vec3 + 0x8080 - (0x4A*vec1 + 0x26*vec4)
  // reg2 = 0x70*vec4 + 0x8080 - (0x5E*vec1 + 0x12*vec3)
  1882. reg0 = vec3 * const_0x70;
  1883. reg1 = vec1 * const_0x4A;
  1884. reg2 = vec4 * const_0x70;
  1885. reg3 = vec1 * const_0x5E;
  1886. reg0 += const_32896;
  1887. reg1 += vec4 * const_0x26;
  1888. reg2 += const_32896;
  1889. reg3 += vec3 * const_0x12;
  1890. reg0 -= reg1;
  1891. reg2 -= reg3;
  // Drop the 8 fractional bits and pack both results into one vector.
  1892. reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
  1893. reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
  1894. dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
  // Doubleword 0 goes to dst_u and doubleword 1 to dst_v.
  // NOTE(review): presumably these are the packed reg0 and reg2 results
  // respectively - confirm against the pckev_b operand order.
  1895. res0 = __msa_copy_u_d((v2i64)dst0, 0);
  1896. res1 = __msa_copy_u_d((v2i64)dst0, 1);
  1897. SD(res0, dst_u);
  1898. SD(res1, dst_v);
  // s and t are uint16_t pointers, so += 16 advances 16 pixels (32 bytes).
  1899. s += 16;
  1900. t += 16;
  1901. dst_u += 8;
  1902. dst_v += 8;
  1903. }
  1904. }
  // Computes two 8-bit outputs (written to dst_u and dst_v) for every 2x2
  // group of packed 3-byte RGB24 pixels taken from two rows.
  //   src_rgb0:       first source row.
  //   src_stride_rgb: byte offset from the first row to the second row.
  //   dst_u, dst_v:   output rows; 8 bytes are stored to each per pass.
  //   width:          pixel count; the loop repeats while x < width with x
  //                   stepping by 16 (48 source bytes per row per pass).
  1905. void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
  1906. int src_stride_rgb,
  1907. uint8_t* dst_u,
  1908. uint8_t* dst_v,
  1909. int width) {
  1910. int x;
  1911. const uint8_t* s = src_rgb0;
  1912. const uint8_t* t = src_rgb0 + src_stride_rgb;
  1913. int64_t res0, res1;
  1914. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1915. v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
  1916. v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  1917. v8i16 reg0, reg1, reg2, reg3;
  1918. v16u8 dst0;
  // Weights and bias used by the two weighted sums below.
  1919. v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
  1920. v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
  1921. v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
  1922. v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
  1923. v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
  1924. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  // Spreads four 3-byte pixels into 4-byte slots; indices 16..19 select from
  // the other shuffle operand (the zero vector below).
  1925. v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  1926. v16i8 zero = {0};
  1927. for (x = 0; x < width; x += 16) {
  // 48 bytes (16 pixels) from each row.
  1928. inp0 = (v16u8)__msa_ld_b((void*)s, 0);
  1929. inp1 = (v16u8)__msa_ld_b((void*)s, 16);
  1930. inp2 = (v16u8)__msa_ld_b((void*)s, 32);
  1931. inp3 = (v16u8)__msa_ld_b((void*)t, 0);
  1932. inp4 = (v16u8)__msa_ld_b((void*)t, 16);
  1933. inp5 = (v16u8)__msa_ld_b((void*)t, 32);
  // Realign so each vector starts on a 12-byte (4-pixel) boundary:
  // byte offsets 12, 24 and 36 of each row.
  1934. src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
  1935. src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
  1936. src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
  1937. src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
  1938. src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
  1939. src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
  // Expand to 4 bytes per pixel; src0..src3 hold the first row and
  // src4..src7 the second row.
  1940. src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
  1941. src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
  1942. src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
  1943. src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
  1944. src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
  1945. src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
  1946. src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
  1947. src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
  // Interleave the bytes of the two rows so vertically adjacent bytes sit
  // next to each other ...
  1948. vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
  1949. vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
  1950. vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
  1951. vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
  1952. vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
  1953. vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
  1954. vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
  1955. vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
  // ... and add each such pair into a 16-bit vertical sum.
  1956. vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
  1957. vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
  1958. vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
  1959. vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
  1960. vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
  1961. vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
  1962. vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
  1963. vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
  // Add the even and odd doublewords (horizontally adjacent pixels), giving
  // one 4-halfword sum per 2x2 block.
  1964. reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
  1965. reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
  1966. reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
  1967. reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
  1968. reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
  1969. reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
  1970. reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
  1971. reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
  // Divide the four-sample sums by 4.
  1972. reg0 = __msa_srai_h((v8i16)reg0, 2);
  1973. reg1 = __msa_srai_h((v8i16)reg1, 2);
  1974. reg2 = __msa_srai_h((v8i16)reg2, 2);
  1975. reg3 = __msa_srai_h((v8i16)reg3, 2);
  // De-interleave the per-block channel averages. After these packs vec0,
  // vec1 and vec2 each hold one channel for the 8 blocks.
  // NOTE(review): presumably vec0/vec1/vec2 are source bytes 0/1/2 of each
  // pixel - confirm against the pckev/pckod operand order.
  1976. vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
  1977. vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
  1978. vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
  1979. vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
  1980. vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
  1981. vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
  1982. vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
  // reg0 = 0x70*vec0 - 0x4A*vec1 + (0x8080 - 0x26*vec2)
  // reg2 = 0x70*vec2 - 0x5E*vec1 + (0x8080 - 0x12*vec0)
  // (vec0/vec1/vec2 here are the values before the in-place multiplies).
  1983. vec3 = vec0 * const_0x70;
  1984. vec4 = vec1 * const_0x4A;
  1985. vec5 = vec2 * const_0x26;
  1986. vec2 *= const_0x70;
  1987. vec1 *= const_0x5E;
  1988. vec0 *= const_0x12;
  1989. reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
  1990. reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
  1991. reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
  1992. reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
  1993. reg0 += reg1;
  1994. reg2 += reg3;
  // Drop the 8 fractional bits and pack both results into one vector.
  1995. reg0 = __msa_srai_h(reg0, 8);
  1996. reg2 = __msa_srai_h(reg2, 8);
  1997. dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
  // Doubleword 0 goes to dst_u and doubleword 1 to dst_v.
  1998. res0 = __msa_copy_u_d((v2i64)dst0, 0);
  1999. res1 = __msa_copy_u_d((v2i64)dst0, 1);
  2000. SD(res0, dst_u);
  2001. SD(res1, dst_v);
  // Advance both rows by 16 pixels (48 bytes) and the outputs by 8 bytes.
  2002. t += 48;
  2003. s += 48;
  2004. dst_u += 8;
  2005. dst_v += 8;
  2006. }
  2007. }
  // Computes two 8-bit outputs (written to dst_u and dst_v) for every 2x2
  // group of packed 3-byte RAW pixels taken from two rows. The structure
  // matches RGB24ToUVRow_MSA except that the packs producing vec0 and vec2
  // are swapped (pckod for vec0, pckev for vec2).
  //   src_rgb0:       first source row.
  //   src_stride_rgb: byte offset from the first row to the second row.
  //   dst_u, dst_v:   output rows; 8 bytes are stored to each per pass.
  //   width:          pixel count; the loop repeats while x < width with x
  //                   stepping by 16 (48 source bytes per row per pass).
  2008. void RAWToUVRow_MSA(const uint8_t* src_rgb0,
  2009. int src_stride_rgb,
  2010. uint8_t* dst_u,
  2011. uint8_t* dst_v,
  2012. int width) {
  2013. int x;
  2014. const uint8_t* s = src_rgb0;
  2015. const uint8_t* t = src_rgb0 + src_stride_rgb;
  2016. int64_t res0, res1;
  2017. v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
  2018. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  2019. v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2020. v8i16 reg0, reg1, reg2, reg3;
  2021. v16u8 dst0;
  // Weights and bias used by the two weighted sums below.
  2022. v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
  2023. v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
  2024. v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
  2025. v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
  2026. v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
  2027. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  // Spreads four 3-byte pixels into 4-byte slots; indices 16..19 select from
  // the other shuffle operand (the zero vector below).
  2028. v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  2029. v16i8 zero = {0};
  2030. for (x = 0; x < width; x += 16) {
  // 48 bytes (16 pixels) from each row.
  2031. inp0 = (v16u8)__msa_ld_b((void*)s, 0);
  2032. inp1 = (v16u8)__msa_ld_b((void*)s, 16);
  2033. inp2 = (v16u8)__msa_ld_b((void*)s, 32);
  2034. inp3 = (v16u8)__msa_ld_b((void*)t, 0);
  2035. inp4 = (v16u8)__msa_ld_b((void*)t, 16);
  2036. inp5 = (v16u8)__msa_ld_b((void*)t, 32);
  // Realign so each vector starts on a 12-byte (4-pixel) boundary:
  // byte offsets 12, 24 and 36 of each row.
  2037. src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
  2038. src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
  2039. src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
  2040. src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
  2041. src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
  2042. src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
  // Expand to 4 bytes per pixel; src0..src3 hold the first row and
  // src4..src7 the second row.
  2043. src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
  2044. src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
  2045. src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
  2046. src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
  2047. src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
  2048. src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
  2049. src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
  2050. src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
  // Interleave the bytes of the two rows so vertically adjacent bytes sit
  // next to each other ...
  2051. vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
  2052. vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
  2053. vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
  2054. vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
  2055. vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
  2056. vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
  2057. vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
  2058. vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
  // ... and add each such pair into a 16-bit vertical sum.
  2059. vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
  2060. vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
  2061. vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
  2062. vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
  2063. vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
  2064. vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
  2065. vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
  2066. vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
  // Add the even and odd doublewords (horizontally adjacent pixels), giving
  // one 4-halfword sum per 2x2 block.
  2067. reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
  2068. reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
  2069. reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
  2070. reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
  2071. reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
  2072. reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
  2073. reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
  2074. reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
  // Divide the four-sample sums by 4.
  2075. reg0 = __msa_srai_h(reg0, 2);
  2076. reg1 = __msa_srai_h(reg1, 2);
  2077. reg2 = __msa_srai_h(reg2, 2);
  2078. reg3 = __msa_srai_h(reg3, 2);
  // De-interleave the per-block channel averages. vec0 is built with pckod
  // and vec2 with pckev, the reverse of RGB24ToUVRow_MSA.
  // NOTE(review): presumably vec0/vec1/vec2 are source bytes 2/1/0 of each
  // pixel - confirm against the pckev/pckod operand order.
  2079. vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  2080. vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  2081. vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
  2082. vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
  2083. vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
  2084. vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
  2085. vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
  // reg0 = 0x70*vec0 - 0x4A*vec1 + (0x8080 - 0x26*vec2)
  // reg2 = 0x70*vec2 - 0x5E*vec1 + (0x8080 - 0x12*vec0)
  // (vec0/vec1/vec2 here are the values before the in-place multiplies).
  2086. vec3 = vec0 * const_0x70;
  2087. vec4 = vec1 * const_0x4A;
  2088. vec5 = vec2 * const_0x26;
  2089. vec2 *= const_0x70;
  2090. vec1 *= const_0x5E;
  2091. vec0 *= const_0x12;
  2092. reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
  2093. reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
  2094. reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
  2095. reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
  2096. reg0 += reg1;
  2097. reg2 += reg3;
  // Drop the 8 fractional bits and pack both results into one vector.
  2098. reg0 = __msa_srai_h(reg0, 8);
  2099. reg2 = __msa_srai_h(reg2, 8);
  2100. dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
  // Doubleword 0 goes to dst_u and doubleword 1 to dst_v.
  2101. res0 = __msa_copy_u_d((v2i64)dst0, 0);
  2102. res1 = __msa_copy_u_d((v2i64)dst0, 1);
  2103. SD(res0, dst_u);
  2104. SD(res1, dst_v);
  // Advance both rows by 16 pixels (48 bytes) and the outputs by 8 bytes.
  2105. t += 48;
  2106. s += 48;
  2107. dst_u += 8;
  2108. dst_v += 8;
  2109. }
  2110. }
  2111. void NV12ToARGBRow_MSA(const uint8_t* src_y,
  2112. const uint8_t* src_uv,
  2113. uint8_t* dst_argb,
  2114. const struct YuvConstants* yuvconstants,
  2115. int width) {
  2116. int x;
  2117. uint64_t val0, val1;
  2118. v16u8 src0, src1, res0, res1, dst0, dst1;
  2119. v8i16 vec0, vec1, vec2;
  2120. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  2121. v4i32 vec_ubvr, vec_ugvg;
  2122. v16u8 zero = {0};
  2123. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2124. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  2125. vec_br, vec_yg);
  2126. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  2127. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  2128. for (x = 0; x < width; x += 8) {
  2129. val0 = LD(src_y);
  2130. val1 = LD(src_uv);
  2131. src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
  2132. src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
  2133. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  2134. vec0, vec1, vec2);
  2135. res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
  2136. res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
  2137. dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
  2138. dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
  2139. ST_UB2(dst0, dst1, dst_argb, 16);
  2140. src_y += 8;
  2141. src_uv += 8;
  2142. dst_argb += 32;
  2143. }
  2144. }
  2145. void NV12ToRGB565Row_MSA(const uint8_t* src_y,
  2146. const uint8_t* src_uv,
  2147. uint8_t* dst_rgb565,
  2148. const struct YuvConstants* yuvconstants,
  2149. int width) {
  2150. int x;
  2151. uint64_t val0, val1;
  2152. v16u8 src0, src1, dst0;
  2153. v8i16 vec0, vec1, vec2;
  2154. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  2155. v4i32 vec_ubvr, vec_ugvg;
  2156. v16u8 zero = {0};
  2157. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  2158. vec_br, vec_yg);
  2159. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  2160. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  2161. for (x = 0; x < width; x += 8) {
  2162. val0 = LD(src_y);
  2163. val1 = LD(src_uv);
  2164. src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
  2165. src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
  2166. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  2167. vec0, vec1, vec2);
  2168. vec0 = vec0 >> 3;
  2169. vec1 = (vec1 >> 2) << 5;
  2170. vec2 = (vec2 >> 3) << 11;
  2171. dst0 = (v16u8)(vec0 | vec1 | vec2);
  2172. ST_UB(dst0, dst_rgb565);
  2173. src_y += 8;
  2174. src_uv += 8;
  2175. dst_rgb565 += 16;
  2176. }
  2177. }
  2178. void NV21ToARGBRow_MSA(const uint8_t* src_y,
  2179. const uint8_t* src_vu,
  2180. uint8_t* dst_argb,
  2181. const struct YuvConstants* yuvconstants,
  2182. int width) {
  2183. int x;
  2184. uint64_t val0, val1;
  2185. v16u8 src0, src1, res0, res1, dst0, dst1;
  2186. v8i16 vec0, vec1, vec2;
  2187. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  2188. v4i32 vec_ubvr, vec_ugvg;
  2189. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2190. v16u8 zero = {0};
  2191. v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
  2192. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  2193. vec_br, vec_yg);
  2194. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  2195. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  2196. for (x = 0; x < width; x += 8) {
  2197. val0 = LD(src_y);
  2198. val1 = LD(src_vu);
  2199. src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
  2200. src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
  2201. src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
  2202. YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  2203. vec0, vec1, vec2);
  2204. res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
  2205. res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
  2206. dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
  2207. dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
  2208. ST_UB2(dst0, dst1, dst_argb, 16);
  2209. src_y += 8;
  2210. src_vu += 8;
  2211. dst_argb += 32;
  2212. }
  2213. }
  2214. void SobelRow_MSA(const uint8_t* src_sobelx,
  2215. const uint8_t* src_sobely,
  2216. uint8_t* dst_argb,
  2217. int width) {
  2218. int x;
  2219. v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
  2220. v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
  2221. v16i8 const_0x4 = __msa_ldi_b(0x4);
  2222. v16i8 mask1 = mask0 + const_0x4;
  2223. v16i8 mask2 = mask1 + const_0x4;
  2224. v16i8 mask3 = mask2 + const_0x4;
  2225. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2226. for (x = 0; x < width; x += 16) {
  2227. src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
  2228. src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
  2229. vec0 = __msa_adds_u_b(src0, src1);
  2230. dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
  2231. dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
  2232. dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
  2233. dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
  2234. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  2235. src_sobelx += 16;
  2236. src_sobely += 16;
  2237. dst_argb += 64;
  2238. }
  2239. }
  2240. void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
  2241. const uint8_t* src_sobely,
  2242. uint8_t* dst_y,
  2243. int width) {
  2244. int x;
  2245. v16u8 src0, src1, src2, src3, dst0, dst1;
  2246. for (x = 0; x < width; x += 32) {
  2247. src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
  2248. src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16);
  2249. src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
  2250. src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16);
  2251. dst0 = __msa_adds_u_b(src0, src2);
  2252. dst1 = __msa_adds_u_b(src1, src3);
  2253. ST_UB2(dst0, dst1, dst_y, 16);
  2254. src_sobelx += 32;
  2255. src_sobely += 32;
  2256. dst_y += 32;
  2257. }
  2258. }
  2259. void SobelXYRow_MSA(const uint8_t* src_sobelx,
  2260. const uint8_t* src_sobely,
  2261. uint8_t* dst_argb,
  2262. int width) {
  2263. int x;
  2264. v16u8 src0, src1, vec0, vec1, vec2;
  2265. v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
  2266. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2267. for (x = 0; x < width; x += 16) {
  2268. src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
  2269. src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
  2270. vec0 = __msa_adds_u_b(src0, src1);
  2271. vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
  2272. vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
  2273. reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
  2274. reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
  2275. dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
  2276. dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
  2277. dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
  2278. dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
  2279. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  2280. src_sobelx += 16;
  2281. src_sobely += 16;
  2282. dst_argb += 64;
  2283. }
  2284. }
  2285. void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  2286. int x;
  2287. v16u8 src0, src1, src2, src3, dst0;
  2288. v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
  2289. v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
  2290. v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
  2291. for (x = 0; x < width; x += 16) {
  2292. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  2293. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  2294. src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
  2295. src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
  2296. ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
  2297. dst0);
  2298. ST_UB(dst0, dst_y);
  2299. src_argb0 += 64;
  2300. dst_y += 16;
  2301. }
  2302. }
  2303. void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  2304. int x;
  2305. v16u8 src0, src1, src2, src3, dst0;
  2306. v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
  2307. v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
  2308. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  2309. for (x = 0; x < width; x += 16) {
  2310. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  2311. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  2312. src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
  2313. src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
  2314. ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
  2315. dst0);
  2316. ST_UB(dst0, dst_y);
  2317. src_argb0 += 64;
  2318. dst_y += 16;
  2319. }
  2320. }
  2321. void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  2322. int x;
  2323. v16u8 src0, src1, src2, src3, dst0;
  2324. v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
  2325. v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
  2326. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  2327. for (x = 0; x < width; x += 16) {
  2328. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  2329. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  2330. src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
  2331. src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
  2332. ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
  2333. dst0);
  2334. ST_UB(dst0, dst_y);
  2335. src_argb0 += 64;
  2336. dst_y += 16;
  2337. }
  2338. }
  2339. void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  2340. int x;
  2341. v16u8 src0, src1, src2, src3, dst0;
  2342. v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
  2343. v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
  2344. v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  2345. for (x = 0; x < width; x += 16) {
  2346. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  2347. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  2348. src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
  2349. src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
  2350. ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
  2351. dst0);
  2352. ST_UB(dst0, dst_y);
  2353. src_argb0 += 64;
  2354. dst_y += 16;
  2355. }
  2356. }
  // Produces one row each of U and V samples from two adjacent rows of 4-byte
  // pixels (src_rgb0 and src_rgb0 + src_stride_rgb). Each iteration reads
  // 128 bytes from each row, averages them vertically and then horizontally
  // (a 2x2 box), and stores 16 bytes to dst_v and 16 bytes to dst_u.
  // x advances by 32 per iteration until x >= width.
  2357. void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
  2358. int src_stride_rgb,
  2359. uint8_t* dst_u,
  2360. uint8_t* dst_v,
  2361. int width) {
  2362. int x;
  // s walks the first row, t the row src_stride_rgb bytes after it.
  2363. const uint8_t* s = src_rgb0;
  2364. const uint8_t* t = src_rgb0 + src_stride_rgb;
  2365. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  2366. v16u8 vec0, vec1, vec2, vec3;
  2367. v16u8 dst0, dst1;
  // Byte-shuffle controls and 16-bit constants handed to ARGBTOUV.
  // NOTE(review): their exact roles depend on the ARGBTOUV macro, which is
  // defined outside this chunk.
  2368. v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  2369. v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
  2370. 18, 19, 22, 23, 26, 27, 30, 31};
  2371. v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  2372. v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  2373. v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
  2374. v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
  2375. v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
  2376. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  2377. for (x = 0; x < width; x += 32) {
  // First 64 bytes of each row.
  2378. src0 = (v16u8)__msa_ld_b((void*)s, 0);
  2379. src1 = (v16u8)__msa_ld_b((void*)s, 16);
  2380. src2 = (v16u8)__msa_ld_b((void*)s, 32);
  2381. src3 = (v16u8)__msa_ld_b((void*)s, 48);
  2382. src4 = (v16u8)__msa_ld_b((void*)t, 0);
  2383. src5 = (v16u8)__msa_ld_b((void*)t, 16);
  2384. src6 = (v16u8)__msa_ld_b((void*)t, 32);
  2385. src7 = (v16u8)__msa_ld_b((void*)t, 48);
  // Vertical step: rounded byte-wise average of the two rows.
  2386. src0 = __msa_aver_u_b(src0, src4);
  2387. src1 = __msa_aver_u_b(src1, src5);
  2388. src2 = __msa_aver_u_b(src2, src6);
  2389. src3 = __msa_aver_u_b(src3, src7);
  // Horizontal step: gather the even and the odd 32-bit pixels into
  // separate vectors, then average each even pixel with its odd neighbour.
  2390. src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
  2391. src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
  2392. src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
  2393. src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
  2394. vec0 = __msa_aver_u_b(src4, src6);
  2395. vec1 = __msa_aver_u_b(src5, src7);
  // Same two steps for the next 64 bytes of each row; src0..src7 are reused.
  2396. src0 = (v16u8)__msa_ld_b((void*)s, 64);
  2397. src1 = (v16u8)__msa_ld_b((void*)s, 80);
  2398. src2 = (v16u8)__msa_ld_b((void*)s, 96);
  2399. src3 = (v16u8)__msa_ld_b((void*)s, 112);
  2400. src4 = (v16u8)__msa_ld_b((void*)t, 64);
  2401. src5 = (v16u8)__msa_ld_b((void*)t, 80);
  2402. src6 = (v16u8)__msa_ld_b((void*)t, 96);
  2403. src7 = (v16u8)__msa_ld_b((void*)t, 112);
  2404. src0 = __msa_aver_u_b(src0, src4);
  2405. src1 = __msa_aver_u_b(src1, src5);
  2406. src2 = __msa_aver_u_b(src2, src6);
  2407. src3 = __msa_aver_u_b(src3, src7);
  2408. src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
  2409. src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
  2410. src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
  2411. src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
  2412. vec2 = __msa_aver_u_b(src4, src6);
  2413. vec3 = __msa_aver_u_b(src5, src7);
  // vec0..vec3 now hold the 16 averaged pixels. Note that shuffler1 is
  // passed before shuffler0 here.
  2414. ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
  2415. const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
  2416. dst1);
  // The first macro result goes to dst_v, the second to dst_u.
  2417. ST_UB(dst0, dst_v);
  2418. ST_UB(dst1, dst_u);
  2419. s += 128;
  2420. t += 128;
  2421. dst_v += 16;
  2422. dst_u += 16;
  2423. }
  2424. }
  2425. void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
  2426. int src_stride_rgb,
  2427. uint8_t* dst_u,
  2428. uint8_t* dst_v,
  2429. int width) {
  2430. int x;
  2431. const uint8_t* s = src_rgb0;
  2432. const uint8_t* t = src_rgb0 + src_stride_rgb;
  2433. v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
  2434. v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  2435. v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
  2436. 18, 19, 22, 23, 26, 27, 30, 31};
  2437. v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  2438. v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
  2439. v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  2440. v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
  2441. v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
  2442. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  2443. for (x = 0; x < width; x += 32) {
  2444. READ_ARGB(s, t, vec0, vec1, vec2, vec3);
  2445. ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
  2446. const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
  2447. dst1);
  2448. ST_UB(dst0, dst_v);
  2449. ST_UB(dst1, dst_u);
  2450. s += 128;
  2451. t += 128;
  2452. dst_v += 16;
  2453. dst_u += 16;
  2454. }
  2455. }
  2456. void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
  2457. int src_stride_rgb,
  2458. uint8_t* dst_u,
  2459. uint8_t* dst_v,
  2460. int width) {
  2461. int x;
  2462. const uint8_t* s = src_rgb0;
  2463. const uint8_t* t = src_rgb0 + src_stride_rgb;
  2464. v16u8 src0, src1, src2, src3;
  2465. v16u8 dst0, dst1;
  2466. v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  2467. v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
  2468. 18, 19, 22, 23, 26, 27, 30, 31};
  2469. v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  2470. v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  2471. v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
  2472. v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
  2473. v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  2474. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  2475. for (x = 0; x < width; x += 32) {
  2476. READ_ARGB(s, t, src0, src1, src2, src3);
  2477. ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
  2478. const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
  2479. dst1);
  2480. ST_UB(dst0, dst_u);
  2481. ST_UB(dst1, dst_v);
  2482. s += 128;
  2483. t += 128;
  2484. dst_u += 16;
  2485. dst_v += 16;
  2486. }
  2487. }
  2488. void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
  2489. int src_stride_rgb,
  2490. uint8_t* dst_u,
  2491. uint8_t* dst_v,
  2492. int width) {
  2493. int x;
  2494. const uint8_t* s = src_rgb0;
  2495. const uint8_t* t = src_rgb0 + src_stride_rgb;
  2496. v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
  2497. v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  2498. v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
  2499. 18, 19, 22, 23, 26, 27, 30, 31};
  2500. v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  2501. v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
  2502. v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
  2503. v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
  2504. v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
  2505. v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  2506. for (x = 0; x < width; x += 32) {
  2507. READ_ARGB(s, t, vec0, vec1, vec2, vec3);
  2508. ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
  2509. const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
  2510. dst1);
  2511. ST_UB(dst0, dst_u);
  2512. ST_UB(dst1, dst_v);
  2513. s += 128;
  2514. t += 128;
  2515. dst_u += 16;
  2516. dst_v += 16;
  2517. }
  2518. }
  // Converts a row with separate Y, U and V planes at the same resolution to
  // 4-byte ARGB pixels using the coefficients in yuvconstants. Each iteration
  // advances every source pointer by 8 bytes and stores 32 bytes to dst_argb.
  // x advances by 8 per iteration until x >= width.
  2519. void I444ToARGBRow_MSA(const uint8_t* src_y,
  2520. const uint8_t* src_u,
  2521. const uint8_t* src_v,
  2522. uint8_t* dst_argb,
  2523. const struct YuvConstants* yuvconstants,
  2524. int width) {
  2525. int x;
  2526. v16u8 src0, src1, src2, dst0, dst1;
  2527. v8u16 vec0, vec1, vec2;
  2528. v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  2529. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  2530. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2531. v8i16 zero = {0};
  // Broadcast the conversion coefficients and biases into 32-bit lanes.
  2532. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  2533. vec_br, vec_yg);
  2534. for (x = 0; x < width; x += 8) {
  // src0/src1/src2 receive the Y/U/V input for this iteration.
  2535. READI444(src_y, src_u, src_v, src0, src1, src2);
  // Duplicate each Y byte into both halves of a 16-bit lane (y * 0x0101),
  // zero-extend to 32 bits, scale by vec_yg and keep the upper 16 bits.
  2536. vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
  2537. reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
  2538. reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
  2539. reg0 *= vec_yg;
  2540. reg1 *= vec_yg;
  2541. reg0 = __msa_srai_w(reg0, 16);
  2542. reg1 = __msa_srai_w(reg1, 16);
  // Start three accumulators from the scaled Y: reg4/5 with the vec_br bias,
  // reg2/3 with vec_bg, reg0/1 with vec_bb. reg0/reg1 are updated last
  // because the other two are derived from their unbiased value.
  2543. reg4 = reg0 + vec_br;
  2544. reg5 = reg1 + vec_br;
  2545. reg2 = reg0 + vec_bg;
  2546. reg3 = reg1 + vec_bg;
  2547. reg0 += vec_bb;
  2548. reg1 += vec_bb;
  // Zero-extend the U bytes (reg6/7) and V bytes (reg8/9) to 32 bits.
  2549. vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
  2550. vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
  2551. reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
  2552. reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
  2553. reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
  2554. reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
  // Subtract the chroma contributions: U * ub from reg0/1, U * ug and V * vg
  // from reg2/3, V * vr from reg4/5.
  2555. reg0 -= reg6 * vec_ub;
  2556. reg1 -= reg7 * vec_ub;
  2557. reg2 -= reg6 * vec_ug;
  2558. reg3 -= reg7 * vec_ug;
  2559. reg4 -= reg8 * vec_vr;
  2560. reg5 -= reg9 * vec_vr;
  2561. reg2 -= reg8 * vec_vg;
  2562. reg3 -= reg9 * vec_vg;
  // Drop the 6 fractional bits, then clamp (see CLIP_0TO255).
  2563. reg0 = __msa_srai_w(reg0, 6);
  2564. reg1 = __msa_srai_w(reg1, 6);
  2565. reg2 = __msa_srai_w(reg2, 6);
  2566. reg3 = __msa_srai_w(reg3, 6);
  2567. reg4 = __msa_srai_w(reg4, 6);
  2568. reg5 = __msa_srai_w(reg5, 6);
  2569. CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
  // Narrow each accumulator pair to eight 16-bit lanes.
  2570. vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  2571. vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  2572. vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
  // Combine low bytes into 16-bit pairs (reg0/1 result with reg2/3 result,
  // reg4/5 result with alpha), then interleave the pairs into 4-byte pixels.
  2573. vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
  2574. vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
  2575. dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
  2576. dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
  2577. ST_UB2(dst0, dst1, dst_argb, 16);
  2578. src_y += 8;
  2579. src_u += 8;
  2580. src_v += 8;
  2581. dst_argb += 32;
  2582. }
  2583. }
  // Expands a row of single-byte samples to 4-byte ARGB pixels. Each sample
  // is scaled and offset with fixed constants, clamped to 0..255, written to
  // the first three bytes of its pixel, and followed by a byte from the alpha
  // vector. Each iteration reads 16 bytes from src_y and stores 64 bytes to
  // dst_argb. x advances by 16 per iteration until x >= width.
  2584. void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  2585. int x;
  2586. v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
  2587. v8i16 vec0, vec1;
  2588. v4i32 reg0, reg1, reg2, reg3;
  // Fixed-point scale (32-bit lanes) and additive offset (16-bit lanes).
  2589. v4i32 vec_yg = __msa_fill_w(0x4A35);
  2590. v8i16 vec_ygb = __msa_fill_h(0xFB78);
  2591. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2592. v8i16 max = __msa_ldi_h(0xFF);
  2593. v8i16 zero = {0};
  2594. for (x = 0; x < width; x += 16) {
  2595. src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
  // Duplicate each byte into both halves of a 16-bit lane (y * 0x0101),
  // then zero-extend to 32 bits.
  2596. vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
  2597. vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
  2598. reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
  2599. reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
  2600. reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
  2601. reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
  // Scale by vec_yg and keep the upper 16 bits of each product.
  2602. reg0 *= vec_yg;
  2603. reg1 *= vec_yg;
  2604. reg2 *= vec_yg;
  2605. reg3 *= vec_yg;
  2606. reg0 = __msa_srai_w(reg0, 16);
  2607. reg1 = __msa_srai_w(reg1, 16);
  2608. reg2 = __msa_srai_w(reg2, 16);
  2609. reg3 = __msa_srai_w(reg3, 16);
  // Narrow to 16-bit lanes, add the offset and drop 6 fractional bits.
  2610. vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
  2611. vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
  2612. vec0 += vec_ygb;
  2613. vec1 += vec_ygb;
  2614. vec0 = __msa_srai_h(vec0, 6);
  2615. vec1 = __msa_srai_h(vec1, 6);
  // Clamp to the range 0..255.
  2616. vec0 = __msa_maxi_s_h(vec0, 0);
  2617. vec1 = __msa_maxi_s_h(vec1, 0);
  2618. vec0 = __msa_min_s_h(max, vec0);
  2619. vec1 = __msa_min_s_h(max, vec1);
  // Pack back to 16 bytes, then build pixels: (v, v) pairs interleaved with
  // (v, alpha) pairs give the byte sequence v, v, v, alpha.
  2620. res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  2621. res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
  2622. res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
  2623. res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
  2624. res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
  2625. dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
  2626. dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
  2627. dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
  2628. dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
  2629. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  2630. src_y += 16;
  2631. dst_argb += 64;
  2632. }
  2633. }
  2634. void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  2635. int x;
  2636. v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
  2637. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2638. for (x = 0; x < width; x += 16) {
  2639. src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
  2640. vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
  2641. vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
  2642. vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
  2643. vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
  2644. dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
  2645. dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
  2646. dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
  2647. dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
  2648. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  2649. src_y += 16;
  2650. dst_argb += 64;
  2651. }
  2652. }
  2653. void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
  2654. uint8_t* dst_argb,
  2655. const struct YuvConstants* yuvconstants,
  2656. int width) {
  2657. int x;
  2658. v16u8 src0, src1, src2;
  2659. v8i16 vec0, vec1, vec2;
  2660. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  2661. v4i32 vec_ubvr, vec_ugvg;
  2662. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2663. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  2664. vec_br, vec_yg);
  2665. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  2666. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  2667. for (x = 0; x < width; x += 8) {
  2668. src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
  2669. src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
  2670. src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
  2671. YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  2672. vec0, vec1, vec2);
  2673. STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
  2674. src_yuy2 += 16;
  2675. dst_argb += 32;
  2676. }
  2677. }
  2678. void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
  2679. uint8_t* dst_argb,
  2680. const struct YuvConstants* yuvconstants,
  2681. int width) {
  2682. int x;
  2683. v16u8 src0, src1, src2;
  2684. v8i16 vec0, vec1, vec2;
  2685. v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  2686. v4i32 vec_ubvr, vec_ugvg;
  2687. v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  2688. YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
  2689. vec_br, vec_yg);
  2690. vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  2691. vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
  2692. for (x = 0; x < width; x += 8) {
  2693. src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
  2694. src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
  2695. src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
  2696. YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
  2697. vec0, vec1, vec2);
  2698. STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
  2699. src_uyvy += 16;
  2700. dst_argb += 32;
  2701. }
  2702. }
  2703. void InterpolateRow_MSA(uint8_t* dst_ptr,
  2704. const uint8_t* src_ptr,
  2705. ptrdiff_t src_stride,
  2706. int width,
  2707. int32_t source_y_fraction) {
  2708. int32_t y1_fraction = source_y_fraction;
  2709. int32_t y0_fraction = 256 - y1_fraction;
  2710. uint16_t y_fractions;
  2711. const uint8_t* s = src_ptr;
  2712. const uint8_t* t = src_ptr + src_stride;
  2713. int x;
  2714. v16u8 src0, src1, src2, src3, dst0, dst1;
  2715. v8u16 vec0, vec1, vec2, vec3, y_frac;
  2716. if (0 == y1_fraction) {
  2717. memcpy(dst_ptr, src_ptr, width);
  2718. return;
  2719. }
  2720. if (128 == y1_fraction) {
  2721. for (x = 0; x < width; x += 32) {
  2722. src0 = (v16u8)__msa_ld_b((void*)s, 0);
  2723. src1 = (v16u8)__msa_ld_b((void*)s, 16);
  2724. src2 = (v16u8)__msa_ld_b((void*)t, 0);
  2725. src3 = (v16u8)__msa_ld_b((void*)t, 16);
  2726. dst0 = __msa_aver_u_b(src0, src2);
  2727. dst1 = __msa_aver_u_b(src1, src3);
  2728. ST_UB2(dst0, dst1, dst_ptr, 16);
  2729. s += 32;
  2730. t += 32;
  2731. dst_ptr += 32;
  2732. }
  2733. return;
  2734. }
  2735. y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
  2736. y_frac = (v8u16)__msa_fill_h(y_fractions);
  2737. for (x = 0; x < width; x += 32) {
  2738. src0 = (v16u8)__msa_ld_b((void*)s, 0);
  2739. src1 = (v16u8)__msa_ld_b((void*)s, 16);
  2740. src2 = (v16u8)__msa_ld_b((void*)t, 0);
  2741. src3 = (v16u8)__msa_ld_b((void*)t, 16);
  2742. vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
  2743. vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
  2744. vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
  2745. vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
  2746. vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
  2747. vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
  2748. vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
  2749. vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
  2750. vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
  2751. vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
  2752. vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
  2753. vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
  2754. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  2755. dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  2756. ST_UB2(dst0, dst1, dst_ptr, 16);
  2757. s += 32;
  2758. t += 32;
  2759. dst_ptr += 32;
  2760. }
  2761. }
  2762. void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) {
  2763. int x;
  2764. v4i32 dst0 = __builtin_msa_fill_w(v32);
  2765. for (x = 0; x < width; x += 4) {
  2766. ST_UB(dst0, dst_argb);
  2767. dst_argb += 16;
  2768. }
  2769. }
  2770. void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  2771. int x;
  2772. v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  2773. v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
  2774. v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13,
  2775. 18, 17, 16, 21, 20, 19, 24, 23};
  2776. v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
  2777. 24, 23, 28, 27, 26, 31, 30, 29};
  2778. for (x = 0; x < width; x += 16) {
  2779. src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
  2780. src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
  2781. src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
  2782. src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
  2783. src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
  2784. dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
  2785. dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
  2786. dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
  2787. ST_UB2(dst0, dst1, dst_rgb24, 16);
  2788. ST_UB(dst2, (dst_rgb24 + 32));
  2789. src_raw += 48;
  2790. dst_rgb24 += 48;
  2791. }
  2792. }
  2793. void MergeUVRow_MSA(const uint8_t* src_u,
  2794. const uint8_t* src_v,
  2795. uint8_t* dst_uv,
  2796. int width) {
  2797. int x;
  2798. v16u8 src0, src1, dst0, dst1;
  2799. for (x = 0; x < width; x += 16) {
  2800. src0 = (v16u8)__msa_ld_b((void*)src_u, 0);
  2801. src1 = (v16u8)__msa_ld_b((void*)src_v, 0);
  2802. dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
  2803. dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
  2804. ST_UB2(dst0, dst1, dst_uv, 16);
  2805. src_u += 16;
  2806. src_v += 16;
  2807. dst_uv += 32;
  2808. }
  2809. }
  2810. void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
  2811. uint8_t* dst_a,
  2812. int width) {
  2813. int i;
  2814. v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
  2815. for (i = 0; i < width; i += 16) {
  2816. src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  2817. src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  2818. src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
  2819. src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
  2820. vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  2821. vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  2822. dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
  2823. ST_UB(dst0, dst_a);
  2824. src_argb += 64;
  2825. dst_a += 16;
  2826. }
  2827. }
  // Blends src_argb0 over src_argb1 into dst_argb. For every byte the result
  // is src0 + ((256 - a) * src1 >> 8), where a is byte 3 of the matching
  // src_argb0 pixel; only the low 8 bits of that sum are kept. Byte 3 of
  // every output pixel is then forced to 255. Each iteration reads 32 bytes
  // from each source and stores 32 bytes; x advances by 8 per iteration
  // until x >= width.
  2828. void ARGBBlendRow_MSA(const uint8_t* src_argb0,
  2829. const uint8_t* src_argb1,
  2830. uint8_t* dst_argb,
  2831. int width) {
  2832. int x;
  2833. v16u8 src0, src1, src2, src3, dst0, dst1;
  2834. v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2835. v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
  2836. v8u16 const_256 = (v8u16)__msa_ldi_h(256);
  2837. v16u8 const_255 = (v16u8)__msa_ldi_b(255);
  // Selects byte 3 of every 4-byte pixel for the final bmnz_v.
  2838. v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
  2839. v16i8 zero = {0};
  2840. for (x = 0; x < width; x += 8) {
  2841. src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
  2842. src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
  2843. src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
  2844. src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
  // Zero-extend every byte to 16 bits: vec0..vec3 hold src_argb0,
  // vec4..vec7 hold src_argb1 (two pixels per vector).
  2845. vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
  2846. vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
  2847. vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
  2848. vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
  2849. vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
  2850. vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
  2851. vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
  2852. vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
  // Lanes 3 and 7 are byte 3 of the two pixels in each vector. Broadcast
  // each and combine so every lane carries its own pixel's byte 3.
  2853. vec8 = (v8u16)__msa_fill_h(vec0[3]);
  2854. vec9 = (v8u16)__msa_fill_h(vec0[7]);
  2855. vec10 = (v8u16)__msa_fill_h(vec1[3]);
  2856. vec11 = (v8u16)__msa_fill_h(vec1[7]);
  2857. vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
  2858. vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
  2859. vec10 = (v8u16)__msa_fill_h(vec2[3]);
  2860. vec11 = (v8u16)__msa_fill_h(vec2[7]);
  2861. vec12 = (v8u16)__msa_fill_h(vec3[3]);
  2862. vec13 = (v8u16)__msa_fill_h(vec3[7]);
  2863. vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
  2864. vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
  // Weight for src_argb1: 256 - a.
  2865. vec8 = const_256 - vec8;
  2866. vec9 = const_256 - vec9;
  2867. vec10 = const_256 - vec10;
  2868. vec11 = const_256 - vec11;
  // Scale src_argb1 by the weight and shift right by 8.
  2869. vec8 *= vec4;
  2870. vec9 *= vec5;
  2871. vec10 *= vec6;
  2872. vec11 *= vec7;
  2873. vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
  2874. vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
  2875. vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
  2876. vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
  // Add the src_argb0 values.
  2877. vec0 += vec8;
  2878. vec1 += vec9;
  2879. vec2 += vec10;
  2880. vec3 += vec11;
  // Keep the low byte of every lane (no saturation is applied here).
  2881. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  2882. dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  // Overwrite byte 3 of every pixel with 255.
  2883. dst0 = __msa_bmnz_v(dst0, const_255, mask);
  2884. dst1 = __msa_bmnz_v(dst1, const_255, mask);
  2885. ST_UB2(dst0, dst1, dst_argb, 16);
  2886. src_argb0 += 32;
  2887. src_argb1 += 32;
  2888. dst_argb += 32;
  2889. }
  2890. }
  2891. void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
  2892. int scale,
  2893. int interval_size,
  2894. int interval_offset,
  2895. int width) {
  2896. int x;
  2897. v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  2898. v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2899. v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2900. v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  2901. v4i32 vec_scale = __msa_fill_w(scale);
  2902. v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
  2903. v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
  2904. v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
  2905. v16i8 zero = {0};
  2906. for (x = 0; x < width; x += 8) {
  2907. src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
  2908. src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
  2909. src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
  2910. src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48);
  2911. vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
  2912. vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
  2913. vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
  2914. vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
  2915. vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
  2916. vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
  2917. vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
  2918. vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
  2919. tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
  2920. tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
  2921. tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
  2922. tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
  2923. tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
  2924. tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
  2925. tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
  2926. tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
  2927. tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
  2928. tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
  2929. tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
  2930. tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
  2931. tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
  2932. tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
  2933. tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
  2934. tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
  2935. tmp0 *= vec_scale;
  2936. tmp1 *= vec_scale;
  2937. tmp2 *= vec_scale;
  2938. tmp3 *= vec_scale;
  2939. tmp4 *= vec_scale;
  2940. tmp5 *= vec_scale;
  2941. tmp6 *= vec_scale;
  2942. tmp7 *= vec_scale;
  2943. tmp8 *= vec_scale;
  2944. tmp9 *= vec_scale;
  2945. tmp10 *= vec_scale;
  2946. tmp11 *= vec_scale;
  2947. tmp12 *= vec_scale;
  2948. tmp13 *= vec_scale;
  2949. tmp14 *= vec_scale;
  2950. tmp15 *= vec_scale;
  2951. tmp0 >>= 16;
  2952. tmp1 >>= 16;
  2953. tmp2 >>= 16;
  2954. tmp3 >>= 16;
  2955. tmp4 >>= 16;
  2956. tmp5 >>= 16;
  2957. tmp6 >>= 16;
  2958. tmp7 >>= 16;
  2959. tmp8 >>= 16;
  2960. tmp9 >>= 16;
  2961. tmp10 >>= 16;
  2962. tmp11 >>= 16;
  2963. tmp12 >>= 16;
  2964. tmp13 >>= 16;
  2965. tmp14 >>= 16;
  2966. tmp15 >>= 16;
  2967. vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
  2968. vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
  2969. vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
  2970. vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
  2971. vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
  2972. vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
  2973. vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
  2974. vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
  2975. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  2976. dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  2977. dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
  2978. dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
  2979. dst0 *= vec_int_sz;
  2980. dst1 *= vec_int_sz;
  2981. dst2 *= vec_int_sz;
  2982. dst3 *= vec_int_sz;
  2983. dst0 += vec_int_ofst;
  2984. dst1 += vec_int_ofst;
  2985. dst2 += vec_int_ofst;
  2986. dst3 += vec_int_ofst;
  2987. dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
  2988. dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
  2989. dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
  2990. dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
  2991. ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
  2992. dst_argb += 64;
  2993. }
  2994. }
  2995. void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
  2996. uint8_t* dst_argb,
  2997. const int8_t* matrix_argb,
  2998. int width) {
  2999. int32_t x;
  3000. v16i8 src0;
  3001. v16u8 src1, src2, dst0, dst1;
  3002. v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  3003. v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
  3004. v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  3005. v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  3006. v16i8 zero = {0};
  3007. v8i16 max = __msa_ldi_h(255);
  3008. src0 = __msa_ld_b((void*)matrix_argb, 0);
  3009. vec0 = (v8i16)__msa_ilvr_b(zero, src0);
  3010. vec1 = (v8i16)__msa_ilvl_b(zero, src0);
  3011. for (x = 0; x < width; x += 8) {
  3012. src1 = (v16u8)__msa_ld_b((void*)src_argb, 0);
  3013. src2 = (v16u8)__msa_ld_b((void*)src_argb, 16);
  3014. vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
  3015. vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
  3016. vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
  3017. vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
  3018. vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
  3019. vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
  3020. vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
  3021. vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
  3022. vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
  3023. vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
  3024. vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
  3025. vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
  3026. vec10 = vec2 * vec0;
  3027. vec11 = vec2 * vec1;
  3028. vec12 = vec6 * vec0;
  3029. vec13 = vec6 * vec1;
  3030. tmp0 = __msa_hadd_s_w(vec10, vec10);
  3031. tmp1 = __msa_hadd_s_w(vec11, vec11);
  3032. tmp2 = __msa_hadd_s_w(vec12, vec12);
  3033. tmp3 = __msa_hadd_s_w(vec13, vec13);
  3034. vec14 = vec3 * vec0;
  3035. vec15 = vec3 * vec1;
  3036. vec16 = vec7 * vec0;
  3037. vec17 = vec7 * vec1;
  3038. tmp4 = __msa_hadd_s_w(vec14, vec14);
  3039. tmp5 = __msa_hadd_s_w(vec15, vec15);
  3040. tmp6 = __msa_hadd_s_w(vec16, vec16);
  3041. tmp7 = __msa_hadd_s_w(vec17, vec17);
  3042. vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
  3043. vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
  3044. vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
  3045. vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
  3046. tmp0 = __msa_hadd_s_w(vec10, vec10);
  3047. tmp1 = __msa_hadd_s_w(vec11, vec11);
  3048. tmp2 = __msa_hadd_s_w(vec12, vec12);
  3049. tmp3 = __msa_hadd_s_w(vec13, vec13);
  3050. tmp0 = __msa_srai_w(tmp0, 6);
  3051. tmp1 = __msa_srai_w(tmp1, 6);
  3052. tmp2 = __msa_srai_w(tmp2, 6);
  3053. tmp3 = __msa_srai_w(tmp3, 6);
  3054. vec2 = vec4 * vec0;
  3055. vec6 = vec4 * vec1;
  3056. vec3 = vec8 * vec0;
  3057. vec7 = vec8 * vec1;
  3058. tmp8 = __msa_hadd_s_w(vec2, vec2);
  3059. tmp9 = __msa_hadd_s_w(vec6, vec6);
  3060. tmp10 = __msa_hadd_s_w(vec3, vec3);
  3061. tmp11 = __msa_hadd_s_w(vec7, vec7);
  3062. vec4 = vec5 * vec0;
  3063. vec8 = vec5 * vec1;
  3064. vec5 = vec9 * vec0;
  3065. vec9 = vec9 * vec1;
  3066. tmp12 = __msa_hadd_s_w(vec4, vec4);
  3067. tmp13 = __msa_hadd_s_w(vec8, vec8);
  3068. tmp14 = __msa_hadd_s_w(vec5, vec5);
  3069. tmp15 = __msa_hadd_s_w(vec9, vec9);
  3070. vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
  3071. vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
  3072. vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
  3073. vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
  3074. tmp4 = __msa_hadd_s_w(vec14, vec14);
  3075. tmp5 = __msa_hadd_s_w(vec15, vec15);
  3076. tmp6 = __msa_hadd_s_w(vec16, vec16);
  3077. tmp7 = __msa_hadd_s_w(vec17, vec17);
  3078. tmp4 = __msa_srai_w(tmp4, 6);
  3079. tmp5 = __msa_srai_w(tmp5, 6);
  3080. tmp6 = __msa_srai_w(tmp6, 6);
  3081. tmp7 = __msa_srai_w(tmp7, 6);
  3082. vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
  3083. vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
  3084. vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
  3085. vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
  3086. vec10 = __msa_maxi_s_h(vec10, 0);
  3087. vec11 = __msa_maxi_s_h(vec11, 0);
  3088. vec12 = __msa_maxi_s_h(vec12, 0);
  3089. vec13 = __msa_maxi_s_h(vec13, 0);
  3090. vec10 = __msa_min_s_h(vec10, max);
  3091. vec11 = __msa_min_s_h(vec11, max);
  3092. vec12 = __msa_min_s_h(vec12, max);
  3093. vec13 = __msa_min_s_h(vec13, max);
  3094. dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
  3095. dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
  3096. ST_UB2(dst0, dst1, dst_argb, 16);
  3097. src_argb += 32;
  3098. dst_argb += 32;
  3099. }
  3100. }
  3101. void SplitUVRow_MSA(const uint8_t* src_uv,
  3102. uint8_t* dst_u,
  3103. uint8_t* dst_v,
  3104. int width) {
  3105. int x;
  3106. v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  3107. for (x = 0; x < width; x += 32) {
  3108. src0 = (v16u8)__msa_ld_b((void*)src_uv, 0);
  3109. src1 = (v16u8)__msa_ld_b((void*)src_uv, 16);
  3110. src2 = (v16u8)__msa_ld_b((void*)src_uv, 32);
  3111. src3 = (v16u8)__msa_ld_b((void*)src_uv, 48);
  3112. dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
  3113. dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
  3114. dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
  3115. dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
  3116. ST_UB2(dst0, dst1, dst_u, 16);
  3117. ST_UB2(dst2, dst3, dst_v, 16);
  3118. src_uv += 64;
  3119. dst_u += 32;
  3120. dst_v += 32;
  3121. }
  3122. }
  3123. void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
  3124. int x;
  3125. v16u8 dst0 = (v16u8)__msa_fill_b(v8);
  3126. for (x = 0; x < width; x += 16) {
  3127. ST_UB(dst0, dst);
  3128. dst += 16;
  3129. }
  3130. }
  3131. void MirrorUVRow_MSA(const uint8_t* src_uv,
  3132. uint8_t* dst_u,
  3133. uint8_t* dst_v,
  3134. int width) {
  3135. int x;
  3136. v16u8 src0, src1, src2, src3;
  3137. v16u8 dst0, dst1, dst2, dst3;
  3138. v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
  3139. v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
  3140. src_uv += (2 * width);
  3141. for (x = 0; x < width; x += 32) {
  3142. src_uv -= 64;
  3143. src2 = (v16u8)__msa_ld_b((void*)src_uv, 0);
  3144. src3 = (v16u8)__msa_ld_b((void*)src_uv, 16);
  3145. src0 = (v16u8)__msa_ld_b((void*)src_uv, 32);
  3146. src1 = (v16u8)__msa_ld_b((void*)src_uv, 48);
  3147. dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
  3148. dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
  3149. dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
  3150. dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
  3151. ST_UB2(dst0, dst1, dst_v, 16);
  3152. ST_UB2(dst2, dst3, dst_u, 16);
  3153. dst_u += 32;
  3154. dst_v += 32;
  3155. }
  3156. }
  3157. void SobelXRow_MSA(const uint8_t* src_y0,
  3158. const uint8_t* src_y1,
  3159. const uint8_t* src_y2,
  3160. uint8_t* dst_sobelx,
  3161. int32_t width) {
  3162. int x;
  3163. v16u8 src0, src1, src2, src3, src4, src5, dst0;
  3164. v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  3165. v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
  3166. v16i8 tmp = __msa_ldi_b(8);
  3167. v16i8 mask1 = mask0 + tmp;
  3168. v8i16 zero = {0};
  3169. v8i16 max = __msa_ldi_h(255);
  3170. for (x = 0; x < width; x += 16) {
  3171. src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
  3172. src1 = (v16u8)__msa_ld_b((void*)src_y0, 16);
  3173. src2 = (v16u8)__msa_ld_b((void*)src_y1, 0);
  3174. src3 = (v16u8)__msa_ld_b((void*)src_y1, 16);
  3175. src4 = (v16u8)__msa_ld_b((void*)src_y2, 0);
  3176. src5 = (v16u8)__msa_ld_b((void*)src_y2, 16);
  3177. vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
  3178. vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
  3179. vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
  3180. vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
  3181. vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
  3182. vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
  3183. vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
  3184. vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
  3185. vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
  3186. vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
  3187. vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
  3188. vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
  3189. vec0 += vec2;
  3190. vec1 += vec3;
  3191. vec4 += vec2;
  3192. vec5 += vec3;
  3193. vec0 += vec4;
  3194. vec1 += vec5;
  3195. vec0 = __msa_add_a_h(zero, vec0);
  3196. vec1 = __msa_add_a_h(zero, vec1);
  3197. vec0 = __msa_maxi_s_h(vec0, 0);
  3198. vec1 = __msa_maxi_s_h(vec1, 0);
  3199. vec0 = __msa_min_s_h(max, vec0);
  3200. vec1 = __msa_min_s_h(max, vec1);
  3201. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  3202. ST_UB(dst0, dst_sobelx);
  3203. src_y0 += 16;
  3204. src_y1 += 16;
  3205. src_y2 += 16;
  3206. dst_sobelx += 16;
  3207. }
  3208. }
  3209. void SobelYRow_MSA(const uint8_t* src_y0,
  3210. const uint8_t* src_y1,
  3211. uint8_t* dst_sobely,
  3212. int32_t width) {
  3213. int x;
  3214. v16u8 src0, src1, dst0;
  3215. v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
  3216. v8i16 zero = {0};
  3217. v8i16 max = __msa_ldi_h(255);
  3218. for (x = 0; x < width; x += 16) {
  3219. src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
  3220. src1 = (v16u8)__msa_ld_b((void*)src_y1, 0);
  3221. vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
  3222. vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
  3223. vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
  3224. vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
  3225. vec0 -= vec2;
  3226. vec1 -= vec3;
  3227. vec6[0] = src_y0[16] - src_y1[16];
  3228. vec6[1] = src_y0[17] - src_y1[17];
  3229. vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
  3230. vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
  3231. vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
  3232. vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
  3233. vec0 += vec2;
  3234. vec1 += vec3;
  3235. vec4 += vec2;
  3236. vec5 += vec3;
  3237. vec0 += vec4;
  3238. vec1 += vec5;
  3239. vec0 = __msa_add_a_h(zero, vec0);
  3240. vec1 = __msa_add_a_h(zero, vec1);
  3241. vec0 = __msa_maxi_s_h(vec0, 0);
  3242. vec1 = __msa_maxi_s_h(vec1, 0);
  3243. vec0 = __msa_min_s_h(max, vec0);
  3244. vec1 = __msa_min_s_h(max, vec1);
  3245. dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
  3246. ST_UB(dst0, dst_sobely);
  3247. src_y0 += 16;
  3248. src_y1 += 16;
  3249. dst_sobely += 16;
  3250. }
  3251. }
  3252. void HalfFloatRow_MSA(const uint16_t* src,
  3253. uint16_t* dst,
  3254. float scale,
  3255. int width) {
  3256. int i;
  3257. v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  3258. v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  3259. v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
  3260. v4f32 mult_vec;
  3261. v8i16 zero = {0};
  3262. mult_vec[0] = 1.9259299444e-34f * scale;
  3263. mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
  3264. for (i = 0; i < width; i += 32) {
  3265. src0 = (v8u16)__msa_ld_h((void*)src, 0);
  3266. src1 = (v8u16)__msa_ld_h((void*)src, 16);
  3267. src2 = (v8u16)__msa_ld_h((void*)src, 32);
  3268. src3 = (v8u16)__msa_ld_h((void*)src, 48);
  3269. vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
  3270. vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
  3271. vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
  3272. vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
  3273. vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
  3274. vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
  3275. vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
  3276. vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
  3277. fvec0 = __msa_ffint_u_w(vec0);
  3278. fvec1 = __msa_ffint_u_w(vec1);
  3279. fvec2 = __msa_ffint_u_w(vec2);
  3280. fvec3 = __msa_ffint_u_w(vec3);
  3281. fvec4 = __msa_ffint_u_w(vec4);
  3282. fvec5 = __msa_ffint_u_w(vec5);
  3283. fvec6 = __msa_ffint_u_w(vec6);
  3284. fvec7 = __msa_ffint_u_w(vec7);
  3285. fvec0 *= mult_vec;
  3286. fvec1 *= mult_vec;
  3287. fvec2 *= mult_vec;
  3288. fvec3 *= mult_vec;
  3289. fvec4 *= mult_vec;
  3290. fvec5 *= mult_vec;
  3291. fvec6 *= mult_vec;
  3292. fvec7 *= mult_vec;
  3293. vec0 = ((v4u32)fvec0) >> 13;
  3294. vec1 = ((v4u32)fvec1) >> 13;
  3295. vec2 = ((v4u32)fvec2) >> 13;
  3296. vec3 = ((v4u32)fvec3) >> 13;
  3297. vec4 = ((v4u32)fvec4) >> 13;
  3298. vec5 = ((v4u32)fvec5) >> 13;
  3299. vec6 = ((v4u32)fvec6) >> 13;
  3300. vec7 = ((v4u32)fvec7) >> 13;
  3301. dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
  3302. dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
  3303. dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
  3304. dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
  3305. ST_UH2(dst0, dst1, dst, 8);
  3306. ST_UH2(dst2, dst3, dst + 16, 8);
  3307. src += 32;
  3308. dst += 32;
  3309. }
  3310. }
  3311. #ifdef __cplusplus
  3312. } // extern "C"
  3313. } // namespace libyuv
  3314. #endif
  3315. #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)