/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
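
/* SAD for a 4-pixel-wide block: four rows of src and ref are packed into
 * one vector each, bytewise absolute differences are accumulated into
 * 16-bit lanes, and the lanes are reduced to a scalar at the end. */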
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}
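
/* SAD for an 8-pixel-wide block: pairs of 8-byte rows are packed into
 * 16-byte vectors with PCKEV_D before the SAD accumulation. */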
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
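
/* SAD for a 16-pixel-wide block: one full vector per row, four rows
 * handled per loop iteration. */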
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
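
/* SAD for a 32-pixel-wide block: two vectors per row, four rows unrolled
 * per loop iteration. */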
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
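
/* SAD for a 64-pixel-wide block: four vectors per row, accumulated into
 * two separate v8u16 sums so the 16-bit lanes stay within range before
 * the final 32-bit reduction. */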
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}
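
/* x3 search variants: three SADs are returned for the reference block at
 * horizontal byte offsets 0, +1 and +2. Each loaded reference row is
 * reused and shifted left one byte (SLDI) to form the next offset. */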
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
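
/* 8-pixel-wide x3 variant, pairing rows with PCKEV_D as above. */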
static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
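
/* 16-pixel-wide x3 variant: 32 reference bytes are loaded per row so the
 * +1 and +2 offsets can be formed with __msa_sldi_b. */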
static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
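
/* x8 search variants: eight SADs are returned for reference offsets
 * 0 through +7 bytes, built by repeatedly shifting the reference rows. */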
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
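
/* 8-pixel-wide x8 variant. */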
static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
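
/* 16-pixel-wide x8 variant. */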
static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
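
/* x4d variants: one SAD per entry of aref_ptr[], i.e. the same source
 * block is compared against four independent reference blocks. */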
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
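
/* 8-pixel-wide x4d variant. */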
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
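
/* 16-pixel-wide x4d variant. */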
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
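
/* 32-pixel-wide x4d variant. */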
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
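
/* 64-pixel-wide x4d variant: each reference keeps two v8u16 accumulators
 * that are widened to 32 bits before the final reduction. */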
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}
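
/* Averaging SAD (avgsad): the reference block is first averaged with the
 * compound prediction in sec_pred, then the SAD against the source is
 * accumulated. sec_pred is read as a contiguous width x height buffer. */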
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}
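
/* 8-pixel-wide averaging SAD. */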
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}
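
/* 16-pixel-wide averaging SAD, eight rows per loop iteration. */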
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}
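
/* 32-pixel-wide averaging SAD. */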
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}
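
/* 64-pixel-wide averaging SAD, split across two 16-bit accumulators. */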
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}
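
/* Wrapper macros that expand into the exported vpx_sad* entry points for a
 * given block height, forwarding to the width-specific kernels above. */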
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }
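
// Instantiate the exported functions for every supported block size.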
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);