sub_pixel_variance_msa.c

/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

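/* 2-tap bilinear filter coefficients for the eight sub-pixel offsets; each
 * coefficient pair sums to 128 (1 << FILTER_BITS). */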
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

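/* For one pair of 16-byte vectors, accumulate the squared differences between
 * 'src' and 'ref' into 'var' (v4i32 partial sums) and the signed differences
 * into 'sub' (v8i16 partial sums). */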
#define CALC_MSE_AVG_B(src, ref, var, sub) \
  { \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
    \
    (sub) += res_l0_m + res_l1_m; \
  }

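/* Final variance: sse - (sum * sum) / (width * height), with 'shift' equal to
 * log2(width * height). The LARGE variant squares the sum in 64-bit arithmetic
 * so large blocks do not overflow. */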
#define VARIANCE_WxH(sse, diff, shift) \
  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  (sse) - (((int64_t)(diff) * (diff)) >> (shift))

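/* avg_sse_diff_*_msa: average the source block with the second predictor
 * 'sec_pred', then accumulate the SSE against the reference (return value) and
 * the signed sum of differences (written to *diff). */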
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred, int32_t height,
    int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred, int32_t height,
    int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
        ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred,
    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred,
    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
        src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
        src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
    int32_t src_stride,
    const uint8_t *ref_ptr,
    int32_t ref_stride,
    const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
        src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
        src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

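/* sub_pixel_sse_diff_*_h_msa: horizontal-only sub-pixel case. Each row is
 * filtered with the 2-tap bilinear kernel before the SSE/sum accumulation
 * against 'dst'. */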
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
        vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
        src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
        vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
        src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
        out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
        out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
        src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
        filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
        filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

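/* sub_pixel_sse_diff_*_v_msa: vertical-only sub-pixel case. Adjacent rows are
 * blended with the 2-tap bilinear kernel before the SSE/sum accumulation. */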
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
        src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
        vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
        tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    src0 = src4;
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
        filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
        filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

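/* sub_pixel_sse_diff_*_hv_msa: both offsets are non-zero, so rows are filtered
 * horizontally first and the intermediate rows are then filtered vertically. */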
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
        filter_horiz, filter_vert, height,
        &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
        filter_horiz, filter_vert, height,
        &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

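/* sub_pixel_avg_sse_diff_*: same filtering as above, but the filtered block is
 * additionally averaged with 'sec_pred' (compound prediction) before the
 * SSE/sum accumulation. */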
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
        vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
        src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
        vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
        src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

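/* The 16-pixel-wide helpers below take the full block 'width' so that the
 * contiguously stored second predictor is stepped by the correct stride; the
 * 32- and 64-wide wrappers call them once per 16-pixel column. */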
static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
        out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
        out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
        tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
        tmp2, tmp3);
    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
            filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
            filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
        src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
        vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
        tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);
    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
        out2, out3);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);
  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
            filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1];
  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
            filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }
  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
  return sse;
}

  1153. static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
  1154. const uint8_t *src, int32_t src_stride, const uint8_t *dst,
  1155. int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
  1156. const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  1157. int16_t filtval;
  1158. uint32_t loop_cnt;
  1159. uint32_t ref0, ref1, ref2, ref3;
  1160. v16u8 src0, src1, src2, src3, src4;
  1161. v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  1162. v16u8 filt_hz, filt_vt, vec0, vec1;
  1163. v16u8 out, pred, ref = { 0 };
  1164. v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  1165. v8i16 avg = { 0 };
  1166. v4i32 vec, var = { 0 };
  1167. filtval = LH(filter_horiz);
  1168. filt_hz = (v16u8)__msa_fill_h(filtval);
  1169. filtval = LH(filter_vert);
  1170. filt_vt = (v16u8)__msa_fill_h(filtval);
  1171. src0 = LD_UB(src);
  1172. src += src_stride;
  1173. for (loop_cnt = (height >> 2); loop_cnt--;) {
  1174. LD_UB4(src, src_stride, src1, src2, src3, src4);
  1175. src += (4 * src_stride);
  1176. pred = LD_UB(sec_pred);
  1177. sec_pred += 16;
  1178. LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
  1179. dst += (4 * dst_stride);
  1180. INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
  1181. hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  1182. hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  1183. hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  1184. hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  1185. hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  1186. ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  1187. DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  1188. SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  1189. out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  1190. out = __msa_aver_u_b(out, pred);
  1191. CALC_MSE_AVG_B(out, ref, var, avg);
  1192. src0 = src4;
  1193. }
  1194. vec = __msa_hadd_s_w(avg, avg);
  1195. *diff = HADD_SW_S32(vec);
  1196. return HADD_SW_S32(var);
  1197. }
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
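/* Shared 16-wide hv worker: 'width' is the row stride of the sec_pred
   buffer, so the same routine also serves the 32- and 64-wide wrappers
   below. */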
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
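/* hv front ends: 16-wide calls the worker directly; 32- and 64-wide split
   the block into two and four 16-pixel columns respectively. */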
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
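/* Variance helpers per block size: the third argument is log2(width * height),
   the normalization shift applied to the squared sum of differences. The
   _LARGE_ variant covers blocks of 512 pixels and more, where the squared sum
   no longer fits in 32 bits. */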
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
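/* Emits vpx_sub_pixel_variance<wd>x<ht>_msa: selects the h, v, hv or
   integer-pel path from the x/y offsets and folds the resulting SSE/diff
   pair into the final variance. */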
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \
      const uint8_t *src, int32_t src_stride, int32_t x_offset, \
      int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \
      uint32_t *sse) { \
    int32_t diff; \
    uint32_t var; \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \
    \
    if (y_offset) { \
      if (x_offset) { \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
      } \
      \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
    } else { \
      if (x_offset) { \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
        \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
      } else { \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse); \
      } \
    } \
    \
    return var; \
  }
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
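/* Same dispatch as above for the compound-prediction variants: every
   filtered block is averaged with sec_pred before the SSE/diff
   accumulation. */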
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \
      uint32_t *sse, const uint8_t *sec_pred) { \
    int32_t diff; \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \
    \
    if (y_offset) { \
      if (x_offset) { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
            v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff); \
      } \
    } else { \
      if (x_offset) { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff); \
      } else { \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
                                            ref_stride, sec_pred, ht, &diff); \
      } \
    } \
    \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
  }
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
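/* 32x64 is written out explicitly; unlike the macro-generated sizes, its
   integer-pel branch uses the dedicated avg_sse_diff_32x64_msa helper. */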
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t x_offset, int32_t y_offset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[x_offset];
  const uint8_t *v_filter = bilinear_filters_msa[y_offset];

  if (y_offset) {
    if (x_offset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (x_offset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}
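/* Emits the 64x32 and 64x64 compound sub-pixel variance functions; the
   integer-pel branch uses the height-specific avg_sse_diff_64x<ht>_msa
   helpers. */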
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \
      uint32_t *sse, const uint8_t *sec_pred) { \
    int32_t diff; \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \
    \
    if (y_offset) { \
      if (x_offset) { \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
            v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff); \
      } \
    } else { \
      if (x_offset) { \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff); \
      } else { \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
                                          ref_stride, sec_pred, &diff); \
      } \
    } \
    \
    return VARIANCE_64Wx##ht##H(*sse, diff); \
  }
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);