- /*
- * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
- #include "./vpx_dsp_rtcd.h"
- #include "vpx_dsp/variance.h"
- #include "vpx_ports/mem.h"
- #include "vpx/vpx_integer.h"
- #include "vpx_ports/asmdefs_mmi.h"
- static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
- };
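- /* 2-tap bilinear filter taps, indexed by the sub-pixel offset in 1/8-pel
-    units (x_offset / y_offset, 0..7). Each pair of taps sums to 128
-    (FILTER_WEIGHT). */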
- /* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
-    vpx_variance32x64. VARIANCE_SSE_SUM_8 accumulates the sums in 16-bit lanes,
-    which would overflow for these block sizes. */
- #define VARIANCE_SSE_SUM_8_FOR_W64 \
- /* sse */ \
- "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
- "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
- "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
- "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
- "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \
- "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
- \
- /* sum */ \
- "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
- "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
- "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
- "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
- "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
- "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
- "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
- "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
- "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
- "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
- "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
- "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
- "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
- "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
- "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
- "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
- "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
- "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
- #define VARIANCE_SSE_SUM_4 \
- /* sse */ \
- "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
- "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
- "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
- "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
- \
- /* sum */ \
- "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
- "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
- "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
- "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
- #define VARIANCE_SSE_SUM_8 \
- /* sse */ \
- "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
- "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
- "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
- "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
- "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
- "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
- \
- /* sum */ \
- "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
- "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
- "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
- "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
- "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
- "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
- #define VARIANCE_SSE_8 \
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \
- "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
- "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
- "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
- "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
- "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
- "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
- #define VARIANCE_SSE_16 \
- VARIANCE_SSE_8 \
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \
- "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \
- "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
- "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
- "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
- "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
- "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
- "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
- /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
- "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
- "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
- "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
- /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
- "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
- "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
- "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
- /* calculate: temp2[0] ~ temp2[3] */ \
- "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
- "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
- \
- /* store: temp2[0] ~ temp2[3] */ \
- "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
- "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
- "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
- /* calculate: temp2[0] ~ temp2[3] */ \
- "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
- "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
- \
- /* store: temp2[0] ~ temp2[3] */ \
- "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
- "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
- "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
- /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
- "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
- "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
- "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
- "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
- "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
- "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
- "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
- "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
- /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
- "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
- "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
- "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
- "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
- "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
- "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
- "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
- "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
- "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
- "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
- /* calculate: temp2[0] ~ temp2[3] */ \
- "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
- "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
- "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
- \
- /* calculate: temp2[4] ~ temp2[7] */ \
- "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
- "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
- "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
- "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
- \
- /* store: temp2[0] ~ temp2[7] */ \
- "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
- "and %[ftmp3], %[ftmp3], %[mask] \n\t" \
- "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
- "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
- "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
- /* calculate: temp2[0] ~ temp2[3] */ \
- "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
- "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
- "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
- "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
- \
- /* calculate: temp2[4] ~ temp2[7] */ \
- "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
- "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
- "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
- "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
- \
- /* store: temp2[0] ~ temp2[7] */ \
- "and %[ftmp8], %[ftmp8], %[mask] \n\t" \
- "and %[ftmp9], %[ftmp9], %[mask] \n\t" \
- "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
- "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
- "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
- /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
- \
- /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
- "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
- "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
- "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
- "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
- "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
- "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
- "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
- "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
- /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
- \
- /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
- "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
- "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
- "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
- "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
- "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
- "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
- "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
- "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
- "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
- "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
- "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
- "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
- \
- /* calculate: temp2[8] ~ temp2[11] */ \
- "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
- "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
- "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
- \
- /* calculate: temp2[12] ~ temp2[15] */ \
- "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
- "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
- "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
- "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
- \
- /* store: temp2[8] ~ temp2[15] */ \
- "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
- "and %[ftmp5], %[ftmp5], %[mask] \n\t" \
- "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
- "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
- "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
- #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
- \
- /* calculate: temp2[8] ~ temp2[11] */ \
- "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
- "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
- "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
- "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
- \
- /* calculate: temp2[12] ~ temp2[15] */ \
- "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
- "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
- "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
- "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
- "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
- \
- /* store: temp2[8] ~ temp2[15] */ \
- "and %[ftmp10], %[ftmp10], %[mask] \n\t" \
- "and %[ftmp11], %[ftmp11], %[mask] \n\t" \
- "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
- "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
- "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
- // Applies a 1-D 2-tap bilinear filter to the source block in either the
- // horizontal or vertical direction to produce the filtered output block. Used
- // to implement the first pass of the 2-D separable filter.
- //
- // Produces a 16-bit intermediate output to retain precision for the next
- // pass. The two filter taps should sum to FILTER_WEIGHT. pixel_step defines
- // whether the filter is applied horizontally (pixel_step = 1) or vertically
- // (pixel_step = stride); it is the offset required to move from one input
- // pixel to the next.
- static void var_filter_block2d_bil_first_pass(
- const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
- int pixel_step, unsigned int output_height, unsigned int output_width,
- const uint8_t *filter) {
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; ++j) {
- ref_ptr[j] = ROUND_POWER_OF_TWO(
- (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
- FILTER_BITS);
- ++src_ptr;
- }
- src_ptr += src_pixels_per_line - output_width;
- ref_ptr += output_width;
- }
- }
- // Applies a 1-D 2-tap bilinear filter to the source block in either the
- // horizontal or vertical direction to produce the filtered output block. Used
- // to implement the second pass of the 2-D separable filter.
- //
- // Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
- // two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
- // filter is applied horizontally (pixel_step = 1) or vertically
- // (pixel_step = stride); it is the offset required to move from one input to
- // the next. Output is 8-bit.
- static void var_filter_block2d_bil_second_pass(
- const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter) {
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; ++j) {
- ref_ptr[j] = ROUND_POWER_OF_TWO(
- (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
- FILTER_BITS);
- ++src_ptr;
- }
- src_ptr += src_pixels_per_line - output_width;
- ref_ptr += output_width;
- }
- }
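- // Reference-only scalar sketch (an illustration, not part of the original
- // file and not called by the MMI paths below; the helper name is made up):
- // this is the per-block computation the inline-assembly loops implement, i.e.
- // accumulate the sum of squared differences and the signed difference sum,
- // then apply variance = SSE - sum^2 / (w * h).
- static inline uint32_t variance_ref_sketch(const uint8_t *src_ptr,
-                                             int src_stride,
-                                             const uint8_t *ref_ptr,
-                                             int ref_stride, int w, int h,
-                                             uint32_t *sse) {
-   int64_t sum = 0;
-   uint64_t sse64 = 0;
-   int i, j;
-   for (i = 0; i < h; ++i) {
-     for (j = 0; j < w; ++j) {
-       const int diff = src_ptr[j] - ref_ptr[j];
-       sum += diff;
-       sse64 += (uint64_t)(diff * diff);
-     }
-     src_ptr += src_stride;
-     ref_ptr += ref_stride;
-   }
-   *sse = (uint32_t)sse64;
-   return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));
- }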
- static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, int high) {
- int sum;
- double ftmp[12];
- uint32_t tmp[3];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "mfc1 %[tmp1], %[ftmp9] \n\t"
- "mfhc1 %[tmp2], %[ftmp9] \n\t"
- "addu %[sum], %[tmp1], %[tmp2] \n\t"
- "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
- "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
- "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [tmp2]"=&r"(tmp[2]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
- [sum]"=&r"(sum)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse)
- : "memory"
- );
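- // variance = SSE - sum^2 / N, where N = 64 * high is the number of pixels;
- // the same identity is used by all the block-size variants below.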
- return *sse - (((int64_t)sum * sum) / (64 * high));
- }
- #define VPX_VARIANCE64XN(n) \
- uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- VPX_VARIANCE64XN(64)
- VPX_VARIANCE64XN(32)
- uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse) {
- int sum;
- double ftmp[12];
- uint32_t tmp[3];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- "li %[tmp0], 0x40 \n\t"
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8_FOR_W64
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "mfc1 %[tmp1], %[ftmp9] \n\t"
- "mfhc1 %[tmp2], %[ftmp9] \n\t"
- "addu %[sum], %[tmp1], %[tmp2] \n\t"
- "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
- "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
- "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [tmp2]"=&r"(tmp[2]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
- [sum]"=&r"(sum)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [sse]"r"(sse)
- : "memory"
- );
- return *sse - (((int64_t)sum * sum) / 2048);
- }
- static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, int high) {
- int sum;
- double ftmp[13];
- uint32_t tmp[3];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
- "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
- "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
- "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
- "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
- "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
- "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
- "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
- "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
- : "memory"
- );
- return *sse - (((int64_t)sum * sum) / (32 * high));
- }
- #define VPX_VARIANCE32XN(n) \
- uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- VPX_VARIANCE32XN(32)
- VPX_VARIANCE32XN(16)
- static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, int high) {
- int sum;
- double ftmp[13];
- uint32_t tmp[3];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
- "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
- "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
- "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
- "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
- "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
- "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
- "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
- "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
- : "memory"
- );
- return *sse - (((int64_t)sum * sum) / (16 * high));
- }
- #define VPX_VARIANCE16XN(n) \
- uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- VPX_VARIANCE16XN(32)
- VPX_VARIANCE16XN(16)
- VPX_VARIANCE16XN(8)
- static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, int high) {
- int sum;
- double ftmp[13];
- uint32_t tmp[3];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
- "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_8
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
- "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
- "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
- "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
- "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
- "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
- "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
- "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
- "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
- : "memory"
- );
- return *sse - (((int64_t)sum * sum) / (8 * high));
- }
- #define VPX_VARIANCE8XN(n) \
- uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- VPX_VARIANCE8XN(16)
- VPX_VARIANCE8XN(8)
- VPX_VARIANCE8XN(4)
- static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, int high) {
- int sum;
- double ftmp[12];
- uint32_t tmp[3];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
- "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
- VARIANCE_SSE_SUM_4
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
- "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
- "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
- "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
- "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
- "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
- "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
- "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
- "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
- "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
- "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]),
- [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
- : "memory"
- );
- return *sse - (((int64_t)sum * sum) / (4 * high));
- }
- #define VPX_VARIANCE4XN(n) \
- uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- VPX_VARIANCE4XN(8)
- VPX_VARIANCE4XN(4)
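- /* The MSE variants below need only the SSE term, so they reuse the
-    VARIANCE_SSE_* macros and return *sse without the sum correction. */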
- static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, uint64_t high) {
- double ftmp[12];
- uint32_t tmp[1];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "1: \n\t"
- VARIANCE_SSE_16
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
- "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
- "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse)
- : "memory"
- );
- return *sse;
- }
- #define vpx_mse16xN(n) \
- uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- vpx_mse16xN(16);
- vpx_mse16xN(8);
- static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sse, uint64_t high) {
- double ftmp[12];
- uint32_t tmp[1];
- *sse = 0;
- __asm__ volatile (
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
- MMI_L(%[tmp0], %[high], 0x00)
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
- "1: \n\t"
- VARIANCE_SSE_8
- "addiu %[tmp0], %[tmp0], -0x01 \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
- "bnez %[tmp0], 1b \n\t"
- "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
- "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
- "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
- [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
- [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
- : [src_stride]"r"((mips_reg)src_stride),
- [ref_stride]"r"((mips_reg)ref_stride),
- [high]"r"(&high), [sse]"r"(sse)
- : "memory"
- );
- return *sse;
- }
- #define vpx_mse8xN(n) \
- uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sse) { \
- return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
- }
- vpx_mse8xN(16);
- vpx_mse8xN(8);
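- /* Sub-pixel variance: a horizontal bilinear pass into an (H + 1) x W 16-bit
-    intermediate (the vertical pass needs one extra row), a vertical pass into
-    an 8-bit H x W block, then the full-pel variance of that block vs. ref. */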
- #define SUBPIX_VAR(W, H) \
- uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint16_t fdata3[((H) + 1) * (W)]; \
- uint8_t temp2[(H) * (W)]; \
- \
- var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
- W, bilinear_filters[x_offset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[y_offset]); \
- \
- return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \
- }
- SUBPIX_VAR(64, 64)
- SUBPIX_VAR(64, 32)
- SUBPIX_VAR(32, 64)
- SUBPIX_VAR(32, 32)
- SUBPIX_VAR(32, 16)
- SUBPIX_VAR(16, 32)
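- // The remaining sub-pixel variance sizes (16x16 and smaller) run both
- // bilinear passes in a single MMI loop (helpers below) instead of the two C
- // filter passes above.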
- static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
- int src_stride, int x_offset,
- int y_offset, uint8_t *temp2,
- int counter) {
- uint8_t *temp2_ptr = temp2;
- mips_reg l_counter = counter;
- double ftmp[15];
- mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
- const uint8_t *filter_x = bilinear_filters[x_offset];
- const uint8_t *filter_y = bilinear_filters[y_offset];
- __asm__ volatile (
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- MMI_LI(%[tmp0], 0x07)
- MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
- // fdata3: fdata3[0] ~ fdata3[15]
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
- // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15]
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
- // temp2: temp2[0] ~ temp2[15]
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
- // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15]
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
- // temp2+16*1: temp2[0] ~ temp2[15]
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
- "1: \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
- "addiu %[counter], %[counter], -0x01 \n\t"
- "bnez %[counter], 1b \n\t"
- : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
- [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
- [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
- [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
- [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
- [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
- [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
- : "memory"
- );
- }
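- // var_filter_block2d_bil_16x writes two output rows before entering its loop
- // and two more per iteration, so callers pass (H - 2) / 2 as the counter.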
- #define SUBPIX_VAR16XN(H) \
- uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint8_t temp2[16 * (H)]; \
- var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \
- ((H)-2) / 2); \
- \
- return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \
- }
- SUBPIX_VAR16XN(16)
- SUBPIX_VAR16XN(8)
- static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
- int src_stride, int x_offset,
- int y_offset, uint8_t *temp2,
- int counter) {
- uint8_t *temp2_ptr = temp2;
- mips_reg l_counter = counter;
- double ftmp[15];
- mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
- const uint8_t *filter_x = bilinear_filters[x_offset];
- const uint8_t *filter_y = bilinear_filters[y_offset];
- __asm__ volatile (
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- MMI_LI(%[tmp0], 0x07)
- MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
- // fdata3: fdata3[0] ~ fdata3[7]
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
- // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7]
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
- // temp2: temp2[0] ~ temp2[7]
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
- // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7]
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
- // temp2+8*1: temp2[0] ~ temp2[7]
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
- "1: \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
- "addiu %[counter], %[counter], -0x01 \n\t"
- "bnez %[counter], 1b \n\t"
- : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
- [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
- [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
- [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
- [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
- [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
- [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
- : "memory"
- );
- }
- #define SUBPIX_VAR8XN(H) \
- uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint8_t temp2[8 * (H)]; \
- var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \
- ((H)-2) / 2); \
- \
- return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \
- }
- SUBPIX_VAR8XN(16)
- SUBPIX_VAR8XN(8)
- SUBPIX_VAR8XN(4)
- static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
- int src_stride, int x_offset,
- int y_offset, uint8_t *temp2,
- int counter) {
- uint8_t *temp2_ptr = temp2;
- mips_reg l_counter = counter;
- double ftmp[7];
- mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
- const uint8_t *filter_x = bilinear_filters[x_offset];
- const uint8_t *filter_y = bilinear_filters[y_offset];
- __asm__ volatile (
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- MMI_LI(%[tmp0], 0x07)
- MMI_MTC1(%[tmp0], %[ftmp6])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
- // fdata3: fdata3[0] ~ fdata3[3]
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
- // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3]
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
- // temp2: temp2[0] ~ temp2[3]
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
- // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3]
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
- // temp2+4*1: temp2[0] ~ temp2[3]
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
- "1: \n\t"
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
- MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
- VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
- MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
- VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
- "addiu %[counter], %[counter], -0x01 \n\t"
- "bnez %[counter], 1b \n\t"
- : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
- [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
- [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
- [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
- : "memory"
- );
- }
- #define SUBPIX_VAR4XN(H) \
- uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint8_t temp2[4 * (H)]; \
- var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \
- ((H)-2) / 2); \
- \
- return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \
- }
- SUBPIX_VAR4XN(8)
- SUBPIX_VAR4XN(4)
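- // Same as SUBPIX_VAR, except the filtered block is first averaged with
- // second_pred (via vpx_comp_avg_pred_c) before taking the variance.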
- #define SUBPIX_AVG_VAR(W, H) \
- uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[((H) + 1) * (W)]; \
- uint8_t temp2[(H) * (W)]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \
- \
- var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
- W, bilinear_filters[x_offset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[y_offset]); \
- \
- vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
- \
- return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \
- }
- SUBPIX_AVG_VAR(64, 64)
- SUBPIX_AVG_VAR(64, 32)
- SUBPIX_AVG_VAR(32, 64)
- SUBPIX_AVG_VAR(32, 32)
- SUBPIX_AVG_VAR(32, 16)
- SUBPIX_AVG_VAR(16, 32)
- SUBPIX_AVG_VAR(16, 16)
- SUBPIX_AVG_VAR(16, 8)
- SUBPIX_AVG_VAR(8, 16)
- SUBPIX_AVG_VAR(8, 8)
- SUBPIX_AVG_VAR(8, 4)
- SUBPIX_AVG_VAR(4, 8)
- SUBPIX_AVG_VAR(4, 4)