- ;
- ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %include "third_party/x86inc/x86inc.asm"
- SECTION_RODATA
- pw_8: times 8 dw 8
- bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
- bilin_filter_m_ssse3: times 8 db 16, 0
- times 8 db 14, 2
- times 8 db 12, 4
- times 8 db 10, 6
- times 16 db 8
- times 8 db 6, 10
- times 8 db 4, 12
- times 8 db 2, 14
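- ; The eight rows above are the 1/8-pel bilinear phases: for a fractional
- ; offset o in [0,7], the coefficient pair is (16-2*o, 2*o), and a filtered
- ; pixel is computed as (reference sketch only, not part of this file):
- ;   out = ((16 - 2*o) * a + 2*o * b + 8) >> 4;
- ; The sse2 table stores the two coefficients as separate word vectors
- ; (32 bytes per phase, hence filter_idx_shift 5 below); the ssse3 table
- ; interleaves them as byte pairs for pmaddubsw (16 bytes per phase,
- ; filter_idx_shift 4).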
- SECTION .text
- ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
- ; int x_offset, int y_offset,
- ; const uint8_t *ref, ptrdiff_t ref_stride,
- ; int height, unsigned int *sse);
- ;
- ; This function returns the sum of errors (SE) as its return value and
- ; stores the sum of squared errors (SSE) in the given pointer.
- %macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
- psubw %3, %4
- psubw %1, %2
- paddw %5, %3
- pmaddwd %3, %3
- paddw %5, %1
- pmaddwd %1, %1
- paddd %6, %3
- paddd %6, %1
- %endmacro
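- ; SUM_SSE consumes two rows of pixels that have already been widened to
- ; words. A scalar sketch of what one invocation accumulates (illustrative
- ; only; the asm keeps sum as 8 packed words and sse as 4 packed dwords):
- ;   for (i = 0; i < 8; i++) {
- ;     int d1 = src1[i] - ref1[i], d2 = src2[i] - ref2[i];
- ;     sum += d1 + d2;            /* psubw + paddw */
- ;     sse += d1 * d1 + d2 * d2;  /* pmaddwd + paddd */
- ;   }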
- %macro STORE_AND_RET 1
- %if %1 > 4
- ; if H=64 and W=16, each of the 8 words in m6 accumulates 2 columns (1 bit)
- ; x 64 rows (6 bits) x a 9-bit signed difference = 16 bits, i.e. the sum
- ; _exactly_ fits in a signed word per word in the xmm reg. We have to
- ; sign-extend it before adding the words within the register and
- ; outputting to a dword.
- pcmpgtw m5, m6 ; mask for 0 > x
- movhlps m3, m7
- punpcklwd m4, m6, m5
- punpckhwd m6, m5 ; sign-extend m6 word->dword
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- pshufd m4, m6, 0x1
- movd [r1], m7 ; store sse
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
- %else ; 4xh
- pshuflw m4, m6, 0xe
- pshuflw m3, m7, 0xe
- paddw m6, m4
- paddd m7, m3
- pcmpgtw m5, m6 ; mask for 0 > x
- mov r1, ssem ; r1 = unsigned int *sse
- punpcklwd m6, m5 ; sign-extend m6 word->dword
- movd [r1], m7 ; store sse
- pshuflw m4, m6, 0xe
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
- %endif
- RET
- %endmacro
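- ; STORE_AND_RET then reduces those accumulators horizontally: the word sums
- ; in m6 are sign-extended to dwords (pcmpgtw builds the sign mask that
- ; punpck{l,h}wd interleaves in), both registers are folded down to a single
- ; dword each, *sse is stored, and the signed sum is returned in eax.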
- %macro INC_SRC_BY_SRC_STRIDE 0
- %if ARCH_X86=1 && CONFIG_PIC=1
- add srcq, src_stridemp
- %else
- add srcq, src_strideq
- %endif
- %endmacro
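- ; (On x86-32 PIC builds the stride must be re-read from its stack slot,
- ; src_stridemp, because the src_stride register is repurposed as a scratch
- ; pointer by the PIC filter-address code below; everywhere else it stays
- ; in src_strideq.)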
- %macro SUBPEL_VARIANCE 1-2 0 ; W
- %if cpuflag(ssse3)
- %define bilin_filter_m bilin_filter_m_ssse3
- %define filter_idx_shift 4
- %else
- %define bilin_filter_m bilin_filter_m_sse2
- %define filter_idx_shift 5
- %endif
- ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
- ; 11, not 13, if the registers are ordered correctly. May make a minor speed
- ; difference on Win64
- %if ARCH_X86_64
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, ref, ref_stride, \
- second_pred, second_stride, height, sse
- %define second_str second_strideq
- %else
- cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
- x_offset, y_offset, ref, ref_stride, \
- height, sse
- %endif
- %define block_height heightd
- %define bilin_filter sseq
- %else
- %if CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, ref, ref_stride, \
- second_pred, second_stride, height, sse
- %define block_height dword heightm
- %define second_str second_stridemp
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, ref, ref_stride, \
- height, sse
- %define block_height heightd
- %endif
- ; reuse argument stack space
- %define g_bilin_filterm x_offsetm
- %define g_pw_8m y_offsetm
- ; Store the bilin_filter and pw_8 pointers on the stack
- %if GET_GOT_DEFINED == 1
- GET_GOT eax
- add esp, 4 ; restore esp
- %endif
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- ref, ref_stride, second_pred, second_stride, \
- height, sse
- %define block_height dword heightm
- %define second_str second_stridemp
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, ref, ref_stride, \
- height, sse
- %define block_height heightd
- %endif
- %define bilin_filter bilin_filter_m
- %endif
- %endif
- %if %1 == 4
- %define movx movd
- %else
- %define movx movh
- %endif
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
- ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
- ; could perhaps use it for something more productive in that case
- pxor m5, m5 ; dedicated zero register
- %if %1 < 16
- sar block_height, 1
- %if %2 == 1 ; avg
- shl second_str, 1
- %endif
- %endif
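- ; The rest of the macro dispatches on (x_offset, y_offset): an offset of 0
- ; needs no filtering, 4 is a half-pel (a plain pavgb), and anything else is
- ; a true bilinear filter, giving nine specialized paths. Roughly:
- ;   if (x == 0)      { /* y == 0, y == 4, y == other */ }
- ;   else if (x == 4) { /* y == 0, y == 4, y == other */ }
- ;   else             { /* y == 0, y == 4, y == other */ }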
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
- ; x_offset == 0 && y_offset == 0
- .x_zero_y_zero_loop:
- %if %1 == 16
- movu m0, [srcq]
- mova m1, [refq]
- %if %2 == 1 ; avg
- pavgb m0, [second_predq]
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- %endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %if %2 == 0 ; !avg
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- %if %2 == 1 ; avg
- %if %1 > 4
- movhps m0, [srcq+src_strideq]
- %else ; 4xh
- movx m1, [srcq+src_strideq]
- punpckldq m0, m1
- %endif
- %else ; !avg
- movx m2, [srcq+src_strideq]
- %endif
- movx m1, [refq]
- movx m3, [refq+ref_strideq]
- %if %2 == 1 ; avg
- %if %1 > 4
- pavgb m0, [second_predq]
- %else
- movh m2, [second_predq]
- pavgb m0, m2
- %endif
- punpcklbw m3, m5
- punpcklbw m1, m5
- %if %1 > 4
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else ; 4xh
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %else ; !avg
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_zero_y_zero_loop
- STORE_AND_RET %1
- .x_zero_y_nonzero:
- cmp y_offsetd, 4
- jne .x_zero_y_nonhalf
- ; x_offset == 0 && y_offset == 0.5
- .x_zero_y_half_loop:
- %if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [refq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
- %if %2 == 1 ; avg
- pavgb m0, [second_predq]
- %endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m2, [srcq+src_strideq]
- %if %2 == 1 ; avg
- %if %1 > 4
- movhps m2, [srcq+src_strideq*2]
- %else ; 4xh
- movx m1, [srcq+src_strideq*2]
- punpckldq m2, m1
- %endif
- movx m1, [refq]
- %if %1 > 4
- movlhps m0, m2
- %else ; 4xh
- punpckldq m0, m2
- %endif
- movx m3, [refq+ref_strideq]
- pavgb m0, m2
- punpcklbw m1, m5
- %if %1 > 4
- pavgb m0, [second_predq]
- punpcklbw m3, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else ; 4xh
- movh m4, [second_predq]
- pavgb m0, m4
- punpcklbw m3, m5
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %else ; !avg
- movx m4, [srcq+src_strideq*2]
- movx m1, [refq]
- pavgb m0, m2
- movx m3, [refq+ref_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_zero_y_half_loop
- STORE_AND_RET %1
- .x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
- %if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
- %endif
- shl y_offsetd, filter_idx_shift
- %if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+y_offsetq]
- %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
- %endif
- mova m10, [GLOBAL(pw_8)]
- %define filter_y_a m8
- %define filter_y_b m9
- %define filter_rnd m10
- %else ; x86-32 or mmx
- %if ARCH_X86=1 && CONFIG_PIC=1
- ; x_offset == 0, reuse x_offset reg
- %define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
- %define filter_y_a [y_offsetq]
- %define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
- %define filter_rnd [tempq]
- %else
- add y_offsetq, bilin_filter
- %define filter_y_a [y_offsetq]
- %define filter_y_b [y_offsetq+16]
- %define filter_rnd [GLOBAL(pw_8)]
- %endif
- %endif
- .x_zero_y_other_loop:
- %if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [refq]
- %if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- %else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
- ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
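- ; (The rewrite is exact: (num-x)*in1 + x*in2 == num*in1 + x*(in2-in1), and
- ; num*in1 contributes in1 unchanged after the >>log2(num) shift, so the
- ; rounding behavior is identical.)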
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
- %endif
- psraw m2, 4
- psraw m0, 4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m2, [srcq+src_strideq]
- movx m4, [srcq+src_strideq*2]
- movx m3, [refq+ref_strideq]
- %if cpuflag(ssse3)
- movx m1, [refq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- %else
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, m1
- paddw m2, filter_rnd
- movx m1, [refq]
- paddw m2, m4
- %endif
- psraw m0, 4
- psraw m2, 4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- %if %1 == 4
- movlhps m0, m2
- %endif
- packuswb m0, m2
- %if %1 > 4
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else ; 4xh
- movh m2, [second_predq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_zero_y_other_loop
- %undef filter_y_a
- %undef filter_y_b
- %undef filter_rnd
- STORE_AND_RET %1
- .x_nonzero:
- cmp x_offsetd, 4
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
- ; x_offset == 0.5 && y_offset == 0
- .x_half_y_zero_loop:
- %if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [refq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
- %if %2 == 1 ; avg
- pavgb m0, [second_predq]
- %endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m4, [srcq+1]
- %if %2 == 1 ; avg
- %if %1 > 4
- movhps m0, [srcq+src_strideq]
- movhps m4, [srcq+src_strideq+1]
- %else ; 4xh
- movx m1, [srcq+src_strideq]
- punpckldq m0, m1
- movx m2, [srcq+src_strideq+1]
- punpckldq m4, m2
- %endif
- movx m1, [refq]
- movx m3, [refq+ref_strideq]
- pavgb m0, m4
- punpcklbw m3, m5
- %if %1 > 4
- pavgb m0, [second_predq]
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else ; 4xh
- movh m2, [second_predq]
- pavgb m0, m2
- punpcklbw m1, m5
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %else ; !avg
- movx m2, [srcq+src_strideq]
- movx m1, [refq]
- pavgb m0, m4
- movx m4, [srcq+src_strideq+1]
- movx m3, [refq+ref_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_half_y_zero_loop
- STORE_AND_RET %1
- .x_half_y_nonzero:
- cmp y_offsetd, 4
- jne .x_half_y_nonhalf
- ; x_offset == 0.5 && y_offset == 0.5
- %if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
- .x_half_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [refq]
- pavgb m4, m3
- punpckhbw m3, m1, m5
- pavgb m0, m4
- %if %2 == 1 ; avg
- punpcklbw m1, m5
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
- .x_half_y_half_loop:
- movx m2, [srcq]
- movx m3, [srcq+1]
- %if %2 == 1 ; avg
- %if %1 > 4
- movhps m2, [srcq+src_strideq]
- movhps m3, [srcq+src_strideq+1]
- %else
- movx m1, [srcq+src_strideq]
- punpckldq m2, m1
- movx m1, [srcq+src_strideq+1]
- punpckldq m3, m1
- %endif
- pavgb m2, m3
- %if %1 > 4
- movlhps m0, m2
- movhlps m4, m2
- %else ; 4xh
- punpckldq m0, m2
- pshuflw m4, m2, 0xe
- %endif
- movx m1, [refq]
- pavgb m0, m2
- movx m3, [refq+ref_strideq]
- %if %1 > 4
- pavgb m0, [second_predq]
- %else
- movh m2, [second_predq]
- pavgb m0, m2
- %endif
- punpcklbw m3, m5
- punpcklbw m1, m5
- %if %1 > 4
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %else ; !avg
- movx m4, [srcq+src_strideq]
- movx m1, [srcq+src_strideq+1]
- pavgb m2, m3
- pavgb m4, m1
- pavgb m0, m2
- pavgb m2, m4
- movx m1, [refq]
- movx m3, [refq+ref_strideq]
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_half_y_half_loop
- STORE_AND_RET %1
- .x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
- %if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
- %endif
- shl y_offsetd, filter_idx_shift
- %if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+y_offsetq]
- %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
- %endif
- mova m10, [GLOBAL(pw_8)]
- %define filter_y_a m8
- %define filter_y_b m9
- %define filter_rnd m10
- %else ; x86-32
- %if ARCH_X86=1 && CONFIG_PIC=1
- ; x_offset == 0.5. We can reuse x_offset reg
- %define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
- %define filter_y_a [y_offsetq]
- %define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
- %define filter_rnd [tempq]
- %else
- add y_offsetq, bilin_filter
- %define filter_y_a [y_offsetq]
- %define filter_y_b [y_offsetq+16]
- %define filter_rnd [GLOBAL(pw_8)]
- %endif
- %endif
- %if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
- .x_half_y_other_loop:
- movu m4, [srcq]
- movu m2, [srcq+1]
- mova m1, [refq]
- pavgb m4, m2
- %if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
- %else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- punpcklbw m0, m5
- paddw m2, m3
- punpcklbw m3, m4, m5
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
- %endif
- punpckhbw m3, m1, m5
- psraw m0, 4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
- %if notcpuflag(ssse3)
- punpcklbw m0, m5
- %endif
- .x_half_y_other_loop:
- movx m2, [srcq]
- movx m1, [srcq+1]
- movx m4, [srcq+src_strideq]
- movx m3, [srcq+src_strideq+1]
- pavgb m2, m1
- pavgb m4, m3
- movx m3, [refq+ref_strideq]
- %if cpuflag(ssse3)
- movx m1, [refq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
- %else
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- paddw m0, m1
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m2, m1
- movx m1, [refq]
- %endif
- psraw m0, 4
- psraw m2, 4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- %if %1 == 4
- movlhps m0, m2
- %endif
- packuswb m0, m2
- %if %1 > 4
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else
- movh m2, [second_predq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_half_y_other_loop
- %undef filter_y_a
- %undef filter_y_b
- %undef filter_rnd
- STORE_AND_RET %1
- .x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
- ; x_offset == bilin interpolation && y_offset == 0
- %if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
- %endif
- shl x_offsetd, filter_idx_shift
- %if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+x_offsetq]
- %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
- %endif
- mova m10, [GLOBAL(pw_8)]
- %define filter_x_a m8
- %define filter_x_b m9
- %define filter_rnd m10
- %else ; x86-32
- %if ARCH_X86=1 && CONFIG_PIC=1
- ; y_offset == 0. We can reuse y_offset reg.
- %define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
- %define filter_x_a [x_offsetq]
- %define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
- %define filter_rnd [tempq]
- %else
- add x_offsetq, bilin_filter
- %define filter_x_a [x_offsetq]
- %define filter_x_b [x_offsetq+16]
- %define filter_rnd [GLOBAL(pw_8)]
- %endif
- %endif
- .x_other_y_zero_loop:
- %if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [refq]
- %if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- %else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- pmullw m0, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
- %endif
- psraw m2, 4
- psraw m0, 4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m1, [srcq+1]
- movx m2, [srcq+src_strideq]
- movx m4, [srcq+src_strideq+1]
- movx m3, [refq+ref_strideq]
- %if cpuflag(ssse3)
- punpcklbw m0, m1
- movx m1, [refq]
- punpcklbw m2, m4
- pmaddubsw m0, filter_x_a
- pmaddubsw m2, filter_x_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
- %else
- punpcklbw m0, m5
- punpcklbw m1, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, m1
- paddw m2, filter_rnd
- movx m1, [refq]
- paddw m2, m4
- %endif
- psraw m0, 4
- psraw m2, 4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- %if %1 == 4
- movlhps m0, m2
- %endif
- packuswb m0, m2
- %if %1 > 4
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else
- movh m2, [second_predq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_other_y_zero_loop
- %undef filter_x_a
- %undef filter_x_b
- %undef filter_rnd
- STORE_AND_RET %1
- .x_nonhalf_y_nonzero:
- cmp y_offsetd, 4
- jne .x_nonhalf_y_nonhalf
- ; x_offset == bilin interpolation && y_offset == 0.5
- %if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
- %endif
- shl x_offsetd, filter_idx_shift
- %if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+x_offsetq]
- %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
- %endif
- mova m10, [GLOBAL(pw_8)]
- %define filter_x_a m8
- %define filter_x_b m9
- %define filter_rnd m10
- %else ; x86-32
- %if ARCH_X86=1 && CONFIG_PIC=1
- ; y_offset == 0.5. We can reuse y_offset reg.
- %define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
- %define filter_x_a [x_offsetq]
- %define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
- %define filter_rnd [tempq]
- %else
- add x_offsetq, bilin_filter
- %define filter_x_a [x_offsetq]
- %define filter_x_b [x_offsetq+16]
- %define filter_rnd [GLOBAL(pw_8)]
- %endif
- %endif
- %if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
- %if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- %else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
- %endif
- psraw m0, 4
- psraw m2, 4
- add srcq, src_strideq
- packuswb m0, m2
- .x_other_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
- %if cpuflag(ssse3)
- mova m1, [refq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- pavgb m0, m4
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- %else
- punpckhbw m2, m4, m5
- punpckhbw m1, m3, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- paddw m4, m3
- paddw m2, m1
- mova m1, [refq]
- psraw m4, 4
- psraw m2, 4
- punpckhbw m3, m1, m5
- ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
- ; have a 1-register shortage that prevents storing the backup of the bilin-
- ; filtered second line as words, as a cache for the next line. Packing into
- ; bytes costs 1 pack and 2 unpacks, but saves a register.
- packuswb m4, m2
- punpcklbw m1, m5
- pavgb m0, m4
- %endif
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- pavgb m0, [second_predq]
- %endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- add srcq, src_strideq
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m1, [srcq+1]
- %if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
- %else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
- %endif
- add srcq, src_strideq
- psraw m0, 4
- .x_other_y_half_loop:
- movx m2, [srcq]
- movx m1, [srcq+1]
- movx m4, [srcq+src_strideq]
- movx m3, [srcq+src_strideq+1]
- %if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movx m1, [refq]
- movx m3, [refq+ref_strideq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- %else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- movx m1, [refq]
- paddw m4, m3
- movx m3, [refq+ref_strideq]
- %endif
- psraw m2, 4
- psraw m4, 4
- pavgw m0, m2
- pavgw m2, m4
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline - also consider going to bytes here
- %if %1 == 4
- movlhps m0, m2
- %endif
- packuswb m0, m2
- %if %1 > 4
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else
- movh m2, [second_predq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %endif
- punpcklbw m3, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- lea srcq, [srcq+src_strideq*2]
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_other_y_half_loop
- %undef filter_x_a
- %undef filter_x_b
- %undef filter_rnd
- STORE_AND_RET %1
- .x_nonhalf_y_nonhalf:
- %if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
- %endif
- shl x_offsetd, filter_idx_shift
- shl y_offsetd, filter_idx_shift
- %if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+x_offsetq]
- %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
- %endif
- mova m10, [bilin_filter+y_offsetq]
- %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m11, [bilin_filter+y_offsetq+16]
- %endif
- mova m12, [GLOBAL(pw_8)]
- %define filter_x_a m8
- %define filter_x_b m9
- %define filter_y_a m10
- %define filter_y_b m11
- %define filter_rnd m12
- %else ; x86-32
- %if ARCH_X86=1 && CONFIG_PIC=1
- ; In this case there is NO unused register. We use the src_stride register;
- ; later, src_stride has to be reloaded from the stack when it is needed.
- %define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
- %define filter_x_a [x_offsetq]
- %define filter_x_b [x_offsetq+16]
- %define filter_y_a [y_offsetq]
- %define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
- %define filter_rnd [tempq]
- %else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
- %define filter_x_a [x_offsetq]
- %define filter_x_b [x_offsetq+16]
- %define filter_y_a [y_offsetq]
- %define filter_y_b [y_offsetq+16]
- %define filter_rnd [GLOBAL(pw_8)]
- %endif
- %endif
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
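- ; Both offsets are fractional here, so each iteration horizontally filters
- ; one new source row and vertically blends it with the previous filtered
- ; row carried in m0 (m4 -> m0 at the bottom of the loop), i.e. roughly:
- ;   h[i]   = hfilter(src_row(i));     /* x phase */
- ;   out[i] = vfilter(h[i], h[i+1]);   /* y phase */
- ; so every source row is horizontally filtered only once.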
- %if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
- %if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- %else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
- %endif
- psraw m0, 4
- psraw m2, 4
- INC_SRC_BY_SRC_STRIDE
- packuswb m0, m2
- .x_other_y_other_loop:
- %if cpuflag(ssse3)
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [refq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- punpckhbw m3, m1, m5
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- punpcklbw m1, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
- psraw m0, 4
- %else
- movu m3, [srcq]
- movu m4, [srcq+1]
- punpckhbw m1, m3, m5
- punpckhbw m2, m4, m5
- punpcklbw m3, m5
- punpcklbw m4, m5
- pmullw m3, filter_x_a
- pmullw m4, filter_x_b
- paddw m3, filter_rnd
- pmullw m1, filter_x_a
- pmullw m2, filter_x_b
- paddw m1, filter_rnd
- paddw m3, m4
- paddw m1, m2
- psraw m3, 4
- psraw m1, 4
- packuswb m4, m3, m1
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- pmullw m2, filter_y_a
- pmullw m1, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, m1
- mova m1, [refq]
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
- punpckhbw m3, m1, m5
- psraw m0, 4
- punpcklbw m1, m5
- %endif
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- INC_SRC_BY_SRC_STRIDE
- add refq, ref_strideq
- %else ; %1 < 16
- movx m0, [srcq]
- movx m1, [srcq+1]
- %if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
- %else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
- %endif
- psraw m0, 4
- %if cpuflag(ssse3)
- packuswb m0, m0
- %endif
- INC_SRC_BY_SRC_STRIDE
- .x_other_y_other_loop:
- movx m2, [srcq]
- movx m1, [srcq+1]
- INC_SRC_BY_SRC_STRIDE
- movx m4, [srcq]
- movx m3, [srcq+1]
- %if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movx m3, [refq+ref_strideq]
- movx m1, [refq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m2, m2
- packuswb m4, m4
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
- psraw m0, 4
- psraw m2, 4
- punpcklbw m1, m5
- %else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- paddw m4, m3
- psraw m2, 4
- psraw m4, 4
- pmullw m0, filter_y_a
- pmullw m3, m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m0, m3
- movx m3, [refq+ref_strideq]
- paddw m2, m1
- movx m1, [refq]
- psraw m0, 4
- psraw m2, 4
- punpcklbw m3, m5
- punpcklbw m1, m5
- %endif
- %if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- %if %1 == 4
- movlhps m0, m2
- %endif
- packuswb m0, m2
- %if %1 > 4
- pavgb m0, [second_predq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- %else
- movh m2, [second_predq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
- %endif
- %endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
- INC_SRC_BY_SRC_STRIDE
- lea refq, [refq+ref_strideq*2]
- %endif
- %if %2 == 1 ; avg
- add second_predq, second_str
- %endif
- dec block_height
- jg .x_other_y_other_loop
- %undef filter_x_a
- %undef filter_x_b
- %undef filter_y_a
- %undef filter_y_b
- %undef filter_rnd
- %undef movx
- STORE_AND_RET %1
- %endmacro
- ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are
- ; identical between the ssse3 and non-ssse3 versions. It may make sense to
- ; merge their code, in the sense that the ssse3 version would jump to the
- ; appropriate location in the sse2 version rather than duplicating that
- ; code in the binary.
- INIT_XMM sse2
- SUBPEL_VARIANCE 4
- SUBPEL_VARIANCE 8
- SUBPEL_VARIANCE 16
- INIT_XMM ssse3
- SUBPEL_VARIANCE 4
- SUBPEL_VARIANCE 8
- SUBPEL_VARIANCE 16
- INIT_XMM sse2
- SUBPEL_VARIANCE 4, 1
- SUBPEL_VARIANCE 8, 1
- SUBPEL_VARIANCE 16, 1
- INIT_XMM ssse3
- SUBPEL_VARIANCE 4, 1
- SUBPEL_VARIANCE 8, 1
- SUBPEL_VARIANCE 16, 1
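- ; For reference, the C wrappers that sit on top of these helpers derive the
- ; variance from the returned sum and stored sse; a hedged sketch (the exact
- ; wrapper names and shapes live in vpx_dsp's C code, not in this file):
- ;   unsigned int sse;
- ;   int se = vpx_sub_pixel_variance16xh_sse2(src, src_stride, x_off, y_off,
- ;                                            ref, ref_stride, h, &sse);
- ;   unsigned int var = sse - (unsigned int)(((int64_t)se * se) / (16 * h));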