- ;*****************************************************************************
- ;* x86-optimized functions for removegrain filter
- ;*
- ;* Copyright (C) 2015 James Darnley
- ;*
- ;* This file is part of FFmpeg.
- ;*
- ;* FFmpeg is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* FFmpeg is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License along
- ;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
- ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- ;*****************************************************************************
- ; column: -1 0 +1
- ; row -1: a1 a2 a3
- ; row 0: a4 c a5
- ; row +1: a6 a7 a8
- %include "libavutil/x86/x86util.asm"
- SECTION_RODATA 32
- pw_4: times 16 dw 4
- pw_8: times 16 dw 8
- pw_div9: times 16 dw ((1<<16)+4)/9
- SECTION .text
- ;*** Preprocessor helpers
- %define a1 srcq+stride_n-1
- %define a2 srcq+stride_n
- %define a3 srcq+stride_n+1
- %define a4 srcq-1
- %define c srcq
- %define a5 srcq+1
- %define a6 srcq+stride_p-1
- %define a7 srcq+stride_p
- %define a8 srcq+stride_p+1
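- ; Every kernel defines stride_p (the forward stride) and stride_n (its
- ; negation) before its loop, so the defines above address the 3x3
- ; neighbourhood around the centre pixel. In scalar terms, with stride
- ; standing for stride_p:
- ;     a1 = src[-stride - 1]   a2 = src[-stride]   a3 = src[-stride + 1]
- ;     a4 = src[-1]            c  = src[0]         a5 = src[+1]
- ;     a6 = src[+stride - 1]   a7 = src[+stride]   a8 = src[+stride + 1]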
- ; %1 dest simd register
- ; %2 source memory location
- ; %3 zero location (simd register/memory)
- %macro LOAD 3
- movh %1, %2
- punpcklbw %1, %3
- %endmacro
- %macro LOAD_SQUARE 0
- movu m1, [a1]
- movu m2, [a2]
- movu m3, [a3]
- movu m4, [a4]
- movu m0, [c]
- movu m5, [a5]
- movu m6, [a6]
- movu m7, [a7]
- movu m8, [a8]
- %endmacro
- ; %1 zero location (simd register/memory)
- %macro LOAD_SQUARE_16 1
- LOAD m1, [a1], %1
- LOAD m2, [a2], %1
- LOAD m3, [a3], %1
- LOAD m4, [a4], %1
- LOAD m0, [c], %1
- LOAD m5, [a5], %1
- LOAD m6, [a6], %1
- LOAD m7, [a7], %1
- LOAD m8, [a8], %1
- %endmacro
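- ; After LOAD_SQUARE, m0 holds the centre pixels and m1..m8 hold a1..a8 as
- ; packed unsigned bytes. LOAD_SQUARE_16 loads the same layout but only the
- ; low mmsize/2 pixels, widened to words so that intermediate sums and
- ; differences cannot overflow 8 bits.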
- ; %1 data type
- ; %2 simd register to hold minimums
- ; %3 simd register to hold maximums
- ; %4 temp location (simd register/memory)
- %macro SORT_PAIR 4
- mova %4, %2
- pmin%1 %2, %3
- pmax%1 %3, %4
- %endmacro
- %macro SORT_AXIS 0
- SORT_PAIR ub, m1, m8, m9
- SORT_PAIR ub, m2, m7, m10
- SORT_PAIR ub, m3, m6, m11
- SORT_PAIR ub, m4, m5, m12
- %endmacro
- %macro SORT_AXIS_16 0
- SORT_PAIR sw, m1, m8, m9
- SORT_PAIR sw, m2, m7, m10
- SORT_PAIR sw, m3, m6, m11
- SORT_PAIR sw, m4, m5, m12
- %endmacro
- ; The loop doesn't need to do all the iterations. It could stop when the right
- ; pixels are in the right registers.
- %macro SORT_SQUARE 0
- %assign k 7
- %rep 7
- %assign i 1
- %assign j 2
- %rep k
- SORT_PAIR ub, m %+ i , m %+ j , m9
- %assign i i+1
- %assign j j+1
- %endrep
- %assign k k-1
- %endrep
- %endmacro
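- ; Expanded, SORT_SQUARE is a bubble-sort network over m1..m8; afterwards the
- ; eight neighbours are in ascending order (m1 smallest, m8 largest), roughly:
- ;     for (k = 7; k >= 1; k--)
- ;         for (i = 1; i <= k; i++)
- ;             if (m[i] > m[i+1]) swap(m[i], m[i+1]);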
- ; %1 dest simd register
- ; %2 source (simd register/memory)
- ; %3 temp simd register
- %macro ABS_DIFF 3
- mova %3, %2
- psubusb %3, %1
- psubusb %1, %2
- por %1, %3
- %endmacro
- ; %1 dest simd register
- ; %2 source (simd register/memory)
- ; %3 temp simd register
- %macro ABS_DIFF_W 3
- mova %3, %2
- psubusw %3, %1
- psubusw %1, %2
- por %1, %3
- %endmacro
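- ; Both ABS_DIFF variants rely on unsigned saturation: one of the two
- ; subtractions clamps to zero and the other yields the true difference, so
- ; OR-ing them gives the absolute difference, i.e.
- ;     |a - b| = max(a - b, 0) | max(b - a, 0)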
- ; %1 simd register that holds the "false" values and will hold the result
- ; %2 simd register that holds the "true" values
- ; %3 location (simd register/memory) that holds the mask
- %macro BLEND 3
- %if cpuflag(avx2)
- vpblendvb %1, %1, %2, %3
- %else
- pand %2, %3
- pandn %3, %1
- por %3, %2
- SWAP %1, %3
- %endif
- %endmacro
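- ; BLEND is a per-byte select: for each byte, result = mask ? %2 : %1.
- ; The SSE2 path needs every mask byte to be 0x00 or 0xff (as pcmpeq*
- ; produces); vpblendvb only examines the top bit of each byte.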
- ; Functions
- INIT_XMM sse2
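- ; Mode 1: clip the centre pixel to the full range of its eight neighbours.
- ; Per pixel, in scalar terms:
- ;     dst = clip(c, min(a1..a8), max(a1..a8))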
- cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- movu m0, [a1]
- mova m1, m0
- movu m2, [a2]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [a3]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [a4]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [a5]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [a6]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [a7]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [a8]
- pmaxub m0, m2
- pminub m1, m2
- movu m2, [c]
- pminub m2, m0
- pmaxub m2, m1
- movu [dstq], m2
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- %if ARCH_X86_64
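- ; Modes 2, 3 and 4: fully sort the eight neighbours and clip the centre
- ; pixel to progressively tighter ranges:
- ;     mode 2: dst = clip(c, 2nd smallest, 2nd largest)
- ;     mode 3: dst = clip(c, 3rd smallest, 3rd largest)
- ;     mode 4: dst = clip(c, 4th smallest, 5th smallest)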
- cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- SORT_SQUARE
- CLIPUB m0, m2, m7
- movu [dstq], m0
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- SORT_SQUARE
- CLIPUB m0, m3, m6
- movu [dstq], m0
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- SORT_SQUARE
- CLIPUB m0, m4, m5
- movu [dstq], m0
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
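- ; Mode 5: clip the centre pixel to each opposing pair of neighbours,
- ;     clip_i = clip(c, min(pair_i), max(pair_i)),  c_i = |c - clip_i|
- ; with pairs (a1,a8), (a2,a7), (a3,a6), (a4,a5), and return the clip_i
- ; that changes c the least (smallest c_i).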
- cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- SORT_AXIS
- mova m9, m0
- mova m10, m0
- mova m11, m0
- mova m12, m0
- CLIPUB m9, m1, m8
- CLIPUB m10, m2, m7
- CLIPUB m11, m3, m6
- CLIPUB m12, m4, m5
- mova m8, m9 ; clip1
- mova m7, m10 ; clip2
- mova m6, m11 ; clip3
- mova m5, m12 ; clip4
- ABS_DIFF m9, m0, m1 ; c1
- ABS_DIFF m10, m0, m2 ; c2
- ABS_DIFF m11, m0, m3 ; c3
- ABS_DIFF m12, m0, m4 ; c4
- pminub m9, m10
- pminub m9, m11
- pminub m9, m12 ; mindiff
- pcmpeqb m10, m9
- pcmpeqb m11, m9
- pcmpeqb m12, m9
- ; Notice the blend order here: c1, c3, c2, c4.  Later blends overwrite
- ; earlier ones, so when several differences tie for mindiff the result
- ; favours c4, then c2, then c3, then c1.
- BLEND m8, m6, m11
- BLEND m8, m7, m10
- BLEND m8, m5, m12
- movu [dstq], m8
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
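- ; Mode 6: like mode 5 but the choice also weighs each pair's spread.  Per
- ; pixel, in word precision:
- ;     clip_i = clip(c, min(pair_i), max(pair_i))
- ;     c_i = 2*|c - clip_i| + (max(pair_i) - min(pair_i))
- ; and the clip_i with the smallest c_i is returned.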
- cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- ; Some register saving suggestions: the zero can be somewhere other than a
- ; register, the center pixels could be on the stack.
- pxor m15, m15
- .loop:
- LOAD_SQUARE_16 m15
- SORT_AXIS_16
- mova m9, m0
- mova m10, m0
- mova m11, m0
- mova m12, m0
- CLIPW m9, m1, m8 ; clip1
- CLIPW m10, m2, m7 ; clip2
- CLIPW m11, m3, m6 ; clip3
- CLIPW m12, m4, m5 ; clip4
- psubw m8, m1 ; d1
- psubw m7, m2 ; d2
- psubw m6, m3 ; d3
- psubw m5, m4 ; d4
- mova m1, m9
- mova m2, m10
- mova m3, m11
- mova m4, m12
- ABS_DIFF_W m1, m0, m13
- ABS_DIFF_W m2, m0, m14
- ABS_DIFF_W m3, m0, m13
- ABS_DIFF_W m4, m0, m14
- psllw m1, 1
- psllw m2, 1
- psllw m3, 1
- psllw m4, 1
- paddw m1, m8 ; c1
- paddw m2, m7 ; c2
- paddw m3, m6 ; c3
- paddw m4, m5 ; c4
- ; As the differences (d1..d4) can only be positive, there is no need to
- ; clip to zero. Also, the maximum positive value is less than 768
- ; (2*255 + 255 = 765), so the signed word operations below are safe.
- pminsw m1, m2
- pminsw m1, m3
- pminsw m1, m4
- pcmpeqw m2, m1
- pcmpeqw m3, m1
- pcmpeqw m4, m1
- BLEND m9, m11, m3
- BLEND m9, m10, m2
- BLEND m9, m12, m4
- packuswb m9, m9
- movh [dstq], m9
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
- ; This is just copy-pasted straight from mode 6 with the left shifts removed.
- cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- ; Can this be done without unpacking?
- pxor m15, m15
- .loop:
- LOAD_SQUARE_16 m15
- SORT_AXIS_16
- mova m9, m0
- mova m10, m0
- mova m11, m0
- mova m12, m0
- CLIPW m9, m1, m8 ; clip1
- CLIPW m10, m2, m7 ; clip2
- CLIPW m11, m3, m6 ; clip3
- CLIPW m12, m4, m5 ; clip4
- psubw m8, m1 ; d1
- psubw m7, m2 ; d2
- psubw m6, m3 ; d3
- psubw m5, m4 ; d4
- mova m1, m9
- mova m2, m10
- mova m3, m11
- mova m4, m12
- ABS_DIFF_W m1, m0, m13
- ABS_DIFF_W m2, m0, m14
- ABS_DIFF_W m3, m0, m13
- ABS_DIFF_W m4, m0, m14
- paddw m1, m8 ; c1
- paddw m2, m7 ; c2
- paddw m3, m6 ; c3
- paddw m4, m5 ; c4
- pminsw m1, m2
- pminsw m1, m3
- pminsw m1, m4
- pcmpeqw m2, m1
- pcmpeqw m3, m1
- pcmpeqw m4, m1
- BLEND m9, m11, m3
- BLEND m9, m10, m2
- BLEND m9, m12, m4
- packuswb m9, m9
- movh [dstq], m9
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
- ; This is just copy-pasted straight from mode 6, except the left shifts are
- ; applied to the differences (d1..d4) instead of the absolute differences.
- cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m15, m15
- .loop:
- LOAD_SQUARE_16 m15
- SORT_AXIS_16
- mova m9, m0
- mova m10, m0
- mova m11, m0
- mova m12, m0
- CLIPW m9, m1, m8 ; clip1
- CLIPW m10, m2, m7 ; clip2
- CLIPW m11, m3, m6 ; clip3
- CLIPW m12, m4, m5 ; clip4
- psubw m8, m1 ; d1
- psubw m7, m2 ; d2
- psubw m6, m3 ; d3
- psubw m5, m4 ; d4
- psllw m8, 1
- psllw m7, 1
- psllw m6, 1
- psllw m5, 1
- mova m1, m9
- mova m2, m10
- mova m3, m11
- mova m4, m12
- ABS_DIFF_W m1, m0, m13
- ABS_DIFF_W m2, m0, m14
- ABS_DIFF_W m3, m0, m13
- ABS_DIFF_W m4, m0, m14
- paddw m1, m8 ; c1
- paddw m2, m7 ; c2
- paddw m3, m6 ; c3
- paddw m4, m5 ; c4
- ; As the differences (d1..d4) can only be positive, there is no need to
- ; clip to zero. Also, the maximum positive value is less than 768
- ; (255 + 2*255 = 765), so the signed word operations below are safe.
- pminsw m1, m2
- pminsw m1, m3
- pminsw m1, m4
- pcmpeqw m2, m1
- pcmpeqw m3, m1
- pcmpeqw m4, m1
- BLEND m9, m11, m3
- BLEND m9, m10, m2
- BLEND m9, m12, m4
- packuswb m9, m9
- movh [dstq], m9
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
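- ; Mode 9: clip the centre pixel to the opposing pair with the smallest
- ; spread, i.e. dst = clip(c, min(pair_k), max(pair_k)) for the pair k
- ; minimising max(pair_k) - min(pair_k).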
- cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- SORT_AXIS
- mova m9, m0
- mova m10, m0
- mova m11, m0
- mova m12, m0
- CLIPUB m9, m1, m8 ; clip1
- CLIPUB m10, m2, m7 ; clip2
- CLIPUB m11, m3, m6 ; clip3
- CLIPUB m12, m4, m5 ; clip4
- psubb m8, m1 ; d1
- psubb m7, m2 ; d2
- psubb m6, m3 ; d3
- psubb m5, m4 ; d4
- pminub m8, m7
- pminub m8, m6
- pminub m8, m5
- pcmpeqb m7, m8
- pcmpeqb m6, m8
- pcmpeqb m5, m8
- BLEND m9, m11, m6
- BLEND m9, m10, m7
- BLEND m9, m12, m5
- movu [dstq], m9
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- %endif
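- ; Mode 10: replace the centre pixel with whichever neighbour is closest to
- ; it in value, i.e. the a_i minimising |a_i - c|.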
- cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- movu m0, [c]
- movu m1, [a4]
- mova m2, m1
- ABS_DIFF m1, m0, m7
- movu m3, [a5] ; load pixel
- mova m4, m3
- ABS_DIFF m4, m0, m7 ; absolute difference from center
- pminub m1, m4 ; mindiff
- pcmpeqb m4, m1 ; if (difference == mindiff)
- BLEND m2, m3, m4 ; return pixel
- movu m5, [a1]
- mova m6, m5
- ABS_DIFF m6, m0, m7
- pminub m1, m6
- pcmpeqb m6, m1
- BLEND m2, m5, m6
- movu m3, [a3]
- mova m4, m3
- ABS_DIFF m4, m0, m7
- pminub m1, m4
- pcmpeqb m4, m1
- BLEND m2, m3, m4
- movu m5, [a2]
- mova m6, m5
- ABS_DIFF m6, m0, m7
- pminub m1, m6
- pcmpeqb m6, m1
- BLEND m2, m5, m6
- movu m3, [a6]
- mova m4, m3
- ABS_DIFF m4, m0, m7
- pminub m1, m4
- pcmpeqb m4, m1
- BLEND m2, m3, m4
- movu m5, [a8]
- mova m6, m5
- ABS_DIFF m6, m0, m7
- pminub m1, m6
- pcmpeqb m6, m1
- BLEND m2, m5, m6
- movu m3, [a7]
- mova m4, m3
- ABS_DIFF m4, m0, m7
- pminub m1, m4
- pcmpeqb m4, m1
- BLEND m2, m3, m4
- movu [dstq], m2
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
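- ; Modes 11 and 12 (identical): 3x3 weighted blur,
- ;     dst = (4*c + 2*(a2+a4+a5+a7) + (a1+a3+a6+a8) + 8) >> 4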
- cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m0, m0
- .loop:
- LOAD m1, [c], m0
- LOAD m2, [a2], m0
- LOAD m3, [a4], m0
- LOAD m4, [a5], m0
- LOAD m5, [a7], m0
- psllw m1, 2
- paddw m2, m3
- paddw m4, m5
- paddw m2, m4
- psllw m2, 1
- LOAD m3, [a1], m0
- LOAD m4, [a3], m0
- LOAD m5, [a6], m0
- LOAD m6, [a8], m0
- paddw m1, m2
- paddw m3, m4
- paddw m5, m6
- paddw m1, m3
- paddw m1, m5
- paddw m1, [pw_8]
- psraw m1, 4
- packuswb m1, m1
- movh [dstq], m1
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
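- ; Modes 13 and 14 (identical): interpolate the centre pixel from the rows
- ; above and below.  Of the pairs (a1,a8), (a3,a6), (a2,a7), take the one
- ; with the smallest absolute difference and output its rounded average,
- ; e.g. (a2 + a7 + 1) >> 1.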
- cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- movu m1, [a1]
- movu m2, [a8]
- mova m0, m1
- pavgb m1, m2
- ABS_DIFF m0, m2, m6
- movu m3, [a3]
- movu m4, [a6]
- mova m5, m3
- pavgb m3, m4
- ABS_DIFF m5, m4, m7
- pminub m0, m5
- pcmpeqb m5, m0
- BLEND m1, m3, m5
- movu m2, [a2]
- movu m3, [a7]
- mova m4, m2
- pavgb m2, m3
- ABS_DIFF m4, m3, m6
- pminub m0, m4
- pcmpeqb m4, m0
- BLEND m1, m2, m4
- movu [dstq], m1
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- %if ARCH_X86_64
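- ; Modes 15 and 16 (identical): compute the weighted vertical average
- ;     avg = (2*(a2+a7) + a1 + a3 + a6 + a8 + 4) >> 3
- ; and clip it to the range of the pair among (a1,a8), (a2,a7), (a3,a6)
- ; with the smallest absolute difference.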
- cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m15, m15
- .loop:
- LOAD_SQUARE_16 m15
- mova m9, m1
- mova m10, m2
- mova m11, m3
- ABS_DIFF_W m9, m8, m12
- ABS_DIFF_W m10, m7, m13
- ABS_DIFF_W m11, m6, m14
- pminsw m9, m10
- pminsw m9, m11
- pcmpeqw m10, m9
- pcmpeqw m11, m9
- mova m12, m2
- mova m13, m1
- mova m14, m6
- paddw m12, m7
- psllw m12, 1
- paddw m13, m3
- paddw m14, m8
- paddw m12, [pw_4]
- paddw m13, m14
- paddw m12, m13
- psrlw m12, 3
- SORT_PAIR ub, m1, m8, m0
- SORT_PAIR ub, m2, m7, m9
- SORT_PAIR ub, m3, m6, m14
- mova m4, m12
- mova m5, m12
- CLIPW m4, m1, m8
- CLIPW m5, m2, m7
- CLIPW m12, m3, m6
- BLEND m4, m12, m11
- BLEND m4, m5, m10
- packuswb m4, m4
- movh [dstq], m4
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
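- ; Mode 17: clip the centre pixel between the largest of the pair minima and
- ; the smallest of the pair maxima (swapped if they cross):
- ;     l = max_i(min(pair_i)),  u = min_i(max(pair_i))
- ;     dst = clip(c, min(l, u), max(l, u))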
- cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- SORT_AXIS
- pmaxub m1, m2
- pmaxub m3, m4
- pminub m8, m7
- pminub m5, m6
- pmaxub m1, m3
- pminub m8, m5
- mova m2, m1
- pminub m1, m8
- pmaxub m8, m2
- CLIPUB m0, m1, m8
- movu [dstq], m0
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
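- ; Mode 18: for each opposing pair take the larger of its two distances from
- ; the centre, d_i = max(|a - c|, |b - c|) over the pair (a, b), and clip c
- ; to the [min, max] range of the pair with the smallest d_i.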
- cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- LOAD_SQUARE
- mova m9, m1
- mova m10, m8
- ABS_DIFF m9, m0, m11
- ABS_DIFF m10, m0, m12
- pmaxub m9, m10 ; m9 = d1
- mova m10, m2
- mova m11, m7
- ABS_DIFF m10, m0, m12
- ABS_DIFF m11, m0, m13
- pmaxub m10, m11 ; m10 = d2
- mova m11, m3
- mova m12, m6
- ABS_DIFF m11, m0, m13
- ABS_DIFF m12, m0, m14
- pmaxub m11, m12 ; m11 = d3
- mova m12, m4
- mova m13, m5
- ABS_DIFF m12, m0, m14
- ABS_DIFF m13, m0, m15
- pmaxub m12, m13 ; m12 = d4
- mova m13, m9
- pminub m13, m10
- pminub m13, m11
- pminub m13, m12 ; m13 = mindiff
- pcmpeqb m10, m13
- pcmpeqb m11, m13
- pcmpeqb m12, m13
- mova m13, m0
- mova m14, m1
- pminub m1, m8
- pmaxub m8, m14
- CLIPUB m13, m1, m8 ; m13 = ret...d1
- mova m14, m0
- mova m15, m3
- pminub m3, m6
- pmaxub m6, m15
- CLIPUB m14, m3, m6
- pand m14, m11
- pandn m11, m13
- por m14, m11 ; m14 = ret...d3
- mova m15, m0
- mova m1, m2
- pminub m2, m7
- pmaxub m7, m1
- CLIPUB m15, m2, m7
- pand m15, m10
- pandn m10, m14
- por m15, m10 ; m15 = ret...d2
- mova m1, m0
- mova m2, m4
- pminub m4, m5
- pmaxub m5, m2
- CLIPUB m1, m4, m5
- pand m1, m12
- pandn m12, m15
- por m1, m12 ; m1 = ret...d4
- movu [dstq], m1
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- %endif
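- ; Mode 19: rounded mean of the eight neighbours (the centre pixel itself is
- ; not included):
- ;     dst = (a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + 4) >> 3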
- cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m0, m0
- .loop:
- LOAD m1, [a1], m0
- LOAD m2, [a2], m0
- paddw m1, m2
- LOAD m3, [a3], m0
- LOAD m4, [a4], m0
- paddw m3, m4
- LOAD m5, [a5], m0
- LOAD m6, [a6], m0
- paddw m5, m6
- LOAD m2, [a7], m0
- LOAD m4, [a8], m0
- paddw m2, m4
- paddw m1, m3
- paddw m2, m5
- paddw m1, m2
- paddw m1, [pw_4]
- psraw m1, 3
- packuswb m1, m1
- movh [dstq], m1
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
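- ; Mode 20: rounded mean of all nine pixels,
- ;     dst = (a1 + ... + a8 + c + 4) / 9
- ; with the division by 9 done as a fixed-point multiply: pmulhuw by
- ; pw_div9 = ((1<<16)+4)/9 keeps the high 16 bits of the product.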
- cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m0, m0
- .loop:
- LOAD m1, [a1], m0
- LOAD m2, [a2], m0
- paddw m1, m2
- LOAD m3, [a3], m0
- LOAD m4, [a4], m0
- paddw m3, m4
- LOAD m5, [a5], m0
- LOAD m6, [a6], m0
- paddw m5, m6
- LOAD m2, [a7], m0
- LOAD m4, [a8], m0
- paddw m2, m4
- LOAD m6, [c], m0
- paddw m1, m3
- paddw m2, m5
- paddw m6, [pw_4]
- paddw m1, m2
- paddw m1, m6
- pmulhuw m1, [pw_div9]
- packuswb m1, m1
- movh [dstq], m1
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
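- ; Mode 21: for each opposing pair compute both the rounded-down and the
- ; rounded-up average, then clip the centre pixel between the smallest and
- ; the largest of those averages:
- ;     dst = clip(c, min_i((a+b) >> 1), max_i((a+b+1) >> 1))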
- cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m0, m0
- .loop:
- movu m1, [a1]
- movu m2, [a8]
- pavgb m7, m1, m2
- punpckhbw m3, m1, m0
- punpcklbw m1, m0
- punpckhbw m4, m2, m0
- punpcklbw m2, m0
- paddw m3, m4
- paddw m1, m2
- psrlw m3, 1
- psrlw m1, 1
- packuswb m1, m3
- movu m2, [a2]
- movu m3, [a7]
- pavgb m6, m2, m3
- punpckhbw m4, m2, m0
- punpcklbw m2, m0
- punpckhbw m5, m3, m0
- punpcklbw m3, m0
- paddw m4, m5
- paddw m2, m3
- psrlw m4, 1
- psrlw m2, 1
- packuswb m2, m4
- pminub m1, m2
- pmaxub m7, m6
- movu m2, [a3]
- movu m3, [a6]
- pavgb m6, m2, m3
- punpckhbw m4, m2, m0
- punpcklbw m2, m0
- punpckhbw m5, m3, m0
- punpcklbw m3, m0
- paddw m4, m5
- paddw m2, m3
- psrlw m4, 1
- psrlw m2, 1
- packuswb m2, m4
- pminub m1, m2
- pmaxub m7, m6
- movu m2, [a4]
- movu m3, [a5]
- pavgb m6, m2, m3
- punpckhbw m4, m2, m0
- punpcklbw m2, m0
- punpckhbw m5, m3, m0
- punpcklbw m3, m0
- paddw m4, m5
- paddw m2, m3
- psrlw m4, 1
- psrlw m2, 1
- packuswb m2, m4
- pminub m1, m2
- pmaxub m7, m6
- movu m3, [c]
- CLIPUB m3, m1, m7
- movu [dstq], m3
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
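- ; Mode 22: like mode 21 but using only the rounded-up averages,
- ;     avg_i = (a + b + 1) >> 1 per opposing pair
- ;     dst = clip(c, min_i(avg_i), max_i(avg_i))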
- cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- .loop:
- movu m0, [a1]
- movu m1, [a8]
- pavgb m0, m1
- movu m2, [a2]
- movu m3, [a7]
- pavgb m2, m3
- movu m4, [a3]
- movu m5, [a6]
- pavgb m4, m5
- movu m6, [a4]
- movu m7, [a5]
- pavgb m6, m7
- mova m1, m0
- mova m3, m2
- mova m5, m4
- mova m7, m6
- pminub m0, m2
- pminub m4, m6
- pmaxub m1, m3
- pmaxub m5, m7
- pminub m0, m4
- pmaxub m1, m5
- movu m2, [c]
- CLIPUB m2, m0, m1
- movu [dstq], m2
- add srcq, mmsize
- add dstq, mmsize
- sub pixelsd, mmsize
- jg .loop
- RET
- %if ARCH_X86_64
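- ; Mode 23 (word precision): pull the centre pixel towards the neighbouring
- ; pairs, capped by each pair's spread.  With min_i/max_i per pair and
- ; linediff_i = max_i - min_i:
- ;     d = max(0, max_i(min(min_i - c, linediff_i)))   (amount to raise c)
- ;     u = max(0, max_i(min(c - max_i, linediff_i)))   (amount to lower c)
- ;     dst = c + d - u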
- cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m15, m15
- .loop:
- LOAD_SQUARE_16 m15
- SORT_AXIS_16
- mova m9, m8
- mova m10, m7
- mova m11, m6
- mova m12, m5
- psubw m9, m1 ; linediff1
- psubw m10, m2 ; linediff2
- psubw m11, m3 ; linediff3
- psubw m12, m4 ; linediff4
- psubw m1, m0
- psubw m2, m0
- psubw m3, m0
- psubw m4, m0
- pminsw m1, m9 ; d1
- pminsw m2, m10 ; d2
- pminsw m3, m11 ; d3
- pminsw m4, m12 ; d4
- pmaxsw m1, m2
- pmaxsw m3, m4
- pmaxsw m1, m3
- pmaxsw m1, m15 ; d
- mova m13, m0
- mova m14, m0
- mova m2, m0
- mova m4, m0
- psubw m13, m8
- psubw m14, m7
- psubw m2, m6
- psubw m4, m5
- pminsw m9, m13 ; u1
- pminsw m10, m14 ; u2
- pminsw m11, m2 ; u3
- pminsw m12, m4 ; u4
- pmaxsw m9, m10
- pmaxsw m11, m12
- pmaxsw m9, m11
- pmaxsw m9, m15 ; u
- paddw m0, m1
- psubw m0, m9
- packuswb m0, m0
- movh [dstq], m0
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
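- ; Mode 24: like mode 23 but with a tighter cap.  With td_i = min_i - c and
- ; tu_i = c - max_i:
- ;     d = max(0, max_i(min(td_i, linediff_i - td_i)))
- ;     u = max(0, max_i(min(tu_i, linediff_i - tu_i)))
- ;     dst = c + d - u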
- cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels
- mov r4q, strideq
- neg r4q
- %define stride_p strideq
- %define stride_n r4q
- pxor m15, m15
- .loop:
- LOAD_SQUARE_16 m15
- mova [rsp], m0
- SORT_AXIS_16
- mova m9, m8
- mova m10, m7
- mova m11, m6
- mova m12, m5
- psubw m9, m1 ; linediff1
- psubw m10, m2 ; linediff2
- psubw m11, m3 ; linediff3
- psubw m12, m4 ; linediff4
- psubw m1, [rsp] ; td1
- psubw m2, [rsp] ; td2
- psubw m3, [rsp] ; td3
- psubw m4, [rsp] ; td4
- mova m0, m9
- mova m13, m10
- mova m14, m11
- mova m15, m12
- psubw m0, m1
- psubw m13, m2
- psubw m14, m3
- psubw m15, m4
- pminsw m1, m0 ; d1
- pminsw m2, m13 ; d2
- pminsw m3, m14 ; d3
- pminsw m4, m15 ; d4
- pmaxsw m1, m2
- pmaxsw m3, m4
- mova m0, [rsp]
- mova m13, [rsp]
- mova m14, [rsp]
- mova m15, [rsp]
- psubw m0, m8 ; tu1
- psubw m13, m7 ; tu2
- psubw m14, m6 ; tu3
- psubw m15, m5 ; tu4
- psubw m9, m0
- psubw m10, m13
- psubw m11, m14
- psubw m12, m15
- pminsw m9, m0 ; u1
- pminsw m10, m13 ; u2
- pminsw m11, m14 ; u3
- pminsw m12, m15 ; u4
- pmaxsw m9, m10
- pmaxsw m11, m12
- pmaxsw m1, m3 ; d without max(d,0)
- pmaxsw m9, m11 ; u without max(u,0)
- pxor m15, m15
- pmaxsw m1, m15
- pmaxsw m9, m15
- mova m0, [rsp]
- paddw m0, m1
- psubw m0, m9
- packuswb m0, m0
- movh [dstq], m0
- add srcq, mmsize/2
- add dstq, mmsize/2
- sub pixelsd, mmsize/2
- jg .loop
- RET
- %endif