123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144 |
- ;*****************************************************************************
- ;* x86-optimized functions for overlay filter
- ;*
- ;* Copyright (C) 2018 Paul B Mahol
- ;* Copyright (C) 2018 Henrik Gramner
- ;*
- ;* This file is part of FFmpeg.
- ;*
- ;* FFmpeg is free software; you can redistribute it and/or
- ;* modify it under the terms of the GNU Lesser General Public
- ;* License as published by the Free Software Foundation; either
- ;* version 2.1 of the License, or (at your option) any later version.
- ;*
- ;* FFmpeg is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;* Lesser General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU Lesser General Public
- ;* License along with FFmpeg; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;*****************************************************************************
- %include "libavutil/x86/x86util.asm"
- SECTION_RODATA                           ; read-only constant tables used by the kernels below (section macro comes from the included x86util.asm)
- pb_1: times 16 db 1                      ; 16 bytes of 1: pmaddubsw multiplier that sums adjacent alpha byte pairs (overlay_row_20)
- pw_128: times 8 dw 128                   ; 8 words of 128: rounding bias added before the final multiply
- pw_255: times 8 dw 255                   ; 8 words of 255: XOR mask giving 255 - v for word values 0..255; also the low-byte mask for pandn in overlay_row_22
- pw_257: times 8 dw 257                   ; 8 words of 257: pmulhuw factor, i.e. (v * 257) >> 16
- SECTION .text                            ; code starts here
- INIT_XMM sse4                            ; overlay_row_44: blend a row of s into d in place using one alpha byte per sample; built for XMM registers / SSE4
- cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x   ; named regs: d = row read and written, s = source row, a = alpha row, w = width; da is not referenced, r and x are scratch here. NOTE(review): numeric fields follow the x86inc cglobal convention (presumably 5 args loaded, 7 GPRs, 6 XMM regs, no stack) - confirm against x86inc
- xor xq, xq                               ; x = 0: index of the next sample to process, also the return value
- movsxdifnidn wq, wd                      ; widen the 32-bit w to a full register (x86inc helper; presumably a no-op when wq and wd already coincide)
- mov rq, wq                               ; r = w
- and rq, mmsize/2 - 1                     ; r = w mod (mmsize/2): samples left over after whole vectors (mmsize/2 is 8 with 16-byte XMM registers)
- cmp wq, mmsize/2                         ; is there at least one full vector of samples?
- jl .end                                  ; no (signed compare, so w <= 0 lands here too): return 0 without touching memory
- sub wq, rq                               ; w rounded down to a multiple of mmsize/2 = loop bound
- mova m3, [pw_255]                        ; m3 = 255 in every word
- mova m4, [pw_128]                        ; m4 = 128 in every word
- mova m5, [pw_257]                        ; m5 = 257 in every word
- .loop:                                   ; each iteration blends mmsize/2 samples starting at index x
- pmovzxbw m0, [sq+xq]                     ; m0 = 8 source bytes zero-extended to words
- pmovzxbw m2, [aq+xq]                     ; m2 = 8 alpha bytes zero-extended to words
- pmovzxbw m1, [dq+xq]                     ; m1 = 8 destination bytes zero-extended to words
- pmullw m0, m2                            ; m0 = s * alpha
- pxor m2, m3                              ; m2 = 255 - alpha (XOR with 255 on a 0..255 value)
- pmullw m1, m2                            ; m1 = d * (255 - alpha)
- paddw m0, m4                             ; m0 = s*alpha + 128
- paddw m0, m1                             ; m0 = s*alpha + d*(255 - alpha) + 128; at most 255*255 + 128, so it fits in a word
- pmulhuw m0, m5                           ; m0 = (m0 * 257) >> 16, which approximates division by 255
- packuswb m0, m0                          ; narrow the 8 words back to bytes (duplicated in both halves)
- movq [dq+xq], m0                         ; store the low 8 bytes as the blended samples
- add xq, mmsize/2                         ; advance to the next group of samples
- cmp xq, wq                               ; reached the rounded-down width?
- jl .loop                                 ; no: keep going
- .end:                                    ; reached with x = 0 (early exit) or x = rounded-down width
- mov eax, xd                              ; return x = number of samples blended; the remaining w mod (mmsize/2) samples are not touched here
- RET                                      ; x86inc epilogue/return macro
- INIT_XMM sse4                            ; overlay_row_22: blend a row of s into d in place, deriving each alpha from two horizontally adjacent alpha bytes; XMM / SSE4
- cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x   ; named regs: d = row read and written, s = source row, a = alpha row (two bytes per output sample), w = width; da is not referenced, r and x are scratch here. NOTE(review): numeric fields follow the x86inc cglobal convention - confirm against x86inc
- xor xq, xq                               ; x = 0: index of the next output sample, also the return value
- movsxdifnidn wq, wd                      ; widen the 32-bit w to a full register (x86inc helper; presumably a no-op when wq and wd already coincide)
- sub wq, 1                                ; work on w - 1 samples only. NOTE(review): presumably the last sample may lack a second alpha byte and is left to the caller - confirm
- mov rq, wq                               ; r = w - 1
- and rq, mmsize/2 - 1                     ; r = (w - 1) mod (mmsize/2): leftover samples after whole vectors
- cmp wq, mmsize/2                         ; is there at least one full vector of samples?
- jl .end                                  ; no (signed compare): return 0 without touching memory
- sub wq, rq                               ; loop bound = (w - 1) rounded down to a multiple of mmsize/2
- mova m3, [pw_255]                        ; m3 = 0x00FF in every word (low-byte mask and XOR constant)
- mova m4, [pw_128]                        ; m4 = 128 in every word
- mova m5, [pw_257]                        ; m5 = 257 in every word
- .loop:                                   ; each iteration blends mmsize/2 output samples and consumes twice as many alpha bytes
- pmovzxbw m0, [sq+xq]                     ; m0 = 8 source bytes zero-extended to words
- movu m1, [aq+2*xq]                       ; m1 = 16 alpha bytes (unaligned load); each word holds an even byte (low) and an odd byte (high)
- pandn m2, m3, m1                         ; m2 = ~m3 & m1: the odd alpha byte, kept in the high byte of each word (three-operand form, relies on the x86inc instruction wrappers)
- psllw m1, 8                              ; m1 = the even alpha byte moved into the high byte of each word
- pavgw m2, m1                             ; m2 = rounded average of odd and even alpha, still scaled by 256
- pavgw m2, m1                             ; averaged once more with the even alpha: roughly (3*even + odd) / 4, scaled by 256
- psrlw m2, 8                              ; drop the scaling: m2 = alpha in 0..255 per word
- pmovzxbw m1, [dq+xq]                     ; m1 = 8 destination bytes zero-extended to words
- pmullw m0, m2                            ; m0 = s * alpha
- pxor m2, m3                              ; m2 = 255 - alpha
- pmullw m1, m2                            ; m1 = d * (255 - alpha)
- paddw m0, m4                             ; m0 = s*alpha + 128
- paddw m0, m1                             ; m0 = s*alpha + d*(255 - alpha) + 128; fits in a word
- pmulhuw m0, m5                           ; m0 = (m0 * 257) >> 16, which approximates division by 255
- packuswb m0, m0                          ; narrow the 8 words back to bytes
- movq [dq+xq], m0                         ; store the low 8 bytes as the blended samples
- add xq, mmsize/2                         ; advance to the next group of output samples
- cmp xq, wq                               ; reached the loop bound?
- jl .loop                                 ; no: keep going
- .end:                                    ; reached with x = 0 (early exit) or x = loop bound
- mov eax, xd                              ; return x = number of output samples blended; later samples are not touched here
- RET                                      ; x86inc epilogue/return macro
- INIT_XMM sse4                            ; overlay_row_20: blend a row of s into d in place, deriving each alpha from a 2x2 block of alpha bytes taken from two rows; XMM / SSE4
- cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x   ; named regs: d = row read and written, s = source row, a = first alpha row, w = width, r = sixth argument (offset to the second alpha row), x = scratch. NOTE(review): numeric fields follow the x86inc cglobal convention (presumably 6 args loaded, 7 GPRs, 7 XMM regs) - confirm against x86inc
- mov daq, aq                              ; da = a; the incoming value of da is discarded
- add daq, rmp                             ; da = a + r: start of the second alpha row. NOTE(review): r is presumably the alpha line stride in bytes - confirm against the caller
- xor xq, xq                               ; x = 0: index of the next output sample, also the return value
- movsxdifnidn wq, wd                      ; widen the 32-bit w to a full register (x86inc helper; presumably a no-op when wq and wd already coincide)
- sub wq, 1                                ; work on w - 1 samples only. NOTE(review): presumably the last sample may lack a second alpha column and is left to the caller - confirm
- mov rq, wq                               ; r = w - 1; the argument value was already consumed above, so r is reused as scratch
- and rq, mmsize/2 - 1                     ; r = (w - 1) mod (mmsize/2): leftover samples after whole vectors
- cmp wq, mmsize/2                         ; is there at least one full vector of samples?
- jl .end                                  ; no (signed compare): return 0 without touching memory
- sub wq, rq                               ; loop bound = (w - 1) rounded down to a multiple of mmsize/2
- mova m3, [pw_255]                        ; m3 = 255 in every word
- mova m4, [pw_128]                        ; m4 = 128 in every word
- mova m5, [pw_257]                        ; m5 = 257 in every word
- mova m6, [pb_1]                          ; m6 = 1 in every byte (pair-sum multiplier)
- .loop:                                   ; each iteration blends mmsize/2 output samples using a 2x2 alpha block per sample
- pmovzxbw m0, [sq+xq]                     ; m0 = 8 source bytes zero-extended to words
- movu m2, [aq+2*xq]                       ; m2 = 16 alpha bytes from the first row (unaligned load)
- movu m1, [daq+2*xq]                      ; m1 = 16 alpha bytes from the second row (unaligned load)
- pmaddubsw m2, m6                         ; each word = sum of two adjacent first-row alpha bytes (unsigned bytes times 1, pairwise added)
- pmaddubsw m1, m6                         ; same for the second row
- paddw m2, m1                             ; m2 = sum of the four alpha bytes of each 2x2 block
- psrlw m2, 2                              ; m2 = that sum / 4 (truncating): alpha in 0..255 per word
- pmovzxbw m1, [dq+xq]                     ; m1 = 8 destination bytes zero-extended to words
- pmullw m0, m2                            ; m0 = s * alpha
- pxor m2, m3                              ; m2 = 255 - alpha
- pmullw m1, m2                            ; m1 = d * (255 - alpha)
- paddw m0, m4                             ; m0 = s*alpha + 128
- paddw m0, m1                             ; m0 = s*alpha + d*(255 - alpha) + 128; fits in a word
- pmulhuw m0, m5                           ; m0 = (m0 * 257) >> 16, which approximates division by 255
- packuswb m0, m0                          ; narrow the 8 words back to bytes
- movq [dq+xq], m0                         ; store the low 8 bytes as the blended samples
- add xq, mmsize/2                         ; advance to the next group of output samples
- cmp xq, wq                               ; reached the loop bound?
- jl .loop                                 ; no: keep going
- .end:                                    ; reached with x = 0 (early exit) or x = loop bound
- mov eax, xd                              ; return x = number of output samples blended; later samples are not touched here
- RET                                      ; x86inc epilogue/return macro
|