123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270 |
- ;*****************************************************************************
- ;* x86-optimized functions for bwdif filter
- ;*
- ;* Copyright (C) 2016 Thomas Mundt <loudmax@yahoo.de>
- ;*
- ;* Based on yadif simd code
- ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
- ;* 2013 Daniel Kang <daniel.d.kang@gmail.com>
- ;*
- ;* This file is part of FFmpeg.
- ;*
- ;* FFmpeg is free software; you can redistribute it and/or
- ;* modify it under the terms of the GNU Lesser General Public
- ;* License as published by the Free Software Foundation; either
- ;* version 2.1 of the License, or (at your option) any later version.
- ;*
- ;* FFmpeg is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;* Lesser General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU Lesser General Public
- ;* License along with FFmpeg; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;******************************************************************************
- %include "libavutil/x86/x86util.asm"
- SECTION_RODATA
- pw_coefhf: times 4 dw 1016, 5570 ; word pairs for pmaddwd in FILTER: 1016 * (sum of the four 4*t0 / 4*t1 taps) + 5570 * (srcA[0] + srcB[0])
- pw_coefhf1: times 8 dw -3801 ; word multiplier for the sum of the four 2*t0 / 2*t1 taps (FILTER builds the 32-bit product with pmullw/pmulhw)
- pw_coefsp: times 4 dw 5077, -981 ; base word pairs for pmaddwd in FILTER: 5077 * (cur[t0] + cur[t1]) - 981 * (cur[t2] + cur[t3])
- pw_splfdif: times 4 dw -768, 768 ; mask-selected adjustment that FILTER adds to pw_coefsp, turning (5077, -981) into (4309, -213)
- SECTION .text
- %macro LOAD8 2 ; LOAD8 dst, src: load 8-bit samples and widen them to words
- movh %1, %2 ; half-register load (movh is provided by the x86inc layer)
- punpcklbw %1, m7 ; interleave bytes with m7; FILTER zeroes m7 before every load site, so this zero-extends to words
- %endmacro
- %macro LOAD12 2 ; LOAD12 dst, src: same interface as LOAD8, but the data is used as words directly (no widening)
- movu %1, %2 ; unaligned full-register load
- %endmacro
- %macro DISP8 0 ; store the word results held in m2 as bytes at [dstq]
- packuswb m2, m2 ; narrow words to bytes with unsigned saturation
- movh [dstq], m2 ; write the low half of the register
- %endmacro
- %macro DISP12 0 ; store the word results held in m2 at [dstq] after range-limiting
- CLIPW m2, m7, m12 ; x86util macro, presumably clamps m2 to [m7, m12]; m7 is zero at this point and m12 is the broadcast clip_max set up in BWDIF
- movu [dstq], m2 ; unaligned full-register store
- %endmacro
- %macro FILTER 5 ; FILTER id, srcA, srcB, depth, scale: row loop. %1 = label suffix, %2/%3 = the two source pointers, %4 = 8|12 picks LOAD/DISP, %5 = byte multiplier for the line offsets t0..t3
- pxor m7, m7 ; m7 = 0: unpack partner for LOAD8 and the zero constant below
- .loop%1: ; each iteration consumes mmsize/2 samples of width (see the counter update at the bottom)
- LOAD%4 m0, [curq+t0*%5] ; m0 = cur[t0]
- LOAD%4 m1, [curq+t1*%5] ; m1 = cur[t1]
- LOAD%4 m2, [%2] ; m2 = srcA[0]
- LOAD%4 m3, [%3] ; m3 = srcB[0]
- mova m4, m3
- paddw m3, m2 ; m3 = srcA[0] + srcB[0]
- psubw m2, m4 ; m2 = srcA[0] - srcB[0]
- ABS1 m2, m4 ; x86util macro; by its name, m2 = |m2| with m4 as scratch
- mova m8, m3 ; m8 = srcA[0] + srcB[0], reused for the average and the high-frequency term
- mova m9, m2 ; m9 = |srcA[0] - srcB[0]| (unshifted), compared against later
- LOAD%4 m3, [prevq+t0*%5]
- LOAD%4 m4, [prevq+t1*%5]
- psubw m3, m0 ; prev[t0] - cur[t0]
- psubw m4, m1 ; prev[t1] - cur[t1]
- ABS2 m3, m4, m5, m6 ; x86util macro; by its name, absolute value of m3 and m4 (m5/m6 scratch)
- paddw m3, m4 ; sum of the two prev/cur differences
- psrlw m2, 1 ; halve |srcA - srcB|
- psrlw m3, 1 ; halve the prev/cur difference sum
- pmaxsw m2, m3 ; m2 = larger of the two
- LOAD%4 m3, [nextq+t0*%5]
- LOAD%4 m4, [nextq+t1*%5]
- psubw m3, m0 ; next[t0] - cur[t0]
- psubw m4, m1 ; next[t1] - cur[t1]
- ABS2 m3, m4, m5, m6
- paddw m3, m4 ; sum of the two next/cur differences
- psrlw m3, 1
- pmaxsw m2, m3 ; m2 = diff = max of the three halved differences
- LOAD%4 m3, [%2+t0*2*%5]
- LOAD%4 m4, [%3+t0*2*%5]
- LOAD%4 m5, [%2+t1*2*%5]
- LOAD%4 m6, [%3+t1*2*%5]
- paddw m3, m4 ; m3 = srcA[2*t0] + srcB[2*t0]
- paddw m5, m6 ; m5 = srcA[2*t1] + srcB[2*t1]
- mova m6, m3
- paddw m6, m5
- mova m10, m6 ; m10 = sum of all four 2*t taps, multiplied by pw_coefhf1 later
- psrlw m3, 1 ; average at 2*t0
- psrlw m5, 1 ; average at 2*t1
- psubw m3, m0 ; avg(2*t0) - cur[t0]
- psubw m5, m1 ; avg(2*t1) - cur[t1]
- mova m6, m3
- pminsw m3, m5 ; m3 = min of the two deltas
- pmaxsw m5, m6 ; m5 = max of the two deltas
- mova m4, m8
- psraw m4, 1 ; m4 = (srcA[0] + srcB[0]) >> 1, the centre average
- mova m6, m4
- psubw m6, m0 ; centre average - cur[t0]
- psubw m4, m1 ; centre average - cur[t1]
- pmaxsw m3, m6
- pminsw m5, m6
- pmaxsw m3, m4 ; m3 = max(min-of-deltas, avg - cur[t0], avg - cur[t1])
- pminsw m5, m4 ; m5 = min(max-of-deltas, avg - cur[t0], avg - cur[t1])
- mova m6, m7
- psubw m6, m3 ; m6 = -m3 (m7 is still zero)
- pmaxsw m6, m5 ; m6 = max(-m3, m5)
- mova m3, m2
- pcmpgtw m3, m7 ; lane mask: diff > 0
- pand m6, m3 ; drop the correction in lanes where diff is zero
- pmaxsw m2, m6 ; diff = max(diff, masked correction)
- mova m11, m2 ; m11 = final diff, used for the output clamp
- LOAD%4 m2, [%2+t0*4*%5]
- LOAD%4 m3, [%3+t0*4*%5]
- LOAD%4 m4, [%2+t1*4*%5]
- LOAD%4 m5, [%3+t1*4*%5]
- paddw m2, m3
- paddw m4, m5
- paddw m2, m4 ; m2 = sum of the four 4*t taps
- mova m3, m2
- punpcklwd m2, m8 ; pair (4*t sum, centre sum) words, low half
- punpckhwd m3, m8 ; same, high half
- pmaddwd m2, [pw_coefhf] ; dwords: 1016 * (4*t sum) + 5570 * (centre sum)
- pmaddwd m3, [pw_coefhf]
- mova m4, m10
- mova m6, m4
- pmullw m4, [pw_coefhf1] ; low 16 bits of -3801 * (2*t sum)
- pmulhw m6, [pw_coefhf1] ; high 16 bits of the same signed product
- mova m5, m4
- punpcklwd m4, m6 ; reassemble full 32-bit products, low half
- punpckhwd m5, m6 ; and high half
- paddd m2, m4
- paddd m3, m5
- psrad m2, 2 ; high-frequency term >> 2, as dwords in m2 (low) / m3 (high)
- psrad m3, 2
- mova m4, m0 ; keep cur[t0]
- paddw m0, m1 ; m0 = cur[t0] + cur[t1]
- %if ARCH_X86_64
- LOAD%4 m5, [curq+t2*%5] ; m5 = cur[t2]
- LOAD%4 m6, [curq+t3*%5] ; m6 = cur[t3]
- %else ; only t0/t1 exist on x86-32: borrow r4/r5 for prefs3/mrefs3, then restore prefs/mrefs
- mov r4, prefs3mp
- mov r5, mrefs3mp
- LOAD%4 m5, [curq+t0*%5] ; here t0 temporarily holds prefs3
- LOAD%4 m6, [curq+t1*%5] ; and t1 temporarily holds mrefs3
- mov r4, prefsmp
- mov r5, mrefsmp
- %endif
- paddw m6, m5 ; m6 = cur[t2] + cur[t3]
- psubw m1, m4 ; cur[t1] - cur[t0]
- ABS1 m1, m4
- pcmpgtw m1, m9 ; word mask: |cur[t1] - cur[t0]| > |srcA[0] - srcB[0]|
- mova m4, m1
- punpcklwd m1, m4 ; widen the mask to dword lanes, low half
- punpckhwd m4, m4 ; and high half
- pand m2, m1 ; keep the high-frequency term only in masked lanes
- pand m3, m4
- mova m5, [pw_splfdif]
- mova m7, m5 ; m7 is borrowed as scratch here and re-zeroed before DISP
- pand m5, m1
- pand m7, m4
- paddw m5, [pw_coefsp] ; coefficients: (4309, -213) in masked lanes, (5077, -981) elsewhere
- paddw m7, [pw_coefsp]
- mova m4, m0
- punpcklwd m0, m6 ; pair (cur[t0]+cur[t1], cur[t2]+cur[t3]) words, low half
- punpckhwd m4, m6 ; same, high half
- pmaddwd m0, m5 ; weighted sum of the pair, as dwords
- pmaddwd m4, m7
- paddd m2, m0 ; add the (possibly masked-out) high-frequency term
- paddd m3, m4
- psrad m2, 13 ; scale the fixed-point sum back down
- psrad m3, 13
- packssdw m2, m3 ; back to words with signed saturation: m2 = interpolated value
- mova m4, m8
- psraw m4, 1 ; centre average again
- mova m0, m11 ; final diff
- mova m3, m4
- psubw m4, m0 ; lower bound = average - diff
- paddw m3, m0 ; upper bound = average + diff
- CLIPW m2, m4, m3 ; x86util macro, presumably clamps m2 to [m4, m3]
- pxor m7, m7 ; restore m7 = 0 for DISP12 and the next iteration's loads
- DISP%4 ; store m2 to [dstq]
- add dstq, STEP ; advance all four pointers by STEP bytes (STEP is defined by BWDIF)
- add prevq, STEP
- add curq, STEP
- add nextq, STEP
- sub DWORD wm, mmsize/2 ; mmsize/2 samples of width done
- jg .loop%1 ; continue while the remaining width is positive
- %endmacro
- %macro PROC 2 ; PROC depth, scale: shared function body. %1 = 8|12 (forwarded to FILTER to pick LOAD/DISP), %2 = byte multiplier for the line offsets
- %if ARCH_X86_64
- movsxd r5, DWORD prefsm ; sign-extend the 32-bit line offsets to full register width for use in addressing
- movsxd r6, DWORD mrefsm
- movsxd r7, DWORD prefs3m
- movsxd r8, DWORD mrefs3m
- DECLARE_REG_TMP 5, 6, 7, 8 ; t0 = prefs, t1 = mrefs, t2 = prefs3, t3 = mrefs3
- %else ; x86-32: m8..m11 are mapped onto the stack space reserved by cglobal, and only two offsets live in registers
- %define m8 [rsp+ 0]
- %define m9 [rsp+16]
- %define m10 [rsp+32]
- %define m11 [rsp+48]
- mov r4, prefsmp
- mov r5, mrefsmp
- DECLARE_REG_TMP 4, 5 ; t0 = prefs, t1 = mrefs (FILTER temporarily reloads them with prefs3/mrefs3)
- %endif
- cmp DWORD paritym, 0 ; parity decides which two sources FILTER receives as %2/%3
- je .parity0
- FILTER 1, prevq, curq, %1, %2 ; parity != 0: sources are prev and cur
- jmp .ret
- .parity0:
- FILTER 0, curq, nextq, %1, %2 ; parity == 0: sources are cur and next
- .ret:
- RET
- %endmacro
- %macro BWDIF 0 ; emits bwdif_filter_line and bwdif_filter_line_12bit for whichever instruction set the preceding INIT_* selected
- %if ARCH_X86_64 ; cglobal numbers below: 4 args loaded, 9 GPRs, 12 vector registers, no stack scratch
- cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
- mrefs, prefs2, mrefs2, prefs3, mrefs3, \
- prefs4, mrefs4, parity, clip_max
- %else ; x86-32: 6 GPRs, 8 vector registers, 64 bytes of stack for the m8..m11 slots defined in PROC
- cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
- mrefs, prefs2, mrefs2, prefs3, mrefs3, \
- prefs4, mrefs4, parity, clip_max
- %endif
- %define STEP mmsize/2
- PROC 8, 1 ; byte samples: pointers advance mmsize/2 bytes per iteration (STEP above), line offsets used unscaled
- %if ARCH_X86_64 ; one more vector register than the 8-bit version: m12 holds the clip_max bound used by DISP12
- cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
- prefs, mrefs, prefs2, mrefs2, \
- prefs3, mrefs3, prefs4, \
- mrefs4, parity, clip_max
- movd m12, DWORD clip_maxm ; fetch the clip_max argument
- SPLATW m12, m12, 0 ; x86util macro; broadcasts word 0 to every word lane
- %else ; x86-32: 80 bytes of stack, the extra 16 back the m12 slot
- cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
- prefs, mrefs, prefs2, mrefs2, \
- prefs3, mrefs3, prefs4, \
- mrefs4, parity, clip_max
- %define m12 [rsp+64]
- movd m0, DWORD clip_maxm ; build the broadcast in a real register first
- SPLATW m0, m0, 0
- mova m12, m0 ; then spill it to the m12 stack slot
- %endif
- %define STEP mmsize
- PROC 12, 2 ; word samples: pointers advance mmsize bytes per iteration (STEP above), line offsets multiplied by 2 in addressing
- %endmacro
- INIT_XMM ssse3 ; instantiate both functions with XMM registers, SSSE3 flavour
- BWDIF
- INIT_XMM sse2 ; and again for SSE2
- BWDIF
- %if ARCH_X86_32 ; the MMX-register build is only emitted for 32-bit targets
- INIT_MMX mmxext
- BWDIF
- %endif
|