123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702 |
- ;
- ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %include "vpx_ports/x86_abi_support.asm" ; presumably supplies sym(), arg(), GLOBAL() and the prolog and epilog macros used below - confirm in that file
- extern sym(vp8_bilinear_filters_x86_8) ; bilinear tap table defined elsewhere; the predict routines index it in 32-byte entries
- %define BLOCK_HEIGHT_WIDTH 4 ; not referenced in the part of the file visible here
- %define vp8_filter_weight 128 ; not referenced in the part of the file visible here; equals 1 << VP8_FILTER_SHIFT
- %define VP8_FILTER_SHIFT 7 ; right-shift applied after every filter sum (divide by 128)
- ; void vp8_filter_block1d_h6_mmx: horizontal six-tap filter pass (MMX); each loop iteration turns one source row into four outputs.
- ; (
- ; unsigned char *src_ptr, - arg(0): source pixels; bytes src_ptr-2 .. src_ptr+6 of every row are read
- ; unsigned short *output_ptr, - arg(1): destination; four 16-bit values, each clamped to 0..255, are stored per row
- ; unsigned int src_pixels_per_line, - arg(2): added to the source pointer after every row
- ; unsigned int pixel_step, - arg(3): not read by this routine
- ; unsigned int output_height, - arg(4): number of rows; must be nonzero because the count is tested only after the first row
- ; unsigned int output_width, - arg(5): added, as a byte count, to the destination pointer after every row
- ; short * vp8_filter - arg(6): six taps stored 16 bytes apart (at +0, +16, ... +80); the first four words of each are used
- ; )
- global sym(vp8_filter_block1d_h6_mmx) PRIVATE
- sym(vp8_filter_block1d_h6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7 ; macro from the included ABI file; presumably makes the 7 arguments reachable through arg(n) - confirm
- GET_GOT rbx ; macro from the included ABI file; presumably prepares rbx for GLOBAL() in PIC builds - confirm
- push rsi
- push rdi
- ; end prolog
- mov rdx, arg(6) ;vp8_filter
- movq mm1, [rdx + 16] ; mm1 = tap 1 (four words)
- movq mm2, [rdx + 32] ; mm2 = tap 2
- movq mm6, [rdx + 48] ; mm6 = tap 3
- movq mm7, [rdx + 64] ; mm7 = tap 4; taps 0 and 5 are read from memory inside the loop
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(5) ;output_width ; used below as the destination step in bytes
- pxor mm0, mm0 ; mm0 = 0, used to widen bytes to words
- .nextrow:
- movq mm3, [rsi-2] ; mm3 = bytes p-2..p5
- movq mm4, mm3 ; mm4 = p-2..p5
- psrlq mm3, 8 ; mm3 = p-1..p5
- punpcklbw mm3, mm0 ; mm3 = p-1..p2 as words
- pmullw mm3, mm1 ; mm3 *= tap 1
- movq mm5, mm4 ; mm5 = p-2..p5
- punpckhbw mm4, mm0 ; mm4 = p2..p5 as words
- pmullw mm4, mm7 ; mm4 *= tap 4
- paddsw mm3, mm4 ; mm3 += mm4 (signed saturating add)
- movq mm4, mm5 ; mm4 = p-2..p5
- psrlq mm5, 16 ; mm5 = p0..p5
- punpcklbw mm5, mm0 ; mm5 = p0..p3 as words
- pmullw mm5, mm2 ; mm5 *= tap 2
- paddsw mm3, mm5 ; mm3 += mm5
- movq mm5, mm4 ; mm5 = p-2..p5, kept for the tap 0 product below
- psrlq mm4, 24 ; mm4 = p1..p5
- punpcklbw mm4, mm0 ; mm4 = p1..p4 as words
- pmullw mm4, mm6 ; mm4 *= tap 3
- paddsw mm3, mm4 ; mm3 += mm4
- ; outer taps: tap 5 on p3..p6 first, then tap 0 on p-2..p1
- movd mm4, [rsi+3] ; mm4 = bytes p3..p6
- punpcklbw mm4, mm0 ; mm4 = p3..p6 as words
- pmullw mm4, [rdx+80] ; mm4 *= tap 5
- paddsw mm3, mm4 ; mm3 += mm4
- punpcklbw mm5, mm0 ; mm5 = p-2..p1 as words
- pmullw mm5, [rdx] ; mm5 *= tap 0
- paddsw mm3, mm5 ; mm3 += mm5
- paddsw mm3, [GLOBAL(rd)] ; mm3 += round value (rd holds 0x40 in each word)
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation (clamps to 0..255) ...
- punpcklbw mm3, mm0 ; ... then widen back to four words
- movq [rdi], mm3 ; store the four 16-bit results
- %if ABI_IS_32BIT
- add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
- add rdi, rax; advance destination by output_width bytes
- %else
- movsxd r8, dword ptr arg(2) ;src_pixels_per_line, re-read from arg(2) on every iteration
- add rdi, rax; advance destination by output_width bytes
- add rsi, r8 ; next line
- %endif
- dec rcx ; decrement count
- jnz .nextrow ; next row
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ; void vp8_filter_block1dc_v6_mmx: vertical six-tap filter pass (MMX); each loop iteration produces four output bytes.
- ; (
- ; short *src_ptr, - arg(0): 16-bit input; four words are read from each row from 2 above to 3 below the current row
- ; unsigned char *output_ptr, - arg(1): destination; four bytes, saturated to 0..255, are stored per row
- ; int output_pitch, - arg(2): added to the destination pointer after every row
- ; unsigned int pixels_per_line, - arg(3): distance between source rows; used directly as a byte offset
- ; unsigned int pixel_step, - arg(4): not read by this routine
- ; unsigned int output_height, - arg(5): number of rows; must be nonzero because the count is tested only after the first row
- ; unsigned int output_width, - arg(6): not read by this routine
- ; short * vp8_filter - arg(7): six taps stored 16 bytes apart (at +0, +16, ... +80); the first four words of each are used
- ; )
- global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
- sym(vp8_filter_block1dc_v6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8 ; macro from the included ABI file; presumably makes the 8 arguments reachable through arg(n) - confirm
- GET_GOT rbx ; macro from the included ABI file; presumably prepares rbx for GLOBAL() in PIC builds - confirm
- push rsi
- push rdi
- ; end prolog
- movq mm5, [GLOBAL(rd)] ; mm5 = round value, fetched before rbx is repurposed (GLOBAL() may depend on rbx - confirm)
- push rbx ; save rbx; it holds the filter pointer for the rest of the routine
- mov rbx, arg(7) ;vp8_filter
- movq mm1, [rbx + 16] ; mm1 = tap 1 (four words)
- movq mm2, [rbx + 32] ; mm2 = tap 2
- movq mm6, [rbx + 48] ; mm6 = tap 3
- movq mm7, [rbx + 64] ; mm7 = tap 4; taps 0 and 5 are read from memory inside the loop
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- sub rsi, rdx ; two subtractions give rsi = src_ptr - 2 * pixels_per_line,
- sub rsi, rdx ; i.e. the row two above the first output row
- movsxd rcx, DWORD PTR arg(5) ;output_height
- movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination step in bytes
- pxor mm0, mm0 ; mm0 = 00000000
- .nextrow_cv:
- movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
- pmullw mm3, mm1 ; mm3 *= tap 1
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
- pmullw mm4, mm7 ; mm4 *= tap 4
- paddsw mm3, mm4 ; mm3 += mm4 (signed saturating add)
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
- pmullw mm4, mm2 ; mm4 *= tap 2
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [rsi] ; mm4 = p0..p3 = row -2
- pmullw mm4, [rbx] ; mm4 *= tap 0
- paddsw mm3, mm4 ; mm3 += mm4
- add rsi, rdx ; step down one row: avoids a 3 * pitch address and is also the per-iteration source advance
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
- pmullw mm4, mm6 ; mm4 *= tap 3
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
- pmullw mm4, [rbx +80] ; mm4 *= tap 5
- paddsw mm3, mm4 ; mm3 += mm4
- paddsw mm3, mm5 ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation (clamps to 0..255)
- movd [rdi],mm3 ; store the four result bytes
- ; Five of the six source rows read in this iteration are read again by the next one.
- ; The original author expected these rows to be in cache and left the repeated loads in place,
- ; noting that they could be avoided.
- lea rdi, [rdi+rax] ; advance destination by output_pitch
- dec rcx ; decrement count
- jnz .nextrow_cv ; next row
- pop rbx ; restore the rbx value saved after the prolog
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ; void vp8_bilinear_predict8x8_mmx: two-pass (horizontal, then vertical) 2-tap bilinear filter writing 8 rows of 8 bytes.
- ; (
- ; unsigned char *src_ptr, - arg(0): source; 9 bytes are read from each of 9 consecutive rows
- ; int src_pixels_per_line, - arg(1): added to the source pointer after every row
- ; int xoffset, - arg(2): selects the horizontal taps at vp8_bilinear_filters_x86_8 + xoffset * 32
- ; int yoffset, - arg(3): selects the vertical taps at vp8_bilinear_filters_x86_8 + yoffset * 32
- ; unsigned char *dst_ptr, - arg(4): destination; 8 bytes are stored per row
- ; int dst_pitch - arg(5): added to the destination pointer after every row; the loop stops at dst_ptr + 8 * dst_pitch
- ; )
- global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
- sym(vp8_bilinear_predict8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6 ; macro from the included ABI file; presumably makes the 6 arguments reachable through arg(n) - confirm
- GET_GOT rbx ; macro from the included ABI file; presumably prepares rbx for GLOBAL() in PIC builds - confirm
- push rsi
- push rdi
- ; end prolog
- ; C equivalent: const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ; C equivalent: const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
- shl rax, 5 ; offset * 32 (bytes per table entry)
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] ; rcx = table base
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ; mm1 = first horizontal tap (four words)
- movq mm2, [rax+16] ; mm2 = second horizontal tap (four words)
- movsxd rax, dword ptr arg(3) ;yoffset
- pxor mm0, mm0 ; mm0 = 0, used to widen bytes to words
- shl rax, 5 ; offset*32
- add rax, rcx ; VFilter; rax keeps this pointer for the whole loop
- lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch, the loop end marker
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
- ; horizontal pass on the first source row; the packed result is kept in mm7
- movq mm3, [rsi] ; mm3 = source bytes 0..7
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = bytes 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = bytes 4..7 as words
- pmullw mm3, mm1 ; low half * first horizontal tap
- pmullw mm4, mm1 ; high half * first horizontal tap
- movq mm5, [rsi+1] ; mm5 = source bytes 1..8
- movq mm6, mm5 ; copy for the high half
- punpcklbw mm5, mm0 ; mm5 = bytes 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = bytes 5..8 as words
- pmullw mm5, mm2 ; low half * second horizontal tap
- pmullw mm6, mm2 ; high half * second horizontal tap
- paddw mm3, mm5 ; sum of both taps, low half
- paddw mm4, mm6 ; sum of both taps, high half
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ; mm4 += round value
- psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
- movq mm7, mm3 ; mm7 = low half of the filtered row
- packuswb mm7, mm4 ; mm7 = filtered first row as 8 bytes
- add rsi, rdx ; next line
- .next_row_8x8:
- movq mm3, [rsi] ; mm3 = source bytes 0..7 of the current row
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = bytes 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = bytes 4..7 as words
- pmullw mm3, mm1 ; low half * first horizontal tap
- pmullw mm4, mm1 ; high half * first horizontal tap
- movq mm5, [rsi+1] ; mm5 = source bytes 1..8
- movq mm6, mm5 ; copy for the high half
- punpcklbw mm5, mm0 ; mm5 = bytes 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = bytes 5..8 as words
- pmullw mm5, mm2 ; low half * second horizontal tap
- pmullw mm6, mm2 ; high half * second horizontal tap
- paddw mm3, mm5 ; horizontal sum, low half
- paddw mm4, mm6 ; horizontal sum, high half
- movq mm5, mm7 ; mm5 = previous filtered row (packed bytes)
- movq mm6, mm7 ; second copy for the high half
- punpcklbw mm5, mm0 ; previous row, bytes 0..3 as words
- punpckhbw mm6, mm0 ; previous row, bytes 4..7 as words
- pmullw mm5, [rax] ; previous row low half * first vertical tap
- pmullw mm6, [rax] ; previous row high half * first vertical tap
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128: horizontally filtered current row, low half
- paddw mm4, [GLOBAL(rd)] ; mm4 += round value
- psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128: horizontally filtered current row, high half
- movq mm7, mm3 ; save the current filtered row ...
- packuswb mm7, mm4 ; ... packed to bytes, as the previous row of the next iteration
- pmullw mm3, [rax+16] ; current row low half * second vertical tap
- pmullw mm4, [rax+16] ; current row high half * second vertical tap
- paddw mm3, mm5 ; vertical sum, low half
- paddw mm4, mm6 ; vertical sum, high half
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ; mm4 += round value
- psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
- packuswb mm3, mm4 ; pack both halves to 8 bytes with unsigned saturation
- movq [rdi], mm3 ; store the 8 result bytes
- %if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
- %else
- movsxd r8, dword ptr arg(5) ;dst_pitch, re-read from arg(5) on every iteration
- add rsi, rdx ; next line
- add rdi, r8 ;dst_pitch
- %endif
- cmp rdi, rcx ; stop once the destination pointer equals the end marker
- jne .next_row_8x8
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ; void vp8_bilinear_predict8x4_mmx: two-pass (horizontal, then vertical) 2-tap bilinear filter writing 4 rows of 8 bytes.
- ; (
- ; unsigned char *src_ptr, - arg(0): source; 9 bytes are read from each of 5 consecutive rows
- ; int src_pixels_per_line, - arg(1): added to the source pointer after every row
- ; int xoffset, - arg(2): selects the horizontal taps at vp8_bilinear_filters_x86_8 + xoffset * 32
- ; int yoffset, - arg(3): selects the vertical taps at vp8_bilinear_filters_x86_8 + yoffset * 32
- ; unsigned char *dst_ptr, - arg(4): destination; 8 bytes are stored per row
- ; int dst_pitch - arg(5): added to the destination pointer after every row; the loop stops at dst_ptr + 4 * dst_pitch
- ; )
- global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
- sym(vp8_bilinear_predict8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6 ; macro from the included ABI file; presumably makes the 6 arguments reachable through arg(n) - confirm
- GET_GOT rbx ; macro from the included ABI file; presumably prepares rbx for GLOBAL() in PIC builds - confirm
- push rsi
- push rdi
- ; end prolog
- ; C equivalent: const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ; C equivalent: const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] ; rcx = table base
- shl rax, 5 ; offset * 32 (bytes per table entry)
- mov rsi, arg(0) ;src_ptr ;
- add rax, rcx ; HFilter
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ; mm1 = first horizontal tap (four words)
- movq mm2, [rax+16] ; mm2 = second horizontal tap (four words)
- movsxd rax, dword ptr arg(3) ;yoffset
- pxor mm0, mm0 ; mm0 = 0, used to widen bytes to words
- shl rax, 5 ; offset * 32
- add rax, rcx ; VFilter; rax keeps this pointer for the whole loop
- lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4 * dst_pitch, the loop end marker
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
- ; horizontal pass on the first source row; the packed result is kept in mm7
- movq mm3, [rsi] ; mm3 = source bytes 0..7
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = bytes 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = bytes 4..7 as words
- pmullw mm3, mm1 ; low half * first horizontal tap
- pmullw mm4, mm1 ; high half * first horizontal tap
- movq mm5, [rsi+1] ; mm5 = source bytes 1..8
- movq mm6, mm5 ; copy for the high half
- punpcklbw mm5, mm0 ; mm5 = bytes 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = bytes 5..8 as words
- pmullw mm5, mm2 ; low half * second horizontal tap
- pmullw mm6, mm2 ; high half * second horizontal tap
- paddw mm3, mm5 ; sum of both taps, low half
- paddw mm4, mm6 ; sum of both taps, high half
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ; mm4 += round value
- psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
- movq mm7, mm3 ; mm7 = low half of the filtered row
- packuswb mm7, mm4 ; mm7 = filtered first row as 8 bytes
- add rsi, rdx ; next line
- .next_row_8x4:
- movq mm3, [rsi] ; mm3 = source bytes 0..7 of the current row
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = bytes 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = bytes 4..7 as words
- pmullw mm3, mm1 ; low half * first horizontal tap
- pmullw mm4, mm1 ; high half * first horizontal tap
- movq mm5, [rsi+1] ; mm5 = source bytes 1..8
- movq mm6, mm5 ; copy for the high half
- punpcklbw mm5, mm0 ; mm5 = bytes 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = bytes 5..8 as words
- pmullw mm5, mm2 ; low half * second horizontal tap
- pmullw mm6, mm2 ; high half * second horizontal tap
- paddw mm3, mm5 ; horizontal sum, low half
- paddw mm4, mm6 ; horizontal sum, high half
- movq mm5, mm7 ; mm5 = previous filtered row (packed bytes)
- movq mm6, mm7 ; second copy for the high half
- punpcklbw mm5, mm0 ; previous row, bytes 0..3 as words
- punpckhbw mm6, mm0 ; previous row, bytes 4..7 as words
- pmullw mm5, [rax] ; previous row low half * first vertical tap
- pmullw mm6, [rax] ; previous row high half * first vertical tap
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128: horizontally filtered current row, low half
- paddw mm4, [GLOBAL(rd)] ; mm4 += round value
- psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128: horizontally filtered current row, high half
- movq mm7, mm3 ; save the current filtered row ...
- packuswb mm7, mm4 ; ... packed to bytes, as the previous row of the next iteration
- pmullw mm3, [rax+16] ; current row low half * second vertical tap
- pmullw mm4, [rax+16] ; current row high half * second vertical tap
- paddw mm3, mm5 ; vertical sum, low half
- paddw mm4, mm6 ; vertical sum, high half
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ; mm4 += round value
- psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
- packuswb mm3, mm4 ; pack both halves to 8 bytes with unsigned saturation
- movq [rdi], mm3 ; store the 8 result bytes
- %if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
- %else
- movsxd r8, dword ptr arg(5) ;dst_pitch, re-read from arg(5) on every iteration
- add rsi, rdx ; next line
- add rdi, r8 ; advance destination by dst_pitch
- %endif
- cmp rdi, rcx ; stop once the destination pointer equals the end marker
- jne .next_row_8x4
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ; void vp8_bilinear_predict4x4_mmx: two-pass (horizontal, then vertical) 2-tap bilinear filter writing 4 rows of 4 bytes.
- ; (
- ; unsigned char *src_ptr, - arg(0): source; 5 bytes are read from each of 5 consecutive rows
- ; int src_pixels_per_line, - arg(1): added to the source pointer after every row
- ; int xoffset, - arg(2): selects the horizontal taps at vp8_bilinear_filters_x86_8 + xoffset * 32
- ; int yoffset, - arg(3): selects the vertical taps at vp8_bilinear_filters_x86_8 + yoffset * 32
- ; unsigned char *dst_ptr, - arg(4): destination; 4 bytes are stored per row
- ; int dst_pitch - arg(5): added to the destination pointer after every row; the loop stops at dst_ptr + 4 * dst_pitch
- ; )
- global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
- sym(vp8_bilinear_predict4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6 ; macro from the included ABI file; presumably makes the 6 arguments reachable through arg(n) - confirm
- GET_GOT rbx ; macro from the included ABI file; presumably prepares rbx for GLOBAL() in PIC builds - confirm
- push rsi
- push rdi
- ; end prolog
- ; C equivalent: const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ; C equivalent: const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] ; rcx = table base
- shl rax, 5 ; offset * 32 (bytes per table entry)
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ; mm1 = first horizontal tap (four words)
- movq mm2, [rax+16] ; mm2 = second horizontal tap (four words)
- movsxd rax, dword ptr arg(3) ;yoffset
- pxor mm0, mm0 ; mm0 = 0, used to widen bytes to words
- shl rax, 5 ; offset * 32
- add rax, rcx ; VFilter; rax keeps this pointer for the whole loop
- lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4 * dst_pitch, the loop end marker
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
- ; horizontal pass on the first source row; the packed result is kept in mm7
- movd mm3, [rsi] ; mm3 = source bytes 0..3
- punpcklbw mm3, mm0 ; mm3 = bytes 0..3 as words
- pmullw mm3, mm1 ; * first horizontal tap
- movd mm5, [rsi+1] ; mm5 = source bytes 1..4
- punpcklbw mm5, mm0 ; mm5 = bytes 1..4 as words
- pmullw mm5, mm2 ; * second horizontal tap
- paddw mm3, mm5 ; sum of both taps
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- movq mm7, mm3 ; mm7 = filtered first row as words ...
- packuswb mm7, mm0 ; ... packed to 4 bytes in the low half
- add rsi, rdx ; next line
- .next_row_4x4:
- movd mm3, [rsi] ; mm3 = source bytes 0..3 of the current row
- punpcklbw mm3, mm0 ; mm3 = bytes 0..3 as words
- pmullw mm3, mm1 ; * first horizontal tap
- movd mm5, [rsi+1] ; mm5 = source bytes 1..4
- punpcklbw mm5, mm0 ; mm5 = bytes 1..4 as words
- pmullw mm5, mm2 ; * second horizontal tap
- paddw mm3, mm5 ; horizontal sum
- movq mm5, mm7 ; mm5 = previous filtered row (packed bytes)
- punpcklbw mm5, mm0 ; previous row as words
- pmullw mm5, [rax] ; previous row * first vertical tap
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128: horizontally filtered current row
- movq mm7, mm3 ; save the current filtered row ...
- packuswb mm7, mm0 ; ... packed to bytes, as the previous row of the next iteration
- pmullw mm3, [rax+16] ; current row * second vertical tap
- paddw mm3, mm5 ; vertical sum
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack to bytes with unsigned saturation
- movd [rdi], mm3 ; store the 4 result bytes
- %if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
- %else
- movsxd r8, dword ptr arg(5) ;dst_pitch ; re-read from arg(5) on every iteration
- add rsi, rdx ; next line
- add rdi, r8 ; advance destination by dst_pitch
- %endif
- cmp rdi, rcx ; stop once the destination pointer equals the end marker
- jne .next_row_4x4
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- SECTION_RODATA ; macro from the included ABI file; presumably switches to the read-only data section - confirm
- align 16
- rd: ; rounding constant added before every shift by VP8_FILTER_SHIFT
- times 4 dw 0x40 ; four words of 64, i.e. half of 128
- align 16
- global HIDDEN_DATA(sym(vp8_six_tap_mmx)) ; exported six-tap table: 8 filters, 6 taps each, every tap repeated in 8 words (16 bytes)
- sym(vp8_six_tap_mmx): ; one filter spans 96 bytes; presumably what callers pass as vp8_filter to the six-tap routines - confirm
- times 8 dw 0 ; filter 0, tap 0 (filter 0 = 0, 0, 128, 0, 0, 0: a pass-through after the divide by 128)
- times 8 dw 0 ; filter 0, tap 1
- times 8 dw 128 ; filter 0, tap 2
- times 8 dw 0 ; filter 0, tap 3
- times 8 dw 0 ; filter 0, tap 4
- times 8 dw 0 ; filter 0, tap 5
- times 8 dw 0 ; filter 1, tap 0 (filter 1 = 0, -6, 123, 12, -1, 0; taps sum to 128)
- times 8 dw -6 ; filter 1, tap 1
- times 8 dw 123 ; filter 1, tap 2
- times 8 dw 12 ; filter 1, tap 3
- times 8 dw -1 ; filter 1, tap 4
- times 8 dw 0 ; filter 1, tap 5
- times 8 dw 2 ; filter 2, tap 0 (filter 2 = 2, -11, 108, 36, -8, 1; taps sum to 128)
- times 8 dw -11 ; filter 2, tap 1
- times 8 dw 108 ; filter 2, tap 2
- times 8 dw 36 ; filter 2, tap 3
- times 8 dw -8 ; filter 2, tap 4
- times 8 dw 1 ; filter 2, tap 5
- times 8 dw 0 ; filter 3, tap 0 (filter 3 = 0, -9, 93, 50, -6, 0; taps sum to 128)
- times 8 dw -9 ; filter 3, tap 1
- times 8 dw 93 ; filter 3, tap 2
- times 8 dw 50 ; filter 3, tap 3
- times 8 dw -6 ; filter 3, tap 4
- times 8 dw 0 ; filter 3, tap 5
- times 8 dw 3 ; filter 4, tap 0 (filter 4 = 3, -16, 77, 77, -16, 3; symmetric, taps sum to 128)
- times 8 dw -16 ; filter 4, tap 1
- times 8 dw 77 ; filter 4, tap 2
- times 8 dw 77 ; filter 4, tap 3
- times 8 dw -16 ; filter 4, tap 4
- times 8 dw 3 ; filter 4, tap 5
- times 8 dw 0 ; filter 5, tap 0 (filter 5 = filter 3 with the tap order reversed)
- times 8 dw -6 ; filter 5, tap 1
- times 8 dw 50 ; filter 5, tap 2
- times 8 dw 93 ; filter 5, tap 3
- times 8 dw -9 ; filter 5, tap 4
- times 8 dw 0 ; filter 5, tap 5
- times 8 dw 1 ; filter 6, tap 0 (filter 6 = filter 2 with the tap order reversed)
- times 8 dw -8 ; filter 6, tap 1
- times 8 dw 36 ; filter 6, tap 2
- times 8 dw 108 ; filter 6, tap 3
- times 8 dw -11 ; filter 6, tap 4
- times 8 dw 2 ; filter 6, tap 5
- times 8 dw 0 ; filter 7, tap 0 (filter 7 = filter 1 with the tap order reversed)
- times 8 dw -1 ; filter 7, tap 1
- times 8 dw 12 ; filter 7, tap 2
- times 8 dw 123 ; filter 7, tap 3
- times 8 dw -6 ; filter 7, tap 4
- times 8 dw 0 ; filter 7, tap 5
|