123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- ;
- ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %include "third_party/x86inc/x86inc.asm"
- SECTION .text
- ; void vpx_subtract_block(int rows, int cols,
- ; int16_t *diff, ptrdiff_t diff_stride,
- ; const uint8_t *src, ptrdiff_t src_stride,
- ; const uint8_t *pred, ptrdiff_t pred_stride)
- INIT_XMM sse2
- cglobal subtract_block, 7, 7, 8, \
- rows, cols, diff, diff_stride, src, src_stride, \
- pred, pred_stride
- %define pred_str colsq
- pxor m7, m7 ; dedicated zero register
- cmp colsd, 4
- je .case_4
- cmp colsd, 8
- je .case_8
- cmp colsd, 16
- je .case_16
- cmp colsd, 32
- je .case_32
- %macro loop16 6
- mova m0, [srcq+%1]
- mova m4, [srcq+%2]
- mova m1, [predq+%3]
- mova m5, [predq+%4]
- punpckhbw m2, m0, m7
- punpckhbw m3, m1, m7
- punpcklbw m0, m7
- punpcklbw m1, m7
- psubw m2, m3
- psubw m0, m1
- punpckhbw m1, m4, m7
- punpckhbw m3, m5, m7
- punpcklbw m4, m7
- punpcklbw m5, m7
- psubw m1, m3
- psubw m4, m5
- mova [diffq+mmsize*0+%5], m0
- mova [diffq+mmsize*1+%5], m2
- mova [diffq+mmsize*0+%6], m4
- mova [diffq+mmsize*1+%6], m1
- %endmacro
- mov pred_str, pred_stridemp
- .loop_64:
- loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
- loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- dec rowsd
- jg .loop_64
- RET
- .case_32:
- mov pred_str, pred_stridemp
- .loop_32:
- loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- dec rowsd
- jg .loop_32
- RET
- .case_16:
- mov pred_str, pred_stridemp
- .loop_16:
- loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
- lea diffq, [diffq+diff_strideq*4]
- lea predq, [predq+pred_str*2]
- lea srcq, [srcq+src_strideq*2]
- sub rowsd, 2
- jg .loop_16
- RET
- %macro loop_h 0
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
- movh m1, [predq]
- movh m3, [predq+pred_str]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- psubw m0, m1
- psubw m2, m3
- mova [diffq], m0
- mova [diffq+diff_strideq*2], m2
- %endmacro
- .case_8:
- mov pred_str, pred_stridemp
- .loop_8:
- loop_h
- lea diffq, [diffq+diff_strideq*4]
- lea srcq, [srcq+src_strideq*2]
- lea predq, [predq+pred_str*2]
- sub rowsd, 2
- jg .loop_8
- RET
- INIT_MMX
- .case_4:
- mov pred_str, pred_stridemp
- .loop_4:
- loop_h
- lea diffq, [diffq+diff_strideq*4]
- lea srcq, [srcq+src_strideq*2]
- lea predq, [predq+pred_str*2]
- sub rowsd, 2
- jg .loop_4
- RET
|