123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363 |
- ;
- ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %include "third_party/x86inc/x86inc.asm"
- SECTION .text
- %macro HIGH_SAD_FN 4
- %if %4 == 0
- %if %3 == 5
- cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
- %else ; %3 == 7
- cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- %endif ; %3 == 5/7
- %else ; avg
- %if %3 == 5
- cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
- second_pred, n_rows
- %else ; %3 == 7
- cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
- ref, ref_stride, \
- second_pred, \
- src_stride3, ref_stride3
- %if ARCH_X86_64
- %define n_rowsd r7d
- %else ; x86-32
- %define n_rowsd dword r0m
- %endif ; x86-32/64
- %endif ; %3 == 5/7
- %endif ; avg/sad
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- %if %3 == 7
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
- %endif ; %3 == 7
- ; convert src, ref & second_pred to short ptrs (from byte ptrs)
- shl srcq, 1
- shl refq, 1
- %if %4 == 1
- shl second_predq, 1
- %endif
- %endmacro
- ; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
- ; uint8_t *ref, int ref_stride);
- %macro HIGH_SAD64XN 1-2 0
- HIGH_SAD_FN 64, %1, 5, %2
- mov n_rowsd, %1
- pxor m0, m0
- pxor m6, m6
- .loop:
- ; first half of each row
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
- %if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
- %endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+16]
- psubusw m5, m2
- psubusw m2, [srcq+16]
- por m2, m5
- mova m5, [srcq+32]
- psubusw m5, m3
- psubusw m3, [srcq+32]
- por m3, m5
- mova m5, [srcq+48]
- psubusw m5, m4
- psubusw m4, [srcq+48]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- paddd m0, m1
- paddd m0, m3
- ; second half of each row
- movu m1, [refq+64]
- movu m2, [refq+80]
- movu m3, [refq+96]
- movu m4, [refq+112]
- %if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
- %endif
- mova m5, [srcq+64]
- psubusw m5, m1
- psubusw m1, [srcq+64]
- por m1, m5
- mova m5, [srcq+80]
- psubusw m5, m2
- psubusw m2, [srcq+80]
- por m2, m5
- mova m5, [srcq+96]
- psubusw m5, m3
- psubusw m3, [srcq+96]
- por m3, m5
- mova m5, [srcq+112]
- psubusw m5, m4
- psubusw m4, [srcq+112]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*2]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*2]
- paddd m0, m3
- dec n_rowsd
- jg .loop
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
- %endmacro
- INIT_XMM sse2
- HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
- HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
- HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
- HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
- ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
- ; uint8_t *ref, int ref_stride);
- %macro HIGH_SAD32XN 1-2 0
- HIGH_SAD_FN 32, %1, 5, %2
- mov n_rowsd, %1
- pxor m0, m0
- pxor m6, m6
- .loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
- %if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
- %endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+16]
- psubusw m5, m2
- psubusw m2, [srcq+16]
- por m2, m5
- mova m5, [srcq+32]
- psubusw m5, m3
- psubusw m3, [srcq+32]
- por m3, m5
- mova m5, [srcq+48]
- psubusw m5, m4
- psubusw m4, [srcq+48]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*2]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*2]
- paddd m0, m3
- dec n_rowsd
- jg .loop
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
- %endmacro
- INIT_XMM sse2
- HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
- HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
- HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
- HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
- HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
- HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
- ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
- ; uint8_t *ref, int ref_stride);
- %macro HIGH_SAD16XN 1-2 0
- HIGH_SAD_FN 16, %1, 5, %2
- mov n_rowsd, %1/2
- pxor m0, m0
- pxor m6, m6
- .loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+ref_strideq*2]
- movu m4, [refq+ref_strideq*2+16]
- %if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+16]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*2+16]
- lea second_predq, [second_predq+mmsize*4]
- %endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+16]
- psubusw m5, m2
- psubusw m2, [srcq+16]
- por m2, m5
- mova m5, [srcq+src_strideq*2]
- psubusw m5, m3
- psubusw m3, [srcq+src_strideq*2]
- por m3, m5
- mova m5, [srcq+src_strideq*2+16]
- psubusw m5, m4
- psubusw m4, [srcq+src_strideq*2+16]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- paddd m0, m3
- dec n_rowsd
- jg .loop
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
- %endmacro
- INIT_XMM sse2
- HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
- HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
- HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
- HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
- HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
- HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
- ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
- ; uint8_t *ref, int ref_stride);
- %macro HIGH_SAD8XN 1-2 0
- HIGH_SAD_FN 8, %1, 7, %2
- mov n_rowsd, %1/4
- pxor m0, m0
- pxor m6, m6
- .loop:
- movu m1, [refq]
- movu m2, [refq+ref_strideq*2]
- movu m3, [refq+ref_strideq*4]
- movu m4, [refq+ref_stride3q*2]
- %if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
- %endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+src_strideq*2]
- psubusw m5, m2
- psubusw m2, [srcq+src_strideq*2]
- por m2, m5
- mova m5, [srcq+src_strideq*4]
- psubusw m5, m3
- psubusw m3, [srcq+src_strideq*4]
- por m3, m5
- mova m5, [srcq+src_stride3q*2]
- psubusw m5, m4
- psubusw m4, [srcq+src_stride3q*2]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*8]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*8]
- paddd m0, m3
- dec n_rowsd
- jg .loop
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
- %endmacro
- INIT_XMM sse2
- HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
- HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
- HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
- HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
- HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
- HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
|