- ;
- ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %define private_prefix vp9
- %include "third_party/x86inc/x86inc.asm"
- SECTION .text
- ALIGN 16
- ;
- ; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
- ; intptr_t block_size, int64_t *ssz)
- ;
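- ;
- ; Roughly equivalent scalar reference (a sketch of the intended computation,
- ; not the exact libvpx C code; the names below are illustrative only):
- ;
- ;   int64_t err = 0, sqcoeff = 0;
- ;   for (intptr_t i = 0; i < block_size; i++) {
- ;     const int64_t diff = coeff[i] - dqcoeff[i];
- ;     err     += diff * diff;
- ;     sqcoeff += (int64_t)coeff[i] * coeff[i];
- ;   }
- ;   *ssz = sqcoeff;
- ;   return err;
- ;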
- INIT_XMM avx
- cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
- vzeroupper
- ; If only one iteration is required, handle it as a special case.
- ; It is the most frequent case, so we gain significantly here
- ; by not setting up a loop or accumulators.
- cmp sizeq, 16
- jne .generic
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Common case of size == 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ; Load input vectors
- mova xm0, [dqcq]
- packssdw xm0, [dqcq+16]
- mova xm2, [uqcq]
- packssdw xm2, [uqcq+16]
- mova xm1, [dqcq+32]
- packssdw xm1, [dqcq+48]
- mova xm3, [uqcq+32]
- packssdw xm3, [uqcq+48]
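- ; Each mova + packssdw pair packs eight 32-bit coefficients down to eight
- ; 16-bit values (with signed saturation; for 8-bit content the coefficients
- ; are assumed to fit in 16 bits, hence the "_8bit" variant).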
- ; Compute the errors.
- psubw xm0, xm2
- psubw xm1, xm3
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
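- ; Worked bound: |e| <= 32767, so e^2 <= 32767^2 = 1,073,676,289 < 2^30,
- ; and each pmaddwd lane (a sum of two squares) is <= 2,147,352,578 < 2^31.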
- pmaddwd xm2, xm2
- pmaddwd xm3, xm3
- pmaddwd xm0, xm0
- pmaddwd xm1, xm1
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned above, 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32 bits.
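- ; Worked bound: 4 * 32767^2 = 4,294,705,156 < 2^32 = 4,294,967,296.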
- paddd xm2, xm3
- paddd xm0, xm1
- ; Accumulate horizontally in 64 bits; there is no chance of overflow here
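- ; E.g. with dword lanes xm2 = [a3 a2 a1 a0]:
- ;   pblendw with 0x33 keeps words 0-1 and 4-5 -> qword lanes [ a2 | a0 ]
- ;   psrlq by 32 bits                          -> qword lanes [ a3 | a1 ]
- ; so the paddq/psrldq below add all four dwords as 64-bit lanes.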
- pxor xm5, xm5
- pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
- pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
- paddq xm2, xm3
- paddq xm0, xm1
- psrldq xm3, xm2, 8
- psrldq xm1, xm0, 8
- paddq xm2, xm3
- paddq xm0, xm1
- ; Store the return value
- %if ARCH_X86_64
- movq rax, xm0
- movq [sszq], xm2
- %else
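- ; The 32-bit ABI returns the 64-bit result in edx:eax.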
- movd eax, xm0
- pextrd edx, xm0, 1
- movq [sszd], xm2
- %endif
- RET
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, speculative low precision
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ALIGN 16
- .generic:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; overflow detection register for xm4
- pxor xm6, xm6 ; ssz accumulator
- pxor xm7, xm7 ; overflow detection register for xm6
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
- ; Push the negated size, as the high-precision code might need it
- push sizeq
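- ; With sizeq negated, [dqcq+sizeq*4] points at the first coefficient again
- ; and the loop steps sizeq up towards zero, 16 coefficients per iteration.
- ; Only the 'add sizeq, 16' affects EFLAGS (the SSE/AVX ops do not), so the
- ; trailing jnz exits the loop exactly when sizeq reaches zero.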
- .loop:
- ; Load input vectors
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
- add sizeq, 16
- ; Compute the squared errors.
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned above, 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32 bits.
- paddd xm2, xm3
- paddd xm0, xm1
- ; We accumulate using 32-bit arithmetic, but detect potential overflow
- ; by checking whether the MSB of any accumulator lane has ever been set.
- ; If it has, we redo the whole computation at the end in higher precision,
- ; but this happens extremely rarely, so we still achieve a net gain.
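- ; (If the top bit of a lane is never set, every partial sum stayed below
- ; 2^31, so no 32-bit wrap-around can have occurred. If it was ever set,
- ; a later addition might have wrapped, so we conservatively redo.)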
- paddd xm4, xm0
- paddd xm6, xm2
- por xm5, xm4 ; OR in the accumulator for overflow detection
- por xm7, xm6 ; OR in the accumulator for overflow detection
- jnz .loop
- ; Add pairs horizontally (still only on 32 bits)
- phaddd xm4, xm4
- por xm5, xm4 ; OR in the accumulator for overflow detection
- phaddd xm6, xm6
- por xm7, xm6 ; OR in the accumulator for overflow detection
- ; Check for the possibility of overflow by testing whether bit 31 (the MSB)
- ; of any dword lane has ever been set. If it never was, there was no
- ; overflow and the final sum fits in 32 bits. If overflow may have
- ; happened, redo the whole computation in higher precision.
- por xm7, xm5
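- ; pmovmskb below gathers the sign bit of every byte; mask 0x8888 selects
- ; bytes 3, 7, 11 and 15, i.e. bit 31 of each dword lane.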
- pmovmskb r4, xm7
- test r4, 0x8888
- jnz .highprec
- phaddd xm4, xm4
- phaddd xm6, xm6
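- ; Dword 0 of each accumulator now holds the full 32-bit sum; pmovzxdq
- ; zero-extends it so it can be returned / stored as a 64-bit value.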
- pmovzxdq xm4, xm4
- pmovzxdq xm6, xm6
- ; Restore stack
- pop sizeq
- ; Store the return value
- %if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
- %else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
- %endif
- RET
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, high precision case
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- .highprec:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; dedicated zero register
- pxor xm6, xm6 ; ssz accumulator
- pop sizeq
- .loophp:
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
- add sizeq, 16
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
- ; accumulate in 64bit
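- ; E.g. with dword lanes xm0 = [b3 b2 b1 b0] and xm5 = 0:
- ;   punpckldq xm7, xm0, xm5 -> qword lanes [ b1 | b0 ]
- ;   punpckhdq xm0, xm5      -> qword lanes [ b3 | b2 ]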
- punpckldq xm7, xm0, xm5
- punpckhdq xm0, xm5
- paddq xm4, xm7
- punpckldq xm7, xm2, xm5
- punpckhdq xm2, xm5
- paddq xm6, xm7
- punpckldq xm7, xm1, xm5
- punpckhdq xm1, xm5
- paddq xm4, xm7
- punpckldq xm7, xm3, xm5
- punpckhdq xm3, xm5
- paddq xm6, xm7
- paddq xm4, xm0
- paddq xm4, xm1
- paddq xm6, xm2
- paddq xm6, xm3
- jnz .loophp
- ; Accumulate horizontally
- movhlps xm5, xm4
- movhlps xm7, xm6
- paddq xm4, xm5
- paddq xm6, xm7
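- ; movhlps brought each accumulator's upper qword down to the lower lane, so
- ; after the paddq the totals sit in the low 64 bits of xm4 and xm6 (the
- ; upper halves are don't-care).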
- ; Store the return value
- %if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
- %else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
- %endif
- RET
- END