;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION .text
ALIGN 16

;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;
; Returns the sum of squared differences (dqcoeff[i] - coeff[i])^2 over
; block_size coefficients and stores the sum of squared coeff[i] to *ssz.
;
; Register roles (names assigned by cglobal below):
;   uqcq  = coeff    (unquantized coefficients, 32 bit each)
;   dqcq  = dqcoeff  (dequantized coefficients, 32 bit each)
;   sizeq = block_size, number of coefficients
;   sszq  = ssz, destination of the 64 bit sum of squared coeff values
;   return value in rax (x86-64) or edx:eax (x86-32)
;
; Preconditions visible from the code:
;   - block_size must be a positive multiple of 16: every loop iteration
;     consumes 16 coefficients and the loops only terminate when the negated
;     counter reaches exactly zero.
;   - Both inputs are read with mova (the x86inc aligned move), so they are
;     expected to be 16 byte aligned.
;   - Coefficients are narrowed to 16 bits with signed saturation (packssdw)
;     and subtracted with 16 bit arithmetic, so values and their differences
;     are assumed to fit in 15 bits + sign (the "8bit" in the name).
;
; Structure:
;   1. block_size == 16: straight line code, no loop, 64 bit horizontal sum.
;   2. .generic: loop accumulating in 32 bit lanes while tracking whether any
;      lane could have overflowed.
;   3. .highprec: only entered when (2) flagged a possible overflow; redoes
;      the whole computation with 64 bit accumulators.
;

INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
  vzeroupper

  ; If only one iteration is required, then handle this as a special case.
  ; It is the most frequent case, so we can have a significant gain here
  ; by not setting up a loop and accumulators.
  cmp    sizeq, 16
  jne   .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Common case of size == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; Load input vectors, narrowing 2 x 4 dwords into 8 saturated words each.
  ; xm0/xm1 = dqcoeff[0..7]/[8..15], xm2/xm3 = coeff[0..7]/[8..15]
  mova      xm0, [dqcq]
  packssdw  xm0, [dqcq+16]
  mova      xm2, [uqcq]
  packssdw  xm2, [uqcq+16]

  mova      xm1, [dqcq+32]
  packssdw  xm1, [dqcq+48]
  mova      xm3, [uqcq+32]
  packssdw  xm3, [uqcq+48]

  ; Compute the errors (dqcoeff - coeff) as 16 bit words.
  psubw     xm0, xm2
  psubw     xm1, xm3

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  pmaddwd   xm2, xm2              ; pairwise sums of coeff^2
  pmaddwd   xm3, xm3

  pmaddwd   xm0, xm0              ; pairwise sums of error^2
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32bits
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
  pxor      xm5, xm5

  ; 0x33 selects words 0-1 and 4-5, i.e. the low dword of each qword.
  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits

  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits

  paddq     xm2, xm3
  paddq     xm0, xm1

  ; Fold the high qword onto the low qword.
  psrldq    xm3, xm2, 8
  psrldq    xm1, xm0, 8

  paddq     xm2, xm3
  paddq     xm0, xm1

  ; Store the return value (xm0 = error sum) and *ssz (xm2 = coeff^2 sum)
%if ARCH_X86_64
  movq      rax, xm0
  movq   [sszq], xm2
%else
  movd      eax, xm0
  pextrd    edx, xm0, 1
  movq   [sszd], xm2
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, speculative low precision
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ALIGN 16
.generic:
  pxor      xm4, xm4                ; sse accumulator
  pxor      xm5, xm5                ; overflow detection register for xm4
  pxor      xm6, xm6                ; ssz accumulator
  pxor      xm7, xm7                ; overflow detection register for xm6

  ; Point both pointers at the end of their buffers and count a negative
  ; index up towards zero, so a single add both advances and tests the loop.
  lea      uqcq, [uqcq+sizeq*4]
  lea      dqcq, [dqcq+sizeq*4]
  neg     sizeq

  ; Push the negative size as the high precision code might need it
  push    sizeq

.loop:
  ; Load input vectors
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  ; Sets ZF for the jnz at the bottom of the loop. None of the SIMD
  ; instructions in between modify EFLAGS.
  add     sizeq, 16

  ; Compute the squared errors.
  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32bits
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; We accumulate using 32 bit arithmetic, but detect potential overflow
  ; by checking if the MSB of the accumulators have ever been a set bit.
  ; If yes, we redo the whole compute at the end on higher precision, but
  ; this happens extremely rarely, so we still achieve a net gain.
  ;
  ; The addends themselves may use all 32 bits (see above). An addend with
  ; its MSB set could wrap an accumulator lane past 2^32 and back below 2^31
  ; in a single step, which the accumulator test alone would not notice.
  ; ORing the addends into the detection registers as well guarantees that,
  ; while no flag is set, both operands are < 2^31 and the add cannot wrap.
  por       xm5, xm0  ; OR in the addend for overflow detection
  por       xm7, xm2  ; OR in the addend for overflow detection
  paddd     xm4, xm0
  paddd     xm6, xm2
  por       xm5, xm4  ; OR in the accumulator for overflow detection
  por       xm7, xm6  ; OR in the accumulator for overflow detection

  jnz .loop

  ; Add pairs horizontally (still only on 32 bits)
  phaddd    xm4, xm4
  por       xm5, xm4  ; OR in the accumulator for overflow detection
  phaddd    xm6, xm6
  por       xm7, xm6  ; OR in the accumulator for overflow detection

  ; Check for possibility of overflow by testing if bit 31 (the MSB) of any
  ; dword lane has ever been set. If it was not, then there was no overflow
  ; and the final sum will fit in 32 bits. If overflow may have happened, then
  ; we redo the whole computation on higher precision.
  ; pmovmskb collects the top bit of every byte; 0x8888 keeps only the bits
  ; that come from the most significant byte of each of the 4 dwords.
  por       xm7, xm5
  pmovmskb   r4, xm7
  test       r4, 0x8888
  jnz .highprec

  ; Final horizontal add (both inputs < 2^31, so no wrap), then zero extend
  ; the 32 bit totals to 64 bits.
  phaddd    xm4, xm4
  phaddd    xm6, xm6
  pmovzxdq  xm4, xm4
  pmovzxdq  xm6, xm6

  ; Restore stack
  pop     sizeq

  ; Store the return value (xm4 = error sum) and *ssz (xm6 = coeff^2 sum)
%if ARCH_X86_64
  movq      rax, xm4
  movq   [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq   [sszd], xm6
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, high precision case
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
  pxor      xm4, xm4                 ; sse accumulator
  pxor      xm5, xm5                 ; dedicated zero register
  pxor      xm6, xm6                 ; ssz accumulator

  ; Reload the negative size saved by .generic; uqcq/dqcq still point at the
  ; end of their buffers. This also rebalances the stack before RET.
  pop     sizeq

.loophp:
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  ; Sets ZF for the jnz at the bottom of the loop. None of the SIMD
  ; instructions in between modify EFLAGS.
  add     sizeq, 16

  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)

  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; accumulate in 64bit: interleaving with the zero register xm5 zero extends
  ; each dword to a qword (low half into xm7, high half in place).
  punpckldq xm7, xm0, xm5
  punpckhdq xm0, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm2, xm5
  punpckhdq xm2, xm5
  paddq     xm6, xm7

  punpckldq xm7, xm1, xm5
  punpckhdq xm1, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm3, xm5
  punpckhdq xm3, xm5
  paddq     xm6, xm7

  paddq     xm4, xm0
  paddq     xm4, xm1
  paddq     xm6, xm2
  paddq     xm6, xm3

  jnz .loophp

  ; Accumulate horizontally: fold the high qword onto the low qword.
  ; xm5 is no longer needed as the zero register at this point.
  movhlps   xm5, xm4
  movhlps   xm7, xm6
  paddq     xm4, xm5
  paddq     xm6, xm7

  ; Store the return value (xm4 = error sum) and *ssz (xm6 = coeff^2 sum)
%if ARCH_X86_64
  movq      rax, xm4
  movq   [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq   [sszd], xm6
%endif
  RET

END