fastquantizeb_neon.c 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. #include "./vp8_rtcd.h"
  12. #include "vp8/encoder/block.h"
  /*
   * Inverse zig-zag map: entry i (raster order within the 4x4 block) holds
   * the 1-based zig-zag scan position of coefficient i.
   */
  static const uint16_t inv_zig_zag[16] = {
    1,  2,  6,  7,
    3,  5,  8,  13,
    4,  9,  12, 14,
    10, 11, 15, 16
  };
  /*
   * NEON fast quantizer for one block of 16 coefficients.
   *
   * Reads b->coeff, b->round, b->quant_fast and d->dequant (16 int16 values
   * each, handled as two vectors of eight). Writes:
   *   d->qcoeff  - quantized coefficients,
   *   d->dqcoeff - quantized coefficients multiplied by the dequant factors,
   *   *d->eob    - largest 1-based zig-zag position (from inv_zig_zag) that
   *                holds a non-zero quantized coefficient, 0 if none does.
   */
  void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
    /* Every bit set: vtst against this flags any lane that is non-zero. */
    const int16x8_t all_bits = vdupq_n_s16(-1);

    /* Fetch all inputs before anything is written back. */
    const int16x8_t coeff_lo = vld1q_s16(b->coeff);
    const int16x8_t coeff_hi = vld1q_s16(b->coeff + 8);
    const int16x8_t round_lo = vld1q_s16(b->round);
    const int16x8_t round_hi = vld1q_s16(b->round + 8);
    const int16x8_t quant_lo = vld1q_s16(b->quant_fast);
    const int16x8_t quant_hi = vld1q_s16(b->quant_fast + 8);
    const int16x8_t dequant_lo = vld1q_s16(d->dequant);
    const int16x8_t dequant_hi = vld1q_s16(d->dequant + 8);
    const uint16x8_t scan_lo = vld1q_u16(inv_zig_zag);
    const uint16x8_t scan_hi = vld1q_u16(inv_zig_zag + 8);

    /* Per-lane sign mask (coeff >> 15): 0 when non-negative, -1 otherwise. */
    const int16x8_t sign_lo = vshrq_n_s16(coeff_lo, 15);
    const int16x8_t sign_hi = vshrq_n_s16(coeff_hi, 15);

    /* |coeff| + round */
    const int16x8_t biased_lo = vaddq_s16(vabsq_s16(coeff_lo), round_lo);
    const int16x8_t biased_hi = vaddq_s16(vabsq_s16(coeff_hi), round_hi);

    /*
     * vqdmulhq yields (2 * biased * quant) >> 16; the extra shift by one
     * removes the doubling again.
     */
    const int16x8_t mag_lo = vshrq_n_s16(vqdmulhq_s16(biased_lo, quant_lo), 1);
    const int16x8_t mag_hi = vshrq_n_s16(vqdmulhq_s16(biased_hi, quant_hi), 1);

    /* Put the sign back: (mag ^ sign) - sign. */
    const int16x8_t q_lo = vsubq_s16(veorq_s16(mag_lo, sign_lo), sign_lo);
    const int16x8_t q_hi = vsubq_s16(veorq_s16(mag_hi, sign_hi), sign_hi);

    /*
     * Non-zero lanes become all-ones masks; AND-ing with the scan table keeps
     * the zig-zag position for those lanes and 0 for the rest.
     */
    const uint16x8_t pos_lo = vandq_u16(vtstq_s16(q_lo, all_bits), scan_lo);
    const uint16x8_t pos_hi = vandq_u16(vtstq_s16(q_hi, all_bits), scan_hi);

    /* Lane-wise maximum of both halves; still needs a horizontal reduction. */
    const uint16x8_t pos_max = vmaxq_u16(pos_lo, pos_hi);

  #ifndef __aarch64__
    /* No across-vector max here: fold 8 -> 4 -> 2 -> 1 lanes step by step. */
    const uint16x4_t max_d16 =
        vmax_u16(vget_low_u16(pos_max), vget_high_u16(pos_max));
    const uint32x4_t max_q32 = vmovl_u16(max_d16);
    const uint32x2_t max_d32 =
        vmax_u32(vget_low_u32(max_q32), vget_high_u32(max_q32));
    const uint32x2_t max_all = vpmax_u32(max_d32, max_d32);
  #endif  // !__aarch64__

    /* End of block = highest zig-zag position with a non-zero coefficient. */
  #ifdef __aarch64__
    *d->eob = (int8_t)vmaxvq_u16(pos_max);
  #else
    /* Write byte lane 0 of the reduced maximum. */
    vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(max_all), 0);
  #endif  // __aarch64__

    /* qcoeff = quantized value */
    vst1q_s16(d->qcoeff, q_lo);
    vst1q_s16(d->qcoeff + 8, q_hi);

    /* dqcoeff = quantized value * dequant */
    vst1q_s16(d->dqcoeff, vmulq_s16(dequant_lo, q_lo));
    vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant_hi, q_hi));
  }