fdct_partial_neon.c 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. /*
  2. * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. #include "./vpx_dsp_rtcd.h"
  12. #include "./vpx_config.h"
  13. #include "vpx_dsp/arm/mem_neon.h"
  14. #include "vpx_dsp/arm/sum_neon.h"
  15. static INLINE tran_low_t get_lane(const int32x2_t a) {
  16. #if CONFIG_VP9_HIGHBITDEPTH
  17. return vget_lane_s32(a, 0);
  18. #else
  19. return vget_lane_s16(vreinterpret_s16_s32(a), 0);
  20. #endif // CONFIG_VP9_HIGHBITDETPH
  21. }
  22. void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  23. int16x4_t a0, a1, a2, a3;
  24. int16x8_t b0, b1;
  25. int16x8_t c;
  26. int32x2_t d;
  27. a0 = vld1_s16(input);
  28. input += stride;
  29. a1 = vld1_s16(input);
  30. input += stride;
  31. a2 = vld1_s16(input);
  32. input += stride;
  33. a3 = vld1_s16(input);
  34. b0 = vcombine_s16(a0, a1);
  35. b1 = vcombine_s16(a2, a3);
  36. c = vaddq_s16(b0, b1);
  37. d = horizontal_add_int16x8(c);
  38. output[0] = get_lane(vshl_n_s32(d, 1));
  39. output[1] = 0;
  40. }
  41. void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  42. int r;
  43. int16x8_t sum = vld1q_s16(&input[0]);
  44. for (r = 1; r < 8; ++r) {
  45. const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
  46. sum = vaddq_s16(sum, input_00);
  47. }
  48. output[0] = get_lane(horizontal_add_int16x8(sum));
  49. output[1] = 0;
  50. }
  51. void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
  52. int stride) {
  53. int r;
  54. int16x8_t left = vld1q_s16(input);
  55. int16x8_t right = vld1q_s16(input + 8);
  56. int32x2_t sum;
  57. input += stride;
  58. for (r = 1; r < 16; ++r) {
  59. const int16x8_t a = vld1q_s16(input);
  60. const int16x8_t b = vld1q_s16(input + 8);
  61. input += stride;
  62. left = vaddq_s16(left, a);
  63. right = vaddq_s16(right, b);
  64. }
  65. sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
  66. output[0] = get_lane(vshr_n_s32(sum, 1));
  67. output[1] = 0;
  68. }
  69. void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
  70. int stride) {
  71. int r;
  72. int16x8_t a0 = vld1q_s16(input);
  73. int16x8_t a1 = vld1q_s16(input + 8);
  74. int16x8_t a2 = vld1q_s16(input + 16);
  75. int16x8_t a3 = vld1q_s16(input + 24);
  76. int32x2_t sum;
  77. input += stride;
  78. for (r = 1; r < 32; ++r) {
  79. const int16x8_t b0 = vld1q_s16(input);
  80. const int16x8_t b1 = vld1q_s16(input + 8);
  81. const int16x8_t b2 = vld1q_s16(input + 16);
  82. const int16x8_t b3 = vld1q_s16(input + 24);
  83. input += stride;
  84. a0 = vaddq_s16(a0, b0);
  85. a1 = vaddq_s16(a1, b1);
  86. a2 = vaddq_s16(a2, b2);
  87. a3 = vaddq_s16(a3, b3);
  88. }
  89. sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
  90. sum = vadd_s32(sum, horizontal_add_int16x8(a2));
  91. sum = vadd_s32(sum, horizontal_add_int16x8(a3));
  92. output[0] = get_lane(vshr_n_s32(sum, 3));
  93. output[1] = 0;
  94. }