iwalsh_neon.c 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  /*
   *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
   *
   *  Use of this source code is governed by a BSD-style license
   *  that can be found in the LICENSE file in the root of the source
   *  tree. An additional intellectual property rights grant can be found
   *  in the file PATENTS.  All contributing project authors may
   *  be found in the AUTHORS file in the root of the source tree.
   */
  10. #include <arm_neon.h>
  11. #include "./vp8_rtcd.h"
  // 4x4 inverse Walsh transform (per the function name), NEON intrinsics
  // version.
  //
  // input:      16 contiguous int16_t values, read as four 4-lane vectors
  //             (elements 0..3, 4..7, 8..11 and 12..15).
  // mb_dqcoeff: destination. The 16 results are scattered to elements
  //             0, 16, 32, ..., 240, so at least 241 int16_t must be
  //             writable. Elements in between are not touched.
  //             NOTE(review): presumably these are the first coefficients of
  //             16 consecutive 16-element blocks -- confirm against callers.
  //
  // Both passes apply the same butterfly to four vectors v0..v3:
  //   low(sum)  = (v0 + v3) + (v1 + v2)    high(sum)  = (v0 - v3) + (v1 - v2)
  //   low(diff) = (v0 + v3) - (v1 + v2)    high(diff) = (v0 - v3) - (v1 - v2)
  // A 4x4 transpose sits between the passes, and the second pass is rounded
  // with (x + 3) >> 3 (arithmetic shift) before being stored.
  void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
    // Load the 4x4 block as four 4-lane vectors.
    const int16x8_t in_lo = vld1q_s16(input);
    const int16x8_t in_hi = vld1q_s16(input + 8);
    const int16x4_t r0 = vget_low_s16(in_lo);
    const int16x4_t r1 = vget_high_s16(in_lo);
    const int16x4_t r2 = vget_low_s16(in_hi);
    const int16x4_t r3 = vget_high_s16(in_hi);

    // First pass: butterfly across the four input vectors.
    const int16x8_t outer1 = vcombine_s16(vadd_s16(r0, r3), vsub_s16(r0, r3));
    const int16x8_t inner1 = vcombine_s16(vadd_s16(r1, r2), vsub_s16(r1, r2));
    const int16x8_t sum1 = vaddq_s16(outer1, inner1);
    const int16x8_t diff1 = vsubq_s16(outer1, inner1);

    // Transpose the 4x4 intermediate: swap 32-bit pairs first, then the
    // 16-bit lanes within them.
    const int32x2x2_t pairs_lo =
        vtrn_s32(vreinterpret_s32_s16(vget_low_s16(sum1)),
                 vreinterpret_s32_s16(vget_low_s16(diff1)));
    const int32x2x2_t pairs_hi =
        vtrn_s32(vreinterpret_s32_s16(vget_high_s16(sum1)),
                 vreinterpret_s32_s16(vget_high_s16(diff1)));
    const int16x4x2_t cols01 = vtrn_s16(vreinterpret_s16_s32(pairs_lo.val[0]),
                                        vreinterpret_s16_s32(pairs_hi.val[0]));
    const int16x4x2_t cols23 = vtrn_s16(vreinterpret_s16_s32(pairs_lo.val[1]),
                                        vreinterpret_s16_s32(pairs_hi.val[1]));
    const int16x4_t c0 = cols01.val[0];
    const int16x4_t c1 = cols01.val[1];
    const int16x4_t c2 = cols23.val[0];
    const int16x4_t c3 = cols23.val[1];

    // Second pass: same butterfly on the transposed vectors, then round.
    const int16x8_t outer2 = vcombine_s16(vadd_s16(c0, c3), vsub_s16(c0, c3));
    const int16x8_t inner2 = vcombine_s16(vadd_s16(c1, c2), vsub_s16(c1, c2));
    const int16x8_t round3 = vdupq_n_s16(3);
    const int16x8_t out_sum =
        vshrq_n_s16(vaddq_s16(vaddq_s16(outer2, inner2), round3), 3);
    const int16x8_t out_diff =
        vshrq_n_s16(vaddq_s16(vsubq_s16(outer2, inner2), round3), 3);
    const int16x4_t o0 = vget_low_s16(out_sum);
    const int16x4_t o1 = vget_high_s16(out_sum);
    const int16x4_t o2 = vget_low_s16(out_diff);
    const int16x4_t o3 = vget_high_s16(out_diff);

    // Store: result k = 4 * lane + j is lane `lane` of vector o<j> and goes to
    // mb_dqcoeff[16 * k]. The lane index must be a compile-time constant,
    // hence a macro rather than a loop.
#define VP8_IWALSH_STORE_LANE(lane)                                \
    do {                                                           \
      vst1_lane_s16(mb_dqcoeff + (4 * (lane) + 0) * 16, o0, lane); \
      vst1_lane_s16(mb_dqcoeff + (4 * (lane) + 1) * 16, o1, lane); \
      vst1_lane_s16(mb_dqcoeff + (4 * (lane) + 2) * 16, o2, lane); \
      vst1_lane_s16(mb_dqcoeff + (4 * (lane) + 3) * 16, o3, lane); \
    } while (0)

    VP8_IWALSH_STORE_LANE(0);
    VP8_IWALSH_STORE_LANE(1);
    VP8_IWALSH_STORE_LANE(2);
    VP8_IWALSH_STORE_LANE(3);

#undef VP8_IWALSH_STORE_LANE
  }