/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
#define VPX_VPX_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"

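// Build small constant vectors from scalar lane values without going through
// memory.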
static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
                                          const int16_t c2, const int16_t c3) {
  return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
                     ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48));
}

static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
  return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32));
}

static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
                                          const int32_t c2, const int32_t c3) {
  return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
}
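
// Illustrative usage (not part of the original header): pack four scalar
// constants into a single int16x4_t without a memory load, e.g. cosine
// constants for a transform. The function name and values are placeholders.
static INLINE int16x4_t example_make_constants(void) {
  return create_s16x4_neon(11585, 15137, 6270, 11585);
}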

// Helper functions used to load tran_low_t into int16, narrowing if necessary.
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4x2_t v0 = vld2q_s32(buf);
  const int32x4x2_t v1 = vld2q_s32(buf + 8);
  const int16x4_t s0 = vmovn_s32(v0.val[0]);
  const int16x4_t s1 = vmovn_s32(v0.val[1]);
  const int16x4_t s2 = vmovn_s32(v1.val[0]);
  const int16x4_t s3 = vmovn_s32(v1.val[1]);
  int16x8x2_t res;
  res.val[0] = vcombine_s16(s0, s2);
  res.val[1] = vcombine_s16(s1, s3);
  return res;
#else
  return vld2q_s16(buf);
#endif
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
#else
  return vld1q_s16(buf);
#endif
}

static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  return vmovn_s32(v0);
#else
  return vld1_s16(buf);
#endif
}
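
// Store an int16x8_t of coefficients to a tran_low_t buffer, widening back to
// int32 when CONFIG_VP9_HIGHBITDEPTH is enabled.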
static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
#else
  vst1q_s16(buf, a);
#endif
}
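
// Illustrative usage (not part of the original header): round-trip eight
// coefficients through the narrowing load and widening store. The function
// name is hypothetical.
static INLINE void example_copy_8_coeffs(tran_low_t *dst,
                                         const tran_low_t *src) {
  store_s16q_to_tran_low(dst, load_tran_low_to_s16q(src));
}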

// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of uint32_t (4 bytes) and add alignment hints
// to the memory access.
//
// This is used for functions operating on uint8_t which wish to load or store
// 4 values at a time but which may not be on 4 byte boundaries.
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  memcpy(buf, &a, 4);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32 = vdup_n_u32(0);
  if (stride == 4) return vld1_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vset_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Store 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
                                      const uint8x8_t a) {
  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
  if (stride == 4) {
    vst1_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
}
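
// Illustrative usage (not part of the original header): copy two 4-byte rows
// between buffers whose rows need not be 4-byte aligned. The function name is
// hypothetical.
static INLINE void example_copy_2x4(uint8_t *dst, int dst_stride,
                                    const uint8_t *src, int src_stride) {
  const uint8x8_t rows = load_unaligned_u8(src, src_stride);
  store_unaligned_u8(dst, dst_stride, rows);
}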

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

// Store 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                       const uint8x16_t a) {
  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
  if (stride == 4) {
    vst1q_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
}
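
// Illustrative usage (not part of the original header): average two 4x4
// blocks whose rows may not be 4-byte aligned, in the style of an averaging
// predictor. The function name is hypothetical.
static INLINE void example_avg_4x4(uint8_t *dst, int dst_stride,
                                   const uint8_t *src, int src_stride) {
  const uint8x16_t d = load_unaligned_u8q(dst, dst_stride);
  const uint8x16_t s = load_unaligned_u8q(src, src_stride);
  // vrhaddq_u8 computes the rounding average of each pair of bytes.
  store_unaligned_u8q(dst, dst_stride, vrhaddq_u8(d, s));
}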

// Load 2 sets of 4 bytes when alignment is guaranteed.
static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
  uint32x2_t a = vdup_n_u32(0);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
  buf += stride;
  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
  return vreinterpret_u8_u32(a);
}

// Store 2 sets of 4 bytes when alignment is guaranteed.
static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
  uint32x2_t a_u32 = vreinterpret_u32_u8(a);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
  buf += stride;
  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}

#endif  // VPX_VPX_DSP_ARM_MEM_NEON_H_