/* sad_neon.c */
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. #include "./vpx_config.h"
  12. #include "./vpx_dsp_rtcd.h"
  13. #include "vpx/vpx_integer.h"
  14. #include "vpx_dsp/arm/mem_neon.h"
  15. #include "vpx_dsp/arm/sum_neon.h"
  16. uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
  17. const uint8_t *ref_ptr, int ref_stride) {
  18. const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
  19. const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
  20. uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
  21. abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
  22. return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
  23. }
  24. uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
  25. const uint8_t *ref_ptr, int ref_stride,
  26. const uint8_t *second_pred) {
  27. const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
  28. const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
  29. const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
  30. const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
  31. uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
  32. abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
  33. return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
  34. }
  35. uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
  36. const uint8_t *ref_ptr, int ref_stride) {
  37. int i;
  38. uint16x8_t abs = vdupq_n_u16(0);
  39. for (i = 0; i < 8; i += 4) {
  40. const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
  41. const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
  42. src_ptr += 4 * src_stride;
  43. ref_ptr += 4 * ref_stride;
  44. abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8));
  45. abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
  46. }
  47. return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
  48. }
  49. uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
  50. const uint8_t *ref_ptr, int ref_stride,
  51. const uint8_t *second_pred) {
  52. int i;
  53. uint16x8_t abs = vdupq_n_u16(0);
  54. for (i = 0; i < 8; i += 4) {
  55. const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
  56. const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
  57. const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
  58. const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
  59. src_ptr += 4 * src_stride;
  60. ref_ptr += 4 * ref_stride;
  61. second_pred += 16;
  62. abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg));
  63. abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
  64. }
  65. return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
  66. }
  67. static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
  68. const uint8_t *ref_ptr, int ref_stride,
  69. const int height) {
  70. int i;
  71. uint16x8_t abs = vdupq_n_u16(0);
  72. for (i = 0; i < height; ++i) {
  73. const uint8x8_t a_u8 = vld1_u8(src_ptr);
  74. const uint8x8_t b_u8 = vld1_u8(ref_ptr);
  75. src_ptr += src_stride;
  76. ref_ptr += ref_stride;
  77. abs = vabal_u8(abs, a_u8, b_u8);
  78. }
  79. return abs;
  80. }
  81. static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
  82. const uint8_t *ref_ptr, int ref_stride,
  83. const uint8_t *second_pred,
  84. const int height) {
  85. int i;
  86. uint16x8_t abs = vdupq_n_u16(0);
  87. for (i = 0; i < height; ++i) {
  88. const uint8x8_t a_u8 = vld1_u8(src_ptr);
  89. const uint8x8_t b_u8 = vld1_u8(ref_ptr);
  90. const uint8x8_t c_u8 = vld1_u8(second_pred);
  91. const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
  92. src_ptr += src_stride;
  93. ref_ptr += ref_stride;
  94. second_pred += 8;
  95. abs = vabal_u8(abs, a_u8, avg);
  96. }
  97. return abs;
  98. }
/* Instantiate the public 8-wide entry points vpx_sad8x<n>_neon() and
 * vpx_sad8x<n>_avg_neon() as thin wrappers that run the row loop and
 * then horizontally reduce the 16-bit partial sums to a single total. */
#define sad8xN(n) \
uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
} \
\
uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride, \
const uint8_t *second_pred) { \
const uint16x8_t abs = \
sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
/* 8x4, 8x8 and 8x16 block sizes. */
sad8xN(4);
sad8xN(8);
sad8xN(16);
  116. static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
  117. const uint8_t *ref_ptr, int ref_stride,
  118. const int height) {
  119. int i;
  120. uint16x8_t abs = vdupq_n_u16(0);
  121. for (i = 0; i < height; ++i) {
  122. const uint8x16_t a_u8 = vld1q_u8(src_ptr);
  123. const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
  124. src_ptr += src_stride;
  125. ref_ptr += ref_stride;
  126. abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
  127. abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
  128. }
  129. return abs;
  130. }
  131. static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
  132. const uint8_t *ref_ptr, int ref_stride,
  133. const uint8_t *second_pred,
  134. const int height) {
  135. int i;
  136. uint16x8_t abs = vdupq_n_u16(0);
  137. for (i = 0; i < height; ++i) {
  138. const uint8x16_t a_u8 = vld1q_u8(src_ptr);
  139. const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
  140. const uint8x16_t c_u8 = vld1q_u8(second_pred);
  141. const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
  142. src_ptr += src_stride;
  143. ref_ptr += ref_stride;
  144. second_pred += 16;
  145. abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
  146. abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
  147. }
  148. return abs;
  149. }
/* Instantiate the public 16-wide entry points vpx_sad16x<n>_neon() and
 * vpx_sad16x<n>_avg_neon(): run the row loop, then reduce the 16-bit
 * per-lane sums to one scalar total. */
#define sad16xN(n) \
uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
const uint16x8_t abs = \
sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
} \
\
uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride, \
const uint8_t *second_pred) { \
const uint16x8_t abs = \
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
/* 16x8, 16x16 and 16x32 block sizes. */
sad16xN(8);
sad16xN(16);
sad16xN(32);
  168. static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
  169. const uint8_t *ref_ptr, int ref_stride,
  170. const int height) {
  171. int i;
  172. uint16x8_t abs = vdupq_n_u16(0);
  173. for (i = 0; i < height; ++i) {
  174. const uint8x16_t a_lo = vld1q_u8(src_ptr);
  175. const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
  176. const uint8x16_t b_lo = vld1q_u8(ref_ptr);
  177. const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
  178. src_ptr += src_stride;
  179. ref_ptr += ref_stride;
  180. abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
  181. abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
  182. abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
  183. abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi));
  184. }
  185. return abs;
  186. }
/* Like sad32x() but compares src against the rounding average of the
 * reference and a contiguous 32-wide second predictor (compound
 * prediction).  second_pred advances 32 bytes per row. */
static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);
  for (i = 0; i < height; ++i) {
    /* Each 32-byte row is processed as two 16-byte halves. */
    const uint8x16_t a_lo = vld1q_u8(src_ptr);
    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
    const uint8x16_t c_lo = vld1q_u8(second_pred);
    const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
    /* vrhaddq_u8 is a rounding halving add: (b + c + 1) >> 1 per byte. */
    const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
    const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;
    abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
    abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
    abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
    abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi));
  }
  return abs;
}
/* Instantiate the public 32-wide entry points vpx_sad32x<n>_neon() and
 * vpx_sad32x<n>_avg_neon(): run the row loop, then reduce the 16-bit
 * per-lane sums to one scalar total. */
#define sad32xN(n) \
uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
const uint16x8_t abs = \
sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
} \
\
uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride, \
const uint8_t *second_pred) { \
const uint16x8_t abs = \
sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
/* 32x16, 32x32 and 32x64 block sizes. */
sad32xN(16);
sad32xN(32);
sad32xN(64);
/* Row-by-row SAD for 64-wide blocks.  Two 16-bit accumulators are used
 * because one could overflow: a single accumulator covering all 64
 * columns would add 8 bytes per lane per row, and 8 * 255 * 64 exceeds
 * 65535 at height 64.  Splitting the row keeps each lane at most
 * 4 * 255 * 64 = 65280.  The result is widened to 32 bits on return. */
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  int i;
  uint16x8_t abs_0 = vdupq_n_u16(0);
  uint16x8_t abs_1 = vdupq_n_u16(0);
  for (i = 0; i < height; ++i) {
    /* Each 64-byte row is loaded as four 16-byte quarters. */
    const uint8x16_t a_0 = vld1q_u8(src_ptr);
    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    /* Columns 0-31 accumulate into abs_0, columns 32-63 into abs_1. */
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3));
  }
  {
    /* Pairwise-widen abs_0 to 32 bits, then fold abs_1 in on top. */
    const uint32x4_t sum = vpaddlq_u16(abs_0);
    return vpadalq_u16(sum, abs_1);
  }
}
/* Like sad64x() but compares src against the rounding average of the
 * reference and a contiguous 64-wide second predictor (compound
 * prediction).  Uses the same split pair of 16-bit accumulators to
 * avoid lane overflow at height 64; widened to 32 bits on return. */
static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  int i;
  uint16x8_t abs_0 = vdupq_n_u16(0);
  uint16x8_t abs_1 = vdupq_n_u16(0);
  for (i = 0; i < height; ++i) {
    /* Each 64-byte row is loaded as four 16-byte quarters. */
    const uint8x16_t a_0 = vld1q_u8(src_ptr);
    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
    const uint8x16_t c_0 = vld1q_u8(second_pred);
    const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
    const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
    const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
    /* Compound predictor: (ref + second_pred + 1) >> 1 per byte. */
    const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
    const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
    const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
    const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64;
    /* Columns 0-31 accumulate into abs_0, columns 32-63 into abs_1. */
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3));
  }
  {
    /* Pairwise-widen abs_0 to 32 bits, then fold abs_1 in on top. */
    const uint32x4_t sum = vpaddlq_u16(abs_0);
    return vpadalq_u16(sum, abs_1);
  }
}
/* Instantiate the public 64-wide entry points vpx_sad64x<n>_neon() and
 * vpx_sad64x<n>_avg_neon().  Unlike the narrower sizes these reduce a
 * 32-bit accumulator vector, since the 64-wide row loop already widens
 * past 16 bits. */
#define sad64xN(n) \
uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
const uint32x4_t abs = \
sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
} \
\
uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride, \
const uint8_t *second_pred) { \
const uint32x4_t abs = \
sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
}
/* 64x32 and 64x64 block sizes. */
sad64xN(32);
sad64xN(64);