sad4d_neon.c

/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h> /* for memcpy() */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

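// Load 4 bytes from each of two (possibly unaligned) buffers into lanes 0
// and 1 of a single uint8x8_t. memcpy() keeps the unaligned 32-bit loads
// free of undefined behavior.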
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
                                                 const void *const buf1) {
  uint32_t a;
  uint32x2_t aa = vdup_n_u32(0);
  memcpy(&a, buf0, 4);
  aa = vset_lane_u32(a, aa, 0);
  memcpy(&a, buf1, 4);
  aa = vset_lane_u32(a, aa, 1);
  return vreinterpret_u8_u32(aa);
}

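// Calculate the SAD of a 4-wide block against each of 4 references at once.
// Each row, the 4 source bytes are broadcast into both halves of a
// uint8x8_t (via vld1_dup_u32), so one vabal_u8 accumulates that row
// against two references simultaneously.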
static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
                            const uint8_t *const ref_array[4],
                            const int ref_stride, const int height,
                            uint32_t *const res) {
  int i;
  uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
  uint16x4_t a[2];
  uint32x4_t r;

  assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
  assert(!(src_stride % sizeof(uint32_t)));

  for (i = 0; i < height; ++i) {
    const uint8x8_t s = vreinterpret_u8_u32(
        vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
    const uint8x8_t ref01 = load_unaligned_2_buffers(
        ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
    const uint8x8_t ref23 = load_unaligned_2_buffers(
        ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
    abs[0] = vabal_u8(abs[0], s, ref01);
    abs[1] = vabal_u8(abs[1], s, ref23);
  }

  a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
  a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
  r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
  vst1q_u32(res, r);
}

void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
}

void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
}

////////////////////////////////////////////////////////////////////////////////
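// In the loops feeding these reductions, each uint16x8_t accumulator lane
// gathers at most 256 absolute differences of 8-bit pixels (256 * 255 =
// 65280 < 65535), so 16-bit accumulation is overflow-safe for sums of up to
// 2048 pixels per reference. The final helpers below differ only in how
// early they widen to 32 bits while reducing the accumulators to the four
// output SAD values.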

// Can handle the SAD sum of up to 512 pixels (e.g. 16x32 or 32x16 blocks).
static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                          uint32_t *const res) {
  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
  const uint16x4_t b0 = vpadd_u16(a0, a1);
  const uint16x4_t b1 = vpadd_u16(a2, a3);
  const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
  vst1q_u32(res, r);
}

// Can handle the SAD sum of up to 1024 pixels (e.g. 32x32 blocks).
static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
  const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
  const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
  const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
  const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
  const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
  const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
  const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
  const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
  vst1q_u32(res, vcombine_u32(c0, c1));
}

// Can handle the SAD sum of up to 2048 pixels (e.g. 32x64 or 64x32 blocks).
static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
  const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
  const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
  const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
  const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
  const uint32x2_t c0 = vpadd_u32(b0, b1);
  const uint32x2_t c1 = vpadd_u32(b2, b3);
  vst1q_u32(res, vcombine_u32(c0, c1));
}

// Can handle the SAD sum of up to 4096 pixels (e.g. 64x64 blocks), given two
// 16-bit accumulators per reference.
static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
                                           uint32_t *const res) {
  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
  const uint32x4_t a4 = vpaddlq_u16(sum[4]);
  const uint32x4_t a5 = vpaddlq_u16(sum[5]);
  const uint32x4_t a6 = vpaddlq_u16(sum[6]);
  const uint32x4_t a7 = vpaddlq_u16(sum[7]);
  const uint32x4_t b0 = vaddq_u32(a0, a1);
  const uint32x4_t b1 = vaddq_u32(a2, a3);
  const uint32x4_t b2 = vaddq_u32(a4, a5);
  const uint32x4_t b3 = vaddq_u32(a6, a7);
  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
  const uint32x2_t d0 = vpadd_u32(c0, c1);
  const uint32x2_t d1 = vpadd_u32(c2, c3);
  vst1q_u32(res, vcombine_u32(d0, d1));
}

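// Calculate the SAD of an 8-wide block against each of 4 references, using
// one uint16x8_t accumulator per reference.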
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_array[4], int ref_stride,
                            uint32_t *res, const int height) {
  int i, j;
  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                 ref_array[3] };
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  for (i = 0; i < height; ++i) {
    const uint8x8_t s = vld1_u8(src_ptr);
    src_ptr += src_stride;
    for (j = 0; j < 4; ++j) {
      const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
      ref_loop[j] += ref_stride;
      sum[j] = vabal_u8(sum[j], s, b_u8);
    }
  }

  sad_512_pel_final_neon(sum, res);
}

void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
}

void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
}

void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
}

////////////////////////////////////////////////////////////////////////////////
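// Accumulate the absolute differences of 16 pixels between ref_ptr and the
// 16 source pixels held in src_ptr, widening into the 16-bit lanes of *sum.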
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
                              uint16x8_t *const sum) {
  const uint8x16_t r = vld1q_u8(ref_ptr);
  *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
  *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
}

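// Calculate the SAD of a 16-wide block against each of 4 references.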
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t *res, const int height) {
  int i, j;
  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                 ref_array[3] };
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  for (i = 0; i < height; ++i) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    for (j = 0; j < 4; ++j) {
      sad16_neon(ref_loop[j], s, &sum[j]);
      ref_loop[j] += ref_stride;
    }
  }

  sad_512_pel_final_neon(sum, res);
}

void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
}

void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
}

void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
}

////////////////////////////////////////////////////////////////////////////////
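// Calculate the SAD of a 32-wide block (two 16-byte loads per row) against
// each of 4 references. The caller applies the final reduction that matches
// the block's total pixel count.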
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
                             const int height, uint16x8_t *const sum) {
  int i;
  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                 ref_array[3] };

  sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    uint8x16_t s;

    s = vld1q_u8(src_ptr + 0 * 16);
    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);

    s = vld1q_u8(src_ptr + 1 * 16);
    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);

    src_ptr += src_stride;
    ref_loop[0] += ref_stride;
    ref_loop[1] += ref_stride;
    ref_loop[2] += ref_stride;
    ref_loop[3] += ref_stride;
  }
}

void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  uint16x8_t sum[4];
  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
  sad_512_pel_final_neon(sum, res);
}

void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  uint16x8_t sum[4];
  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
  sad_1024_pel_final_neon(sum, res);
}

void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  uint16x8_t sum[4];
  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
  sad_2048_pel_final_neon(sum, res);
}

////////////////////////////////////////////////////////////////////////////////
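// 64-wide blocks use four 16-byte loads per row. For 64x32 (2048 pixels per
// reference) a single uint16x8_t accumulator per reference is still
// overflow-safe.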
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  int i;
  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                 ref_array[3] };
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  for (i = 0; i < 32; ++i) {
    uint8x16_t s;

    s = vld1q_u8(src_ptr + 0 * 16);
    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);

    s = vld1q_u8(src_ptr + 1 * 16);
    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);

    s = vld1q_u8(src_ptr + 2 * 16);
    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);

    s = vld1q_u8(src_ptr + 3 * 16);
    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);

    src_ptr += src_stride;
    ref_loop[0] += ref_stride;
    ref_loop[1] += ref_stride;
    ref_loop[2] += ref_stride;
    ref_loop[3] += ref_stride;
  }

  sad_2048_pel_final_neon(sum, res);
}

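// For 64x64, the 4096 pixels per reference would overflow 16-bit lanes
// (512 * 255 > 65535), so each reference gets two accumulators: sum[2 * j]
// covers columns 0-31 of reference j and sum[2 * j + 1] covers columns
// 32-63.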
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  int i;
  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                 ref_array[3] };
  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0), vdupq_n_u16(0) };

  for (i = 0; i < 64; ++i) {
    uint8x16_t s;

    s = vld1q_u8(src_ptr + 0 * 16);
    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);

    s = vld1q_u8(src_ptr + 1 * 16);
    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);

    s = vld1q_u8(src_ptr + 2 * 16);
    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);

    s = vld1q_u8(src_ptr + 3 * 16);
    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);

    src_ptr += src_stride;
    ref_loop[0] += ref_stride;
    ref_loop[1] += ref_stride;
    ref_loop[2] += ref_stride;
    ref_loop[3] += ref_stride;
  }

  sad_4096_pel_final_neon(sum, res);
}
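
// Example usage (an illustrative sketch: src_frame, ref_frame, the strides
// and the candidate offsets mv0..mv3 are hypothetical):
//
//   const uint8_t *const refs[4] = { ref_frame + mv0, ref_frame + mv1,
//                                    ref_frame + mv2, ref_frame + mv3 };
//   uint32_t sads[4];
//   vpx_sad16x16x4d_neon(src_frame, src_stride, refs, ref_stride, sads);
//   // sads[i] now holds the SAD of the 16x16 source block against refs[i].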