sad_vsx.c

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

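/* PROCESS16: load 16 bytes from a and b at the given offset, take the
 * element-wise absolute difference, and accumulate the byte sums into the
 * four 32-bit lanes of v_sad. */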
#define PROCESS16(offset) \
  v_a = vec_vsx_ld(offset, a); \
  v_b = vec_vsx_ld(offset, b); \
  v_abs = vec_absd(v_a, v_b); \
  v_sad = vec_sum4s(v_abs, v_sad);

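/* Each SADn(height) macro expands to vpx_sadnx<height>_vsx(): walk the block
 * one row at a time, accumulate absolute differences in v_sad, then reduce
 * the vector accumulator to a scalar. The 8-wide kernel still loads a full
 * 16-byte vector per row, but only the two lanes covering the first 8 bytes
 * are included in the returned sum. */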
#define SAD8(height) \
  unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \
                                       const uint8_t *b, int b_stride) { \
    int y = 0; \
    uint8x16_t v_a, v_b, v_abs; \
    uint32x4_t v_sad = vec_zeros_u32; \
    \
    do { \
      PROCESS16(0) \
      \
      a += a_stride; \
      b += b_stride; \
      y++; \
    } while (y < height); \
    \
    return v_sad[1] + v_sad[0]; \
  }

#define SAD16(height) \
  unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \
                                        const uint8_t *b, int b_stride) { \
    int y = 0; \
    uint8x16_t v_a, v_b, v_abs; \
    uint32x4_t v_sad = vec_zeros_u32; \
    \
    do { \
      PROCESS16(0); \
      \
      a += a_stride; \
      b += b_stride; \
      y++; \
    } while (y < height); \
    \
    return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \
  }

#define SAD32(height) \
  unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \
                                        const uint8_t *b, int b_stride) { \
    int y = 0; \
    uint8x16_t v_a, v_b, v_abs; \
    uint32x4_t v_sad = vec_zeros_u32; \
    \
    do { \
      PROCESS16(0); \
      PROCESS16(16); \
      \
      a += a_stride; \
      b += b_stride; \
      y++; \
    } while (y < height); \
    \
    return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \
  }

#define SAD64(height) \
  unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \
                                        const uint8_t *b, int b_stride) { \
    int y = 0; \
    uint8x16_t v_a, v_b, v_abs; \
    uint32x4_t v_sad = vec_zeros_u32; \
    \
    do { \
      PROCESS16(0); \
      PROCESS16(16); \
      PROCESS16(32); \
      PROCESS16(48); \
      \
      a += a_stride; \
      b += b_stride; \
      y++; \
    } while (y < height); \
    \
    return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \
  }

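/* Instantiate vpx_sadWxH_vsx() for every block size handled here. */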
SAD8(4);
SAD8(8);
SAD8(16);
SAD16(8);
SAD16(16);
SAD16(32);
SAD32(16);
SAD32(32);
SAD32(64);
SAD64(32);
SAD64(64);

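/* The _avg variants first form the compound prediction (the rounded average
 * of ref and second_pred) in an aligned scratch buffer via
 * vpx_comp_avg_pred_vsx(), then reuse the plain SAD kernel against that
 * buffer. */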
#define SAD16AVG(height) \
  unsigned int vpx_sad16x##height##_avg_vsx( \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) { \
    DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]); \
    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \
                          ref_stride); \
    \
    return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \
  }

#define SAD32AVG(height) \
  unsigned int vpx_sad32x##height##_avg_vsx( \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) { \
    DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]); \
    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \
                          ref_stride); \
    \
    return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \
  }

#define SAD64AVG(height) \
  unsigned int vpx_sad64x##height##_avg_vsx( \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) { \
    DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]); \
    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \
                          ref_stride); \
    return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \
  }

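/* Instantiate the averaged-prediction SAD functions. */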
SAD16AVG(8);
SAD16AVG(16);
SAD16AVG(32);
SAD32AVG(16);
SAD32AVG(32);
SAD32AVG(64);
SAD64AVG(32);
SAD64AVG(64);

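/* PROCESS16_4D: subtract one 16-byte reference row from the already-unpacked
 * source row (high and low halves widened to 16 bits), take absolute values,
 * and accumulate both halves into v_sad. */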
#define PROCESS16_4D(offset, ref, v_h, v_l) \
  v_b = vec_vsx_ld(offset, ref); \
  v_bh = unpack_to_s16_h(v_b); \
  v_bl = unpack_to_s16_l(v_b); \
  v_subh = vec_sub(v_h, v_bh); \
  v_subl = vec_sub(v_l, v_bl); \
  v_absh = vec_abs(v_subh); \
  v_absl = vec_abs(v_subl); \
  v_sad = vec_sum4s(v_absh, v_sad); \
  v_sad = vec_sum4s(v_absl, v_sad);

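/* UNPACK_SRC: load 16 source bytes and widen them to two int16x8_t halves so
 * they can be reused against each of the four references. */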
#define UNPACK_SRC(offset, srcv_h, srcv_l) \
  v_a = vec_vsx_ld(offset, src); \
  srcv_h = unpack_to_s16_h(v_a); \
  srcv_l = unpack_to_s16_l(v_a);

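/* Each SADn_4D(height) macro expands to vpx_sadnx<height>x4d_vsx(), which
 * computes the SAD of one source block against four candidate reference
 * blocks in a single pass: each source row is unpacked once and then compared
 * against the corresponding row of every reference. */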
#define SAD16_4D(height) \
  void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \
                                   const uint8_t *const ref_array[], \
                                   int ref_stride, uint32_t *sad_array) { \
    int i; \
    int y; \
    unsigned int sad[4]; \
    uint8x16_t v_a, v_b; \
    int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \
    \
    for (i = 0; i < 4; i++) sad_array[i] = 0; \
    \
    for (y = 0; y < height; y++) { \
      UNPACK_SRC(y * src_stride, v_ah, v_al); \
      for (i = 0; i < 4; i++) { \
        int32x4_t v_sad = vec_splat_s32(0); \
        PROCESS16_4D(y * ref_stride, ref_array[i], v_ah, v_al); \
        \
        vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
      } \
    } \
  }

#define SAD32_4D(height) \
  void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \
                                   const uint8_t *const ref_array[], \
                                   int ref_stride, uint32_t *sad_array) { \
    int i; \
    int y; \
    unsigned int sad[4]; \
    uint8x16_t v_a, v_b; \
    int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \
    int16x8_t v_absh, v_absl, v_subh, v_subl; \
    \
    for (i = 0; i < 4; i++) sad_array[i] = 0; \
    \
    for (y = 0; y < height; y++) { \
      UNPACK_SRC(y * src_stride, v_ah1, v_al1); \
      UNPACK_SRC(y * src_stride + 16, v_ah2, v_al2); \
      for (i = 0; i < 4; i++) { \
        int32x4_t v_sad = vec_splat_s32(0); \
        PROCESS16_4D(y * ref_stride, ref_array[i], v_ah1, v_al1); \
        PROCESS16_4D(y * ref_stride + 16, ref_array[i], v_ah2, v_al2); \
        \
        vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
      } \
    } \
  }

#define SAD64_4D(height) \
  void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \
                                   const uint8_t *const ref_array[], \
                                   int ref_stride, uint32_t *sad_array) { \
    int i; \
    int y; \
    unsigned int sad[4]; \
    uint8x16_t v_a, v_b; \
    int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \
    int16x8_t v_ah3, v_al3, v_ah4, v_al4; \
    int16x8_t v_absh, v_absl, v_subh, v_subl; \
    \
    for (i = 0; i < 4; i++) sad_array[i] = 0; \
    \
    for (y = 0; y < height; y++) { \
      UNPACK_SRC(y * src_stride, v_ah1, v_al1); \
      UNPACK_SRC(y * src_stride + 16, v_ah2, v_al2); \
      UNPACK_SRC(y * src_stride + 32, v_ah3, v_al3); \
      UNPACK_SRC(y * src_stride + 48, v_ah4, v_al4); \
      for (i = 0; i < 4; i++) { \
        int32x4_t v_sad = vec_splat_s32(0); \
        PROCESS16_4D(y * ref_stride, ref_array[i], v_ah1, v_al1); \
        PROCESS16_4D(y * ref_stride + 16, ref_array[i], v_ah2, v_al2); \
        PROCESS16_4D(y * ref_stride + 32, ref_array[i], v_ah3, v_al3); \
        PROCESS16_4D(y * ref_stride + 48, ref_array[i], v_ah4, v_al4); \
        \
        vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
      } \
    } \
  }

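/* Instantiate the four-reference (x4d) SAD functions. */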
SAD16_4D(8);
SAD16_4D(16);
SAD16_4D(32);
SAD32_4D(16);
SAD32_4D(32);
SAD32_4D(64);
SAD64_4D(32);
SAD64_4D(64);