sad_avx2.c

/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
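
/* FSAD64_H(h) expands to vpx_sad64x##h##_avx2(): the sum of absolute
 * differences between a 64-wide source block and a 64-wide reference block
 * over h rows. Each row is handled as two 32-byte unaligned loads;
 * _mm256_sad_epu8 produces four 64-bit partial SADs per 256-bit register
 * (one per 8-byte group), and each partial easily fits in 32 bits, so the
 * rows can be accumulated with _mm256_add_epi32. The final horizontal
 * reduction shift-adds within each 128-bit lane, folds the upper lane onto
 * the lower one, and returns the low 32 bits. */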
#define FSAD64_H(h) \
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i, res; \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
    __m256i sum_sad = _mm256_setzero_si256(); \
    __m256i sum_sad_h; \
    __m128i sum_sad128; \
    for (i = 0; i < h; i++) { \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
      sad1_reg = _mm256_sad_epu8( \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
      sad2_reg = _mm256_sad_epu8( \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
      sum_sad = \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
      ref_ptr += ref_stride; \
      src_ptr += src_stride; \
    } \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    res = _mm_cvtsi128_si32(sum_sad128); \
    return res; \
  }
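
/* FSAD32_H(h) expands to vpx_sad32x##h##_avx2(). A 32-wide block fills only
 * one 256-bit register per row, so the loop processes two rows per
 * iteration (ref_ptr and ref_ptr + ref_stride) and advances both pointers
 * by twice their stride. The accumulation and horizontal reduction match
 * FSAD64_H. */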
#define FSAD32_H(h) \
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i, res; \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
    __m256i sum_sad = _mm256_setzero_si256(); \
    __m256i sum_sad_h; \
    __m128i sum_sad128; \
    int ref2_stride = ref_stride << 1; \
    int src2_stride = src_stride << 1; \
    int max = h >> 1; \
    for (i = 0; i < max; i++) { \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      sad1_reg = _mm256_sad_epu8( \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
      sad2_reg = _mm256_sad_epu8( \
          ref2_reg, \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
      sum_sad = \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
      ref_ptr += ref2_stride; \
      src_ptr += src2_stride; \
    } \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    res = _mm_cvtsi128_si32(sum_sad128); \
    return res; \
  }
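
/* Instantiate the plain SAD functions: vpx_sad64x64_avx2, vpx_sad64x32_avx2,
 * vpx_sad32x64_avx2, vpx_sad32x32_avx2 and vpx_sad32x16_avx2. */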
#define FSAD64 \
  FSAD64_H(64); \
  FSAD64_H(32);

#define FSAD32 \
  FSAD32_H(64); \
  FSAD32_H(32); \
  FSAD32_H(16);

FSAD64;
FSAD32;

#undef FSAD64
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H
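
/* FSADAVG64_H(h) expands to vpx_sad64x##h##_avg_avx2(): the same 64-wide SAD
 * as above, except that the reference is first averaged with second_pred via
 * _mm256_avg_epu8 (rounded byte average, (a + b + 1) >> 1). This is the
 * compound-prediction variant; second_pred is a contiguous 64x(h) block, so
 * it advances by 64 bytes per row. */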
#define FSADAVG64_H(h) \
  unsigned int vpx_sad64x##h##_avg_avx2( \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
      int ref_stride, const uint8_t *second_pred) { \
    int i, res; \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
    __m256i sum_sad = _mm256_setzero_si256(); \
    __m256i sum_sad_h; \
    __m128i sum_sad128; \
    for (i = 0; i < h; i++) { \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
      ref1_reg = _mm256_avg_epu8( \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
      ref2_reg = _mm256_avg_epu8( \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8( \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
      sad2_reg = _mm256_sad_epu8( \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
      sum_sad = \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
      ref_ptr += ref_stride; \
      src_ptr += src_stride; \
      second_pred += 64; \
    } \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    res = _mm_cvtsi128_si32(sum_sad128); \
    return res; \
  }
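
/* FSADAVG32_H(h) expands to vpx_sad32x##h##_avg_avx2(): the two-rows-per-
 * iteration 32-wide loop with the reference averaged against second_pred.
 * second_pred is a contiguous 32x(h) block, so the two rows handled in one
 * iteration sit at offsets 0 and 32 and the pointer advances by 64. */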
#define FSADAVG32_H(h) \
  unsigned int vpx_sad32x##h##_avg_avx2( \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
      int ref_stride, const uint8_t *second_pred) { \
    int i, res; \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
    __m256i sum_sad = _mm256_setzero_si256(); \
    __m256i sum_sad_h; \
    __m128i sum_sad128; \
    int ref2_stride = ref_stride << 1; \
    int src2_stride = src_stride << 1; \
    int max = h >> 1; \
    for (i = 0; i < max; i++) { \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      ref1_reg = _mm256_avg_epu8( \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
      ref2_reg = _mm256_avg_epu8( \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8( \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
      sad2_reg = _mm256_sad_epu8( \
          ref2_reg, \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
      sum_sad = \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
      ref_ptr += ref2_stride; \
      src_ptr += src2_stride; \
      second_pred += 64; \
    } \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    res = _mm_cvtsi128_si32(sum_sad128); \
    return res; \
  }
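
/* Instantiate the averaging SAD functions: vpx_sad64x64_avg_avx2,
 * vpx_sad64x32_avg_avx2, vpx_sad32x64_avg_avx2, vpx_sad32x32_avg_avx2 and
 * vpx_sad32x16_avg_avx2. */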
#define FSADAVG64 \
  FSADAVG64_H(64); \
  FSADAVG64_H(32);

#define FSADAVG32 \
  FSADAVG32_H(64); \
  FSADAVG32_H(32); \
  FSADAVG32_H(16);

FSADAVG64;
FSADAVG32;

#undef FSADAVG64
#undef FSADAVG32
#undef FSADAVG64_H
#undef FSADAVG32_H
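
/* Example call (illustrative only): each generated function follows libvpx's
 * SAD prototype, e.g.
 *
 *   unsigned int sad = vpx_sad64x64_avx2(src, src_stride, ref, ref_stride);
 *
 * In libvpx these implementations are normally selected at run time through
 * the vpx_dsp_rtcd.h dispatch (e.g. vpx_sad64x64) rather than being called
 * directly. */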