sad4d_avx2.c 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <immintrin.h> // AVX2
  11. #include "./vpx_dsp_rtcd.h"
  12. #include "vpx/vpx_integer.h"
  13. static INLINE void calc_final(const __m256i *const sums /*[4]*/,
  14. uint32_t sad_array[4]) {
  15. const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  16. const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  17. const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  18. const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
  19. _mm256_extractf128_si256(t2, 1));
  20. _mm_storeu_si128((__m128i *)sad_array, sum);
  21. }
  // Sum of absolute differences of one 32x32 source block against four
  // reference blocks at once.
  //
  // src_ptr/src_stride: source block and the byte step between its rows. Each
  //                     row is fetched with an aligned 256-bit load, so every
  //                     row address (src_ptr + k * src_stride) must be 32-byte
  //                     aligned.
  // ref_array:          four reference block pointers; read with unaligned
  //                     loads, so no alignment is required.
  // ref_stride:         byte step between rows, shared by all four references.
  // sad_array:          receives the four SAD totals, one per reference, in the
  //                     same order as ref_array.
  void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_array[4], int ref_stride,
                            uint32_t sad_array[4]) {
    const uint8_t *ref0 = ref_array[0];
    const uint8_t *ref1 = ref_array[1];
    const uint8_t *ref2 = ref_array[2];
    const uint8_t *ref3 = ref_array[3];
    __m256i acc[4];
    int row;

    acc[0] = acc[1] = acc[2] = acc[3] = _mm256_setzero_si256();

    for (row = 0; row < 32; ++row) {
      // One full 32-byte source row, shared by all four comparisons.
      const __m256i src_row = _mm256_load_si256((const __m256i *)src_ptr);

      // Per reference: fetch the row, take its SAD against the source row,
      // and fold the partial sums into that reference's accumulator.
      const __m256i sad0 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref0), src_row);
      const __m256i sad1 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref1), src_row);
      const __m256i sad2 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref2), src_row);
      const __m256i sad3 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref3), src_row);

      acc[0] = _mm256_add_epi32(acc[0], sad0);
      acc[1] = _mm256_add_epi32(acc[1], sad1);
      acc[2] = _mm256_add_epi32(acc[2], sad2);
      acc[3] = _mm256_add_epi32(acc[3], sad3);

      // Step every cursor down to the next row.
      src_ptr += src_stride;
      ref0 += ref_stride;
      ref1 += ref_stride;
      ref2 += ref_stride;
      ref3 += ref_stride;
    }

    // Reduce the vector accumulators to one scalar per reference.
    calc_final(acc, sad_array);
  }
  // Sum of absolute differences of one 64x64 source block against four
  // reference blocks at once. Each 64-byte row is handled as two 32-byte halves.
  //
  // src_ptr/src_stride: source block and the byte step between its rows. Both
  //                     halves of each row are fetched with aligned 256-bit
  //                     loads, so every row address (src_ptr + k * src_stride)
  //                     must be 32-byte aligned.
  // ref_array:          four reference block pointers; read with unaligned
  //                     loads, so no alignment is required.
  // ref_stride:         byte step between rows, shared by all four references.
  // sad_array:          receives the four SAD totals, one per reference, in the
  //                     same order as ref_array.
  void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_array[4], int ref_stride,
                            uint32_t sad_array[4]) {
    const uint8_t *ref0 = ref_array[0];
    const uint8_t *ref1 = ref_array[1];
    const uint8_t *ref2 = ref_array[2];
    const uint8_t *ref3 = ref_array[3];
    __m256i acc[4];
    int row;

    acc[0] = acc[1] = acc[2] = acc[3] = _mm256_setzero_si256();

    for (row = 0; row < 64; ++row) {
      // Left and right 32-byte halves of the current source row.
      const __m256i src_left = _mm256_load_si256((const __m256i *)src_ptr);
      const __m256i src_right =
          _mm256_load_si256((const __m256i *)(src_ptr + 32));

      // SAD of the left halves of each reference row against the source.
      const __m256i left0 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref0), src_left);
      const __m256i left1 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref1), src_left);
      const __m256i left2 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref2), src_left);
      const __m256i left3 =
          _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)ref3), src_left);

      // SAD of the right halves (byte offset 32 within each row).
      const __m256i right0 = _mm256_sad_epu8(
          _mm256_loadu_si256((const __m256i *)(ref0 + 32)), src_right);
      const __m256i right1 = _mm256_sad_epu8(
          _mm256_loadu_si256((const __m256i *)(ref1 + 32)), src_right);
      const __m256i right2 = _mm256_sad_epu8(
          _mm256_loadu_si256((const __m256i *)(ref2 + 32)), src_right);
      const __m256i right3 = _mm256_sad_epu8(
          _mm256_loadu_si256((const __m256i *)(ref3 + 32)), src_right);

      // Accumulate the left-half contribution, then the right-half one.
      acc[0] = _mm256_add_epi32(_mm256_add_epi32(acc[0], left0), right0);
      acc[1] = _mm256_add_epi32(_mm256_add_epi32(acc[1], left1), right1);
      acc[2] = _mm256_add_epi32(_mm256_add_epi32(acc[2], left2), right2);
      acc[3] = _mm256_add_epi32(_mm256_add_epi32(acc[3], left3), right3);

      // Step every cursor down to the next row.
      src_ptr += src_stride;
      ref0 += ref_stride;
      ref1 += ref_stride;
      ref2 += ref_stride;
      ref3 += ref_stride;
    }

    // Reduce the vector accumulators to one scalar per reference.
    calc_final(acc, sad_array);
  }