avg_intrin_avx2.c

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"

#if CONFIG_VP9_HIGHBITDEPTH
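// One butterfly pass of an 8x8 Hadamard transform over eight rows of 32-bit
// coefficients. The iter == 0 pass also transposes the block so the second
// call operates along the other dimension; the iter == 1 pass writes the
// results back in the final (permuted) Hadamard output order.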
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}
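
// 2D Hadamard transform of one 8x8 block of 16-bit residuals: sign-extend to
// 32 bits, run the row and column butterfly passes, and write 64 32-bit
// coefficients to coeff.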
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff,
                                  ptrdiff_t src_stride, tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));

  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}
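
// 16x16 Hadamard transform for high bitdepth: four 8x8 sub-transforms, then
// one more butterfly stage across the four sub-blocks with the intermediate
// sums shifted right by 1.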
void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 1);
    b1 = _mm256_srai_epi32(b1, 1);
    b2 = _mm256_srai_epi32(b2, 1);
    b3 = _mm256_srai_epi32(b3, 1);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
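
// 32x32 Hadamard transform for high bitdepth: four 16x16 sub-transforms, then
// a final butterfly stage across the four sub-blocks with the intermediate
// sums shifted right by 2.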
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
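
// One butterfly pass of the 16-bit Hadamard transform, processing two 8x8
// blocks at once (one per 128-bit lane). The iter == 0 pass also transposes
// each lane's 8x8 block in place.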
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}
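
// Hadamard transform of two horizontally adjacent 8x8 blocks of 16-bit
// residuals in parallel; the permute2x128 stores write the left block's 64
// coefficients followed by the right block's 64 coefficients.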
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  (void)src_diff;

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}
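
// Shared 16x16 Hadamard helper: two calls to hadamard_8x8x2_avx2 cover the
// four 8x8 sub-blocks, followed by the cross-block butterfly stage. When
// is_final is set the result is written as tran_low_t via store_tran_low();
// otherwise 16-bit values are written for the 32x32 transform to consume.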
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}
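
// Public 16x16 entry point; always produces final tran_low_t output.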
void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
}
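
// Public 32x32 entry point: four intermediate 16x16 transforms followed by a
// final cross-block butterfly stage with the sums shifted right by 2.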
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 2);
    b1 = _mm256_srai_epi16(b1, 2);
    b2 = _mm256_srai_epi16(b2, 2);
    b3 = _mm256_srai_epi16(b3, 2);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}
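
// Sum of absolute transform coefficients (SATD). Coefficients are loaded as
// packed 16-bit values, 16 per iteration; _mm256_madd_epi16 with a vector of
// ones widens the absolute values into 32-bit partial sums before the final
// horizontal reduction.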
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
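// High bitdepth SATD: coefficients stay 32-bit, so eight are processed per
// iteration with a 32-bit abs/add before the same horizontal reduction.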
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH