/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"
#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"
// filters for 16_h8
DECLARE_ALIGNED(32, static const uint8_t,
                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

DECLARE_ALIGNED(32, static const uint8_t,
                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
                                           5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
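
// Added note: each filtN_global_avx2 table is a pshufb mask, duplicated in
// both 128-bit lanes, that gathers the adjacent source-byte pair feeding
// filter taps (2N-2, 2N-1) for every output pixel; convolve8_16_avx2 then
// multiplies each pair with the matching kernel pair via
// _mm256_maddubs_epi16 and sums the four partial results.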
static INLINE void vpx_filter_block1d16_h8_x_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
    const int avg) {
  __m128i outReg1, outReg2;
  __m256i outReg32b1, outReg32b2;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  __m256i f[4], filt[4], s[4];

  shuffle_filter_avx2(filter, f);
  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    __m256i srcReg;

    // load the 2 strides of source
    srcReg =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg = _mm256_inserti128_si256(
        srcReg,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
    outReg32b1 = convolve8_16_avx2(s, f);

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg = _mm256_inserti128_si256(
        srcReg,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // filter the source buffer
    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
    outReg32b2 = convolve8_16_avx2(s, f);

    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
    // contain the first and second convolve result respectively
    outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2);

    src_ptr += src_stride;

    // average if necessary
    outReg1 = _mm256_castsi256_si128(outReg32b1);
    outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
    if (avg) {
      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
      outReg2 = _mm_avg_epu8(
          outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
    }

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, outReg1);

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);

    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg;

    // load the first 16 bytes of the last row
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    s[0] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
    s[1] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
    s[2] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
    s[3] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
    outReg1 = convolve8_8_avx2(s, f);

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // filter the source buffer
    s[0] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
    s[1] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
    s[2] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
    s[3] = _mm256_castsi128_si256(
        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
    outReg2 = convolve8_8_avx2(s, f);

    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
    // contain the first and second convolve result respectively
    outReg1 = _mm_packus_epi16(outReg1, outReg2);

    // average if necessary
    if (avg) {
      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
    }

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, outReg1);
  }
}

static void vpx_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
    ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
  vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
                                 output_height, filter, 0);
}

static void vpx_filter_block1d16_h8_avg_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
    ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
  vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
                                 output_height, filter, 1);
}

static INLINE void vpx_filter_block1d16_v8_x_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
    const int avg) {
  __m128i outReg1, outReg2;
  __m256i srcRegHead1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  __m256i f[4], s1[4], s2[4];

  shuffle_filter_avx2(filter, f);

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  {
    __m128i s[6];
    __m256i s32b[6];

    // load 16 bytes 7 times in stride of src_pitch
    s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch));
    s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch));
    s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch));
    s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch));
    s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch));
    s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch));
    srcRegHead1 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch)));

    // keep each pair of consecutive loads in the same 256-bit register
    s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
    s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
    s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
    s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
    s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
    s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]),
                                      _mm256_castsi256_si128(srcRegHead1), 1);

    // merge every two consecutive registers except the last one
    // the first lanes contain values for filtering odd rows (1,3,5...) and
    // the second lanes contain values for filtering even rows (2,4,6...)
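    // For example (added note): s1[0] = unpacklo(rows 0|1, rows 1|2) holds
    // r0[0] r1[0] r0[1] r1[1] ... in its low lane and r1[0] r2[0] r1[1]
    // r2[1] ... in its high lane, so the two lanes feed the vertical taps of
    // two adjacent output rows at once.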
    s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]);
    s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]);
    s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]);
    s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]);
    s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]);
    s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
  }

  for (i = output_height; i > 1; i -= 2) {
    __m256i srcRegHead2, srcRegHead3;

    // load the next 2 rows of 16 bytes and keep every two
    // consecutive loads in the same 256-bit register
    srcRegHead2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch)));
    srcRegHead1 = _mm256_inserti128_si256(
        srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1);
    srcRegHead3 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch)));
    srcRegHead2 = _mm256_inserti128_si256(
        srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1);

    // merge the two new consecutive registers
    // the first lane contains values for filtering odd rows (1,3,5...) and
    // the second lane contains values for filtering even rows (2,4,6...)
    s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2);
    s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2);

    s1[0] = convolve8_16_avx2(s1, f);
    s2[0] = convolve8_16_avx2(s2, f);

    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
    // contain the first and second convolve result respectively
    s1[0] = _mm256_packus_epi16(s1[0], s2[0]);

    src_ptr += src_stride;

    // average if necessary
    outReg1 = _mm256_castsi256_si128(s1[0]);
    outReg2 = _mm256_extractf128_si256(s1[0], 1);
    if (avg) {
      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
      outReg2 = _mm_avg_epu8(
          outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
    }

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, outReg1);

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);

    output_ptr += dst_stride;

    // shift down by two rows
    s1[0] = s1[1];
    s2[0] = s2[1];
    s1[1] = s1[2];
    s2[1] = s2[2];
    s1[2] = s1[3];
    s2[2] = s2[3];
    srcRegHead1 = srcRegHead3;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    // load the last 16 bytes
    const __m128i srcRegHead2 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    s1[0] = _mm256_castsi128_si256(
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
    s2[0] = _mm256_castsi128_si256(
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
    outReg1 = convolve8_8_avx2(s1, f);
    outReg2 = convolve8_8_avx2(s2, f);

    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
    // contain the first and second convolve result respectively
    outReg1 = _mm_packus_epi16(outReg1, outReg2);

    // average if necessary
    if (avg) {
      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
    }

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, outReg1);
  }
}

static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *filter) {
  vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                 height, filter, 0);
}

static void vpx_filter_block1d16_v8_avg_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
  vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                 height, filter, 1);
}

static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
  // the middle four elements of the kernel into two registers in the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum. Calling add gives us
  // the first half of the output. Repeat again to get the second half of the
  // output. Finally we shuffle again to combine the two outputs.
  // Since avx2 allows us to use a 256-bit buffer, we can do two rows at a
  // time.
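  // Worked example of the arithmetic (added note): for the first output
  // pixel, the first _mm256_maddubs_epi16 produces s[-1]*k[2] + s[0]*k[3] in
  // one 16-bit lane, the second produces s[1]*k[4] + s[2]*k[5], and
  // _mm256_adds_epi16 combines them into the full 4-tap sum before the
  // (sum + 32) >> 6 rounding step below.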
  __m128i kernel_reg;  // Kernel
  __m256i kernel_reg_256, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i dst_first, dst_second;
  __m256i tmp_0, tmp_1;

  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
                       2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
                       4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
  kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
  kernel_reg_23 =
      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
  kernel_reg_45 =
      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));

  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Partial result for first half
    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
    dst_first = _mm256_adds_epi16(tmp_0, tmp_1);

    // Do again to get the second half of dst
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Partial result for second half
    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
    dst_second = _mm256_adds_epi16(tmp_0, tmp_1);

    // Round each result
    dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
    dst_second = mm256_round_epi16(&dst_second, &reg_32, 6);

    // Finally combine to get the final dst
    dst_first = _mm256_packus_epi16(dst_first, dst_second);
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &dst_first);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);

    // Reorder into 2 1 1 2
    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);

    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
    dst_first = _mm256_adds_epi16(tmp_0, tmp_1);

    dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);

    dst_first = _mm256_packus_epi16(dst_first, dst_first);
    dst_first = _mm256_permute4x64_epi64(dst_first, 0x8);

    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first));
  }
}

static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  // We will load two rows of pixels as 8-bit words, rearrange them into the
  // form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get partial
  // output. Then we can call add with another row to get the output.
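  // Added note: per column x, _mm256_maddubs_epi16 on the interleaved rows
  // gives s[-1,x]*k[2] + s[0,x]*k[3] in the low lane (first output row) and
  // s[0,x]*k[2] + s[1,x]*k[3] in the high lane (second output row); adding
  // the matching k[4]/k[5] term completes the 4-tap vertical sum before
  // rounding.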
  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;

  __m128i kernel_reg;  // Kernel
  __m256i kernel_reg_256, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi;
  __m256i res_reg, res_reg_lo, res_reg_hi;

  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
  kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
  kernel_reg_23 =
      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
  kernel_reg_45 =
      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
  src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
    src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23);

    // Output from first half
    res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23);
    res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45);
    res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo);

    // Output from second half
    res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23);
    res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45);
    res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi);

    // Round the words
    res_reg_lo = mm256_round_epi16(&res_reg_lo, &reg_32, 6);
    res_reg_hi = mm256_round_epi16(&res_reg_hi, &reg_32, 6);

    // Combine to get the result
    res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi);

    // Save the result
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001_lo = src_reg_1223_lo;
    src_reg_m1001_hi = src_reg_1223_hi;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
  // the middle four elements of the kernel into two registers in the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum. Calling add gives us
  // the first half of the output. Repeat again to get the second half of the
  // output. Finally we shuffle again to combine the two outputs.
  // Since avx2 allows us to use a 256-bit buffer, we can do two rows at a
  // time.
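  // Added note: the kernel is shifted right by 1 before being packed to
  // 8 bits so the pairwise products summed by _mm256_maddubs_epi16 stay
  // within the signed 16-bit saturation range; the lost bit is compensated
  // in the rounding step, which uses (sum + 32) >> 6 instead of the usual
  // (sum + 64) >> 7 of a 7-bit filter.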
  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i dst_reg;
  __m256i tmp_0, tmp_1;

  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
                       2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
                       4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
  kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));

  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
    dst_reg = _mm256_adds_epi16(tmp_0, tmp_1);

    // Round the result
    dst_reg = mm256_round_epi16(&dst_reg, &reg_32, 6);

    // Finally combine to get the final dst
    dst_reg = _mm256_packus_epi16(dst_reg, dst_reg);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &dst_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    __m128i dst_reg;
    const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
    __m128i tmp_0, tmp_1;

    __m128i src_reg_shift_0 =
        _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0));
    __m128i src_reg_shift_2 =
        _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2));

    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0,
                              _mm256_castsi256_si128(kernel_reg_23));
    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2,
                              _mm256_castsi256_si128(kernel_reg_45));
    dst_reg = _mm_adds_epi16(tmp_0, tmp_1);

    dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6);

    dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *)dst_ptr, dst_reg);
  }
}

static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // We will load two rows of pixels as 8-bit words, rearrange them into the
  // form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get partial
  // output. Then we can call add with another row to get the output.
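  // Added note: because only 8 output columns are needed here, each row is
  // loaded as 8 bytes and a single unpacklo per row pair is enough; the low
  // and high 128-bit lanes still carry the two output rows computed per loop
  // iteration.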
  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001, src_reg_1223;

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m256i res_reg_m1001, res_reg_1223;
  __m256i res_reg;

  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
  kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);

    // Output
    res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23);
    res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45);
    res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223);

    // Round the words
    res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);

    // Combine to get the result
    res_reg = _mm256_packus_epi16(res_reg, res_reg);

    // Save the result
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001 = src_reg_1223;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
  // the middle four elements of the kernel into a single register in the form
  // k[5:2] k[5:2] k[5:2] k[5:2]
  // Then we shuffle the source into
  // s[5:2] s[4:1] s[3:0] s[2:-1]
  // Calling multiply and add gives us half of the sum next to each other.
  // Calling horizontal add then gives us the output.
  // Since avx2 has 256-bit registers, we can do 2 rows at a time.
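  // Added note: with s[2:-1] shuffled against k[5:2], _mm256_maddubs_epi16
  // leaves s[-1]*k[2] + s[0]*k[3] and s[1]*k[4] + s[2]*k[5] in adjacent
  // 16-bit lanes, so a single _mm256_hadds_epi16 produces the complete
  // 4-tap sum for each of the four output pixels.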
  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg;
  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
  int h;
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;

  __m256i src_reg, src_reg_shuf;
  __m256i dst;
  __m256i shuf_idx =
      _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
                       3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));

  for (h = height; h > 1; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr,
                                 (const __m128i *)(src_ptr + src_stride));
    src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);

    // Get the result
    dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
    dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());

    // Round result
    dst = mm256_round_epi16(&dst, &reg_32, 6);

    // Pack to 8-bits
    dst = _mm256_packus_epi16(dst, _mm256_setzero_si256());

    // Save
    mm256_storeu2_epi32((__m128i *const)dst_ptr,
                        (__m128i *const)(dst_ptr + dst_stride), &dst);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  if (h > 0) {
    // Load the source
    const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
    __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
    __m128i src_reg_shuf =
        _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));

    // Get the result
    __m128i dst =
        _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg));
    dst = _mm_hadds_epi16(dst, _mm_setzero_si128());

    // Round result
    dst = mm_round_epi16_sse2(&dst, &reg_32, 6);

    // Pack to 8-bits
    dst = _mm_packus_epi16(dst, _mm_setzero_si128());
    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
  }
}

static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // We will load two rows of pixels as 8-bit words, rearrange them into the
  // form
  // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get partial
  // output. Calling horizontal add then gives us the complete output.
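  // Added note: the unpacklo_epi16 of the m1001 and 1223 interleaves places
  // all four vertical taps of one output column in consecutive bytes, so a
  // single maddubs against k[5:2] followed by hadds yields the full 4-tap
  // sum per column.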
  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023;

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg;

  // Result after multiply and add
  __m256i res_reg;

  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);

    // Combine all the rows
    src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223);

    // Output
    res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg);
    res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256());

    // Round the words
    res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);

    // Combine to get the result
    res_reg = _mm256_packus_epi16(res_reg, res_reg);

    // Save the result
    mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001 = src_reg_1223;
    src_reg_1 = src_reg_3;
  }
}

#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64

filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3

filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3

filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
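
// Added note: no dedicated 4-tap "avg" kernels are defined in this file, so
// the _v4_avg/_h4_avg names below fall back to the 8-tap avg
// implementations, which give the same result when the outer kernel taps
// are zero.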
#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2
#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2
#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2
#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2
#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2
#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2

// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const InterpKernel *filter, int x0_q4,
//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const InterpKernel *filter, int x0_q4,
//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
//                              int w, int h);
// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const InterpKernel *filter, int x0_q4,
//                                   int32_t x_step_q4, int y0_q4,
//                                   int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                                  uint8_t *dst, ptrdiff_t dst_stride,
//                                  const InterpKernel *filter, int x0_q4,
//                                  int32_t x_step_q4, int y0_q4,
//                                  int y_step_q4, int w, int h);
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
            avx2, 0);
FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);

// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const InterpKernel *filter, int x0_q4,
//                         int32_t x_step_q4, int y0_q4, int y_step_q4,
//                         int w, int h);
// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                             uint8_t *dst, ptrdiff_t dst_stride,
//                             const InterpKernel *filter, int x0_q4,
//                             int32_t x_step_q4, int y0_q4, int y_step_q4,
//                             int w, int h);
FUN_CONV_2D(, avx2, 0);
FUN_CONV_2D(avg_, avx2, 1);
#endif  // HAVE_AVX2 && HAVE_SSSE3