highbd_convolve_avx2.c

/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"

// -----------------------------------------------------------------------------
// Copy and average
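// Copies a block of 16-bit pixels without filtering. The width (a multiple of
// 4) selects the widest load/store that fits: four 256-bit rows per line for
// w == 64 down to 64-bit halves for w == 4; the narrower widths process two
// rows per iteration.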
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;
  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}
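
// Like the copy above, but each output pixel is the rounding average of the
// source pixel and the pixel already present in dst (_mm256_avg_epu16 /
// _mm_avg_epu16); this is the compound "avg" prediction path.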
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;
  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}

// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))

// -----------------------------------------------------------------------------
// Horizontal Filtering
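// Rearranges one row of source samples (*s) into the four register layouts
// p[0]..p[3] that pair adjacent pixels for _mm256_madd_epi16 against the
// duplicated tap pairs produced by pack_filters() below.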
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}

// Note:
// Shared by 8x2 and 16x1 block
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}

static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}

static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&s0, &s1, x);
}

static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&s0, &s1, x);
}

// Note:
// Shared by horizontal and vertical filtering
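// Broadcasts the 8-tap kernel across both 128-bit lanes and splits it into
// four registers, each holding one tap pair ((k0, k1), (k2, k3), (k4, k5) or
// (k6, k7)) repeated in every 32-bit slot, ready for _mm256_madd_epi16.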
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
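
// Computes eight 8-tap sums from the packed operands: the products of the
// outer tap pairs are summed directly, the two middle products are added
// through a min/max pair (min(a0, a1) + max(a0, a1) == a0 + a1), and the
// result is rounded and shifted down by CONV8_ROUNDING_BITS.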
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
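
// The store_* helpers pack the 32-bit filter outputs down to 16 bits with
// unsigned saturation and clamp them against *mask, the (1 << bd) - 1 pixel
// maximum for the current bit depth, before writing them to dst.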
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap horizontal filtering
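// The bilinear path uses only the two middle kernel taps: the 0x09080706
// shuffle below replicates filter[3] and filter[4] into every 32-bit slot of
// the filter register.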
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}

// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}

static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}

// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
                                        __m256i *y0) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
}

static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;
  pack_2t_filter(filter, &ff);
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;
  pack_2t_filter(filter, &ff);
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// Vertical Filtering
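// The 8-tap vertical filters keep a sliding window of source rows in the
// signal[] array: pack_*_init() loads and interleaves rows 0-6 once,
// pack_*_pixels() appends rows 7 and 8 for the two output rows of the current
// iteration, and update_pixels() shifts the window down before the next pair
// of rows is processed.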
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}

static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}

static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig, f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}

static INLINE void update_pixels(__m256i *sig) {
  int i;
  for (i = 0; i < 3; ++i) {
    sig[i] = sig[i + 1];
    sig[i + 4] = sig[i + 5];
  }
}

static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  pack_8x9_init(src_ptr, src_pitch, signal);
  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);
    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high
  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);
  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);
  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);
  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);
  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);
  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);
  sig[16] = s6;
}

static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);
  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);
  sig[16] = s8;
}

static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }
  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}

static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  __m256i p = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static void update_16x9_pixels(__m256i *sig) {
  update_pixels(&sig[0]);
  update_pixels(&sig[8]);
}

static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  pack_16x9_init(src_ptr, src_pitch, signal);
  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap vertical filtering

static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}

static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}

static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;
  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);
  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m128i p = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(h, p);
}

static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}

static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
                                      __m128i *y0, __m128i *y1) {
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
  x0 = _mm_add_epi32(x0, rounding);
  x1 = _mm_add_epi32(x1, rounding);
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;
  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);
  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

// Calculation with averaging the input pixels
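// These store helpers mirror the ones above but additionally average the
// clamped filter output with the pixels already in dst, providing the
// *_avg_* variants of the horizontal and vertical filters.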
static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y0);
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i pix =
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  __m256i p = _mm256_min_epi16(*y0, *mask);
  p = _mm256_avg_epu16(p, pix0);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  p = _mm256_avg_epu16(p, pix1);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, *mask);
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We extract the middle four elements of the kernel into two registers in
  // the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add on each pair gives us half of the sum; adding
  // the two halves gives us the output. Since AVX2 gives us 256-bit
  // registers, we can do this two rows at a time.
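  // For one output pixel x this evaluates
  //   (s[x - 1] * k[2] + s[x] * k[3]) + (s[x + 1] * k[4] + s[x + 2] * k[5])
  // with each parenthesized pair produced by one _mm256_madd_epi16.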
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg;
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;
  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);
    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);
    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }
  // Repeat for the last row if needed
  if (h > 0) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);
    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
  }
}

static void vpx_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will extract the middle four elements of the kernel into two registers
  // in the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add on each pair gives us half of the sum for the
  // first half of the row; adding them gives us the first half of the output.
  // Repeating for the second half gives us the whole output. Since AVX2 gives
  // us 256-bit registers, we can do this two rows at a time.
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg, res_first, res_last;
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;
  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
    // Result for first half
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                     &kernel_reg_23, &kernel_reg_45);
    // Do again to get the second half of dst
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
    // Result for second half
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);
    // Round each result
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_first, res_last);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);
    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }
  // Repeat for the last row if needed
  if (h > 0) {
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
  }
}

static void vpx_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}

static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  pack_8x9_init(src_ptr, src_pitch, signal);
  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);
    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff[4];
  pack_filters(filter, ff);
  pack_16x9_init(src_ptr, src_pitch, signal);
  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;
  pack_2t_filter(filter, &ff);
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;
  pack_2t_filter(filter, &ff);
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;
  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);
  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;
  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);
  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get a partial
  // output. Adding the result from the next pair of rows gives the output.
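  // For each column c and output row y this evaluates
  //   s[y - 1, c] * k[2] + s[y, c] * k[3] + s[y + 1, c] * k[4] +
  //   s[y + 2, c] * k[5]
  // before rounding and clamping to the bit-depth maximum.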
  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001, src_reg_1223;
  // Result after multiply and add
  __m256i res_reg;
  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;
  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));
  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
  // First three rows
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);
    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);
    // Last three rows
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
    // Output
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
                                   &kernel_reg_23, &kernel_reg_45);
    // Round the words
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    // Save the result
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);
    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;
    src_reg_m1001 = src_reg_1223;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get a partial
  // output. Adding the result from the next pair of rows gives the output.
  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
  // Result after multiply and add
  __m256i res_reg, res_reg_lo, res_reg_hi;
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;
  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));
  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
  // First three rows
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);
    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);
    // Last three rows
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
    // Output from first half
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
                                      &kernel_reg_23, &kernel_reg_45);
    // Output from second half
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
                                      &kernel_reg_23, &kernel_reg_45);
    // Round the words
    res_reg_lo =
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_hi =
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    // Save the result
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);
    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;
    src_reg_m1001_lo = src_reg_1223_lo;
    src_reg_m1001_hi = src_reg_1223_hi;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}
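
// The 4-wide 8-tap and 2-tap kernels have no AVX2 implementation; the
// declarations and #defines below route them to the SSE2 assembly versions,
// and the HIGH_FUN_CONV_* macros from convolve.h then expand into the public
// vpx_highbd_convolve8_* entry points for this file.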
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;

#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
  vpx_highbd_filter_block1d16_v8_avg_avx2
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
  vpx_highbd_filter_block1d16_h8_avg_avx2
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
  vpx_highbd_filter_block1d8_v8_avg_avx2
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
  vpx_highbd_filter_block1d8_h8_avg_avx2
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_avx2
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_avx2

HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
HIGH_FUN_CONV_2D(, avx2, 0);

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;

#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
HIGH_FUN_CONV_2D(avg_, avx2, 1);

#undef HIGHBD_FUNC