vpx_subpixel_4t_intrin_sse2.c

/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"

#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
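
// Editor's note on rounding: CONV8_ROUNDING_NUM is 1 << (CONV8_ROUNDING_BITS -
// 1) = 64, so (x + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS rounds x to the
// nearest integer after the 7-bit filter shift. The 8-bit paths below halve
// the kernel first and therefore round with (x + 32) >> 6 instead, which gives
// the same result up to the precision lost by halving the taps.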

static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first, dst_second;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
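
  // Note: halving the kernel here trades one bit of filter precision to keep
  // the 16-bit intermediate sums produced by the madd/pack helpers below away
  // from int16 saturation; the rounding step compensates by using
  // (x + 32) >> 6 instead of the full (x + 64) >> CONV8_ROUNDING_BITS.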

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together for the first half of the even
    // output.
    // Repeat multiple times to get the whole output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);

    // Do again to get the second half of dst
    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 14 12 10 8
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 15 13 11 9
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the second half of the dst
    dst_second = mm_zip_epi32_sse2(&even, &odd);

    // Round each result
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);

    // Finally combine to get the final dst
    dst_first = _mm_packus_epi16(dst_first, dst_second);
    _mm_store_si128((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* The macro used to generate functions shifts the src_ptr up by 3 rows already
 */
static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
      src_reg_m10_hi_2;
  __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
  __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
  __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, and shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.
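  // Note: each iteration of the loop below produces two output rows; the
  // interleavings of rows (1,2) and (2,3) are copied back into the (-1,0) and
  // (0,1) slots at the bottom of the loop, so every source row is loaded and
  // unpacked only once.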

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128());
  src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128());
  src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);

    // Partial output from first half
    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);

    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1,
                                             &src_reg_01_lo_2, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1,
                                             &src_reg_12_lo_2, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1,
                                             &src_reg_23_lo_2, &kernel_reg_45);

    // Add to get first half of the results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Now repeat everything again for the second half
    // Partial output for second half
    res_reg_m10_hi = mm_madd_packs_epi16_sse2(
        &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);

    res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1,
                                             &src_reg_01_hi_2, &kernel_reg_23);

    src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
    src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
    res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1,
                                             &src_reg_12_hi_2, &kernel_reg_45);

    src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
    src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
    res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1,
                                             &src_reg_23_hi_2, &kernel_reg_45);

    // Second half of the results
    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);

    // Combine to get the result
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);

    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_m10_lo_2 = src_reg_12_lo_2;
    src_reg_m10_hi_1 = src_reg_12_hi_1;
    src_reg_m10_hi_2 = src_reg_12_hi_2;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_01_lo_2 = src_reg_23_lo_2;
    src_reg_01_hi_1 = src_reg_23_hi_1;
    src_reg_01_hi_2 = src_reg_23_hi_2;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the even output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
  __m128i src_reg_01_lo_1, src_reg_01_lo_2;
  __m128i src_reg_12_lo_1, src_reg_12_lo_2;
  __m128i src_reg_23_lo_1, src_reg_23_lo_2;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, and shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);

    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1,
                                             &src_reg_01_lo_2, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1,
                                             &src_reg_12_lo_2, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1,
                                             &src_reg_23_lo_2, &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128());

    // Save only half of the register (8 words)
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_m10_lo_2 = src_reg_12_lo_2;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_01_lo_2 = src_reg_23_lo_2;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i tmp_0, tmp_1;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[1] s[0] s[0] s[-1]
    // ... s[3] s[2] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2]
    // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Convert to 16-bit words
    src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
    src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
    src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
    src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());

    // Shuffle into the right format
    tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
    tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);

    // Partial output
    tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
    tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);

    // Output
    dst_first = _mm_add_epi32(tmp_0, tmp_1);
    dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;
  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1;
  __m128i src_reg_01_lo_1;
  __m128i src_reg_12_lo_1;
  __m128i src_reg_23_lo_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, and shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo =
        mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);
    res_reg_01_lo =
        mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo =
        mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo =
        mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);

    // Save only 4 bytes of each result register
    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
    *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_1 = src_reg_3;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together to get the even output
  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
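
  // Note: unlike the 8-bit paths, the high-bitdepth functions use the kernel
  // at full precision; products are accumulated as 32-bit words, rounded with
  // (x + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS, and then clamped to
  // [0, (1 << bd) - 1].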

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);

    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels as 16-bit words, and shuffle them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source
  __m128i src_reg_m10, src_reg_01;
  __m128i src_reg_12, src_reg_23;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
  __m128i res_reg_m1012, res_reg_0123;

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // First shuffle the data
  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
  src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);

  // More shuffling
  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
    src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
    src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
    res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
    res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
    res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);

    // Add to get results
    res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
    res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);

    // Round the words
    res_reg_m1012 =
        mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_0123 =
        mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);

    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);

    // Saturate according to bit depth
    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);

    // Save only half of the register (4 words)
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10 = src_reg_12;
    src_reg_01 = src_reg_23;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together for the first half of the even
  // output.
  // Repeat multiple times to get the whole output
  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
      src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;
  __m128i tmp_0, tmp_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will put first half in the first half of the reg, and second half in
    // second half
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // Output 6 4 2 0
    tmp_0 = _mm_srli_si128(src_reg, 4);
    tmp_1 = _mm_srli_si128(src_reg_next, 2);
    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 7 5 3 1
    tmp_0 = _mm_srli_si128(src_reg, 2);
    tmp_1 = src_reg_next;
    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);

    tmp_0 = _mm_srli_si128(src_reg, 6);
    tmp_1 = _mm_srli_si128(src_reg_next, 4);
    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);

    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = mm_zip_epi32_sse2(&even, &odd);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);

    _mm_store_si128((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void vpx_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels as 16-bit words, and shuffle them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;
  __m128i res_reg_m1012_hi, res_reg_0123_hi;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);

    // Partial output for first half
    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round,
                                           CONV8_ROUNDING_BITS);
    res_reg_0123_lo =
        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);

    // Partial output for second half
    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);

    // Add to get results
    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_hi = mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round,
                                           CONV8_ROUNDING_BITS);
    res_reg_0123_hi =
        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);

    // Combine the two halves
    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);

    // Saturate according to bit depth
    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);

    // Save the whole register (8 words)
    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo = src_reg_12_lo;
    src_reg_m10_hi = src_reg_12_hi;
    src_reg_01_lo = src_reg_23_lo;
    src_reg_01_hi = src_reg_23_hi;
    src_reg_1 = src_reg_3;
  }
}

static void vpx_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}

static void vpx_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64

// From vpx_subpixel_8t_sse2.asm.
filter8_1dfunction vpx_filter_block1d16_v8_sse2;
filter8_1dfunction vpx_filter_block1d16_h8_sse2;
filter8_1dfunction vpx_filter_block1d8_v8_sse2;
filter8_1dfunction vpx_filter_block1d8_h8_sse2;
filter8_1dfunction vpx_filter_block1d4_v8_sse2;
filter8_1dfunction vpx_filter_block1d4_h8_sse2;
filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2

// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
filter8_1dfunction vpx_filter_block1d16_v2_sse2;
filter8_1dfunction vpx_filter_block1d16_h2_sse2;
filter8_1dfunction vpx_filter_block1d8_v2_sse2;
filter8_1dfunction vpx_filter_block1d8_h2_sse2;
filter8_1dfunction vpx_filter_block1d4_v2_sse2;
filter8_1dfunction vpx_filter_block1d4_h2_sse2;
filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;

// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const InterpKernel *filter, int x0_q4,
//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const InterpKernel *filter, int x0_q4,
//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
//                              int w, int h);
// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const InterpKernel *filter, int x0_q4,
//                                   int32_t x_step_q4, int y0_q4,
//                                   int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                  uint8_t *dst, ptrdiff_t dst_stride,
//                                  const InterpKernel *filter, int x0_q4,
//                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                  int w, int h);
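
// Editor's note (summary, not verbatim from convolve.h): the FUN_CONV_1D and
// FUN_CONV_2D macros below expand to the vpx_convolve8_* entry points
// documented above; roughly speaking, they select among the
// vpx_filter_block1dN_{h,v}{8,4,2}_sse2 variants at run time according to the
// filter actually in use (8-tap, 4-tap, or bilinear).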
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
            sse2, 0);
FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);

// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const InterpKernel *filter, int x0_q4,
//                         int32_t x_step_q4, int y0_q4, int y_step_q4,
//                         int w, int h);
// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                             uint8_t *dst, ptrdiff_t dst_stride,
//                             const InterpKernel *filter, int x0_q4,
//                             int32_t x_step_q4, int y0_q4, int y_step_q4,
//                             int w, int h);
FUN_CONV_2D(, sse2, 0);
FUN_CONV_2D(avg_, sse2, 1);

#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
  vpx_highbd_filter_block1d16_v8_avg_sse2
#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
  vpx_highbd_filter_block1d16_h8_avg_sse2
#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
  vpx_highbd_filter_block1d8_v8_avg_sse2
#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
  vpx_highbd_filter_block1d8_h8_avg_sse2
#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;

// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
//                                      ptrdiff_t src_stride,
//                                      uint8_t *dst,
//                                      ptrdiff_t dst_stride,
//                                      const int16_t *filter_x,
//                                      int x_step_q4,
//                                      const int16_t *filter_y,
//                                      int y_step_q4,
//                                      int w, int h, int bd);
// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
//                                     ptrdiff_t src_stride,
//                                     uint8_t *dst,
//                                     ptrdiff_t dst_stride,
//                                     const int16_t *filter_x,
//                                     int x_step_q4,
//                                     const int16_t *filter_y,
//                                     int y_step_q4,
//                                     int w, int h, int bd);
// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
//                                          ptrdiff_t src_stride,
//                                          uint8_t *dst,
//                                          ptrdiff_t dst_stride,
//                                          const int16_t *filter_x,
//                                          int x_step_q4,
//                                          const int16_t *filter_y,
//                                          int y_step_q4,
//                                          int w, int h, int bd);
// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
//                                         ptrdiff_t src_stride,
//                                         uint8_t *dst,
//                                         ptrdiff_t dst_stride,
//                                         const int16_t *filter_x,
//                                         int x_step_q4,
//                                         const int16_t *filter_y,
//                                         int y_step_q4,
//                                         int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , sse2, 0);
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);

// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const InterpKernel *filter, int x0_q4,
//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                int w, int h, int bd);
// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const InterpKernel *filter, int x0_q4,
//                                    int32_t x_step_q4, int y0_q4,
//                                    int y_step_q4, int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2, 0);
HIGH_FUN_CONV_2D(avg_, sse2, 1);
#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64