highbd_intrapred_intrin_sse2.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. /*
  2. * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <emmintrin.h> // SSE2
  11. #include "./vpx_config.h"
  12. #include "./vpx_dsp_rtcd.h"
  13. #include "vpx/vpx_integer.h"
  14. // -----------------------------------------------------------------------------
  15. void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
  16. const uint16_t *above,
  17. const uint16_t *left, int bd) {
  18. const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
  19. const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  20. const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  21. const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  22. const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  23. (void)above;
  24. (void)bd;
  25. _mm_storel_epi64((__m128i *)dst, row0);
  26. dst += stride;
  27. _mm_storel_epi64((__m128i *)dst, row1);
  28. dst += stride;
  29. _mm_storel_epi64((__m128i *)dst, row2);
  30. dst += stride;
  31. _mm_storel_epi64((__m128i *)dst, row3);
  32. }
  33. void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
  34. const uint16_t *above,
  35. const uint16_t *left, int bd) {
  36. const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  37. const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  38. const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  39. const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  40. const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  41. const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  42. const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  43. const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  44. const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  45. (void)above;
  46. (void)bd;
  47. _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
  48. dst += stride;
  49. _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
  50. dst += stride;
  51. _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
  52. dst += stride;
  53. _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
  54. dst += stride;
  55. _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
  56. dst += stride;
  57. _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
  58. dst += stride;
  59. _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
  60. dst += stride;
  61. _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
  62. }
  63. static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
  64. const __m128i *row) {
  65. const __m128i val = _mm_unpacklo_epi64(*row, *row);
  66. _mm_store_si128((__m128i *)*dst, val);
  67. _mm_store_si128((__m128i *)(*dst + 8), val);
  68. *dst += stride;
  69. }
  70. static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
  71. const __m128i *row) {
  72. const __m128i val = _mm_unpackhi_epi64(*row, *row);
  73. _mm_store_si128((__m128i *)(*dst), val);
  74. _mm_store_si128((__m128i *)(*dst + 8), val);
  75. *dst += stride;
  76. }
  77. void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
  78. const uint16_t *above,
  79. const uint16_t *left, int bd) {
  80. int i;
  81. (void)above;
  82. (void)bd;
  83. for (i = 0; i < 2; i++, left += 8) {
  84. const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  85. const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  86. const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  87. const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  88. const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  89. const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  90. const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  91. const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  92. const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  93. h_store_16_unpacklo(&dst, stride, &row0);
  94. h_store_16_unpacklo(&dst, stride, &row1);
  95. h_store_16_unpacklo(&dst, stride, &row2);
  96. h_store_16_unpacklo(&dst, stride, &row3);
  97. h_store_16_unpackhi(&dst, stride, &row4);
  98. h_store_16_unpackhi(&dst, stride, &row5);
  99. h_store_16_unpackhi(&dst, stride, &row6);
  100. h_store_16_unpackhi(&dst, stride, &row7);
  101. }
  102. }
  103. static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
  104. const __m128i *row) {
  105. const __m128i val = _mm_unpacklo_epi64(*row, *row);
  106. _mm_store_si128((__m128i *)(*dst), val);
  107. _mm_store_si128((__m128i *)(*dst + 8), val);
  108. _mm_store_si128((__m128i *)(*dst + 16), val);
  109. _mm_store_si128((__m128i *)(*dst + 24), val);
  110. *dst += stride;
  111. }
  112. static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
  113. const __m128i *row) {
  114. const __m128i val = _mm_unpackhi_epi64(*row, *row);
  115. _mm_store_si128((__m128i *)(*dst), val);
  116. _mm_store_si128((__m128i *)(*dst + 8), val);
  117. _mm_store_si128((__m128i *)(*dst + 16), val);
  118. _mm_store_si128((__m128i *)(*dst + 24), val);
  119. *dst += stride;
  120. }
  121. void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
  122. const uint16_t *above,
  123. const uint16_t *left, int bd) {
  124. int i;
  125. (void)above;
  126. (void)bd;
  127. for (i = 0; i < 4; i++, left += 8) {
  128. const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  129. const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  130. const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  131. const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  132. const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  133. const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  134. const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  135. const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  136. const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  137. h_store_32_unpacklo(&dst, stride, &row0);
  138. h_store_32_unpacklo(&dst, stride, &row1);
  139. h_store_32_unpacklo(&dst, stride, &row2);
  140. h_store_32_unpacklo(&dst, stride, &row3);
  141. h_store_32_unpackhi(&dst, stride, &row4);
  142. h_store_32_unpackhi(&dst, stride, &row5);
  143. h_store_32_unpackhi(&dst, stride, &row6);
  144. h_store_32_unpackhi(&dst, stride, &row7);
  145. }
  146. }
  147. //------------------------------------------------------------------------------
  148. // DC 4x4
// Sum the four 16-bit pixels ref[0..3]; the total ends up in the low 16-bit
// lane of the result (other lanes hold partial sums).
static INLINE __m128i dc_sum_4(const uint16_t *ref) {
  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
  // Move lanes {c,d} down next to {a,b} (imm 0x0e selects words 2,3,...).
  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  // Lane 0 = a+c, lane 1 = b+d.
  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  // Add lane 1 into lane 0: lane 0 = a+b+c+d.
  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
}
  155. static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
  156. const __m128i *dc) {
  157. const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  158. int i;
  159. for (i = 0; i < 4; ++i, dst += stride) {
  160. _mm_storel_epi64((__m128i *)dst, dc_dup);
  161. }
  162. }
  163. void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
  164. const uint16_t *above,
  165. const uint16_t *left, int bd) {
  166. const __m128i two = _mm_cvtsi32_si128(2);
  167. const __m128i sum = dc_sum_4(left);
  168. const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  169. (void)above;
  170. (void)bd;
  171. dc_store_4x4(dst, stride, &dc);
  172. }
  173. void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
  174. const uint16_t *above,
  175. const uint16_t *left, int bd) {
  176. const __m128i two = _mm_cvtsi32_si128(2);
  177. const __m128i sum = dc_sum_4(above);
  178. const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  179. (void)left;
  180. (void)bd;
  181. dc_store_4x4(dst, stride, &dc);
  182. }
  183. void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
  184. const uint16_t *above,
  185. const uint16_t *left, int bd) {
  186. const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  187. const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  188. (void)above;
  189. (void)left;
  190. dc_store_4x4(dst, stride, &dc_dup);
  191. }
  192. //------------------------------------------------------------------------------
  193. // DC 8x8
// Sum the eight 16-bit pixels ref[0..7] (16-byte-aligned load); the total
// ends up in the low 16-bit lane.  8 * 4095 fits in 16 bits, so no widening
// is needed even at 12-bit depth.
static INLINE __m128i dc_sum_8(const uint16_t *ref) {
  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
  // Fold the high four lanes onto the low four.
  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
  // Same pairwise reduction as dc_sum_4 on the four partial sums.
  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
}
  201. static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
  202. const __m128i *dc) {
  203. const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  204. const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  205. int i;
  206. for (i = 0; i < 8; ++i, dst += stride) {
  207. _mm_store_si128((__m128i *)dst, dc_dup);
  208. }
  209. }
  210. void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
  211. const uint16_t *above,
  212. const uint16_t *left, int bd) {
  213. const __m128i four = _mm_cvtsi32_si128(4);
  214. const __m128i sum = dc_sum_8(left);
  215. const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  216. (void)above;
  217. (void)bd;
  218. dc_store_8x8(dst, stride, &dc);
  219. }
  220. void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
  221. const uint16_t *above,
  222. const uint16_t *left, int bd) {
  223. const __m128i four = _mm_cvtsi32_si128(4);
  224. const __m128i sum = dc_sum_8(above);
  225. const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  226. (void)left;
  227. (void)bd;
  228. dc_store_8x8(dst, stride, &dc);
  229. }
  230. void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
  231. const uint16_t *above,
  232. const uint16_t *left, int bd) {
  233. const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  234. const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  235. (void)above;
  236. (void)left;
  237. dc_store_8x8(dst, stride, &dc_dup);
  238. }
  239. //------------------------------------------------------------------------------
  240. // DC 16x16
// Sum the sixteen 16-bit pixels ref[0..15] as two 8-pixel sums; the total
// is in the low 16-bit lane.  16 * 4095 = 65520 still fits in 16 bits.
static INLINE __m128i dc_sum_16(const uint16_t *ref) {
  const __m128i sum_lo = dc_sum_8(ref);
  const __m128i sum_hi = dc_sum_8(ref + 8);
  return _mm_add_epi16(sum_lo, sum_hi);
}
  246. static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
  247. const __m128i *dc) {
  248. const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  249. const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  250. int i;
  251. for (i = 0; i < 16; ++i, dst += stride) {
  252. _mm_store_si128((__m128i *)dst, dc_dup);
  253. _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  254. }
  255. }
  256. void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
  257. const uint16_t *above,
  258. const uint16_t *left, int bd) {
  259. const __m128i eight = _mm_cvtsi32_si128(8);
  260. const __m128i sum = dc_sum_16(left);
  261. const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  262. (void)above;
  263. (void)bd;
  264. dc_store_16x16(dst, stride, &dc);
  265. }
  266. void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
  267. const uint16_t *above,
  268. const uint16_t *left, int bd) {
  269. const __m128i eight = _mm_cvtsi32_si128(8);
  270. const __m128i sum = dc_sum_16(above);
  271. const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  272. (void)left;
  273. (void)bd;
  274. dc_store_16x16(dst, stride, &dc);
  275. }
  276. void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
  277. const uint16_t *above,
  278. const uint16_t *left, int bd) {
  279. const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  280. const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  281. (void)above;
  282. (void)left;
  283. dc_store_16x16(dst, stride, &dc_dup);
  284. }
  285. //------------------------------------------------------------------------------
  286. // DC 32x32
// Sum the thirty-two 16-bit pixels ref[0..31]; the total is in the low
// 32-bit lane.
static INLINE __m128i dc_sum_32(const uint16_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sum_a = dc_sum_16(ref);
  const __m128i sum_b = dc_sum_16(ref + 16);
  // 12 bit bd will outrange, so expand to 32 bit before adding final total
  // (32 * 4095 = 131040 does not fit in 16 bits).
  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
                       _mm_unpacklo_epi16(sum_b, zero));
}
  295. static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
  296. const __m128i *dc) {
  297. const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  298. const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  299. int i;
  300. for (i = 0; i < 32; ++i, dst += stride) {
  301. _mm_store_si128((__m128i *)dst, dc_dup);
  302. _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  303. _mm_store_si128((__m128i *)(dst + 16), dc_dup);
  304. _mm_store_si128((__m128i *)(dst + 24), dc_dup);
  305. }
  306. }
  307. void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
  308. const uint16_t *above,
  309. const uint16_t *left, int bd) {
  310. const __m128i sixteen = _mm_cvtsi32_si128(16);
  311. const __m128i sum = dc_sum_32(left);
  312. const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  313. (void)above;
  314. (void)bd;
  315. dc_store_32x32(dst, stride, &dc);
  316. }
  317. void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
  318. const uint16_t *above,
  319. const uint16_t *left, int bd) {
  320. const __m128i sixteen = _mm_cvtsi32_si128(16);
  321. const __m128i sum = dc_sum_32(above);
  322. const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  323. (void)left;
  324. (void)bd;
  325. dc_store_32x32(dst, stride, &dc);
  326. }
  327. void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
  328. const uint16_t *above,
  329. const uint16_t *left, int bd) {
  330. const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  331. const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  332. (void)above;
  333. (void)left;
  334. dc_store_32x32(dst, stride, &dc_dup);
  335. }
  336. // -----------------------------------------------------------------------------
  337. /*
  338. ; ------------------------------------------
  339. ; input: x, y, z, result
  340. ;
  341. ; trick from pascal
  342. ; (x+2y+z+2)>>2 can be calculated as:
  343. ; result = avg(x,z)
  344. ; result -= xor(x,z) & 1
  345. ; result = avg(result,y)
  346. ; ------------------------------------------
  347. */
// Per-lane (x + 2*y + z + 2) >> 2 for unsigned 16-bit lanes, using the
// "pascal" trick described in the comment above: avg(x,z) rounds up, so the
// carry bit ((x ^ z) & 1) is subtracted to get floor((x+z)/2) before the
// final rounding average with y.  The saturating subtract cannot underflow
// here because avg(x,z) >= 1 whenever (x ^ z) & 1 is 1.
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y);
}
// D117 (diagonal, ~112.5 degrees) 4x4 predictor, high bit depth.
// Builds the combined border vector [K J I X A B C D] (X = above[-1]) and
// derives rows from shifted 2-tap and 3-tap averages of it.
void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int I = left[0];
  const int J = left[1];
  const int K = left[2];
  // Unaligned load starting four pixels before `above`: lanes 0-2 are
  // garbage and are overwritten by the inserts below; lane 3 is X.
  // NOTE(review): assumes above[-4..-1] is readable (padded border buffer,
  // as in libvpx's neighbor arrays) — confirm for other callers.
  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
  // One- and two-step shifted copies of the border vector.
  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
  const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
  // avg2 = 2-tap averages of adjacent border pixels; avg3 = (1,2,1)/4 taps.
  const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
  const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
  // Even rows come from avg2, odd rows from avg3, each shifted one pixel
  // further along the border.
  const __m128i row0 = _mm_srli_si128(avg2, 6);
  const __m128i row1 = _mm_srli_si128(avg3, 4);
  const __m128i row2 = _mm_srli_si128(avg2, 4);
  const __m128i row3 = _mm_srli_si128(avg3, 2);
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst -= stride;
  // Patch the left-edge pixels of rows 2 and 3 with 3-tap values that the
  // vector shifts above could not place.
  dst[0] = _mm_extract_epi16(avg3, 1);
  dst[stride] = _mm_extract_epi16(avg3, 0);
}
// D135 (diagonal down-right, 135 degrees) 4x4 predictor, high bit depth.
// All outputs are 3-tap averages along the border [L K J I X A B C]
// (X = above[-1]); each row is the previous row shifted one border pixel.
void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int I = left[0];
  const int J = left[1];
  const int K = left[2];
  const int L = left[3];
  // Unaligned load starting four pixels before `above`: lanes 0-2 are
  // garbage and are overwritten by the inserts below; lane 3 is X.
  // NOTE(review): assumes above[-4..-1] is readable (padded border buffer,
  // as in libvpx's neighbor arrays) — confirm for other callers.
  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
  // Shift up one lane and insert L to extend the border one pixel deeper.
  const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
  // (x + 2y + z + 2) >> 2 over three consecutive border pixels.
  const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
  const __m128i row0 = _mm_srli_si128(avg3, 6);
  const __m128i row1 = _mm_srli_si128(avg3, 4);
  const __m128i row2 = _mm_srli_si128(avg3, 2);
  const __m128i row3 = avg3;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
// D153 (diagonal, ~157.5 degrees) 4x4 predictor, high bit depth.
// Works on the border vector [L K J I X A B C] (X = above[-1]): column 0/1
// of the block interleave 2-tap and 3-tap averages down the left column,
// the rest comes from 3-tap averages toward the above row.
void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int I = left[0];
  const int J = left[1];
  const int K = left[2];
  const int L = left[3];
  // Load X,A,B,C into the high 64 bits (low half zeroed, then filled by the
  // inserts below).
  const __m128i XXXXXABC = _mm_castps_si128(
      _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1)));
  const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
  const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
  const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
  const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
  // One- and two-step shifted copies of the border vector.
  const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
  const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
  // avg3 = (1,2,1)/4 taps; avg2 = 2-tap averages of adjacent border pixels.
  const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
  const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
  // Interleave avg2/avg3 pairs; rows 1-3 are successive 2-lane shifts.
  const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i row2 = _mm_srli_si128(row3, 4);
  const __m128i row1 = _mm_srli_si128(row3, 8);
  const __m128i row0 = _mm_srli_si128(avg3, 4);
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  // Overwrite row 0's left-edge pixel with avg(I, X), which the shifted
  // avg3 vector cannot supply.
  dst[0] = _mm_extract_epi16(avg2, 3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
// D207 (down-left from the left column, 207 degrees) 4x4 predictor, high
// bit depth.  Only the left column is used; positions past the last left
// pixel repeat L.
void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // Aligned 8-word load; only left[0..3] are meaningful for this 4-high
  // block, the upper lanes are discarded below.
  // NOTE(review): assumes `left` is 16-byte aligned and at least 8 words
  // long (true for libvpx's padded border arrays) — confirm for other
  // callers.
  const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left);
  // Broadcast L across the low half, then build [I J K L L L L L].
  const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff);
  const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000);
  const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2);
  const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4);
  // avg3 = (1,2,1)/4 taps; avg2 = 2-tap averages of adjacent left pixels.
  const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00);
  const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0);
  // Row 0 interleaves 2-tap/3-tap values; each later row shifts by a pair.
  const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i row1 = _mm_srli_si128(row0, 4);
  const __m128i row2 = _mm_srli_si128(row0, 8);
  const __m128i row3 = LLLL0000;  // bottom row is all L
  (void)above;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
// D63 (diagonal down-left, 63 degrees) 4x4 predictor, high bit depth.
// Only the above row is used: even rows are 2-tap averages, odd rows 3-tap
// averages, shifting one pixel right every two rows.
void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Reads above[0..7]; the extended above row must provide 8 pixels.
  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
  // avg3 = (1,2,1)/4 taps; avg2 = 2-tap averages of adjacent above pixels.
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0);
  const __m128i row0 = avg2;
  const __m128i row1 = avg3;
  const __m128i row2 = _mm_srli_si128(avg2, 2);
  const __m128i row3 = _mm_srli_si128(avg3, 2);
  (void)left;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}