highbd_inv_txfm_sse2.h

/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

#include <assert.h>     // assert() is used by the helpers below
#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

// Note: There is no 64-bit bit-level shifting SIMD instruction. All
// coefficients are left shifted by 2, so that dct_const_round_shift() can be
// done by right shifting 2 bytes.

static INLINE void extend_64bit(const __m128i in,
                                __m128i *const out /*out[2]*/) {
  out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
  out[1] = _mm_unpackhi_epi32(in, in);  // 2, 2, 3, 3
}
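
// Round two vectors of four 32-bit values with the given rounding constant,
// arithmetic-shift right by 4 (the shift5 variant shifts by 5), and pack the
// eight results to 16 bits with signed saturation. With rounding = 1 << 3
// (resp. 1 << 4) this matches ROUND_POWER_OF_TWO(x, 4) (resp. 5), i.e. the
// final scaling of the inverse transform; see highbd_idct8x8_final_round()
// below for the 8x8 case.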
static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
                                           const __m128i rounding) {
  __m128i temp[2];
  temp[0] = _mm_add_epi32(in0, rounding);
  temp[1] = _mm_add_epi32(in1, rounding);
  temp[0] = _mm_srai_epi32(temp[0], 4);
  temp[1] = _mm_srai_epi32(temp[1], 4);
  return _mm_packs_epi32(temp[0], temp[1]);
}

static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
                                           const __m128i rounding) {
  __m128i temp[2];
  temp[0] = _mm_add_epi32(in0, rounding);
  temp[1] = _mm_add_epi32(in1, rounding);
  temp[0] = _mm_srai_epi32(temp[0], 5);
  temp[1] = _mm_srai_epi32(temp[1], 5);
  return _mm_packs_epi32(temp[0], temp[1]);
}
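
// 64-bit lane version of dct_const_round_shift(). The rounding constant is
// pre-shifted left by 2 to match the left-shifted coefficients (see the note
// at the top of this file), and the 2-byte whole-register shift divides by
// 2^16 = 2^(DCT_CONST_BITS + 2). Only the low 32 bits of each 64-bit lane are
// used afterwards; pack_4() below gathers them.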
static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
  const __m128i t =
      _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
  return _mm_srli_si128(t, 2);
}
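
// Gather the low 32 bits of the four 64-bit lanes held in in0 (results 0, 1)
// and in1 (results 2, 3) into a single vector of four 32-bit values, in order.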
static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 2
  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 1, 3
  return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
}
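
// Split four 32-bit values into their absolute values, duplicated into 64-bit
// lanes (out[0] holds elements 0-1, out[1] holds elements 2-3), together with
// matching 64-bit sign masks, so the unsigned 32x32->64 multiply below can be
// used and the sign restored afterwards.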
static INLINE void abs_extend_64bit_sse2(const __m128i in,
                                         __m128i *const out /*out[2]*/,
                                         __m128i *const sign /*sign[2]*/) {
  sign[0] = _mm_srai_epi32(in, 31);
  out[0] = _mm_xor_si128(in, sign[0]);
  out[0] = _mm_sub_epi32(out[0], sign[0]);
  sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]);  // 64-bit sign of 2, 3
  sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]);  // 64-bit sign of 0, 1
  out[1] = _mm_unpackhi_epi32(out[0], out[0]);     // 2, 3
  out[0] = _mm_unpacklo_epi32(out[0], out[0]);     // 0, 1
}
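
// Multiply the (absolute, duplicated) 32-bit values in 'in' by the even
// 32-bit lanes of 'cospi' using the unsigned SSE2 multiply, then re-apply the
// saved 64-bit sign masks by two's-complement negation (xor + subtract).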
// Note: cospi must be non-negative.
static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
                                               const __m128i sign,
                                               const __m128i cospi) {
  __m128i out = _mm_mul_epu32(in, cospi);
  out = _mm_xor_si128(out, sign);
  return _mm_sub_epi64(out, sign);
}

// Note: c must be non-negative.
static INLINE __m128i multiplication_round_shift_sse2(
    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
    const int c) {
  const __m128i pair_c = pair_set_epi32(c << 2, 0);
  __m128i t0, t1;

  assert(c >= 0);
  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
  t0 = dct_const_round_shift_64bit(t0);
  t1 = dct_const_round_shift_64bit(t1);
  return pack_4(t0, t1);
}

// Note: c must be non-negative.
static INLINE __m128i multiplication_neg_round_shift_sse2(
    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
    const int c) {
  const __m128i pair_c = pair_set_epi32(c << 2, 0);
  __m128i t0, t1;

  assert(c >= 0);
  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
  t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
  t1 = _mm_sub_epi64(_mm_setzero_si128(), t1);
  t0 = dct_const_round_shift_64bit(t0);
  t1 = dct_const_round_shift_64bit(t1);
  return pack_4(t0, t1);
}
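
// Butterfly with two constants, per 32-bit lane with 64-bit intermediates:
//   *out0 = dct_const_round_shift(in0 * c0 - in1 * c1)
//   *out1 = dct_const_round_shift(in0 * c1 + in1 * c0)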
// Note: c0 and c1 must be non-negative.
static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
                                         const int c0, const int c1,
                                         __m128i *const out0,
                                         __m128i *const out1) {
  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
  __m128i temp1[4], temp2[4], sign1[2], sign2[2];

  assert(c0 >= 0);
  assert(c1 >= 0);
  abs_extend_64bit_sse2(in0, temp1, sign1);
  abs_extend_64bit_sse2(in1, temp2, sign2);
  temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
  temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
  temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
  temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
  *out0 = pack_4(temp1[0], temp1[1]);
  *out1 = pack_4(temp2[0], temp2[1]);
}
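
// Partial butterfly (single input):
//   *out0 = dct_const_round_shift(in * c0)
//   *out1 = dct_const_round_shift(in * c1)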
// Note: c0 and c1 must be non-negative.
static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
                                                 const int c1,
                                                 __m128i *const out0,
                                                 __m128i *const out1) {
  __m128i temp[2], sign[2];

  assert(c0 >= 0);
  assert(c1 >= 0);
  abs_extend_64bit_sse2(in, temp, sign);
  *out0 = multiplication_round_shift_sse2(temp, sign, c0);
  *out1 = multiplication_round_shift_sse2(temp, sign, c1);
}
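
// Partial butterfly with a negated first output:
//   *out0 = dct_const_round_shift(-in * c1)
//   *out1 = dct_const_round_shift(in * c0)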
// Note: c0 and c1 must be non-negative.
static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
                                                     const int c0, const int c1,
                                                     __m128i *const out0,
                                                     __m128i *const out1) {
  __m128i temp[2], sign[2];

  assert(c0 >= 0);
  assert(c1 >= 0);
  abs_extend_64bit_sse2(in, temp, sign);
  *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
  *out1 = multiplication_round_shift_sse2(temp, sign, c0);
}

static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
                                                 const __m128i in1,
                                                 __m128i *const out0,
                                                 __m128i *const out1) {
  __m128i temp1[2], temp2, sign[2];

  temp2 = _mm_add_epi32(in0, in1);
  abs_extend_64bit_sse2(temp2, temp1, sign);
  *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
  temp2 = _mm_sub_epi32(in0, in1);
  abs_extend_64bit_sse2(temp2, temp1, sign);
  *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
}

// Only perform the addition/subtraction butterfly; size = 16 or 32.
static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
                                            int size) {
  int i = 0;
  const int num = size >> 1;
  const int bound = size - 1;
  while (i < num) {
    out[i] = _mm_add_epi32(in[i], in[bound - i]);
    out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
    i++;
  }
}

static INLINE void highbd_idct8_stage4(const __m128i *const in,
                                       __m128i *const out) {
  out[0] = _mm_add_epi32(in[0], in[7]);
  out[1] = _mm_add_epi32(in[1], in[6]);
  out[2] = _mm_add_epi32(in[2], in[5]);
  out[3] = _mm_add_epi32(in[3], in[4]);
  out[4] = _mm_sub_epi32(in[3], in[4]);
  out[5] = _mm_sub_epi32(in[2], in[5]);
  out[6] = _mm_sub_epi32(in[1], in[6]);
  out[7] = _mm_sub_epi32(in[0], in[7]);
}

static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
  io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
  io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
  io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
  io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
  io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
  io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
  io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
  io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
}

static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
                                             __m128i *const out) {
  out[0] = _mm_add_epi32(in[0], in[15]);
  out[1] = _mm_add_epi32(in[1], in[14]);
  out[2] = _mm_add_epi32(in[2], in[13]);
  out[3] = _mm_add_epi32(in[3], in[12]);
  out[4] = _mm_add_epi32(in[4], in[11]);
  out[5] = _mm_add_epi32(in[5], in[10]);
  out[6] = _mm_add_epi32(in[6], in[9]);
  out[7] = _mm_add_epi32(in[7], in[8]);
  out[8] = _mm_sub_epi32(in[7], in[8]);
  out[9] = _mm_sub_epi32(in[6], in[9]);
  out[10] = _mm_sub_epi32(in[5], in[10]);
  out[11] = _mm_sub_epi32(in[4], in[11]);
  out[12] = _mm_sub_epi32(in[3], in[12]);
  out[13] = _mm_sub_epi32(in[2], in[13]);
  out[14] = _mm_sub_epi32(in[1], in[14]);
  out[15] = _mm_sub_epi32(in[0], in[15]);
}
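
// Add the residual to the prediction (eight 16-bit lanes, with saturation) and
// clamp the result to the valid pixel range [0, (1 << bd) - 1].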
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
                                const int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  // Faster than _mm_set1_epi16((1 << bd) - 1).
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i d;

  d = _mm_adds_epi16(in0, in1);
  d = _mm_max_epi16(d, zero);
  d = _mm_min_epi16(d, max);
  return d;
}
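
// DC-only inverse transform: derive the DC value from input[0], round it with
// ROUND_POWER_OF_TWO(), and add it to every pixel of the size x size block.
// size is expected to be a multiple of 8 (rows are processed 8 pixels at a
// time).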
static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
                                            uint16_t *dest, int stride, int bd,
                                            const int size) {
  int a1, i, j;
  tran_low_t out;
  __m128i dc, d;

  out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
  dc = _mm_set1_epi16(a1);

  for (i = 0; i < size; ++i) {
    for (j = 0; j < size; j += 8) {
      d = _mm_load_si128((const __m128i *)(&dest[j]));
      d = add_clamp(d, dc, bd);
      _mm_store_si128((__m128i *)(&dest[j]), d);
    }
    dest += stride;
  }
}

static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
                                     const int bd) {
  __m128i d;

  d = _mm_loadl_epi64((const __m128i *)dest);
  d = add_clamp(d, in, bd);
  _mm_storel_epi64((__m128i *)dest, d);
}

static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
                                       const int stride, const int bd) {
  __m128i d;

  d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  d = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
  d = add_clamp(d, in, bd);
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
}

static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
                                       const int stride, const int bd) {
  recon_and_store_4x2(in[0], dest, stride, bd);
  dest += 2 * stride;
  recon_and_store_4x2(in[1], dest, stride, bd);
}

static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
                                     const int stride, const int bd) {
  __m128i d;

  d = _mm_load_si128((const __m128i *)(*dest));
  d = add_clamp(d, in, bd);
  _mm_store_si128((__m128i *)(*dest), d);
  *dest += stride;
}

static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
                                       const int stride, const int bd) {
  recon_and_store_8(in[0], &dest, stride, bd);
  recon_and_store_8(in[1], &dest, stride, bd);
  recon_and_store_8(in[2], &dest, stride, bd);
  recon_and_store_8(in[3], &dest, stride, bd);
  recon_and_store_8(in[4], &dest, stride, bd);
  recon_and_store_8(in[5], &dest, stride, bd);
  recon_and_store_8(in[6], &dest, stride, bd);
  recon_and_store_8(in[7], &dest, stride, bd);
}
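
// Load eight 32-bit coefficients and pack them to 16 bits with signed
// saturation.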
static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
  const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
  const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
  return _mm_packs_epi32(t0, t1);
}

static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
                                                        const int stride,
                                                        __m128i *const in) {
  in[0] = load_pack_8_32bit(input + 0 * stride);
  in[1] = load_pack_8_32bit(input + 1 * stride);
  in[2] = load_pack_8_32bit(input + 2 * stride);
  in[3] = load_pack_8_32bit(input + 3 * stride);
  in[4] = load_pack_8_32bit(input + 4 * stride);
  in[5] = load_pack_8_32bit(input + 5 * stride);
  in[6] = load_pack_8_32bit(input + 6 * stride);
  in[7] = load_pack_8_32bit(input + 7 * stride);
  transpose_16bit_8x8(in, in);
}

static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
                                                   const int stride,
                                                   __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
  in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
  in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
  in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
  in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
  in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
  in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
  in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
  transpose_32bit_8x4(in, in);
}

static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
                                                   const int stride,
                                                   __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  transpose_32bit_4x4(in, in);
}
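
// Apply the final rounding (add 1 << 5, arithmetic-shift right by 6, i.e.
// ROUND_POWER_OF_TWO(x, 6)), then reconstruct one row of 8 (or, below, 4)
// pixels with clamping.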
static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
                                         const int bd) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  __m128i out;

  out = _mm_adds_epi16(in, final_rounding);
  out = _mm_srai_epi16(out, 6);
  recon_and_store_8(out, &dest, 0, bd);
}

static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
                                         const int bd) {
  const __m128i final_rounding = _mm_set1_epi32(1 << 5);
  __m128i out;

  out = _mm_add_epi32(in, final_rounding);
  out = _mm_srai_epi32(out, 6);
  out = _mm_packs_epi32(out, out);
  recon_and_store_4(out, dest, bd);
}

#endif  // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_