highbd_idct16x16_add_sse2.c

/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
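
// Stage 5 of the 16-point IDCT on 4 columns held as 32-bit lanes. Elements 4
// and 7 are not written here; the callers place those pass-through values in
// the output array already during stage 4.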
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
                                             __m128i *const out) {
  // stage 5
  out[0] = _mm_add_epi32(in[0], in[3]);
  out[1] = _mm_add_epi32(in[1], in[2]);
  out[2] = _mm_sub_epi32(in[1], in[2]);
  out[3] = _mm_sub_epi32(in[0], in[3]);
  highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
  out[8] = _mm_add_epi32(in[8], in[11]);
  out[9] = _mm_add_epi32(in[9], in[10]);
  out[10] = _mm_sub_epi32(in[9], in[10]);
  out[11] = _mm_sub_epi32(in[8], in[11]);
  out[12] = _mm_sub_epi32(in[15], in[12]);
  out[13] = _mm_sub_epi32(in[14], in[13]);
  out[14] = _mm_add_epi32(in[14], in[13]);
  out[15] = _mm_add_epi32(in[15], in[12]);
}
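
// Stage 6: recombine the even half (0-7), run the cospi_16_64 butterflies on
// elements 10-13 of the odd half, and pass 8, 9, 14 and 15 through unchanged.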
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
                                             __m128i *const out) {
  out[0] = _mm_add_epi32(in[0], in[7]);
  out[1] = _mm_add_epi32(in[1], in[6]);
  out[2] = _mm_add_epi32(in[2], in[5]);
  out[3] = _mm_add_epi32(in[3], in[4]);
  out[4] = _mm_sub_epi32(in[3], in[4]);
  out[5] = _mm_sub_epi32(in[2], in[5]);
  out[6] = _mm_sub_epi32(in[1], in[6]);
  out[7] = _mm_sub_epi32(in[0], in[7]);
  out[8] = in[8];
  out[9] = in[9];
  highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
  highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
  out[14] = in[14];
  out[15] = in[15];
}
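
// Full 16-point inverse DCT (stages 2-7) on 4 columns of 32-bit coefficients.
// io[] holds the 16 input vectors on entry and the 16 output vectors on
// return.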
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
  __m128i step1[16], step2[16];

  // stage 2
  highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
                        &step2[15]);
  highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
                        &step2[13]);
  highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
                        &step2[12]);

  // stage 3
  highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
                        &step1[7]);
  highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
                        &step1[6]);
  step1[8] = _mm_add_epi32(step2[8], step2[9]);
  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
  step1[11] = _mm_add_epi32(step2[10], step2[11]);
  step1[12] = _mm_add_epi32(step2[13], step2[12]);
  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
  step1[15] = _mm_add_epi32(step2[15], step2[14]);

  // stage 4
  highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
  highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
                        &step2[3]);
  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
                        &step2[13], &step2[10]);
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step1[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step1[7] = _mm_add_epi32(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  highbd_idct16_4col_stage5(step2, step1);
  highbd_idct16_4col_stage6(step1, step2);
  highbd_idct16_4col_stage7(step2, io);
}
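
// 16-point IDCT on 4 columns when only io[0..7] are nonzero (the eob <= 38
// case): single-input partial butterflies replace the full ones in stages 2-4.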
static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
  __m128i step1[16], step2[16];
  __m128i temp1[2], sign[2];

  // stage 2
  highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
                                &step2[15]);
  highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
                                    &step2[14]);
  highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
                                &step2[13]);
  highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
                                    &step2[12]);

  // stage 3
  highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
                                &step1[7]);
  highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
                                    &step1[6]);
  step1[8] = _mm_add_epi32(step2[8], step2[9]);
  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
  step1[11] = _mm_add_epi32(step2[10], step2[11]);
  step1[12] = _mm_add_epi32(step2[13], step2[12]);
  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
  step1[15] = _mm_add_epi32(step2[15], step2[14]);

  // stage 4
  abs_extend_64bit_sse2(io[0], temp1, sign);
  step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
  step2[1] = step2[0];
  highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
                                &step2[3]);
  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
                        &step2[13], &step2[10]);
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step1[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step1[7] = _mm_add_epi32(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  highbd_idct16_4col_stage5(step2, step1);
  highbd_idct16_4col_stage6(step1, step2);
  highbd_idct16_4col_stage7(step2, io);
}
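
// 16-point IDCT on 4 columns when only io[0..3] are nonzero (the eob <= 10
// case); most stage-3 terms reduce to copies or negations of earlier results.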
static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
  __m128i step1[16], step2[16];
  __m128i temp[2], sign[2];

  // stage 2
  highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
                                &step2[15]);
  highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
                                    &step2[12]);

  // stage 3
  highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
                                &step1[7]);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] =
      _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] =
      _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  abs_extend_64bit_sse2(io[0], temp, sign);
  step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
  step2[1] = step2[0];
  step2[2] = _mm_setzero_si128();
  step2[3] = _mm_setzero_si128();
  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
                        &step2[13], &step2[10]);
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  highbd_idct16_4col_stage5(step2, step1);
  highbd_idct16_4col_stage6(step1, step2);
  highbd_idct16_4col_stage7(step2, io);
}
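
// Full 16x16 inverse DCT and add to the destination. When bd == 8 the
// coefficients are packed to 16 bits and run through idct16_8col(); for
// higher bit depths the transform stays in 32 bits, 4 columns at a time.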
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  int i;
  __m128i out[16], *in;

  if (bd == 8) {
    __m128i l[16], r[16];

    in = l;
    for (i = 0; i < 2; i++) {
      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
      idct16_8col(in, in);
      in = r;
      input += 128;
    }

    for (i = 0; i < 16; i += 8) {
      int j;
      transpose_16bit_8x8(l + i, out);
      transpose_16bit_8x8(r + i, out + 8);
      idct16_8col(out, out);
      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_8(dest + j * stride, out[j], bd);
      }
      dest += 8;
    }
  } else {
    __m128i all[4][16];

    for (i = 0; i < 4; i++) {
      in = all[i];
      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
      highbd_idct16_4col(in);
      input += 4 * 16;
    }

    for (i = 0; i < 16; i += 4) {
      int j;
      transpose_32bit_4x4(all[0] + i, out + 0);
      transpose_32bit_4x4(all[1] + i, out + 4);
      transpose_32bit_4x4(all[2] + i, out + 8);
      transpose_32bit_4x4(all[3] + i, out + 12);
      highbd_idct16_4col(out);
      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}
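
// 16x16 inverse DCT for blocks whose nonzero coefficients lie in the top-left
// 8x8 corner (the eob <= 38 case). The bd == 8 path loads only that corner
// and zeroes the rest; the 32-bit path never loads the zero region at all.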
void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  int i;
  __m128i out[16];

  if (bd == 8) {
    __m128i in[16], temp[16];

    highbd_load_pack_transpose_32bit_8x8(input, 16, in);
    for (i = 8; i < 16; i++) {
      in[i] = _mm_setzero_si128();
    }
    idct16_8col(in, temp);

    for (i = 0; i < 16; i += 8) {
      int j;
      transpose_16bit_8x8(temp + i, in);
      idct16_8col(in, out);
      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_8(dest + j * stride, out[j], bd);
      }
      dest += 8;
    }
  } else {
    __m128i all[2][16], *in;

    for (i = 0; i < 2; i++) {
      in = all[i];
      highbd_load_transpose_32bit_8x4(input, 16, in);
      highbd_idct16x16_38_4col(in);
      input += 4 * 16;
    }

    for (i = 0; i < 16; i += 4) {
      int j;
      transpose_32bit_4x4(all[0] + i, out + 0);
      transpose_32bit_4x4(all[1] + i, out + 4);
      highbd_idct16x16_38_4col(out);
      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}
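
// 16x16 inverse DCT for blocks whose nonzero coefficients lie in the top-left
// 4x4 corner (the eob <= 10 case).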
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  int i;
  __m128i out[16];

  if (bd == 8) {
    __m128i in[16], l[16];

    in[0] = load_pack_8_32bit(input + 0 * 16);
    in[1] = load_pack_8_32bit(input + 1 * 16);
    in[2] = load_pack_8_32bit(input + 2 * 16);
    in[3] = load_pack_8_32bit(input + 3 * 16);
    idct16x16_10_pass1(in, l);

    for (i = 0; i < 16; i += 8) {
      int j;
      idct16x16_10_pass2(l + i, in);
      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_8(dest + j * stride, in[j], bd);
      }
      dest += 8;
    }
  } else {
    __m128i all[2][16], *in;

    for (i = 0; i < 2; i++) {
      in = all[i];
      highbd_load_transpose_32bit_4x4(input, 16, in);
      highbd_idct16x16_10_4col(in);
      input += 4 * 16;
    }

    for (i = 0; i < 16; i += 4) {
      int j;
      transpose_32bit_4x4(&all[0][i], out);
      highbd_idct16x16_10_4col(out);
      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}
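
// DC-only 16x16 inverse DCT: the shared highbd_idct_1_add_kernel() computes
// the single DC term and adds it to every pixel of the 16x16 block.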
void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
}