/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
  16. static INLINE void load_from_transformed(const int32_t *const trans_buf,
  17. const int first, const int second,
  18. int32x4x2_t *const q0,
  19. int32x4x2_t *const q1) {
  20. q0->val[0] = vld1q_s32(trans_buf + first * 8);
  21. q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4);
  22. q1->val[0] = vld1q_s32(trans_buf + second * 8);
  23. q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4);
  24. }
  25. static INLINE void load_from_output(const int32_t *const out, const int first,
  26. const int second, int32x4x2_t *const q0,
  27. int32x4x2_t *const q1) {
  28. q0->val[0] = vld1q_s32(out + first * 32);
  29. q0->val[1] = vld1q_s32(out + first * 32 + 4);
  30. q1->val[0] = vld1q_s32(out + second * 32);
  31. q1->val[1] = vld1q_s32(out + second * 32 + 4);
  32. }
  33. static INLINE void store_in_output(int32_t *const out, const int first,
  34. const int second, const int32x4x2_t q0,
  35. const int32x4x2_t q1) {
  36. vst1q_s32(out + first * 32, q0.val[0]);
  37. vst1q_s32(out + first * 32 + 4, q0.val[1]);
  38. vst1q_s32(out + second * 32, q1.val[0]);
  39. vst1q_s32(out + second * 32 + 4, q1.val[1]);
  40. }
  41. static INLINE void highbd_store_combine_results(
  42. uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0,
  43. const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3,
  44. const int16x8_t max) {
  45. int16x8_t o[4];
  46. uint16x8_t d[4];
  47. d[0] = vld1q_u16(p1);
  48. p1 += stride;
  49. d[1] = vld1q_u16(p1);
  50. d[3] = vld1q_u16(p2);
  51. p2 -= stride;
  52. d[2] = vld1q_u16(p2);
  53. o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6));
  54. o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6));
  55. o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6));
  56. o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6));
  57. o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0]));
  58. o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1]));
  59. o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2]));
  60. o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3]));
  61. o[0] = vminq_s16(o[0], max);
  62. o[1] = vminq_s16(o[1], max);
  63. o[2] = vminq_s16(o[2], max);
  64. o[3] = vminq_s16(o[3], max);
  65. d[0] = vqshluq_n_s16(o[0], 0);
  66. d[1] = vqshluq_n_s16(o[1], 0);
  67. d[2] = vqshluq_n_s16(o[2], 0);
  68. d[3] = vqshluq_n_s16(o[3], 0);
  69. vst1q_u16(p1, d[1]);
  70. p1 -= stride;
  71. vst1q_u16(p1, d[0]);
  72. vst1q_u16(p2, d[2]);
  73. p2 += stride;
  74. vst1q_u16(p2, d[3]);
  75. }
// Butterfly rotation on two 8-lane (2x int32x4) inputs, with 64-bit
// intermediates and DCT_CONST_BITS rounding on the way back down:
//   *qOut0 = round(qIn0 * first_const - qIn1 * second_const)
//   *qOut1 = round(qIn0 * second_const + qIn1 * first_const)
static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1,
                                const int32_t first_const,
                                const int32_t second_const,
                                int32x4x2_t *const qOut0,
                                int32x4x2_t *const qOut1) {
  int64x2x2_t q[4];
  int32x2_t d[6];

  // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
  d[4] = vdup_n_s32(first_const);
  d[5] = vdup_n_s32(second_const);

  // q[0], q[1]: qIn0 * first_const - qIn1 * second_const (64-bit lanes).
  q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]);
  q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]);
  q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]);
  q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]);
  q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]);
  q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]);
  q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]);
  q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]);

  // q[2], q[3]: qIn0 * second_const + qIn1 * first_const (64-bit lanes).
  q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]);
  q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]);
  q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]);
  q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]);
  q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]);
  q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]);
  q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]);
  q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]);

  // Round-shift back to 32 bits.
  qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS),
                               vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS));
  qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS),
                               vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS));
  qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS),
                               vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS));
  qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS),
                               vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS));
}
  111. static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) {
  112. s[0].val[0] = vld1q_s32(in);
  113. s[0].val[1] = vld1q_s32(in + 4);
  114. in += 32;
  115. s[1].val[0] = vld1q_s32(in);
  116. s[1].val[1] = vld1q_s32(in + 4);
  117. in += 32;
  118. s[2].val[0] = vld1q_s32(in);
  119. s[2].val[1] = vld1q_s32(in + 4);
  120. in += 32;
  121. s[3].val[0] = vld1q_s32(in);
  122. s[3].val[1] = vld1q_s32(in + 4);
  123. in += 32;
  124. s[4].val[0] = vld1q_s32(in);
  125. s[4].val[1] = vld1q_s32(in + 4);
  126. in += 32;
  127. s[5].val[0] = vld1q_s32(in);
  128. s[5].val[1] = vld1q_s32(in + 4);
  129. in += 32;
  130. s[6].val[0] = vld1q_s32(in);
  131. s[6].val[1] = vld1q_s32(in + 4);
  132. in += 32;
  133. s[7].val[0] = vld1q_s32(in);
  134. s[7].val[1] = vld1q_s32(in + 4);
  135. }
  136. static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a,
  137. int32_t **out) {
  138. transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
  139. vst1q_s32(*out, a[0].val[0]);
  140. *out += 4;
  141. vst1q_s32(*out, a[0].val[1]);
  142. *out += 4;
  143. vst1q_s32(*out, a[1].val[0]);
  144. *out += 4;
  145. vst1q_s32(*out, a[1].val[1]);
  146. *out += 4;
  147. vst1q_s32(*out, a[2].val[0]);
  148. *out += 4;
  149. vst1q_s32(*out, a[2].val[1]);
  150. *out += 4;
  151. vst1q_s32(*out, a[3].val[0]);
  152. *out += 4;
  153. vst1q_s32(*out, a[3].val[1]);
  154. *out += 4;
  155. vst1q_s32(*out, a[4].val[0]);
  156. *out += 4;
  157. vst1q_s32(*out, a[4].val[1]);
  158. *out += 4;
  159. vst1q_s32(*out, a[5].val[0]);
  160. *out += 4;
  161. vst1q_s32(*out, a[5].val[1]);
  162. *out += 4;
  163. vst1q_s32(*out, a[6].val[0]);
  164. *out += 4;
  165. vst1q_s32(*out, a[6].val[1]);
  166. *out += 4;
  167. vst1q_s32(*out, a[7].val[0]);
  168. *out += 4;
  169. vst1q_s32(*out, a[7].val[1]);
  170. *out += 4;
  171. }
  172. static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) {
  173. int i;
  174. int32x4x2_t s[8];
  175. for (i = 0; i < 4; i++, input += 8) {
  176. load_s32x4q_dual(input, s);
  177. transpose_and_store_s32_8x8(s, &t_buf);
  178. }
  179. }
// Final combination (stage 7) for the first pass: fold the stage-6 values
// held in q[2..15] together with the partial rows already stored in |out|,
// producing all 32 rows of the intermediate buffer. Each group below
// computes a symmetric pair: out[n] from a sum and out[31-n]-side rows from
// the matching difference. Statement order matters — q[] entries are reused
// as scratch immediately after being consumed.
static INLINE void idct32_bands_end_1st_pass(int32_t *const out,
                                             int32x4x2_t *const q) {
  store_in_output(out, 16, 17, q[6], q[7]);
  store_in_output(out, 14, 15, q[8], q[9]);

  // Rows 0, 1, 30, 31.
  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  store_in_output(out, 30, 31, q[6], q[7]);
  store_in_output(out, 0, 1, q[4], q[5]);

  // Rows 12, 13, 18, 19.
  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = highbd_idct_add_dual(q[10], q[1]);
  q[3] = highbd_idct_add_dual(q[11], q[0]);
  q[4] = highbd_idct_sub_dual(q[11], q[0]);
  q[5] = highbd_idct_sub_dual(q[10], q[1]);
  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = highbd_idct_add_dual(q[4], q[1]);
  q[9] = highbd_idct_add_dual(q[5], q[0]);
  q[6] = highbd_idct_sub_dual(q[5], q[0]);
  q[7] = highbd_idct_sub_dual(q[4], q[1]);
  store_in_output(out, 18, 19, q[6], q[7]);
  store_in_output(out, 12, 13, q[8], q[9]);

  // Rows 2, 3, 28, 29.
  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  store_in_output(out, 28, 29, q[6], q[7]);
  store_in_output(out, 2, 3, q[4], q[5]);

  // Rows 10, 11, 20, 21.
  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = highbd_idct_add_dual(q[12], q[1]);
  q[3] = highbd_idct_add_dual(q[13], q[0]);
  q[4] = highbd_idct_sub_dual(q[13], q[0]);
  q[5] = highbd_idct_sub_dual(q[12], q[1]);
  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = highbd_idct_add_dual(q[4], q[1]);
  q[9] = highbd_idct_add_dual(q[5], q[0]);
  q[6] = highbd_idct_sub_dual(q[5], q[0]);
  q[7] = highbd_idct_sub_dual(q[4], q[1]);
  store_in_output(out, 20, 21, q[6], q[7]);
  store_in_output(out, 10, 11, q[8], q[9]);

  // Rows 4, 5, 26, 27.
  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  store_in_output(out, 26, 27, q[6], q[7]);
  store_in_output(out, 4, 5, q[4], q[5]);

  // Rows 8, 9, 22, 23.
  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = highbd_idct_add_dual(q[14], q[1]);
  q[3] = highbd_idct_add_dual(q[15], q[0]);
  q[4] = highbd_idct_sub_dual(q[15], q[0]);
  q[5] = highbd_idct_sub_dual(q[14], q[1]);
  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = highbd_idct_add_dual(q[4], q[1]);
  q[9] = highbd_idct_add_dual(q[5], q[0]);
  q[6] = highbd_idct_sub_dual(q[5], q[0]);
  q[7] = highbd_idct_sub_dual(q[4], q[1]);
  store_in_output(out, 22, 23, q[6], q[7]);
  store_in_output(out, 8, 9, q[8], q[9]);

  // Rows 6, 7, 24, 25.
  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  store_in_output(out, 24, 25, q[6], q[7]);
  store_in_output(out, 6, 7, q[4], q[5]);
}
// Final combination (stage 7) for the second pass. Mirrors
// idct32_bands_end_1st_pass, but instead of storing rows back into the
// intermediate buffer, each symmetric group of four rows is rounded,
// added to the prediction in |dest| and clamped via
// highbd_store_combine_results. dest0/dest1 walk inward from rows 0 and
// 31; dest2/dest3 walk outward from rows 16 and 15.
static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
                                             uint16_t *const dest,
                                             const int stride,
                                             const int16x8_t max,
                                             int32x4x2_t *const q) {
  uint16_t *dest0 = dest + 0 * stride;
  uint16_t *dest1 = dest + 31 * stride;
  uint16_t *dest2 = dest + 16 * stride;
  uint16_t *dest3 = dest + 15 * stride;
  const int str2 = stride << 1;

  // Rows 14, 15, 16, 17.
  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
                               max);
  dest2 += str2;
  dest3 -= str2;

  // Rows 0, 1, 30, 31.
  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
                               max);
  dest0 += str2;
  dest1 -= str2;

  // Rows 12, 13, 18, 19.
  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = highbd_idct_add_dual(q[10], q[1]);
  q[3] = highbd_idct_add_dual(q[11], q[0]);
  q[4] = highbd_idct_sub_dual(q[11], q[0]);
  q[5] = highbd_idct_sub_dual(q[10], q[1]);
  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = highbd_idct_add_dual(q[4], q[1]);
  q[9] = highbd_idct_add_dual(q[5], q[0]);
  q[6] = highbd_idct_sub_dual(q[5], q[0]);
  q[7] = highbd_idct_sub_dual(q[4], q[1]);
  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
                               max);
  dest2 += str2;
  dest3 -= str2;

  // Rows 2, 3, 28, 29.
  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
                               max);
  dest0 += str2;
  dest1 -= str2;

  // Rows 10, 11, 20, 21.
  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = highbd_idct_add_dual(q[12], q[1]);
  q[3] = highbd_idct_add_dual(q[13], q[0]);
  q[4] = highbd_idct_sub_dual(q[13], q[0]);
  q[5] = highbd_idct_sub_dual(q[12], q[1]);
  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = highbd_idct_add_dual(q[4], q[1]);
  q[9] = highbd_idct_add_dual(q[5], q[0]);
  q[6] = highbd_idct_sub_dual(q[5], q[0]);
  q[7] = highbd_idct_sub_dual(q[4], q[1]);
  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
                               max);
  dest2 += str2;
  dest3 -= str2;

  // Rows 4, 5, 26, 27.
  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
                               max);
  dest0 += str2;
  dest1 -= str2;

  // Rows 8, 9, 22, 23.
  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = highbd_idct_add_dual(q[14], q[1]);
  q[3] = highbd_idct_add_dual(q[15], q[0]);
  q[4] = highbd_idct_sub_dual(q[15], q[0]);
  q[5] = highbd_idct_sub_dual(q[14], q[1]);
  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = highbd_idct_add_dual(q[4], q[1]);
  q[9] = highbd_idct_add_dual(q[5], q[0]);
  q[6] = highbd_idct_sub_dual(q[5], q[0]);
  q[7] = highbd_idct_sub_dual(q[4], q[1]);
  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
                               max);

  // Rows 6, 7, 24, 25.
  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = highbd_idct_add_dual(q[2], q[1]);
  q[5] = highbd_idct_add_dual(q[3], q[0]);
  q[6] = highbd_idct_sub_dual(q[3], q[0]);
  q[7] = highbd_idct_sub_dual(q[2], q[1]);
  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
                               max);
}
// Full high-bitdepth 32x32 inverse DCT with reconstruction. Two-pass
// scheme: pass 1 transforms the rows of |input| into |pass1|; pass 2
// transforms its columns, then rounds, adds to the prediction in |dst| and
// clamps to bit depth |bd|. Each pass handles the 32 columns in 4 strips
// of 8, transposing each strip into |trans_buf| first. The butterfly
// network follows the stage numbering of the reference idct32 and is
// heavily order-dependent: q[] registers are recycled between stages.
static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
                                             uint16_t *dst, const int stride,
                                             const int bd) {
  int i, idct32_pass_loop;
  int32_t trans_buf[32 * 8];  // one transposed 8-column strip
  int32_t pass1[32 * 32];     // intermediate after the first pass
  int32_t pass2[32 * 32];     // output of the second pass
  int32_t *out;
  int32x4x2_t q[16];

  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
       idct32_pass_loop++, input = pass1, out = pass2) {
    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);
      input += 32 * 8;

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
      // part of stage 2
      q[4] = highbd_idct_add_dual(q[0], q[1]);
      q[13] = highbd_idct_sub_dual(q[0], q[1]);
      q[6] = highbd_idct_add_dual(q[2], q[3]);
      q[14] = highbd_idct_sub_dual(q[2], q[3]);
      // part of stage 3
      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
      // generate 18,19,28,29
      // part of stage 1
      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
      // part of stage 2
      q[13] = highbd_idct_sub_dual(q[3], q[2]);
      q[3] = highbd_idct_add_dual(q[3], q[2]);
      q[14] = highbd_idct_sub_dual(q[1], q[0]);
      q[2] = highbd_idct_add_dual(q[1], q[0]);
      // part of stage 3
      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
      // part of stage 4
      q[8] = highbd_idct_add_dual(q[4], q[2]);
      q[9] = highbd_idct_add_dual(q[5], q[0]);
      q[10] = highbd_idct_add_dual(q[7], q[1]);
      q[15] = highbd_idct_add_dual(q[6], q[3]);
      q[13] = highbd_idct_sub_dual(q[5], q[0]);
      q[14] = highbd_idct_sub_dual(q[7], q[1]);
      store_in_output(out, 16, 31, q[8], q[15]);
      store_in_output(out, 17, 30, q[9], q[10]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
      store_in_output(out, 29, 18, q[1], q[0]);
      // part of stage 4
      q[13] = highbd_idct_sub_dual(q[4], q[2]);
      q[14] = highbd_idct_sub_dual(q[6], q[3]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
      store_in_output(out, 19, 28, q[4], q[6]);

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
      // part of stage 2
      q[13] = highbd_idct_sub_dual(q[0], q[1]);
      q[0] = highbd_idct_add_dual(q[0], q[1]);
      q[14] = highbd_idct_sub_dual(q[2], q[3]);
      q[2] = highbd_idct_add_dual(q[2], q[3]);
      // part of stage 3
      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
      // generate 22,23,24,25
      // part of stage 1
      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
      // part of stage 2
      q[14] = highbd_idct_sub_dual(q[4], q[5]);
      q[5] = highbd_idct_add_dual(q[4], q[5]);
      q[13] = highbd_idct_sub_dual(q[6], q[7]);
      q[6] = highbd_idct_add_dual(q[6], q[7]);
      // part of stage 3
      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
      // part of stage 4
      q[10] = highbd_idct_add_dual(q[7], q[1]);
      q[11] = highbd_idct_add_dual(q[5], q[0]);
      q[12] = highbd_idct_add_dual(q[6], q[2]);
      q[15] = highbd_idct_add_dual(q[4], q[3]);
      // part of stage 6
      load_from_output(out, 16, 17, &q[14], &q[13]);
      q[8] = highbd_idct_add_dual(q[14], q[11]);
      q[9] = highbd_idct_add_dual(q[13], q[10]);
      q[13] = highbd_idct_sub_dual(q[13], q[10]);
      q[11] = highbd_idct_sub_dual(q[14], q[11]);
      store_in_output(out, 17, 16, q[9], q[8]);
      load_from_output(out, 30, 31, &q[14], &q[9]);
      q[8] = highbd_idct_sub_dual(q[9], q[12]);
      q[10] = highbd_idct_add_dual(q[14], q[15]);
      q[14] = highbd_idct_sub_dual(q[14], q[15]);
      q[12] = highbd_idct_add_dual(q[9], q[12]);
      store_in_output(out, 30, 31, q[10], q[12]);
      // part of stage 7
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 25, 22, q[14], q[13]);
      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 24, 23, q[14], q[13]);
      // part of stage 4
      q[14] = highbd_idct_sub_dual(q[5], q[0]);
      q[13] = highbd_idct_sub_dual(q[6], q[2]);
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
      q[14] = highbd_idct_sub_dual(q[7], q[1]);
      q[13] = highbd_idct_sub_dual(q[4], q[3]);
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
      // part of stage 6
      load_from_output(out, 18, 19, &q[14], &q[13]);
      q[8] = highbd_idct_add_dual(q[14], q[1]);
      q[9] = highbd_idct_add_dual(q[13], q[6]);
      q[13] = highbd_idct_sub_dual(q[13], q[6]);
      q[1] = highbd_idct_sub_dual(q[14], q[1]);
      store_in_output(out, 18, 19, q[8], q[9]);
      load_from_output(out, 28, 29, &q[8], &q[9]);
      q[14] = highbd_idct_sub_dual(q[8], q[5]);
      q[10] = highbd_idct_add_dual(q[8], q[5]);
      q[11] = highbd_idct_add_dual(q[9], q[0]);
      q[0] = highbd_idct_sub_dual(q[9], q[0]);
      store_in_output(out, 28, 29, q[10], q[11]);
      // part of stage 7
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 20, 27, q[13], q[14]);
      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
      store_in_output(out, 21, 26, q[1], q[0]);

      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
      // part of stage 3
      q[13] = highbd_idct_sub_dual(q[0], q[1]);
      q[0] = highbd_idct_add_dual(q[0], q[1]);
      q[14] = highbd_idct_sub_dual(q[2], q[3]);
      q[2] = highbd_idct_add_dual(q[2], q[3]);
      // part of stage 4
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
      // generate 10,11,12,13
      // part of stage 2
      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
      // part of stage 3
      q[14] = highbd_idct_sub_dual(q[4], q[5]);
      q[5] = highbd_idct_add_dual(q[4], q[5]);
      q[13] = highbd_idct_sub_dual(q[6], q[7]);
      q[6] = highbd_idct_add_dual(q[6], q[7]);
      // part of stage 4
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
      // part of stage 5
      q[8] = highbd_idct_add_dual(q[0], q[5]);
      q[9] = highbd_idct_add_dual(q[1], q[7]);
      q[13] = highbd_idct_sub_dual(q[1], q[7]);
      q[14] = highbd_idct_sub_dual(q[3], q[4]);
      q[10] = highbd_idct_add_dual(q[3], q[4]);
      q[15] = highbd_idct_add_dual(q[2], q[6]);
      store_in_output(out, 8, 15, q[8], q[15]);
      store_in_output(out, 9, 14, q[9], q[10]);
      // part of stage 6
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      store_in_output(out, 13, 10, q[3], q[1]);
      q[13] = highbd_idct_sub_dual(q[0], q[5]);
      q[14] = highbd_idct_sub_dual(q[2], q[6]);
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      store_in_output(out, 11, 12, q[1], q[3]);

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
      // part of stage 4
      q[13] = highbd_idct_sub_dual(q[0], q[1]);
      q[0] = highbd_idct_add_dual(q[0], q[1]);
      q[14] = highbd_idct_sub_dual(q[2], q[3]);
      q[2] = highbd_idct_add_dual(q[2], q[3]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      // generate 0,1,2,3
      // part of stage 4
      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
      // part of stage 5
      q[4] = highbd_idct_add_dual(q[7], q[6]);
      q[7] = highbd_idct_sub_dual(q[7], q[6]);
      q[6] = highbd_idct_sub_dual(q[5], q[14]);
      q[5] = highbd_idct_add_dual(q[5], q[14]);
      // part of stage 6
      q[8] = highbd_idct_add_dual(q[4], q[2]);
      q[9] = highbd_idct_add_dual(q[5], q[3]);
      q[10] = highbd_idct_add_dual(q[6], q[1]);
      q[11] = highbd_idct_add_dual(q[7], q[0]);
      q[12] = highbd_idct_sub_dual(q[7], q[0]);
      q[13] = highbd_idct_sub_dual(q[6], q[1]);
      q[14] = highbd_idct_sub_dual(q[5], q[3]);
      q[15] = highbd_idct_sub_dual(q[4], q[2]);
      // part of stage 7
      load_from_output(out, 14, 15, &q[0], &q[1]);
      q[2] = highbd_idct_add_dual(q[8], q[1]);
      q[3] = highbd_idct_add_dual(q[9], q[0]);
      q[4] = highbd_idct_sub_dual(q[9], q[0]);
      q[5] = highbd_idct_sub_dual(q[8], q[1]);
      load_from_output(out, 16, 17, &q[0], &q[1]);
      q[8] = highbd_idct_add_dual(q[4], q[1]);
      q[9] = highbd_idct_add_dual(q[5], q[0]);
      q[6] = highbd_idct_sub_dual(q[5], q[0]);
      q[7] = highbd_idct_sub_dual(q[4], q[1]);

      // Finish the strip: pass 1 writes back into the intermediate
      // buffer; pass 2 reconstructs into |dst| clamped to bit depth.
      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out, q);
      } else {
        const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
        idct32_bands_end_2nd_pass(out, dst, stride, max, q);
        dst += 8;
      }
    }
  }
}
  578. void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
  579. int stride, int bd) {
  580. if (bd == 8) {
  581. vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
  582. } else {
  583. vpx_highbd_idct32_32_neon(input, dest, stride, bd);
  584. }
  585. }