/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
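
// The load/store helpers below assume two different row pitches: trans_buf
// holds the transposed 32x8 working band with 8 int16 values per row, while
// the pass buffers hold full 32x32 blocks with 32 int16 values per row. The
// 'first'/'second' arguments are row indices into those buffers.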
static INLINE void load_from_transformed(const int16_t *const trans_buf,
                                         const int first, const int second,
                                         int16x8_t *const q0,
                                         int16x8_t *const q1) {
  *q0 = vld1q_s16(trans_buf + first * 8);
  *q1 = vld1q_s16(trans_buf + second * 8);
}

static INLINE void load_from_output(const int16_t *const out, const int first,
                                    const int second, int16x8_t *const q0,
                                    int16x8_t *const q1) {
  *q0 = vld1q_s16(out + first * 32);
  *q1 = vld1q_s16(out + second * 32);
}

static INLINE void store_in_output(int16_t *const out, const int first,
                                   const int second, const int16x8_t q0,
                                   const int16x8_t q1) {
  vst1q_s16(out + first * 32, q0);
  vst1q_s16(out + second * 32, q1);
}
static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2,
                                         const int stride, int16x8_t q0,
                                         int16x8_t q1, int16x8_t q2,
                                         int16x8_t q3) {
  uint8x8_t d[4];

  d[0] = vld1_u8(p1);
  p1 += stride;
  d[1] = vld1_u8(p1);
  d[3] = vld1_u8(p2);
  p2 -= stride;
  d[2] = vld1_u8(p2);

  q0 = vrshrq_n_s16(q0, 6);
  q1 = vrshrq_n_s16(q1, 6);
  q2 = vrshrq_n_s16(q2, 6);
  q3 = vrshrq_n_s16(q3, 6);

  q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0]));
  q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1]));
  q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2]));
  q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3]));

  d[0] = vqmovun_s16(q0);
  d[1] = vqmovun_s16(q1);
  d[2] = vqmovun_s16(q2);
  d[3] = vqmovun_s16(q3);

  vst1_u8(p1, d[1]);
  p1 -= stride;
  vst1_u8(p1, d[0]);
  vst1_u8(p2, d[2]);
  p2 += stride;
  vst1_u8(p2, d[3]);
}
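
// Same reconstruction for high-bitdepth builds running an 8-bit stream: the
// destination is a uint16_t buffer, but values are still clamped to the 8-bit
// range with vqmovun_s16 before being widened back to 16 bits.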
static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2,
                                                    const int stride,
                                                    int16x8_t q0, int16x8_t q1,
                                                    int16x8_t q2,
                                                    int16x8_t q3) {
  uint16x8_t d[4];

  d[0] = vld1q_u16(p1);
  p1 += stride;
  d[1] = vld1q_u16(p1);
  d[3] = vld1q_u16(p2);
  p2 -= stride;
  d[2] = vld1q_u16(p2);

  q0 = vrshrq_n_s16(q0, 6);
  q1 = vrshrq_n_s16(q1, 6);
  q2 = vrshrq_n_s16(q2, 6);
  q3 = vrshrq_n_s16(q3, 6);

  q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0]));
  q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1]));
  q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2]));
  q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3]));

  d[0] = vmovl_u8(vqmovun_s16(q0));
  d[1] = vmovl_u8(vqmovun_s16(q1));
  d[2] = vmovl_u8(vqmovun_s16(q2));
  d[3] = vmovl_u8(vqmovun_s16(q3));

  vst1q_u16(p1, d[1]);
  p1 -= stride;
  vst1q_u16(p1, d[0]);
  vst1q_u16(p2, d[2]);
  p2 += stride;
  vst1q_u16(p2, d[3]);
}
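
// Standard IDCT butterfly on two 8-lane vectors:
//   *qOut0 = (qIn0 * first_const - qIn1 * second_const), rounded and shifted
//            right by DCT_CONST_BITS
//   *qOut1 = (qIn0 * second_const + qIn1 * first_const), rounded and shifted
//            right by DCT_CONST_BITS
// using 32-bit intermediates and vrshrn_n_s32 for the rounded narrowing.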
static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1,
                                const int16_t first_const,
                                const int16_t second_const,
                                int16x8_t *const qOut0,
                                int16x8_t *const qOut1) {
  int32x4_t q[4];
  int16x4_t d[6];

  d[0] = vget_low_s16(qIn0);
  d[1] = vget_high_s16(qIn0);
  d[2] = vget_low_s16(qIn1);
  d[3] = vget_high_s16(qIn1);

  // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9.
  d[4] = vdup_n_s16(first_const);
  d[5] = vdup_n_s16(second_const);

  q[0] = vmull_s16(d[0], d[4]);
  q[1] = vmull_s16(d[1], d[4]);
  q[0] = vmlsl_s16(q[0], d[2], d[5]);
  q[1] = vmlsl_s16(q[1], d[3], d[5]);

  q[2] = vmull_s16(d[0], d[5]);
  q[3] = vmull_s16(d[1], d[5]);
  q[2] = vmlal_s16(q[2], d[2], d[4]);
  q[3] = vmlal_s16(q[3], d[3], d[4]);

  *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS),
                        vrshrn_n_s32(q[1], DCT_CONST_BITS));
  *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS),
                        vrshrn_n_s32(q[3], DCT_CONST_BITS));
}

static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0,
                               int16x8_t *const s1, int16x8_t *const s2,
                               int16x8_t *const s3, int16x8_t *const s4,
                               int16x8_t *const s5, int16x8_t *const s6,
                               int16x8_t *const s7) {
  *s0 = vld1q_s16(in);
  in += 32;
  *s1 = vld1q_s16(in);
  in += 32;
  *s2 = vld1q_s16(in);
  in += 32;
  *s3 = vld1q_s16(in);
  in += 32;
  *s4 = vld1q_s16(in);
  in += 32;
  *s5 = vld1q_s16(in);
  in += 32;
  *s6 = vld1q_s16(in);
  in += 32;
  *s7 = vld1q_s16(in);
}

static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
                                               int16x8_t a2, int16x8_t a3,
                                               int16x8_t a4, int16x8_t a5,
                                               int16x8_t a6, int16x8_t a7,
                                               int16_t **out) {
  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1q_s16(*out, a0);
  *out += 8;
  vst1q_s16(*out, a1);
  *out += 8;
  vst1q_s16(*out, a2);
  *out += 8;
  vst1q_s16(*out, a3);
  *out += 8;
  vst1q_s16(*out, a4);
  *out += 8;
  vst1q_s16(*out, a5);
  *out += 8;
  vst1q_s16(*out, a6);
  *out += 8;
  vst1q_s16(*out, a7);
  *out += 8;
}
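
// Transposes an 8x32 strip of 'input' (eight rows of 32 coefficients) into
// t_buf as four consecutive transposed 8x8 blocks, i.e. a 32x8 band stored
// with a row pitch of 8, ready for load_from_transformed.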
static INLINE void idct32_transpose_pair(const int16_t *input,
                                         int16_t *t_buf) {
  int i;
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  for (i = 0; i < 4; i++, input += 8) {
    load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void load_s16x8q_tran_low(
    const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1,
    int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4,
    int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = load_tran_low_to_s16q(in);
  in += 32;
  *s1 = load_tran_low_to_s16q(in);
  in += 32;
  *s2 = load_tran_low_to_s16q(in);
  in += 32;
  *s3 = load_tran_low_to_s16q(in);
  in += 32;
  *s4 = load_tran_low_to_s16q(in);
  in += 32;
  *s5 = load_tran_low_to_s16q(in);
  in += 32;
  *s6 = load_tran_low_to_s16q(in);
  in += 32;
  *s7 = load_tran_low_to_s16q(in);
}

static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
                                                  int16_t *t_buf) {
  int i;
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  for (i = 0; i < 4; i++, input += 8) {
    load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
  }
}
#else   // !CONFIG_VP9_HIGHBITDEPTH
#define idct32_transpose_pair_tran_low idct32_transpose_pair
#endif  // CONFIG_VP9_HIGHBITDEPTH
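
// Final additions/subtractions for the first (row) pass: the in-register
// results q[2..15] are combined with the partial rows already stored in 'out'
// and all 32 rows of the current 8-column band are written back to 'out'.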
static INLINE void idct32_bands_end_1st_pass(int16_t *const out,
                                             int16x8_t *const q) {
  store_in_output(out, 16, 17, q[6], q[7]);
  store_in_output(out, 14, 15, q[8], q[9]);

  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 30, 31, q[6], q[7]);
  store_in_output(out, 0, 1, q[4], q[5]);

  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = vaddq_s16(q[10], q[1]);
  q[3] = vaddq_s16(q[11], q[0]);
  q[4] = vsubq_s16(q[11], q[0]);
  q[5] = vsubq_s16(q[10], q[1]);

  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = vaddq_s16(q[4], q[1]);
  q[9] = vaddq_s16(q[5], q[0]);
  q[6] = vsubq_s16(q[5], q[0]);
  q[7] = vsubq_s16(q[4], q[1]);
  store_in_output(out, 18, 19, q[6], q[7]);
  store_in_output(out, 12, 13, q[8], q[9]);

  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 28, 29, q[6], q[7]);
  store_in_output(out, 2, 3, q[4], q[5]);

  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = vaddq_s16(q[12], q[1]);
  q[3] = vaddq_s16(q[13], q[0]);
  q[4] = vsubq_s16(q[13], q[0]);
  q[5] = vsubq_s16(q[12], q[1]);

  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = vaddq_s16(q[4], q[1]);
  q[9] = vaddq_s16(q[5], q[0]);
  q[6] = vsubq_s16(q[5], q[0]);
  q[7] = vsubq_s16(q[4], q[1]);
  store_in_output(out, 20, 21, q[6], q[7]);
  store_in_output(out, 10, 11, q[8], q[9]);

  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 26, 27, q[6], q[7]);
  store_in_output(out, 4, 5, q[4], q[5]);

  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = vaddq_s16(q[14], q[1]);
  q[3] = vaddq_s16(q[15], q[0]);
  q[4] = vsubq_s16(q[15], q[0]);
  q[5] = vsubq_s16(q[14], q[1]);

  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = vaddq_s16(q[4], q[1]);
  q[9] = vaddq_s16(q[5], q[0]);
  q[6] = vsubq_s16(q[5], q[0]);
  q[7] = vsubq_s16(q[4], q[1]);
  store_in_output(out, 22, 23, q[6], q[7]);
  store_in_output(out, 8, 9, q[8], q[9]);

  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 24, 25, q[6], q[7]);
  store_in_output(out, 6, 7, q[4], q[5]);
}
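
// Second (column) pass equivalent of the above: instead of writing back to an
// intermediate buffer, the finished rows are added into the 8-bit destination
// via store_combine_results. final_add/final_sub are the add/subtract variants
// provided by idct_neon.h for this last stage.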
static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
                                             uint8_t *const dest,
                                             const int stride,
                                             int16x8_t *const q) {
  uint8_t *dest0 = dest + 0 * stride;
  uint8_t *dest1 = dest + 31 * stride;
  uint8_t *dest2 = dest + 16 * stride;
  uint8_t *dest3 = dest + 15 * stride;
  const int str2 = stride << 1;

  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = vaddq_s16(q[10], q[1]);
  q[3] = vaddq_s16(q[11], q[0]);
  q[4] = vsubq_s16(q[11], q[0]);
  q[5] = vsubq_s16(q[10], q[1]);

  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = vaddq_s16(q[12], q[1]);
  q[3] = vaddq_s16(q[13], q[0]);
  q[4] = vsubq_s16(q[13], q[0]);
  q[5] = vsubq_s16(q[12], q[1]);

  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = vaddq_s16(q[14], q[1]);
  q[3] = vaddq_s16(q[15], q[0]);
  q[4] = vsubq_s16(q[15], q[0]);
  q[5] = vsubq_s16(q[14], q[1]);

  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);

  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
}
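
// Same as idct32_bands_end_2nd_pass, but reconstructing into a uint16_t
// destination through highbd_store_combine_results_bd8 (bit depth 8 only).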
static INLINE void highbd_idct32_bands_end_2nd_pass_bd8(
    const int16_t *const out, uint16_t *const dest, const int stride,
    int16x8_t *const q) {
  uint16_t *dest0 = dest + 0 * stride;
  uint16_t *dest1 = dest + 31 * stride;
  uint16_t *dest2 = dest + 16 * stride;
  uint16_t *dest3 = dest + 15 * stride;
  const int str2 = stride << 1;

  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = vaddq_s16(q[10], q[1]);
  q[3] = vaddq_s16(q[11], q[0]);
  q[4] = vsubq_s16(q[11], q[0]);
  q[5] = vsubq_s16(q[10], q[1]);

  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = vaddq_s16(q[12], q[1]);
  q[3] = vaddq_s16(q[13], q[0]);
  q[4] = vsubq_s16(q[13], q[0]);
  q[5] = vsubq_s16(q[12], q[1]);

  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = vaddq_s16(q[14], q[1]);
  q[3] = vaddq_s16(q[15], q[0]);
  q[4] = vsubq_s16(q[15], q[0]);
  q[5] = vsubq_s16(q[14], q[1]);

  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);

  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
}
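
// Full 32x32 inverse DCT. Pass 1 transforms the rows of 'input' into 'pass1';
// pass 2 transforms the columns by reading pass1 back through the same
// transpose helper and adds the result to the destination. Each pass is
// processed in four 8-wide bands, with trans_buf holding the transposed band
// currently being worked on. When highbd_flag is set, 'dest' actually points
// to uint16_t pixels (CAST_TO_SHORTPTR) and the bd8 store path is used.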
void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
                        const int stride, const int highbd_flag) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1
  int16_t *out;
  int16x8_t q[16];
  uint16_t *dst = CAST_TO_SHORTPTR(dest);

  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
       idct32_pass_loop++, out = pass2) {
    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
      if (idct32_pass_loop == 0) {
        idct32_transpose_pair_tran_low(input, trans_buf);
        input += 32 * 8;
      } else {
        idct32_transpose_pair(input_pass2, trans_buf);
        input_pass2 += 32 * 8;
      }

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
      // part of stage 2
      q[4] = vaddq_s16(q[0], q[1]);
      q[13] = vsubq_s16(q[0], q[1]);
      q[6] = vaddq_s16(q[2], q[3]);
      q[14] = vsubq_s16(q[2], q[3]);
      // part of stage 3
      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);

      // generate 18,19,28,29
      // part of stage 1
      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
      // part of stage 2
      q[13] = vsubq_s16(q[3], q[2]);
      q[3] = vaddq_s16(q[3], q[2]);
      q[14] = vsubq_s16(q[1], q[0]);
      q[2] = vaddq_s16(q[1], q[0]);
      // part of stage 3
      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
      // part of stage 4
      q[8] = vaddq_s16(q[4], q[2]);
      q[9] = vaddq_s16(q[5], q[0]);
      q[10] = vaddq_s16(q[7], q[1]);
      q[15] = vaddq_s16(q[6], q[3]);
      q[13] = vsubq_s16(q[5], q[0]);
      q[14] = vsubq_s16(q[7], q[1]);
      store_in_output(out, 16, 31, q[8], q[15]);
      store_in_output(out, 17, 30, q[9], q[10]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
      store_in_output(out, 29, 18, q[1], q[0]);
      // part of stage 4
      q[13] = vsubq_s16(q[4], q[2]);
      q[14] = vsubq_s16(q[6], q[3]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
      store_in_output(out, 19, 28, q[4], q[6]);

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
      // part of stage 2
      q[13] = vsubq_s16(q[0], q[1]);
      q[0] = vaddq_s16(q[0], q[1]);
      q[14] = vsubq_s16(q[2], q[3]);
      q[2] = vaddq_s16(q[2], q[3]);
      // part of stage 3
      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);

      // generate 22,23,24,25
      // part of stage 1
      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
      // part of stage 2
      q[14] = vsubq_s16(q[4], q[5]);
      q[5] = vaddq_s16(q[4], q[5]);
      q[13] = vsubq_s16(q[6], q[7]);
      q[6] = vaddq_s16(q[6], q[7]);
      // part of stage 3
      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
      // part of stage 4
      q[10] = vaddq_s16(q[7], q[1]);
      q[11] = vaddq_s16(q[5], q[0]);
      q[12] = vaddq_s16(q[6], q[2]);
      q[15] = vaddq_s16(q[4], q[3]);
      // part of stage 6
      load_from_output(out, 16, 17, &q[14], &q[13]);
      q[8] = vaddq_s16(q[14], q[11]);
      q[9] = vaddq_s16(q[13], q[10]);
      q[13] = vsubq_s16(q[13], q[10]);
      q[11] = vsubq_s16(q[14], q[11]);
      store_in_output(out, 17, 16, q[9], q[8]);
      load_from_output(out, 30, 31, &q[14], &q[9]);
      q[8] = vsubq_s16(q[9], q[12]);
      q[10] = vaddq_s16(q[14], q[15]);
      q[14] = vsubq_s16(q[14], q[15]);
      q[12] = vaddq_s16(q[9], q[12]);
      store_in_output(out, 30, 31, q[10], q[12]);
      // part of stage 7
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 25, 22, q[14], q[13]);
      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 24, 23, q[14], q[13]);
      // part of stage 4
      q[14] = vsubq_s16(q[5], q[0]);
      q[13] = vsubq_s16(q[6], q[2]);
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
      q[14] = vsubq_s16(q[7], q[1]);
      q[13] = vsubq_s16(q[4], q[3]);
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
      // part of stage 6
      load_from_output(out, 18, 19, &q[14], &q[13]);
      q[8] = vaddq_s16(q[14], q[1]);
      q[9] = vaddq_s16(q[13], q[6]);
      q[13] = vsubq_s16(q[13], q[6]);
      q[1] = vsubq_s16(q[14], q[1]);
      store_in_output(out, 18, 19, q[8], q[9]);
      load_from_output(out, 28, 29, &q[8], &q[9]);
      q[14] = vsubq_s16(q[8], q[5]);
      q[10] = vaddq_s16(q[8], q[5]);
      q[11] = vaddq_s16(q[9], q[0]);
      q[0] = vsubq_s16(q[9], q[0]);
      store_in_output(out, 28, 29, q[10], q[11]);
      // part of stage 7
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 20, 27, q[13], q[14]);
      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
      store_in_output(out, 21, 26, q[1], q[0]);

      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
      // part of stage 3
      q[13] = vsubq_s16(q[0], q[1]);
      q[0] = vaddq_s16(q[0], q[1]);
      q[14] = vsubq_s16(q[2], q[3]);
      q[2] = vaddq_s16(q[2], q[3]);
      // part of stage 4
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);

      // generate 10,11,12,13
      // part of stage 2
      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
      // part of stage 3
      q[14] = vsubq_s16(q[4], q[5]);
      q[5] = vaddq_s16(q[4], q[5]);
      q[13] = vsubq_s16(q[6], q[7]);
      q[6] = vaddq_s16(q[6], q[7]);
      // part of stage 4
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
      // part of stage 5
      q[8] = vaddq_s16(q[0], q[5]);
      q[9] = vaddq_s16(q[1], q[7]);
      q[13] = vsubq_s16(q[1], q[7]);
      q[14] = vsubq_s16(q[3], q[4]);
      q[10] = vaddq_s16(q[3], q[4]);
      q[15] = vaddq_s16(q[2], q[6]);
      store_in_output(out, 8, 15, q[8], q[15]);
      store_in_output(out, 9, 14, q[9], q[10]);
      // part of stage 6
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      store_in_output(out, 13, 10, q[3], q[1]);
      q[13] = vsubq_s16(q[0], q[5]);
      q[14] = vsubq_s16(q[2], q[6]);
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      store_in_output(out, 11, 12, q[1], q[3]);

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
      // part of stage 4
      q[13] = vsubq_s16(q[0], q[1]);
      q[0] = vaddq_s16(q[0], q[1]);
      q[14] = vsubq_s16(q[2], q[3]);
      q[2] = vaddq_s16(q[2], q[3]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);

      // generate 0,1,2,3
      // part of stage 4
      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
      // part of stage 5
      q[4] = vaddq_s16(q[7], q[6]);
      q[7] = vsubq_s16(q[7], q[6]);
      q[6] = vsubq_s16(q[5], q[14]);
      q[5] = vaddq_s16(q[5], q[14]);
      // part of stage 6
      q[8] = vaddq_s16(q[4], q[2]);
      q[9] = vaddq_s16(q[5], q[3]);
      q[10] = vaddq_s16(q[6], q[1]);
      q[11] = vaddq_s16(q[7], q[0]);
      q[12] = vsubq_s16(q[7], q[0]);
      q[13] = vsubq_s16(q[6], q[1]);
      q[14] = vsubq_s16(q[5], q[3]);
      q[15] = vsubq_s16(q[4], q[2]);
      // part of stage 7
      load_from_output(out, 14, 15, &q[0], &q[1]);
      q[2] = vaddq_s16(q[8], q[1]);
      q[3] = vaddq_s16(q[9], q[0]);
      q[4] = vsubq_s16(q[9], q[0]);
      q[5] = vsubq_s16(q[8], q[1]);
      load_from_output(out, 16, 17, &q[0], &q[1]);
      q[8] = final_add(q[4], q[1]);
      q[9] = final_add(q[5], q[0]);
      q[6] = final_sub(q[5], q[0]);
      q[7] = final_sub(q[4], q[1]);

      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out, q);
      } else {
        if (highbd_flag) {
          highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q);
          dst += 8;
        } else {
          idct32_bands_end_2nd_pass(out, dest, stride, q);
          dest += 8;
        }
      }
    }
  }
}
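
// 8-bit entry point for the full (all-coefficient) 32x32 inverse transform:
// it simply runs both passes with highbd_flag set to 0.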
void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  vpx_idct32_32_neon(input, dest, stride, 0);
}