
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void load_8x8_s32_dual(
    const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1,
    int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4,
    int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) {
  in0->val[0] = vld1q_s32(input);
  in0->val[1] = vld1q_s32(input + 4);
  input += 32;
  in1->val[0] = vld1q_s32(input);
  in1->val[1] = vld1q_s32(input + 4);
  input += 32;
  in2->val[0] = vld1q_s32(input);
  in2->val[1] = vld1q_s32(input + 4);
  input += 32;
  in3->val[0] = vld1q_s32(input);
  in3->val[1] = vld1q_s32(input + 4);
  input += 32;
  in4->val[0] = vld1q_s32(input);
  in4->val[1] = vld1q_s32(input + 4);
  input += 32;
  in5->val[0] = vld1q_s32(input);
  in5->val[1] = vld1q_s32(input + 4);
  input += 32;
  in6->val[0] = vld1q_s32(input);
  in6->val[1] = vld1q_s32(input + 4);
  input += 32;
  in7->val[0] = vld1q_s32(input);
  in7->val[1] = vld1q_s32(input + 4);
}
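
// Same as above but loads only 4 coefficients per row; used for columns 8..11,
// which fit in a single 4-lane vector per row.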
static INLINE void load_4x8_s32_dual(const tran_low_t *input,
                                     int32x4_t *const in0, int32x4_t *const in1,
                                     int32x4_t *const in2, int32x4_t *const in3,
                                     int32x4_t *const in4, int32x4_t *const in5,
                                     int32x4_t *const in6,
                                     int32x4_t *const in7) {
  *in0 = vld1q_s32(input);
  input += 32;
  *in1 = vld1q_s32(input);
  input += 32;
  *in2 = vld1q_s32(input);
  input += 32;
  *in3 = vld1q_s32(input);
  input += 32;
  *in4 = vld1q_s32(input);
  input += 32;
  *in5 = vld1q_s32(input);
  input += 32;
  *in6 = vld1q_s32(input);
  input += 32;
  *in7 = vld1q_s32(input);
}

// Only for the first pass of the _135_ variant. Since it only uses values from
// the top left 16x16 it can safely assume all the remaining values are 0 and
// skip an awful lot of calculations. In fact, only the first 12 columns make
// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
// used so it skips any calls to input[12|13|14|15] too.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 12x8 to allow using SIMD.

// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135
// non-zero coefficients as follows:
//      0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
//  0   0   2   5  10  17  25  38  47  62  83 101 121
//  1   1   4   8  15  22  30  45  58  74  92 112 133
//  2   3   7  12  18  28  36  52  64  82 102 118
//  3   6  11  16  23  31  43  60  73  90 109 126
//  4   9  14  19  29  37  50  65  78  98 116 134
//  5  13  20  26  35  44  54  72  85 105 123
//  6  21  27  33  42  53  63  80  94 113 132
//  7  24  32  39  48  57  71  88 104 120
//  8  34  40  46  56  68  81  96 111 130
//  9  41  49  55  67  77  91 107 124
// 10  51  59  66  76  89  99 119 131
// 11  61  69  75  87 100 114 129
// 12  70  79  86  97 108 122
// 13  84  93 103 110 125
// 14  95 106 115 127
// 15 117 128
static void vpx_highbd_idct32_12_neon(const tran_low_t *const input,
                                      int32_t *output) {
  int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
      s8[32];

  load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                    &in[6], &in[7]);
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0],
                    &in[9].val[1], &in[10].val[0], &in[10].val[1],
                    &in[11].val[0], &in[11].val[1]);
  transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1],
                    &in[10].val[0], &in[10].val[1], &in[11].val[0],
                    &in[11].val[1]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);

  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
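
  // in[13] and in[15] lie outside the 12 live columns, so the stage 1 terms
  // they would feed (s1[17]/s1[30] and s1[22]/s1[25]) are never formed here.
  // Later stages substitute the surviving partner value directly, e.g. stage 4
  // below adds s1[16] where the full 16-column pass adds s2[16].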

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);

  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);

  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);

  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);

  // stage 3
  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);

  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);

  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
                                                         s1[31], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
                                                         s1[31], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
                                                         s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
                                                         s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
                                                         s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
                                                         s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
                                                         s1[24], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
                                                         s1[24], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);

  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
                                                        s2[15], cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
                                                         s2[15], cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
                                                         s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
                                                         s3[13], cospi_24_64);

  s4[16] = highbd_idct_add_dual(s1[16], s2[19]);
  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
  s4[19] = highbd_idct_sub_dual(s1[16], s2[19]);
  s4[20] = highbd_idct_sub_dual(s1[23], s2[20]);
  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
  s4[23] = highbd_idct_add_dual(s2[20], s1[23]);
  s4[24] = highbd_idct_add_dual(s1[24], s2[27]);
  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
  s4[27] = highbd_idct_sub_dual(s1[24], s2[27]);
  s4[28] = highbd_idct_sub_dual(s1[31], s2[28]);
  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
  s4[31] = highbd_idct_add_dual(s2[28], s1[31]);

  // stage 5
  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64);

  s5[8] = highbd_idct_add_dual(s2[8], s3[11]);
  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
  s5[11] = highbd_idct_sub_dual(s2[8], s3[11]);
  s5[12] = highbd_idct_sub_dual(s2[15], s3[12]);
  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
  s5[15] = highbd_idct_add_dual(s2[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
                                                         s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
                                                         s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
                                                         s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
                                                         s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
                                                         s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
                                                         s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
                                                         s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
                                                         s4[26], cospi_24_64);

  // stage 6
  s6[0] = highbd_idct_add_dual(s5[0], s3[7]);
  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
  s6[3] = highbd_idct_add_dual(s5[3], s3[4]);
  s6[4] = highbd_idct_sub_dual(s5[3], s3[4]);
  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
  s6[7] = highbd_idct_sub_dual(s5[0], s3[7]);

  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);

  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);

  // stage 7
  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);

  // final stage
  s8[0] = highbd_idct_add_dual(s7[0], s6[31]);
  s8[1] = highbd_idct_add_dual(s7[1], s6[30]);
  s8[2] = highbd_idct_add_dual(s7[2], s6[29]);
  s8[3] = highbd_idct_add_dual(s7[3], s6[28]);
  s8[4] = highbd_idct_add_dual(s7[4], s7[27]);
  s8[5] = highbd_idct_add_dual(s7[5], s7[26]);
  s8[6] = highbd_idct_add_dual(s7[6], s7[25]);
  s8[7] = highbd_idct_add_dual(s7[7], s7[24]);
  s8[8] = highbd_idct_add_dual(s7[8], s7[23]);
  s8[9] = highbd_idct_add_dual(s7[9], s7[22]);
  s8[10] = highbd_idct_add_dual(s7[10], s7[21]);
  s8[11] = highbd_idct_add_dual(s7[11], s7[20]);
  s8[12] = highbd_idct_add_dual(s7[12], s6[19]);
  s8[13] = highbd_idct_add_dual(s7[13], s6[18]);
  s8[14] = highbd_idct_add_dual(s7[14], s6[17]);
  s8[15] = highbd_idct_add_dual(s7[15], s6[16]);
  s8[16] = highbd_idct_sub_dual(s7[15], s6[16]);
  s8[17] = highbd_idct_sub_dual(s7[14], s6[17]);
  s8[18] = highbd_idct_sub_dual(s7[13], s6[18]);
  s8[19] = highbd_idct_sub_dual(s7[12], s6[19]);
  s8[20] = highbd_idct_sub_dual(s7[11], s7[20]);
  s8[21] = highbd_idct_sub_dual(s7[10], s7[21]);
  s8[22] = highbd_idct_sub_dual(s7[9], s7[22]);
  s8[23] = highbd_idct_sub_dual(s7[8], s7[23]);
  s8[24] = highbd_idct_sub_dual(s7[7], s7[24]);
  s8[25] = highbd_idct_sub_dual(s7[6], s7[25]);
  s8[26] = highbd_idct_sub_dual(s7[5], s7[26]);
  s8[27] = highbd_idct_sub_dual(s7[4], s7[27]);
  s8[28] = highbd_idct_sub_dual(s7[3], s6[28]);
  s8[29] = highbd_idct_sub_dual(s7[2], s6[29]);
  s8[30] = highbd_idct_sub_dual(s7[1], s6[30]);
  s8[31] = highbd_idct_sub_dual(s7[0], s6[31]);
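
  // The 32 result rows are written 8 values at a time with a stride of 16,
  // filling half the columns of the 32x16 intermediate buffer; the second
  // pass (vpx_highbd_idct32_16_neon) reads them back 16 at a time.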
  vst1q_s32(output + 0, s8[0].val[0]);
  vst1q_s32(output + 4, s8[0].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[1].val[0]);
  vst1q_s32(output + 4, s8[1].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[2].val[0]);
  vst1q_s32(output + 4, s8[2].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[3].val[0]);
  vst1q_s32(output + 4, s8[3].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[4].val[0]);
  vst1q_s32(output + 4, s8[4].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[5].val[0]);
  vst1q_s32(output + 4, s8[5].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[6].val[0]);
  vst1q_s32(output + 4, s8[6].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[7].val[0]);
  vst1q_s32(output + 4, s8[7].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[8].val[0]);
  vst1q_s32(output + 4, s8[8].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[9].val[0]);
  vst1q_s32(output + 4, s8[9].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[10].val[0]);
  vst1q_s32(output + 4, s8[10].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[11].val[0]);
  vst1q_s32(output + 4, s8[11].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[12].val[0]);
  vst1q_s32(output + 4, s8[12].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[13].val[0]);
  vst1q_s32(output + 4, s8[13].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[14].val[0]);
  vst1q_s32(output + 4, s8[14].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[15].val[0]);
  vst1q_s32(output + 4, s8[15].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[16].val[0]);
  vst1q_s32(output + 4, s8[16].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[17].val[0]);
  vst1q_s32(output + 4, s8[17].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[18].val[0]);
  vst1q_s32(output + 4, s8[18].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[19].val[0]);
  vst1q_s32(output + 4, s8[19].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[20].val[0]);
  vst1q_s32(output + 4, s8[20].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[21].val[0]);
  vst1q_s32(output + 4, s8[21].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[22].val[0]);
  vst1q_s32(output + 4, s8[22].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[23].val[0]);
  vst1q_s32(output + 4, s8[23].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[24].val[0]);
  vst1q_s32(output + 4, s8[24].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[25].val[0]);
  vst1q_s32(output + 4, s8[25].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[26].val[0]);
  vst1q_s32(output + 4, s8[26].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[27].val[0]);
  vst1q_s32(output + 4, s8[27].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[28].val[0]);
  vst1q_s32(output + 4, s8[28].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[29].val[0]);
  vst1q_s32(output + 4, s8[29].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[30].val[0]);
  vst1q_s32(output + 4, s8[30].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[31].val[0]);
  vst1q_s32(output + 4, s8[31].val[1]);
}
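
// Second pass: operates on the 16-wide intermediate rows produced above. All
// 16 inputs may be non-zero here, so the full butterfly is computed, and the
// result is added to dest with bit-depth clamping via
// highbd_idct16x16_add_store().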
static void vpx_highbd_idct32_16_neon(const int32_t *const input,
                                      uint16_t *const output, const int stride,
                                      const int bd) {
  int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
      out[32];

  load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
                             &in[5], &in[6], &in[7]);

  load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
                             &in[12], &in[13], &in[14], &in[15]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64);
  s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64);

  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);

  s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64);
  s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64);

  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
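
  // With all 16 columns live, the four terms skipped in the first pass
  // (s1[17], s1[22], s1[25], s1[30]) are computed here as well.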

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
  s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64);
  s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64);
  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);

  s2[16] = highbd_idct_add_dual(s1[16], s1[17]);
  s2[17] = highbd_idct_sub_dual(s1[16], s1[17]);
  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
  s2[22] = highbd_idct_sub_dual(s1[23], s1[22]);
  s2[23] = highbd_idct_add_dual(s1[22], s1[23]);
  s2[24] = highbd_idct_add_dual(s1[24], s1[25]);
  s2[25] = highbd_idct_sub_dual(s1[24], s1[25]);
  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
  s2[30] = highbd_idct_sub_dual(s1[31], s1[30]);
  s2[31] = highbd_idct_add_dual(s1[30], s1[31]);

  // stage 3
  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
  s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64);
  s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64);

  s3[8] = highbd_idct_add_dual(s2[8], s2[9]);
  s3[9] = highbd_idct_sub_dual(s2[8], s2[9]);
  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
  s3[14] = highbd_idct_sub_dual(s2[15], s2[14]);
  s3[15] = highbd_idct_add_dual(s2[14], s2[15]);

  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64,
                                                         s2[30], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64,
                                                         s2[30], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
                                                         s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
                                                         s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
                                                         s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
                                                         s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64,
                                                         s2[25], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64,
                                                         s2[25], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);

  s4[4] = highbd_idct_add_dual(s3[4], s3[5]);
  s4[5] = highbd_idct_sub_dual(s3[4], s3[5]);
  s4[6] = highbd_idct_sub_dual(s3[7], s3[6]);
  s4[7] = highbd_idct_add_dual(s3[6], s3[7]);

  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64,
                                                        s3[14], cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64,
                                                         s3[14], cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
                                                         s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
                                                         s3[13], cospi_24_64);

  s4[16] = highbd_idct_add_dual(s2[16], s2[19]);
  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
  s4[19] = highbd_idct_sub_dual(s2[16], s2[19]);
  s4[20] = highbd_idct_sub_dual(s2[23], s2[20]);
  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
  s4[23] = highbd_idct_add_dual(s2[20], s2[23]);
  s4[24] = highbd_idct_add_dual(s2[24], s2[27]);
  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
  s4[27] = highbd_idct_sub_dual(s2[24], s2[27]);
  s4[28] = highbd_idct_sub_dual(s2[31], s2[28]);
  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
  s4[31] = highbd_idct_add_dual(s2[28], s2[31]);
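
  // From this point on the butterflies match the first pass; only which
  // earlier-stage values survive differs between the two variants.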

  // stage 5
  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64);

  s5[8] = highbd_idct_add_dual(s3[8], s3[11]);
  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
  s5[11] = highbd_idct_sub_dual(s3[8], s3[11]);
  s5[12] = highbd_idct_sub_dual(s3[15], s3[12]);
  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
  s5[15] = highbd_idct_add_dual(s3[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
                                                         s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
                                                         s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
                                                         s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
                                                         s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
                                                         s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
                                                         s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
                                                         s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
                                                         s4[26], cospi_24_64);

  // stage 6
  s6[0] = highbd_idct_add_dual(s5[0], s4[7]);
  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
  s6[3] = highbd_idct_add_dual(s5[3], s4[4]);
  s6[4] = highbd_idct_sub_dual(s5[3], s4[4]);
  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
  s6[7] = highbd_idct_sub_dual(s5[0], s4[7]);

  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);

  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);

  // stage 7
  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);

  // final stage
  out[0] = highbd_idct_add_dual(s7[0], s6[31]);
  out[1] = highbd_idct_add_dual(s7[1], s6[30]);
  out[2] = highbd_idct_add_dual(s7[2], s6[29]);
  out[3] = highbd_idct_add_dual(s7[3], s6[28]);
  out[4] = highbd_idct_add_dual(s7[4], s7[27]);
  out[5] = highbd_idct_add_dual(s7[5], s7[26]);
  out[6] = highbd_idct_add_dual(s7[6], s7[25]);
  out[7] = highbd_idct_add_dual(s7[7], s7[24]);
  out[8] = highbd_idct_add_dual(s7[8], s7[23]);
  out[9] = highbd_idct_add_dual(s7[9], s7[22]);
  out[10] = highbd_idct_add_dual(s7[10], s7[21]);
  out[11] = highbd_idct_add_dual(s7[11], s7[20]);
  out[12] = highbd_idct_add_dual(s7[12], s6[19]);
  out[13] = highbd_idct_add_dual(s7[13], s6[18]);
  out[14] = highbd_idct_add_dual(s7[14], s6[17]);
  out[15] = highbd_idct_add_dual(s7[15], s6[16]);
  out[16] = highbd_idct_sub_dual(s7[15], s6[16]);
  out[17] = highbd_idct_sub_dual(s7[14], s6[17]);
  out[18] = highbd_idct_sub_dual(s7[13], s6[18]);
  out[19] = highbd_idct_sub_dual(s7[12], s6[19]);
  out[20] = highbd_idct_sub_dual(s7[11], s7[20]);
  out[21] = highbd_idct_sub_dual(s7[10], s7[21]);
  out[22] = highbd_idct_sub_dual(s7[9], s7[22]);
  out[23] = highbd_idct_sub_dual(s7[8], s7[23]);
  out[24] = highbd_idct_sub_dual(s7[7], s7[24]);
  out[25] = highbd_idct_sub_dual(s7[6], s7[25]);
  out[26] = highbd_idct_sub_dual(s7[5], s7[26]);
  out[27] = highbd_idct_sub_dual(s7[4], s7[27]);
  out[28] = highbd_idct_sub_dual(s7[3], s6[28]);
  out[29] = highbd_idct_sub_dual(s7[2], s6[29]);
  out[30] = highbd_idct_sub_dual(s7[1], s6[30]);
  out[31] = highbd_idct_sub_dual(s7[0], s6[31]);

  highbd_idct16x16_add_store(out, output, stride, bd);
  highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
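
// Each first-pass call reads 8 rows of coefficients (rows 0-7, then 8-15) and
// writes 8 columns of the 32x16 intermediate buffer; each second-pass call
// then consumes 8 intermediate rows (t advances by 16 * 8) and produces 8
// columns of the 32x32 output (dest advances by 8). When bd == 8 the cheaper
// int16_t path is used, with the trailing 1 passed to vpx_idct32_16_neon
// flagging the uint16_t destination.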
void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  int i;

  if (bd == 8) {
    int16_t temp[32 * 16];
    int16_t *t = temp;

    vpx_idct32_12_neon(input, temp);
    vpx_idct32_12_neon(input + 32 * 8, temp + 8);

    for (i = 0; i < 32; i += 8) {
      vpx_idct32_16_neon(t, dest, stride, 1);
      t += (16 * 8);
      dest += 8;
    }
  } else {
    int32_t temp[32 * 16];
    int32_t *t = temp;

    vpx_highbd_idct32_12_neon(input, temp);
    vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);

    for (i = 0; i < 32; i += 8) {
      vpx_highbd_idct32_16_neon(t, dest, stride, bd);
      t += (16 * 8);
      dest += 8;
    }
  }
}
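
// Usage sketch (illustrative, not part of this file): callers normally reach
// this function through the vpx_dsp_rtcd.h dispatch as
// vpx_highbd_idct32x32_135_add(), typically selected when the end-of-block
// index is at most 135. A hypothetical direct invocation:
//
//   DECLARE_ALIGNED(16, tran_low_t, coeffs[32 * 32]) = { 0 };
//   coeffs[0] = 1024;  // DC-only block, well within the 135-coefficient case
//   vpx_highbd_idct32x32_135_add_neon(coeffs, dest, stride, 10);  // 10-bit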