fdct32x32_neon.c

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"

// Most gcc 4.9 distributions outside of Android do not generate correct code
// for this function.
#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
    __GNUC__ == 4 && __GNUC_MINOR__ <= 9

void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  vpx_fdct32x32_c(input, output, stride);
}

void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  vpx_fdct32x32_rd_c(input, output, stride);
}

#else

#define LOAD_INCREMENT(src, stride, dest, index) \
  do {                                           \
    dest[index] = vld1q_s16(src);                \
    src += stride;                               \
  } while (0)

#define ADD_S16(src, index0, index1, dest, index3)      \
  do {                                                  \
    dest[index3] = vaddq_s16(src[index0], src[index1]); \
  } while (0)

#define ADD_SHIFT_S16(src, index0, index1)                             \
  do {                                                                 \
    src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
  } while (0)

// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
  const int16_t *a_end = a + 24 * stride;
  int16x8_t c[8];

  LOAD_INCREMENT(a, stride, b, 0);
  LOAD_INCREMENT(a, stride, b, 1);
  LOAD_INCREMENT(a, stride, b, 2);
  LOAD_INCREMENT(a, stride, b, 3);
  LOAD_INCREMENT(a, stride, b, 4);
  LOAD_INCREMENT(a, stride, b, 5);
  LOAD_INCREMENT(a, stride, b, 6);
  LOAD_INCREMENT(a, stride, b, 7);

  LOAD_INCREMENT(a_end, stride, b, 24);
  LOAD_INCREMENT(a_end, stride, b, 25);
  LOAD_INCREMENT(a_end, stride, b, 26);
  LOAD_INCREMENT(a_end, stride, b, 27);
  LOAD_INCREMENT(a_end, stride, b, 28);
  LOAD_INCREMENT(a_end, stride, b, 29);
  LOAD_INCREMENT(a_end, stride, b, 30);
  LOAD_INCREMENT(a_end, stride, b, 31);

  ADD_S16(b, 0, 31, c, 0);
  ADD_S16(b, 1, 30, c, 1);
  ADD_S16(b, 2, 29, c, 2);
  ADD_S16(b, 3, 28, c, 3);
  ADD_S16(b, 4, 27, c, 4);
  ADD_S16(b, 5, 26, c, 5);
  ADD_S16(b, 6, 25, c, 6);
  ADD_S16(b, 7, 24, c, 7);

  ADD_SHIFT_S16(b, 7, 24);
  ADD_SHIFT_S16(b, 6, 25);
  ADD_SHIFT_S16(b, 5, 26);
  ADD_SHIFT_S16(b, 4, 27);
  ADD_SHIFT_S16(b, 3, 28);
  ADD_SHIFT_S16(b, 2, 29);
  ADD_SHIFT_S16(b, 1, 30);
  ADD_SHIFT_S16(b, 0, 31);

  b[0] = vshlq_n_s16(c[0], 2);
  b[1] = vshlq_n_s16(c[1], 2);
  b[2] = vshlq_n_s16(c[2], 2);
  b[3] = vshlq_n_s16(c[3], 2);
  b[4] = vshlq_n_s16(c[4], 2);
  b[5] = vshlq_n_s16(c[5], 2);
  b[6] = vshlq_n_s16(c[6], 2);
  b[7] = vshlq_n_s16(c[7], 2);

  LOAD_INCREMENT(a, stride, b, 8);
  LOAD_INCREMENT(a, stride, b, 9);
  LOAD_INCREMENT(a, stride, b, 10);
  LOAD_INCREMENT(a, stride, b, 11);
  LOAD_INCREMENT(a, stride, b, 12);
  LOAD_INCREMENT(a, stride, b, 13);
  LOAD_INCREMENT(a, stride, b, 14);
  LOAD_INCREMENT(a, stride, b, 15);
  LOAD_INCREMENT(a, stride, b, 16);
  LOAD_INCREMENT(a, stride, b, 17);
  LOAD_INCREMENT(a, stride, b, 18);
  LOAD_INCREMENT(a, stride, b, 19);
  LOAD_INCREMENT(a, stride, b, 20);
  LOAD_INCREMENT(a, stride, b, 21);
  LOAD_INCREMENT(a, stride, b, 22);
  LOAD_INCREMENT(a, stride, b, 23);

  ADD_S16(b, 8, 23, c, 0);
  ADD_S16(b, 9, 22, c, 1);
  ADD_S16(b, 10, 21, c, 2);
  ADD_S16(b, 11, 20, c, 3);
  ADD_S16(b, 12, 19, c, 4);
  ADD_S16(b, 13, 18, c, 5);
  ADD_S16(b, 14, 17, c, 6);
  ADD_S16(b, 15, 16, c, 7);

  ADD_SHIFT_S16(b, 15, 16);
  ADD_SHIFT_S16(b, 14, 17);
  ADD_SHIFT_S16(b, 13, 18);
  ADD_SHIFT_S16(b, 12, 19);
  ADD_SHIFT_S16(b, 11, 20);
  ADD_SHIFT_S16(b, 10, 21);
  ADD_SHIFT_S16(b, 9, 22);
  ADD_SHIFT_S16(b, 8, 23);

  b[8] = vshlq_n_s16(c[0], 2);
  b[9] = vshlq_n_s16(c[1], 2);
  b[10] = vshlq_n_s16(c[2], 2);
  b[11] = vshlq_n_s16(c[3], 2);
  b[12] = vshlq_n_s16(c[4], 2);
  b[13] = vshlq_n_s16(c[5], 2);
  b[14] = vshlq_n_s16(c[6], 2);
  b[15] = vshlq_n_s16(c[7], 2);
}

#undef LOAD_INCREMENT
#undef ADD_S16
#undef ADD_SHIFT_S16

#define STORE_S16(src, index, dest)           \
  do {                                        \
    store_s16q_to_tran_low(dest, src[index]); \
    dest += 8;                                \
  } while (0);

// Store 32 16x8 values, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8.
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  STORE_S16(b, 0, a);
  STORE_S16(b, 8, a);
  STORE_S16(b, 16, a);
  STORE_S16(b, 24, a);
  STORE_S16(b, 1, a);
  STORE_S16(b, 9, a);
  STORE_S16(b, 17, a);
  STORE_S16(b, 25, a);
  STORE_S16(b, 2, a);
  STORE_S16(b, 10, a);
  STORE_S16(b, 18, a);
  STORE_S16(b, 26, a);
  STORE_S16(b, 3, a);
  STORE_S16(b, 11, a);
  STORE_S16(b, 19, a);
  STORE_S16(b, 27, a);
  STORE_S16(b, 4, a);
  STORE_S16(b, 12, a);
  STORE_S16(b, 20, a);
  STORE_S16(b, 28, a);
  STORE_S16(b, 5, a);
  STORE_S16(b, 13, a);
  STORE_S16(b, 21, a);
  STORE_S16(b, 29, a);
  STORE_S16(b, 6, a);
  STORE_S16(b, 14, a);
  STORE_S16(b, 22, a);
  STORE_S16(b, 30, a);
  STORE_S16(b, 7, a);
  STORE_S16(b, 15, a);
  STORE_S16(b, 23, a);
  STORE_S16(b, 31, a);
}

#undef STORE_S16

// fdct_round_shift((a +/- b) * c)
static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
                                       const tran_high_t constant,
                                       int16x8_t *add, int16x8_t *sub) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
  *add = vcombine_s16(rounded0, rounded1);
  *sub = vcombine_s16(rounded2, rounded3);
}

// fdct_round_shift(a * c0 +/- b * c1)
static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
                                       const tran_coef_t constant0,
                                       const tran_coef_t constant1,
                                       int16x8_t *add, int16x8_t *sub) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
  *add = vcombine_s16(rounded0, rounded1);
  *sub = vcombine_s16(rounded2, rounded3);
}

// Add 2 if positive, 1 if negative, and shift by 2.
// In practice, subtract the sign bit, then shift with rounding.
static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
}
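
// First pass of the 2-D transform: run one 8-wide column strip through all
// seven stages of the 32-point forward DCT. Stage 1 (the initial cross
// add/subtract and the << 2 scaling) has already been folded into load()
// above. The final stage applies the first-pass rounding shift
// (sub_round_shift) and writes the results in the transform's interleaved
// output order (out[0], out[16], out[8], out[24], ...).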
static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  a[0] = vaddq_s16(in[0], in[15]);
  a[1] = vaddq_s16(in[1], in[14]);
  a[2] = vaddq_s16(in[2], in[13]);
  a[3] = vaddq_s16(in[3], in[12]);
  a[4] = vaddq_s16(in[4], in[11]);
  a[5] = vaddq_s16(in[5], in[10]);
  a[6] = vaddq_s16(in[6], in[9]);
  a[7] = vaddq_s16(in[7], in[8]);
  a[8] = vsubq_s16(in[7], in[8]);
  a[9] = vsubq_s16(in[6], in[9]);
  a[10] = vsubq_s16(in[5], in[10]);
  a[11] = vsubq_s16(in[4], in[11]);
  a[12] = vsubq_s16(in[3], in[12]);
  a[13] = vsubq_s16(in[2], in[13]);
  a[14] = vsubq_s16(in[1], in[14]);
  a[15] = vsubq_s16(in[0], in[15]);
  a[16] = in[16];
  a[17] = in[17];
  a[18] = in[18];
  a[19] = in[19];
  butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
  butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
  butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
  butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
  a[28] = in[28];
  a[29] = in[29];
  a[30] = in[30];
  a[31] = in[31];

  // Stage 3.
  b[0] = vaddq_s16(a[0], a[7]);
  b[1] = vaddq_s16(a[1], a[6]);
  b[2] = vaddq_s16(a[2], a[5]);
  b[3] = vaddq_s16(a[3], a[4]);
  b[4] = vsubq_s16(a[3], a[4]);
  b[5] = vsubq_s16(a[2], a[5]);
  b[6] = vsubq_s16(a[1], a[6]);
  b[7] = vsubq_s16(a[0], a[7]);
  b[8] = a[8];
  b[9] = a[9];
  butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
  butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
  b[14] = a[14];
  b[15] = a[15];
  b[16] = vaddq_s16(in[16], a[23]);
  b[17] = vaddq_s16(in[17], a[22]);
  b[18] = vaddq_s16(in[18], a[21]);
  b[19] = vaddq_s16(in[19], a[20]);
  b[20] = vsubq_s16(in[19], a[20]);
  b[21] = vsubq_s16(in[18], a[21]);
  b[22] = vsubq_s16(in[17], a[22]);
  b[23] = vsubq_s16(in[16], a[23]);
  b[24] = vsubq_s16(in[31], a[24]);
  b[25] = vsubq_s16(in[30], a[25]);
  b[26] = vsubq_s16(in[29], a[26]);
  b[27] = vsubq_s16(in[28], a[27]);
  b[28] = vaddq_s16(in[28], a[27]);
  b[29] = vaddq_s16(in[29], a[26]);
  b[30] = vaddq_s16(in[30], a[25]);
  b[31] = vaddq_s16(in[31], a[24]);

  // Stage 4.
  a[0] = vaddq_s16(b[0], b[3]);
  a[1] = vaddq_s16(b[1], b[2]);
  a[2] = vsubq_s16(b[1], b[2]);
  a[3] = vsubq_s16(b[0], b[3]);
  a[4] = b[4];
  butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
  a[7] = b[7];
  a[8] = vaddq_s16(b[8], b[11]);
  a[9] = vaddq_s16(b[9], b[10]);
  a[10] = vsubq_s16(b[9], b[10]);
  a[11] = vsubq_s16(b[8], b[11]);
  a[12] = vsubq_s16(b[15], b[12]);
  a[13] = vsubq_s16(b[14], b[13]);
  a[14] = vaddq_s16(b[14], b[13]);
  a[15] = vaddq_s16(b[15], b[12]);
  a[16] = b[16];
  a[17] = b[17];
  butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
  butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
  butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
  butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
  a[22] = b[22];
  a[23] = b[23];
  a[24] = b[24];
  a[25] = b[25];
  a[30] = b[30];
  a[31] = b[31];

  // Stage 5.
  butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
  butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
  b[4] = vaddq_s16(a[4], a[5]);
  b[5] = vsubq_s16(a[4], a[5]);
  b[6] = vsubq_s16(a[7], a[6]);
  b[7] = vaddq_s16(a[7], a[6]);
  b[8] = a[8];
  butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
  butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
  b[11] = a[11];
  b[12] = a[12];
  b[15] = a[15];
  b[16] = vaddq_s16(a[19], a[16]);
  b[17] = vaddq_s16(a[18], a[17]);
  b[18] = vsubq_s16(a[17], a[18]);
  b[19] = vsubq_s16(a[16], a[19]);
  b[20] = vsubq_s16(a[23], a[20]);
  b[21] = vsubq_s16(a[22], a[21]);
  b[22] = vaddq_s16(a[21], a[22]);
  b[23] = vaddq_s16(a[20], a[23]);
  b[24] = vaddq_s16(a[27], a[24]);
  b[25] = vaddq_s16(a[26], a[25]);
  b[26] = vsubq_s16(a[25], a[26]);
  b[27] = vsubq_s16(a[24], a[27]);
  b[28] = vsubq_s16(a[31], a[28]);
  b[29] = vsubq_s16(a[30], a[29]);
  b[30] = vaddq_s16(a[29], a[30]);
  b[31] = vaddq_s16(a[28], a[31]);

  // Stage 6.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
  butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
  a[8] = vaddq_s16(b[8], b[9]);
  a[9] = vsubq_s16(b[8], b[9]);
  a[10] = vsubq_s16(b[11], b[10]);
  a[11] = vaddq_s16(b[11], b[10]);
  a[12] = vaddq_s16(b[12], b[13]);
  a[13] = vsubq_s16(b[12], b[13]);
  a[14] = vsubq_s16(b[15], b[14]);
  a[15] = vaddq_s16(b[15], b[14]);
  a[16] = b[16];
  a[19] = b[19];
  a[20] = b[20];
  a[23] = b[23];
  a[24] = b[24];
  a[27] = b[27];
  a[28] = b[28];
  a[31] = b[31];
  butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
  butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
  butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
  butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);

  // Stage 7.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  b[4] = a[4];
  b[5] = a[5];
  b[6] = a[6];
  b[7] = a[7];
  butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
  butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
  butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
  butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
  b[16] = vaddq_s16(a[16], a[17]);
  b[17] = vsubq_s16(a[16], a[17]);
  b[18] = vsubq_s16(a[19], a[18]);
  b[19] = vaddq_s16(a[19], a[18]);
  b[20] = vaddq_s16(a[20], a[21]);
  b[21] = vsubq_s16(a[20], a[21]);
  b[22] = vsubq_s16(a[23], a[22]);
  b[23] = vaddq_s16(a[23], a[22]);
  b[24] = vaddq_s16(a[24], a[25]);
  b[25] = vsubq_s16(a[24], a[25]);
  b[26] = vsubq_s16(a[27], a[26]);
  b[27] = vaddq_s16(a[27], a[26]);
  b[28] = vaddq_s16(a[28], a[29]);
  b[29] = vsubq_s16(a[28], a[29]);
  b[30] = vsubq_s16(a[31], a[30]);
  b[31] = vaddq_s16(a[31], a[30]);

  // Final stage.
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  out[0] = sub_round_shift(b[0]);
  out[16] = sub_round_shift(b[1]);
  out[8] = sub_round_shift(b[2]);
  out[24] = sub_round_shift(b[3]);
  out[4] = sub_round_shift(b[4]);
  out[20] = sub_round_shift(b[5]);
  out[12] = sub_round_shift(b[6]);
  out[28] = sub_round_shift(b[7]);
  out[2] = sub_round_shift(b[8]);
  out[18] = sub_round_shift(b[9]);
  out[10] = sub_round_shift(b[10]);
  out[26] = sub_round_shift(b[11]);
  out[6] = sub_round_shift(b[12]);
  out[22] = sub_round_shift(b[13]);
  out[14] = sub_round_shift(b[14]);
  out[30] = sub_round_shift(b[15]);

  butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
  out[1] = sub_round_shift(a[1]);
  out[31] = sub_round_shift(a[31]);
  butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
  out[17] = sub_round_shift(a[17]);
  out[15] = sub_round_shift(a[15]);
  butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
  out[9] = sub_round_shift(a[9]);
  out[23] = sub_round_shift(a[23]);
  butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
  out[25] = sub_round_shift(a[25]);
  out[7] = sub_round_shift(a[7]);
  butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
  out[5] = sub_round_shift(a[5]);
  out[27] = sub_round_shift(a[27]);
  butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
  out[21] = sub_round_shift(a[21]);
  out[11] = sub_round_shift(a[11]);
  butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
  out[13] = sub_round_shift(a[13]);
  out[19] = sub_round_shift(a[19]);
  butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
  out[29] = sub_round_shift(a[29]);
  out[3] = sub_round_shift(a[3]);
}
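
// Helpers for the second pass, which needs 32-bit intermediates from Stage 3
// onward. Each 16x8 vector is carried as a pair of int32x4_t values held in
// parallel ##_lo / ##_hi arrays; the macros below keep that pairing in one
// place so the second-pass transform body reads much like the 16-bit version.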
#define PASS_THROUGH(src, dst, element)    \
  do {                                     \
    dst##_lo[element] = src##_lo[element]; \
    dst##_hi[element] = src##_hi[element]; \
  } while (0)

#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
  do {                                                                       \
    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
  } while (0)

#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
  do {                                                                     \
    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
  } while (0)

#define ADD_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define SUB_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

// Like butterfly_one_coeff, but don't narrow results.
static INLINE void butterfly_one_coeff_s16_s32(
    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
    int32x4_t *sub_hi) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
                              add_index, sub_index)                      \
  do {                                                                   \
    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
                                &b##_lo[add_index], &b##_hi[add_index],  \
                                &b##_lo[sub_index], &b##_hi[sub_index]); \
  } while (0)

// Like butterfly_one_coeff, but with s32.
static INLINE void butterfly_one_coeff_s32(
    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
                          sub_index)                                          \
  do {                                                                        \
    butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
                            a##_lo[right_index], a##_hi[right_index],         \
                            constant, &b##_lo[add_index], &b##_hi[add_index], \
                            &b##_lo[sub_index], &b##_hi[sub_index]);          \
  } while (0)

// Like butterfly_two_coeff, but with s32.
static INLINE void butterfly_two_coeff_s32(
    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
    int32x4_t *sub_hi) {
  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,            \
                          right_constant, b, add_index, sub_index)              \
  do {                                                                          \
    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],             \
                            a##_lo[right_index], a##_hi[right_index],           \
                            left_constant, right_constant, &b##_lo[add_index],  \
                            &b##_hi[add_index], &b##_lo[sub_index],             \
                            &b##_hi[sub_index]);                                \
  } while (0)

// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
                                            const int32x4_t a_hi) {
  const int32x4_t one = vdupq_n_s32(1);
  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
  const int16x4_t b_lo =
      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
  const int16x4_t b_hi =
      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
  return vcombine_s16(b_lo, b_hi);
}
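
// Second pass of the 2-D transform, applied to the rows produced by the first
// pass. The flow matches dct_body_first_pass, but from Stage 3 onward the
// intermediates are widened to 32 bits (the c_lo/c_hi and d_lo/d_hi arrays)
// because the accumulated sums can overflow int16_t. The final stage rolls
// the rounding shift into add_round_shift_s32 so int16x8_t values are
// returned.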
static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];
  int32x4_t c_lo[32];
  int32x4_t c_hi[32];
  int32x4_t d_lo[32];
  int32x4_t d_hi[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);
  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);
  b[16] = a[16];
  b[17] = a[17];
  b[18] = a[18];
  b[19] = a[19];
  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
  b[28] = a[28];
  b[29] = a[29];
  b[30] = a[30];
  b[31] = a[31];

  // Stage 3. With extreme values for input this calculation rolls over
  // int16_t. The sources for b[0] get added multiple times and, through
  // testing, have been shown to overflow starting here.
  ADD_S16_S32(b, 0, 7, c, 0);
  ADD_S16_S32(b, 1, 6, c, 1);
  ADD_S16_S32(b, 2, 5, c, 2);
  ADD_S16_S32(b, 3, 4, c, 3);
  SUB_S16_S32(b, 3, 4, c, 4);
  SUB_S16_S32(b, 2, 5, c, 5);
  SUB_S16_S32(b, 1, 6, c, 6);
  SUB_S16_S32(b, 0, 7, c, 7);
  a[8] = b[8];
  a[9] = b[9];
  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
  a[14] = b[14];
  a[15] = b[15];
  ADD_S16_S32(b, 16, 23, c, 16);
  ADD_S16_S32(b, 17, 22, c, 17);
  ADD_S16_S32(b, 18, 21, c, 18);
  ADD_S16_S32(b, 19, 20, c, 19);
  SUB_S16_S32(b, 19, 20, c, 20);
  SUB_S16_S32(b, 18, 21, c, 21);
  SUB_S16_S32(b, 17, 22, c, 22);
  SUB_S16_S32(b, 16, 23, c, 23);
  SUB_S16_S32(b, 31, 24, c, 24);
  SUB_S16_S32(b, 30, 25, c, 25);
  SUB_S16_S32(b, 29, 26, c, 26);
  SUB_S16_S32(b, 28, 27, c, 27);
  ADD_S16_S32(b, 28, 27, c, 28);
  ADD_S16_S32(b, 29, 26, c, 29);
  ADD_S16_S32(b, 30, 25, c, 30);
  ADD_S16_S32(b, 31, 24, c, 31);

  // Stage 4.
  ADD_S32(c, 0, 3, d, 0);
  ADD_S32(c, 1, 2, d, 1);
  SUB_S32(c, 1, 2, d, 2);
  SUB_S32(c, 0, 3, d, 3);
  PASS_THROUGH(c, d, 4);
  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
  PASS_THROUGH(c, d, 7);
  ADDW_S16_S32(c, 11, a, 8, d, 8);
  ADDW_S16_S32(c, 10, a, 9, d, 9);
  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
  ADDW_S16_S32(c, 13, b, 14, d, 14);
  ADDW_S16_S32(c, 12, b, 15, d, 15);
  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 17);
  BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
  BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
  BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
  PASS_THROUGH(c, d, 22);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 25);
  PASS_THROUGH(c, d, 30);
  PASS_THROUGH(c, d, 31);

  // Stage 5.
  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
  BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
  ADD_S32(d, 4, 5, c, 4);
  SUB_S32(d, 4, 5, c, 5);
  SUB_S32(d, 7, 6, c, 6);
  ADD_S32(d, 7, 6, c, 7);
  PASS_THROUGH(d, c, 8);
  BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
  BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
  PASS_THROUGH(d, c, 11);
  PASS_THROUGH(d, c, 12);
  PASS_THROUGH(d, c, 15);
  ADD_S32(d, 16, 19, c, 16);
  ADD_S32(d, 17, 18, c, 17);
  SUB_S32(d, 17, 18, c, 18);
  SUB_S32(d, 16, 19, c, 19);
  SUB_S32(d, 23, 20, c, 20);
  SUB_S32(d, 22, 21, c, 21);
  ADD_S32(d, 22, 21, c, 22);
  ADD_S32(d, 23, 20, c, 23);
  ADD_S32(d, 24, 27, c, 24);
  ADD_S32(d, 25, 26, c, 25);
  SUB_S32(d, 25, 26, c, 26);
  SUB_S32(d, 24, 27, c, 27);
  SUB_S32(d, 31, 28, c, 28);
  SUB_S32(d, 30, 29, c, 29);
  ADD_S32(d, 30, 29, c, 30);
  ADD_S32(d, 31, 28, c, 31);

  // Stage 6.
  PASS_THROUGH(c, d, 0);
  PASS_THROUGH(c, d, 1);
  PASS_THROUGH(c, d, 2);
  PASS_THROUGH(c, d, 3);
  BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
  BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
  ADD_S32(c, 8, 9, d, 8);
  SUB_S32(c, 8, 9, d, 9);
  SUB_S32(c, 11, 10, d, 10);
  ADD_S32(c, 11, 10, d, 11);
  ADD_S32(c, 12, 13, d, 12);
  SUB_S32(c, 12, 13, d, 13);
  SUB_S32(c, 15, 14, d, 14);
  ADD_S32(c, 15, 14, d, 15);
  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 19);
  PASS_THROUGH(c, d, 20);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 27);
  PASS_THROUGH(c, d, 28);
  PASS_THROUGH(c, d, 31);
  BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
  BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
  BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);

  // Stage 7.
  PASS_THROUGH(d, c, 0);
  PASS_THROUGH(d, c, 1);
  PASS_THROUGH(d, c, 2);
  PASS_THROUGH(d, c, 3);
  PASS_THROUGH(d, c, 4);
  PASS_THROUGH(d, c, 5);
  PASS_THROUGH(d, c, 6);
  PASS_THROUGH(d, c, 7);
  BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
  BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
  BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
  ADD_S32(d, 16, 17, c, 16);
  SUB_S32(d, 16, 17, c, 17);
  SUB_S32(d, 19, 18, c, 18);
  ADD_S32(d, 19, 18, c, 19);
  ADD_S32(d, 20, 21, c, 20);
  SUB_S32(d, 20, 21, c, 21);
  SUB_S32(d, 23, 22, c, 22);
  ADD_S32(d, 23, 22, c, 23);
  ADD_S32(d, 24, 25, c, 24);
  SUB_S32(d, 24, 25, c, 25);
  SUB_S32(d, 27, 26, c, 26);
  ADD_S32(d, 27, 26, c, 27);
  ADD_S32(d, 28, 29, c, 28);
  SUB_S32(d, 28, 29, c, 29);
  SUB_S32(d, 31, 30, c, 30);
  ADD_S32(d, 31, 30, c, 31);

  // Final stage.
  // Roll rounding into this function so we can pass back int16x8.
  out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
  out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
  out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
  out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
  out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
  out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
  out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
  out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
  out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
  out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
  out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
  out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
  out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
  out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
  out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
  out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);

  BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
  out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
  out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
  BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
  out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
  out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
  BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
  out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
  out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
  BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
  out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
  out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
  BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
  out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
  out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
  out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
  out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
  BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
  out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
  out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
  BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
  out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
  out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
}
// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
  const int16x8_t one = vdupq_n_s16(1);
  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
}
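
// Second pass for the "_rd" variant, matching vpx_fdct32x32_rd_c: instead of
// widening to 32 bits, every value is rounded down by 2 bits after Stage 2
// (see add_round_shift_s16 above) so the rest of the transform stays within
// int16_t.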
static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  // For the "rd" version, all the values are rounded down after stage 2 to
  // keep the values in 16 bits.
  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
  b[16] = add_round_shift_s16(a[16]);
  b[17] = add_round_shift_s16(a[17]);
  b[18] = add_round_shift_s16(a[18]);
  b[19] = add_round_shift_s16(a[19]);
  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
  b[20] = add_round_shift_s16(b[20]);
  b[21] = add_round_shift_s16(b[21]);
  b[22] = add_round_shift_s16(b[22]);
  b[23] = add_round_shift_s16(b[23]);
  b[24] = add_round_shift_s16(b[24]);
  b[25] = add_round_shift_s16(b[25]);
  b[26] = add_round_shift_s16(b[26]);
  b[27] = add_round_shift_s16(b[27]);
  b[28] = add_round_shift_s16(a[28]);
  b[29] = add_round_shift_s16(a[29]);
  b[30] = add_round_shift_s16(a[30]);
  b[31] = add_round_shift_s16(a[31]);

  // Stage 3.
  a[0] = vaddq_s16(b[0], b[7]);
  a[1] = vaddq_s16(b[1], b[6]);
  a[2] = vaddq_s16(b[2], b[5]);
  a[3] = vaddq_s16(b[3], b[4]);
  a[4] = vsubq_s16(b[3], b[4]);
  a[5] = vsubq_s16(b[2], b[5]);
  a[6] = vsubq_s16(b[1], b[6]);
  a[7] = vsubq_s16(b[0], b[7]);
  a[8] = b[8];
  a[9] = b[9];
  butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
  butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
  a[14] = b[14];
  a[15] = b[15];
  a[16] = vaddq_s16(b[16], b[23]);
  a[17] = vaddq_s16(b[17], b[22]);
  a[18] = vaddq_s16(b[18], b[21]);
  a[19] = vaddq_s16(b[19], b[20]);
  a[20] = vsubq_s16(b[19], b[20]);
  a[21] = vsubq_s16(b[18], b[21]);
  a[22] = vsubq_s16(b[17], b[22]);
  a[23] = vsubq_s16(b[16], b[23]);
  a[24] = vsubq_s16(b[31], b[24]);
  a[25] = vsubq_s16(b[30], b[25]);
  a[26] = vsubq_s16(b[29], b[26]);
  a[27] = vsubq_s16(b[28], b[27]);
  a[28] = vaddq_s16(b[28], b[27]);
  a[29] = vaddq_s16(b[29], b[26]);
  a[30] = vaddq_s16(b[30], b[25]);
  a[31] = vaddq_s16(b[31], b[24]);

  // Stage 4.
  b[0] = vaddq_s16(a[0], a[3]);
  b[1] = vaddq_s16(a[1], a[2]);
  b[2] = vsubq_s16(a[1], a[2]);
  b[3] = vsubq_s16(a[0], a[3]);
  b[4] = a[4];
  butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
  b[7] = a[7];
  b[8] = vaddq_s16(a[8], a[11]);
  b[9] = vaddq_s16(a[9], a[10]);
  b[10] = vsubq_s16(a[9], a[10]);
  b[11] = vsubq_s16(a[8], a[11]);
  b[12] = vsubq_s16(a[15], a[12]);
  b[13] = vsubq_s16(a[14], a[13]);
  b[14] = vaddq_s16(a[14], a[13]);
  b[15] = vaddq_s16(a[15], a[12]);
  b[16] = a[16];
  b[17] = a[17];
  butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
  butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
  butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
  butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
  b[22] = a[22];
  b[23] = a[23];
  b[24] = a[24];
  b[25] = a[25];
  b[30] = a[30];
  b[31] = a[31];

  // Stage 5.
  butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
  butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
  a[4] = vaddq_s16(b[4], b[5]);
  a[5] = vsubq_s16(b[4], b[5]);
  a[6] = vsubq_s16(b[7], b[6]);
  a[7] = vaddq_s16(b[7], b[6]);
  a[8] = b[8];
  butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
  butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
  a[11] = b[11];
  a[12] = b[12];
  a[15] = b[15];
  a[16] = vaddq_s16(b[19], b[16]);
  a[17] = vaddq_s16(b[18], b[17]);
  a[18] = vsubq_s16(b[17], b[18]);
  a[19] = vsubq_s16(b[16], b[19]);
  a[20] = vsubq_s16(b[23], b[20]);
  a[21] = vsubq_s16(b[22], b[21]);
  a[22] = vaddq_s16(b[21], b[22]);
  a[23] = vaddq_s16(b[20], b[23]);
  a[24] = vaddq_s16(b[27], b[24]);
  a[25] = vaddq_s16(b[26], b[25]);
  a[26] = vsubq_s16(b[25], b[26]);
  a[27] = vsubq_s16(b[24], b[27]);
  a[28] = vsubq_s16(b[31], b[28]);
  a[29] = vsubq_s16(b[30], b[29]);
  a[30] = vaddq_s16(b[29], b[30]);
  a[31] = vaddq_s16(b[28], b[31]);

  // Stage 6.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
  butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
  b[8] = vaddq_s16(a[8], a[9]);
  b[9] = vsubq_s16(a[8], a[9]);
  b[10] = vsubq_s16(a[11], a[10]);
  b[11] = vaddq_s16(a[11], a[10]);
  b[12] = vaddq_s16(a[12], a[13]);
  b[13] = vsubq_s16(a[12], a[13]);
  b[14] = vsubq_s16(a[15], a[14]);
  b[15] = vaddq_s16(a[15], a[14]);
  b[16] = a[16];
  b[19] = a[19];
  b[20] = a[20];
  b[23] = a[23];
  b[24] = a[24];
  b[27] = a[27];
  b[28] = a[28];
  b[31] = a[31];
  butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
  butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
  butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
  butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);

  // Stage 7.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  a[4] = b[4];
  a[5] = b[5];
  a[6] = b[6];
  a[7] = b[7];
  butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
  butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
  butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
  butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
  a[16] = vaddq_s16(b[16], b[17]);
  a[17] = vsubq_s16(b[16], b[17]);
  a[18] = vsubq_s16(b[19], b[18]);
  a[19] = vaddq_s16(b[19], b[18]);
  a[20] = vaddq_s16(b[20], b[21]);
  a[21] = vsubq_s16(b[20], b[21]);
  a[22] = vsubq_s16(b[23], b[22]);
  a[23] = vaddq_s16(b[23], b[22]);
  a[24] = vaddq_s16(b[24], b[25]);
  a[25] = vsubq_s16(b[24], b[25]);
  a[26] = vsubq_s16(b[27], b[26]);
  a[27] = vaddq_s16(b[27], b[26]);
  a[28] = vaddq_s16(b[28], b[29]);
  a[29] = vsubq_s16(b[28], b[29]);
  a[30] = vsubq_s16(b[31], b[30]);
  a[31] = vaddq_s16(b[31], b[30]);

  // Final stage.
  out[0] = a[0];
  out[16] = a[1];
  out[8] = a[2];
  out[24] = a[3];
  out[4] = a[4];
  out[20] = a[5];
  out[12] = a[6];
  out[28] = a[7];
  out[2] = a[8];
  out[18] = a[9];
  out[10] = a[10];
  out[26] = a[11];
  out[6] = a[12];
  out[22] = a[13];
  out[14] = a[14];
  out[30] = a[15];

  butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
  butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
                      &out[15]);
  butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
  butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
  butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
  butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
                      &out[11]);
  butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
                      &out[19]);
  butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
}
#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32

// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
// are all in-place.
// TODO(johannkoenig): share with other fdcts.
static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
  // Swap 16 bit elements.
  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements.
  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));
  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
                                   vreinterpretq_s32_s16(c3.val[0]));
  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
                                   vreinterpretq_s32_s16(c3.val[1]));

  // Swap 64 bit elements.
  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);

  b[0] = e0.val[0];
  b[1] = e1.val[0];
  b[2] = e2.val[0];
  b[3] = e3.val[0];
  b[4] = e0.val[1];
  b[5] = e1.val[1];
  b[6] = e2.val[1];
  b[7] = e3.val[1];
}
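
// The 32x32 transform is computed in two passes over four 8x32 column strips.
// Each strip is run through dct_body_first_pass, the four results are
// transposed together into 8x32 row strips, and each row strip is run through
// dct_body_second_pass (or the _rd variant below) before a final transpose
// and a store with stride 32.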
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  dct_body_first_pass(temp0, temp1);

  load(input + 8, stride, temp0);
  dct_body_first_pass(temp0, temp2);

  load(input + 16, stride, temp0);
  dct_body_first_pass(temp0, temp3);

  load(input + 24, stride, temp0);
  dct_body_first_pass(temp0, temp4);

  // Generate the top row by munging the first set of 8 from each one
  // together.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output, temp5);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 8 * 32, temp5);

  // Third row of 8x32.
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 16 * 32, temp5);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 24 * 32, temp5);
}
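
// Identical control flow to vpx_fdct32x32_neon; only the second-pass body
// differs (dct_body_second_pass_rd keeps everything in 16 bits).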
void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  dct_body_first_pass(temp0, temp1);

  load(input + 8, stride, temp0);
  dct_body_first_pass(temp0, temp2);

  load(input + 16, stride, temp0);
  dct_body_first_pass(temp0, temp3);

  load(input + 24, stride, temp0);
  dct_body_first_pass(temp0, temp4);

  // Generate the top row by munging the first set of 8 from each one
  // together.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output, temp5);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 8 * 32, temp5);

  // Third row of 8x32.
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 16 * 32, temp5);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 24 * 32, temp5);
}

#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
        // __GNUC__ == 4 && __GNUC_MINOR__ <= 9