fdct32x32_vsx.c

/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_dsp/ppc/transpose_vsx.h"
#include "vpx_dsp/ppc/txfm_common_vsx.h"
#include "vpx_dsp/ppc/types_vsx.h"
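
// 32x32 forward DCT for PowerPC VSX. Each int16x8_t vector carries eight
// transform columns (first pass) or eight rows (second pass) that are
// processed in parallel; the full transform is computed in two passes over
// 8x32 blocks (see vpx_fdct32x32_rd_vsx below).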

// Returns ((a +/- b) * cospi16 + (1 << 13)) >> 14.
static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
                                    int16x8_t *sub) {
  // Since a + b can overflow 16 bits, the multiplication is distributed
  // (a * c +/- b * c).
  const int32x4_t ac_e = vec_mule(a, cospi16_v);
  const int32x4_t ac_o = vec_mulo(a, cospi16_v);
  const int32x4_t bc_e = vec_mule(b, cospi16_v);
  const int32x4_t bc_o = vec_mulo(b, cospi16_v);

  // Reuse the same multiplies for sum and difference.
  const int32x4_t sum_e = vec_add(ac_e, bc_e);
  const int32x4_t sum_o = vec_add(ac_o, bc_o);
  const int32x4_t diff_e = vec_sub(ac_e, bc_e);
  const int32x4_t diff_o = vec_sub(ac_o, bc_o);

  // Add rounding offset
  const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
  const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
  const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
  const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);

  const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
  const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
  const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
  const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);

  // There's no pack operation for even and odd, so we need to permute.
  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
}

// Returns (a * c1 + b * c2 + (1 << 13)) >> 14 in *add and
// (a * c2 - b * c1 + (1 << 13)) >> 14 in *sub.
static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b,
                                    int16x8_t c2, int16x8_t *add,
                                    int16x8_t *sub) {
  const int32x4_t ac1_o = vec_mulo(a, c1);
  const int32x4_t ac1_e = vec_mule(a, c1);
  const int32x4_t ac2_o = vec_mulo(a, c2);
  const int32x4_t ac2_e = vec_mule(a, c2);

  const int32x4_t bc1_o = vec_mulo(b, c1);
  const int32x4_t bc1_e = vec_mule(b, c1);
  const int32x4_t bc2_o = vec_mulo(b, c2);
  const int32x4_t bc2_e = vec_mule(b, c2);

  const int32x4_t sum_o = vec_add(ac1_o, bc2_o);
  const int32x4_t sum_e = vec_add(ac1_e, bc2_e);
  const int32x4_t diff_o = vec_sub(ac2_o, bc1_o);
  const int32x4_t diff_e = vec_sub(ac2_e, bc1_e);

  // Add rounding offset
  const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
  const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
  const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
  const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);

  const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
  const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
  const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
  const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);

  // There's no pack operation for even and odd, so we need to permute.
  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
}

// While other architectures combine the load and the stage 1 operations,
// Power9 benchmarking shows no benefit in such an approach.
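// In addition to loading, each row is shifted left by vec_dct_scale_log2,
// which is the input scaling applied before the first pass of the transform.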
static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
  // Tried out different combinations of load and shift instructions, this is
  // the fastest one.
  {
    const int16x8_t l0 = vec_vsx_ld(0, a);
    const int16x8_t l1 = vec_vsx_ld(0, a + stride);
    const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride);
    const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride);
    const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride);
    const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride);
    const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride);
    const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride);
    const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride);
    const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride);
    const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride);
    const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride);
    const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride);
    const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride);
    const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride);
    const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride);

    b[0] = vec_sl(l0, vec_dct_scale_log2);
    b[1] = vec_sl(l1, vec_dct_scale_log2);
    b[2] = vec_sl(l2, vec_dct_scale_log2);
    b[3] = vec_sl(l3, vec_dct_scale_log2);
    b[4] = vec_sl(l4, vec_dct_scale_log2);
    b[5] = vec_sl(l5, vec_dct_scale_log2);
    b[6] = vec_sl(l6, vec_dct_scale_log2);
    b[7] = vec_sl(l7, vec_dct_scale_log2);
    b[8] = vec_sl(l8, vec_dct_scale_log2);
    b[9] = vec_sl(l9, vec_dct_scale_log2);
    b[10] = vec_sl(l10, vec_dct_scale_log2);
    b[11] = vec_sl(l11, vec_dct_scale_log2);
    b[12] = vec_sl(l12, vec_dct_scale_log2);
    b[13] = vec_sl(l13, vec_dct_scale_log2);
    b[14] = vec_sl(l14, vec_dct_scale_log2);
    b[15] = vec_sl(l15, vec_dct_scale_log2);
  }
  {
    const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride);
    const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride);
    const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride);
    const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride);
    const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride);
    const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride);
    const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride);
    const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride);
    const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride);
    const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride);
    const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride);
    const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride);
    const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride);
    const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride);
    const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride);
    const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride);

    b[16] = vec_sl(l16, vec_dct_scale_log2);
    b[17] = vec_sl(l17, vec_dct_scale_log2);
    b[18] = vec_sl(l18, vec_dct_scale_log2);
    b[19] = vec_sl(l19, vec_dct_scale_log2);
    b[20] = vec_sl(l20, vec_dct_scale_log2);
    b[21] = vec_sl(l21, vec_dct_scale_log2);
    b[22] = vec_sl(l22, vec_dct_scale_log2);
    b[23] = vec_sl(l23, vec_dct_scale_log2);
    b[24] = vec_sl(l24, vec_dct_scale_log2);
    b[25] = vec_sl(l25, vec_dct_scale_log2);
    b[26] = vec_sl(l26, vec_dct_scale_log2);
    b[27] = vec_sl(l27, vec_dct_scale_log2);
    b[28] = vec_sl(l28, vec_dct_scale_log2);
    b[29] = vec_sl(l29, vec_dct_scale_log2);
    b[30] = vec_sl(l30, vec_dct_scale_log2);
    b[31] = vec_sl(l31, vec_dct_scale_log2);
  }
}
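
// Store the 32 vectors of one transformed 8x32 block as eight contiguous
// rows of 32 coefficients: output row i is assembled from b[i], b[i + 8],
// b[i + 16] and b[i + 24].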
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  vec_vsx_st(b[0], 0, a);
  vec_vsx_st(b[8], 0, a + 8);
  vec_vsx_st(b[16], 0, a + 16);
  vec_vsx_st(b[24], 0, a + 24);

  vec_vsx_st(b[1], 0, a + 32);
  vec_vsx_st(b[9], 0, a + 40);
  vec_vsx_st(b[17], 0, a + 48);
  vec_vsx_st(b[25], 0, a + 56);

  vec_vsx_st(b[2], 0, a + 64);
  vec_vsx_st(b[10], 0, a + 72);
  vec_vsx_st(b[18], 0, a + 80);
  vec_vsx_st(b[26], 0, a + 88);

  vec_vsx_st(b[3], 0, a + 96);
  vec_vsx_st(b[11], 0, a + 104);
  vec_vsx_st(b[19], 0, a + 112);
  vec_vsx_st(b[27], 0, a + 120);

  vec_vsx_st(b[4], 0, a + 128);
  vec_vsx_st(b[12], 0, a + 136);
  vec_vsx_st(b[20], 0, a + 144);
  vec_vsx_st(b[28], 0, a + 152);

  vec_vsx_st(b[5], 0, a + 160);
  vec_vsx_st(b[13], 0, a + 168);
  vec_vsx_st(b[21], 0, a + 176);
  vec_vsx_st(b[29], 0, a + 184);

  vec_vsx_st(b[6], 0, a + 192);
  vec_vsx_st(b[14], 0, a + 200);
  vec_vsx_st(b[22], 0, a + 208);
  vec_vsx_st(b[30], 0, a + 216);

  vec_vsx_st(b[7], 0, a + 224);
  vec_vsx_st(b[15], 0, a + 232);
  vec_vsx_st(b[23], 0, a + 240);
  vec_vsx_st(b[31], 0, a + 248);
}

// Returns 1 if negative, 0 if positive.
static INLINE int16x8_t vec_sign_s16(int16x8_t a) {
  return vec_sr(a, vec_shift_sign_s16);
}

// Add 2 if positive, 1 if negative, and shift by 2.
static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
  const int16x8_t sign = vec_sign_s16(a);
  return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2);
}

// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
  const int16x8_t sign = vec_sign_s16(a);
  return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
}
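
// 32-point forward DCT applied to eight columns at a time. pass 0 operates
// on the scaled input columns and applies sub_round_shift() to every output;
// pass 1 operates on the transposed intermediate rows and applies
// add_round_shift_s16() after stage 2 to keep the values within 16 bits.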
static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
  int16x8_t temp0[32];  // Hold stages: 1, 4, 7
  int16x8_t temp1[32];  // Hold stages: 2, 5
  int16x8_t temp2[32];  // Hold stages: 3, 6
  int i;

  // Stage 1
  // Unrolling this loop actually slows down Power9 benchmarks.
  for (i = 0; i < 16; i++) {
    temp0[i] = vec_add(in[i], in[31 - i]);
    // pass through to stage 3.
    temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
  }

  // Stage 2
  // Unrolling this loop actually slows down Power9 benchmarks.
  for (i = 0; i < 8; i++) {
    temp1[i] = vec_add(temp0[i], temp0[15 - i]);
    temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
  }

  // Apply butterflies (in place) on pass through to stage 3.
  single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]);
  single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]);
  single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]);
  single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]);

  // In the second pass, divide the magnitude by 4 so that the intermediate
  // values stay within the range of 16 bits.
  if (pass) {
    temp1[0] = add_round_shift_s16(temp1[0]);
    temp1[1] = add_round_shift_s16(temp1[1]);
    temp1[2] = add_round_shift_s16(temp1[2]);
    temp1[3] = add_round_shift_s16(temp1[3]);
    temp1[4] = add_round_shift_s16(temp1[4]);
    temp1[5] = add_round_shift_s16(temp1[5]);
    temp1[6] = add_round_shift_s16(temp1[6]);
    temp1[7] = add_round_shift_s16(temp1[7]);
    temp1[8] = add_round_shift_s16(temp1[8]);
    temp1[9] = add_round_shift_s16(temp1[9]);
    temp1[10] = add_round_shift_s16(temp1[10]);
    temp1[11] = add_round_shift_s16(temp1[11]);
    temp1[12] = add_round_shift_s16(temp1[12]);
    temp1[13] = add_round_shift_s16(temp1[13]);
    temp1[14] = add_round_shift_s16(temp1[14]);
    temp1[15] = add_round_shift_s16(temp1[15]);
    temp1[16] = add_round_shift_s16(temp1[16]);
    temp1[17] = add_round_shift_s16(temp1[17]);
    temp1[18] = add_round_shift_s16(temp1[18]);
    temp1[19] = add_round_shift_s16(temp1[19]);
    temp1[20] = add_round_shift_s16(temp1[20]);
    temp1[21] = add_round_shift_s16(temp1[21]);
    temp1[22] = add_round_shift_s16(temp1[22]);
    temp1[23] = add_round_shift_s16(temp1[23]);
    temp1[24] = add_round_shift_s16(temp1[24]);
    temp1[25] = add_round_shift_s16(temp1[25]);
    temp1[26] = add_round_shift_s16(temp1[26]);
    temp1[27] = add_round_shift_s16(temp1[27]);
    temp1[28] = add_round_shift_s16(temp1[28]);
    temp1[29] = add_round_shift_s16(temp1[29]);
    temp1[30] = add_round_shift_s16(temp1[30]);
    temp1[31] = add_round_shift_s16(temp1[31]);
  }

  // Stage 3
  temp2[0] = vec_add(temp1[0], temp1[7]);
  temp2[1] = vec_add(temp1[1], temp1[6]);
  temp2[2] = vec_add(temp1[2], temp1[5]);
  temp2[3] = vec_add(temp1[3], temp1[4]);
  temp2[5] = vec_sub(temp1[2], temp1[5]);
  temp2[6] = vec_sub(temp1[1], temp1[6]);
  temp2[8] = temp1[8];
  temp2[9] = temp1[9];
  single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]);
  single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]);
  temp2[14] = temp1[14];
  temp2[15] = temp1[15];
  temp2[18] = vec_add(temp1[18], temp1[21]);
  temp2[19] = vec_add(temp1[19], temp1[20]);
  temp2[20] = vec_sub(temp1[19], temp1[20]);
  temp2[21] = vec_sub(temp1[18], temp1[21]);
  temp2[26] = vec_sub(temp1[29], temp1[26]);
  temp2[27] = vec_sub(temp1[28], temp1[27]);
  temp2[28] = vec_add(temp1[28], temp1[27]);
  temp2[29] = vec_add(temp1[29], temp1[26]);

  // Pass through Stage 4
  temp0[7] = vec_sub(temp1[0], temp1[7]);
  temp0[4] = vec_sub(temp1[3], temp1[4]);
  temp0[16] = vec_add(temp1[16], temp1[23]);
  temp0[17] = vec_add(temp1[17], temp1[22]);
  temp0[22] = vec_sub(temp1[17], temp1[22]);
  temp0[23] = vec_sub(temp1[16], temp1[23]);
  temp0[24] = vec_sub(temp1[31], temp1[24]);
  temp0[25] = vec_sub(temp1[30], temp1[25]);
  temp0[30] = vec_add(temp1[30], temp1[25]);
  temp0[31] = vec_add(temp1[31], temp1[24]);

  // Stage 4
  temp0[0] = vec_add(temp2[0], temp2[3]);
  temp0[1] = vec_add(temp2[1], temp2[2]);
  temp0[2] = vec_sub(temp2[1], temp2[2]);
  temp0[3] = vec_sub(temp2[0], temp2[3]);
  single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]);
  temp0[9] = vec_add(temp2[9], temp2[10]);
  temp0[10] = vec_sub(temp2[9], temp2[10]);
  temp0[13] = vec_sub(temp2[14], temp2[13]);
  temp0[14] = vec_add(temp2[14], temp2[13]);
  double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29],
                   &temp0[18]);
  double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28],
                   &temp0[19]);
  double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27],
                   &temp0[20]);
  double_butterfly(temp2[26], cospi24_v, temp2[21], cospi8m_v, &temp0[26],
                   &temp0[21]);

  // Pass through Stage 5
  temp1[8] = vec_add(temp2[8], temp2[11]);
  temp1[11] = vec_sub(temp2[8], temp2[11]);
  temp1[12] = vec_sub(temp2[15], temp2[12]);
  temp1[15] = vec_add(temp2[15], temp2[12]);

  // Stage 5
  // 0 and 1 pass through to 0 and 16 at the end
  single_butterfly(temp0[0], temp0[1], &out[0], &out[16]);
  // 2 and 3 pass through to 8 and 24 at the end
  double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]);
  temp1[4] = vec_add(temp0[4], temp0[5]);
  temp1[5] = vec_sub(temp0[4], temp0[5]);
  temp1[6] = vec_sub(temp0[7], temp0[6]);
  temp1[7] = vec_add(temp0[7], temp0[6]);
  double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14],
                   &temp1[9]);
  double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13],
                   &temp1[10]);
  temp1[17] = vec_add(temp0[17], temp0[18]);
  temp1[18] = vec_sub(temp0[17], temp0[18]);
  temp1[21] = vec_sub(temp0[22], temp0[21]);
  temp1[22] = vec_add(temp0[22], temp0[21]);
  temp1[25] = vec_add(temp0[25], temp0[26]);
  temp1[26] = vec_sub(temp0[25], temp0[26]);
  temp1[29] = vec_sub(temp0[30], temp0[29]);
  temp1[30] = vec_add(temp0[30], temp0[29]);

  // Pass through Stage 6
  temp2[16] = vec_add(temp0[16], temp0[19]);
  temp2[19] = vec_sub(temp0[16], temp0[19]);
  temp2[20] = vec_sub(temp0[23], temp0[20]);
  temp2[23] = vec_add(temp0[23], temp0[20]);
  temp2[24] = vec_add(temp0[24], temp0[27]);
  temp2[27] = vec_sub(temp0[24], temp0[27]);
  temp2[28] = vec_sub(temp0[31], temp0[28]);
  temp2[31] = vec_add(temp0[31], temp0[28]);

  // Stage 6
  // 4 and 7 pass through to 4 and 28 at the end
  double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]);
  // 5 and 6 pass through to 20 and 12 at the end
  double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20],
                   &out[12]);
  temp2[8] = vec_add(temp1[8], temp1[9]);
  temp2[9] = vec_sub(temp1[8], temp1[9]);
  temp2[10] = vec_sub(temp1[11], temp1[10]);
  temp2[11] = vec_add(temp1[11], temp1[10]);
  temp2[12] = vec_add(temp1[12], temp1[13]);
  temp2[13] = vec_sub(temp1[12], temp1[13]);
  temp2[14] = vec_sub(temp1[15], temp1[14]);
  temp2[15] = vec_add(temp1[15], temp1[14]);
  double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30],
                   &temp2[17]);
  double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29],
                   &temp2[18]);
  double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26],
                   &temp2[21]);
  double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25],
                   &temp2[22]);

  // Stage 7
  double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]);
  double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18],
                   &out[14]);
  double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10],
                   &out[22]);
  double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26],
                   &out[6]);
  temp0[16] = vec_add(temp2[16], temp2[17]);
  temp0[17] = vec_sub(temp2[16], temp2[17]);
  temp0[18] = vec_sub(temp2[19], temp2[18]);
  temp0[19] = vec_add(temp2[19], temp2[18]);
  temp0[20] = vec_add(temp2[20], temp2[21]);
  temp0[21] = vec_sub(temp2[20], temp2[21]);
  temp0[22] = vec_sub(temp2[23], temp2[22]);
  temp0[23] = vec_add(temp2[23], temp2[22]);
  temp0[24] = vec_add(temp2[24], temp2[25]);
  temp0[25] = vec_sub(temp2[24], temp2[25]);
  temp0[26] = vec_sub(temp2[27], temp2[26]);
  temp0[27] = vec_add(temp2[27], temp2[26]);
  temp0[28] = vec_add(temp2[28], temp2[29]);
  temp0[29] = vec_sub(temp2[28], temp2[29]);
  temp0[30] = vec_sub(temp2[31], temp2[30]);
  temp0[31] = vec_add(temp2[31], temp2[30]);

  // Final stage: output indices are bit-reversed.
  double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1],
                   &out[31]);
  double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17],
                   &out[15]);
  double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9],
                   &out[23]);
  double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25],
                   &out[7]);
  double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5],
                   &out[27]);
  double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21],
                   &out[11]);
  double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13],
                   &out[19]);
  double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29],
                   &out[3]);

  if (pass == 0) {
    for (i = 0; i < 32; i++) {
      out[i] = sub_round_shift(out[i]);
    }
  }
}
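
// Full 32x32 forward transform: the first pass transforms four 8-column
// slices of the input, the results are transposed into 8x32 row blocks, the
// second pass transforms each row block, and every block is transposed back
// before being stored.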
void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];
  int16x8_t temp6[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  fdct32_vsx(temp0, temp1, 0);

  load(input + 8, stride, temp0);
  fdct32_vsx(temp0, temp2, 0);

  load(input + 16, stride, temp0);
  fdct32_vsx(temp0, temp3, 0);

  load(input + 24, stride, temp0);
  fdct32_vsx(temp0, temp4, 0);

  // Generate the top row block by merging the first eight vectors from each
  // of the four column transforms.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  fdct32_vsx(temp0, temp5, 1);

  transpose_8x8(&temp5[0], &temp6[0]);
  transpose_8x8(&temp5[8], &temp6[8]);
  transpose_8x8(&temp5[16], &temp6[16]);
  transpose_8x8(&temp5[24], &temp6[24]);

  store(out, temp6);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  fdct32_vsx(temp0, temp5, 1);

  transpose_8x8(&temp5[0], &temp6[0]);
  transpose_8x8(&temp5[8], &temp6[8]);
  transpose_8x8(&temp5[16], &temp6[16]);
  transpose_8x8(&temp5[24], &temp6[24]);

  store(out + 8 * 32, temp6);

  // Third row of 8x32.
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  fdct32_vsx(temp0, temp5, 1);

  transpose_8x8(&temp5[0], &temp6[0]);
  transpose_8x8(&temp5[8], &temp6[8]);
  transpose_8x8(&temp5[16], &temp6[16]);
  transpose_8x8(&temp5[24], &temp6[24]);

  store(out + 16 * 32, temp6);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  fdct32_vsx(temp0, temp5, 1);

  transpose_8x8(&temp5[0], &temp6[0]);
  transpose_8x8(&temp5[8], &temp6[8]);
  transpose_8x8(&temp5[16], &temp6[16]);
  transpose_8x8(&temp5[24], &temp6[24]);

  store(out + 24 * 32, temp6);
}