2
0

vp9_dct.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795
  1. /*
  2. * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include <math.h>
  12. #include "./vp9_rtcd.h"
  13. #include "./vpx_config.h"
  14. #include "./vpx_dsp_rtcd.h"
  15. #include "vp9/common/vp9_blockd.h"
  16. #include "vp9/common/vp9_idct.h"
  17. #include "vpx_dsp/fwd_txfm.h"
  18. #include "vpx_ports/mem.h"
  19. static void fdct4(const tran_low_t *input, tran_low_t *output) {
  20. tran_high_t step[4];
  21. tran_high_t temp1, temp2;
  22. step[0] = input[0] + input[3];
  23. step[1] = input[1] + input[2];
  24. step[2] = input[1] - input[2];
  25. step[3] = input[0] - input[3];
  26. temp1 = (step[0] + step[1]) * cospi_16_64;
  27. temp2 = (step[0] - step[1]) * cospi_16_64;
  28. output[0] = (tran_low_t)fdct_round_shift(temp1);
  29. output[2] = (tran_low_t)fdct_round_shift(temp2);
  30. temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
  31. temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
  32. output[1] = (tran_low_t)fdct_round_shift(temp1);
  33. output[3] = (tran_low_t)fdct_round_shift(temp2);
  34. }
  35. static void fdct8(const tran_low_t *input, tran_low_t *output) {
  36. tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
  37. tran_high_t t0, t1, t2, t3; // needs32
  38. tran_high_t x0, x1, x2, x3; // canbe16
  39. // stage 1
  40. s0 = input[0] + input[7];
  41. s1 = input[1] + input[6];
  42. s2 = input[2] + input[5];
  43. s3 = input[3] + input[4];
  44. s4 = input[3] - input[4];
  45. s5 = input[2] - input[5];
  46. s6 = input[1] - input[6];
  47. s7 = input[0] - input[7];
  48. // fdct4(step, step);
  49. x0 = s0 + s3;
  50. x1 = s1 + s2;
  51. x2 = s1 - s2;
  52. x3 = s0 - s3;
  53. t0 = (x0 + x1) * cospi_16_64;
  54. t1 = (x0 - x1) * cospi_16_64;
  55. t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
  56. t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
  57. output[0] = (tran_low_t)fdct_round_shift(t0);
  58. output[2] = (tran_low_t)fdct_round_shift(t2);
  59. output[4] = (tran_low_t)fdct_round_shift(t1);
  60. output[6] = (tran_low_t)fdct_round_shift(t3);
  61. // Stage 2
  62. t0 = (s6 - s5) * cospi_16_64;
  63. t1 = (s6 + s5) * cospi_16_64;
  64. t2 = (tran_low_t)fdct_round_shift(t0);
  65. t3 = (tran_low_t)fdct_round_shift(t1);
  66. // Stage 3
  67. x0 = s4 + t2;
  68. x1 = s4 - t2;
  69. x2 = s7 - t3;
  70. x3 = s7 + t3;
  71. // Stage 4
  72. t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
  73. t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
  74. t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
  75. t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
  76. output[1] = (tran_low_t)fdct_round_shift(t0);
  77. output[3] = (tran_low_t)fdct_round_shift(t2);
  78. output[5] = (tran_low_t)fdct_round_shift(t1);
  79. output[7] = (tran_low_t)fdct_round_shift(t3);
  80. }
  81. static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
  82. tran_high_t step1[8]; // canbe16
  83. tran_high_t step2[8]; // canbe16
  84. tran_high_t step3[8]; // canbe16
  85. tran_high_t input[8]; // canbe16
  86. tran_high_t temp1, temp2; // needs32
  87. // step 1
  88. input[0] = in[0] + in[15];
  89. input[1] = in[1] + in[14];
  90. input[2] = in[2] + in[13];
  91. input[3] = in[3] + in[12];
  92. input[4] = in[4] + in[11];
  93. input[5] = in[5] + in[10];
  94. input[6] = in[6] + in[9];
  95. input[7] = in[7] + in[8];
  96. step1[0] = in[7] - in[8];
  97. step1[1] = in[6] - in[9];
  98. step1[2] = in[5] - in[10];
  99. step1[3] = in[4] - in[11];
  100. step1[4] = in[3] - in[12];
  101. step1[5] = in[2] - in[13];
  102. step1[6] = in[1] - in[14];
  103. step1[7] = in[0] - in[15];
  104. // fdct8(step, step);
  105. {
  106. tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
  107. tran_high_t t0, t1, t2, t3; // needs32
  108. tran_high_t x0, x1, x2, x3; // canbe16
  109. // stage 1
  110. s0 = input[0] + input[7];
  111. s1 = input[1] + input[6];
  112. s2 = input[2] + input[5];
  113. s3 = input[3] + input[4];
  114. s4 = input[3] - input[4];
  115. s5 = input[2] - input[5];
  116. s6 = input[1] - input[6];
  117. s7 = input[0] - input[7];
  118. // fdct4(step, step);
  119. x0 = s0 + s3;
  120. x1 = s1 + s2;
  121. x2 = s1 - s2;
  122. x3 = s0 - s3;
  123. t0 = (x0 + x1) * cospi_16_64;
  124. t1 = (x0 - x1) * cospi_16_64;
  125. t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
  126. t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
  127. out[0] = (tran_low_t)fdct_round_shift(t0);
  128. out[4] = (tran_low_t)fdct_round_shift(t2);
  129. out[8] = (tran_low_t)fdct_round_shift(t1);
  130. out[12] = (tran_low_t)fdct_round_shift(t3);
  131. // Stage 2
  132. t0 = (s6 - s5) * cospi_16_64;
  133. t1 = (s6 + s5) * cospi_16_64;
  134. t2 = fdct_round_shift(t0);
  135. t3 = fdct_round_shift(t1);
  136. // Stage 3
  137. x0 = s4 + t2;
  138. x1 = s4 - t2;
  139. x2 = s7 - t3;
  140. x3 = s7 + t3;
  141. // Stage 4
  142. t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
  143. t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
  144. t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
  145. t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
  146. out[2] = (tran_low_t)fdct_round_shift(t0);
  147. out[6] = (tran_low_t)fdct_round_shift(t2);
  148. out[10] = (tran_low_t)fdct_round_shift(t1);
  149. out[14] = (tran_low_t)fdct_round_shift(t3);
  150. }
  151. // step 2
  152. temp1 = (step1[5] - step1[2]) * cospi_16_64;
  153. temp2 = (step1[4] - step1[3]) * cospi_16_64;
  154. step2[2] = fdct_round_shift(temp1);
  155. step2[3] = fdct_round_shift(temp2);
  156. temp1 = (step1[4] + step1[3]) * cospi_16_64;
  157. temp2 = (step1[5] + step1[2]) * cospi_16_64;
  158. step2[4] = fdct_round_shift(temp1);
  159. step2[5] = fdct_round_shift(temp2);
  160. // step 3
  161. step3[0] = step1[0] + step2[3];
  162. step3[1] = step1[1] + step2[2];
  163. step3[2] = step1[1] - step2[2];
  164. step3[3] = step1[0] - step2[3];
  165. step3[4] = step1[7] - step2[4];
  166. step3[5] = step1[6] - step2[5];
  167. step3[6] = step1[6] + step2[5];
  168. step3[7] = step1[7] + step2[4];
  169. // step 4
  170. temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
  171. temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
  172. step2[1] = fdct_round_shift(temp1);
  173. step2[2] = fdct_round_shift(temp2);
  174. temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
  175. temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
  176. step2[5] = fdct_round_shift(temp1);
  177. step2[6] = fdct_round_shift(temp2);
  178. // step 5
  179. step1[0] = step3[0] + step2[1];
  180. step1[1] = step3[0] - step2[1];
  181. step1[2] = step3[3] + step2[2];
  182. step1[3] = step3[3] - step2[2];
  183. step1[4] = step3[4] - step2[5];
  184. step1[5] = step3[4] + step2[5];
  185. step1[6] = step3[7] - step2[6];
  186. step1[7] = step3[7] + step2[6];
  187. // step 6
  188. temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
  189. temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
  190. out[1] = (tran_low_t)fdct_round_shift(temp1);
  191. out[9] = (tran_low_t)fdct_round_shift(temp2);
  192. temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  193. temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
  194. out[5] = (tran_low_t)fdct_round_shift(temp1);
  195. out[13] = (tran_low_t)fdct_round_shift(temp2);
  196. temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
  197. temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
  198. out[3] = (tran_low_t)fdct_round_shift(temp1);
  199. out[11] = (tran_low_t)fdct_round_shift(temp2);
  200. temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  201. temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
  202. out[7] = (tran_low_t)fdct_round_shift(temp1);
  203. out[15] = (tran_low_t)fdct_round_shift(temp2);
  204. }
  205. static void fadst4(const tran_low_t *input, tran_low_t *output) {
  206. tran_high_t x0, x1, x2, x3;
  207. tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  208. x0 = input[0];
  209. x1 = input[1];
  210. x2 = input[2];
  211. x3 = input[3];
  212. if (!(x0 | x1 | x2 | x3)) {
  213. output[0] = output[1] = output[2] = output[3] = 0;
  214. return;
  215. }
  216. s0 = sinpi_1_9 * x0;
  217. s1 = sinpi_4_9 * x0;
  218. s2 = sinpi_2_9 * x1;
  219. s3 = sinpi_1_9 * x1;
  220. s4 = sinpi_3_9 * x2;
  221. s5 = sinpi_4_9 * x3;
  222. s6 = sinpi_2_9 * x3;
  223. s7 = x0 + x1 - x3;
  224. x0 = s0 + s2 + s5;
  225. x1 = sinpi_3_9 * s7;
  226. x2 = s1 - s3 + s6;
  227. x3 = s4;
  228. s0 = x0 + x3;
  229. s1 = x1;
  230. s2 = x2 - x3;
  231. s3 = x2 - x0 + x3;
  232. // 1-D transform scaling factor is sqrt(2).
  233. output[0] = (tran_low_t)fdct_round_shift(s0);
  234. output[1] = (tran_low_t)fdct_round_shift(s1);
  235. output[2] = (tran_low_t)fdct_round_shift(s2);
  236. output[3] = (tran_low_t)fdct_round_shift(s3);
  237. }
  238. static void fadst8(const tran_low_t *input, tran_low_t *output) {
  239. tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  240. tran_high_t x0 = input[7];
  241. tran_high_t x1 = input[0];
  242. tran_high_t x2 = input[5];
  243. tran_high_t x3 = input[2];
  244. tran_high_t x4 = input[3];
  245. tran_high_t x5 = input[4];
  246. tran_high_t x6 = input[1];
  247. tran_high_t x7 = input[6];
  248. // stage 1
  249. s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  250. s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  251. s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  252. s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  253. s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  254. s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  255. s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  256. s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
  257. x0 = fdct_round_shift(s0 + s4);
  258. x1 = fdct_round_shift(s1 + s5);
  259. x2 = fdct_round_shift(s2 + s6);
  260. x3 = fdct_round_shift(s3 + s7);
  261. x4 = fdct_round_shift(s0 - s4);
  262. x5 = fdct_round_shift(s1 - s5);
  263. x6 = fdct_round_shift(s2 - s6);
  264. x7 = fdct_round_shift(s3 - s7);
  265. // stage 2
  266. s0 = x0;
  267. s1 = x1;
  268. s2 = x2;
  269. s3 = x3;
  270. s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  271. s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  272. s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  273. s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
  274. x0 = s0 + s2;
  275. x1 = s1 + s3;
  276. x2 = s0 - s2;
  277. x3 = s1 - s3;
  278. x4 = fdct_round_shift(s4 + s6);
  279. x5 = fdct_round_shift(s5 + s7);
  280. x6 = fdct_round_shift(s4 - s6);
  281. x7 = fdct_round_shift(s5 - s7);
  282. // stage 3
  283. s2 = cospi_16_64 * (x2 + x3);
  284. s3 = cospi_16_64 * (x2 - x3);
  285. s6 = cospi_16_64 * (x6 + x7);
  286. s7 = cospi_16_64 * (x6 - x7);
  287. x2 = fdct_round_shift(s2);
  288. x3 = fdct_round_shift(s3);
  289. x6 = fdct_round_shift(s6);
  290. x7 = fdct_round_shift(s7);
  291. output[0] = (tran_low_t)x0;
  292. output[1] = (tran_low_t)-x4;
  293. output[2] = (tran_low_t)x6;
  294. output[3] = (tran_low_t)-x2;
  295. output[4] = (tran_low_t)x3;
  296. output[5] = (tran_low_t)-x7;
  297. output[6] = (tran_low_t)x5;
  298. output[7] = (tran_low_t)-x1;
  299. }
  300. static void fadst16(const tran_low_t *input, tran_low_t *output) {
  301. tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  302. tran_high_t s9, s10, s11, s12, s13, s14, s15;
  303. tran_high_t x0 = input[15];
  304. tran_high_t x1 = input[0];
  305. tran_high_t x2 = input[13];
  306. tran_high_t x3 = input[2];
  307. tran_high_t x4 = input[11];
  308. tran_high_t x5 = input[4];
  309. tran_high_t x6 = input[9];
  310. tran_high_t x7 = input[6];
  311. tran_high_t x8 = input[7];
  312. tran_high_t x9 = input[8];
  313. tran_high_t x10 = input[5];
  314. tran_high_t x11 = input[10];
  315. tran_high_t x12 = input[3];
  316. tran_high_t x13 = input[12];
  317. tran_high_t x14 = input[1];
  318. tran_high_t x15 = input[14];
  319. // stage 1
  320. s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  321. s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  322. s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  323. s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  324. s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  325. s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  326. s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  327. s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  328. s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  329. s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  330. s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  331. s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  332. s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  333. s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  334. s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  335. s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
  336. x0 = fdct_round_shift(s0 + s8);
  337. x1 = fdct_round_shift(s1 + s9);
  338. x2 = fdct_round_shift(s2 + s10);
  339. x3 = fdct_round_shift(s3 + s11);
  340. x4 = fdct_round_shift(s4 + s12);
  341. x5 = fdct_round_shift(s5 + s13);
  342. x6 = fdct_round_shift(s6 + s14);
  343. x7 = fdct_round_shift(s7 + s15);
  344. x8 = fdct_round_shift(s0 - s8);
  345. x9 = fdct_round_shift(s1 - s9);
  346. x10 = fdct_round_shift(s2 - s10);
  347. x11 = fdct_round_shift(s3 - s11);
  348. x12 = fdct_round_shift(s4 - s12);
  349. x13 = fdct_round_shift(s5 - s13);
  350. x14 = fdct_round_shift(s6 - s14);
  351. x15 = fdct_round_shift(s7 - s15);
  352. // stage 2
  353. s0 = x0;
  354. s1 = x1;
  355. s2 = x2;
  356. s3 = x3;
  357. s4 = x4;
  358. s5 = x5;
  359. s6 = x6;
  360. s7 = x7;
  361. s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  362. s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  363. s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  364. s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  365. s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  366. s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  367. s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  368. s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
  369. x0 = s0 + s4;
  370. x1 = s1 + s5;
  371. x2 = s2 + s6;
  372. x3 = s3 + s7;
  373. x4 = s0 - s4;
  374. x5 = s1 - s5;
  375. x6 = s2 - s6;
  376. x7 = s3 - s7;
  377. x8 = fdct_round_shift(s8 + s12);
  378. x9 = fdct_round_shift(s9 + s13);
  379. x10 = fdct_round_shift(s10 + s14);
  380. x11 = fdct_round_shift(s11 + s15);
  381. x12 = fdct_round_shift(s8 - s12);
  382. x13 = fdct_round_shift(s9 - s13);
  383. x14 = fdct_round_shift(s10 - s14);
  384. x15 = fdct_round_shift(s11 - s15);
  385. // stage 3
  386. s0 = x0;
  387. s1 = x1;
  388. s2 = x2;
  389. s3 = x3;
  390. s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  391. s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  392. s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  393. s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  394. s8 = x8;
  395. s9 = x9;
  396. s10 = x10;
  397. s11 = x11;
  398. s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  399. s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  400. s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  401. s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
  402. x0 = s0 + s2;
  403. x1 = s1 + s3;
  404. x2 = s0 - s2;
  405. x3 = s1 - s3;
  406. x4 = fdct_round_shift(s4 + s6);
  407. x5 = fdct_round_shift(s5 + s7);
  408. x6 = fdct_round_shift(s4 - s6);
  409. x7 = fdct_round_shift(s5 - s7);
  410. x8 = s8 + s10;
  411. x9 = s9 + s11;
  412. x10 = s8 - s10;
  413. x11 = s9 - s11;
  414. x12 = fdct_round_shift(s12 + s14);
  415. x13 = fdct_round_shift(s13 + s15);
  416. x14 = fdct_round_shift(s12 - s14);
  417. x15 = fdct_round_shift(s13 - s15);
  418. // stage 4
  419. s2 = (-cospi_16_64) * (x2 + x3);
  420. s3 = cospi_16_64 * (x2 - x3);
  421. s6 = cospi_16_64 * (x6 + x7);
  422. s7 = cospi_16_64 * (-x6 + x7);
  423. s10 = cospi_16_64 * (x10 + x11);
  424. s11 = cospi_16_64 * (-x10 + x11);
  425. s14 = (-cospi_16_64) * (x14 + x15);
  426. s15 = cospi_16_64 * (x14 - x15);
  427. x2 = fdct_round_shift(s2);
  428. x3 = fdct_round_shift(s3);
  429. x6 = fdct_round_shift(s6);
  430. x7 = fdct_round_shift(s7);
  431. x10 = fdct_round_shift(s10);
  432. x11 = fdct_round_shift(s11);
  433. x14 = fdct_round_shift(s14);
  434. x15 = fdct_round_shift(s15);
  435. output[0] = (tran_low_t)x0;
  436. output[1] = (tran_low_t)-x8;
  437. output[2] = (tran_low_t)x12;
  438. output[3] = (tran_low_t)-x4;
  439. output[4] = (tran_low_t)x6;
  440. output[5] = (tran_low_t)x14;
  441. output[6] = (tran_low_t)x10;
  442. output[7] = (tran_low_t)x2;
  443. output[8] = (tran_low_t)x3;
  444. output[9] = (tran_low_t)x11;
  445. output[10] = (tran_low_t)x15;
  446. output[11] = (tran_low_t)x7;
  447. output[12] = (tran_low_t)x5;
  448. output[13] = (tran_low_t)-x13;
  449. output[14] = (tran_low_t)x9;
  450. output[15] = (tran_low_t)-x1;
  451. }
  452. static const transform_2d FHT_4[] = {
  453. { fdct4, fdct4 }, // DCT_DCT = 0
  454. { fadst4, fdct4 }, // ADST_DCT = 1
  455. { fdct4, fadst4 }, // DCT_ADST = 2
  456. { fadst4, fadst4 } // ADST_ADST = 3
  457. };
  458. static const transform_2d FHT_8[] = {
  459. { fdct8, fdct8 }, // DCT_DCT = 0
  460. { fadst8, fdct8 }, // ADST_DCT = 1
  461. { fdct8, fadst8 }, // DCT_ADST = 2
  462. { fadst8, fadst8 } // ADST_ADST = 3
  463. };
  464. static const transform_2d FHT_16[] = {
  465. { fdct16, fdct16 }, // DCT_DCT = 0
  466. { fadst16, fdct16 }, // ADST_DCT = 1
  467. { fdct16, fadst16 }, // DCT_ADST = 2
  468. { fadst16, fadst16 } // ADST_ADST = 3
  469. };
  470. void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
  471. int tx_type) {
  472. if (tx_type == DCT_DCT) {
  473. vpx_fdct4x4_c(input, output, stride);
  474. } else {
  475. tran_low_t out[4 * 4];
  476. int i, j;
  477. tran_low_t temp_in[4], temp_out[4];
  478. const transform_2d ht = FHT_4[tx_type];
  479. // Columns
  480. for (i = 0; i < 4; ++i) {
  481. for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
  482. if (i == 0 && temp_in[0]) temp_in[0] += 1;
  483. ht.cols(temp_in, temp_out);
  484. for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
  485. }
  486. // Rows
  487. for (i = 0; i < 4; ++i) {
  488. for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
  489. ht.rows(temp_in, temp_out);
  490. for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
  491. }
  492. }
  493. }
  494. void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
  495. tran_low_t *coeff_ptr, intptr_t n_coeffs,
  496. int skip_block, const int16_t *zbin_ptr,
  497. const int16_t *round_ptr, const int16_t *quant_ptr,
  498. const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
  499. tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
  500. uint16_t *eob_ptr, const int16_t *scan,
  501. const int16_t *iscan) {
  502. int eob = -1;
  503. int i, j;
  504. tran_low_t intermediate[64];
  505. // Transform columns
  506. {
  507. tran_low_t *output = intermediate;
  508. tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
  509. tran_high_t t0, t1, t2, t3; // needs32
  510. tran_high_t x0, x1, x2, x3; // canbe16
  511. int i;
  512. for (i = 0; i < 8; i++) {
  513. // stage 1
  514. s0 = (input[0 * stride] + input[7 * stride]) * 4;
  515. s1 = (input[1 * stride] + input[6 * stride]) * 4;
  516. s2 = (input[2 * stride] + input[5 * stride]) * 4;
  517. s3 = (input[3 * stride] + input[4 * stride]) * 4;
  518. s4 = (input[3 * stride] - input[4 * stride]) * 4;
  519. s5 = (input[2 * stride] - input[5 * stride]) * 4;
  520. s6 = (input[1 * stride] - input[6 * stride]) * 4;
  521. s7 = (input[0 * stride] - input[7 * stride]) * 4;
  522. // fdct4(step, step);
  523. x0 = s0 + s3;
  524. x1 = s1 + s2;
  525. x2 = s1 - s2;
  526. x3 = s0 - s3;
  527. t0 = (x0 + x1) * cospi_16_64;
  528. t1 = (x0 - x1) * cospi_16_64;
  529. t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
  530. t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
  531. output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
  532. output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
  533. output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
  534. output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
  535. // Stage 2
  536. t0 = (s6 - s5) * cospi_16_64;
  537. t1 = (s6 + s5) * cospi_16_64;
  538. t2 = fdct_round_shift(t0);
  539. t3 = fdct_round_shift(t1);
  540. // Stage 3
  541. x0 = s4 + t2;
  542. x1 = s4 - t2;
  543. x2 = s7 - t3;
  544. x3 = s7 + t3;
  545. // Stage 4
  546. t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
  547. t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
  548. t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
  549. t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
  550. output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
  551. output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
  552. output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
  553. output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
  554. input++;
  555. output++;
  556. }
  557. }
  558. // Rows
  559. for (i = 0; i < 8; ++i) {
  560. fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
  561. for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
  562. }
  563. // TODO(jingning) Decide the need of these arguments after the
  564. // quantization process is completed.
  565. (void)zbin_ptr;
  566. (void)quant_shift_ptr;
  567. (void)iscan;
  568. memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  569. memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  570. if (!skip_block) {
  571. // Quantization pass: All coefficients with index >= zero_flag are
  572. // skippable. Note: zero_flag can be zero.
  573. for (i = 0; i < n_coeffs; i++) {
  574. const int rc = scan[i];
  575. const int coeff = coeff_ptr[rc];
  576. const int coeff_sign = (coeff >> 31);
  577. const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  578. int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
  579. tmp = (tmp * quant_ptr[rc != 0]) >> 16;
  580. qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
  581. dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
  582. if (tmp) eob = i;
  583. }
  584. }
  585. *eob_ptr = eob + 1;
  586. }
  587. void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
  588. int tx_type) {
  589. if (tx_type == DCT_DCT) {
  590. vpx_fdct8x8_c(input, output, stride);
  591. } else {
  592. tran_low_t out[64];
  593. int i, j;
  594. tran_low_t temp_in[8], temp_out[8];
  595. const transform_2d ht = FHT_8[tx_type];
  596. // Columns
  597. for (i = 0; i < 8; ++i) {
  598. for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
  599. ht.cols(temp_in, temp_out);
  600. for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
  601. }
  602. // Rows
  603. for (i = 0; i < 8; ++i) {
  604. for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
  605. ht.rows(temp_in, temp_out);
  606. for (j = 0; j < 8; ++j)
  607. output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
  608. }
  609. }
  610. }
  611. /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
  612. pixel. */
  613. void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  614. int i;
  615. tran_high_t a1, b1, c1, d1, e1;
  616. const int16_t *ip_pass0 = input;
  617. const tran_low_t *ip = NULL;
  618. tran_low_t *op = output;
  619. for (i = 0; i < 4; i++) {
  620. a1 = ip_pass0[0 * stride];
  621. b1 = ip_pass0[1 * stride];
  622. c1 = ip_pass0[2 * stride];
  623. d1 = ip_pass0[3 * stride];
  624. a1 += b1;
  625. d1 = d1 - c1;
  626. e1 = (a1 - d1) >> 1;
  627. b1 = e1 - b1;
  628. c1 = e1 - c1;
  629. a1 -= c1;
  630. d1 += b1;
  631. op[0] = (tran_low_t)a1;
  632. op[4] = (tran_low_t)c1;
  633. op[8] = (tran_low_t)d1;
  634. op[12] = (tran_low_t)b1;
  635. ip_pass0++;
  636. op++;
  637. }
  638. ip = output;
  639. op = output;
  640. for (i = 0; i < 4; i++) {
  641. a1 = ip[0];
  642. b1 = ip[1];
  643. c1 = ip[2];
  644. d1 = ip[3];
  645. a1 += b1;
  646. d1 -= c1;
  647. e1 = (a1 - d1) >> 1;
  648. b1 = e1 - b1;
  649. c1 = e1 - c1;
  650. a1 -= c1;
  651. d1 += b1;
  652. op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
  653. op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
  654. op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
  655. op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
  656. ip += 4;
  657. op += 4;
  658. }
  659. }
  660. void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
  661. int tx_type) {
  662. if (tx_type == DCT_DCT) {
  663. vpx_fdct16x16_c(input, output, stride);
  664. } else {
  665. tran_low_t out[256];
  666. int i, j;
  667. tran_low_t temp_in[16], temp_out[16];
  668. const transform_2d ht = FHT_16[tx_type];
  669. // Columns
  670. for (i = 0; i < 16; ++i) {
  671. for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
  672. ht.cols(temp_in, temp_out);
  673. for (j = 0; j < 16; ++j)
  674. out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  675. }
  676. // Rows
  677. for (i = 0; i < 16; ++i) {
  678. for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
  679. ht.rows(temp_in, temp_out);
  680. for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
  681. }
  682. }
  683. }
  684. #if CONFIG_VP9_HIGHBITDEPTH
  685. void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
  686. int tx_type) {
  687. vp9_fht4x4_c(input, output, stride, tx_type);
  688. }
  689. void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
  690. int tx_type) {
  691. vp9_fht8x8_c(input, output, stride, tx_type);
  692. }
  693. void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
  694. int stride) {
  695. vp9_fwht4x4_c(input, output, stride);
  696. }
  697. void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
  698. int tx_type) {
  699. vp9_fht16x16_c(input, output, stride, tx_type);
  700. }
  701. #endif // CONFIG_VP9_HIGHBITDEPTH