/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/inv_txfm_dspr2.h"
#include "vpx_dsp/txfm_common.h"

#if HAVE_DSPR2
  15. void idct16_rows_dspr2(const int16_t *input, int16_t *output,
  16. uint32_t no_rows) {
  17. int i;
  18. int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  19. int step1_10, step1_11, step1_12, step1_13;
  20. int step2_0, step2_1, step2_2, step2_3;
  21. int step2_8, step2_9, step2_10, step2_11;
  22. int step2_12, step2_13, step2_14, step2_15;
  23. int load1, load2, load3, load4, load5, load6, load7, load8;
  24. int result1, result2, result3, result4;
  25. const int const_2_power_13 = 8192;
  26. for (i = no_rows; i--;) {
  27. /* prefetch row */
  28. prefetch_load((const uint8_t *)(input + 16));
  29. __asm__ __volatile__(
  30. "lh %[load1], 0(%[input]) \n\t"
  31. "lh %[load2], 16(%[input]) \n\t"
  32. "lh %[load3], 8(%[input]) \n\t"
  33. "lh %[load4], 24(%[input]) \n\t"
  34. "mtlo %[const_2_power_13], $ac1 \n\t"
  35. "mthi $zero, $ac1 \n\t"
  36. "mtlo %[const_2_power_13], $ac2 \n\t"
  37. "mthi $zero, $ac2 \n\t"
  38. "add %[result1], %[load1], %[load2] \n\t"
  39. "sub %[result2], %[load1], %[load2] \n\t"
  40. "madd $ac1, %[result1], %[cospi_16_64] \n\t"
  41. "madd $ac2, %[result2], %[cospi_16_64] \n\t"
  42. "extp %[step2_0], $ac1, 31 \n\t"
  43. "extp %[step2_1], $ac2, 31 \n\t"
  44. "mtlo %[const_2_power_13], $ac3 \n\t"
  45. "mthi $zero, $ac3 \n\t"
  46. "madd $ac3, %[load3], %[cospi_24_64] \n\t"
  47. "msub $ac3, %[load4], %[cospi_8_64] \n\t"
  48. "extp %[step2_2], $ac3, 31 \n\t"
  49. "mtlo %[const_2_power_13], $ac1 \n\t"
  50. "mthi $zero, $ac1 \n\t"
  51. "madd $ac1, %[load3], %[cospi_8_64] \n\t"
  52. "madd $ac1, %[load4], %[cospi_24_64] \n\t"
  53. "extp %[step2_3], $ac1, 31 \n\t"
  54. "add %[step1_0], %[step2_0], %[step2_3] \n\t"
  55. "add %[step1_1], %[step2_1], %[step2_2] \n\t"
  56. "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
  57. "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
  58. : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
  59. [load4] "=&r"(load4), [result1] "=&r"(result1),
  60. [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
  61. [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
  62. [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
  63. [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
  64. [step1_3] "=r"(step1_3)
  65. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  66. [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
  67. [cospi_16_64] "r"(cospi_16_64));
  68. __asm__ __volatile__(
  69. "lh %[load5], 2(%[input]) \n\t"
  70. "lh %[load6], 30(%[input]) \n\t"
  71. "lh %[load7], 18(%[input]) \n\t"
  72. "lh %[load8], 14(%[input]) \n\t"
  73. "mtlo %[const_2_power_13], $ac1 \n\t"
  74. "mthi $zero, $ac1 \n\t"
  75. "mtlo %[const_2_power_13], $ac3 \n\t"
  76. "mthi $zero, $ac3 \n\t"
  77. "madd $ac1, %[load5], %[cospi_30_64] \n\t"
  78. "msub $ac1, %[load6], %[cospi_2_64] \n\t"
  79. "extp %[result1], $ac1, 31 \n\t"
  80. "madd $ac3, %[load7], %[cospi_14_64] \n\t"
  81. "msub $ac3, %[load8], %[cospi_18_64] \n\t"
  82. "extp %[result2], $ac3, 31 \n\t"
  83. "mtlo %[const_2_power_13], $ac1 \n\t"
  84. "mthi $zero, $ac1 \n\t"
  85. "mtlo %[const_2_power_13], $ac2 \n\t"
  86. "mthi $zero, $ac2 \n\t"
  87. "madd $ac1, %[load7], %[cospi_18_64] \n\t"
  88. "madd $ac1, %[load8], %[cospi_14_64] \n\t"
  89. "extp %[result3], $ac1, 31 \n\t"
  90. "madd $ac2, %[load5], %[cospi_2_64] \n\t"
  91. "madd $ac2, %[load6], %[cospi_30_64] \n\t"
  92. "extp %[result4], $ac2, 31 \n\t"
  93. "sub %[load5], %[result1], %[result2] \n\t"
  94. "sub %[load6], %[result4], %[result3] \n\t"
  95. "mtlo %[const_2_power_13], $ac1 \n\t"
  96. "mthi $zero, $ac1 \n\t"
  97. "mtlo %[const_2_power_13], $ac3 \n\t"
  98. "mthi $zero, $ac3 \n\t"
  99. "madd $ac1, %[load6], %[cospi_24_64] \n\t"
  100. "msub $ac1, %[load5], %[cospi_8_64] \n\t"
  101. "madd $ac3, %[load5], %[cospi_24_64] \n\t"
  102. "madd $ac3, %[load6], %[cospi_8_64] \n\t"
  103. "extp %[step2_9], $ac1, 31 \n\t"
  104. "extp %[step2_14], $ac3, 31 \n\t"
  105. "add %[step2_8], %[result1], %[result2] \n\t"
  106. "add %[step2_15], %[result4], %[result3] \n\t"
  107. : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
  108. [load8] "=&r"(load8), [result1] "=&r"(result1),
  109. [result2] "=&r"(result2), [result3] "=&r"(result3),
  110. [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
  111. [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
  112. [step2_14] "=r"(step2_14)
  113. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  114. [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
  115. [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
  116. [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
  117. __asm__ __volatile__(
  118. "lh %[load1], 10(%[input]) \n\t"
  119. "lh %[load2], 22(%[input]) \n\t"
  120. "lh %[load3], 26(%[input]) \n\t"
  121. "lh %[load4], 6(%[input]) \n\t"
  122. "mtlo %[const_2_power_13], $ac1 \n\t"
  123. "mthi $zero, $ac1 \n\t"
  124. "mtlo %[const_2_power_13], $ac3 \n\t"
  125. "mthi $zero, $ac3 \n\t"
  126. "madd $ac1, %[load1], %[cospi_22_64] \n\t"
  127. "msub $ac1, %[load2], %[cospi_10_64] \n\t"
  128. "extp %[result1], $ac1, 31 \n\t"
  129. "madd $ac3, %[load3], %[cospi_6_64] \n\t"
  130. "msub $ac3, %[load4], %[cospi_26_64] \n\t"
  131. "extp %[result2], $ac3, 31 \n\t"
  132. "mtlo %[const_2_power_13], $ac1 \n\t"
  133. "mthi $zero, $ac1 \n\t"
  134. "mtlo %[const_2_power_13], $ac2 \n\t"
  135. "mthi $zero, $ac2 \n\t"
  136. "madd $ac1, %[load1], %[cospi_10_64] \n\t"
  137. "madd $ac1, %[load2], %[cospi_22_64] \n\t"
  138. "extp %[result3], $ac1, 31 \n\t"
  139. "madd $ac2, %[load3], %[cospi_26_64] \n\t"
  140. "madd $ac2, %[load4], %[cospi_6_64] \n\t"
  141. "extp %[result4], $ac2, 31 \n\t"
  142. "mtlo %[const_2_power_13], $ac1 \n\t"
  143. "mthi $zero, $ac1 \n\t"
  144. "mtlo %[const_2_power_13], $ac3 \n\t"
  145. "mthi $zero, $ac3 \n\t"
  146. "sub %[load1], %[result2], %[result1] \n\t"
  147. "sub %[load2], %[result4], %[result3] \n\t"
  148. "msub $ac1, %[load1], %[cospi_24_64] \n\t"
  149. "msub $ac1, %[load2], %[cospi_8_64] \n\t"
  150. "madd $ac3, %[load2], %[cospi_24_64] \n\t"
  151. "msub $ac3, %[load1], %[cospi_8_64] \n\t"
  152. "extp %[step2_10], $ac1, 31 \n\t"
  153. "extp %[step2_13], $ac3, 31 \n\t"
  154. "add %[step2_11], %[result1], %[result2] \n\t"
  155. "add %[step2_12], %[result4], %[result3] \n\t"
  156. : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
  157. [load4] "=&r"(load4), [result1] "=&r"(result1),
  158. [result2] "=&r"(result2), [result3] "=&r"(result3),
  159. [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
  160. [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
  161. [step2_13] "=r"(step2_13)
  162. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  163. [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
  164. [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
  165. [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
  166. __asm__ __volatile__(
  167. "lh %[load5], 4(%[input]) \n\t"
  168. "lh %[load6], 28(%[input]) \n\t"
  169. "lh %[load7], 20(%[input]) \n\t"
  170. "lh %[load8], 12(%[input]) \n\t"
  171. "mtlo %[const_2_power_13], $ac1 \n\t"
  172. "mthi $zero, $ac1 \n\t"
  173. "mtlo %[const_2_power_13], $ac3 \n\t"
  174. "mthi $zero, $ac3 \n\t"
  175. "madd $ac1, %[load5], %[cospi_28_64] \n\t"
  176. "msub $ac1, %[load6], %[cospi_4_64] \n\t"
  177. "extp %[result1], $ac1, 31 \n\t"
  178. "madd $ac3, %[load7], %[cospi_12_64] \n\t"
  179. "msub $ac3, %[load8], %[cospi_20_64] \n\t"
  180. "extp %[result2], $ac3, 31 \n\t"
  181. "mtlo %[const_2_power_13], $ac1 \n\t"
  182. "mthi $zero, $ac1 \n\t"
  183. "mtlo %[const_2_power_13], $ac2 \n\t"
  184. "mthi $zero, $ac2 \n\t"
  185. "madd $ac1, %[load7], %[cospi_20_64] \n\t"
  186. "madd $ac1, %[load8], %[cospi_12_64] \n\t"
  187. "extp %[result3], $ac1, 31 \n\t"
  188. "madd $ac2, %[load5], %[cospi_4_64] \n\t"
  189. "madd $ac2, %[load6], %[cospi_28_64] \n\t"
  190. "extp %[result4], $ac2, 31 \n\t"
  191. "mtlo %[const_2_power_13], $ac1 \n\t"
  192. "mthi $zero, $ac1 \n\t"
  193. "mtlo %[const_2_power_13], $ac3 \n\t"
  194. "mthi $zero, $ac3 \n\t"
  195. "sub %[load5], %[result4], %[result3] \n\t"
  196. "sub %[load5], %[load5], %[result1] \n\t"
  197. "add %[load5], %[load5], %[result2] \n\t"
  198. "sub %[load6], %[result1], %[result2] \n\t"
  199. "sub %[load6], %[load6], %[result3] \n\t"
  200. "add %[load6], %[load6], %[result4] \n\t"
  201. "madd $ac1, %[load5], %[cospi_16_64] \n\t"
  202. "madd $ac3, %[load6], %[cospi_16_64] \n\t"
  203. "extp %[step1_5], $ac1, 31 \n\t"
  204. "extp %[step1_6], $ac3, 31 \n\t"
  205. "add %[step1_4], %[result1], %[result2] \n\t"
  206. "add %[step1_7], %[result4], %[result3] \n\t"
  207. : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
  208. [load8] "=&r"(load8), [result1] "=&r"(result1),
  209. [result2] "=&r"(result2), [result3] "=&r"(result3),
  210. [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
  211. [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
  212. [step1_7] "=r"(step1_7)
  213. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  214. [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
  215. [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
  216. [cospi_16_64] "r"(cospi_16_64));
  217. __asm__ __volatile__(
  218. "mtlo %[const_2_power_13], $ac0 \n\t"
  219. "mthi $zero, $ac0 \n\t"
  220. "mtlo %[const_2_power_13], $ac1 \n\t"
  221. "mthi $zero, $ac1 \n\t"
  222. "sub %[load5], %[step2_14], %[step2_13] \n\t"
  223. "sub %[load5], %[load5], %[step2_9] \n\t"
  224. "add %[load5], %[load5], %[step2_10] \n\t"
  225. "madd $ac0, %[load5], %[cospi_16_64] \n\t"
  226. "sub %[load6], %[step2_14], %[step2_13] \n\t"
  227. "sub %[load6], %[load6], %[step2_10] \n\t"
  228. "add %[load6], %[load6], %[step2_9] \n\t"
  229. "madd $ac1, %[load6], %[cospi_16_64] \n\t"
  230. "mtlo %[const_2_power_13], $ac2 \n\t"
  231. "mthi $zero, $ac2 \n\t"
  232. "mtlo %[const_2_power_13], $ac3 \n\t"
  233. "mthi $zero, $ac3 \n\t"
  234. "sub %[load5], %[step2_15], %[step2_12] \n\t"
  235. "sub %[load5], %[load5], %[step2_8] \n\t"
  236. "add %[load5], %[load5], %[step2_11] \n\t"
  237. "madd $ac2, %[load5], %[cospi_16_64] \n\t"
  238. "sub %[load6], %[step2_15], %[step2_12] \n\t"
  239. "sub %[load6], %[load6], %[step2_11] \n\t"
  240. "add %[load6], %[load6], %[step2_8] \n\t"
  241. "madd $ac3, %[load6], %[cospi_16_64] \n\t"
  242. "extp %[step1_10], $ac0, 31 \n\t"
  243. "extp %[step1_13], $ac1, 31 \n\t"
  244. "extp %[step1_11], $ac2, 31 \n\t"
  245. "extp %[step1_12], $ac3, 31 \n\t"
  246. : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
  247. [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
  248. [step1_13] "=r"(step1_13)
  249. : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
  250. [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
  251. [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
  252. [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
  253. [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
  254. __asm__ __volatile__(
  255. "add %[load5], %[step1_0], %[step1_7] \n\t"
  256. "add %[load5], %[load5], %[step2_12] \n\t"
  257. "add %[load5], %[load5], %[step2_15] \n\t"
  258. "add %[load6], %[step1_1], %[step1_6] \n\t"
  259. "add %[load6], %[load6], %[step2_13] \n\t"
  260. "add %[load6], %[load6], %[step2_14] \n\t"
  261. "sh %[load5], 0(%[output]) \n\t"
  262. "sh %[load6], 32(%[output]) \n\t"
  263. "sub %[load5], %[step1_1], %[step1_6] \n\t"
  264. "add %[load5], %[load5], %[step2_9] \n\t"
  265. "add %[load5], %[load5], %[step2_10] \n\t"
  266. "sub %[load6], %[step1_0], %[step1_7] \n\t"
  267. "add %[load6], %[load6], %[step2_8] \n\t"
  268. "add %[load6], %[load6], %[step2_11] \n\t"
  269. "sh %[load5], 192(%[output]) \n\t"
  270. "sh %[load6], 224(%[output]) \n\t"
  271. "sub %[load5], %[step1_0], %[step1_7] \n\t"
  272. "sub %[load5], %[load5], %[step2_8] \n\t"
  273. "sub %[load5], %[load5], %[step2_11] \n\t"
  274. "sub %[load6], %[step1_1], %[step1_6] \n\t"
  275. "sub %[load6], %[load6], %[step2_9] \n\t"
  276. "sub %[load6], %[load6], %[step2_10] \n\t"
  277. "sh %[load5], 256(%[output]) \n\t"
  278. "sh %[load6], 288(%[output]) \n\t"
  279. "add %[load5], %[step1_1], %[step1_6] \n\t"
  280. "sub %[load5], %[load5], %[step2_13] \n\t"
  281. "sub %[load5], %[load5], %[step2_14] \n\t"
  282. "add %[load6], %[step1_0], %[step1_7] \n\t"
  283. "sub %[load6], %[load6], %[step2_12] \n\t"
  284. "sub %[load6], %[load6], %[step2_15] \n\t"
  285. "sh %[load5], 448(%[output]) \n\t"
  286. "sh %[load6], 480(%[output]) \n\t"
  287. : [load5] "=&r"(load5), [load6] "=&r"(load6)
  288. : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
  289. [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
  290. [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
  291. [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
  292. [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
  293. [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
  294. __asm__ __volatile__(
  295. "add %[load5], %[step1_2], %[step1_5] \n\t"
  296. "add %[load5], %[load5], %[step1_13] \n\t"
  297. "add %[load6], %[step1_3], %[step1_4] \n\t"
  298. "add %[load6], %[load6], %[step1_12] \n\t"
  299. "sh %[load5], 64(%[output]) \n\t"
  300. "sh %[load6], 96(%[output]) \n\t"
  301. "sub %[load5], %[step1_3], %[step1_4] \n\t"
  302. "add %[load5], %[load5], %[step1_11] \n\t"
  303. "sub %[load6], %[step1_2], %[step1_5] \n\t"
  304. "add %[load6], %[load6], %[step1_10] \n\t"
  305. "sh %[load5], 128(%[output]) \n\t"
  306. "sh %[load6], 160(%[output]) \n\t"
  307. "sub %[load5], %[step1_2], %[step1_5] \n\t"
  308. "sub %[load5], %[load5], %[step1_10] \n\t"
  309. "sub %[load6], %[step1_3], %[step1_4] \n\t"
  310. "sub %[load6], %[load6], %[step1_11] \n\t"
  311. "sh %[load5], 320(%[output]) \n\t"
  312. "sh %[load6], 352(%[output]) \n\t"
  313. "add %[load5], %[step1_3], %[step1_4] \n\t"
  314. "sub %[load5], %[load5], %[step1_12] \n\t"
  315. "add %[load6], %[step1_2], %[step1_5] \n\t"
  316. "sub %[load6], %[load6], %[step1_13] \n\t"
  317. "sh %[load5], 384(%[output]) \n\t"
  318. "sh %[load6], 416(%[output]) \n\t"
  319. : [load5] "=&r"(load5), [load6] "=&r"(load6)
  320. : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
  321. [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
  322. [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
  323. [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
  324. input += 16;
  325. output += 1;
  326. }
  327. }
  328. void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
  329. int i;
  330. int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  331. int step1_8, step1_9, step1_10, step1_11;
  332. int step1_12, step1_13, step1_14, step1_15;
  333. int step2_0, step2_1, step2_2, step2_3;
  334. int step2_8, step2_9, step2_10, step2_11;
  335. int step2_12, step2_13, step2_14, step2_15;
  336. int load1, load2, load3, load4, load5, load6, load7, load8;
  337. int result1, result2, result3, result4;
  338. const int const_2_power_13 = 8192;
  339. uint8_t *dest_pix;
  340. uint8_t *cm = vpx_ff_cropTbl;
  341. /* prefetch vpx_ff_cropTbl */
  342. prefetch_load(vpx_ff_cropTbl);
  343. prefetch_load(vpx_ff_cropTbl + 32);
  344. prefetch_load(vpx_ff_cropTbl + 64);
  345. prefetch_load(vpx_ff_cropTbl + 96);
  346. prefetch_load(vpx_ff_cropTbl + 128);
  347. prefetch_load(vpx_ff_cropTbl + 160);
  348. prefetch_load(vpx_ff_cropTbl + 192);
  349. prefetch_load(vpx_ff_cropTbl + 224);
  350. for (i = 0; i < 16; ++i) {
  351. dest_pix = (dest + i);
  352. __asm__ __volatile__(
  353. "lh %[load1], 0(%[input]) \n\t"
  354. "lh %[load2], 16(%[input]) \n\t"
  355. "lh %[load3], 8(%[input]) \n\t"
  356. "lh %[load4], 24(%[input]) \n\t"
  357. "mtlo %[const_2_power_13], $ac1 \n\t"
  358. "mthi $zero, $ac1 \n\t"
  359. "mtlo %[const_2_power_13], $ac2 \n\t"
  360. "mthi $zero, $ac2 \n\t"
  361. "add %[result1], %[load1], %[load2] \n\t"
  362. "sub %[result2], %[load1], %[load2] \n\t"
  363. "madd $ac1, %[result1], %[cospi_16_64] \n\t"
  364. "madd $ac2, %[result2], %[cospi_16_64] \n\t"
  365. "extp %[step2_0], $ac1, 31 \n\t"
  366. "extp %[step2_1], $ac2, 31 \n\t"
  367. "mtlo %[const_2_power_13], $ac3 \n\t"
  368. "mthi $zero, $ac3 \n\t"
  369. "madd $ac3, %[load3], %[cospi_24_64] \n\t"
  370. "msub $ac3, %[load4], %[cospi_8_64] \n\t"
  371. "extp %[step2_2], $ac3, 31 \n\t"
  372. "mtlo %[const_2_power_13], $ac1 \n\t"
  373. "mthi $zero, $ac1 \n\t"
  374. "madd $ac1, %[load3], %[cospi_8_64] \n\t"
  375. "madd $ac1, %[load4], %[cospi_24_64] \n\t"
  376. "extp %[step2_3], $ac1, 31 \n\t"
  377. "add %[step1_0], %[step2_0], %[step2_3] \n\t"
  378. "add %[step1_1], %[step2_1], %[step2_2] \n\t"
  379. "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
  380. "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
  381. : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
  382. [load4] "=&r"(load4), [result1] "=&r"(result1),
  383. [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
  384. [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
  385. [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
  386. [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
  387. [step1_3] "=r"(step1_3)
  388. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  389. [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
  390. [cospi_16_64] "r"(cospi_16_64));
  391. __asm__ __volatile__(
  392. "lh %[load5], 2(%[input]) \n\t"
  393. "lh %[load6], 30(%[input]) \n\t"
  394. "lh %[load7], 18(%[input]) \n\t"
  395. "lh %[load8], 14(%[input]) \n\t"
  396. "mtlo %[const_2_power_13], $ac1 \n\t"
  397. "mthi $zero, $ac1 \n\t"
  398. "mtlo %[const_2_power_13], $ac3 \n\t"
  399. "mthi $zero, $ac3 \n\t"
  400. "madd $ac1, %[load5], %[cospi_30_64] \n\t"
  401. "msub $ac1, %[load6], %[cospi_2_64] \n\t"
  402. "extp %[result1], $ac1, 31 \n\t"
  403. "madd $ac3, %[load7], %[cospi_14_64] \n\t"
  404. "msub $ac3, %[load8], %[cospi_18_64] \n\t"
  405. "extp %[result2], $ac3, 31 \n\t"
  406. "mtlo %[const_2_power_13], $ac1 \n\t"
  407. "mthi $zero, $ac1 \n\t"
  408. "mtlo %[const_2_power_13], $ac2 \n\t"
  409. "mthi $zero, $ac2 \n\t"
  410. "madd $ac1, %[load7], %[cospi_18_64] \n\t"
  411. "madd $ac1, %[load8], %[cospi_14_64] \n\t"
  412. "extp %[result3], $ac1, 31 \n\t"
  413. "madd $ac2, %[load5], %[cospi_2_64] \n\t"
  414. "madd $ac2, %[load6], %[cospi_30_64] \n\t"
  415. "extp %[result4], $ac2, 31 \n\t"
  416. "sub %[load5], %[result1], %[result2] \n\t"
  417. "sub %[load6], %[result4], %[result3] \n\t"
  418. "mtlo %[const_2_power_13], $ac1 \n\t"
  419. "mthi $zero, $ac1 \n\t"
  420. "mtlo %[const_2_power_13], $ac3 \n\t"
  421. "mthi $zero, $ac3 \n\t"
  422. "madd $ac1, %[load6], %[cospi_24_64] \n\t"
  423. "msub $ac1, %[load5], %[cospi_8_64] \n\t"
  424. "madd $ac3, %[load5], %[cospi_24_64] \n\t"
  425. "madd $ac3, %[load6], %[cospi_8_64] \n\t"
  426. "extp %[step2_9], $ac1, 31 \n\t"
  427. "extp %[step2_14], $ac3, 31 \n\t"
  428. "add %[step2_8], %[result1], %[result2] \n\t"
  429. "add %[step2_15], %[result4], %[result3] \n\t"
  430. : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
  431. [load8] "=&r"(load8), [result1] "=&r"(result1),
  432. [result2] "=&r"(result2), [result3] "=&r"(result3),
  433. [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
  434. [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
  435. [step2_14] "=r"(step2_14)
  436. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  437. [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
  438. [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
  439. [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
  440. __asm__ __volatile__(
  441. "lh %[load1], 10(%[input]) \n\t"
  442. "lh %[load2], 22(%[input]) \n\t"
  443. "lh %[load3], 26(%[input]) \n\t"
  444. "lh %[load4], 6(%[input]) \n\t"
  445. "mtlo %[const_2_power_13], $ac1 \n\t"
  446. "mthi $zero, $ac1 \n\t"
  447. "mtlo %[const_2_power_13], $ac3 \n\t"
  448. "mthi $zero, $ac3 \n\t"
  449. "madd $ac1, %[load1], %[cospi_22_64] \n\t"
  450. "msub $ac1, %[load2], %[cospi_10_64] \n\t"
  451. "extp %[result1], $ac1, 31 \n\t"
  452. "madd $ac3, %[load3], %[cospi_6_64] \n\t"
  453. "msub $ac3, %[load4], %[cospi_26_64] \n\t"
  454. "extp %[result2], $ac3, 31 \n\t"
  455. "mtlo %[const_2_power_13], $ac1 \n\t"
  456. "mthi $zero, $ac1 \n\t"
  457. "mtlo %[const_2_power_13], $ac2 \n\t"
  458. "mthi $zero, $ac2 \n\t"
  459. "madd $ac1, %[load1], %[cospi_10_64] \n\t"
  460. "madd $ac1, %[load2], %[cospi_22_64] \n\t"
  461. "extp %[result3], $ac1, 31 \n\t"
  462. "madd $ac2, %[load3], %[cospi_26_64] \n\t"
  463. "madd $ac2, %[load4], %[cospi_6_64] \n\t"
  464. "extp %[result4], $ac2, 31 \n\t"
  465. "mtlo %[const_2_power_13], $ac1 \n\t"
  466. "mthi $zero, $ac1 \n\t"
  467. "mtlo %[const_2_power_13], $ac3 \n\t"
  468. "mthi $zero, $ac3 \n\t"
  469. "sub %[load1], %[result2], %[result1] \n\t"
  470. "sub %[load2], %[result4], %[result3] \n\t"
  471. "msub $ac1, %[load1], %[cospi_24_64] \n\t"
  472. "msub $ac1, %[load2], %[cospi_8_64] \n\t"
  473. "madd $ac3, %[load2], %[cospi_24_64] \n\t"
  474. "msub $ac3, %[load1], %[cospi_8_64] \n\t"
  475. "extp %[step2_10], $ac1, 31 \n\t"
  476. "extp %[step2_13], $ac3, 31 \n\t"
  477. "add %[step2_11], %[result1], %[result2] \n\t"
  478. "add %[step2_12], %[result4], %[result3] \n\t"
  479. : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
  480. [load4] "=&r"(load4), [result1] "=&r"(result1),
  481. [result2] "=&r"(result2), [result3] "=&r"(result3),
  482. [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
  483. [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
  484. [step2_13] "=r"(step2_13)
  485. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  486. [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
  487. [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
  488. [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
  489. __asm__ __volatile__(
  490. "lh %[load5], 4(%[input]) \n\t"
  491. "lh %[load6], 28(%[input]) \n\t"
  492. "lh %[load7], 20(%[input]) \n\t"
  493. "lh %[load8], 12(%[input]) \n\t"
  494. "mtlo %[const_2_power_13], $ac1 \n\t"
  495. "mthi $zero, $ac1 \n\t"
  496. "mtlo %[const_2_power_13], $ac3 \n\t"
  497. "mthi $zero, $ac3 \n\t"
  498. "madd $ac1, %[load5], %[cospi_28_64] \n\t"
  499. "msub $ac1, %[load6], %[cospi_4_64] \n\t"
  500. "extp %[result1], $ac1, 31 \n\t"
  501. "madd $ac3, %[load7], %[cospi_12_64] \n\t"
  502. "msub $ac3, %[load8], %[cospi_20_64] \n\t"
  503. "extp %[result2], $ac3, 31 \n\t"
  504. "mtlo %[const_2_power_13], $ac1 \n\t"
  505. "mthi $zero, $ac1 \n\t"
  506. "mtlo %[const_2_power_13], $ac2 \n\t"
  507. "mthi $zero, $ac2 \n\t"
  508. "madd $ac1, %[load7], %[cospi_20_64] \n\t"
  509. "madd $ac1, %[load8], %[cospi_12_64] \n\t"
  510. "extp %[result3], $ac1, 31 \n\t"
  511. "madd $ac2, %[load5], %[cospi_4_64] \n\t"
  512. "madd $ac2, %[load6], %[cospi_28_64] \n\t"
  513. "extp %[result4], $ac2, 31 \n\t"
  514. "mtlo %[const_2_power_13], $ac1 \n\t"
  515. "mthi $zero, $ac1 \n\t"
  516. "mtlo %[const_2_power_13], $ac3 \n\t"
  517. "mthi $zero, $ac3 \n\t"
  518. "sub %[load5], %[result4], %[result3] \n\t"
  519. "sub %[load5], %[load5], %[result1] \n\t"
  520. "add %[load5], %[load5], %[result2] \n\t"
  521. "sub %[load6], %[result1], %[result2] \n\t"
  522. "sub %[load6], %[load6], %[result3] \n\t"
  523. "add %[load6], %[load6], %[result4] \n\t"
  524. "madd $ac1, %[load5], %[cospi_16_64] \n\t"
  525. "madd $ac3, %[load6], %[cospi_16_64] \n\t"
  526. "extp %[step1_5], $ac1, 31 \n\t"
  527. "extp %[step1_6], $ac3, 31 \n\t"
  528. "add %[step1_4], %[result1], %[result2] \n\t"
  529. "add %[step1_7], %[result4], %[result3] \n\t"
  530. : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
  531. [load8] "=&r"(load8), [result1] "=&r"(result1),
  532. [result2] "=&r"(result2), [result3] "=&r"(result3),
  533. [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
  534. [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
  535. [step1_7] "=r"(step1_7)
  536. : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
  537. [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
  538. [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
  539. [cospi_16_64] "r"(cospi_16_64));
  540. __asm__ __volatile__(
  541. "mtlo %[const_2_power_13], $ac0 \n\t"
  542. "mthi $zero, $ac0 \n\t"
  543. "mtlo %[const_2_power_13], $ac1 \n\t"
  544. "mthi $zero, $ac1 \n\t"
  545. "sub %[load5], %[step2_14], %[step2_13] \n\t"
  546. "sub %[load5], %[load5], %[step2_9] \n\t"
  547. "add %[load5], %[load5], %[step2_10] \n\t"
  548. "madd $ac0, %[load5], %[cospi_16_64] \n\t"
  549. "sub %[load6], %[step2_14], %[step2_13] \n\t"
  550. "sub %[load6], %[load6], %[step2_10] \n\t"
  551. "add %[load6], %[load6], %[step2_9] \n\t"
  552. "madd $ac1, %[load6], %[cospi_16_64] \n\t"
  553. "mtlo %[const_2_power_13], $ac2 \n\t"
  554. "mthi $zero, $ac2 \n\t"
  555. "mtlo %[const_2_power_13], $ac3 \n\t"
  556. "mthi $zero, $ac3 \n\t"
  557. "sub %[load5], %[step2_15], %[step2_12] \n\t"
  558. "sub %[load5], %[load5], %[step2_8] \n\t"
  559. "add %[load5], %[load5], %[step2_11] \n\t"
  560. "madd $ac2, %[load5], %[cospi_16_64] \n\t"
  561. "sub %[load6], %[step2_15], %[step2_12] \n\t"
  562. "sub %[load6], %[load6], %[step2_11] \n\t"
  563. "add %[load6], %[load6], %[step2_8] \n\t"
  564. "madd $ac3, %[load6], %[cospi_16_64] \n\t"
  565. "extp %[step1_10], $ac0, 31 \n\t"
  566. "extp %[step1_13], $ac1, 31 \n\t"
  567. "extp %[step1_11], $ac2, 31 \n\t"
  568. "extp %[step1_12], $ac3, 31 \n\t"
  569. : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
  570. [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
  571. [step1_13] "=r"(step1_13)
  572. : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
  573. [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
  574. [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
  575. [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
  576. [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
  577. step1_8 = step2_8 + step2_11;
  578. step1_9 = step2_9 + step2_10;
  579. step1_14 = step2_13 + step2_14;
  580. step1_15 = step2_12 + step2_15;
  581. __asm__ __volatile__(
  582. "lbu %[load7], 0(%[dest_pix]) \n\t"
  583. "add %[load5], %[step1_0], %[step1_7] \n\t"
  584. "add %[load5], %[load5], %[step1_15] \n\t"
  585. "addi %[load5], %[load5], 32 \n\t"
  586. "sra %[load5], %[load5], 6 \n\t"
  587. "add %[load7], %[load7], %[load5] \n\t"
  588. "lbux %[load5], %[load7](%[cm]) \n\t"
  589. "add %[load6], %[step1_1], %[step1_6] \n\t"
  590. "add %[load6], %[load6], %[step1_14] \n\t"
  591. "sb %[load5], 0(%[dest_pix]) \n\t"
  592. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  593. "lbu %[load8], 0(%[dest_pix]) \n\t"
  594. "addi %[load6], %[load6], 32 \n\t"
  595. "sra %[load6], %[load6], 6 \n\t"
  596. "add %[load8], %[load8], %[load6] \n\t"
  597. "lbux %[load6], %[load8](%[cm]) \n\t"
  598. "sb %[load6], 0(%[dest_pix]) \n\t"
  599. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  600. "lbu %[load7], 0(%[dest_pix]) \n\t"
  601. "add %[load5], %[step1_2], %[step1_5] \n\t"
  602. "add %[load5], %[load5], %[step1_13] \n\t"
  603. "addi %[load5], %[load5], 32 \n\t"
  604. "sra %[load5], %[load5], 6 \n\t"
  605. "add %[load7], %[load7], %[load5] \n\t"
  606. "lbux %[load5], %[load7](%[cm]) \n\t"
  607. "add %[load6], %[step1_3], %[step1_4] \n\t"
  608. "add %[load6], %[load6], %[step1_12] \n\t"
  609. "sb %[load5], 0(%[dest_pix]) \n\t"
  610. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  611. "lbu %[load8], 0(%[dest_pix]) \n\t"
  612. "addi %[load6], %[load6], 32 \n\t"
  613. "sra %[load6], %[load6], 6 \n\t"
  614. "add %[load8], %[load8], %[load6] \n\t"
  615. "lbux %[load6], %[load8](%[cm]) \n\t"
  616. "sb %[load6], 0(%[dest_pix]) \n\t"
  617. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  618. "lbu %[load7], 0(%[dest_pix]) \n\t"
  619. "sub %[load5], %[step1_3], %[step1_4] \n\t"
  620. "add %[load5], %[load5], %[step1_11] \n\t"
  621. "addi %[load5], %[load5], 32 \n\t"
  622. "sra %[load5], %[load5], 6 \n\t"
  623. "add %[load7], %[load7], %[load5] \n\t"
  624. "lbux %[load5], %[load7](%[cm]) \n\t"
  625. "sub %[load6], %[step1_2], %[step1_5] \n\t"
  626. "add %[load6], %[load6], %[step1_10] \n\t"
  627. "sb %[load5], 0(%[dest_pix]) \n\t"
  628. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  629. "lbu %[load8], 0(%[dest_pix]) \n\t"
  630. "addi %[load6], %[load6], 32 \n\t"
  631. "sra %[load6], %[load6], 6 \n\t"
  632. "add %[load8], %[load8], %[load6] \n\t"
  633. "lbux %[load6], %[load8](%[cm]) \n\t"
  634. "sb %[load6], 0(%[dest_pix]) \n\t"
  635. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  636. "sub %[load5], %[step1_1], %[step1_6] \n\t"
  637. "lbu %[load7], 0(%[dest_pix]) \n\t"
  638. "add %[load5], %[load5], %[step1_9] \n\t"
  639. "addi %[load5], %[load5], 32 \n\t"
  640. "sra %[load5], %[load5], 6 \n\t"
  641. "add %[load7], %[load7], %[load5] \n\t"
  642. "lbux %[load5], %[load7](%[cm]) \n\t"
  643. "sub %[load6], %[step1_0], %[step1_7] \n\t"
  644. "add %[load6], %[load6], %[step1_8] \n\t"
  645. "sb %[load5], 0(%[dest_pix]) \n\t"
  646. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  647. "lbu %[load8], 0(%[dest_pix]) \n\t"
  648. "addi %[load6], %[load6], 32 \n\t"
  649. "sra %[load6], %[load6], 6 \n\t"
  650. "add %[load8], %[load8], %[load6] \n\t"
  651. "lbux %[load6], %[load8](%[cm]) \n\t"
  652. "sb %[load6], 0(%[dest_pix]) \n\t"
  653. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  654. "lbu %[load7], 0(%[dest_pix]) \n\t"
  655. "sub %[load5], %[step1_0], %[step1_7] \n\t"
  656. "sub %[load5], %[load5], %[step1_8] \n\t"
  657. "addi %[load5], %[load5], 32 \n\t"
  658. "sra %[load5], %[load5], 6 \n\t"
  659. "add %[load7], %[load7], %[load5] \n\t"
  660. "lbux %[load5], %[load7](%[cm]) \n\t"
  661. "sub %[load6], %[step1_1], %[step1_6] \n\t"
  662. "sub %[load6], %[load6], %[step1_9] \n\t"
  663. "sb %[load5], 0(%[dest_pix]) \n\t"
  664. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  665. "lbu %[load8], 0(%[dest_pix]) \n\t"
  666. "addi %[load6], %[load6], 32 \n\t"
  667. "sra %[load6], %[load6], 6 \n\t"
  668. "add %[load8], %[load8], %[load6] \n\t"
  669. "lbux %[load6], %[load8](%[cm]) \n\t"
  670. "sb %[load6], 0(%[dest_pix]) \n\t"
  671. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  672. "lbu %[load7], 0(%[dest_pix]) \n\t"
  673. "sub %[load5], %[step1_2], %[step1_5] \n\t"
  674. "sub %[load5], %[load5], %[step1_10] \n\t"
  675. "addi %[load5], %[load5], 32 \n\t"
  676. "sra %[load5], %[load5], 6 \n\t"
  677. "add %[load7], %[load7], %[load5] \n\t"
  678. "lbux %[load5], %[load7](%[cm]) \n\t"
  679. "sub %[load6], %[step1_3], %[step1_4] \n\t"
  680. "sub %[load6], %[load6], %[step1_11] \n\t"
  681. "sb %[load5], 0(%[dest_pix]) \n\t"
  682. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  683. "lbu %[load8], 0(%[dest_pix]) \n\t"
  684. "addi %[load6], %[load6], 32 \n\t"
  685. "sra %[load6], %[load6], 6 \n\t"
  686. "add %[load8], %[load8], %[load6] \n\t"
  687. "lbux %[load6], %[load8](%[cm]) \n\t"
  688. "sb %[load6], 0(%[dest_pix]) \n\t"
  689. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  690. "lbu %[load7], 0(%[dest_pix]) \n\t"
  691. "add %[load5], %[step1_3], %[step1_4] \n\t"
  692. "sub %[load5], %[load5], %[step1_12] \n\t"
  693. "addi %[load5], %[load5], 32 \n\t"
  694. "sra %[load5], %[load5], 6 \n\t"
  695. "add %[load7], %[load7], %[load5] \n\t"
  696. "lbux %[load5], %[load7](%[cm]) \n\t"
  697. "add %[load6], %[step1_2], %[step1_5] \n\t"
  698. "sub %[load6], %[load6], %[step1_13] \n\t"
  699. "sb %[load5], 0(%[dest_pix]) \n\t"
  700. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  701. "lbu %[load8], 0(%[dest_pix]) \n\t"
  702. "addi %[load6], %[load6], 32 \n\t"
  703. "sra %[load6], %[load6], 6 \n\t"
  704. "add %[load8], %[load8], %[load6] \n\t"
  705. "lbux %[load6], %[load8](%[cm]) \n\t"
  706. "sb %[load6], 0(%[dest_pix]) \n\t"
  707. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  708. "lbu %[load7], 0(%[dest_pix]) \n\t"
  709. "add %[load5], %[step1_1], %[step1_6] \n\t"
  710. "sub %[load5], %[load5], %[step1_14] \n\t"
  711. "addi %[load5], %[load5], 32 \n\t"
  712. "sra %[load5], %[load5], 6 \n\t"
  713. "add %[load7], %[load7], %[load5] \n\t"
  714. "lbux %[load5], %[load7](%[cm]) \n\t"
  715. "add %[load6], %[step1_0], %[step1_7] \n\t"
  716. "sub %[load6], %[load6], %[step1_15] \n\t"
  717. "sb %[load5], 0(%[dest_pix]) \n\t"
  718. "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
  719. "lbu %[load8], 0(%[dest_pix]) \n\t"
  720. "addi %[load6], %[load6], 32 \n\t"
  721. "sra %[load6], %[load6], 6 \n\t"
  722. "add %[load8], %[load8], %[load6] \n\t"
  723. "lbux %[load6], %[load8](%[cm]) \n\t"
  724. "sb %[load6], 0(%[dest_pix]) \n\t"
  725. : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
  726. [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
  727. :
  728. [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
  729. [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
  730. [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
  731. [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
  732. [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
  733. [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
  734. [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
  735. input += 16;
  736. }
  737. }
  738. void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
  739. int stride) {
  740. DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
  741. uint32_t pos = 45;
  742. /* bit positon for extract from acc */
  743. __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
  744. // First transform rows
  745. idct16_rows_dspr2(input, out, 16);
  746. // Then transform columns and add to dest
  747. idct16_cols_add_blk_dspr2(out, dest, stride);
  748. }
/* 16x16 inverse DCT for blocks with at most 10 non-zero coefficients,
 * all of which lie in the upper-left 4x4 corner of the input.  Only the
 * first 4 rows need a real row transform; the remaining columns of the
 * intermediate buffer are cleared to zero before the column pass. */
void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
                                int stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
  int16_t *outptr = out;
  uint32_t i;
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  idct16_rows_dspr2(input, outptr, 4);

  // Zero columns 4..15 of the 16x16 intermediate buffer.  Each `sw $zero`
  // clears one 32-bit word (two int16_t elements, i.e. two columns of one
  // row); successive offsets step by 32 bytes = one row of 16 int16_t.
  // So each asm block clears a 2-column x 16-row strip, and the loop runs
  // 6 times (outptr advancing 2 columns per iteration) to cover the
  // remaining 12 columns.
  outptr += 4;
  for (i = 0; i < 6; ++i) {
    __asm__ __volatile__(
        "sw $zero, 0(%[outptr]) \n\t"
        "sw $zero, 32(%[outptr]) \n\t"
        "sw $zero, 64(%[outptr]) \n\t"
        "sw $zero, 96(%[outptr]) \n\t"
        "sw $zero, 128(%[outptr]) \n\t"
        "sw $zero, 160(%[outptr]) \n\t"
        "sw $zero, 192(%[outptr]) \n\t"
        "sw $zero, 224(%[outptr]) \n\t"
        "sw $zero, 256(%[outptr]) \n\t"
        "sw $zero, 288(%[outptr]) \n\t"
        "sw $zero, 320(%[outptr]) \n\t"
        "sw $zero, 352(%[outptr]) \n\t"
        "sw $zero, 384(%[outptr]) \n\t"
        "sw $zero, 416(%[outptr]) \n\t"
        "sw $zero, 448(%[outptr]) \n\t"
        "sw $zero, 480(%[outptr]) \n\t"
        :
        : [outptr] "r"(outptr));

    outptr += 2;
  }

  // Then transform columns
  idct16_cols_add_blk_dspr2(out, dest, stride);
}
/* DC-only 16x16 inverse DCT: every AC coefficient is zero, so the inverse
 * transform reduces to adding one constant offset a1 to all 256 pixels of
 * the destination block, with per-byte saturation.  Rows are processed 16
 * bytes at a time via four 32-bit word accesses. */
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit positon for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  /* DC term scaled by the transform; the macro presumably applies the
   * round-shift through cospi_16_64 twice (once per pass) -- see its
   * definition in the shared idct headers. */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = ROUND_POWER_OF_TWO(out, 6): final rounding for the 16x16 size. */
  __asm__ __volatile__(
      "addi %[out], %[out], 32 \n\t"
      "sra %[a1], %[out], 6 \n\t"
      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  if (a1 < 0) {
    /* Negative offset: replicate |a1| into all four byte lanes and use
     * per-byte saturating subtraction (subu_s.qb) so pixels clamp at 0. */
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__(
        "abs %[absa1], %[a1] \n\t"
        "replv.qb %[vector_a1], %[absa1] \n\t"
        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r"(a1));

    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "add %[dest], %[dest], %[stride] \n\t"
          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    }
  } else if (a1 > 255) {
    /* replv.qb replicates only the low 8 bits, so an offset above 255
     * cannot be applied in one step.  Split a1 into two halves and apply
     * two saturating additions (assumes a1 <= 510 so each half fits in a
     * byte -- TODO(review): confirm against coefficient range). */
    int32_t a11, a12, vector_a11, vector_a12;

    /* use quad-byte
     * input and output memory are four byte aligned */
    a11 = a1 >> 1;
    a12 = a1 - a11;
    __asm__ __volatile__(
        "replv.qb %[vector_a11], %[a11] \n\t"
        "replv.qb %[vector_a12], %[a12] \n\t"
        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
        : [a11] "r"(a11), [a12] "r"(a12));

    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
          "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
          "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
          "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
          "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
          "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
          "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
          "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "add %[dest], %[dest], %[stride] \n\t"
          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
            [vector_a12] "r"(vector_a12));
    }
  } else {
    /* Common case, 0 <= a1 <= 255: replicate a1 into all byte lanes and
     * add with per-byte saturation (clamps at 255). */
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "add %[dest], %[dest], %[stride] \n\t"
          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    }
  }
}
  900. void iadst16_dspr2(const int16_t *input, int16_t *output) {
  901. int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
  902. int x0 = input[15];
  903. int x1 = input[0];
  904. int x2 = input[13];
  905. int x3 = input[2];
  906. int x4 = input[11];
  907. int x5 = input[4];
  908. int x6 = input[9];
  909. int x7 = input[6];
  910. int x8 = input[7];
  911. int x9 = input[8];
  912. int x10 = input[5];
  913. int x11 = input[10];
  914. int x12 = input[3];
  915. int x13 = input[12];
  916. int x14 = input[1];
  917. int x15 = input[14];
  918. if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
  919. x13 | x14 | x15)) {
  920. output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
  921. output[6] = output[7] = output[8] = output[9] = output[10] =
  922. output[11] = output[12] = output[13] = output[14] = output[15] = 0;
  923. return;
  924. }
  925. // stage 1
  926. s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  927. s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  928. s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  929. s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  930. s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  931. s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  932. s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  933. s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  934. s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  935. s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  936. s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  937. s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  938. s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  939. s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  940. s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  941. s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
  942. x0 = dct_const_round_shift(s0 + s8);
  943. x1 = dct_const_round_shift(s1 + s9);
  944. x2 = dct_const_round_shift(s2 + s10);
  945. x3 = dct_const_round_shift(s3 + s11);
  946. x4 = dct_const_round_shift(s4 + s12);
  947. x5 = dct_const_round_shift(s5 + s13);
  948. x6 = dct_const_round_shift(s6 + s14);
  949. x7 = dct_const_round_shift(s7 + s15);
  950. x8 = dct_const_round_shift(s0 - s8);
  951. x9 = dct_const_round_shift(s1 - s9);
  952. x10 = dct_const_round_shift(s2 - s10);
  953. x11 = dct_const_round_shift(s3 - s11);
  954. x12 = dct_const_round_shift(s4 - s12);
  955. x13 = dct_const_round_shift(s5 - s13);
  956. x14 = dct_const_round_shift(s6 - s14);
  957. x15 = dct_const_round_shift(s7 - s15);
  958. // stage 2
  959. s0 = x0;
  960. s1 = x1;
  961. s2 = x2;
  962. s3 = x3;
  963. s4 = x4;
  964. s5 = x5;
  965. s6 = x6;
  966. s7 = x7;
  967. s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  968. s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  969. s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  970. s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  971. s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  972. s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  973. s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  974. s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
  975. x0 = s0 + s4;
  976. x1 = s1 + s5;
  977. x2 = s2 + s6;
  978. x3 = s3 + s7;
  979. x4 = s0 - s4;
  980. x5 = s1 - s5;
  981. x6 = s2 - s6;
  982. x7 = s3 - s7;
  983. x8 = dct_const_round_shift(s8 + s12);
  984. x9 = dct_const_round_shift(s9 + s13);
  985. x10 = dct_const_round_shift(s10 + s14);
  986. x11 = dct_const_round_shift(s11 + s15);
  987. x12 = dct_const_round_shift(s8 - s12);
  988. x13 = dct_const_round_shift(s9 - s13);
  989. x14 = dct_const_round_shift(s10 - s14);
  990. x15 = dct_const_round_shift(s11 - s15);
  991. // stage 3
  992. s0 = x0;
  993. s1 = x1;
  994. s2 = x2;
  995. s3 = x3;
  996. s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  997. s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  998. s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  999. s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  1000. s8 = x8;
  1001. s9 = x9;
  1002. s10 = x10;
  1003. s11 = x11;
  1004. s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  1005. s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  1006. s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  1007. s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
  1008. x0 = s0 + s2;
  1009. x1 = s1 + s3;
  1010. x2 = s0 - s2;
  1011. x3 = s1 - s3;
  1012. x4 = dct_const_round_shift(s4 + s6);
  1013. x5 = dct_const_round_shift(s5 + s7);
  1014. x6 = dct_const_round_shift(s4 - s6);
  1015. x7 = dct_const_round_shift(s5 - s7);
  1016. x8 = s8 + s10;
  1017. x9 = s9 + s11;
  1018. x10 = s8 - s10;
  1019. x11 = s9 - s11;
  1020. x12 = dct_const_round_shift(s12 + s14);
  1021. x13 = dct_const_round_shift(s13 + s15);
  1022. x14 = dct_const_round_shift(s12 - s14);
  1023. x15 = dct_const_round_shift(s13 - s15);
  1024. // stage 4
  1025. s2 = (-cospi_16_64) * (x2 + x3);
  1026. s3 = cospi_16_64 * (x2 - x3);
  1027. s6 = cospi_16_64 * (x6 + x7);
  1028. s7 = cospi_16_64 * (-x6 + x7);
  1029. s10 = cospi_16_64 * (x10 + x11);
  1030. s11 = cospi_16_64 * (-x10 + x11);
  1031. s14 = (-cospi_16_64) * (x14 + x15);
  1032. s15 = cospi_16_64 * (x14 - x15);
  1033. x2 = dct_const_round_shift(s2);
  1034. x3 = dct_const_round_shift(s3);
  1035. x6 = dct_const_round_shift(s6);
  1036. x7 = dct_const_round_shift(s7);
  1037. x10 = dct_const_round_shift(s10);
  1038. x11 = dct_const_round_shift(s11);
  1039. x14 = dct_const_round_shift(s14);
  1040. x15 = dct_const_round_shift(s15);
  1041. output[0] = x0;
  1042. output[1] = -x8;
  1043. output[2] = x12;
  1044. output[3] = -x4;
  1045. output[4] = x6;
  1046. output[5] = x14;
  1047. output[6] = x10;
  1048. output[7] = x2;
  1049. output[8] = x3;
  1050. output[9] = x11;
  1051. output[10] = x15;
  1052. output[11] = x7;
  1053. output[12] = x5;
  1054. output[13] = -x13;
  1055. output[14] = x9;
  1056. output[15] = -x1;
  1057. }
  1058. #endif // HAVE_DSPR2