/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* 2-tap (bilinear) horizontal convolution of 4 pixels per input row, stored
 * transposed: each input row becomes one output column (note the stores step
 * by dst_stride and the outer loop advances dst by 1).
 *
 * src/src_stride: input pixels / byte stride per row.
 * dst/dst_stride: transposed output / byte stride per output row.
 * filter_x0: 8-tap filter array; only taps [3] and [4] are used (bilinear).
 * h: number of input rows to process.
 *
 * NOTE(review): relies on the accumulator extract position programmed by the
 * wrdsp in vpx_convolve2_dspr2 before this is called.
 */
static void convolve_bi_horiz_4_transposed_dspr2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint8_t *dst_ptr;
  int32_t Temp1, Temp2;
  uint32_t vector4a = 64; /* rounding term; presumably 1 << (FILTER_BITS - 1) — see vpx_filter.h */
  uint32_t tp1, tp2;
  uint32_t p1, p2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Pack taps 3 and 4 into one 32-bit word for dpa.w.ph.
   * NOTE(review): int16_t* -> int32_t* load assumes suitable alignment and
   * tolerated type punning on this target — matches upstream usage. */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    dst_ptr = dst;
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])                      \n\t"
        "ulw              %[tp2],      4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],    $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "balign           %[tp2],      %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
        "extp             %[Temp2],    $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],    $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp2](%[cm])                \n\t"
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
        "extp             %[Temp2],    $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"

        /* store bytes (transposed: column-wise through dst_stride) */
        "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
        "sb               %[p1],       0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
        "sb               %[tp2],      0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
        "sb               %[p2],       0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [src] "r"(src), [dst_stride] "r"(dst_stride));

    /* Next row... (dst advances by one byte: next output column) */
    src += src_stride;
    dst += 1;
  }
}
/* 2-tap (bilinear) horizontal convolution of 8 pixels per input row, stored
 * transposed. Even outputs are written through dst_ptr and odd outputs
 * through odd_dst (offset by one dst_stride), both advancing by 2*dst_stride
 * so the interleaved stores land in order.
 *
 * Parameters as in convolve_bi_horiz_4_transposed_dspr2; only filter taps
 * [3] and [4] of filter_x0 are used.
 */
static void convolve_bi_horiz_8_transposed_dspr2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint8_t *dst_ptr;
  uint32_t vector4a = 64; /* rounding term; presumably 1 << (FILTER_BITS - 1) */
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4;
  uint8_t *odd_dst;
  uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Pack taps 3 and 4 into one 32-bit word for dpa.w.ph (see note in the
   * 4-wide variant about the pointer cast). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);

    dst_ptr = dst;
    odd_dst = (dst_ptr + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])                       \n\t"
        "ulw              %[tp2],      4(%[src])                       \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                            \n\t"
        "mthi             $zero,       $ac3                            \n\t"
        "mtlo             %[vector4a], $ac2                            \n\t"
        "mthi             $zero,       $ac2                            \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                          \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                          \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                          \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                          \n\t"
        "ulw              %[tp3],      8(%[src])                       \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[filter45]     \n\t"
        "extp             %[Temp1],    $ac3,           31              \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,        %[p2],          %[filter45]     \n\t"
        "extp             %[Temp3],    $ac2,           31              \n\t"

        /* even 3. pixel */
        "lbux             %[Temp2],    %[Temp1](%[cm])                 \n\t"
        "mtlo             %[vector4a], $ac1                            \n\t"
        "mthi             $zero,       $ac1                            \n\t"
        "balign           %[tp3],      %[tp2],         3               \n\t"
        "balign           %[tp2],      %[tp1],         3               \n\t"
        "dpa.w.ph         $ac1,        %[p3],          %[filter45]     \n\t"
        "lbux             %[tp1],      %[Temp3](%[cm])                 \n\t"
        "extp             %[p3],       $ac1,           31              \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                            \n\t"
        "mthi             $zero,       $ac2                            \n\t"
        "mtlo             %[vector4a], $ac3                            \n\t"
        "mthi             $zero,       $ac3                            \n\t"
        "sb               %[Temp2],    0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "sb               %[tp1],      0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "dpa.w.ph         $ac2,        %[p4],          %[filter45]     \n\t"
        "extp             %[Temp3],    $ac2,           31              \n\t"
        "lbux             %[Temp1],    %[p3](%[cm])                    \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                            \n\t"
        "mthi             $zero,       $ac1                            \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                          \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                          \n\t"
        "preceu.ph.qbr    %[p3],       %[tp3]                          \n\t"
        "preceu.ph.qbl    %[p4],       %[tp3]                          \n\t"
        "sb               %[Temp1],    0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[filter45]     \n\t"
        "extp             %[Temp2],    $ac3,           31              \n\t"

        /* odd 2. pixel */
        "lbux             %[tp1],      %[Temp3](%[cm])                 \n\t"
        "mtlo             %[vector4a], $ac3                            \n\t"
        "mthi             $zero,       $ac3                            \n\t"
        "mtlo             %[vector4a], $ac2                            \n\t"
        "mthi             $zero,       $ac2                            \n\t"
        "dpa.w.ph         $ac1,        %[p2],          %[filter45]     \n\t"
        "sb               %[tp1],      0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],  %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "extp             %[Temp3],    $ac1,           31              \n\t"

        /* odd 3. pixel */
        "lbux             %[tp3],      %[Temp2](%[cm])                 \n\t"
        "dpa.w.ph         $ac3,        %[p3],          %[filter45]     \n\t"
        "extp             %[Temp2],    $ac3,           31              \n\t"

        /* odd 4. pixel */
        "sb               %[tp3],      0(%[odd_dst])                   \n\t"
        "addu             %[odd_dst],  %[odd_dst],     %[dst_pitch_2]  \n\t"
        "dpa.w.ph         $ac2,        %[p4],          %[filter45]     \n\t"
        "extp             %[Temp1],    $ac2,           31              \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])                 \n\t"
        "lbux             %[p2],       %[Temp2](%[cm])                 \n\t"
        "lbux             %[p1],       %[Temp1](%[cm])                 \n\t"

        /* store bytes */
        "sb               %[p4],       0(%[odd_dst])                   \n\t"
        "addu             %[odd_dst],  %[odd_dst],     %[dst_pitch_2]  \n\t"
        "sb               %[p2],       0(%[odd_dst])                   \n\t"
        "addu             %[odd_dst],  %[odd_dst],     %[dst_pitch_2]  \n\t"
        "sb               %[p1],       0(%[odd_dst])                   \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
          [odd_dst] "+r"(odd_dst)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));

    /* Next row... (dst advances by one byte: next output column) */
    src += src_stride;
    dst += 1;
  }
}
/* 2-tap (bilinear) horizontal convolution, transposed store, in 16-pixel
 * groups per input row; `count` groups are processed (count = w / 16, so this
 * serves both w==16 and w==32). Each asm pass computes 8 even outputs
 * (written through `dst`) and 8 odd outputs (written through `odd_dst`),
 * each stream stepping by 2*dst_stride to interleave.
 *
 * Parameters as in the 4-wide variant; only taps [3] and [4] of filter_x0
 * are used.
 */
static void convolve_bi_horiz_16_transposed_dspr2(
    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
  int32_t c, y;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;      /* rounding term; presumably 1 << (FILTER_BITS - 1) */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
  uint8_t *odd_dst;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Pack taps 3 and 4 into one 32-bit word for dpa.w.ph (see note in the
   * 4-wide variant about the pointer cast). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);

    src = src_ptr;
    dst = dst_ptr;
    odd_dst = (dst + dst_stride);

    for (c = 0; c < count; c++) {
      /* Software-pipelined: each accumulator is reloaded for pixel N+2 while
       * pixel N's result is still being clamped/stored; the trailing comments
       * (/ * even k * /, / * odd k * /) tag which output each line belongs to. */
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])                       \n\t"
          "ulw              %[qload2],    4(%[src])                       \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* even 1 */
          "mthi             $zero,        $ac1                            \n\t"
          "mtlo             %[vector_64], $ac2                            \n\t" /* even 2 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                       \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                       \n\t"
          "ulw              %[qload1],    8(%[src])                       \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]     \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* even 3 */
          "mthi             $zero,        $ac3                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p5],        %[qload1]                       \n\t"
          "ulw              %[qload2],    12(%[src])                      \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[filter45]     \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* even 4 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbr    %[p2],        %[qload2]                       \n\t"
          "sb               %[st1],       0(%[dst])                       \n\t" /* even 1 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,         %[p3],          %[filter45]     \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* even 5 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbl    %[p3],        %[qload2]                       \n\t"
          "sb               %[st2],       0(%[dst])                       \n\t" /* even 2 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,         %[p4],          %[filter45]     \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* even 6 */
          "mthi             $zero,        $ac3                            \n\t"
          "sb               %[st3],       0(%[dst])                       \n\t" /* even 3 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* even 7 */
          "mthi             $zero,        $ac1                            \n\t"
          "sb               %[st1],       0(%[dst])                       \n\t" /* even 4 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload1],    20(%[src])                      \n\t"
          "dpa.w.ph         $ac3,         %[p5],          %[filter45]     \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* even 8 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbr    %[p5],        %[qload1]                       \n\t"
          "sb               %[st2],       0(%[dst])                       \n\t" /* even 5 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                            \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]     \n\t" /* even 8 */
          "sb               %[st3],       0(%[dst])                       \n\t" /* even 6 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "extp             %[Temp2],     $ac2,           31              \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* even 7 */

          /* ODD pixels: reload from src+1 so the same pairing covers the
           * odd-phase taps. */
          "ulw              %[qload1],    1(%[src])                       \n\t"
          "ulw              %[qload2],    5(%[src])                       \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                       \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                       \n\t"
          "sb               %[st1],       0(%[dst])                       \n\t" /* even 7 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload2],    9(%[src])                       \n\t"
          "dpa.w.ph         $ac3,         %[p1],          %[filter45]     \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p5],        %[qload2]                       \n\t"
          "sb               %[st2],       0(%[dst])                       \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                      \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                            \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                       \n\t"
          "sb               %[st3],       0(%[odd_dst])                   \n\t" /* odd 1 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]     \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                       \n\t"
          "sb               %[st1],       0(%[odd_dst])                   \n\t" /* odd 2 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,         %[p4],          %[filter45]     \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                            \n\t"
          "sb               %[st2],       0(%[odd_dst])                   \n\t" /* odd 3 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]     \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                            \n\t"
          "sb               %[st3],       0(%[odd_dst])                   \n\t" /* odd 4 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "ulw              %[qload1],    21(%[src])                      \n\t"
          "dpa.w.ph         $ac2,         %[p5],          %[filter45]     \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbr    %[p5],        %[qload1]                       \n\t"
          "sb               %[st1],       0(%[odd_dst])                   \n\t" /* odd 5 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],          %[filter45]     \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* odd 8 */

          "sb               %[st2],       0(%[odd_dst])                   \n\t" /* odd 6 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "sb               %[st3],       0(%[odd_dst])                   \n\t" /* odd 7 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "sb               %[st1],       0(%[odd_dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));

      src += 16;
      /* Re-anchor dst for the next 16-pixel group of the transposed output. */
      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
      odd_dst = (dst + dst_stride);
    }

    /* Next row... (dst advances by one byte: next output column) */
    src_ptr += src_stride;
    dst_ptr += 1;
  }
}
/* 2-tap (bilinear) horizontal convolution for a 64-wide block, transposed
 * store. Identical inner asm pass to the 16-wide variant, but the group
 * count is fixed at 4 (4 * 16 = 64) and an extra prefetch at +64 covers the
 * wider row.
 *
 * Parameters as in the 4-wide variant; only taps [3] and [4] of filter_x0
 * are used.
 */
static void convolve_bi_horiz_64_transposed_dspr2(
    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
    int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
  int32_t c, y;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;      /* rounding term; presumably 1 << (FILTER_BITS - 1) */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
  uint8_t *odd_dst;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Pack taps 3 and 4 into one 32-bit word for dpa.w.ph (see note in the
   * 4-wide variant about the pointer cast). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);

    src = src_ptr;
    dst = dst_ptr;
    odd_dst = (dst + dst_stride);

    for (c = 0; c < 4; c++) {
      /* Software-pipelined 16-output pass: 8 even outputs through `dst`,
       * 8 odd outputs through `odd_dst`; trailing comments tag which output
       * each instruction belongs to. */
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])                       \n\t"
          "ulw              %[qload2],    4(%[src])                       \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* even 1 */
          "mthi             $zero,        $ac1                            \n\t"
          "mtlo             %[vector_64], $ac2                            \n\t" /* even 2 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                       \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                       \n\t"
          "ulw              %[qload1],    8(%[src])                       \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]     \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* even 3 */
          "mthi             $zero,        $ac3                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p5],        %[qload1]                       \n\t"
          "ulw              %[qload2],    12(%[src])                      \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[filter45]     \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* even 4 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbr    %[p2],        %[qload2]                       \n\t"
          "sb               %[st1],       0(%[dst])                       \n\t" /* even 1 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,         %[p3],          %[filter45]     \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* even 5 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbl    %[p3],        %[qload2]                       \n\t"
          "sb               %[st2],       0(%[dst])                       \n\t" /* even 2 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,         %[p4],          %[filter45]     \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* even 6 */
          "mthi             $zero,        $ac3                            \n\t"
          "sb               %[st3],       0(%[dst])                       \n\t" /* even 3 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* even 7 */
          "mthi             $zero,        $ac1                            \n\t"
          "sb               %[st1],       0(%[dst])                       \n\t" /* even 4 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload1],    20(%[src])                      \n\t"
          "dpa.w.ph         $ac3,         %[p5],          %[filter45]     \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* even 8 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbr    %[p5],        %[qload1]                       \n\t"
          "sb               %[st2],       0(%[dst])                       \n\t" /* even 5 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                            \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]     \n\t" /* even 8 */
          "sb               %[st3],       0(%[dst])                       \n\t" /* even 6 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "extp             %[Temp2],     $ac2,           31              \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* even 7 */

          /* ODD pixels: reload from src+1 so the same pairing covers the
           * odd-phase taps. */
          "ulw              %[qload1],    1(%[src])                       \n\t"
          "ulw              %[qload2],    5(%[src])                       \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                       \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                       \n\t"
          "sb               %[st1],       0(%[dst])                       \n\t" /* even 7 */
          "addu             %[dst],       %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload2],    9(%[src])                       \n\t"
          "dpa.w.ph         $ac3,         %[p1],          %[filter45]     \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                            \n\t"
          "preceu.ph.qbr    %[p1],        %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p5],        %[qload2]                       \n\t"
          "sb               %[st2],       0(%[dst])                       \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                      \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                            \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                       \n\t"
          "sb               %[st3],       0(%[odd_dst])                   \n\t" /* odd 1 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]     \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                       \n\t"
          "sb               %[st1],       0(%[odd_dst])                   \n\t" /* odd 2 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,         %[p4],          %[filter45]     \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                            \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                            \n\t"
          "sb               %[st2],       0(%[odd_dst])                   \n\t" /* odd 3 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]     \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                            \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                            \n\t"
          "sb               %[st3],       0(%[odd_dst])                   \n\t" /* odd 4 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "ulw              %[qload1],    21(%[src])                      \n\t"
          "dpa.w.ph         $ac2,         %[p5],          %[filter45]     \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,           31              \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                            \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                            \n\t"
          "preceu.ph.qbr    %[p5],        %[qload1]                       \n\t"
          "sb               %[st1],       0(%[odd_dst])                   \n\t" /* odd 5 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,           31              \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],          %[filter45]     \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,           31              \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])                 \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])                 \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])                 \n\t" /* odd 8 */

          "sb               %[st2],       0(%[odd_dst])                   \n\t" /* odd 6 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "sb               %[st3],       0(%[odd_dst])                   \n\t" /* odd 7 */
          "addu             %[odd_dst],   %[odd_dst],     %[dst_pitch_2]  \n\t"
          "sb               %[st1],       0(%[odd_dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));

      src += 16;
      /* Re-anchor dst for the next 16-pixel group of the transposed output. */
      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
      odd_dst = (dst + dst_stride);
    }

    /* Next row... (dst advances by one byte: next output column) */
    src_ptr += src_stride;
    dst_ptr += 1;
  }
}
  870. void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
  871. uint8_t *dst, ptrdiff_t dst_stride,
  872. const int16_t *filter, int w, int h) {
  873. int x, y;
  874. for (y = 0; y < h; ++y) {
  875. for (x = 0; x < w; ++x) {
  876. int sum = 0;
  877. sum += src[x] * filter[3];
  878. sum += src[x + 1] * filter[4];
  879. dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
  880. }
  881. src += src_stride;
  882. dst += 1;
  883. }
  884. }
  885. void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
  886. ptrdiff_t dst_stride, const int16_t *filter, int w,
  887. int h) {
  888. uint32_t pos = 38;
  889. /* bit positon for extract from acc */
  890. __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
  891. :
  892. : [pos] "r"(pos));
  893. /* prefetch data to cache memory */
  894. prefetch_load(src);
  895. prefetch_load(src + 32);
  896. switch (w) {
  897. case 4:
  898. convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
  899. filter, h);
  900. break;
  901. case 8:
  902. convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
  903. filter, h);
  904. break;
  905. case 16:
  906. case 32:
  907. convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
  908. filter, h, (w / 16));
  909. break;
  910. case 64:
  911. prefetch_load(src + 32);
  912. convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
  913. filter, h);
  914. break;
  915. default:
  916. convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
  917. h);
  918. break;
  919. }
  920. }
  921. #endif