/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
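/*
 * 2-tap (bilinear) horizontal convolution for MIPS DSPr2.  Only taps 3 and
 * 4 of the 8-tap kernel are non-zero here.  As a scalar reference sketch
 * (assuming the usual 7-bit filter precision of vpx_dsp, FILTER_BITS == 7),
 * each output pixel is:
 *
 *   dst[x] = clip_uint8((src[x] * filter[3] + src[x + 1] * filter[4] + 64)
 *                       >> 7);
 *
 * The rounding constant 64 is preloaded into the accumulator via mtlo, the
 * ">> 7" falls out of "extp" once vpx_convolve2_horiz_dspr2() has programmed
 * the extract position (wrdsp, pos = 38), and the clip is an "lbux" lookup
 * into the vpx_ff_cropTbl clamping table.  Even-numbered outputs come from
 * word loads of the row; odd-numbered outputs reuse the same data realigned
 * by one byte ("balign" below, or the ulw loads at offset 1 in the wider
 * kernels).
 */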
static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2;
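  /* The two active taps, filter_x0[3] and filter_x0[4], are adjacent
   * int16_t values, so they are read below as one 32-bit word; dpa.w.ph
   * then multiplies a pair of pixels by the pair of taps in a single
   * instruction. */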
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],       %[filter45]   \n\t"
        "extp             %[Temp1],    $ac3,        31            \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "balign           %[tp2],      %[tp1],      3             \n\t"
        "dpa.w.ph         $ac2,        %[p2],       %[filter45]   \n\t"
        "extp             %[Temp3],    $ac2,        31            \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],       %[filter45]   \n\t"
        "extp             %[Temp2],    $ac3,        31            \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])            \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "dpa.w.ph         $ac2,        %[p2],       %[filter45]   \n\t"
        "extp             %[Temp4],    $ac2,        31            \n\t"

        /* clamp */
        "lbux             %[p1],       %[Temp2](%[cm])            \n\t"
        "lbux             %[p2],       %[Temp4](%[cm])            \n\t"

        /* store bytes */
        "sb               %[tp1],      0(%[dst])                  \n\t"
        "sb               %[p1],       1(%[dst])                  \n\t"
        "sb               %[tp2],      2(%[dst])                  \n\t"
        "sb               %[p2],       3(%[dst])                  \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
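
/*
 * 8-pixel-wide rows.  Same computation as the 4-wide kernel above, but
 * three accumulators are interleaved so that the extp/lbux/sb of one
 * output overlaps the dpa.w.ph of the next.
 */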
static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                     \n\t"
        "ulw              %[tp3],      8(%[src])                  \n\t"
        "dpa.w.ph         $ac3,        %[p1],       %[filter45]   \n\t"
        "extp             %[Temp1],    $ac3,        31            \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,        %[p2],       %[filter45]   \n\t"
        "extp             %[Temp3],    $ac2,        31            \n\t"

        /* even 3. pixel */
        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "dpa.w.ph         $ac1,        %[p3],       %[filter45]   \n\t"
        "extp             %[Temp1],    $ac1,        31            \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "sb               %[st0],      0(%[dst])                  \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])            \n\t"

        "balign           %[tp3],      %[tp2],      3             \n\t"
        "balign           %[tp2],      %[tp1],      3             \n\t"

        "dpa.w.ph         $ac2,        %[p4],       %[filter45]   \n\t"
        "extp             %[Temp3],    $ac2,        31            \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "sb               %[st1],      2(%[dst])                  \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp3]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp3]                     \n\t"
        "sb               %[st0],      4(%[dst])                  \n\t"
        "dpa.w.ph         $ac3,        %[p1],       %[filter45]   \n\t"
        "extp             %[Temp2],    $ac3,        31            \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p2],       %[filter45]   \n\t"
        "extp             %[Temp3],    $ac1,        31            \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])            \n\t"
        "dpa.w.ph         $ac3,        %[p3],       %[filter45]   \n\t"
        "extp             %[Temp2],    $ac3,        31            \n\t"

        /* odd 4. pixel */
        "sb               %[st1],      1(%[dst])                  \n\t"
        "sb               %[st0],      6(%[dst])                  \n\t"
        "dpa.w.ph         $ac2,        %[p4],       %[filter45]   \n\t"
        "extp             %[Temp1],    $ac2,        31            \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])            \n\t"
        "lbux             %[p2],       %[Temp2](%[cm])            \n\t"
        "lbux             %[p1],       %[Temp1](%[cm])            \n\t"

        /* store bytes */
        "sb               %[p4],       3(%[dst])                  \n\t"
        "sb               %[p2],       5(%[dst])                  \n\t"
        "sb               %[p1],       7(%[dst])                  \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
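
/*
 * 16-pixel-wide rows.  "count" is the number of 16-pixel blocks per row
 * (1 for w == 16, 2 for w == 32), so one kernel serves both widths.
 */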
static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride, uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h,
                                       int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
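      /* Each pass filters 16 output pixels: the eight even outputs come
       * from the word loads at source offsets 0/4/8/12, the eight odd
       * outputs from the same data re-loaded one byte later (1/5/9/13). */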
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st1],       0(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[st2],       2(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "sb               %[st3],       4(%[dst])                  \n\t" /* even 3 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "sb               %[st1],       6(%[dst])                  \n\t" /* even 4 */
          "dpa.w.ph         $ac3,         %[p5],        %[filter45]  \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "sb               %[st2],       8(%[dst])                  \n\t" /* even 5 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter45]  \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "dpa.w.ph         $ac2,         %[p3],        %[filter45]  \n\t" /* even 8 */
          "sb               %[st3],       10(%[dst])                 \n\t" /* even 6 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st1],       12(%[dst])                 \n\t" /* even 7 */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],        %[filter45]  \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[st2],       14(%[dst])                 \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],        %[filter45]  \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st3],       1(%[dst])                  \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter45]  \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[st1],       3(%[dst])                  \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "sb               %[st2],       5(%[dst])                  \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "sb               %[st3],       7(%[dst])                  \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "sb               %[st1],       9(%[dst])                  \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],        %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 8 */

          "sb               %[st2],       11(%[dst])                 \n\t" /* odd 6 */
          "sb               %[st3],       13(%[dst])                 \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                 \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
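
/*
 * 64-pixel-wide rows: the 16-wide inner block unrolled as four passes per
 * row, with additional prefetching for the longer rows.
 */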
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride, uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
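      /* Same 16-pixel block as in convolve_bi_horiz_16_dspr2(); four
       * passes cover the 64-pixel row. */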
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st1],       0(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[st2],       2(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "sb               %[st3],       4(%[dst])                  \n\t" /* even 3 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "sb               %[st1],       6(%[dst])                  \n\t" /* even 4 */
          "dpa.w.ph         $ac3,         %[p5],        %[filter45]  \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "sb               %[st2],       8(%[dst])                  \n\t" /* even 5 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter45]  \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "dpa.w.ph         $ac2,         %[p3],        %[filter45]  \n\t" /* even 8 */
          "sb               %[st3],       10(%[dst])                 \n\t" /* even 6 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st1],       12(%[dst])                 \n\t" /* even 7 */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],        %[filter45]  \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[st2],       14(%[dst])                 \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],        %[filter45]  \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st3],       1(%[dst])                  \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter45]  \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[st1],       3(%[dst])                  \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "sb               %[st2],       5(%[dst])                  \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "sb               %[st3],       7(%[dst])                  \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,         31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "sb               %[st1],       9(%[dst])                  \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,         31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],        %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,         31           \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 8 */

          "sb               %[st2],       11(%[dst])                 \n\t" /* odd 6 */
          "sb               %[st3],       13(%[dst])                 \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                 \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
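
/*
 * Entry point for the 2-tap horizontal predictor.  The wrdsp below sets the
 * accumulator extract position to 38, so that "extp ..., 31" in the kernels
 * returns bits 38..7 of the accumulator, i.e. (acc >> 7), matching the
 * 7-bit filter precision.  Widths other than 4/8/16/32/64 fall back to the
 * generic C 8-tap path.
 */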
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int32_t x_step_q4, int y0_q4, int y_step_q4,
                               int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    case 8:
      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    case 16:
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filter_x, (int32_t)h, 1);
      break;
    case 32:
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filter_x, (int32_t)h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    default:
      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
}
#endif