/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3;
  uint32_t tn1, tn2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"
        "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"
        "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p3], %[tp2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"
        "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
        "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
        "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
        "extp %[Temp4], $ac2, 31 \n\t"
        "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
        "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
        "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
        "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
        "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
        "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
        "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
        "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
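
/* Illustrative reference (not part of the original file, kept compiled out):
 * a minimal scalar sketch of what the DSPr2 routine above computes, assuming
 * the usual 2-tap path where only filter_x0[3] and filter_x0[4] are non-zero.
 * The helper name convolve_bi_avg_horiz_ref is hypothetical; clip_pixel() and
 * ROUND_POWER_OF_TWO() come from vpx_dsp_common.h, already included above. */
#if 0
static void convolve_bi_avg_horiz_ref(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t w,
                                      int32_t h) {
  int32_t x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      /* 2-tap horizontal filter with FILTER_BITS == 7 rounding
       * (the asm adds 64 to the accumulator and extracts bits >> 7) */
      const int sum = src[x] * filter_x0[3] + src[x + 1] * filter_x0[4];
      const uint8_t res = clip_pixel(ROUND_POWER_OF_TWO(sum, 7));
      /* rounding average with the existing dst pixel, as addqh_r.w does */
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + res, 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
#endif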

static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3, tp4;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "ulw %[tp3], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"
        "lbu %[Temp2], 0(%[dst]) \n\t"
        "lbu %[tp4], 2(%[dst]) \n\t"

        /* even 2. pixel */
        "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* even 3. pixel */
        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "lbux %[st1], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
        "extp %[Temp1], $ac1, 31 \n\t"
        "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
        "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
        "sb %[Temp2], 0(%[dst]) \n\t"
        "sb %[tp4], 2(%[dst]) \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "balign %[tp3], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"
        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "lbu %[Temp2], 4(%[dst]) \n\t"
        "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
        "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* odd 1. pixel */
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "sb %[Temp2], 4(%[dst]) \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tp3] \n\t"
        "preceu.ph.qbl %[p4], %[tp3] \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"
        "lbu %[tp1], 6(%[dst]) \n\t"

        /* odd 2. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "lbux %[st0], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
        "extp %[Temp3], $ac1, 31 \n\t"
        "lbu %[tp2], 1(%[dst]) \n\t"
        "lbu %[tp3], 3(%[dst]) \n\t"
        "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"

        /* odd 3. pixel */
        "lbux %[st1], %[Temp2](%[cm]) \n\t"
        "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
        "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"
        "lbu %[tp4], 5(%[dst]) \n\t"

        /* odd 4. pixel */
        "sb %[tp2], 1(%[dst]) \n\t"
        "sb %[tp1], 6(%[dst]) \n\t"
        "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
        "extp %[Temp1], $ac2, 31 \n\t"
        "lbu %[tp1], 7(%[dst]) \n\t"

        /* clamp */
        "lbux %[p4], %[Temp3](%[cm]) \n\t"
        "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
        "lbux %[p2], %[Temp2](%[cm]) \n\t"
        "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
        "lbux %[p1], %[Temp1](%[cm]) \n\t"
        "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"

        /* store bytes */
        "sb %[tp3], 3(%[dst]) \n\t"
        "sb %[tp4], 5(%[dst]) \n\t"
        "sb %[tp1], 7(%[dst]) \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride, uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0, int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
          "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
          "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
          "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
          "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
          "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
          "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
          "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
          "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
          "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
          "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
          "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
          "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
          "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
          "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
          "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
          "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
          "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
          "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
          "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
          "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
          "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
          "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
          "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
          "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
          "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
          "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
          "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
          "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
          "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
          "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
          "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
          "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
          "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
          "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride, uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
          "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
          "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
          "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
          "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
          "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
          "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
          "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
          "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
          "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
          "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
          "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
          "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
          "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
          "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
          "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
          "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
          "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
          "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
          "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
          "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
          "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
          "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
          "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
          "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
          "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
          "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
          "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
          "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
          "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
          "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
          "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
          "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
          "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
          "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 8:
      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 16:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, 1);
      break;
    case 32:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h);
      break;
    default:
      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
}
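
/* Usage sketch (illustrative only, not part of the original file, kept
 * compiled out): the dispatcher above expects an unscaled horizontal step
 * (x_step_q4 == 16, see the assert) and an InterpKernel table indexed by
 * x0_q4; widths other than 4/8/16/32/64 fall back to
 * vpx_convolve8_avg_horiz_c. The wrapper name and the chosen x0_q4/y values
 * below are hypothetical examples. */
#if 0
static void example_convolve2_avg_horiz(const uint8_t *src,
                                        ptrdiff_t src_stride, uint8_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int w,
                                        int h) {
  /* x0_q4 = 0 selects the first kernel; the y parameters only matter for the
   * C fallback path. */
  vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
                                /*x0_q4=*/0, /*x_step_q4=*/16, /*y0_q4=*/0,
                                /*y_step_q4=*/16, w, h);
}
#endif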
#endif