vpx_convolve8_mmi.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716
  1. /*
  2. * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <assert.h>
  11. #include <string.h>
  12. #include "./vpx_config.h"
  13. #include "./vpx_dsp_rtcd.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_dsp/vpx_dsp_common.h"
  16. #include "vpx_dsp/vpx_filter.h"
  17. #include "vpx_ports/asmdefs_mmi.h"
  18. #include "vpx_ports/mem.h"
/*
 * GET_DATA_H_MMI -- horizontal-pass multiply/accumulate.
 *
 * Inputs (prepared by the callers):
 *   ftmp4/ftmp5, ftmp6/ftmp7, ftmp8/ftmp9, ftmp10/ftmp11 - four 8-pixel
 *     source windows (x, x+1, x+2, x+3), zero-extended to 16-bit lanes,
 *     low half in the even-named register, high half in the odd one.
 *   filter1/filter2 - the 8-tap kernel: taps 0..3 and taps 4..7.
 *   ftmp0 - all-zero.
 * Output:
 *   srcl = {sum(pixel x), sum(pixel x+1)},
 *   srch = {sum(pixel x+2), sum(pixel x+3)} as packed 32-bit filter sums
 *   (not yet rounded or clipped).
 */
#define GET_DATA_H_MMI \
  /* pixel x: tap*pixel products for taps 0-3 and 4-7, summed ... */ \
  "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \
  "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \
  "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  /* ... then the high 32-bit word is folded into the low one */ \
  "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \
  "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  /* pixel x+1 */ \
  "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \
  "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \
  "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \
  "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
  "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \
  /* pack sums for pixels x and x+1 into srcl */ \
  "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \
  /* pixel x+2 */ \
  "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \
  "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  /* pixel x+3 */ \
  "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \
  "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \
  "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
  "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \
  "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
  /* pack sums for pixels x+2 and x+3 into srch */ \
  "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t"
/*
 * GET_DATA_V_MMI -- vertical-pass multiply/accumulate.
 *
 * Inputs (prepared by the callers):
 *   ftmp4..ftmp11 - the same 4 source pixels from 8 successive rows,
 *     zero-extended to 16-bit lanes (row k in ftmp(4+k)).
 *   filter10/filter32/filter54/filter76 - tap pairs (0,1), (2,3), (4,5),
 *     (6,7) duplicated across lanes.
 * Interleaving two adjacent rows with punpcklhw/punpckhhw lets pmaddhw
 * produce per-column "tap pair * pixel pair" sums; the four partial sums
 * are accumulated into srcl (columns 0,1) and srch (columns 2,3) as
 * 32-bit values. Clobbers ftmp12.
 */
#define GET_DATA_V_MMI \
  /* columns 0,1: rows 0+1, 2+3, 4+5, 6+7 accumulated into srcl */ \
  "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \
  "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \
  "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
  "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
  "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
  /* columns 2,3: same reduction on the high halfwords into srch */ \
  "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \
  "pmaddhw %[srch], %[srch], %[filter10] \n\t" \
  "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \
  "paddw %[srch], %[srch], %[ftmp12] \n\t" \
  "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \
  "paddw %[srch], %[srch], %[ftmp12] \n\t" \
  "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \
  "paddw %[srch], %[srch], %[ftmp12] \n\t"
/* clang-format off */
/*
 * ROUND_POWER_OF_TWO_MMI -- rounds the four 32-bit sums in srcl/srch:
 * adds para[0] (rounding bias; every caller sets it to
 * 1 << (FILTER_BITS - 1)) to each lane, then arithmetic-shifts each lane
 * right by para[1] (FILTER_BITS). Clobbers ftmp5, ftmp6 and tmp0.
 */
#define ROUND_POWER_OF_TWO_MMI \
  /* Add para[0] */ \
  "lw %[tmp0], 0x00(%[para]) \n\t" \
  MMI_MTC1(%[tmp0], %[ftmp6]) \
  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \
  "paddw %[srch], %[srch], %[ftmp6] \n\t" \
  /* Arithmetic right shift para[1] bits */ \
  "lw %[tmp0], 0x04(%[para]) \n\t" \
  MMI_MTC1(%[tmp0], %[ftmp5]) \
  "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \
  "psraw %[srch], %[srch], %[ftmp5] \n\t"
/* clang-format on */
/*
 * CLIP_PIXEL_MMI -- packs the four rounded 32-bit sums in srcl/srch down
 * to bytes with saturation (32->16 signed, then 16->8 unsigned); the four
 * output pixels land in the low 32 bits of ftmp12.
 */
#define CLIP_PIXEL_MMI \
  /* Saturated operation */ \
  "packsswh %[srcl], %[srcl], %[srch] \n\t" \
  "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t"
  83. static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
  84. uint8_t *dst, ptrdiff_t dst_stride,
  85. const InterpKernel *filter, int x0_q4,
  86. int x_step_q4, int32_t w, int32_t h) {
  87. const int16_t *filter_x = filter[x0_q4];
  88. double ftmp[14];
  89. uint32_t tmp[2];
  90. uint32_t para[5];
  91. para[0] = (1 << ((FILTER_BITS)-1));
  92. para[1] = FILTER_BITS;
  93. src -= SUBPEL_TAPS / 2 - 1;
  94. src_stride -= w;
  95. dst_stride -= w;
  96. (void)x_step_q4;
  97. /* clang-format off */
  98. __asm__ volatile(
  99. "move %[tmp1], %[width] \n\t"
  100. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  101. "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
  102. "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
  103. "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
  104. "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
  105. "1: \n\t"
  106. /* Get 8 data per row */
  107. "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
  108. "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
  109. "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
  110. "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
  111. "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
  112. "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
  113. "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
  114. "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
  115. "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
  116. "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  117. "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
  118. "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  119. "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
  120. "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
  121. "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
  122. "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
  123. MMI_ADDIU(%[width], %[width], -0x04)
  124. /* Get raw data */
  125. GET_DATA_H_MMI
  126. ROUND_POWER_OF_TWO_MMI
  127. CLIP_PIXEL_MMI
  128. "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
  129. MMI_ADDIU(%[dst], %[dst], 0x04)
  130. MMI_ADDIU(%[src], %[src], 0x04)
  131. /* Loop count */
  132. "bnez %[width], 1b \n\t"
  133. "move %[width], %[tmp1] \n\t"
  134. MMI_ADDU(%[src], %[src], %[src_stride])
  135. MMI_ADDU(%[dst], %[dst], %[dst_stride])
  136. MMI_ADDIU(%[height], %[height], -0x01)
  137. "bnez %[height], 1b \n\t"
  138. : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
  139. [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
  140. [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
  141. [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
  142. [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
  143. [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
  144. [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
  145. [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
  146. [src]"+&r"(src), [width]"+&r"(w),
  147. [dst]"+&r"(dst), [height]"+&r"(h)
  148. : [filter]"r"(filter_x), [para]"r"(para),
  149. [src_stride]"r"((mips_reg)src_stride),
  150. [dst_stride]"r"((mips_reg)dst_stride)
  151. : "memory"
  152. );
  153. /* clang-format on */
  154. }
/* 8-tap vertical convolution, MMI path. Processes a 4-wide column strip of
 * 8 rows per inner-loop iteration; w must be a multiple of 4 (callers
 * guarantee this). Only the phase selected by y0_q4 is used; y_step_q4 is
 * ignored, so this path presumably assumes a unit step -- confirm against
 * the dispatch sites. */
static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int y0_q4,
                              int y_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_y = filter[y0_q4];
  double ftmp[16];
  uint32_t tmp[1];
  /* para[0]: rounding bias, para[1]: shift count for
   * ROUND_POWER_OF_TWO_MMI. */
  uint32_t para[2];
  /* addr keeps the original (unbiased) row stride for the column walk. */
  ptrdiff_t addr = src_stride;
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  /* Center the 8-tap window on the output row. */
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  /* Pre-bias the strides: the inner loop advances src/dst by w bytes. */
  src_stride -= w;
  dst_stride -= w;
  (void)y_step_q4;
  __asm__ volatile(
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    /* load the 8 taps and duplicate them as pairs: filter10 = taps (0,1),
     * filter32 = (2,3), filter54 = (4,5), filter76 = (6,7) */
    "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
    "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
    "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t"
    "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t"
    "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t"
    "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t"
    "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t"
    "1: \n\t"
    /* Get 8 data per column: the same 8 bytes from 8 successive rows */
    "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t"
    MMI_ADDU(%[tmp0], %[src], %[addr])
    "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t"
    /* zero-extend the low 4 bytes of each row to 16-bit lanes */
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
    "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
    "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
    "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
    MMI_ADDIU(%[width], %[width], -0x04)
    /* Get raw data: filter, round, clip, then store 4 pixels */
    GET_DATA_V_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
    MMI_ADDIU(%[dst], %[dst], 0x04)
    MMI_ADDIU(%[src], %[src], 0x04)
    /* Loop count */
    "bnez %[width], 1b \n\t"
    /* restore width: addr - (addr - w) == w, since src_stride was
     * pre-biased by -w above */
    MMI_SUBU(%[width], %[addr], %[src_stride])
    MMI_ADDU(%[src], %[src], %[src_stride])
    MMI_ADDU(%[dst], %[dst], %[dst_stride])
    MMI_ADDIU(%[height], %[height], -0x01)
    "bnez %[height], 1b \n\t"
    : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
      [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]),
      [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]),
      [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]),
      [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]),
      [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]),
      [src]"+&r"(src), [dst]"+&r"(dst),
      [width]"+&r"(w), [height]"+&r"(h),
      [tmp0]"=&r"(tmp[0])
    : [filter]"r"(filter_y), [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride),
      [addr]"r"((mips_reg)addr)
    : "memory"
  );
}
/* Like convolve_horiz_mmi, but the filtered result is averaged with the
 * existing dst pixels: dst = (dst + filtered + 1) >> 1. Processes 4 output
 * pixels per inner-loop iteration; w must be a multiple of 4. */
static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_x = filter[x0_q4];
  double ftmp[14];
  uint32_t tmp[2];
  /* para[0]: rounding bias, para[1]: shift count for
   * ROUND_POWER_OF_TWO_MMI. */
  uint32_t para[2];
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  /* Center the 8-tap window on the output pixel. */
  src -= SUBPEL_TAPS / 2 - 1;
  /* Pre-bias the strides: the inner loop advances src/dst by w bytes. */
  src_stride -= w;
  dst_stride -= w;
  (void)x_step_q4;
  __asm__ volatile(
    "move %[tmp1], %[width] \n\t"
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    /* filter1 = taps 0..3, filter2 = taps 4..7 */
    "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
    "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
    "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
    "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
    "1: \n\t"
    /* Get 8 data per row: 8-pixel windows at src+0, +1, +2, +3 */
    "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
    "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
    "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
    "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
    "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
    "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
    "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
    /* zero-extend each window to 16-bit lanes */
    "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
    "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
    "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
    "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
    "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
    "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
    MMI_ADDIU(%[width], %[width], -0x04)
    /* Get raw data */
    GET_DATA_H_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    /* average with dst: (filtered + dst + 1) >> 1 in 16-bit lanes;
     * 0x10001 expanded to all lanes supplies both the +1 bias and the
     * shift count of 1 */
    "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
    "li %[tmp0], 0x10001 \n\t"
    MMI_MTC1(%[tmp0], %[ftmp5])
    "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
    MMI_ADDIU(%[dst], %[dst], 0x04)
    MMI_ADDIU(%[src], %[src], 0x04)
    /* Loop count */
    "bnez %[width], 1b \n\t"
    /* next row */
    "move %[width], %[tmp1] \n\t"
    MMI_ADDU(%[src], %[src], %[src_stride])
    MMI_ADDU(%[dst], %[dst], %[dst_stride])
    MMI_ADDIU(%[height], %[height], -0x01)
    "bnez %[height], 1b \n\t"
    : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
      [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
      [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
      [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
      [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
      [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
      [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
      [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [src]"+&r"(src), [width]"+&r"(w),
      [dst]"+&r"(dst), [height]"+&r"(h)
    : [filter]"r"(filter_x), [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
}
/* Like convolve_vert_mmi, but the filtered result is averaged with the
 * existing dst pixels: dst = (dst + filtered + 1) >> 1. Processes a 4-wide
 * column strip of 8 rows per inner-loop iteration; w must be a multiple
 * of 4. */
static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int y0_q4,
                                  int y_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_y = filter[y0_q4];
  double ftmp[16];
  uint32_t tmp[1];
  /* para[0]: rounding bias, para[1]: shift count for
   * ROUND_POWER_OF_TWO_MMI. */
  uint32_t para[2];
  /* addr keeps the original (unbiased) row stride for the column walk. */
  ptrdiff_t addr = src_stride;
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  /* Center the 8-tap window on the output row. */
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  /* Pre-bias the strides: the inner loop advances src/dst by w bytes. */
  src_stride -= w;
  dst_stride -= w;
  (void)y_step_q4;
  __asm__ volatile(
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    /* load the 8 taps and duplicate them as pairs: filter10 = taps (0,1),
     * filter32 = (2,3), filter54 = (4,5), filter76 = (6,7) */
    "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
    "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
    "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t"
    "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t"
    "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t"
    "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t"
    "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t"
    "1: \n\t"
    /* Get 8 data per column: the same 8 bytes from 8 successive rows */
    "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t"
    MMI_ADDU(%[tmp0], %[src], %[addr])
    "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t"
    /* zero-extend the low 4 bytes of each row to 16-bit lanes */
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
    "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
    "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
    "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
    MMI_ADDIU(%[width], %[width], -0x04)
    /* Get raw data */
    GET_DATA_V_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    /* average with dst: (filtered + dst + 1) >> 1 in 16-bit lanes;
     * 0x10001 expanded to all lanes supplies both the +1 bias and the
     * shift count of 1 */
    "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
    "li %[tmp0], 0x10001 \n\t"
    MMI_MTC1(%[tmp0], %[ftmp5])
    "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
    MMI_ADDIU(%[dst], %[dst], 0x04)
    MMI_ADDIU(%[src], %[src], 0x04)
    /* Loop count */
    "bnez %[width], 1b \n\t"
    /* restore width: addr - (addr - w) == w, since src_stride was
     * pre-biased by -w above */
    MMI_SUBU(%[width], %[addr], %[src_stride])
    MMI_ADDU(%[src], %[src], %[src_stride])
    MMI_ADDU(%[dst], %[dst], %[dst_stride])
    MMI_ADDIU(%[height], %[height], -0x01)
    "bnez %[height], 1b \n\t"
    : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
      [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]),
      [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]),
      [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]),
      [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]),
      [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]),
      [src]"+&r"(src), [dst]"+&r"(dst),
      [width]"+&r"(w), [height]"+&r"(h),
      [tmp0]"=&r"(tmp[0])
    : [filter]"r"(filter_y), [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride),
      [addr]"r"((mips_reg)addr)
    : "memory"
  );
}
/* Averages src into dst: dst = (dst + src + 1) >> 1 for a w x h block.
 * The filter/step parameters exist only to match the convolve prototype
 * and are ignored. Widths that are not a multiple of 4 take the scalar
 * path; otherwise the MMI path handles 4 pixels per iteration. */
void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int y_step_q4, int w, int h) {
  int x, y;
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  if (w & 0x03) {
    /* scalar fallback for odd widths */
    for (y = 0; y < h; ++y) {
      for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
      src += src_stride;
      dst += dst_stride;
    }
  } else {
    double ftmp[4];
    uint32_t tmp[2];
    /* Pre-bias the strides: the inner loop advances src/dst by w bytes. */
    src_stride -= w;
    dst_stride -= w;
    __asm__ volatile(
      "move %[tmp1], %[width] \n\t"
      "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
      /* ftmp3 = per-lane constant 1: the +1 rounding bias and the
       * shift count */
      "li %[tmp0], 0x10001 \n\t"
      MMI_MTC1(%[tmp0], %[ftmp3])
      "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
      "1: \n\t"
      /* load 8 bytes of src and dst; only the low 4 are stored back */
      "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
      "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
      "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t"
      "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t"
      /* (src + dst + 1) >> 1 in 16-bit lanes, then repack to bytes */
      "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
      "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
      "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
      "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
      "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
      "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
      "swc1 %[ftmp1], 0x00(%[dst]) \n\t"
      MMI_ADDIU(%[width], %[width], -0x04)
      MMI_ADDIU(%[dst], %[dst], 0x04)
      MMI_ADDIU(%[src], %[src], 0x04)
      "bnez %[width], 1b \n\t"
      /* next row: restore width, advance by the biased strides */
      "move %[width], %[tmp1] \n\t"
      MMI_ADDU(%[dst], %[dst], %[dst_stride])
      MMI_ADDU(%[src], %[src], %[src_stride])
      MMI_ADDIU(%[height], %[height], -0x01)
      "bnez %[height], 1b \n\t"
      : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
        [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
        [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
        [src]"+&r"(src), [dst]"+&r"(dst),
        [width]"+&r"(w), [height]"+&r"(h)
      : [src_stride]"r"((mips_reg)src_stride),
        [dst_stride]"r"((mips_reg)dst_stride)
      : "memory"
    );
  }
}
  488. static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
  489. uint8_t *dst, ptrdiff_t dst_stride,
  490. const InterpKernel *x_filters, int x0_q4,
  491. int x_step_q4, int w, int h) {
  492. int x, y;
  493. src -= SUBPEL_TAPS / 2 - 1;
  494. for (y = 0; y < h; ++y) {
  495. int x_q4 = x0_q4;
  496. for (x = 0; x < w; ++x) {
  497. const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
  498. const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
  499. int k, sum = 0;
  500. for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
  501. dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
  502. x_q4 += x_step_q4;
  503. }
  504. src += src_stride;
  505. dst += dst_stride;
  506. }
  507. }
  508. static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
  509. uint8_t *dst, ptrdiff_t dst_stride,
  510. const InterpKernel *y_filters, int y0_q4,
  511. int y_step_q4, int w, int h) {
  512. int x, y;
  513. src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  514. for (x = 0; x < w; ++x) {
  515. int y_q4 = y0_q4;
  516. for (y = 0; y < h; ++y) {
  517. const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
  518. const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
  519. int k, sum = 0;
  520. for (k = 0; k < SUBPEL_TAPS; ++k)
  521. sum += src_y[k * src_stride] * y_filter[k];
  522. dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
  523. y_q4 += y_step_q4;
  524. }
  525. ++src;
  526. ++dst;
  527. }
  528. }
  529. static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
  530. uint8_t *dst, ptrdiff_t dst_stride,
  531. const InterpKernel *y_filters, int y0_q4,
  532. int y_step_q4, int w, int h) {
  533. int x, y;
  534. src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  535. for (x = 0; x < w; ++x) {
  536. int y_q4 = y0_q4;
  537. for (y = 0; y < h; ++y) {
  538. const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
  539. const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
  540. int k, sum = 0;
  541. for (k = 0; k < SUBPEL_TAPS; ++k)
  542. sum += src_y[k * src_stride] * y_filter[k];
  543. dst[y * dst_stride] = ROUND_POWER_OF_TWO(
  544. dst[y * dst_stride] +
  545. clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
  546. 1);
  547. y_q4 += y_step_q4;
  548. }
  549. ++src;
  550. ++dst;
  551. }
  552. }
  553. static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
  554. uint8_t *dst, ptrdiff_t dst_stride,
  555. const InterpKernel *x_filters, int x0_q4,
  556. int x_step_q4, int w, int h) {
  557. int x, y;
  558. src -= SUBPEL_TAPS / 2 - 1;
  559. for (y = 0; y < h; ++y) {
  560. int x_q4 = x0_q4;
  561. for (x = 0; x < w; ++x) {
  562. const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
  563. const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
  564. int k, sum = 0;
  565. for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
  566. dst[x] = ROUND_POWER_OF_TWO(
  567. dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
  568. x_q4 += x_step_q4;
  569. }
  570. src += src_stride;
  571. dst += dst_stride;
  572. }
  573. }
  574. void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
  575. ptrdiff_t dst_stride, const InterpKernel *filter,
  576. int x0_q4, int32_t x_step_q4, int y0_q4,
  577. int32_t y_step_q4, int32_t w, int32_t h) {
  578. // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  579. // 2d filtering proceeds in 2 steps:
  580. // (1) Interpolate horizontally into an intermediate buffer, temp.
  581. // (2) Interpolate temp vertically to derive the sub-pixel result.
  582. // Deriving the maximum number of rows in the temp buffer (135):
  583. // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  584. // --Largest block size is 64x64 pixels.
  585. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  586. // original frame (in 1/16th pixel units).
  587. // --Must round-up because block may be located at sub-pixel position.
  588. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  589. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  590. // When calling in frame scaling function, the smallest scaling factor is x1/4
  591. // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
  592. // big enough.
  593. uint8_t temp[64 * 135];
  594. const int intermediate_height =
  595. (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  596. assert(w <= 64);
  597. assert(h <= 64);
  598. assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  599. assert(x_step_q4 <= 64);
  600. if (w & 0x03) {
  601. convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
  602. 64, filter, x0_q4, x_step_q4, w, intermediate_height);
  603. convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
  604. filter, y0_q4, y_step_q4, w, h);
  605. } else {
  606. convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
  607. temp, 64, filter, x0_q4, x_step_q4, w,
  608. intermediate_height);
  609. convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
  610. filter, y0_q4, y_step_q4, w, h);
  611. }
  612. }
  613. void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
  614. uint8_t *dst, ptrdiff_t dst_stride,
  615. const InterpKernel *filter, int x0_q4,
  616. int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
  617. int32_t w, int32_t h) {
  618. (void)y0_q4;
  619. (void)y_step_q4;
  620. if (w & 0x03)
  621. convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
  622. w, h);
  623. else
  624. convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
  625. x_step_q4, w, h);
  626. }
  627. void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
  628. uint8_t *dst, ptrdiff_t dst_stride,
  629. const InterpKernel *filter, int x0_q4,
  630. int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
  631. int h) {
  632. (void)x0_q4;
  633. (void)x_step_q4;
  634. if (w & 0x03)
  635. convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
  636. h);
  637. else
  638. convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
  639. y_step_q4, w, h);
  640. }
  641. void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
  642. uint8_t *dst, ptrdiff_t dst_stride,
  643. const InterpKernel *filter, int x0_q4,
  644. int32_t x_step_q4, int y0_q4, int y_step_q4,
  645. int w, int h) {
  646. (void)y0_q4;
  647. (void)y_step_q4;
  648. if (w & 0x03)
  649. convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
  650. x_step_q4, w, h);
  651. else
  652. convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
  653. x_step_q4, w, h);
  654. }
  655. void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
  656. uint8_t *dst, ptrdiff_t dst_stride,
  657. const InterpKernel *filter, int x0_q4,
  658. int32_t x_step_q4, int y0_q4, int y_step_q4,
  659. int w, int h) {
  660. (void)x0_q4;
  661. (void)x_step_q4;
  662. if (w & 0x03)
  663. convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
  664. y_step_q4, w, h);
  665. else
  666. convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
  667. y_step_q4, w, h);
  668. }
  669. void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
  670. uint8_t *dst, ptrdiff_t dst_stride,
  671. const InterpKernel *filter, int x0_q4,
  672. int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
  673. int32_t w, int32_t h) {
  674. // Fixed size intermediate buffer places limits on parameters.
  675. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  676. assert(w <= 64);
  677. assert(h <= 64);
  678. vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
  679. y_step_q4, w, h);
  680. vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
  681. }