idctllm_mmi.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. /*
  2. * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "./vp8_rtcd.h"
  11. #include "vpx_ports/mem.h"
  12. #include "vpx_ports/asmdefs_mmi.h"
  /*
   * TRANSPOSE_4H_INTERLEAVE(unpack, dst, row_a, row_b)
   *
   * Helper for TRANSPOSE_4H. All arguments are string literals: `unpack` is the
   * unpack mnemonic ("punpcklhw" or "punpckhhw"), the others are asm operand
   * names. It emits four instructions:
   *   1. unpack row_a against the zero register ftmp0 into dst,
   *   2. unpack row_b against ftmp0 into the scratch register ftmp9,
   *   3. shuffle ftmp9 using the selector held in ftmp10,
   *   4. OR the shuffled ftmp9 into dst.
   * With the 0x93 selector loaded by TRANSPOSE_4H this merges the halfwords of
   * row_a and row_b alternately into dst (per Loongson MMI pshufh semantics).
   */
  #define TRANSPOSE_4H_INTERLEAVE(unpack, dst, row_a, row_b) \
    unpack " %[" dst "], %[" row_a "], %[ftmp0] \n\t"        \
    unpack " %[ftmp9], %[" row_b "], %[ftmp0] \n\t"          \
    "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t"              \
    "or %[" dst "], %[" dst "], %[ftmp9] \n\t"

  /*
   * TRANSPOSE_4H
   *
   * Inline-asm fragment that transposes a 4x4 block of 16-bit halfwords held
   * one row per register in ftmp1..ftmp4; the result is left in ftmp1..ftmp4.
   *
   * The enclosing asm statement must provide operands named ftmp0..ftmp10 and
   * tmp0. On exit ftmp0 is zero, and tmp0, ftmp5..ftmp10 hold scratch values.
   * ftmp11 is not touched.
   */
  #define TRANSPOSE_4H                                              \
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"                         \
    MMI_LI(%[tmp0], 0x93)                                           \
    "mtc1 %[tmp0], %[ftmp10] \n\t"                                  \
    TRANSPOSE_4H_INTERLEAVE("punpcklhw", "ftmp5", "ftmp1", "ftmp2") \
    TRANSPOSE_4H_INTERLEAVE("punpckhhw", "ftmp6", "ftmp1", "ftmp2") \
    TRANSPOSE_4H_INTERLEAVE("punpcklhw", "ftmp7", "ftmp3", "ftmp4") \
    TRANSPOSE_4H_INTERLEAVE("punpckhhw", "ftmp8", "ftmp3", "ftmp4") \
    "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t"                   \
    "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t"                   \
    "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t"                   \
    "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
  37. void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
  38. int pred_stride, unsigned char *dst_ptr,
  39. int dst_stride) {
  40. double ftmp[12];
  41. uint32_t tmp[0];
  42. DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
  43. DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
  44. DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
  45. __asm__ volatile (
  46. MMI_LI(%[tmp0], 0x02)
  47. "mtc1 %[tmp0], %[ftmp11] \n\t"
  48. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  49. "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
  50. "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
  51. "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
  52. "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
  53. "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
  54. "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
  55. "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
  56. "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
  57. // ip[0...3] + ip[8...11]
  58. "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
  59. // ip[0...3] - ip[8...11]
  60. "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
  61. // (ip[12...15] * sinpi8sqrt2) >> 16
  62. "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
  63. "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
  64. // (ip[ 4... 7] * sinpi8sqrt2) >> 16
  65. "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
  66. "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
  67. // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16)
  68. "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
  69. "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
  70. // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16)
  71. "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
  72. "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
  73. "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
  74. "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  75. "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
  76. "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
  77. "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
  78. "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
  79. "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
  80. "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  81. TRANSPOSE_4H
  82. // a
  83. "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
  84. // b
  85. "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
  86. // c
  87. "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
  88. "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
  89. "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
  90. "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
  91. "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
  92. // d
  93. "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
  94. "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
  95. "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
  96. "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
  97. "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
  98. MMI_LI(%[tmp0], 0x03)
  99. "mtc1 %[tmp0], %[ftmp11] \n\t"
  100. // a + d
  101. "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
  102. "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t"
  103. "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
  104. // b + c
  105. "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
  106. "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t"
  107. "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
  108. // b - c
  109. "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
  110. "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t"
  111. "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
  112. // a - d
  113. "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
  114. "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t"
  115. "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
  116. TRANSPOSE_4H
  117. #if _MIPS_SIM == _ABIO32
  118. "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
  119. "mtc1 %[tmp0], %[ftmp5] \n\t"
  120. #else
  121. "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t"
  122. "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t"
  123. #endif
  124. "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
  125. "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  126. "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
  127. "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
  128. "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
  129. MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
  130. MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
  131. #if _MIPS_SIM == _ABIO32
  132. "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
  133. "mtc1 %[tmp0], %[ftmp6] \n\t"
  134. #else
  135. "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t"
  136. "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t"
  137. #endif
  138. "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
  139. "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  140. "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
  141. "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t"
  142. "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t"
  143. MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
  144. MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
  145. #if _MIPS_SIM == _ABIO32
  146. "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
  147. "mtc1 %[tmp0], %[ftmp7] \n\t"
  148. #else
  149. "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t"
  150. "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t"
  151. #endif
  152. "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
  153. "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  154. "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  155. "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t"
  156. "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t"
  157. MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
  158. MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
  159. #if _MIPS_SIM == _ABIO32
  160. "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
  161. "mtc1 %[tmp0], %[ftmp8] \n\t"
  162. #else
  163. "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t"
  164. "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t"
  165. #endif
  166. "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
  167. "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  168. "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
  169. "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t"
  170. "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t"
  171. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
  172. [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  173. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
  174. [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
  175. [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
  176. [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
  177. : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
  178. [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
  179. [pred_stride]"r"((mips_reg)pred_stride),
  180. [dst_stride]"r"((mips_reg)dst_stride)
  181. : "memory"
  182. );
  183. }
  184. void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
  185. int pred_stride, unsigned char *dst_ptr,
  186. int dst_stride) {
  187. int a1 = ((input_dc + 4) >> 3);
  188. double ftmp[5];
  189. int low32;
  190. __asm__ volatile (
  191. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  192. "pshufh %[a1], %[a1], %[ftmp0] \n\t"
  193. "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
  194. "mtc1 %[low32], %[ftmp1] \n\t"
  195. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  196. "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
  197. "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
  198. "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
  199. "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
  200. MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
  201. MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
  202. "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
  203. "mtc1 %[low32], %[ftmp1] \n\t"
  204. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  205. "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
  206. "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
  207. "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
  208. "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
  209. MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
  210. MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
  211. "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
  212. "mtc1 %[low32], %[ftmp1] \n\t"
  213. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  214. "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
  215. "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
  216. "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
  217. "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
  218. MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
  219. MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
  220. "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
  221. "mtc1 %[low32], %[ftmp1] \n\t"
  222. "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
  223. "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
  224. "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
  225. "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
  226. "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
  227. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
  228. [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
  229. [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
  230. : [dst_stride]"r"((mips_reg)dst_stride),
  231. [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
  232. : "memory"
  233. );
  234. }
  235. void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
  236. int i;
  237. int16_t output[16];
  238. double ftmp[12];
  239. uint32_t tmp[1];
  240. DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
  241. __asm__ volatile (
  242. MMI_LI(%[tmp0], 0x03)
  243. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  244. "mtc1 %[tmp0], %[ftmp11] \n\t"
  245. "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
  246. "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
  247. "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
  248. "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
  249. "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
  250. "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
  251. "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
  252. "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
  253. "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
  254. "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
  255. "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
  256. "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
  257. "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
  258. "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t"
  259. "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
  260. "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
  261. TRANSPOSE_4H
  262. // a
  263. "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
  264. // d
  265. "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t"
  266. // b
  267. "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
  268. // c
  269. "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
  270. "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
  271. "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
  272. "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
  273. "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
  274. "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t"
  275. "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
  276. "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t"
  277. "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
  278. "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t"
  279. "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
  280. "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t"
  281. "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
  282. TRANSPOSE_4H
  283. "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
  284. "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
  285. "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
  286. "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
  287. "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
  288. "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
  289. "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
  290. "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
  291. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
  292. [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  293. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
  294. [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
  295. [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
  296. : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
  297. : "memory"
  298. );
  299. for (i = 0; i < 16; i++) {
  300. mb_dqcoeff[i * 16] = output[i];
  301. }
  302. }