dct_mmi.c 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  /*
   * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
   *
   * Use of this source code is governed by a BSD-style license
   * that can be found in the LICENSE file in the root of the source
   * tree. An additional intellectual property rights grant can be found
   * in the file PATENTS. All contributing project authors may
   * be found in the AUTHORS file in the root of the source tree.
   */
  10. #include "./vp8_rtcd.h"
  11. #include "vpx_ports/mem.h"
  12. #include "vpx_ports/asmdefs_mmi.h"
  /* clang-format off */
  /* TRANSPOSE_4H: transpose a 4x4 matrix of 16-bit elements.
   * Input:  ftmp1, ftmp2, ftmp3, ftmp4 (one matrix row per register)
   * Output: ftmp1, ftmp2, ftmp3, ftmp4 (rows of the transposed matrix)
   * Note:   ftmp0 must already hold zero; it is read but never written here.
   *         ftmp5~ftmp10 and the integer register tmp0 are overwritten as
   *         scratch (tmp0/ftmp10 carry the pshufh selector 0x93), so callers
   *         must not keep live values in them across this macro.
   */
  #define TRANSPOSE_4H \
    MMI_LI(%[tmp0], 0x93) \
    "mtc1 %[tmp0], %[ftmp10] \n\t" \
    "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
    "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
    "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
    "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
    "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
    "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
    "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
    "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
    "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
    "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
    "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
    "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
    "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
    "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
    "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
  /* clang-format on */
  43. void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
  44. uint64_t tmp[1];
  45. int16_t *ip = input;
  46. #if _MIPS_SIM == _ABIO32
  47. register double ftmp0 asm("$f0");
  48. register double ftmp1 asm("$f2");
  49. register double ftmp2 asm("$f4");
  50. register double ftmp3 asm("$f6");
  51. register double ftmp4 asm("$f8");
  52. register double ftmp5 asm("$f10");
  53. register double ftmp6 asm("$f12");
  54. register double ftmp7 asm("$f14");
  55. register double ftmp8 asm("$f16");
  56. register double ftmp9 asm("$f18");
  57. register double ftmp10 asm("$f20");
  58. register double ftmp11 asm("$f22");
  59. register double ftmp12 asm("$f24");
  60. #else
  61. register double ftmp0 asm("$f0");
  62. register double ftmp1 asm("$f1");
  63. register double ftmp2 asm("$f2");
  64. register double ftmp3 asm("$f3");
  65. register double ftmp4 asm("$f4");
  66. register double ftmp5 asm("$f5");
  67. register double ftmp6 asm("$f6");
  68. register double ftmp7 asm("$f7");
  69. register double ftmp8 asm("$f8");
  70. register double ftmp9 asm("$f9");
  71. register double ftmp10 asm("$f10");
  72. register double ftmp11 asm("$f11");
  73. register double ftmp12 asm("$f12");
  74. #endif // _MIPS_SIM == _ABIO32
  75. DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
  76. DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
  77. DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
  78. DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
  79. DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
  80. DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
  81. DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
  82. DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
  83. DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
  84. DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
  85. DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
  86. __asm__ volatile (
  87. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  88. "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
  89. "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
  90. MMI_ADDU(%[ip], %[ip], %[pitch])
  91. "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
  92. "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
  93. MMI_ADDU(%[ip], %[ip], %[pitch])
  94. "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
  95. "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
  96. MMI_ADDU(%[ip], %[ip], %[pitch])
  97. "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
  98. "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
  99. MMI_ADDU(%[ip], %[ip], %[pitch])
  100. TRANSPOSE_4H
  101. "ldc1 %[ftmp11], %[ff_ph_8] \n\t"
  102. // f1 + f4
  103. "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
  104. // a1
  105. "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
  106. // f2 + f3
  107. "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  108. // b1
  109. "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
  110. // f2 - f3
  111. "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
  112. // c1
  113. "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  114. // f1 - f4
  115. "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
  116. // d1
  117. "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
  118. // op[0] = a1 + b1
  119. "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
  120. // op[2] = a1 - b1
  121. "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
  122. // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
  123. MMI_LI(%[tmp0], 0x0c)
  124. "mtc1 %[tmp0], %[ftmp11] \n\t"
  125. "ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
  126. "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
  127. "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
  128. "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
  129. "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t"
  130. "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
  131. "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
  132. "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
  133. "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
  134. "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
  135. // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
  136. "ldc1 %[ftmp12], %[ff_pw_7500] \n\t"
  137. "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
  138. "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t"
  139. "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
  140. "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t"
  141. "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
  142. "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
  143. "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
  144. "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
  145. "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
  146. TRANSPOSE_4H
  147. "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
  148. "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
  149. "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
  150. "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
  151. "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t"
  152. "ldc1 %[ftmp9], %[ff_ph_01] \n\t"
  153. "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
  154. "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
  155. "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
  156. "ldc1 %[ftmp9], %[ff_ph_07] \n\t"
  157. "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  158. "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
  159. MMI_LI(%[tmp0], 0x04)
  160. "mtc1 %[tmp0], %[ftmp9] \n\t"
  161. "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  162. "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
  163. MMI_LI(%[tmp0], 0x10)
  164. "mtc1 %[tmp0], %[ftmp9] \n\t"
  165. "ldc1 %[ftmp12], %[ff_pw_12000] \n\t"
  166. "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
  167. "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t"
  168. "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
  169. "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t"
  170. "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
  171. "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
  172. "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
  173. "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
  174. "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t"
  175. "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
  176. "ldc1 %[ftmp12], %[ff_pw_51000] \n\t"
  177. "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
  178. "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t"
  179. "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
  180. "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t"
  181. "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
  182. "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
  183. "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
  184. "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
  185. "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t"
  186. "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t"
  187. "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t"
  188. "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t"
  189. "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t"
  190. "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t"
  191. "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t"
  192. "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t"
  193. "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t"
  194. : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
  195. [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
  196. [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
  197. [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
  198. [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
  199. : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
  200. [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
  201. [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
  202. [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
  203. [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
  204. [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
  205. : "memory"
  206. );
  207. }
  /*
   * Forward DCT of an 8x4 region, computed as two independent 4x4 DCTs.
   *
   * The first block is read from `input`; the second starts four int16_t
   * samples further along each row (input + 4). Both share the same pitch.
   * The second block's 16 coefficients are written immediately after the
   * first block's 16 (output + 16), so `output` must have room for 32
   * int16_t values.
   */
  void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
    vp8_short_fdct4x4_mmi(input, output, pitch);
    vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
  }
  212. void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
  213. double ftmp[13];
  214. uint32_t tmp[1];
  215. DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
  216. DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
  217. DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
  218. DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
  219. __asm__ volatile (
  220. MMI_LI(%[tmp0], 0x02)
  221. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  222. "mtc1 %[tmp0], %[ftmp11] \n\t"
  223. "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
  224. "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
  225. MMI_ADDU(%[ip], %[ip], %[pitch])
  226. "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
  227. "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
  228. MMI_ADDU(%[ip], %[ip], %[pitch])
  229. "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
  230. "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
  231. MMI_ADDU(%[ip], %[ip], %[pitch])
  232. "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
  233. "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
  234. TRANSPOSE_4H
  235. "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
  236. "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
  237. "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
  238. "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
  239. // a
  240. "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
  241. // d
  242. "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
  243. // c
  244. "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
  245. // b
  246. "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t"
  247. // a + d
  248. "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
  249. // b + c
  250. "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t"
  251. // b - c
  252. "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t"
  253. // a - d
  254. "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
  255. "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t"
  256. "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t"
  257. "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
  258. TRANSPOSE_4H
  259. // op[2], op[0]
  260. "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t"
  261. // op[3], op[1]
  262. "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t"
  263. // op[6], op[4]
  264. "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t"
  265. // op[7], op[5]
  266. "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t"
  267. // op[10], op[8]
  268. "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t"
  269. // op[11], op[9]
  270. "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t"
  271. // op[14], op[12]
  272. "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t"
  273. // op[15], op[13]
  274. "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t"
  275. // a1, a3
  276. "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t"
  277. // d1, d3
  278. "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t"
  279. // c1, c3
  280. "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t"
  281. // b1, b3
  282. "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t"
  283. // a1 + d1, a3 + d3
  284. "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t"
  285. // b1 + c1, b3 + c3
  286. "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t"
  287. // b1 - c1, b3 - c3
  288. "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
  289. // a1 - d1, a3 - d3
  290. "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
  291. // a2, a4
  292. "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
  293. // d2, d4
  294. "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
  295. // c2, c4
  296. "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t"
  297. // b2, b4
  298. "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t"
  299. // a2 + d2, a4 + d4
  300. "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
  301. // b2 + c2, b4 + c4
  302. "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t"
  303. // b2 - c2, b4 - c4
  304. "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t"
  305. // a2 - d2, a4 - d4
  306. "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
  307. MMI_LI(%[tmp0], 0x03)
  308. "mtc1 %[tmp0], %[ftmp11] \n\t"
  309. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
  310. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  311. "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
  312. "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t"
  313. "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
  314. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
  315. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  316. "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
  317. "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t"
  318. "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
  319. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t"
  320. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  321. "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
  322. "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t"
  323. "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
  324. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t"
  325. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  326. "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
  327. "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t"
  328. "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
  329. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t"
  330. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  331. "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
  332. "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t"
  333. "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
  334. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t"
  335. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  336. "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
  337. "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t"
  338. "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
  339. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t"
  340. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  341. "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
  342. "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t"
  343. "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
  344. "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t"
  345. "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
  346. "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t"
  347. "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t"
  348. "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
  349. "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
  350. "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
  351. "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
  352. "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
  353. MMI_LI(%[tmp0], 0x72)
  354. "mtc1 %[tmp0], %[ftmp11] \n\t"
  355. "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
  356. "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
  357. "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
  358. "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
  359. "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
  360. "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
  361. "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
  362. "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
  363. "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
  364. "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
  365. "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
  366. "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
  367. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  368. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  369. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  370. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  371. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  372. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  373. [ftmp12]"=&f"(ftmp[12]),
  374. [tmp0]"=&r"(tmp[0]),
  375. [ip]"+&r"(input)
  376. : [op]"r"(output),
  377. [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch),
  378. [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask),
  379. [ff_ph_01]"f"(ff_ph_01)
  380. : "memory"
  381. );
  382. }