2
0

row_mmi.cc 366 KB


  1. /*
  2. * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #include <string.h> // For memcpy and memset.
  12. #include "libyuv/basic_types.h"
  13. #ifdef __cplusplus
  14. namespace libyuv {
  15. extern "C" {
  16. #endif
  17. // This module is for Mips MMI.
  18. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
  19. void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
  20. uint8_t* dst_argb,
  21. int width) {
  22. uint64_t src0, src1, dest;
  23. const uint64_t mask = 0xff000000ULL;
  24. __asm__ volatile(
  25. "1: \n\t"
  26. "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
  27. "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  28. "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
  29. "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
  30. "or %[src0], %[src0], %[mask] \n\t"
  31. "or %[src1], %[src1], %[mask] \n\t"
  32. "punpcklwd %[dest], %[src0], %[src1] \n\t"
  33. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  34. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  35. "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
  36. "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
  37. "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
  38. "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
  39. "or %[src0], %[src0], %[mask] \n\t"
  40. "or %[src1], %[src1], %[mask] \n\t"
  41. "punpcklwd %[dest], %[src0], %[src1] \n\t"
  42. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  43. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  44. "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
  45. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  46. "daddi %[width], %[width], -0x04 \n\t"
  47. "bnez %[width], 1b \n\t"
  48. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  49. : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
  50. [mask] "f"(mask)
  51. : "memory");
  52. }
  53. void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  54. uint64_t src0, src1, dest;
  55. const uint64_t mask0 = 0x0;
  56. const uint64_t mask1 = 0xff000000ULL;
  57. const uint64_t mask2 = 0xc6;
  58. __asm__ volatile(
  59. "1: \n\t"
  60. "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
  61. "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  62. "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
  63. "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
  64. "or %[src0], %[src0], %[mask1] \n\t"
  65. "punpcklbh %[src0], %[src0], %[mask0] \n\t"
  66. "pshufh %[src0], %[src0], %[mask2] \n\t"
  67. "or %[src1], %[src1], %[mask1] \n\t"
  68. "punpcklbh %[src1], %[src1], %[mask0] \n\t"
  69. "pshufh %[src1], %[src1], %[mask2] \n\t"
  70. "packushb %[dest], %[src0], %[src1] \n\t"
  71. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  72. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  73. "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
  74. "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
  75. "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
  76. "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
  77. "or %[src0], %[src0], %[mask1] \n\t"
  78. "punpcklbh %[src0], %[src0], %[mask0] \n\t"
  79. "pshufh %[src0], %[src0], %[mask2] \n\t"
  80. "or %[src1], %[src1], %[mask1] \n\t"
  81. "punpcklbh %[src1], %[src1], %[mask0] \n\t"
  82. "pshufh %[src1], %[src1], %[mask2] \n\t"
  83. "packushb %[dest], %[src0], %[src1] \n\t"
  84. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  85. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  86. "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
  87. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  88. "daddi %[width], %[width], -0x04 \n\t"
  89. "bnez %[width], 1b \n\t"
  90. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  91. : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
  92. [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
  93. : "memory");
  94. }
  95. void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  96. uint64_t src0, src1;
  97. uint64_t ftmp[4];
  98. uint64_t mask0 = 0xc6;
  99. uint64_t mask1 = 0x6c;
  100. __asm__ volatile(
  101. "1: \n\t"
  102. "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
  103. "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
  104. "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
  105. "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
  106. "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
  107. "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
  108. "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
  109. "punpcklbh %[src1], %[src1], %[zero] \n\t"
  110. "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
  111. "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
  112. "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  113. "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
  114. "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  115. "pshufh %[src1], %[src1], %[mask1] \n\t"
  116. "pextrh %[ftmp2], %[src1], %[zero] \n\t"
  117. "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
  118. "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
  119. "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  120. "packushb %[src1], %[src1], %[zero] \n\t"
  121. "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
  122. "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
  123. "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
  124. "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
  125. "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
  126. "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
  127. "daddiu %[width], %[width], -0x04 \n\t"
  128. "bgtz %[width], 1b \n\t"
  129. : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
  130. [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
  131. : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
  132. [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
  133. [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
  134. : "memory");
  135. }
  136. void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
  137. uint8_t* dst_argb,
  138. int width) {
  139. uint64_t ftmp[5];
  140. uint64_t c0 = 0x001f001f001f001f;
  141. uint64_t c1 = 0x00ff00ff00ff00ff;
  142. uint64_t c2 = 0x0007000700070007;
  143. __asm__ volatile(
  144. "1: \n\t"
  145. "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
  146. "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
  147. "psrlh %[src1], %[src0], %[eight] \n\t"
  148. "and %[b], %[src0], %[c0] \n\t"
  149. "and %[src0], %[src0], %[c1] \n\t"
  150. "psrlh %[src0], %[src0], %[five] \n\t"
  151. "and %[g], %[src1], %[c2] \n\t"
  152. "psllh %[g], %[g], %[three] \n\t"
  153. "or %[g], %[src0], %[g] \n\t"
  154. "psrlh %[r], %[src1], %[three] \n\t"
  155. "psllh %[src0], %[b], %[three] \n\t"
  156. "psrlh %[src1], %[b], %[two] \n\t"
  157. "or %[b], %[src0], %[src1] \n\t"
  158. "psllh %[src0], %[g], %[two] \n\t"
  159. "psrlh %[src1], %[g], %[four] \n\t"
  160. "or %[g], %[src0], %[src1] \n\t"
  161. "psllh %[src0], %[r], %[three] \n\t"
  162. "psrlh %[src1], %[r], %[two] \n\t"
  163. "or %[r], %[src0], %[src1] \n\t"
  164. "packushb %[b], %[b], %[r] \n\t"
  165. "packushb %[g], %[g], %[c1] \n\t"
  166. "punpcklbh %[src0], %[b], %[g] \n\t"
  167. "punpckhbh %[src1], %[b], %[g] \n\t"
  168. "punpcklhw %[r], %[src0], %[src1] \n\t"
  169. "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
  170. "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
  171. "punpckhhw %[r], %[src0], %[src1] \n\t"
  172. "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
  173. "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
  174. "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
  175. "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
  176. "daddiu %[width], %[width], -0x04 \n\t"
  177. "bgtz %[width], 1b \n\t"
  178. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
  179. [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
  180. : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
  181. [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
  182. [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
  183. [four] "f"(0x04)
  184. : "memory");
  185. }
  186. void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
  187. uint8_t* dst_argb,
  188. int width) {
  189. uint64_t ftmp[6];
  190. uint64_t c0 = 0x001f001f001f001f;
  191. uint64_t c1 = 0x00ff00ff00ff00ff;
  192. uint64_t c2 = 0x0003000300030003;
  193. uint64_t c3 = 0x007c007c007c007c;
  194. uint64_t c4 = 0x0001000100010001;
  195. __asm__ volatile(
  196. "1: \n\t"
  197. "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
  198. "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
  199. "psrlh %[src1], %[src0], %[eight] \n\t"
  200. "and %[b], %[src0], %[c0] \n\t"
  201. "and %[src0], %[src0], %[c1] \n\t"
  202. "psrlh %[src0], %[src0], %[five] \n\t"
  203. "and %[g], %[src1], %[c2] \n\t"
  204. "psllh %[g], %[g], %[three] \n\t"
  205. "or %[g], %[src0], %[g] \n\t"
  206. "and %[r], %[src1], %[c3] \n\t"
  207. "psrlh %[r], %[r], %[two] \n\t"
  208. "psrlh %[a], %[src1], %[seven] \n\t"
  209. "psllh %[src0], %[b], %[three] \n\t"
  210. "psrlh %[src1], %[b], %[two] \n\t"
  211. "or %[b], %[src0], %[src1] \n\t"
  212. "psllh %[src0], %[g], %[three] \n\t"
  213. "psrlh %[src1], %[g], %[two] \n\t"
  214. "or %[g], %[src0], %[src1] \n\t"
  215. "psllh %[src0], %[r], %[three] \n\t"
  216. "psrlh %[src1], %[r], %[two] \n\t"
  217. "or %[r], %[src0], %[src1] \n\t"
  218. "xor %[a], %[a], %[c1] \n\t"
  219. "paddb %[a], %[a], %[c4] \n\t"
  220. "packushb %[b], %[b], %[r] \n\t"
  221. "packushb %[g], %[g], %[a] \n\t"
  222. "punpcklbh %[src0], %[b], %[g] \n\t"
  223. "punpckhbh %[src1], %[b], %[g] \n\t"
  224. "punpcklhw %[r], %[src0], %[src1] \n\t"
  225. "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
  226. "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
  227. "punpckhhw %[r], %[src0], %[src1] \n\t"
  228. "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
  229. "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
  230. "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
  231. "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
  232. "daddiu %[width], %[width], -0x04 \n\t"
  233. "bgtz %[width], 1b \n\t"
  234. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
  235. [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
  236. : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
  237. [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
  238. [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
  239. [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
  240. : "memory");
  241. }
  242. void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
  243. uint8_t* dst_argb,
  244. int width) {
  245. uint64_t ftmp[6];
  246. uint64_t c0 = 0x000f000f000f000f;
  247. uint64_t c1 = 0x00ff00ff00ff00ff;
  248. __asm__ volatile(
  249. "1: \n\t"
  250. "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
  251. "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
  252. "psrlh %[src1], %[src0], %[eight] \n\t"
  253. "and %[b], %[src0], %[c0] \n\t"
  254. "and %[src0], %[src0], %[c1] \n\t"
  255. "psrlh %[g], %[src0], %[four] \n\t"
  256. "and %[r], %[src1], %[c0] \n\t"
  257. "psrlh %[a], %[src1], %[four] \n\t"
  258. "psllh %[src0], %[b], %[four] \n\t"
  259. "or %[b], %[src0], %[b] \n\t"
  260. "psllh %[src0], %[g], %[four] \n\t"
  261. "or %[g], %[src0], %[g] \n\t"
  262. "psllh %[src0], %[r], %[four] \n\t"
  263. "or %[r], %[src0], %[r] \n\t"
  264. "psllh %[src0], %[a], %[four] \n\t"
  265. "or %[a], %[src0], %[a] \n\t"
  266. "packushb %[b], %[b], %[r] \n\t"
  267. "packushb %[g], %[g], %[a] \n\t"
  268. "punpcklbh %[src0], %[b], %[g] \n\t"
  269. "punpckhbh %[src1], %[b], %[g] \n\t"
  270. "punpcklhw %[r], %[src0], %[src1] \n\t"
  271. "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
  272. "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
  273. "punpckhhw %[r], %[src0], %[src1] \n\t"
  274. "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
  275. "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
  276. "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
  277. "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
  278. "daddiu %[width], %[width], -0x04 \n\t"
  279. "bgtz %[width], 1b \n\t"
  280. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
  281. [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
  282. : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
  283. [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
  284. [four] "f"(0x04)
  285. : "memory");
  286. }
  287. void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  288. uint64_t src;
  289. __asm__ volatile(
  290. "1: \n\t"
  291. "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
  292. "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
  293. "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
  294. "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
  295. "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
  296. "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
  297. "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
  298. "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
  299. "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
  300. "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
  301. "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
  302. "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
  303. "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
  304. "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
  305. "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
  306. "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
  307. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  308. "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
  309. "daddi %[width], %[width], -0x04 \n\t"
  310. "bnez %[width], 1b \n\t"
  311. : [src] "=&f"(src)
  312. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
  313. : "memory");
  314. }
  315. void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  316. uint64_t src0, src1;
  317. uint64_t ftmp[3];
  318. uint64_t mask0 = 0xc6;
  319. uint64_t mask1 = 0x18;
  320. __asm__ volatile(
  321. "1: \n\t"
  322. "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
  323. "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
  324. "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
  325. "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
  326. "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
  327. "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
  328. "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
  329. "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
  330. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  331. "pextrh %[src0], %[ftmp1], %[two] \n\t"
  332. "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
  333. "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
  334. "pextrh %[src0], %[ftmp2], %[two] \n\t"
  335. "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
  336. "pextrh %[src0], %[ftmp2], %[one] \n\t"
  337. "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
  338. "pextrh %[src0], %[ftmp2], %[zero] \n\t"
  339. "pshufh %[src1], %[src1], %[mask1] \n\t"
  340. "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
  341. "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
  342. "packushb %[src1], %[src1], %[zero] \n\t"
  343. "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
  344. "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
  345. "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
  346. "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
  347. "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
  348. "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
  349. "daddiu %[width], %[width], -0x04 \n\t"
  350. "bgtz %[width], 1b \n\t"
  351. : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
  352. [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
  353. : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
  354. [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
  355. [one] "f"(0x01), [two] "f"(0x02)
  356. : "memory");
  357. }
  358. void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  359. uint64_t src0, src1;
  360. uint64_t ftmp[3];
  361. __asm__ volatile(
  362. "1: \n\t"
  363. "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
  364. "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
  365. "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
  366. "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
  367. "punpcklbh %[b], %[src0], %[src1] \n\t"
  368. "punpckhbh %[g], %[src0], %[src1] \n\t"
  369. "punpcklbh %[src0], %[b], %[g] \n\t"
  370. "punpckhbh %[src1], %[b], %[g] \n\t"
  371. "punpcklbh %[b], %[src0], %[zero] \n\t"
  372. "punpckhbh %[g], %[src0], %[zero] \n\t"
  373. "punpcklbh %[r], %[src1], %[zero] \n\t"
  374. "psrlh %[b], %[b], %[three] \n\t"
  375. "psrlh %[g], %[g], %[two] \n\t"
  376. "psrlh %[r], %[r], %[three] \n\t"
  377. "psllh %[g], %[g], %[five] \n\t"
  378. "psllh %[r], %[r], %[eleven] \n\t"
  379. "or %[b], %[b], %[g] \n\t"
  380. "or %[b], %[b], %[r] \n\t"
  381. "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
  382. "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
  383. "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
  384. "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
  385. "daddiu %[width], %[width], -0x04 \n\t"
  386. "bgtz %[width], 1b \n\t"
  387. : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
  388. [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
  389. : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
  390. [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
  391. [eleven] "f"(0x0b)
  392. : "memory");
  393. }
  394. // dither4 is a row of 4 values from 4x4 dither matrix.
  395. // The 4x4 matrix contains values to increase RGB. When converting to
  396. // fewer bits (565) this provides an ordered dither.
  397. // The order in the 4x4 matrix in first byte is upper left.
  398. // The 4 values are passed as an int, then referenced as an array, so
  399. // endian will not affect order of the original matrix. But the dither4
  400. // will containing the first pixel in the lower byte for little endian
  401. // or the upper byte for big endian.
  402. void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
  403. uint8_t* dst_rgb,
  404. const uint32_t dither4,
  405. int width) {
  406. uint64_t src0, src1;
  407. uint64_t ftmp[3];
  408. uint64_t c0 = 0x00ff00ff00ff00ff;
  409. __asm__ volatile(
  410. "punpcklbh %[dither], %[dither], %[zero] \n\t"
  411. "1: \n\t"
  412. "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
  413. "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
  414. "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
  415. "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
  416. "punpcklbh %[b], %[src0], %[src1] \n\t"
  417. "punpckhbh %[g], %[src0], %[src1] \n\t"
  418. "punpcklbh %[src0], %[b], %[g] \n\t"
  419. "punpckhbh %[src1], %[b], %[g] \n\t"
  420. "punpcklbh %[b], %[src0], %[zero] \n\t"
  421. "punpckhbh %[g], %[src0], %[zero] \n\t"
  422. "punpcklbh %[r], %[src1], %[zero] \n\t"
  423. "paddh %[b], %[b], %[dither] \n\t"
  424. "paddh %[g], %[g], %[dither] \n\t"
  425. "paddh %[r], %[r], %[dither] \n\t"
  426. "pcmpgth %[src0], %[b], %[c0] \n\t"
  427. "or %[src0], %[src0], %[b] \n\t"
  428. "and %[b], %[src0], %[c0] \n\t"
  429. "pcmpgth %[src0], %[g], %[c0] \n\t"
  430. "or %[src0], %[src0], %[g] \n\t"
  431. "and %[g], %[src0], %[c0] \n\t"
  432. "pcmpgth %[src0], %[r], %[c0] \n\t"
  433. "or %[src0], %[src0], %[r] \n\t"
  434. "and %[r], %[src0], %[c0] \n\t"
  435. "psrlh %[b], %[b], %[three] \n\t"
  436. "psrlh %[g], %[g], %[two] \n\t"
  437. "psrlh %[r], %[r], %[three] \n\t"
  438. "psllh %[g], %[g], %[five] \n\t"
  439. "psllh %[r], %[r], %[eleven] \n\t"
  440. "or %[b], %[b], %[g] \n\t"
  441. "or %[b], %[b], %[r] \n\t"
  442. "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
  443. "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
  444. "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
  445. "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
  446. "daddiu %[width], %[width], -0x04 \n\t"
  447. "bgtz %[width], 1b \n\t"
  448. : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
  449. [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
  450. : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
  451. [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
  452. [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
  453. : "memory");
  454. }
  455. void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
  456. uint8_t* dst_rgb,
  457. int width) {
  458. uint64_t src0, src1;
  459. uint64_t ftmp[4];
  460. __asm__ volatile(
  461. "1: \n\t"
  462. "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
  463. "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
  464. "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
  465. "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
  466. "punpcklbh %[b], %[src0], %[src1] \n\t"
  467. "punpckhbh %[g], %[src0], %[src1] \n\t"
  468. "punpcklbh %[src0], %[b], %[g] \n\t"
  469. "punpckhbh %[src1], %[b], %[g] \n\t"
  470. "punpcklbh %[b], %[src0], %[zero] \n\t"
  471. "punpckhbh %[g], %[src0], %[zero] \n\t"
  472. "punpcklbh %[r], %[src1], %[zero] \n\t"
  473. "punpckhbh %[a], %[src1], %[zero] \n\t"
  474. "psrlh %[b], %[b], %[three] \n\t"
  475. "psrlh %[g], %[g], %[three] \n\t"
  476. "psrlh %[r], %[r], %[three] \n\t"
  477. "psrlh %[a], %[a], %[seven] \n\t"
  478. "psllh %[g], %[g], %[five] \n\t"
  479. "psllh %[r], %[r], %[ten] \n\t"
  480. "psllh %[a], %[a], %[fifteen] \n\t"
  481. "or %[b], %[b], %[g] \n\t"
  482. "or %[b], %[b], %[r] \n\t"
  483. "or %[b], %[b], %[a] \n\t"
  484. "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
  485. "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
  486. "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
  487. "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
  488. "daddiu %[width], %[width], -0x04 \n\t"
  489. "bgtz %[width], 1b \n\t"
  490. : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
  491. [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
  492. : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
  493. [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
  494. [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
  495. : "memory");
  496. }
  497. void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
  498. uint8_t* dst_rgb,
  499. int width) {
  500. uint64_t src0, src1;
  501. uint64_t ftmp[4];
  502. __asm__ volatile(
  503. "1: \n\t"
  504. "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
  505. "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
  506. "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
  507. "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
  508. "punpcklbh %[b], %[src0], %[src1] \n\t"
  509. "punpckhbh %[g], %[src0], %[src1] \n\t"
  510. "punpcklbh %[src0], %[b], %[g] \n\t"
  511. "punpckhbh %[src1], %[b], %[g] \n\t"
  512. "punpcklbh %[b], %[src0], %[zero] \n\t"
  513. "punpckhbh %[g], %[src0], %[zero] \n\t"
  514. "punpcklbh %[r], %[src1], %[zero] \n\t"
  515. "punpckhbh %[a], %[src1], %[zero] \n\t"
  516. "psrlh %[b], %[b], %[four] \n\t"
  517. "psrlh %[g], %[g], %[four] \n\t"
  518. "psrlh %[r], %[r], %[four] \n\t"
  519. "psrlh %[a], %[a], %[four] \n\t"
  520. "psllh %[g], %[g], %[four] \n\t"
  521. "psllh %[r], %[r], %[eight] \n\t"
  522. "psllh %[a], %[a], %[twelve] \n\t"
  523. "or %[b], %[b], %[g] \n\t"
  524. "or %[b], %[b], %[r] \n\t"
  525. "or %[b], %[b], %[a] \n\t"
  526. "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
  527. "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
  528. "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
  529. "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
  530. "daddiu %[width], %[width], -0x04 \n\t"
  531. "bgtz %[width], 1b \n\t"
  532. : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
  533. [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
  534. : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
  535. [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
  536. [twelve] "f"(0x0c)
  537. : "memory");
  538. }
  539. void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  540. uint64_t src, src_hi, src_lo;
  541. uint64_t dest0, dest1, dest2, dest3;
  542. const uint64_t value = 0x1080;
  543. const uint64_t mask = 0x0001004200810019;
  544. __asm__ volatile(
  545. "1: \n\t"
  546. "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
  547. "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
  548. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  549. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  550. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  551. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  552. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  553. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  554. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  555. "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
  556. "paddw %[dest0], %[dest0], %[src] \n\t"
  557. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  558. "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
  559. "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
  560. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  561. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  562. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  563. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  564. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  565. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  566. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  567. "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
  568. "paddw %[dest1], %[dest1], %[src] \n\t"
  569. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  570. "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
  571. "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
  572. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  573. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  574. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  575. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  576. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  577. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  578. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  579. "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
  580. "paddw %[dest2], %[dest2], %[src] \n\t"
  581. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  582. "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
  583. "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
  584. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  585. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  586. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  587. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  588. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  589. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  590. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  591. "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
  592. "paddw %[dest3], %[dest3], %[src] \n\t"
  593. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  594. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  595. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  596. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  597. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  598. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  599. "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
  600. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  601. "daddi %[width], %[width], -0x08 \n\t"
  602. "bnez %[width], 1b \n\t"
  603. : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  604. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  605. [dest3] "=&f"(dest3)
  606. : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
  607. [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
  608. [zero] "f"(0x00)
  609. : "memory");
  610. }
  611. void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
  612. int src_stride_rgb,
  613. uint8_t* dst_u,
  614. uint8_t* dst_v,
  615. int width) {
  616. uint64_t src_rgb1;
  617. uint64_t ftmp[12];
  618. const uint64_t value = 0x4040;
  619. const uint64_t mask_u = 0x0026004a00700002;
  620. const uint64_t mask_v = 0x00020070005e0012;
  621. __asm__ volatile(
  622. "1: \n\t"
  623. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  624. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  625. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  626. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  627. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  628. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  629. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  630. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  631. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  632. "paddh %[src0], %[src0], %[src_lo] \n\t"
  633. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  634. "paddh %[src0], %[src0], %[src_hi] \n\t"
  635. "psrlh %[src0], %[src0], %[two] \n\t"
  636. "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
  637. "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
  638. "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
  639. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  640. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  641. "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
  642. "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
  643. "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
  644. "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
  645. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  646. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  647. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  648. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  649. "paddh %[src0], %[src0], %[src_lo] \n\t"
  650. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  651. "paddh %[src0], %[src0], %[src_hi] \n\t"
  652. "psrlh %[src0], %[src0], %[two] \n\t"
  653. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  654. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  655. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  656. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  657. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  658. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  659. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  660. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  661. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  662. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  663. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  664. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  665. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  666. "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
  667. "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
  668. "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
  669. "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
  670. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  671. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  672. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  673. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  674. "paddh %[src0], %[src0], %[src_lo] \n\t"
  675. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  676. "paddh %[src0], %[src0], %[src_hi] \n\t"
  677. "psrlh %[src0], %[src0], %[two] \n\t"
  678. "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
  679. "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
  680. "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
  681. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  682. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  683. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  684. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  685. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  686. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  687. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  688. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  689. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  690. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  691. "paddh %[src0], %[src0], %[src_lo] \n\t"
  692. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  693. "paddh %[src0], %[src0], %[src_hi] \n\t"
  694. "psrlh %[src0], %[src0], %[two] \n\t"
  695. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  696. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  697. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  698. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  699. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  700. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  701. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  702. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  703. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  704. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  705. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  706. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  707. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  708. "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
  709. "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
  710. "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
  711. "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
  712. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  713. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  714. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  715. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  716. "paddh %[src0], %[src0], %[src_lo] \n\t"
  717. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  718. "paddh %[src0], %[src0], %[src_hi] \n\t"
  719. "psrlh %[src0], %[src0], %[two] \n\t"
  720. "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
  721. "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
  722. "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
  723. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  724. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  725. "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
  726. "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
  727. "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
  728. "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
  729. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  730. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  731. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  732. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  733. "paddh %[src0], %[src0], %[src_lo] \n\t"
  734. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  735. "paddh %[src0], %[src0], %[src_hi] \n\t"
  736. "psrlh %[src0], %[src0], %[two] \n\t"
  737. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  738. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  739. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  740. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  741. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  742. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  743. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  744. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  745. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  746. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  747. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  748. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  749. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  750. "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
  751. "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
  752. "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
  753. "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
  754. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  755. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  756. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  757. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  758. "paddh %[src0], %[src0], %[src_lo] \n\t"
  759. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  760. "paddh %[src0], %[src0], %[src_hi] \n\t"
  761. "psrlh %[src0], %[src0], %[two] \n\t"
  762. "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
  763. "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
  764. "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
  765. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  766. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  767. "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
  768. "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
  769. "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
  770. "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
  771. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  772. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  773. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  774. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  775. "paddh %[src0], %[src0], %[src_lo] \n\t"
  776. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  777. "paddh %[src0], %[src0], %[src_hi] \n\t"
  778. "psrlh %[src0], %[src0], %[two] \n\t"
  779. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  780. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  781. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  782. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  783. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  784. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  785. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  786. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  787. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  788. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  789. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  790. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  791. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  792. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  793. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  794. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  795. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  796. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  797. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  798. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  799. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  800. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  801. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  802. "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
  803. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  804. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  805. "daddi %[width], %[width], -0x10 \n\t"
  806. "bgtz %[width], 1b \n\t"
  807. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  808. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  809. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  810. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  811. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  812. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  813. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  814. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  815. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  816. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  817. [sixteen] "f"(0x10)
  818. : "memory");
  819. }
  // Converts a row of 4-byte pixels to 8-bit luma using Loongson MMI SIMD.
  //
  // For every pixel, byte 0 is discarded (its 16-bit lane is overwritten with
  // the bias constant) and the output is
  //   Y = (66 * byte1 + 129 * byte2 + 25 * byte3 + 0x1080) >> 8
  // which follows from `mask` (16-bit lanes, low to high: 0x0001, 0x0042,
  // 0x0081, 0x0019) and `value`. 0x1080 / 256 = 16.5, i.e. a +16 offset plus
  // 0.5 for rounding. The 66/129/25 weights look like the usual R/G/B luma
  // coefficients, which would make the in-memory byte order A,R,G,B --
  // NOTE(review): confirm against the libyuv BGRA format definition.
  //
  // src_argb0: source pixels; 32 bytes (8 pixels) are read per iteration.
  //            Loads use the gsldlc1/gsldrc1 pair, so no alignment is needed.
  // dst_y:     destination; 8 bytes are written per iteration (gssdlc1/gssdrc1).
  // width:     pixel count. It is decremented by 8 per iteration and the loop
  //            ends only when it reaches exactly 0 (bnez); the body runs before
  //            the test, so at least 8 pixels are always processed. Callers
  //            presumably pass a positive multiple of 8 -- TODO confirm.
  //
  // NOTE(review): %[src_argb0], %[dst_y] and %[width] are modified inside the
  // asm (daddiu/daddi at the end of the loop) but are declared in the input
  // operand list. GCC's extended-asm rules say input-only operands must not be
  // modified; "+r" read-write operands would express this correctly.
  820. void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  821. uint64_t src, src_hi, src_lo;
  822. uint64_t dest0, dest1, dest2, dest3;
  // Bias inserted into lane 0 of every pixel: 16.5 in 8.8 fixed point.
  823. const uint64_t value = 0x1080;
  // Per-lane multipliers for pmaddhw: lane0 * 1 (bias), lane1 * 66,
  // lane2 * 129, lane3 * 25.
  824. const uint64_t mask = 0x0019008100420001;
  825. __asm__ volatile(
  826. "1: \n\t"
  // Pixels 0-1: load 8 bytes, widen each pixel's 4 bytes to 16-bit lanes
  // (punpcklbh/punpckhbh with zero), replace lane 0 with the bias, then
  // multiply-accumulate adjacent lanes into two 32-bit partial sums per pixel.
  827. "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
  828. "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
  829. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  830. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  831. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  832. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  833. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  834. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  // Regroup so the two partial sums of each pixel line up, add them, and
  // drop the 8 fractional bits. dest0 = {Y0, Y1} as 32-bit words.
  835. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  836. "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
  837. "paddw %[dest0], %[dest0], %[src] \n\t"
  838. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  // Pixels 2-3 -> dest1 (same sequence, source offset 0x08).
  839. "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
  840. "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
  841. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  842. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  843. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  844. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  845. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  846. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  847. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  848. "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
  849. "paddw %[dest1], %[dest1], %[src] \n\t"
  850. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  // Pixels 4-5 -> dest2 (source offset 0x10).
  851. "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
  852. "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
  853. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  854. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  855. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  856. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  857. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  858. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  859. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  860. "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
  861. "paddw %[dest2], %[dest2], %[src] \n\t"
  862. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  // Pixels 6-7 -> dest3 (source offset 0x18).
  863. "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
  864. "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
  865. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  866. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  867. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  868. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  869. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  870. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  871. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  872. "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
  873. "paddw %[dest3], %[dest3], %[src] \n\t"
  874. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  // Narrow the eight 32-bit results to bytes (words -> halfwords -> bytes,
  // both packs saturate) and store them as one 8-byte unaligned write.
  875. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  876. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  877. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  878. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  879. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  // Advance: 32 source bytes, 8 destination bytes, 8 pixels consumed.
  880. "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
  881. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  882. "daddi %[width], %[width], -0x08 \n\t"
  883. "bnez %[width], 1b \n\t"
  884. : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  885. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  886. [dest3] "=&f"(dest3)
  887. : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
  888. [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
  889. [zero] "f"(0x00)
  890. : "memory");
  891. }
  // Produces subsampled chroma (U and V) from two adjacent rows of 4-byte
  // pixels using Loongson MMI SIMD.
  //
  // Each output sample is derived from a 2x2 block: two horizontally adjacent
  // pixels of the row at src_rgb0 and the two pixels below them in the row at
  // src_rgb0 + src_stride_rgb. The four pixels are summed per byte and shifted
  // right by 2 (truncating average), giving avg0..avg3 for bytes 0..3. Then,
  // from mask_u / mask_v / value and the psubw operand order:
  //   U = (112 * avg3 - 74 * avg2 - 38 * avg1 + 0x8080) >> 8
  //   V = (112 * avg1 - 94 * avg2 - 18 * avg3 + 0x8080) >> 8
  // avg0 is not used (its lane is shifted out for U and overwritten for V).
  // 0x8080 = 2 * 0x4040 is 128.5 in 8.8 fixed point (offset 128 plus rounding).
  // These weights match the usual B/G/R chroma coefficients, which would make
  // the in-memory byte order A,R,G,B -- NOTE(review): confirm against the
  // libyuv BGRA format definition.
  //
  // src_rgb0:       first source row; 64 bytes (16 pixels) read per iteration.
  // src_stride_rgb: byte offset from src_rgb0 to the second source row.
  // dst_u, dst_v:   destinations; 8 bytes each are written per iteration.
  // width:          pixel count; decremented by 16 per iteration, looping
  //                 while it stays > 0 (bgtz). The body runs before the test,
  //                 so one full iteration always executes.
  //
  // NOTE(review): %[src_rgb0], %[dst_u], %[dst_v] and %[width] are modified
  // inside the asm but are declared in the input operand list. GCC's
  // extended-asm rules say input-only operands must not be modified; "+r"
  // read-write operands would express this correctly.
  892. void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
  893. int src_stride_rgb,
  894. uint8_t* dst_u,
  895. uint8_t* dst_v,
  896. int width) {
  // Pointer to the second row, recomputed by the asm on every iteration.
  897. uint64_t src_rgb1;
  // Scratch storage bound to the twelve floating-point register outputs.
  898. uint64_t ftmp[12];
  // Bias lane; multiplied by 2 through the masks to yield 0x8080.
  899. const uint64_t value = 0x4040;
  // 16-bit lane multipliers, low to high: 38, 74, 112, 2 (bias).
  900. const uint64_t mask_u = 0x00020070004a0026;
  // 16-bit lane multipliers, low to high: 2 (bias), 112, 94, 18.
  901. const uint64_t mask_v = 0x0012005e00700002;
  902. __asm__ volatile(
  903. "1: \n\t"
  904. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  // --- Output samples 0-1 -------------------------------------------------
  // First 2x2 block (bytes 0x00-0x07 of both rows): widen bytes to 16-bit
  // lanes, sum the four pixels, and divide by 4.
  905. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  906. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  907. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  908. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  909. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  910. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  911. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  912. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  913. "paddh %[src0], %[src0], %[src_lo] \n\t"
  914. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  915. "paddh %[src0], %[src0], %[src_hi] \n\t"
  916. "psrlh %[src0], %[src0], %[two] \n\t"
  // U input: shift lanes down by one (drops avg0) and put the bias in lane 3.
  // V input: overwrite lane 0 (avg0) with the bias.
  // pmaddhw then forms two 32-bit partial sums for each of U and V.
  917. "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
  918. "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
  919. "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
  920. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  921. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  // Second 2x2 block (bytes 0x08-0x0f), partial sums kept in src_lo/src_hi.
  922. "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
  923. "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
  924. "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
  925. "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
  926. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  927. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  928. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  929. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  930. "paddh %[src0], %[src0], %[src_lo] \n\t"
  931. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  932. "paddh %[src0], %[src0], %[src_hi] \n\t"
  933. "psrlh %[src0], %[src0], %[two] \n\t"
  934. "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
  935. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  936. "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
  937. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  938. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine both blocks: gather the low partial sums and the high partial
  // sums, subtract (high - low for U, low - high for V), arithmetic >> 8.
  939. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  940. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  941. "psubw %[dest0_u], %[src1], %[src0] \n\t"
  942. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  943. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  944. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  945. "psubw %[dest0_v], %[src0], %[src1] \n\t"
  946. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  // --- Output samples 2-3 (bytes 0x10-0x1f) -> dest1_u / dest1_v ----------
  947. "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
  948. "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
  949. "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
  950. "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
  951. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  952. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  953. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  954. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  955. "paddh %[src0], %[src0], %[src_lo] \n\t"
  956. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  957. "paddh %[src0], %[src0], %[src_hi] \n\t"
  958. "psrlh %[src0], %[src0], %[two] \n\t"
  959. "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
  960. "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
  961. "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
  962. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  963. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  964. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  965. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  966. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  967. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  968. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  969. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  970. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  971. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  972. "paddh %[src0], %[src0], %[src_lo] \n\t"
  973. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  974. "paddh %[src0], %[src0], %[src_hi] \n\t"
  975. "psrlh %[src0], %[src0], %[two] \n\t"
  976. "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
  977. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  978. "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
  979. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  980. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  981. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  982. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  983. "psubw %[dest1_u], %[src1], %[src0] \n\t"
  984. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  985. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  986. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  987. "psubw %[dest1_v], %[src0], %[src1] \n\t"
  988. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  // --- Output samples 4-5 (bytes 0x20-0x2f) -> dest2_u / dest2_v ----------
  989. "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
  990. "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
  991. "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
  992. "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
  993. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  994. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  995. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  996. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  997. "paddh %[src0], %[src0], %[src_lo] \n\t"
  998. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  999. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1000. "psrlh %[src0], %[src0], %[two] \n\t"
  1001. "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
  1002. "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
  1003. "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
  1004. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  1005. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  1006. "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
  1007. "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
  1008. "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
  1009. "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
  1010. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1011. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1012. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1013. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1014. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1015. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1016. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1017. "psrlh %[src0], %[src0], %[two] \n\t"
  1018. "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
  1019. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1020. "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
  1021. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1022. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1023. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  1024. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  1025. "psubw %[dest2_u], %[src1], %[src0] \n\t"
  1026. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  1027. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  1028. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  1029. "psubw %[dest2_v], %[src0], %[src1] \n\t"
  1030. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  // --- Output samples 6-7 (bytes 0x30-0x3f) -> dest3_u / dest3_v ----------
  1031. "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
  1032. "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
  1033. "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
  1034. "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
  1035. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1036. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1037. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1038. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1039. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1040. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1041. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1042. "psrlh %[src0], %[src0], %[two] \n\t"
  1043. "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
  1044. "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
  1045. "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
  1046. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  1047. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  1048. "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
  1049. "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
  1050. "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
  1051. "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
  1052. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1053. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1054. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1055. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1056. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1057. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1058. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1059. "psrlh %[src0], %[src0], %[two] \n\t"
  1060. "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
  1061. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1062. "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
  1063. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1064. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1065. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  1066. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  1067. "psubw %[dest3_u], %[src1], %[src0] \n\t"
  1068. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  1069. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  1070. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  1071. "psubw %[dest3_v], %[src0], %[src1] \n\t"
  1072. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  // Narrow the eight 32-bit U results to bytes (both packs saturate) and
  // store them; then do the same for V.
  1073. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  1074. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  1075. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  1076. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  1077. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  1078. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  1079. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  1080. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  1081. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  1082. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  // Advance: 64 source bytes, 8 bytes in each destination, 16 pixels consumed.
  // src_rgb1 is rebuilt from src_rgb0 at the top of the loop.
  1083. "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
  1084. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  1085. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  1086. "daddi %[width], %[width], -0x10 \n\t"
  1087. "bgtz %[width], 1b \n\t"
  1088. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  1089. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  1090. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  1091. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  1092. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  1093. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  1094. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  1095. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  1096. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  1097. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  1098. [sixteen] "f"(0x10)
  1099. : "memory");
  1100. }
  // Converts a row of 4-byte pixels to 8-bit luma using Loongson MMI SIMD.
  //
  // For every pixel, byte 3 is discarded (its 16-bit lane is overwritten with
  // the bias constant) and the output is
  //   Y = (66 * byte0 + 129 * byte1 + 25 * byte2 + 0x1080) >> 8
  // which follows from `mask` (16-bit lanes, low to high: 0x0042, 0x0081,
  // 0x0019, 0x0001) and `value`. 0x1080 / 256 = 16.5, i.e. a +16 offset plus
  // 0.5 for rounding. The 66/129/25 weights look like the usual R/G/B luma
  // coefficients, which would make the in-memory byte order R,G,B,A --
  // NOTE(review): confirm against the libyuv ABGR format definition.
  //
  // src_argb0: source pixels; 32 bytes (8 pixels) are read per iteration.
  //            Loads use the gsldlc1/gsldrc1 pair, so no alignment is needed.
  // dst_y:     destination; 8 bytes are written per iteration (gssdlc1/gssdrc1).
  // width:     pixel count. It is decremented by 8 per iteration and the loop
  //            ends only when it reaches exactly 0 (bnez); the body runs before
  //            the test, so at least 8 pixels are always processed. Callers
  //            presumably pass a positive multiple of 8 -- TODO confirm.
  //
  // NOTE(review): %[src_argb0], %[dst_y] and %[width] are modified inside the
  // asm (daddiu/daddi at the end of the loop) but are declared in the input
  // operand list. GCC's extended-asm rules say input-only operands must not be
  // modified; "+r" read-write operands would express this correctly.
  1101. void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  1102. uint64_t src, src_hi, src_lo;
  1103. uint64_t dest0, dest1, dest2, dest3;
  // Bias inserted into lane 3 of every pixel: 16.5 in 8.8 fixed point.
  1104. const uint64_t value = 0x1080;
  // Per-lane multipliers for pmaddhw: lane0 * 66, lane1 * 129, lane2 * 25,
  // lane3 * 1 (bias).
  1105. const uint64_t mask = 0x0001001900810042;
  1106. __asm__ volatile(
  1107. "1: \n\t"
  // Pixels 0-1: load 8 bytes, widen each pixel's 4 bytes to 16-bit lanes
  // (punpcklbh/punpckhbh with zero), replace lane 3 with the bias, then
  // multiply-accumulate adjacent lanes into two 32-bit partial sums per pixel.
  1108. "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
  1109. "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
  1110. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1111. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1112. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1113. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1114. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1115. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  // Regroup so the two partial sums of each pixel line up, add them, and
  // drop the 8 fractional bits. dest0 = {Y0, Y1} as 32-bit words.
  1116. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1117. "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
  1118. "paddw %[dest0], %[dest0], %[src] \n\t"
  1119. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  // Pixels 2-3 -> dest1 (same sequence, source offset 0x08).
  1120. "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
  1121. "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
  1122. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1123. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1124. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1125. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1126. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1127. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1128. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1129. "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
  1130. "paddw %[dest1], %[dest1], %[src] \n\t"
  1131. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  // Pixels 4-5 -> dest2 (source offset 0x10).
  1132. "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
  1133. "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
  1134. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1135. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1136. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1137. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1138. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1139. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1140. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1141. "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
  1142. "paddw %[dest2], %[dest2], %[src] \n\t"
  1143. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  // Pixels 6-7 -> dest3 (source offset 0x18).
  1144. "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
  1145. "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
  1146. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1147. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1148. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1149. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1150. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1151. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1152. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1153. "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
  1154. "paddw %[dest3], %[dest3], %[src] \n\t"
  1155. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  // Narrow the eight 32-bit results to bytes (words -> halfwords -> bytes,
  // both packs saturate) and store them as one 8-byte unaligned write.
  1156. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  1157. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  1158. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  1159. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  1160. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  // Advance: 32 source bytes, 8 destination bytes, 8 pixels consumed.
  1161. "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
  1162. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  1163. "daddi %[width], %[width], -0x08 \n\t"
  1164. "bnez %[width], 1b \n\t"
  1165. : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  1166. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  1167. [dest3] "=&f"(dest3)
  1168. : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
  1169. [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
  1170. [zero] "f"(0x00)
  1171. : "memory");
  1172. }
  1173. void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
  1174. int src_stride_rgb,
  1175. uint8_t* dst_u,
  1176. uint8_t* dst_v,
  1177. int width) {
  1178. uint64_t src_rgb1;
  1179. uint64_t ftmp[12];
  1180. const uint64_t value = 0x4040;
  1181. const uint64_t mask_u = 0x00020070004a0026;
  1182. const uint64_t mask_v = 0x0012005e00700002;
  1183. __asm__ volatile(
  1184. "1: \n\t"
  1185. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  1186. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  1187. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  1188. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  1189. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  1190. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1191. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1192. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1193. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1194. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1195. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1196. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1197. "psrlh %[src0], %[src0], %[two] \n\t"
  1198. "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
  1199. "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
  1200. "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
  1201. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  1202. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  1203. "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
  1204. "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
  1205. "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
  1206. "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
  1207. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1208. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1209. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1210. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1211. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1212. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1213. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1214. "psrlh %[src0], %[src0], %[two] \n\t"
  1215. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  1216. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  1217. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1218. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1219. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1220. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  1221. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  1222. "psubw %[dest0_u], %[src1], %[src0] \n\t"
  1223. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  1224. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  1225. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  1226. "psubw %[dest0_v], %[src0], %[src1] \n\t"
  1227. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  1228. "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
  1229. "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
  1230. "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
  1231. "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
  1232. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1233. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1234. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1235. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1236. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1237. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1238. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1239. "psrlh %[src0], %[src0], %[two] \n\t"
  1240. "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
  1241. "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
  1242. "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
  1243. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  1244. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  1245. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  1246. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  1247. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  1248. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  1249. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1250. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1251. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1252. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1253. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1254. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1255. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1256. "psrlh %[src0], %[src0], %[two] \n\t"
  1257. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  1258. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  1259. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1260. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1261. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1262. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  1263. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  1264. "psubw %[dest1_u], %[src1], %[src0] \n\t"
  1265. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  1266. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  1267. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  1268. "psubw %[dest1_v], %[src0], %[src1] \n\t"
  1269. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  1270. "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
  1271. "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
  1272. "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
  1273. "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
  1274. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1275. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1276. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1277. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1278. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1279. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1280. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1281. "psrlh %[src0], %[src0], %[two] \n\t"
  1282. "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
  1283. "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
  1284. "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
  1285. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  1286. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  1287. "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
  1288. "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
  1289. "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
  1290. "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
  1291. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1292. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1293. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1294. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1295. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1296. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1297. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1298. "psrlh %[src0], %[src0], %[two] \n\t"
  1299. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  1300. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  1301. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1302. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1303. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1304. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  1305. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  1306. "psubw %[dest2_u], %[src1], %[src0] \n\t"
  1307. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  1308. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  1309. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  1310. "psubw %[dest2_v], %[src0], %[src1] \n\t"
  1311. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  1312. "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
  1313. "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
  1314. "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
  1315. "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
  1316. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1317. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1318. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1319. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1320. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1321. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1322. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1323. "psrlh %[src0], %[src0], %[two] \n\t"
  1324. "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
  1325. "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
  1326. "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
  1327. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  1328. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  1329. "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
  1330. "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
  1331. "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
  1332. "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
  1333. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1334. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1335. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1336. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1337. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1338. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1339. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1340. "psrlh %[src0], %[src0], %[two] \n\t"
  1341. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  1342. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  1343. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1344. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1345. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1346. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  1347. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  1348. "psubw %[dest3_u], %[src1], %[src0] \n\t"
  1349. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  1350. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  1351. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  1352. "psubw %[dest3_v], %[src0], %[src1] \n\t"
  1353. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  1354. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  1355. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  1356. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  1357. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  1358. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  1359. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  1360. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  1361. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  1362. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  1363. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  1364. "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
  1365. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  1366. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  1367. "daddi %[width], %[width], -0x10 \n\t"
  1368. "bgtz %[width], 1b \n\t"
  1369. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  1370. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  1371. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  1372. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  1373. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  1374. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  1375. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  1376. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  1377. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  1378. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  1379. [sixteen] "f"(0x10)
  1380. : "memory");
  1381. }
  1382. void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  1383. uint64_t src, src_hi, src_lo;
  1384. uint64_t dest0, dest1, dest2, dest3;
  1385. const uint64_t value = 0x1080;
  1386. const uint64_t mask = 0x0042008100190001;
  1387. __asm__ volatile(
  1388. "1: \n\t"
  1389. "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
  1390. "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
  1391. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1392. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1393. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1394. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1395. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1396. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1397. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1398. "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
  1399. "paddw %[dest0], %[dest0], %[src] \n\t"
  1400. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  1401. "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
  1402. "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
  1403. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1404. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1405. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1406. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1407. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1408. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1409. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1410. "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
  1411. "paddw %[dest1], %[dest1], %[src] \n\t"
  1412. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  1413. "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
  1414. "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
  1415. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1416. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1417. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1418. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1419. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1420. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1421. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1422. "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
  1423. "paddw %[dest2], %[dest2], %[src] \n\t"
  1424. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  1425. "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
  1426. "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
  1427. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1428. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1429. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1430. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1431. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  1432. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1433. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1434. "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
  1435. "paddw %[dest3], %[dest3], %[src] \n\t"
  1436. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  1437. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  1438. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  1439. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  1440. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  1441. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  1442. "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
  1443. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  1444. "daddi %[width], %[width], -0x08 \n\t"
  1445. "bnez %[width], 1b \n\t"
  1446. : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  1447. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  1448. [dest3] "=&f"(dest3)
  1449. : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
  1450. [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
  1451. [zero] "f"(0x00)
  1452. : "memory");
  1453. }
  1454. void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
  1455. int src_stride_rgb,
  1456. uint8_t* dst_u,
  1457. uint8_t* dst_v,
  1458. int width) {
  1459. uint64_t src_rgb1;
  1460. uint64_t ftmp[12];
  1461. const uint64_t value = 0x4040;
  1462. const uint64_t mask_u = 0x0026004a00700002;
  1463. const uint64_t mask_v = 0x00020070005e0012;
  1464. __asm__ volatile(
  1465. "1: \n\t"
  1466. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  1467. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  1468. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  1469. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  1470. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  1471. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1472. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1473. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1474. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1475. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1476. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1477. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1478. "psrlh %[src0], %[src0], %[two] \n\t"
  1479. "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
  1480. "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
  1481. "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
  1482. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  1483. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  1484. "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
  1485. "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
  1486. "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
  1487. "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
  1488. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1489. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1490. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1491. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1492. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1493. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1494. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1495. "psrlh %[src0], %[src0], %[two] \n\t"
  1496. "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
  1497. "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
  1498. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1499. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1500. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1501. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  1502. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  1503. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  1504. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  1505. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  1506. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  1507. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  1508. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  1509. "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
  1510. "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
  1511. "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
  1512. "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
  1513. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1514. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1515. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1516. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1517. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1518. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1519. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1520. "psrlh %[src0], %[src0], %[two] \n\t"
  1521. "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
  1522. "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
  1523. "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
  1524. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  1525. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  1526. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  1527. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  1528. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  1529. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  1530. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1531. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1532. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1533. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1534. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1535. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1536. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1537. "psrlh %[src0], %[src0], %[two] \n\t"
  1538. "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
  1539. "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
  1540. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1541. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1542. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1543. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  1544. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  1545. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  1546. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  1547. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  1548. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  1549. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  1550. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  1551. "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
  1552. "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
  1553. "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
  1554. "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
  1555. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1556. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1557. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1558. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1559. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1560. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1561. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1562. "psrlh %[src0], %[src0], %[two] \n\t"
  1563. "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
  1564. "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
  1565. "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
  1566. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  1567. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  1568. "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
  1569. "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
  1570. "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
  1571. "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
  1572. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1573. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1574. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1575. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1576. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1577. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1578. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1579. "psrlh %[src0], %[src0], %[two] \n\t"
  1580. "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
  1581. "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
  1582. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1583. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1584. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1585. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  1586. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  1587. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  1588. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  1589. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  1590. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  1591. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  1592. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  1593. "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
  1594. "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
  1595. "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
  1596. "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
  1597. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1598. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1599. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1600. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1601. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1602. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1603. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1604. "psrlh %[src0], %[src0], %[two] \n\t"
  1605. "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
  1606. "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
  1607. "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
  1608. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  1609. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  1610. "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
  1611. "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
  1612. "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
  1613. "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
  1614. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1615. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1616. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1617. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1618. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1619. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1620. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1621. "psrlh %[src0], %[src0], %[two] \n\t"
  1622. "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
  1623. "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
  1624. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1625. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1626. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1627. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  1628. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  1629. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  1630. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  1631. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  1632. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  1633. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  1634. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  1635. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  1636. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  1637. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  1638. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  1639. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  1640. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  1641. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  1642. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  1643. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  1644. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  1645. "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
  1646. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  1647. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  1648. "daddi %[width], %[width], -0x10 \n\t"
  1649. "bgtz %[width], 1b \n\t"
  1650. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  1651. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  1652. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  1653. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  1654. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  1655. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  1656. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  1657. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  1658. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  1659. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  1660. [sixteen] "f"(0x10)
  1661. : "memory");
  1662. }
  1663. void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  1664. uint64_t src, src_hi, src_lo;
  1665. uint64_t dest0, dest1, dest2, dest3;
  1666. const uint64_t value = 0x1080;
  1667. const uint64_t mask = 0x0001004200810019;
  1668. __asm__ volatile(
  1669. "1: \n\t"
  1670. "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
  1671. "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
  1672. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1673. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1674. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1675. "dsll %[src], %[src], %[eight] \n\t"
  1676. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1677. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1678. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1679. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1680. "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
  1681. "paddw %[dest0], %[dest0], %[src] \n\t"
  1682. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  1683. "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
  1684. "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
  1685. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1686. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1687. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1688. "dsll %[src], %[src], %[eight] \n\t"
  1689. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1690. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1691. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1692. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1693. "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
  1694. "paddw %[dest1], %[dest1], %[src] \n\t"
  1695. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  1696. "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
  1697. "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
  1698. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1699. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1700. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1701. "dsll %[src], %[src], %[eight] \n\t"
  1702. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1703. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1704. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1705. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1706. "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
  1707. "paddw %[dest2], %[dest2], %[src] \n\t"
  1708. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  1709. "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
  1710. "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
  1711. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1712. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1713. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1714. "dsll %[src], %[src], %[eight] \n\t"
  1715. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1716. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1717. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1718. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1719. "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
  1720. "paddw %[dest3], %[dest3], %[src] \n\t"
  1721. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  1722. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  1723. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  1724. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  1725. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  1726. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  1727. "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
  1728. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  1729. "daddi %[width], %[width], -0x08 \n\t"
  1730. "bnez %[width], 1b \n\t"
  1731. : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  1732. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  1733. [dest3] "=&f"(dest3)
  1734. : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
  1735. [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
  1736. [zero] "f"(0x00)
  1737. : "memory");
  1738. }
  1739. void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
  1740. int src_stride_rgb,
  1741. uint8_t* dst_u,
  1742. uint8_t* dst_v,
  1743. int width) {
  1744. uint64_t src_rgb1;
  1745. uint64_t ftmp[12];
  1746. const uint64_t value = 0x4040;
  1747. const uint64_t mask_u = 0x0026004a00700002;
  1748. const uint64_t mask_v = 0x00020070005e0012;
  1749. __asm__ volatile(
  1750. "1: \n\t"
  1751. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  1752. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  1753. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  1754. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  1755. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  1756. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1757. "dsll %[src0], %[src0], %[eight] \n\t"
  1758. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1759. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1760. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1761. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1762. "dsll %[src1], %[src1], %[eight] \n\t"
  1763. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1764. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1765. "psrlh %[src0], %[src0], %[two] \n\t"
  1766. "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
  1767. "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
  1768. "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
  1769. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  1770. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  1771. "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
  1772. "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
  1773. "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
  1774. "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
  1775. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1776. "dsll %[src0], %[src0], %[eight] \n\t"
  1777. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1778. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1779. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1780. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1781. "dsll %[src1], %[src1], %[eight] \n\t"
  1782. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1783. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1784. "psrlh %[src0], %[src0], %[two] \n\t"
  1785. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  1786. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1787. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  1788. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1789. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1790. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  1791. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  1792. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  1793. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  1794. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  1795. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  1796. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  1797. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  1798. "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
  1799. "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
  1800. "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
  1801. "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
  1802. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1803. "dsll %[src0], %[src0], %[eight] \n\t"
  1804. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1805. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1806. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1807. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1808. "dsll %[src1], %[src1], %[eight] \n\t"
  1809. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1810. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1811. "psrlh %[src0], %[src0], %[two] \n\t"
  1812. "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
  1813. "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
  1814. "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
  1815. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  1816. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  1817. "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
  1818. "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
  1819. "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
  1820. "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
  1821. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1822. "dsll %[src0], %[src0], %[eight] \n\t"
  1823. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1824. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1825. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1826. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1827. "dsll %[src1], %[src1], %[eight] \n\t"
  1828. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1829. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1830. "psrlh %[src0], %[src0], %[two] \n\t"
  1831. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  1832. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1833. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  1834. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1835. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1836. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  1837. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  1838. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  1839. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  1840. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  1841. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  1842. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  1843. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  1844. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  1845. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  1846. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  1847. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  1848. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1849. "dsll %[src0], %[src0], %[eight] \n\t"
  1850. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1851. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1852. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1853. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1854. "dsll %[src1], %[src1], %[eight] \n\t"
  1855. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1856. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1857. "psrlh %[src0], %[src0], %[two] \n\t"
  1858. "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
  1859. "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
  1860. "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
  1861. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  1862. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  1863. "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
  1864. "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
  1865. "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
  1866. "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
  1867. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1868. "dsll %[src0], %[src0], %[eight] \n\t"
  1869. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1870. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1871. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1872. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1873. "dsll %[src1], %[src1], %[eight] \n\t"
  1874. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1875. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1876. "psrlh %[src0], %[src0], %[two] \n\t"
  1877. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  1878. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1879. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  1880. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1881. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1882. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  1883. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  1884. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  1885. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  1886. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  1887. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  1888. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  1889. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  1890. "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
  1891. "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
  1892. "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
  1893. "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
  1894. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1895. "dsll %[src0], %[src0], %[eight] \n\t"
  1896. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1897. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1898. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1899. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1900. "dsll %[src1], %[src1], %[eight] \n\t"
  1901. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1902. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1903. "psrlh %[src0], %[src0], %[two] \n\t"
  1904. "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
  1905. "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
  1906. "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
  1907. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  1908. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  1909. "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
  1910. "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
  1911. "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
  1912. "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
  1913. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  1914. "dsll %[src0], %[src0], %[eight] \n\t"
  1915. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  1916. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  1917. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  1918. "paddh %[src0], %[src0], %[src_lo] \n\t"
  1919. "dsll %[src1], %[src1], %[eight] \n\t"
  1920. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  1921. "paddh %[src0], %[src0], %[src_hi] \n\t"
  1922. "psrlh %[src0], %[src0], %[two] \n\t"
  1923. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  1924. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  1925. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  1926. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  1927. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  1928. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  1929. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  1930. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  1931. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  1932. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  1933. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  1934. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  1935. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  1936. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  1937. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  1938. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  1939. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  1940. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  1941. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  1942. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  1943. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  1944. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  1945. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  1946. "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
  1947. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  1948. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  1949. "daddi %[width], %[width], -0x10 \n\t"
  1950. "bgtz %[width], 1b \n\t"
  1951. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  1952. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  1953. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  1954. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  1955. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  1956. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  1957. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  1958. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  1959. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  1960. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  1961. [sixteen] "f"(0x10)
  1962. : "memory");
  1963. }
  // Converts a row of packed 3-byte pixels to 8-bit luma with Loongson MMI.
  //
  // Each loop iteration consumes 0x18 (24) source bytes, i.e. 8 pixels, and
  // stores 8 bytes to dst_y. For a pixel whose three bytes, taken from the low
  // end of the loaded doubleword upwards, are b0, b1, b2 the value written is
  //   (0x42 * b0 + 0x81 * b1 + 0x19 * b2 + 0x1080) >> 8
  // (the coefficients come from `mask`, the additive term from `value`).
  //
  // src_argb0: source row of 3-byte pixels (unaligned access is used).
  // dst_y:     destination row, one byte per pixel.
  // width:     pixel count. The loop subtracts 8 per pass and exits only when
  //            the counter is exactly zero (bnez), so it has to be a positive
  //            multiple of 8.
  //
  // The fourth 8-byte load of a pass covers offsets 0x12..0x19, i.e. it
  // touches 2 bytes beyond the 24 bytes consumed by that pass.
  //
  // NOTE(review): src_argb0, dst_y and width are listed as input-only "r"
  // operands but are updated by daddiu/daddi inside the template. GCC's
  // extended-asm rules say input-only operands must not be modified; "+r"
  // read-write operands look like the intended form - confirm before changing.
  1964. void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  1965. uint64_t src, src_hi, src_lo;
  1966. uint64_t dest0, dest1, dest2, dest3;
  1967. const uint64_t value = 0x1080;
  1968. const uint64_t mask = 0x0001001900810042;
  1969. __asm__ volatile(
  1970. "1: \n\t"
  // Pixels 0-1: unaligned 8-byte load at offset 0 (6 pixel bytes + 2 extra).
  1971. "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
  1972. "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
  // Widen the low four bytes to halfwords; halfword 3 (which belongs to the
  // next pixel) is overwritten with `value` so pmaddhw adds value * 0x0001.
  1973. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1974. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1975. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  // Shift left by 8 bits so the upper four bytes now hold original bytes
  // 3..6, i.e. the second pixel followed by one byte that is replaced below.
  1976. "dsll %[src], %[src], %[eight] \n\t"
  1977. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1978. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1979. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  // Regroup the two partial sums of each pixel, add them, then >> 8.
  1980. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1981. "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
  1982. "paddw %[dest0], %[dest0], %[src] \n\t"
  1983. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  // Pixels 2-3: same sequence on the 8 bytes starting at offset 6.
  1984. "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
  1985. "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
  1986. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  1987. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  1988. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  1989. "dsll %[src], %[src], %[eight] \n\t"
  1990. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  1991. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  1992. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  1993. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  1994. "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
  1995. "paddw %[dest1], %[dest1], %[src] \n\t"
  1996. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  // Pixels 4-5: same sequence on the 8 bytes starting at offset 12.
  1997. "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
  1998. "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
  1999. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  2000. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  2001. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2002. "dsll %[src], %[src], %[eight] \n\t"
  2003. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  2004. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  2005. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2006. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  2007. "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
  2008. "paddw %[dest2], %[dest2], %[src] \n\t"
  2009. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  // Pixels 6-7: same sequence on the 8 bytes starting at offset 18; this
  // load reaches offset 0x19, two bytes past the data consumed this pass.
  2010. "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
  2011. "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
  2012. "punpcklbh %[src_lo], %[src], %[zero] \n\t"
  2013. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  2014. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2015. "dsll %[src], %[src], %[eight] \n\t"
  2016. "punpckhbh %[src_hi], %[src], %[zero] \n\t"
  2017. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  2018. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2019. "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
  2020. "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
  2021. "paddw %[dest3], %[dest3], %[src] \n\t"
  2022. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  // Narrow the eight 32-bit results to bytes and store them (unaligned).
  2023. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  2024. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  2025. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  2026. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  2027. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  // Advance: 24 source bytes, 8 destination bytes, 8 pixels.
  2028. "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
  2029. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  2030. "daddi %[width], %[width], -0x08 \n\t"
  2031. "bnez %[width], 1b \n\t"
  // Outputs are early-clobber FP scratch registers; everything the template
  // reads (including the pointers it also writes) is in the input list.
  2032. : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  2033. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  2034. [dest3] "=&f"(dest3)
  2035. : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
  2036. [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
  2037. [zero] "f"(0x00)
  2038. : "memory");
  2039. }
  // Produces subsampled U and V rows from two adjacent rows of packed 3-byte
  // pixels with Loongson MMI.
  //
  // Each loop iteration consumes 0x30 (48) bytes, i.e. 16 pixels, from both
  // src_rgb0 and src_rgb0 + src_stride_rgb, and stores 8 bytes to dst_u and
  // 8 bytes to dst_v. For every 2x2 pixel block the three byte channels are
  // summed over the four pixels and shifted right by 2, giving c0, c1, c2
  // (c0 = lowest byte of a pixel in the loaded doubleword). Then
  //   U = ((0x70 * c2 + 2 * 0x4040) - (0x26 * c0 + 0x4a * c1)) >> 8
  //   V = ((0x70 * c0 + 2 * 0x4040) - (0x5e * c1 + 0x12 * c2)) >> 8
  // with an arithmetic shift, followed by saturating packs down to bytes.
  //
  // src_rgb0:       first source row.
  // src_stride_rgb: byte offset added to src_rgb0 to reach the second row.
  // dst_u, dst_v:   destination rows, one byte per two source pixels.
  // width:          source pixel count; the loop subtracts 16 per pass and
  //                 repeats while the counter is > 0 (bgtz).
  //
  // The last 8-byte load of a pass covers offsets 0x2a..0x31, i.e. it touches
  // 2 bytes beyond the 48 bytes consumed, on both rows.
  //
  // NOTE(review): src_rgb0, dst_u, dst_v and width are listed as input-only
  // "r" operands but are updated by daddiu/daddi inside the template. GCC's
  // extended-asm rules say input-only operands must not be modified; "+r"
  // read-write operands look like the intended form - confirm before changing.
  2040. void RAWToUVRow_MMI(const uint8_t* src_rgb0,
  2041. int src_stride_rgb,
  2042. uint8_t* dst_u,
  2043. uint8_t* dst_v,
  2044. int width) {
  2045. uint64_t src_rgb1;
  2046. uint64_t ftmp[12];
  2047. const uint64_t value = 0x4040;
  2048. const uint64_t mask_u = 0x00020070004a0026;
  2049. const uint64_t mask_v = 0x0012005e00700002;
  2050. __asm__ volatile(
  2051. "1: \n\t"
  // Second-row pointer is recomputed from the (advancing) first-row pointer.
  2052. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  // 2x2 block 0 (pixels 0-1 of both rows): load 8 bytes per row at offset 0.
  2053. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  2054. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  2055. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  2056. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  // Widen first pixel (low 4 bytes) and second pixel (bytes 3..6, obtained
  // via the 8-bit left shift) of each row and accumulate all four pixels.
  2057. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2058. "dsll %[src0], %[src0], %[eight] \n\t"
  2059. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2060. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2061. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2062. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2063. "dsll %[src1], %[src1], %[eight] \n\t"
  2064. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2065. "paddh %[src0], %[src0], %[src_hi] \n\t"
  // Sum of four pixels >> 2 = per-channel average (truncating).
  2066. "psrlh %[src0], %[src0], %[two] \n\t"
  // U operand keeps c0,c1,c2 in halfwords 0..2 and puts `value` in halfword
  // 3; V operand shifts the channels up one halfword and puts `value` in
  // halfword 0, matching the coefficient layout of mask_u / mask_v.
  2067. "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
  2068. "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
  2069. "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
  2070. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  2071. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  // 2x2 block 1 (pixels 2-3): same sequence at offset 6, results kept in
  // src_lo (U) and src_hi (V).
  2072. "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
  2073. "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
  2074. "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
  2075. "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
  2076. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2077. "dsll %[src0], %[src0], %[eight] \n\t"
  2078. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2079. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2080. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2081. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2082. "dsll %[src1], %[src1], %[eight] \n\t"
  2083. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2084. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2085. "psrlh %[src0], %[src0], %[two] \n\t"
  2086. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  2087. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  2088. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  2089. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2090. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 0 and 1: U = high words - low words, V = low words -
  // high words, then arithmetic >> 8.
  2091. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  2092. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  2093. "psubw %[dest0_u], %[src1], %[src0] \n\t"
  2094. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  2095. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  2096. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  2097. "psubw %[dest0_v], %[src0], %[src1] \n\t"
  2098. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  // 2x2 block 2 (pixels 4-5): offset 0x0c.
  2099. "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
  2100. "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
  2101. "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
  2102. "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
  2103. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2104. "dsll %[src0], %[src0], %[eight] \n\t"
  2105. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2106. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2107. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2108. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2109. "dsll %[src1], %[src1], %[eight] \n\t"
  2110. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2111. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2112. "psrlh %[src0], %[src0], %[two] \n\t"
  2113. "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
  2114. "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
  2115. "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
  2116. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  2117. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  // 2x2 block 3 (pixels 6-7): offset 0x12.
  2118. "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
  2119. "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
  2120. "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
  2121. "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
  2122. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2123. "dsll %[src0], %[src0], %[eight] \n\t"
  2124. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2125. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2126. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2127. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2128. "dsll %[src1], %[src1], %[eight] \n\t"
  2129. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2130. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2131. "psrlh %[src0], %[src0], %[two] \n\t"
  2132. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  2133. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  2134. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  2135. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2136. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 2 and 3 into dest1_u / dest1_v.
  2137. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  2138. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  2139. "psubw %[dest1_u], %[src1], %[src0] \n\t"
  2140. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  2141. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  2142. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  2143. "psubw %[dest1_v], %[src0], %[src1] \n\t"
  2144. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  // 2x2 block 4 (pixels 8-9): offset 0x18.
  2145. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  2146. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  2147. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  2148. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  2149. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2150. "dsll %[src0], %[src0], %[eight] \n\t"
  2151. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2152. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2153. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2154. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2155. "dsll %[src1], %[src1], %[eight] \n\t"
  2156. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2157. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2158. "psrlh %[src0], %[src0], %[two] \n\t"
  2159. "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
  2160. "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
  2161. "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
  2162. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  2163. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  // 2x2 block 5 (pixels 10-11): offset 0x1e.
  2164. "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
  2165. "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
  2166. "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
  2167. "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
  2168. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2169. "dsll %[src0], %[src0], %[eight] \n\t"
  2170. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2171. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2172. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2173. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2174. "dsll %[src1], %[src1], %[eight] \n\t"
  2175. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2176. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2177. "psrlh %[src0], %[src0], %[two] \n\t"
  2178. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  2179. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  2180. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  2181. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2182. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 4 and 5 into dest2_u / dest2_v.
  2183. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  2184. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  2185. "psubw %[dest2_u], %[src1], %[src0] \n\t"
  2186. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  2187. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  2188. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  2189. "psubw %[dest2_v], %[src0], %[src1] \n\t"
  2190. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  // 2x2 block 6 (pixels 12-13): offset 0x24.
  2191. "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
  2192. "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
  2193. "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
  2194. "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
  2195. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2196. "dsll %[src0], %[src0], %[eight] \n\t"
  2197. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2198. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2199. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2200. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2201. "dsll %[src1], %[src1], %[eight] \n\t"
  2202. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2203. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2204. "psrlh %[src0], %[src0], %[two] \n\t"
  2205. "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
  2206. "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
  2207. "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
  2208. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  2209. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  // 2x2 block 7 (pixels 14-15): offset 0x2a; this load reaches offset 0x31,
  // two bytes past the 48 bytes consumed this pass.
  2210. "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
  2211. "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
  2212. "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
  2213. "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
  2214. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2215. "dsll %[src0], %[src0], %[eight] \n\t"
  2216. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2217. "paddh %[src0], %[src_lo], %[src_hi] \n\t"
  2218. "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
  2219. "paddh %[src0], %[src0], %[src_lo] \n\t"
  2220. "dsll %[src1], %[src1], %[eight] \n\t"
  2221. "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
  2222. "paddh %[src0], %[src0], %[src_hi] \n\t"
  2223. "psrlh %[src0], %[src0], %[two] \n\t"
  2224. "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
  2225. "dsll %[src_hi], %[src0], %[sixteen] \n\t"
  2226. "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
  2227. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2228. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 6 and 7 into dest3_u / dest3_v.
  2229. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  2230. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  2231. "psubw %[dest3_u], %[src1], %[src0] \n\t"
  2232. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  2233. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  2234. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  2235. "psubw %[dest3_v], %[src0], %[src1] \n\t"
  2236. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  // Narrow the eight U results to bytes and store them (unaligned).
  2237. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  2238. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  2239. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  2240. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  2241. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  // Narrow the eight V results to bytes and store them (unaligned).
  2242. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  2243. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  2244. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  2245. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  2246. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  // Advance: 48 source bytes, 8 bytes per chroma plane, 16 pixels.
  2247. "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
  2248. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  2249. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  2250. "daddi %[width], %[width], -0x10 \n\t"
  2251. "bgtz %[width], 1b \n\t"
  // Outputs: second-row pointer plus twelve early-clobber FP scratch regs.
  2252. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  2253. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  2254. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  2255. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  2256. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  2257. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  2258. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  2259. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  2260. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  2261. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  2262. [sixteen] "f"(0x10)
  2263. : "memory");
  2264. }
  // Converts a row of 4-byte pixels to 8-bit luma with Loongson MMI.
  //
  // Each loop iteration consumes 0x20 (32) source bytes, i.e. 8 pixels, and
  // stores 8 bytes to dst_y. For a pixel whose bytes, taken from the low end
  // of the loaded doubleword upwards, are b0, b1, b2, b3 the value written is
  //   (0x0F * b0 + 0x4B * b1 + 0x26 * b2 + 0x0040) >> 7
  // b3 is discarded: its halfword is overwritten with `value` before the
  // multiply-add, and `value` is multiplied by the 0x0001 coefficient.
  //
  // src_argb0: source row (bound to the %[src_ptr] operand).
  // dst_y:     destination row, one byte per pixel (%[dst_ptr]).
  // width:     pixel count. The loop subtracts 8 per pass and exits only when
  //            the counter is exactly zero (bnez), so it has to be a positive
  //            multiple of 8.
  //
  // NOTE(review): src_argb0, dst_y and width are listed as input-only "r"
  // operands but are updated by daddiu/daddi inside the template. GCC's
  // extended-asm rules say input-only operands must not be modified; "+r"
  // read-write operands look like the intended form - confirm before changing.
  2265. void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  2266. uint64_t src, src_hi, src_lo;
  2267. uint64_t dest, dest0, dest1, dest2, dest3;
  2268. uint64_t tmp0, tmp1;
  2269. const uint64_t shift = 0x07;
  2270. const uint64_t value = 0x0040;
  2271. const uint64_t mask0 = 0x0;
  2272. const uint64_t mask1 = 0x00010026004B000FULL;
  2273. __asm__ volatile(
  2274. "1: \n\t"
  // Pixels 0-1: unaligned 8-byte load at offset 0.
  2275. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  2276. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  // Widen each pixel to four halfwords (mask0 is all zero), replace the top
  // halfword with `value`, then multiply-add against the coefficients.
  2277. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  2278. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  2279. "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
  2280. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  2281. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  2282. "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
  // Regroup the two partial sums of each pixel, add them, then >> 7.
  2283. "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
  2284. "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
  2285. "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
  2286. "psrlw %[dest0], %[dest0], %[shift] \n\t"
  // Pixels 2-3: offset 8.
  2287. "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
  2288. "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
  2289. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  2290. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  2291. "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
  2292. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  2293. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  2294. "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
  2295. "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
  2296. "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
  2297. "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
  2298. "psrlw %[dest1], %[dest1], %[shift] \n\t"
  // Pixels 4-5: offset 16.
  2299. "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
  2300. "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
  2301. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  2302. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  2303. "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
  2304. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  2305. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  2306. "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
  2307. "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
  2308. "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
  2309. "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
  2310. "psrlw %[dest2], %[dest2], %[shift] \n\t"
  // Pixels 6-7: offset 24.
  2311. "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
  2312. "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
  2313. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  2314. "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
  2315. "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
  2316. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  2317. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  2318. "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
  2319. "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
  2320. "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
  2321. "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
  2322. "psrlw %[dest3], %[dest3], %[shift] \n\t"
  // Narrow the eight 32-bit results to bytes and store them (unaligned).
  2323. "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
  2324. "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
  2325. "packushb %[dest], %[tmp0], %[tmp1] \n\t"
  2326. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  2327. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  // Advance: 32 source bytes, 8 destination bytes, 8 pixels.
  2328. "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
  2329. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  2330. "daddi %[width], %[width], -0x08 \n\t"
  2331. "bnez %[width], 1b \n\t"
  // Outputs are early-clobber FP scratch registers only.
  2332. : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
  2333. [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
  2334. [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
  2335. [tmp1] "=&f"(tmp1)
  2336. : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
  2337. [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
  2338. [width] "r"(width)
  2339. : "memory");
  2340. }
  // Produces subsampled U and V rows from two adjacent rows of 4-byte pixels
  // with Loongson MMI.
  //
  // Each loop iteration consumes 0x40 (64) bytes, i.e. 16 pixels, from both
  // src_rgb0 and src_rgb0 + src_stride_rgb, and stores 8 bytes to dst_u and
  // 8 bytes to dst_v. For every 2x2 pixel block the channels are averaged
  // with pavgh (vertically first, then horizontally), giving c0, c1, c2
  // (c0 = lowest byte of a pixel in the loaded doubleword; the fourth byte is
  // shifted out / overwritten and never contributes). Then
  //   U = ((0x7f * c0 + 2 * 0x4040) - (0x54 * c1 + 0x2b * c2)) >> 8
  //   V = ((0x7f * c2 + 2 * 0x4040) - (0x14 * c0 + 0x6b * c1)) >> 8
  // with an arithmetic shift, followed by saturating packs down to bytes.
  //
  // src_rgb0:       first source row.
  // src_stride_rgb: byte offset added to src_rgb0 to reach the second row.
  // dst_u, dst_v:   destination rows, one byte per two source pixels.
  // width:          source pixel count; the loop subtracts 16 per pass and
  //                 repeats while the counter is > 0 (bgtz).
  //
  // NOTE(review): the [two] input operand is declared in the constraint list
  // but %[two] is never referenced in this template.
  // NOTE(review): src_rgb0, dst_u, dst_v and width are listed as input-only
  // "r" operands but are updated by daddiu/daddi inside the template. GCC's
  // extended-asm rules say input-only operands must not be modified; "+r"
  // read-write operands look like the intended form - confirm before changing.
  2341. void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
  2342. int src_stride_rgb,
  2343. uint8_t* dst_u,
  2344. uint8_t* dst_v,
  2345. int width) {
  2346. uint64_t src_rgb1;
  2347. uint64_t ftmp[12];
  2348. const uint64_t value = 0x4040;
  2349. const uint64_t mask_u = 0x002b0054007f0002;
  2350. const uint64_t mask_v = 0x0002007f006b0014;
  2351. __asm__ volatile(
  2352. "1: \n\t"
  // Second-row pointer is recomputed from the (advancing) first-row pointer.
  2353. "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
  // 2x2 block 0 (pixels 0-1 of both rows): load 8 bytes per row at offset 0.
  2354. "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
  2355. "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
  2356. "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
  2357. "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
  // Widen both pixels of both rows to halfwords, average row 0 with row 1
  // per pixel, then average the two pixels together.
  2358. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2359. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2360. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2361. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2362. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2363. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2364. "pavgh %[src0], %[src0], %[src1] \n\t"
  // U operand: channels shifted up one halfword with `value` in halfword 0.
  // V operand: channels in place with `value` in halfword 3. These layouts
  // match the coefficient positions in mask_u / mask_v.
  2365. "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
  2366. "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
  2367. "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
  2368. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  2369. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  // 2x2 block 1 (pixels 2-3): same sequence at offset 8, results kept in
  // src_lo (U) and src_hi (V).
  2370. "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
  2371. "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
  2372. "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
  2373. "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
  2374. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2375. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2376. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2377. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2378. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2379. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2380. "pavgh %[src0], %[src0], %[src1] \n\t"
  2381. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  2382. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  2383. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  2384. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2385. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 0 and 1: U = low words - high words, V = high words -
  // low words, then arithmetic >> 8.
  2386. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  2387. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  2388. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  2389. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  2390. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  2391. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  2392. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  2393. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  // 2x2 block 2 (pixels 4-5): offset 0x10.
  2394. "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
  2395. "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
  2396. "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
  2397. "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
  2398. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2399. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2400. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2401. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2402. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2403. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2404. "pavgh %[src0], %[src0], %[src1] \n\t"
  2405. "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
  2406. "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
  2407. "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
  2408. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  2409. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  // 2x2 block 3 (pixels 6-7): offset 0x18.
  2410. "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
  2411. "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
  2412. "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
  2413. "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
  2414. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2415. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2416. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2417. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2418. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2419. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2420. "pavgh %[src0], %[src0], %[src1] \n\t"
  2421. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  2422. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  2423. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  2424. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2425. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 2 and 3 into dest1_u / dest1_v.
  2426. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  2427. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  2428. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  2429. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  2430. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  2431. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  2432. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  2433. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  // 2x2 block 4 (pixels 8-9): offset 0x20.
  2434. "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
  2435. "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
  2436. "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
  2437. "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
  2438. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2439. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2440. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2441. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2442. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2443. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2444. "pavgh %[src0], %[src0], %[src1] \n\t"
  2445. "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
  2446. "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
  2447. "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
  2448. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  2449. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  // 2x2 block 5 (pixels 10-11): offset 0x28.
  2450. "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
  2451. "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
  2452. "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
  2453. "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
  2454. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2455. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2456. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2457. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2458. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2459. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2460. "pavgh %[src0], %[src0], %[src1] \n\t"
  2461. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  2462. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  2463. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  2464. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2465. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 4 and 5 into dest2_u / dest2_v.
  2466. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  2467. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  2468. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  2469. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  2470. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  2471. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  2472. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  2473. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  // 2x2 block 6 (pixels 12-13): offset 0x30.
  2474. "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
  2475. "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
  2476. "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
  2477. "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
  2478. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2479. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2480. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2481. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2482. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2483. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2484. "pavgh %[src0], %[src0], %[src1] \n\t"
  2485. "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
  2486. "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
  2487. "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
  2488. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  2489. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  // 2x2 block 7 (pixels 14-15): offset 0x38.
  2490. "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
  2491. "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
  2492. "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
  2493. "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
  2494. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  2495. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  2496. "punpcklbh %[src0], %[src1], %[zero] \n\t"
  2497. "punpckhbh %[src1], %[src1], %[zero] \n\t"
  2498. "pavgh %[src0], %[src_lo], %[src0] \n\t"
  2499. "pavgh %[src1], %[src_hi], %[src1] \n\t"
  2500. "pavgh %[src0], %[src0], %[src1] \n\t"
  2501. "dsll %[src_lo], %[src0], %[sixteen] \n\t"
  2502. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  2503. "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
  2504. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  2505. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // Combine blocks 6 and 7 into dest3_u / dest3_v.
  2506. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  2507. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  2508. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  2509. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  2510. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  2511. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  2512. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  2513. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  // Narrow the eight U results to bytes and store them (unaligned).
  2514. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  2515. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  2516. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  2517. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  2518. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  // Narrow the eight V results to bytes and store them (unaligned).
  2519. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  2520. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  2521. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  2522. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  2523. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  // Advance: 64 source bytes, 8 bytes per chroma plane, 16 pixels.
  2524. "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
  2525. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  2526. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  2527. "daddi %[width], %[width], -0x10 \n\t"
  2528. "bgtz %[width], 1b \n\t"
  // Outputs: second-row pointer plus twelve early-clobber FP scratch regs.
  2529. : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
  2530. [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
  2531. [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
  2532. [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
  2533. [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
  2534. [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
  2535. : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
  2536. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  2537. [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
  2538. [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
  2539. [sixteen] "f"(0x10)
  2540. : "memory");
  2541. }
  2542. void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  2543. uint64_t ftmp[11];
  2544. const uint64_t value = 0x1080108010801080;
  2545. const uint64_t mask = 0x0001004200810019;
  2546. uint64_t c0 = 0x001f001f001f001f;
  2547. uint64_t c1 = 0x00ff00ff00ff00ff;
  2548. uint64_t c2 = 0x0007000700070007;
  2549. __asm__ volatile(
  2550. "1: \n\t"
  2551. "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
  2552. "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
  2553. "psrlh %[src1], %[src0], %[eight] \n\t"
  2554. "and %[b], %[src0], %[c0] \n\t"
  2555. "and %[src0], %[src0], %[c1] \n\t"
  2556. "psrlh %[src0], %[src0], %[five] \n\t"
  2557. "and %[g], %[src1], %[c2] \n\t"
  2558. "psllh %[g], %[g], %[three] \n\t"
  2559. "or %[g], %[src0], %[g] \n\t"
  2560. "psrlh %[r], %[src1], %[three] \n\t"
  2561. "psllh %[src0], %[b], %[three] \n\t"
  2562. "psrlh %[src1], %[b], %[two] \n\t"
  2563. "or %[b], %[src0], %[src1] \n\t"
  2564. "psllh %[src0], %[g], %[two] \n\t"
  2565. "psrlh %[src1], %[g], %[four] \n\t"
  2566. "or %[g], %[src0], %[src1] \n\t"
  2567. "psllh %[src0], %[r], %[three] \n\t"
  2568. "psrlh %[src1], %[r], %[two] \n\t"
  2569. "or %[r], %[src0], %[src1] \n\t"
  2570. "punpcklhw %[src0], %[b], %[r] \n\t"
  2571. "punpcklhw %[src1], %[g], %[value] \n\t"
  2572. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2573. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2574. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2575. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2576. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2577. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2578. "paddw %[dest0], %[src0], %[src1] \n\t"
  2579. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  2580. "punpckhhw %[src0], %[b], %[r] \n\t"
  2581. "punpckhhw %[src1], %[g], %[value] \n\t"
  2582. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2583. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2584. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2585. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2586. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2587. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2588. "paddw %[dest1], %[src0], %[src1] \n\t"
  2589. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  2590. "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
  2591. "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
  2592. "psrlh %[src1], %[src0], %[eight] \n\t"
  2593. "and %[b], %[src0], %[c0] \n\t"
  2594. "and %[src0], %[src0], %[c1] \n\t"
  2595. "psrlh %[src0], %[src0], %[five] \n\t"
  2596. "and %[g], %[src1], %[c2] \n\t"
  2597. "psllh %[g], %[g], %[three] \n\t"
  2598. "or %[g], %[src0], %[g] \n\t"
  2599. "psrlh %[r], %[src1], %[three] \n\t"
  2600. "psllh %[src0], %[b], %[three] \n\t"
  2601. "psrlh %[src1], %[b], %[two] \n\t"
  2602. "or %[b], %[src0], %[src1] \n\t"
  2603. "psllh %[src0], %[g], %[two] \n\t"
  2604. "psrlh %[src1], %[g], %[four] \n\t"
  2605. "or %[g], %[src0], %[src1] \n\t"
  2606. "psllh %[src0], %[r], %[three] \n\t"
  2607. "psrlh %[src1], %[r], %[two] \n\t"
  2608. "or %[r], %[src0], %[src1] \n\t"
  2609. "punpcklhw %[src0], %[b], %[r] \n\t"
  2610. "punpcklhw %[src1], %[g], %[value] \n\t"
  2611. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2612. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2613. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2614. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2615. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2616. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2617. "paddw %[dest2], %[src0], %[src1] \n\t"
  2618. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  2619. "punpckhhw %[src0], %[b], %[r] \n\t"
  2620. "punpckhhw %[src1], %[g], %[value] \n\t"
  2621. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2622. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2623. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2624. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2625. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2626. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2627. "paddw %[dest3], %[src0], %[src1] \n\t"
  2628. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  2629. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  2630. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  2631. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  2632. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  2633. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  2634. "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
  2635. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  2636. "daddiu %[width], %[width], -0x08 \n\t"
  2637. "bgtz %[width], 1b \n\t"
  2638. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
  2639. [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
  2640. [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
  2641. [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
  2642. : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
  2643. [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
  2644. [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
  2645. [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
  2646. : "memory");
  2647. }
  2648. void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
  2649. uint8_t* dst_y,
  2650. int width) {
  2651. uint64_t ftmp[11];
  2652. const uint64_t value = 0x1080108010801080;
  2653. const uint64_t mask = 0x0001004200810019;
  2654. uint64_t c0 = 0x001f001f001f001f;
  2655. uint64_t c1 = 0x00ff00ff00ff00ff;
  2656. uint64_t c2 = 0x0003000300030003;
  2657. uint64_t c3 = 0x007c007c007c007c;
  2658. __asm__ volatile(
  2659. "1: \n\t"
  2660. "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
  2661. "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
  2662. "psrlh %[src1], %[src0], %[eight] \n\t"
  2663. "and %[b], %[src0], %[c0] \n\t"
  2664. "and %[src0], %[src0], %[c1] \n\t"
  2665. "psrlh %[src0], %[src0], %[five] \n\t"
  2666. "and %[g], %[src1], %[c2] \n\t"
  2667. "psllh %[g], %[g], %[three] \n\t"
  2668. "or %[g], %[src0], %[g] \n\t"
  2669. "and %[r], %[src1], %[c3] \n\t"
  2670. "psrlh %[r], %[r], %[two] \n\t"
  2671. "psllh %[src0], %[b], %[three] \n\t"
  2672. "psrlh %[src1], %[b], %[two] \n\t"
  2673. "or %[b], %[src0], %[src1] \n\t"
  2674. "psllh %[src0], %[g], %[three] \n\t"
  2675. "psrlh %[src1], %[g], %[two] \n\t"
  2676. "or %[g], %[src0], %[src1] \n\t"
  2677. "psllh %[src0], %[r], %[three] \n\t"
  2678. "psrlh %[src1], %[r], %[two] \n\t"
  2679. "or %[r], %[src0], %[src1] \n\t"
  2680. "punpcklhw %[src0], %[b], %[r] \n\t"
  2681. "punpcklhw %[src1], %[g], %[value] \n\t"
  2682. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2683. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2684. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2685. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2686. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2687. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2688. "paddw %[dest0], %[src0], %[src1] \n\t"
  2689. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  2690. "punpckhhw %[src0], %[b], %[r] \n\t"
  2691. "punpckhhw %[src1], %[g], %[value] \n\t"
  2692. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2693. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2694. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2695. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2696. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2697. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2698. "paddw %[dest1], %[src0], %[src1] \n\t"
  2699. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  2700. "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
  2701. "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
  2702. "psrlh %[src1], %[src0], %[eight] \n\t"
  2703. "and %[b], %[src0], %[c0] \n\t"
  2704. "and %[src0], %[src0], %[c1] \n\t"
  2705. "psrlh %[src0], %[src0], %[five] \n\t"
  2706. "and %[g], %[src1], %[c2] \n\t"
  2707. "psllh %[g], %[g], %[three] \n\t"
  2708. "or %[g], %[src0], %[g] \n\t"
  2709. "and %[r], %[src1], %[c3] \n\t"
  2710. "psrlh %[r], %[r], %[two] \n\t"
  2711. "psllh %[src0], %[b], %[three] \n\t"
  2712. "psrlh %[src1], %[b], %[two] \n\t"
  2713. "or %[b], %[src0], %[src1] \n\t"
  2714. "psllh %[src0], %[g], %[three] \n\t"
  2715. "psrlh %[src1], %[g], %[two] \n\t"
  2716. "or %[g], %[src0], %[src1] \n\t"
  2717. "psllh %[src0], %[r], %[three] \n\t"
  2718. "psrlh %[src1], %[r], %[two] \n\t"
  2719. "or %[r], %[src0], %[src1] \n\t"
  2720. "punpcklhw %[src0], %[b], %[r] \n\t"
  2721. "punpcklhw %[src1], %[g], %[value] \n\t"
  2722. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2723. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2724. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2725. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2726. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2727. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2728. "paddw %[dest2], %[src0], %[src1] \n\t"
  2729. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  2730. "punpckhhw %[src0], %[b], %[r] \n\t"
  2731. "punpckhhw %[src1], %[g], %[value] \n\t"
  2732. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2733. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2734. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2735. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2736. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2737. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2738. "paddw %[dest3], %[src0], %[src1] \n\t"
  2739. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  2740. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  2741. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  2742. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  2743. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  2744. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  2745. "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
  2746. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  2747. "daddiu %[width], %[width], -0x08 \n\t"
  2748. "bgtz %[width], 1b \n\t"
  2749. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
  2750. [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
  2751. [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
  2752. [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
  2753. : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
  2754. [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
  2755. [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
  2756. [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
  2757. : "memory");
  2758. }
  2759. void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
  2760. uint8_t* dst_y,
  2761. int width) {
  2762. uint64_t ftmp[11];
  2763. uint64_t value = 0x1080108010801080;
  2764. uint64_t mask = 0x0001004200810019;
  2765. uint64_t c0 = 0x000f000f000f000f;
  2766. uint64_t c1 = 0x00ff00ff00ff00ff;
  2767. __asm__ volatile(
  2768. "1: \n\t"
  2769. "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
  2770. "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
  2771. "psrlh %[src1], %[src0], %[eight] \n\t"
  2772. "and %[b], %[src0], %[c0] \n\t"
  2773. "and %[src0], %[src0], %[c1] \n\t"
  2774. "psrlh %[g], %[src0], %[four] \n\t"
  2775. "and %[r], %[src1], %[c0] \n\t"
  2776. "psllh %[src0], %[b], %[four] \n\t"
  2777. "or %[b], %[src0], %[b] \n\t"
  2778. "psllh %[src0], %[g], %[four] \n\t"
  2779. "or %[g], %[src0], %[g] \n\t"
  2780. "psllh %[src0], %[r], %[four] \n\t"
  2781. "or %[r], %[src0], %[r] \n\t"
  2782. "punpcklhw %[src0], %[b], %[r] \n\t"
  2783. "punpcklhw %[src1], %[g], %[value] \n\t"
  2784. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2785. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2786. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2787. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2788. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2789. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2790. "paddw %[dest0], %[src0], %[src1] \n\t"
  2791. "psrlw %[dest0], %[dest0], %[eight] \n\t"
  2792. "punpckhhw %[src0], %[b], %[r] \n\t"
  2793. "punpckhhw %[src1], %[g], %[value] \n\t"
  2794. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2795. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2796. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2797. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2798. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2799. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2800. "paddw %[dest1], %[src0], %[src1] \n\t"
  2801. "psrlw %[dest1], %[dest1], %[eight] \n\t"
  2802. "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
  2803. "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
  2804. "psrlh %[src1], %[src0], %[eight] \n\t"
  2805. "and %[b], %[src0], %[c0] \n\t"
  2806. "and %[src0], %[src0], %[c1] \n\t"
  2807. "psrlh %[g], %[src0], %[four] \n\t"
  2808. "and %[r], %[src1], %[c0] \n\t"
  2809. "psllh %[src0], %[b], %[four] \n\t"
  2810. "or %[b], %[src0], %[b] \n\t"
  2811. "psllh %[src0], %[g], %[four] \n\t"
  2812. "or %[g], %[src0], %[g] \n\t"
  2813. "psllh %[src0], %[r], %[four] \n\t"
  2814. "or %[r], %[src0], %[r] \n\t"
  2815. "punpcklhw %[src0], %[b], %[r] \n\t"
  2816. "punpcklhw %[src1], %[g], %[value] \n\t"
  2817. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2818. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2819. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2820. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2821. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2822. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2823. "paddw %[dest2], %[src0], %[src1] \n\t"
  2824. "psrlw %[dest2], %[dest2], %[eight] \n\t"
  2825. "punpckhhw %[src0], %[b], %[r] \n\t"
  2826. "punpckhhw %[src1], %[g], %[value] \n\t"
  2827. "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
  2828. "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
  2829. "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
  2830. "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
  2831. "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
  2832. "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
  2833. "paddw %[dest3], %[src0], %[src1] \n\t"
  2834. "psrlw %[dest3], %[dest3], %[eight] \n\t"
  2835. "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
  2836. "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
  2837. "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
  2838. "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
  2839. "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
  2840. "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
  2841. "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
  2842. "daddiu %[width], %[width], -0x08 \n\t"
  2843. "bgtz %[width], 1b \n\t"
  2844. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
  2845. [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
  2846. [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
  2847. [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
  2848. : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
  2849. [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
  2850. [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
  2851. : "memory");
  2852. }
  2853. void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
  2854. int src_stride_rgb565,
  2855. uint8_t* dst_u,
  2856. uint8_t* dst_v,
  2857. int width) {
  2858. uint64_t ftmp[13];
  2859. uint64_t value = 0x2020202020202020;
  2860. uint64_t mask_u = 0x0026004a00700002;
  2861. uint64_t mask_v = 0x00020070005e0012;
  2862. uint64_t mask = 0x93;
  2863. uint64_t c0 = 0x001f001f001f001f;
  2864. uint64_t c1 = 0x00ff00ff00ff00ff;
  2865. uint64_t c2 = 0x0007000700070007;
  2866. __asm__ volatile(
  2867. "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
  2868. "1: \n\t"
  2869. "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
  2870. "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
  2871. "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
  2872. "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
  2873. "psrlh %[dest0_u], %[src0], %[eight] \n\t"
  2874. "and %[b0], %[src0], %[c0] \n\t"
  2875. "and %[src0], %[src0], %[c1] \n\t"
  2876. "psrlh %[src0], %[src0], %[five] \n\t"
  2877. "and %[g0], %[dest0_u], %[c2] \n\t"
  2878. "psllh %[g0], %[g0], %[three] \n\t"
  2879. "or %[g0], %[src0], %[g0] \n\t"
  2880. "psrlh %[r0], %[dest0_u], %[three] \n\t"
  2881. "psrlh %[src0], %[src1], %[eight] \n\t"
  2882. "and %[dest0_u], %[src1], %[c0] \n\t"
  2883. "and %[src1], %[src1], %[c1] \n\t"
  2884. "psrlh %[src1], %[src1], %[five] \n\t"
  2885. "and %[dest0_v], %[src0], %[c2] \n\t"
  2886. "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
  2887. "or %[dest0_v], %[src1], %[dest0_v] \n\t"
  2888. "psrlh %[src0], %[src0], %[three] \n\t"
  2889. "paddh %[b0], %[b0], %[dest0_u] \n\t"
  2890. "paddh %[g0], %[g0], %[dest0_v] \n\t"
  2891. "paddh %[r0], %[r0], %[src0] \n\t"
  2892. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  2893. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  2894. "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
  2895. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  2896. "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
  2897. "psrlh %[b0], %[src0], %[six] \n\t"
  2898. "psllh %[r0], %[src0], %[one] \n\t"
  2899. "or %[b0], %[b0], %[r0] \n\t"
  2900. "punpcklhw %[src0], %[g0], %[value] \n\t"
  2901. "punpckhhw %[src1], %[g0], %[value] \n\t"
  2902. "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
  2903. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  2904. "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
  2905. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  2906. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  2907. "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
  2908. "pshufh %[dest0_u], %[src0], %[mask] \n\t"
  2909. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  2910. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  2911. "pshufh %[b0], %[src1], %[mask] \n\t"
  2912. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  2913. "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
  2914. "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
  2915. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  2916. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  2917. "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
  2918. "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
  2919. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  2920. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  2921. "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
  2922. "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
  2923. "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
  2924. "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
  2925. "psrlh %[dest1_u], %[src0], %[eight] \n\t"
  2926. "and %[b0], %[src0], %[c0] \n\t"
  2927. "and %[src0], %[src0], %[c1] \n\t"
  2928. "psrlh %[src0], %[src0], %[five] \n\t"
  2929. "and %[g0], %[dest1_u], %[c2] \n\t"
  2930. "psllh %[g0], %[g0], %[three] \n\t"
  2931. "or %[g0], %[src0], %[g0] \n\t"
  2932. "psrlh %[r0], %[dest1_u], %[three] \n\t"
  2933. "psrlh %[src0], %[src1], %[eight] \n\t"
  2934. "and %[dest1_u], %[src1], %[c0] \n\t"
  2935. "and %[src1], %[src1], %[c1] \n\t"
  2936. "psrlh %[src1], %[src1], %[five] \n\t"
  2937. "and %[dest1_v], %[src0], %[c2] \n\t"
  2938. "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
  2939. "or %[dest1_v], %[src1], %[dest1_v] \n\t"
  2940. "psrlh %[src0], %[src0], %[three] \n\t"
  2941. "paddh %[b0], %[b0], %[dest1_u] \n\t"
  2942. "paddh %[g0], %[g0], %[dest1_v] \n\t"
  2943. "paddh %[r0], %[r0], %[src0] \n\t"
  2944. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  2945. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  2946. "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
  2947. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  2948. "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
  2949. "psrlh %[b0], %[src0], %[six] \n\t"
  2950. "psllh %[r0], %[src0], %[one] \n\t"
  2951. "or %[b0], %[b0], %[r0] \n\t"
  2952. "punpcklhw %[src0], %[g0], %[value] \n\t"
  2953. "punpckhhw %[src1], %[g0], %[value] \n\t"
  2954. "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
  2955. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  2956. "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
  2957. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  2958. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  2959. "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
  2960. "pshufh %[dest1_u], %[src0], %[mask] \n\t"
  2961. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  2962. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  2963. "pshufh %[b0], %[src1], %[mask] \n\t"
  2964. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  2965. "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
  2966. "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
  2967. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  2968. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  2969. "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
  2970. "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
  2971. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  2972. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  2973. "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
  2974. "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
  2975. "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
  2976. "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
  2977. "psrlh %[dest2_u], %[src0], %[eight] \n\t"
  2978. "and %[b0], %[src0], %[c0] \n\t"
  2979. "and %[src0], %[src0], %[c1] \n\t"
  2980. "psrlh %[src0], %[src0], %[five] \n\t"
  2981. "and %[g0], %[dest2_u], %[c2] \n\t"
  2982. "psllh %[g0], %[g0], %[three] \n\t"
  2983. "or %[g0], %[src0], %[g0] \n\t"
  2984. "psrlh %[r0], %[dest2_u], %[three] \n\t"
  2985. "psrlh %[src0], %[src1], %[eight] \n\t"
  2986. "and %[dest2_u], %[src1], %[c0] \n\t"
  2987. "and %[src1], %[src1], %[c1] \n\t"
  2988. "psrlh %[src1], %[src1], %[five] \n\t"
  2989. "and %[dest2_v], %[src0], %[c2] \n\t"
  2990. "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
  2991. "or %[dest2_v], %[src1], %[dest2_v] \n\t"
  2992. "psrlh %[src0], %[src0], %[three] \n\t"
  2993. "paddh %[b0], %[b0], %[dest2_u] \n\t"
  2994. "paddh %[g0], %[g0], %[dest2_v] \n\t"
  2995. "paddh %[r0], %[r0], %[src0] \n\t"
  2996. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  2997. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  2998. "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
  2999. "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
  3000. "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
  3001. "psrlh %[b0], %[src0], %[six] \n\t"
  3002. "psllh %[r0], %[src0], %[one] \n\t"
  3003. "or %[b0], %[b0], %[r0] \n\t"
  3004. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3005. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3006. "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
  3007. "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
  3008. "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
  3009. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3010. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3011. "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
  3012. "pshufh %[dest2_u], %[src0], %[mask] \n\t"
  3013. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  3014. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3015. "pshufh %[b0], %[src1], %[mask] \n\t"
  3016. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3017. "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
  3018. "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
  3019. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  3020. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  3021. "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
  3022. "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
  3023. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  3024. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  3025. "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
  3026. "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
  3027. "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
  3028. "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
  3029. "psrlh %[dest3_u], %[src0], %[eight] \n\t"
  3030. "and %[b0], %[src0], %[c0] \n\t"
  3031. "and %[src0], %[src0], %[c1] \n\t"
  3032. "psrlh %[src0], %[src0], %[five] \n\t"
  3033. "and %[g0], %[dest3_u], %[c2] \n\t"
  3034. "psllh %[g0], %[g0], %[three] \n\t"
  3035. "or %[g0], %[src0], %[g0] \n\t"
  3036. "psrlh %[r0], %[dest3_u], %[three] \n\t"
  3037. "psrlh %[src0], %[src1], %[eight] \n\t"
  3038. "and %[dest3_u], %[src1], %[c0] \n\t"
  3039. "and %[src1], %[src1], %[c1] \n\t"
  3040. "psrlh %[src1], %[src1], %[five] \n\t"
  3041. "and %[dest3_v], %[src0], %[c2] \n\t"
  3042. "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
  3043. "or %[dest3_v], %[src1], %[dest3_v] \n\t"
  3044. "psrlh %[src0], %[src0], %[three] \n\t"
  3045. "paddh %[b0], %[b0], %[dest3_u] \n\t"
  3046. "paddh %[g0], %[g0], %[dest3_v] \n\t"
  3047. "paddh %[r0], %[r0], %[src0] \n\t"
  3048. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3049. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3050. "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
  3051. "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
  3052. "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
  3053. "psrlh %[b0], %[src0], %[six] \n\t"
  3054. "psllh %[r0], %[src0], %[one] \n\t"
  3055. "or %[b0], %[b0], %[r0] \n\t"
  3056. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3057. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3058. "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
  3059. "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
  3060. "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
  3061. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3062. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3063. "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
  3064. "pshufh %[dest3_u], %[src0], %[mask] \n\t"
  3065. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  3066. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3067. "pshufh %[b0], %[src1], %[mask] \n\t"
  3068. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3069. "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
  3070. "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
  3071. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  3072. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  3073. "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
  3074. "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
  3075. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  3076. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  3077. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  3078. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  3079. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  3080. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  3081. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  3082. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  3083. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  3084. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  3085. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  3086. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  3087. "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
  3088. "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
  3089. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  3090. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  3091. "daddiu %[width], %[width], -0x10 \n\t"
  3092. "bgtz %[width], 1b \n\t"
  3093. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
  3094. [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
  3095. [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
  3096. [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
  3097. [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
  3098. [dest3_v] "=&f"(ftmp[12])
  3099. : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
  3100. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  3101. [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
  3102. [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
  3103. [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
  3104. [one] "f"(0x01)
  3105. : "memory");
  3106. }
  3107. void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
  3108. int src_stride_argb1555,
  3109. uint8_t* dst_u,
  3110. uint8_t* dst_v,
  3111. int width) {
  3112. uint64_t ftmp[11];
  3113. uint64_t value = 0x2020202020202020;
  3114. uint64_t mask_u = 0x0026004a00700002;
  3115. uint64_t mask_v = 0x00020070005e0012;
  3116. uint64_t mask = 0x93;
  3117. uint64_t c0 = 0x001f001f001f001f;
  3118. uint64_t c1 = 0x00ff00ff00ff00ff;
  3119. uint64_t c2 = 0x0003000300030003;
  3120. uint64_t c3 = 0x007c007c007c007c;
  3121. __asm__ volatile(
  3122. "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
  3123. "1: \n\t"
  3124. "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
  3125. "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
  3126. "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
  3127. "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
  3128. "psrlh %[dest0_u], %[src0], %[eight] \n\t"
  3129. "and %[b0], %[src0], %[c0] \n\t"
  3130. "and %[src0], %[src0], %[c1] \n\t"
  3131. "psrlh %[src0], %[src0], %[five] \n\t"
  3132. "and %[g0], %[dest0_u], %[c2] \n\t"
  3133. "psllh %[g0], %[g0], %[three] \n\t"
  3134. "or %[g0], %[src0], %[g0] \n\t"
  3135. "and %[r0], %[dest0_u], %[c3] \n\t"
  3136. "psrlh %[r0], %[r0], %[two] \n\t"
  3137. "psrlh %[src0], %[src1], %[eight] \n\t"
  3138. "and %[dest0_u], %[src1], %[c0] \n\t"
  3139. "and %[src1], %[src1], %[c1] \n\t"
  3140. "psrlh %[src1], %[src1], %[five] \n\t"
  3141. "and %[dest0_v], %[src0], %[c2] \n\t"
  3142. "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
  3143. "or %[dest0_v], %[src1], %[dest0_v] \n\t"
  3144. "and %[src0], %[src0], %[c3] \n\t"
  3145. "psrlh %[src0], %[src0], %[two] \n\t"
  3146. "paddh %[b0], %[b0], %[dest0_u] \n\t"
  3147. "paddh %[g0], %[g0], %[dest0_v] \n\t"
  3148. "paddh %[r0], %[r0], %[src0] \n\t"
  3149. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3150. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3151. "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
  3152. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  3153. "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
  3154. "psrlh %[b0], %[src0], %[six] \n\t"
  3155. "psllh %[r0], %[src0], %[one] \n\t"
  3156. "or %[b0], %[b0], %[r0] \n\t"
  3157. "psrlh %[r0], %[g0], %[six] \n\t"
  3158. "psllh %[g0], %[g0], %[one] \n\t"
  3159. "or %[g0], %[g0], %[r0] \n\t"
  3160. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3161. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3162. "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
  3163. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  3164. "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
  3165. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3166. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3167. "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
  3168. "pshufh %[dest0_u], %[src0], %[mask] \n\t"
  3169. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  3170. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3171. "pshufh %[b0], %[src1], %[mask] \n\t"
  3172. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3173. "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
  3174. "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
  3175. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  3176. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  3177. "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
  3178. "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
  3179. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  3180. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  3181. "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
  3182. "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
  3183. "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
  3184. "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
  3185. "psrlh %[dest1_u], %[src0], %[eight] \n\t"
  3186. "and %[b0], %[src0], %[c0] \n\t"
  3187. "and %[src0], %[src0], %[c1] \n\t"
  3188. "psrlh %[src0], %[src0], %[five] \n\t"
  3189. "and %[g0], %[dest1_u], %[c2] \n\t"
  3190. "psllh %[g0], %[g0], %[three] \n\t"
  3191. "or %[g0], %[src0], %[g0] \n\t"
  3192. "and %[r0], %[dest1_u], %[c3] \n\t"
  3193. "psrlh %[r0], %[r0], %[two] \n\t"
  3194. "psrlh %[src0], %[src1], %[eight] \n\t"
  3195. "and %[dest1_u], %[src1], %[c0] \n\t"
  3196. "and %[src1], %[src1], %[c1] \n\t"
  3197. "psrlh %[src1], %[src1], %[five] \n\t"
  3198. "and %[dest1_v], %[src0], %[c2] \n\t"
  3199. "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
  3200. "or %[dest1_v], %[src1], %[dest1_v] \n\t"
  3201. "and %[src0], %[src0], %[c3] \n\t"
  3202. "psrlh %[src0], %[src0], %[two] \n\t"
  3203. "paddh %[b0], %[b0], %[dest1_u] \n\t"
  3204. "paddh %[g0], %[g0], %[dest1_v] \n\t"
  3205. "paddh %[r0], %[r0], %[src0] \n\t"
  3206. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3207. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3208. "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
  3209. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  3210. "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
  3211. "psrlh %[b0], %[src0], %[six] \n\t"
  3212. "psllh %[r0], %[src0], %[one] \n\t"
  3213. "or %[b0], %[b0], %[r0] \n\t"
  3214. "psrlh %[r0], %[g0], %[six] \n\t"
  3215. "psllh %[g0], %[g0], %[one] \n\t"
  3216. "or %[g0], %[g0], %[r0] \n\t"
  3217. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3218. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3219. "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
  3220. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  3221. "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
  3222. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3223. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3224. "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
  3225. "pshufh %[dest1_u], %[src0], %[mask] \n\t"
  3226. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  3227. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3228. "pshufh %[b0], %[src1], %[mask] \n\t"
  3229. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3230. "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
  3231. "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
  3232. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  3233. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  3234. "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
  3235. "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
  3236. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  3237. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  3238. "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
  3239. "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
  3240. "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
  3241. "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
  3242. "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
  3243. "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
  3244. "psrlh %[dest2_u], %[src0], %[eight] \n\t"
  3245. "and %[b0], %[src0], %[c0] \n\t"
  3246. "and %[src0], %[src0], %[c1] \n\t"
  3247. "psrlh %[src0], %[src0], %[five] \n\t"
  3248. "and %[g0], %[dest2_u], %[c2] \n\t"
  3249. "psllh %[g0], %[g0], %[three] \n\t"
  3250. "or %[g0], %[src0], %[g0] \n\t"
  3251. "and %[r0], %[dest2_u], %[c3] \n\t"
  3252. "psrlh %[r0], %[r0], %[two] \n\t"
  3253. "psrlh %[src0], %[src1], %[eight] \n\t"
  3254. "and %[dest2_u], %[src1], %[c0] \n\t"
  3255. "and %[src1], %[src1], %[c1] \n\t"
  3256. "psrlh %[src1], %[src1], %[five] \n\t"
  3257. "and %[dest0_v], %[src0], %[c2] \n\t"
  3258. "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
  3259. "or %[dest0_v], %[src1], %[dest0_v] \n\t"
  3260. "and %[src0], %[src0], %[c3] \n\t"
  3261. "psrlh %[src0], %[src0], %[two] \n\t"
  3262. "paddh %[b0], %[b0], %[dest2_u] \n\t"
  3263. "paddh %[g0], %[g0], %[dest0_v] \n\t"
  3264. "paddh %[r0], %[r0], %[src0] \n\t"
  3265. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3266. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3267. "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
  3268. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  3269. "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
  3270. "psrlh %[b0], %[src0], %[six] \n\t"
  3271. "psllh %[r0], %[src0], %[one] \n\t"
  3272. "or %[b0], %[b0], %[r0] \n\t"
  3273. "psrlh %[r0], %[g0], %[six] \n\t"
  3274. "psllh %[g0], %[g0], %[one] \n\t"
  3275. "or %[g0], %[g0], %[r0] \n\t"
  3276. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3277. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3278. "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
  3279. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  3280. "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
  3281. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3282. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3283. "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
  3284. "pshufh %[dest2_u], %[src0], %[mask] \n\t"
  3285. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  3286. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3287. "pshufh %[b0], %[src1], %[mask] \n\t"
  3288. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3289. "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
  3290. "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
  3291. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  3292. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  3293. "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
  3294. "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
  3295. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  3296. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  3297. "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
  3298. "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
  3299. "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
  3300. "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
  3301. "psrlh %[dest3_u], %[src0], %[eight] \n\t"
  3302. "and %[b0], %[src0], %[c0] \n\t"
  3303. "and %[src0], %[src0], %[c1] \n\t"
  3304. "psrlh %[src0], %[src0], %[five] \n\t"
  3305. "and %[g0], %[dest3_u], %[c2] \n\t"
  3306. "psllh %[g0], %[g0], %[three] \n\t"
  3307. "or %[g0], %[src0], %[g0] \n\t"
  3308. "and %[r0], %[dest3_u], %[c3] \n\t"
  3309. "psrlh %[r0], %[r0], %[two] \n\t"
  3310. "psrlh %[src0], %[src1], %[eight] \n\t"
  3311. "and %[dest3_u], %[src1], %[c0] \n\t"
  3312. "and %[src1], %[src1], %[c1] \n\t"
  3313. "psrlh %[src1], %[src1], %[five] \n\t"
  3314. "and %[dest1_v], %[src0], %[c2] \n\t"
  3315. "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
  3316. "or %[dest1_v], %[src1], %[dest1_v] \n\t"
  3317. "and %[src0], %[src0], %[c3] \n\t"
  3318. "psrlh %[src0], %[src0], %[two] \n\t"
  3319. "paddh %[b0], %[b0], %[dest3_u] \n\t"
  3320. "paddh %[g0], %[g0], %[dest1_v] \n\t"
  3321. "paddh %[r0], %[r0], %[src0] \n\t"
  3322. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3323. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3324. "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
  3325. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  3326. "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
  3327. "psrlh %[b0], %[src0], %[six] \n\t"
  3328. "psllh %[r0], %[src0], %[one] \n\t"
  3329. "or %[b0], %[b0], %[r0] \n\t"
  3330. "psrlh %[r0], %[g0], %[six] \n\t"
  3331. "psllh %[g0], %[g0], %[one] \n\t"
  3332. "or %[g0], %[g0], %[r0] \n\t"
  3333. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3334. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3335. "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
  3336. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  3337. "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
  3338. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3339. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3340. "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
  3341. "pshufh %[dest3_u], %[src0], %[mask] \n\t"
  3342. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  3343. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3344. "pshufh %[b0], %[src1], %[mask] \n\t"
  3345. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3346. "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
  3347. "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
  3348. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  3349. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  3350. "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
  3351. "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
  3352. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  3353. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  3354. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  3355. "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
  3356. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  3357. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  3358. "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
  3359. "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
  3360. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  3361. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  3362. "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
  3363. "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
  3364. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  3365. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  3366. "daddiu %[width], %[width], -0x10 \n\t"
  3367. "bgtz %[width], 1b \n\t"
  3368. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
  3369. [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
  3370. [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
  3371. [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
  3372. [dest1_v] "=&f"(ftmp[10])
  3373. : [src_argb1555] "r"(src_argb1555),
  3374. [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
  3375. [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
  3376. [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
  3377. [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
  3378. [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
  3379. [two] "f"(0x02), [one] "f"(0x01)
  3380. : "memory");
  3381. }
  // Computes one row of U and one row of V bytes from two rows of 16-bit
  // ARGB4444 pixels, using Loongson MMI inline assembly.
  //
  //   src_argb4444         first source row.
  //   src_stride_argb4444  byte offset added to src_argb4444 (leading daddu)
  //                        to obtain the second source row.
  //   dst_u, dst_v         output rows; 8 bytes are stored to each per pass.
  //   width                pixel counter; reduced by 16 per pass, and the loop
  //                        repeats while the remainder is > 0.
  //
  // Each pass reads 32 bytes from both rows in four 8-byte groups (4 pixels
  // per group and row) and always stores a full 8 bytes of U and of V, no
  // matter how many pixels remain. The loop test sits at the bottom, so the
  // body runs at least once.
  //
  // Per 16-bit pixel: bits 0-3 go to b0, bits 4-7 to g0, bits 8-11 to r0;
  // bits 12-15 are dropped. The two rows are added, horizontally adjacent
  // pixels are added, and the sums are rescaled with (s >> 4) | (s << 2), so
  // every 2x2 block of pixels produces one U and one V byte.
  //
  // mask_u / mask_v hold the 16-bit multipliers for pmaddhw; value supplies
  // 0x2020 lanes that are pair-added to 0x4040 and multiplied by 2 to form the
  // 0x8080 bias. Presumably this evaluates
  //   U = (112*B - 74*G - 38*R + 0x8080) >> 8
  //   V = (112*R - 94*G - 18*B + 0x8080) >> 8
  // -- TODO confirm lane order against the Loongson pmaddhw/pshufh definitions.
  //
  // NOTE(review): src_argb4444, next_argb4444 (src_stride_argb4444), dst_u,
  // dst_v and width are listed as input operands ("r") but are written by the
  // asm (daddu/daddiu). GCC's extended-asm rules expect modified operands to
  // be declared as outputs ("+r") -- confirm and fix consistently.
  3382. void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
  3383. int src_stride_argb4444,
  3384. uint8_t* dst_u,
  3385. uint8_t* dst_v,
  3386. int width) {
  // Scratch storage bound to the 13 floating-point output registers below.
  3387. uint64_t ftmp[13];
  3388. uint64_t value = 0x2020202020202020;
  3389. uint64_t mask_u = 0x0026004a00700002;
  3390. uint64_t mask_v = 0x00020070005e0012;
  // pshufh selector used before the mask_u multiply.
  3391. uint64_t mask = 0x93;
  // c0 keeps the low nibble of each halfword, c1 the low byte.
  3392. uint64_t c0 = 0x000f000f000f000f;
  3393. uint64_t c1 = 0x00ff00ff00ff00ff;
  3394. __asm__ volatile(
  // next_argb4444 = src_argb4444 + stride (address of the second row).
  3395. "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
  3396. "1: \n\t"
  // ---- Group 0: bytes 0x00-0x07 of both rows -> dest0_u / dest0_v ----
  3397. "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
  3398. "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
  3399. "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
  3400. "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
  // Row 0 nibbles -> b0, g0, r0; row 1 nibbles -> dest0_u, dest0_v, src0.
  3401. "psrlh %[dest0_u], %[src0], %[eight] \n\t"
  3402. "and %[b0], %[src0], %[c0] \n\t"
  3403. "and %[src0], %[src0], %[c1] \n\t"
  3404. "psrlh %[g0], %[src0], %[four] \n\t"
  3405. "and %[r0], %[dest0_u], %[c0] \n\t"
  3406. "psrlh %[src0], %[src1], %[eight] \n\t"
  3407. "and %[dest0_u], %[src1], %[c0] \n\t"
  3408. "and %[src1], %[src1], %[c1] \n\t"
  3409. "psrlh %[dest0_v], %[src1], %[four] \n\t"
  3410. "and %[src0], %[src0], %[c0] \n\t"
  // Add the two rows per channel.
  3411. "paddh %[b0], %[b0], %[dest0_u] \n\t"
  3412. "paddh %[g0], %[g0], %[dest0_v] \n\t"
  3413. "paddh %[r0], %[r0], %[src0] \n\t"
  // Interleave b/r and add horizontally adjacent pixels.
  3414. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3415. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3416. "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
  3417. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  3418. "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
  // Rescale the b/r sums: (s >> 4) | (s << 2).
  3419. "psrlh %[b0], %[src0], %[four] \n\t"
  3420. "psllh %[r0], %[src0], %[two] \n\t"
  3421. "or %[b0], %[b0], %[r0] \n\t"
  // Same rescale for g (applied to the two-row sums before the pair add).
  3422. "psrlh %[r0], %[g0], %[four] \n\t"
  3423. "psllh %[g0], %[g0], %[two] \n\t"
  3424. "or %[g0], %[g0], %[r0] \n\t"
  // Interleave g with the constant lanes of value, then pair-add.
  3425. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3426. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3427. "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
  3428. "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
  3429. "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
  // src0 / src1: one {b, g, r, constant} quad per output sample.
  3430. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3431. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  // Multiply-accumulate with mask_v, and (after pshufh) with mask_u.
  3432. "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
  3433. "pshufh %[dest0_u], %[src0], %[mask] \n\t"
  3434. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  3435. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3436. "pshufh %[b0], %[src1], %[mask] \n\t"
  3437. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  // U = (low words - high words) >> 8.
  3438. "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
  3439. "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
  3440. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  3441. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  // V = (high words - low words) >> 8.
  3442. "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
  3443. "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
  3444. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  3445. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  // ---- Group 1: bytes 0x08-0x0f, same steps -> dest1_u / dest1_v ----
  3446. "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
  3447. "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
  3448. "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
  3449. "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
  3450. "psrlh %[dest1_u], %[src0], %[eight] \n\t"
  3451. "and %[b0], %[src0], %[c0] \n\t"
  3452. "and %[src0], %[src0], %[c1] \n\t"
  3453. "psrlh %[g0], %[src0], %[four] \n\t"
  3454. "and %[r0], %[dest1_u], %[c0] \n\t"
  3455. "psrlh %[src0], %[src1], %[eight] \n\t"
  3456. "and %[dest1_u], %[src1], %[c0] \n\t"
  3457. "and %[src1], %[src1], %[c1] \n\t"
  3458. "psrlh %[dest1_v], %[src1], %[four] \n\t"
  3459. "and %[src0], %[src0], %[c0] \n\t"
  3460. "paddh %[b0], %[b0], %[dest1_u] \n\t"
  3461. "paddh %[g0], %[g0], %[dest1_v] \n\t"
  3462. "paddh %[r0], %[r0], %[src0] \n\t"
  3463. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3464. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3465. "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
  3466. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  3467. "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
  3468. "psrlh %[b0], %[src0], %[four] \n\t"
  3469. "psllh %[r0], %[src0], %[two] \n\t"
  3470. "or %[b0], %[b0], %[r0] \n\t"
  3471. "psrlh %[r0], %[g0], %[four] \n\t"
  3472. "psllh %[g0], %[g0], %[two] \n\t"
  3473. "or %[g0], %[g0], %[r0] \n\t"
  3474. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3475. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3476. "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
  3477. "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
  3478. "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
  3479. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3480. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3481. "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
  3482. "pshufh %[dest1_u], %[src0], %[mask] \n\t"
  3483. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  3484. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3485. "pshufh %[b0], %[src1], %[mask] \n\t"
  3486. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3487. "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
  3488. "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
  3489. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  3490. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  3491. "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
  3492. "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
  3493. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  3494. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  // ---- Group 2: bytes 0x10-0x17, same steps -> dest2_u / dest2_v ----
  3495. "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
  3496. "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
  3497. "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
  3498. "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
  3499. "psrlh %[dest2_u], %[src0], %[eight] \n\t"
  3500. "and %[b0], %[src0], %[c0] \n\t"
  3501. "and %[src0], %[src0], %[c1] \n\t"
  3502. "psrlh %[g0], %[src0], %[four] \n\t"
  3503. "and %[r0], %[dest2_u], %[c0] \n\t"
  3504. "psrlh %[src0], %[src1], %[eight] \n\t"
  3505. "and %[dest2_u], %[src1], %[c0] \n\t"
  3506. "and %[src1], %[src1], %[c1] \n\t"
  3507. "psrlh %[dest2_v], %[src1], %[four] \n\t"
  3508. "and %[src0], %[src0], %[c0] \n\t"
  3509. "paddh %[b0], %[b0], %[dest2_u] \n\t"
  3510. "paddh %[g0], %[g0], %[dest2_v] \n\t"
  3511. "paddh %[r0], %[r0], %[src0] \n\t"
  3512. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3513. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3514. "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
  3515. "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
  3516. "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
  3517. "psrlh %[b0], %[src0], %[four] \n\t"
  3518. "psllh %[r0], %[src0], %[two] \n\t"
  3519. "or %[b0], %[b0], %[r0] \n\t"
  3520. "psrlh %[r0], %[g0], %[four] \n\t"
  3521. "psllh %[g0], %[g0], %[two] \n\t"
  3522. "or %[g0], %[g0], %[r0] \n\t"
  3523. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3524. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3525. "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
  3526. "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
  3527. "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
  3528. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3529. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3530. "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
  3531. "pshufh %[dest2_u], %[src0], %[mask] \n\t"
  3532. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  3533. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3534. "pshufh %[b0], %[src1], %[mask] \n\t"
  3535. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3536. "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
  3537. "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
  3538. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  3539. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  3540. "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
  3541. "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
  3542. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  3543. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  // ---- Group 3: bytes 0x18-0x1f, same steps -> dest3_u / dest3_v ----
  3544. "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
  3545. "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
  3546. "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
  3547. "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
  3548. "psrlh %[dest3_u], %[src0], %[eight] \n\t"
  3549. "and %[b0], %[src0], %[c0] \n\t"
  3550. "and %[src0], %[src0], %[c1] \n\t"
  3551. "psrlh %[g0], %[src0], %[four] \n\t"
  3552. "and %[r0], %[dest3_u], %[c0] \n\t"
  3553. "psrlh %[src0], %[src1], %[eight] \n\t"
  3554. "and %[dest3_u], %[src1], %[c0] \n\t"
  3555. "and %[src1], %[src1], %[c1] \n\t"
  3556. "psrlh %[dest3_v], %[src1], %[four] \n\t"
  3557. "and %[src0], %[src0], %[c0] \n\t"
  3558. "paddh %[b0], %[b0], %[dest3_u] \n\t"
  3559. "paddh %[g0], %[g0], %[dest3_v] \n\t"
  3560. "paddh %[r0], %[r0], %[src0] \n\t"
  3561. "punpcklhw %[src0], %[b0], %[r0] \n\t"
  3562. "punpckhhw %[src1], %[b0], %[r0] \n\t"
  3563. "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
  3564. "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
  3565. "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
  3566. "psrlh %[b0], %[src0], %[four] \n\t"
  3567. "psllh %[r0], %[src0], %[two] \n\t"
  3568. "or %[b0], %[b0], %[r0] \n\t"
  3569. "psrlh %[r0], %[g0], %[four] \n\t"
  3570. "psllh %[g0], %[g0], %[two] \n\t"
  3571. "or %[g0], %[g0], %[r0] \n\t"
  3572. "punpcklhw %[src0], %[g0], %[value] \n\t"
  3573. "punpckhhw %[src1], %[g0], %[value] \n\t"
  3574. "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
  3575. "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
  3576. "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
  3577. "punpcklhw %[src0], %[b0], %[g0] \n\t"
  3578. "punpckhhw %[src1], %[b0], %[g0] \n\t"
  3579. "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
  3580. "pshufh %[dest3_u], %[src0], %[mask] \n\t"
  3581. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  3582. "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
  3583. "pshufh %[b0], %[src1], %[mask] \n\t"
  3584. "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
  3585. "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
  3586. "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
  3587. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  3588. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  3589. "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
  3590. "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
  3591. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  3592. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  // Narrow the eight 32-bit U results to bytes and store them.
  3593. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  3594. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  3595. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  3596. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  3597. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  // Narrow the eight 32-bit V results to bytes and store them.
  3598. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  3599. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  3600. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  3601. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  3602. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  // Advance: 32 source bytes per row, 8 output bytes per plane, 16 pixels.
  3603. "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
  3604. "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
  3605. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  3606. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  3607. "daddiu %[width], %[width], -0x10 \n\t"
  3608. "bgtz %[width], 1b \n\t"
  3609. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
  3610. [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
  3611. [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
  3612. [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
  3613. [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
  3614. [dest3_v] "=&f"(ftmp[12])
  3615. : [src_argb4444] "r"(src_argb4444),
  3616. [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
  3617. [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
  3618. [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
  3619. [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
  3620. [two] "f"(0x02)
  3621. : "memory");
  3622. }
  // Converts a row of 32-bit ARGB pixels into full-resolution U and V rows
  // (one U byte and one V byte per pixel), using Loongson MMI inline assembly.
  //
  //   src_argb      source row; 32 bytes are read per pass.
  //   dst_u, dst_v  output rows; 8 bytes are stored to each per pass.
  //   width         pixel counter; reduced by 8 per pass, and the loop repeats
  //                 while the remainder is > 0.
  //
  // Each pass handles four 8-byte groups (two pixels each). The bytes of each
  // pixel are widened to 16-bit lanes; for U the lanes are shifted up by one
  // and lane 0 is replaced with value, for V lane 3 is replaced with value.
  // pmaddhw against mask_u / mask_v followed by a word subtraction and an
  // arithmetic shift by 8 gives the result. value (0x4040) times the
  // multiplier 2 in the masks forms the 0x8080 bias. Presumably this is
  //   U = (112*B - 74*G - 38*R + 0x8080) >> 8
  //   V = (112*R - 94*G - 18*B + 0x8080) >> 8
  // assuming B, G, R, A byte order in memory -- TODO confirm.
  //
  // A full 8 pixels are read and written on every pass regardless of how many
  // remain, and the body runs at least once (the test is at the bottom).
  //
  // NOTE(review): src_argb, dst_u, dst_v and width are listed as input
  // operands ("r") but are written by the asm. GCC's extended-asm rules expect
  // modified operands to be outputs ("+r") -- confirm. Also, width is stepped
  // with daddi here while the neighbouring UV routines use daddiu.
  3623. void ARGBToUV444Row_MMI(const uint8_t* src_argb,
  3624. uint8_t* dst_u,
  3625. uint8_t* dst_v,
  3626. int width) {
  // Scratch storage bound to the 12 floating-point output registers below.
  3627. uint64_t ftmp[12];
  3628. const uint64_t value = 0x4040;
  3629. const uint64_t mask_u = 0x0026004a00700002;
  3630. const uint64_t mask_v = 0x00020070005e0012;
  3631. __asm__ volatile(
  3632. "1: \n\t"
  // ---- Pixels 0-1 (bytes 0x00-0x07) -> dest0_u / dest0_v ----
  3633. "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
  3634. "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
  // Widen bytes to halfwords: src_lo = first pixel, src_hi = second pixel.
  3635. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  3636. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  // First pixel: U operand (shift lanes up, lane 0 = value), V operand
  // (lane 3 = value), then multiply-accumulate.
  3637. "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
  3638. "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
  3639. "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
  3640. "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
  3641. "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
  // Second pixel: same, reusing src_lo (U) and src_hi (V) as temporaries.
  3642. "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
  3643. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  3644. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  3645. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  3646. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  // U = (low words - high words) >> 8 for both pixels.
  3647. "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
  3648. "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
  3649. "psubw %[dest0_u], %[src0], %[src1] \n\t"
  3650. "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
  // V = (high words - low words) >> 8 for both pixels.
  3651. "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
  3652. "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
  3653. "psubw %[dest0_v], %[src1], %[src0] \n\t"
  3654. "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
  // ---- Pixels 2-3 (bytes 0x08-0x0f) -> dest1_u / dest1_v ----
  3655. "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
  3656. "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
  3657. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  3658. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  3659. "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
  3660. "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
  3661. "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
  3662. "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
  3663. "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
  3664. "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
  3665. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  3666. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  3667. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  3668. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  3669. "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
  3670. "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
  3671. "psubw %[dest1_u], %[src0], %[src1] \n\t"
  3672. "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
  3673. "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
  3674. "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
  3675. "psubw %[dest1_v], %[src1], %[src0] \n\t"
  3676. "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
  // ---- Pixels 4-5 (bytes 0x10-0x17) -> dest2_u / dest2_v ----
  3677. "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
  3678. "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
  3679. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  3680. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  3681. "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
  3682. "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
  3683. "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
  3684. "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
  3685. "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
  3686. "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
  3687. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  3688. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  3689. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  3690. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  3691. "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
  3692. "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
  3693. "psubw %[dest2_u], %[src0], %[src1] \n\t"
  3694. "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
  3695. "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
  3696. "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
  3697. "psubw %[dest2_v], %[src1], %[src0] \n\t"
  3698. "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
  // ---- Pixels 6-7 (bytes 0x18-0x1f) -> dest3_u / dest3_v ----
  3699. "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
  3700. "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
  3701. "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
  3702. "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
  3703. "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
  3704. "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
  3705. "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
  3706. "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
  3707. "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
  3708. "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
  3709. "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
  3710. "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
  3711. "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
  3712. "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
  3713. "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
  3714. "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
  3715. "psubw %[dest3_u], %[src0], %[src1] \n\t"
  3716. "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
  3717. "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
  3718. "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
  3719. "psubw %[dest3_v], %[src1], %[src0] \n\t"
  3720. "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
  // Narrow the eight 32-bit U results to bytes and store them.
  3721. "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
  3722. "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
  3723. "packushb %[dest0_u], %[src0], %[src1] \n\t"
  3724. "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
  3725. "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
  // Narrow the eight 32-bit V results to bytes and store them.
  3726. "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
  3727. "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
  3728. "packushb %[dest0_v], %[src0], %[src1] \n\t"
  3729. "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
  3730. "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
  // Advance: 32 source bytes, 8 output bytes per plane, 8 pixels.
  3731. "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
  3732. "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
  3733. "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
  3734. "daddi %[width], %[width], -0x08 \n\t"
  3735. "bgtz %[width], 1b \n\t"
  3736. : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
  3737. [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
  3738. [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
  3739. [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
  3740. [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
  3741. [dest3_v] "=&f"(ftmp[11])
  3742. : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
  3743. [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
  3744. [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
  3745. [eight] "f"(0x08)
  3746. : "memory");
  3747. }
  // Replaces the three low bytes of every 32-bit pixel in src_argb with a
  // single weighted gray value and writes the result to dst_argb; the top
  // byte of each pixel is copied through unchanged.
  //
  //   src_argb  source row; 8 bytes (two pixels) are read per pass.
  //   dst_argb  destination row; 8 bytes are written per pass.
  //   width     pixel counter; reduced by 2 per pass. The loop ends only when
  //             it reaches exactly zero (bnez), so it must be a positive even
  //             number.
  //
  // With c0, c1, c2 the three low bytes of a pixel, the code computes
  //   gray = (15*c0 + 75*c1 + 38*c2 + 64) >> 7
  // (presumably c0 = B, c1 = G, c2 = R -- confirm byte order with callers).
  //
  // NOTE(review): src_ptr, dst_ptr and width are listed as input operands
  // ("r") but are written by the asm. GCC's extended-asm rules expect
  // modified operands to be outputs ("+r") -- confirm.
  3748. void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  3749. uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
  3750. uint64_t tmp0, tmp1;
  // mask0: zero, used to widen bytes to halfwords.
  3751. const uint64_t mask0 = 0x0;
  // mask1: the constant 1 placed in lane 3 so that the 0x0040 multiplier in
  // mask2 contributes the +64 term.
  3752. const uint64_t mask1 = 0x01;
  // mask2: per-lane multipliers 15, 75, 38, 64 (lane 0 .. lane 3).
  3753. const uint64_t mask2 = 0x00400026004B000FULL;
  // mask3 selects bytes 3 and 7 (top byte of each pixel); mask4 the rest.
  3754. const uint64_t mask3 = 0xFF000000FF000000ULL;
  3755. const uint64_t mask4 = ~mask3;
  3756. const uint64_t shift = 0x07;
  3757. __asm__ volatile(
  3758. "1: \n\t"
  // Load two pixels and remember their top bytes.
  3759. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  3760. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  3761. "and %[src37], %[src], %[mask3] \n\t"
  // First pixel: widen, force lane 3 to 1, multiply-accumulate, add the two
  // partial sums, shift, and replicate the gray value across all lanes.
  3762. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  3763. "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
  3764. "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
  3765. "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
  3766. "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
  3767. "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
  3768. "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
  3769. "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
  // Second pixel: same steps.
  3770. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  3771. "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
  3772. "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
  3773. "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
  3774. "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
  3775. "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
  3776. "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
  3777. "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
  // Pack both pixels to bytes, clear the top bytes, and restore the
  // original top bytes from the source.
  3778. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  3779. "and %[dest], %[dest], %[mask4] \n\t"
  3780. "or %[dest], %[dest], %[src37] \n\t"
  3781. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  3782. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  // Advance by 8 bytes / 2 pixels.
  3783. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  3784. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  3785. "daddi %[width], %[width], -0x02 \n\t"
  3786. "bnez %[width], 1b \n\t"
  3787. : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  3788. [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
  3789. [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
  3790. [src37] "=&f"(src37)
  3791. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
  3792. [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
  3793. [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
  3794. : "memory");
  3795. }
  3796. // Convert a row of image to Sepia tone.
  //
  // Works in place: pixels are loaded from dst_argb, transformed and stored
  // back to the same addresses, two 32-bit pixels (8 bytes) per pass.
  //
  //   dst_argb  row to transform in place.
  //   width     pixel counter; reduced by 2 per pass. The loop ends only when
  //             it reaches exactly zero (bnez), so it must be a positive even
  //             number.
  //
  // With c0, c1, c2 the three low bytes of a pixel, the new bytes are
  //   c0' = (17*c0 + 68*c1 + 35*c2) >> 7
  //   c1' = (22*c0 + 88*c1 + 45*c2) >> 7
  //   c2' = (24*c0 + 98*c1 + 50*c2) >> 7
  // packed back to bytes with packsswh/packushb. The top byte of each pixel is
  // saved before the transform and OR-ed back afterwards, so it is unchanged.
  // (Presumably c0 = B, c1 = G, c2 = R -- confirm byte order with callers.)
  //
  // NOTE(review): dst_ptr and width are listed as input operands ("r") but are
  // written by the asm. GCC's extended-asm rules expect modified operands to
  // be outputs ("+r") -- confirm.
  3797. void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
  3798. uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
  3799. uint64_t tmp0, tmp1;
  // mask0: zero, used for widening and as zero padding.
  3800. const uint64_t mask0 = 0x0;
  // mask1..mask3: per-lane multipliers for c0', c1', c2'; lane 3 is 0.
  3801. const uint64_t mask1 = 0x002300440011ULL;
  3802. const uint64_t mask2 = 0x002D00580016ULL;
  3803. const uint64_t mask3 = 0x003200620018ULL;
  // mask4 selects bytes 3 and 7 (top byte of each pixel).
  3804. const uint64_t mask4 = 0xFF000000FF000000ULL;
  3805. const uint64_t shift = 0x07;
  3806. __asm__ volatile(
  3807. "1: \n\t"
  // Load two pixels and remember their top bytes.
  3808. "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  3809. "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  3810. "and %[dest37], %[dest], %[mask4] \n\t"
  // First pixel: widen to halfwords and form the three weighted sums.
  3811. "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
  3812. "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
  3813. "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
  3814. "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
  // dest0 = {c0', c1'} as 32-bit words.
  3815. "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
  3816. "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
  3817. "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
  3818. "psrlw %[dest0], %[dest0], %[shift] \n\t"
  // dest1 = {c2', 0}; the zero word leaves room for the saved top byte.
  3819. "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
  3820. "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
  3821. "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
  3822. "psrlw %[dest1], %[dest1], %[shift] \n\t"
  3823. "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
  // Second pixel: same steps.
  3824. "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
  3825. "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
  3826. "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
  3827. "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
  3828. "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
  3829. "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
  3830. "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
  3831. "psrlw %[dest0], %[dest0], %[shift] \n\t"
  3832. "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
  3833. "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
  3834. "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
  3835. "psrlw %[dest1], %[dest1], %[shift] \n\t"
  3836. "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
  // Pack both pixels to bytes, restore the top bytes and store in place.
  3837. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  3838. "or %[dest], %[dest], %[dest37] \n\t"
  3839. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  3840. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  // Advance by 8 bytes / 2 pixels.
  3841. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  3842. "daddi %[width], %[width], -0x02 \n\t"
  3843. "bnez %[width], 1b \n\t"
  3844. : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  3845. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  3846. [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
  3847. [dest] "=&f"(dest)
  3848. : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
  3849. [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
  3850. [mask4] "f"(mask4), [shift] "f"(shift)
  3851. : "memory");
  3852. }
  3853. // Apply color matrix to a row of image. Matrix is signed.
  3854. // TODO(fbarchard): Consider adding rounding (+32).
  3855. void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
  3856. uint8_t* dst_argb,
  3857. const int8_t* matrix_argb,
  3858. int width) {
  3859. uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
  3860. dest3;
  3861. uint64_t matrix, matrix_hi, matrix_lo;
  3862. uint64_t tmp0, tmp1;
  3863. const uint64_t shift0 = 0x06;
  3864. const uint64_t shift1 = 0x08;
  3865. const uint64_t mask0 = 0x0;
  3866. const uint64_t mask1 = 0x08;
  3867. __asm__ volatile(
  3868. "1: \n\t"
  3869. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  3870. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  3871. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  3872. "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
  3873. "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
  3874. "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
  3875. "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3876. "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3877. "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
  3878. "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3879. "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3880. "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
  3881. "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
  3882. "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
  3883. "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
  3884. "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
  3885. "psraw %[dest0], %[dest0], %[shift0] \n\t"
  3886. "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
  3887. "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
  3888. "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
  3889. "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3890. "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3891. "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
  3892. "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3893. "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3894. "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
  3895. "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
  3896. "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
  3897. "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
  3898. "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
  3899. "psraw %[dest1], %[dest1], %[shift0] \n\t"
  3900. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  3901. "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
  3902. "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
  3903. "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
  3904. "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3905. "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3906. "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
  3907. "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3908. "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3909. "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
  3910. "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
  3911. "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
  3912. "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
  3913. "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
  3914. "psraw %[dest2], %[dest2], %[shift0] \n\t"
  3915. "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
  3916. "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
  3917. "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
  3918. "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3919. "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
  3920. "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
  3921. "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3922. "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
  3923. "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
  3924. "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
  3925. "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
  3926. "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
  3927. "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
  3928. "psraw %[dest3], %[dest3], %[shift0] \n\t"
  3929. "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
  3930. "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
  3931. "packushb %[dest], %[tmp0], %[tmp1] \n\t"
  3932. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  3933. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  3934. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  3935. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  3936. "daddi %[width], %[width], -0x02 \n\t"
  3937. "bnez %[width], 1b \n\t"
  3938. : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  3939. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  3940. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  3941. [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
  3942. [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
  3943. [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
  3944. : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
  3945. [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
  3946. [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
  3947. : "memory");
  3948. }
  3949. void ARGBShadeRow_MMI(const uint8_t* src_argb,
  3950. uint8_t* dst_argb,
  3951. int width,
  3952. uint32_t value) {
  3953. uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
  3954. const uint64_t shift = 0x08;
  3955. __asm__ volatile(
  3956. "1: \n\t"
  3957. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  3958. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  3959. "punpcklbh %[src_lo], %[src], %[src] \n\t"
  3960. "punpckhbh %[src_hi], %[src], %[src] \n\t"
  3961. "punpcklbh %[value], %[value], %[value] \n\t"
  3962. "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
  3963. "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
  3964. "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
  3965. "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
  3966. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  3967. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  3968. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  3969. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  3970. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  3971. "daddi %[width], %[width], -0x02 \n\t"
  3972. "bnez %[width], 1b \n\t"
  3973. : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  3974. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
  3975. [dest] "=&f"(dest)
  3976. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
  3977. [value] "f"(value), [shift] "f"(shift)
  3978. : "memory");
  3979. }
  3980. void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
  3981. const uint8_t* src_argb1,
  3982. uint8_t* dst_argb,
  3983. int width) {
  3984. uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
  3985. uint64_t dest, dest_lo, dest_hi;
  3986. const uint64_t mask = 0x0;
  3987. __asm__ volatile(
  3988. "1: \n\t"
  3989. "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
  3990. "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  3991. "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
  3992. "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
  3993. "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
  3994. "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  3995. "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
  3996. "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
  3997. "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
  3998. "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
  3999. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  4000. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  4001. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4002. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
  4003. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
  4004. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  4005. "daddi %[width], %[width], -0x02 \n\t"
  4006. "bnez %[width], 1b \n\t"
  4007. : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
  4008. [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
  4009. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
  4010. [src1] "=&f"(src1), [dest] "=&f"(dest)
  4011. : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
  4012. [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
  4013. : "memory");
  4014. }
  4015. void ARGBAddRow_MMI(const uint8_t* src_argb0,
  4016. const uint8_t* src_argb1,
  4017. uint8_t* dst_argb,
  4018. int width) {
  4019. uint64_t src0, src1, dest;
  4020. __asm__ volatile(
  4021. "1: \n\t"
  4022. "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
  4023. "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  4024. "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
  4025. "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  4026. "paddusb %[dest], %[src0], %[src1] \n\t"
  4027. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  4028. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4029. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
  4030. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
  4031. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  4032. "daddi %[width], %[width], -0x02 \n\t"
  4033. "bnez %[width], 1b \n\t"
  4034. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  4035. : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
  4036. [dst_ptr] "r"(dst_argb), [width] "r"(width)
  4037. : "memory");
  4038. }
  4039. void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
  4040. const uint8_t* src_argb1,
  4041. uint8_t* dst_argb,
  4042. int width) {
  4043. uint64_t src0, src1, dest;
  4044. __asm__ volatile(
  4045. "1: \n\t"
  4046. "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
  4047. "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  4048. "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
  4049. "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  4050. "psubusb %[dest], %[src0], %[src1] \n\t"
  4051. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  4052. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4053. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
  4054. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
  4055. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  4056. "daddi %[width], %[width], -0x02 \n\t"
  4057. "bnez %[width], 1b \n\t"
  4058. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  4059. : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
  4060. [dst_ptr] "r"(dst_argb), [width] "r"(width)
  4061. : "memory");
  4062. }
  4063. // Sobel functions which mimic SSSE3.
  4064. void SobelXRow_MMI(const uint8_t* src_y0,
  4065. const uint8_t* src_y1,
  4066. const uint8_t* src_y2,
  4067. uint8_t* dst_sobelx,
  4068. int width) {
  4069. uint64_t y00 = 0, y10 = 0, y20 = 0;
  4070. uint64_t y02 = 0, y12 = 0, y22 = 0;
  4071. uint64_t zero = 0x0;
  4072. uint64_t sobel = 0x0;
  4073. __asm__ volatile(
  4074. "1: \n\t"
  4075. "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
  4076. "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
  4077. "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
  4078. "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
  4079. "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
  4080. "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
  4081. "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
  4082. "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
  4083. "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
  4084. "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
  4085. "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
  4086. "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
  4087. "punpcklbh %[y00], %[y00], %[zero] \n\t"
  4088. "punpcklbh %[y10], %[y10], %[zero] \n\t"
  4089. "punpcklbh %[y20], %[y20], %[zero] \n\t"
  4090. "punpcklbh %[y02], %[y02], %[zero] \n\t"
  4091. "punpcklbh %[y12], %[y12], %[zero] \n\t"
  4092. "punpcklbh %[y22], %[y22], %[zero] \n\t"
  4093. "paddh %[y00], %[y00], %[y10] \n\t" // a+b
  4094. "paddh %[y20], %[y20], %[y10] \n\t" // c+b
  4095. "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
  4096. "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
  4097. "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
  4098. "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
  4099. "pmaxsh %[y10], %[y00], %[y02] \n\t"
  4100. "pminsh %[y20], %[y00], %[y02] \n\t"
  4101. "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
  4102. "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
  4103. "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
  4104. "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
  4105. "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
  4106. "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
  4107. "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
  4108. "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
  4109. "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
  4110. "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
  4111. "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
  4112. "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
  4113. "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
  4114. "punpcklbh %[y00], %[y00], %[zero] \n\t"
  4115. "punpcklbh %[y10], %[y10], %[zero] \n\t"
  4116. "punpcklbh %[y20], %[y20], %[zero] \n\t"
  4117. "punpcklbh %[y02], %[y02], %[zero] \n\t"
  4118. "punpcklbh %[y12], %[y12], %[zero] \n\t"
  4119. "punpcklbh %[y22], %[y22], %[zero] \n\t"
  4120. "paddh %[y00], %[y00], %[y10] \n\t"
  4121. "paddh %[y20], %[y20], %[y10] \n\t"
  4122. "paddh %[y00], %[y00], %[y20] \n\t"
  4123. "paddh %[y02], %[y02], %[y12] \n\t"
  4124. "paddh %[y22], %[y22], %[y12] \n\t"
  4125. "paddh %[y02], %[y02], %[y22] \n\t"
  4126. "pmaxsh %[y10], %[y00], %[y02] \n\t"
  4127. "pminsh %[y20], %[y00], %[y02] \n\t"
  4128. "psubh %[y00], %[y10], %[y20] \n\t"
  4129. "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
  4130. "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
  4131. "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
  4132. "daddiu %[src_y0], %[src_y0], 8 \n\t"
  4133. "daddiu %[src_y1], %[src_y1], 8 \n\t"
  4134. "daddiu %[src_y2], %[src_y2], 8 \n\t"
  4135. "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
  4136. "daddiu %[width], %[width], -8 \n\t"
  4137. "bgtz %[width], 1b \n\t"
  4138. "nop \n\t"
  4139. : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
  4140. [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
  4141. : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
  4142. [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
  4143. : "memory");
  4144. }
  4145. void SobelYRow_MMI(const uint8_t* src_y0,
  4146. const uint8_t* src_y1,
  4147. uint8_t* dst_sobely,
  4148. int width) {
  4149. uint64_t y00 = 0, y01 = 0, y02 = 0;
  4150. uint64_t y10 = 0, y11 = 0, y12 = 0;
  4151. uint64_t zero = 0x0;
  4152. uint64_t sobel = 0x0;
  4153. __asm__ volatile(
  4154. "1: \n\t"
  4155. "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
  4156. "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
  4157. "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
  4158. "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
  4159. "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
  4160. "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
  4161. "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
  4162. "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
  4163. "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
  4164. "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
  4165. "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
  4166. "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
  4167. "punpcklbh %[y00], %[y00], %[zero] \n\t"
  4168. "punpcklbh %[y01], %[y01], %[zero] \n\t"
  4169. "punpcklbh %[y02], %[y02], %[zero] \n\t"
  4170. "punpcklbh %[y10], %[y10], %[zero] \n\t"
  4171. "punpcklbh %[y11], %[y11], %[zero] \n\t"
  4172. "punpcklbh %[y12], %[y12], %[zero] \n\t"
  4173. "paddh %[y00], %[y00], %[y01] \n\t" // a+b
  4174. "paddh %[y02], %[y02], %[y01] \n\t" // c+b
  4175. "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
  4176. "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
  4177. "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
  4178. "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
  4179. "pmaxsh %[y02], %[y00], %[y10] \n\t"
  4180. "pminsh %[y12], %[y00], %[y10] \n\t"
  4181. "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
  4182. "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
  4183. "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
  4184. "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
  4185. "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
  4186. "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
  4187. "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
  4188. "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
  4189. "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
  4190. "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
  4191. "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
  4192. "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
  4193. "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
  4194. "punpcklbh %[y00], %[y00], %[zero] \n\t"
  4195. "punpcklbh %[y01], %[y01], %[zero] \n\t"
  4196. "punpcklbh %[y02], %[y02], %[zero] \n\t"
  4197. "punpcklbh %[y10], %[y10], %[zero] \n\t"
  4198. "punpcklbh %[y11], %[y11], %[zero] \n\t"
  4199. "punpcklbh %[y12], %[y12], %[zero] \n\t"
  4200. "paddh %[y00], %[y00], %[y01] \n\t"
  4201. "paddh %[y02], %[y02], %[y01] \n\t"
  4202. "paddh %[y00], %[y00], %[y02] \n\t"
  4203. "paddh %[y10], %[y10], %[y11] \n\t"
  4204. "paddh %[y12], %[y12], %[y11] \n\t"
  4205. "paddh %[y10], %[y10], %[y12] \n\t"
  4206. "pmaxsh %[y02], %[y00], %[y10] \n\t"
  4207. "pminsh %[y12], %[y00], %[y10] \n\t"
  4208. "psubh %[y00], %[y02], %[y12] \n\t"
  4209. "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
  4210. "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
  4211. "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
  4212. "daddiu %[src_y0], %[src_y0], 8 \n\t"
  4213. "daddiu %[src_y1], %[src_y1], 8 \n\t"
  4214. "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
  4215. "daddiu %[width], %[width], -8 \n\t"
  4216. "bgtz %[width], 1b \n\t"
  4217. "nop \n\t"
  4218. : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
  4219. [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
  4220. : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
  4221. [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
  4222. : "memory");
  4223. }
  4224. void SobelRow_MMI(const uint8_t* src_sobelx,
  4225. const uint8_t* src_sobely,
  4226. uint8_t* dst_argb,
  4227. int width) {
  4228. double temp[3];
  4229. uint64_t c1 = 0xff000000ff000000;
  4230. __asm__ volatile(
  4231. "1: \n\t"
  4232. "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
  4233. "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
  4234. "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
  4235. "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
  4236. // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
  4237. "paddusb %[t2] , %[t0], %[t1] \n\t"
  4238. // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
  4239. "punpcklbh %[t0], %[t2], %[t2] \n\t"
  4240. // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0
  4241. "punpcklbh %[t1], %[t0], %[t0] \n\t"
  4242. "or %[t1], %[t1], %[c1] \n\t"
  4243. // 255 s1 s1 s1 s55 s0 s0 s0
  4244. "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
  4245. "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
  4246. // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
  4247. "punpckhbh %[t1], %[t0], %[t0] \n\t"
  4248. "or %[t1], %[t1], %[c1] \n\t"
  4249. // 255 s3 s3 s3 255 s2 s2 s2
  4250. "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
  4251. "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
  4252. // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
  4253. "punpckhbh %[t0], %[t2], %[t2] \n\t"
  4254. // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
  4255. "punpcklbh %[t1], %[t0], %[t0] \n\t"
  4256. "or %[t1], %[t1], %[c1] \n\t"
  4257. "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
  4258. "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
  4259. // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
  4260. "punpckhbh %[t1], %[t0], %[t0] \n\t"
  4261. "or %[t1], %[t1], %[c1] \n\t"
  4262. "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
  4263. "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
  4264. "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
  4265. "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
  4266. "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
  4267. "daddiu %[width], %[width], -8 \n\t"
  4268. "bgtz %[width], 1b \n\t"
  4269. "nop \n\t"
  4270. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
  4271. : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
  4272. [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
  4273. : "memory");
  4274. }
  4275. void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
  4276. const uint8_t* src_sobely,
  4277. uint8_t* dst_y,
  4278. int width) {
  4279. uint64_t tr = 0;
  4280. uint64_t tb = 0;
  4281. __asm__ volatile(
  4282. "1: \n\t"
  4283. "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
  4284. "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
  4285. "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
  4286. "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
  4287. "paddusb %[tr], %[tr], %[tb] \n\t" // g
  4288. "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
  4289. "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
  4290. "daddiu %[dst_y], %[dst_y], 8 \n\t"
  4291. "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
  4292. "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
  4293. "daddiu %[width], %[width], -8 \n\t"
  4294. "bgtz %[width], 1b \n\t"
  4295. "nop \n\t"
  4296. : [tr] "=&f"(tr), [tb] "=&f"(tb)
  4297. : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
  4298. [dst_y] "r"(dst_y), [width] "r"(width)
  4299. : "memory");
  4300. }
  4301. void SobelXYRow_MMI(const uint8_t* src_sobelx,
  4302. const uint8_t* src_sobely,
  4303. uint8_t* dst_argb,
  4304. int width) {
  4305. uint64_t temp[3];
  4306. uint64_t result = 0;
  4307. uint64_t gb = 0;
  4308. uint64_t cr = 0;
  4309. uint64_t c1 = 0xffffffffffffffff;
  4310. __asm__ volatile(
  4311. "1: \n\t"
  4312. "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
  4313. "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
  4314. "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
  4315. "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
  4316. "paddusb %[tg] , %[tr], %[tb] \n\t" // g
  4317. // g3 b3 g2 b2 g1 b1 g0 b0
  4318. "punpcklbh %[gb], %[tb], %[tg] \n\t"
  4319. // c3 r3 r2 r2 c1 r1 c0 r0
  4320. "punpcklbh %[cr], %[tr], %[c1] \n\t"
  4321. // c1 r1 g1 b1 c0 r0 g0 b0
  4322. "punpcklhw %[result], %[gb], %[cr] \n\t"
  4323. "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
  4324. "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
  4325. // c3 r3 g3 b3 c2 r2 g2 b2
  4326. "punpckhhw %[result], %[gb], %[cr] \n\t"
  4327. "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
  4328. "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
  4329. // g7 b7 g6 b6 g5 b5 g4 b4
  4330. "punpckhbh %[gb], %[tb], %[tg] \n\t"
  4331. // c7 r7 c6 r6 c5 r5 c4 r4
  4332. "punpckhbh %[cr], %[tr], %[c1] \n\t"
  4333. // c5 r5 g5 b5 c4 r4 g4 b4
  4334. "punpcklhw %[result], %[gb], %[cr] \n\t"
  4335. "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
  4336. "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
  4337. // c7 r7 g7 b7 c6 r6 g6 b6
  4338. "punpckhhw %[result], %[gb], %[cr] \n\t"
  4339. "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
  4340. "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
  4341. "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
  4342. "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
  4343. "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
  4344. "daddiu %[width], %[width], -8 \n\t"
  4345. "bgtz %[width], 1b \n\t"
  4346. "nop \n\t"
  4347. : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
  4348. [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
  4349. : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
  4350. [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
  4351. : "memory");
  4352. }
  4353. void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  4354. // Copy a Y to RGB.
  4355. uint64_t src, dest;
  4356. const uint64_t mask0 = 0x00ffffff00ffffffULL;
  4357. const uint64_t mask1 = ~mask0;
  4358. __asm__ volatile(
  4359. "1: \n\t"
  4360. "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
  4361. "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
  4362. "punpcklbh %[src], %[src], %[src] \n\t"
  4363. "punpcklhw %[dest], %[src], %[src] \n\t"
  4364. "and %[dest], %[dest], %[mask0] \n\t"
  4365. "or %[dest], %[dest], %[mask1] \n\t"
  4366. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4367. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  4368. "punpckhhw %[dest], %[src], %[src] \n\t"
  4369. "and %[dest], %[dest], %[mask0] \n\t"
  4370. "or %[dest], %[dest], %[mask1] \n\t"
  4371. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  4372. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  4373. "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
  4374. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  4375. "daddi %[width], %[width], -0x04 \n\t"
  4376. "bnez %[width], 1b \n\t"
  4377. : [src] "=&f"(src), [dest] "=&f"(dest)
  4378. : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
  4379. [mask1] "f"(mask1), [width] "r"(width)
  4380. : "memory");
  4381. }
  4382. void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
  4383. uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
  4384. const uint64_t mask0 = 0x0;
  4385. const uint64_t mask1 = 0x55;
  4386. const uint64_t mask2 = 0xAA;
  4387. const uint64_t mask3 = 0xFF;
  4388. const uint64_t mask4 = 0x4A354A354A354A35ULL;
  4389. const uint64_t mask5 = 0x0488048804880488ULL;
  4390. const uint64_t shift0 = 0x08;
  4391. const uint64_t shift1 = 0x06;
  4392. __asm__ volatile(
  4393. "1: \n\t"
  4394. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  4395. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  4396. "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
  4397. "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
  4398. "pshufh %[src], %[src_lo], %[mask0] \n\t"
  4399. "psllh %[dest_lo], %[src], %[shift0] \n\t"
  4400. "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
  4401. "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
  4402. "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
  4403. "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
  4404. "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
  4405. "pshufh %[src], %[src_lo], %[mask1] \n\t"
  4406. "psllh %[dest_hi], %[src], %[shift0] \n\t"
  4407. "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
  4408. "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
  4409. "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
  4410. "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
  4411. "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
  4412. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  4413. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  4414. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4415. "pshufh %[src], %[src_lo], %[mask2] \n\t"
  4416. "psllh %[dest_lo], %[src], %[shift0] \n\t"
  4417. "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
  4418. "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
  4419. "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
  4420. "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
  4421. "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
  4422. "pshufh %[src], %[src_lo], %[mask3] \n\t"
  4423. "psllh %[dest_hi], %[src], %[shift0] \n\t"
  4424. "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
  4425. "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
  4426. "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
  4427. "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
  4428. "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
  4429. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  4430. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  4431. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  4432. "pshufh %[src], %[src_hi], %[mask0] \n\t"
  4433. "psllh %[dest_lo], %[src], %[shift0] \n\t"
  4434. "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
  4435. "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
  4436. "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
  4437. "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
  4438. "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
  4439. "pshufh %[src], %[src_hi], %[mask1] \n\t"
  4440. "psllh %[dest_hi], %[src], %[shift0] \n\t"
  4441. "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
  4442. "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
  4443. "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
  4444. "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
  4445. "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
  4446. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  4447. "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
  4448. "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
  4449. "pshufh %[src], %[src_hi], %[mask2] \n\t"
  4450. "psllh %[dest_lo], %[src], %[shift0] \n\t"
  4451. "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
  4452. "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
  4453. "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
  4454. "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
  4455. "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
  4456. "pshufh %[src], %[src_hi], %[mask3] \n\t"
  4457. "psllh %[dest_hi], %[src], %[shift0] \n\t"
  4458. "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
  4459. "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
  4460. "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
  4461. "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
  4462. "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
  4463. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  4464. "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
  4465. "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
  4466. "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
  4467. "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
  4468. "daddi %[width], %[width], -0x08 \n\t"
  4469. "bnez %[width], 1b \n\t"
  4470. : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
  4471. [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
  4472. [dest_lo] "=&f"(dest_lo)
  4473. : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
  4474. [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
  4475. [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
  4476. [shift1] "f"(shift1), [width] "r"(width)
  4477. : "memory");
  4478. }
  4479. void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  4480. uint64_t source, src0, src1, dest;
  4481. const uint64_t mask0 = 0x0;
  4482. const uint64_t mask1 = 0x1b;
  4483. src += width - 1;
  4484. __asm__ volatile(
  4485. "1: \n\t"
  4486. "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
  4487. "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
  4488. "punpcklbh %[src0], %[source], %[mask0] \n\t"
  4489. "pshufh %[src0], %[src0], %[mask1] \n\t"
  4490. "punpckhbh %[src1], %[source], %[mask0] \n\t"
  4491. "pshufh %[src1], %[src1], %[mask1] \n\t"
  4492. "packushb %[dest], %[src1], %[src0] \n\t"
  4493. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  4494. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4495. "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
  4496. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  4497. "daddi %[width], %[width], -0x08 \n\t"
  4498. "bnez %[width], 1b \n\t"
  4499. : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
  4500. [src1] "=&f"(src1)
  4501. : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
  4502. [mask1] "f"(mask1), [width] "r"(width)
  4503. : "memory");
  4504. }
  4505. void MirrorUVRow_MMI(const uint8_t* src_uv,
  4506. uint8_t* dst_u,
  4507. uint8_t* dst_v,
  4508. int width) {
  4509. uint64_t src0, src1, dest0, dest1;
  4510. const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
  4511. const uint64_t mask1 = 0x1b;
  4512. const uint64_t shift = 0x08;
  4513. src_uv += (width - 1) << 1;
  4514. __asm__ volatile(
  4515. "1: \n\t"
  4516. "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
  4517. "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
  4518. "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
  4519. "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
  4520. "and %[dest0], %[src0], %[mask0] \n\t"
  4521. "pshufh %[dest0], %[dest0], %[mask1] \n\t"
  4522. "and %[dest1], %[src1], %[mask0] \n\t"
  4523. "pshufh %[dest1], %[dest1], %[mask1] \n\t"
  4524. "packushb %[dest0], %[dest0], %[dest1] \n\t"
  4525. "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
  4526. "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
  4527. "psrlh %[dest0], %[src0], %[shift] \n\t"
  4528. "pshufh %[dest0], %[dest0], %[mask1] \n\t"
  4529. "psrlh %[dest1], %[src1], %[shift] \n\t"
  4530. "pshufh %[dest1], %[dest1], %[mask1] \n\t"
  4531. "packushb %[dest0], %[dest0], %[dest1] \n\t"
  4532. "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
  4533. "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
  4534. "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
  4535. "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
  4536. "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
  4537. "daddi %[width], %[width], -0x08 \n\t"
  4538. "bnez %[width], 1b \n\t"
  4539. : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
  4540. [src1] "=&f"(src1)
  4541. : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
  4542. [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
  4543. [shift] "f"(shift)
  4544. : "memory");
  4545. }
  4546. void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  4547. src += (width - 1) * 4;
  4548. uint64_t temp = 0x0;
  4549. uint64_t shuff = 0x4e; // 01 00 11 10
  4550. __asm__ volatile(
  4551. "1: \n\t"
  4552. "gsldlc1 %[temp], 3(%[src]) \n\t"
  4553. "gsldrc1 %[temp], -4(%[src]) \n\t"
  4554. "pshufh %[temp], %[temp], %[shuff] \n\t"
  4555. "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
  4556. "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
  4557. "daddiu %[src], %[src], -0x08 \n\t"
  4558. "daddiu %[dst], %[dst], 0x08 \n\t"
  4559. "daddiu %[width], %[width], -0x02 \n\t"
  4560. "bnez %[width], 1b \n\t"
  4561. : [temp] "=&f"(temp)
  4562. : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
  4563. : "memory");
  4564. }
  4565. void SplitUVRow_MMI(const uint8_t* src_uv,
  4566. uint8_t* dst_u,
  4567. uint8_t* dst_v,
  4568. int width) {
  4569. uint64_t c0 = 0x00ff00ff00ff00ff;
  4570. uint64_t temp[4];
  4571. uint64_t shift = 0x08;
  4572. __asm__ volatile(
  4573. "1: \n\t"
  4574. "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
  4575. "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
  4576. "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
  4577. "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
  4578. "and %[t2], %[t0], %[c0] \n\t"
  4579. "and %[t3], %[t1], %[c0] \n\t"
  4580. "packushb %[t2], %[t2], %[t3] \n\t"
  4581. "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
  4582. "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
  4583. "psrlh %[t2], %[t0], %[shift] \n\t"
  4584. "psrlh %[t3], %[t1], %[shift] \n\t"
  4585. "packushb %[t2], %[t2], %[t3] \n\t"
  4586. "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
  4587. "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
  4588. "daddiu %[src_uv], %[src_uv], 16 \n\t"
  4589. "daddiu %[dst_u], %[dst_u], 8 \n\t"
  4590. "daddiu %[dst_v], %[dst_v], 8 \n\t"
  4591. "daddiu %[width], %[width], -8 \n\t"
  4592. "bgtz %[width], 1b \n\t"
  4593. "nop \n\t"
  4594. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
  4595. [t3] "=&f"(temp[3])
  4596. : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
  4597. [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
  4598. : "memory");
  4599. }
  4600. void MergeUVRow_MMI(const uint8_t* src_u,
  4601. const uint8_t* src_v,
  4602. uint8_t* dst_uv,
  4603. int width) {
  4604. uint64_t temp[3];
  4605. __asm__ volatile(
  4606. "1: \n\t"
  4607. "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
  4608. "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
  4609. "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
  4610. "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
  4611. "punpcklbh %[t2], %[t0], %[t1] \n\t"
  4612. "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
  4613. "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
  4614. "punpckhbh %[t2], %[t0], %[t1] \n\t"
  4615. "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
  4616. "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
  4617. "daddiu %[src_u], %[src_u], 8 \n\t"
  4618. "daddiu %[src_v], %[src_v], 8 \n\t"
  4619. "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
  4620. "daddiu %[width], %[width], -8 \n\t"
  4621. "bgtz %[width], 1b \n\t"
  4622. "nop \n\t"
  4623. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
  4624. : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
  4625. [width] "r"(width)
  4626. : "memory");
  4627. }
  4628. void SplitRGBRow_MMI(const uint8_t* src_rgb,
  4629. uint8_t* dst_r,
  4630. uint8_t* dst_g,
  4631. uint8_t* dst_b,
  4632. int width) {
  4633. uint64_t src[4];
  4634. uint64_t dest_hi, dest_lo, dest;
  4635. __asm__ volatile(
  4636. "1: \n\t"
  4637. "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
  4638. "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  4639. "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
  4640. "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
  4641. "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
  4642. "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
  4643. "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
  4644. "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
  4645. "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
  4646. "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
  4647. "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
  4648. "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
  4649. "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
  4650. "punpckhwd %[dest], %[dest], %[dest] \n\t"
  4651. "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
  4652. "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
  4653. "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
  4654. "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
  4655. "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
  4656. "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
  4657. "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
  4658. "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
  4659. "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
  4660. "daddi %[width], %[width], -0x04 \n\t"
  4661. "bnez %[width], 1b \n\t"
  4662. : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
  4663. [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
  4664. [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
  4665. : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
  4666. [dstb_ptr] "r"(dst_b), [width] "r"(width)
  4667. : "memory");
  4668. }
  4669. void MergeRGBRow_MMI(const uint8_t* src_r,
  4670. const uint8_t* src_g,
  4671. const uint8_t* src_b,
  4672. uint8_t* dst_rgb,
  4673. int width) {
  4674. uint64_t srcr, srcg, srcb, dest;
  4675. uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
  4676. const uint64_t temp = 0x0;
  4677. __asm__ volatile(
  4678. "1: \n\t"
  4679. "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
  4680. "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
  4681. "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
  4682. "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
  4683. "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
  4684. "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
  4685. "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
  4686. "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
  4687. "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
  4688. "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
  4689. "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
  4690. "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
  4691. "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  4692. "punpckhwd %[dest], %[dest], %[dest] \n\t"
  4693. "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
  4694. "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
  4695. "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
  4696. "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
  4697. "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
  4698. "punpckhwd %[dest], %[dest], %[dest] \n\t"
  4699. "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
  4700. "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
  4701. "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
  4702. "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  4703. "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
  4704. "punpckhwd %[dest], %[dest], %[dest] \n\t"
  4705. "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
  4706. "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  4707. "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
  4708. "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
  4709. "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
  4710. "punpckhwd %[dest], %[dest], %[dest] \n\t"
  4711. "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
  4712. "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
  4713. "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
  4714. "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
  4715. "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
  4716. "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
  4717. "daddi %[width], %[width], -0x08 \n\t"
  4718. "bnez %[width], 1b \n\t"
  4719. : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
  4720. [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
  4721. [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
  4722. [srcbz_lo] "=&f"(srcbz_lo)
  4723. : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
  4724. [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
  4725. : "memory");
  4726. }
  4727. // Filter 2 rows of YUY2 UV's (422) into U and V (420).
  4728. void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
  4729. int src_stride_yuy2,
  4730. uint8_t* dst_u,
  4731. uint8_t* dst_v,
  4732. int width) {
  4733. uint64_t c0 = 0xff00ff00ff00ff00;
  4734. uint64_t c1 = 0x00ff00ff00ff00ff;
  4735. uint64_t temp[3];
  4736. uint64_t data[4];
  4737. uint64_t shift = 0x08;
  4738. uint64_t src_stride = 0x0;
  4739. __asm__ volatile(
  4740. "1: \n\t"
  4741. "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
  4742. "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
  4743. "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
  4744. "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
  4745. "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
  4746. "pavgb %[t0], %[t0], %[t1] \n\t"
  4747. "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
  4748. "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
  4749. "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
  4750. "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
  4751. "pavgb %[t1], %[t2], %[t1] \n\t"
  4752. "and %[t0], %[t0], %[c0] \n\t"
  4753. "and %[t1], %[t1], %[c0] \n\t"
  4754. "psrlh %[t0], %[t0], %[shift] \n\t"
  4755. "psrlh %[t1], %[t1], %[shift] \n\t"
  4756. "packushb %[t0], %[t0], %[t1] \n\t"
  4757. "mov.s %[t1], %[t0] \n\t"
  4758. "and %[d0], %[t0], %[c1] \n\t"
  4759. "psrlh %[d1], %[t1], %[shift] \n\t"
  4760. "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
  4761. "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
  4762. "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
  4763. "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
  4764. "pavgb %[t0], %[t0], %[t1] \n\t"
  4765. "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
  4766. "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
  4767. "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
  4768. "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
  4769. "pavgb %[t1], %[t2], %[t1] \n\t"
  4770. "and %[t0], %[t0], %[c0] \n\t"
  4771. "and %[t1], %[t1], %[c0] \n\t"
  4772. "psrlh %[t0], %[t0], %[shift] \n\t"
  4773. "psrlh %[t1], %[t1], %[shift] \n\t"
  4774. "packushb %[t0], %[t0], %[t1] \n\t"
  4775. "mov.s %[t1], %[t0] \n\t"
  4776. "and %[d2], %[t0], %[c1] \n\t"
  4777. "psrlh %[d3], %[t1], %[shift] \n\t"
  4778. "packushb %[d0], %[d0], %[d2] \n\t"
  4779. "packushb %[d1], %[d1], %[d3] \n\t"
  4780. "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
  4781. "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
  4782. "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
  4783. "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
  4784. "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
  4785. "daddiu %[dst_u], %[dst_u], 8 \n\t"
  4786. "daddiu %[dst_v], %[dst_v], 8 \n\t"
  4787. "daddiu %[width], %[width], -16 \n\t"
  4788. "bgtz %[width], 1b \n\t"
  4789. "nop \n\t"
  4790. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
  4791. [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
  4792. [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
  4793. : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
  4794. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  4795. [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
  4796. : "memory");
  4797. }
  4798. // Copy row of YUY2 UV's (422) into U and V (422).
  4799. void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
  4800. uint8_t* dst_u,
  4801. uint8_t* dst_v,
  4802. int width) {
  4803. uint64_t c0 = 0xff00ff00ff00ff00;
  4804. uint64_t c1 = 0x00ff00ff00ff00ff;
  4805. uint64_t temp[2];
  4806. uint64_t data[4];
  4807. uint64_t shift = 0x08;
  4808. __asm__ volatile(
  4809. "1: \n\t"
  4810. "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
  4811. "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
  4812. "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
  4813. "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
  4814. "and %[t0], %[t0], %[c0] \n\t"
  4815. "and %[t1], %[t1], %[c0] \n\t"
  4816. "psrlh %[t0], %[t0], %[shift] \n\t"
  4817. "psrlh %[t1], %[t1], %[shift] \n\t"
  4818. "packushb %[t0], %[t0], %[t1] \n\t"
  4819. "mov.s %[t1], %[t0] \n\t"
  4820. "and %[d0], %[t0], %[c1] \n\t"
  4821. "psrlh %[d1], %[t1], %[shift] \n\t"
  4822. "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
  4823. "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
  4824. "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
  4825. "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
  4826. "and %[t0], %[t0], %[c0] \n\t"
  4827. "and %[t1], %[t1], %[c0] \n\t"
  4828. "psrlh %[t0], %[t0], %[shift] \n\t"
  4829. "psrlh %[t1], %[t1], %[shift] \n\t"
  4830. "packushb %[t0], %[t0], %[t1] \n\t"
  4831. "mov.s %[t1], %[t0] \n\t"
  4832. "and %[d2], %[t0], %[c1] \n\t"
  4833. "psrlh %[d3], %[t1], %[shift] \n\t"
  4834. "packushb %[d0], %[d0], %[d2] \n\t"
  4835. "packushb %[d1], %[d1], %[d3] \n\t"
  4836. "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
  4837. "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
  4838. "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
  4839. "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
  4840. "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
  4841. "daddiu %[dst_u], %[dst_u], 8 \n\t"
  4842. "daddiu %[dst_v], %[dst_v], 8 \n\t"
  4843. "daddiu %[width], %[width], -16 \n\t"
  4844. "bgtz %[width], 1b \n\t"
  4845. "nop \n\t"
  4846. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
  4847. [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
  4848. : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
  4849. [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
  4850. : "memory");
  4851. }
  4852. // Copy row of YUY2 Y's (422) into Y (420/422).
  4853. void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  4854. uint64_t c0 = 0x00ff00ff00ff00ff;
  4855. uint64_t temp[2];
  4856. __asm__ volatile(
  4857. "1: \n\t"
  4858. "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
  4859. "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
  4860. "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
  4861. "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
  4862. "and %[t0], %[t0], %[c0] \n\t"
  4863. "and %[t1], %[t1], %[c0] \n\t"
  4864. "packushb %[t0], %[t0], %[t1] \n\t"
  4865. "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
  4866. "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
  4867. "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
  4868. "daddiu %[dst_y], %[dst_y], 8 \n\t"
  4869. "daddiu %[width], %[width], -8 \n\t"
  4870. "bgtz %[width], 1b \n\t"
  4871. "nop \n\t"
  4872. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
  4873. : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
  4874. [c0] "f"(c0)
  4875. : "memory");
  4876. }
  4877. // Filter 2 rows of UYVY UV's (422) into U and V (420).
  4878. void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
  4879. int src_stride_uyvy,
  4880. uint8_t* dst_u,
  4881. uint8_t* dst_v,
  4882. int width) {
  4883. // Output a row of UV values.
  4884. uint64_t c0 = 0x00ff00ff00ff00ff;
  4885. uint64_t temp[3];
  4886. uint64_t data[4];
  4887. uint64_t shift = 0x08;
  4888. uint64_t src_stride = 0x0;
  4889. __asm__ volatile(
  4890. "1: \n\t"
  4891. "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
  4892. "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
  4893. "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
  4894. "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
  4895. "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
  4896. "pavgb %[t0], %[t0], %[t1] \n\t"
  4897. "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
  4898. "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
  4899. "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
  4900. "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
  4901. "pavgb %[t1], %[t2], %[t1] \n\t"
  4902. "and %[t0], %[t0], %[c0] \n\t"
  4903. "and %[t1], %[t1], %[c0] \n\t"
  4904. "packushb %[t0], %[t0], %[t1] \n\t"
  4905. "mov.s %[t1], %[t0] \n\t"
  4906. "and %[d0], %[t0], %[c0] \n\t"
  4907. "psrlh %[d1], %[t1], %[shift] \n\t"
  4908. "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
  4909. "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
  4910. "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
  4911. "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
  4912. "pavgb %[t0], %[t0], %[t1] \n\t"
  4913. "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
  4914. "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
  4915. "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
  4916. "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
  4917. "pavgb %[t1], %[t2], %[t1] \n\t"
  4918. "and %[t0], %[t0], %[c0] \n\t"
  4919. "and %[t1], %[t1], %[c0] \n\t"
  4920. "packushb %[t0], %[t0], %[t1] \n\t"
  4921. "mov.s %[t1], %[t0] \n\t"
  4922. "and %[d2], %[t0], %[c0] \n\t"
  4923. "psrlh %[d3], %[t1], %[shift] \n\t"
  4924. "packushb %[d0], %[d0], %[d2] \n\t"
  4925. "packushb %[d1], %[d1], %[d3] \n\t"
  4926. "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
  4927. "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
  4928. "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
  4929. "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
  4930. "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
  4931. "daddiu %[dst_u], %[dst_u], 8 \n\t"
  4932. "daddiu %[dst_v], %[dst_v], 8 \n\t"
  4933. "daddiu %[width], %[width], -16 \n\t"
  4934. "bgtz %[width], 1b \n\t"
  4935. "nop \n\t"
  4936. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
  4937. [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
  4938. [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
  4939. : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
  4940. [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
  4941. [c0] "f"(c0), [shift] "f"(shift)
  4942. : "memory");
  4943. }
  4944. // Copy row of UYVY UV's (422) into U and V (422).
  4945. void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
  4946. uint8_t* dst_u,
  4947. uint8_t* dst_v,
  4948. int width) {
  4949. // Output a row of UV values.
  4950. uint64_t c0 = 0x00ff00ff00ff00ff;
  4951. uint64_t temp[2];
  4952. uint64_t data[4];
  4953. uint64_t shift = 0x08;
  4954. __asm__ volatile(
  4955. "1: \n\t"
  4956. "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
  4957. "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
  4958. "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
  4959. "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
  4960. "and %[t0], %[t0], %[c0] \n\t"
  4961. "and %[t1], %[t1], %[c0] \n\t"
  4962. "packushb %[t0], %[t0], %[t1] \n\t"
  4963. "mov.s %[t1], %[t0] \n\t"
  4964. "and %[d0], %[t0], %[c0] \n\t"
  4965. "psrlh %[d1], %[t1], %[shift] \n\t"
  4966. "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
  4967. "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
  4968. "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
  4969. "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
  4970. "and %[t0], %[t0], %[c0] \n\t"
  4971. "and %[t1], %[t1], %[c0] \n\t"
  4972. "packushb %[t0], %[t0], %[t1] \n\t"
  4973. "mov.s %[t1], %[t0] \n\t"
  4974. "and %[d2], %[t0], %[c0] \n\t"
  4975. "psrlh %[d3], %[t1], %[shift] \n\t"
  4976. "packushb %[d0], %[d0], %[d2] \n\t"
  4977. "packushb %[d1], %[d1], %[d3] \n\t"
  4978. "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
  4979. "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
  4980. "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
  4981. "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
  4982. "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
  4983. "daddiu %[dst_u], %[dst_u], 8 \n\t"
  4984. "daddiu %[dst_v], %[dst_v], 8 \n\t"
  4985. "daddiu %[width], %[width], -16 \n\t"
  4986. "bgtz %[width], 1b \n\t"
  4987. "nop \n\t"
  4988. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
  4989. [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
  4990. : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
  4991. [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
  4992. : "memory");
  4993. }
  4994. // Copy row of UYVY Y's (422) into Y (420/422).
  4995. void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  4996. // Output a row of Y values.
  4997. uint64_t c0 = 0x00ff00ff00ff00ff;
  4998. uint64_t shift = 0x08;
  4999. uint64_t temp[2];
  5000. __asm__ volatile(
  5001. "1: \n\t"
  5002. "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
  5003. "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
  5004. "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
  5005. "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
  5006. "dsrl %[t0], %[t0], %[shift] \n\t"
  5007. "dsrl %[t1], %[t1], %[shift] \n\t"
  5008. "and %[t0], %[t0], %[c0] \n\t"
  5009. "and %[t1], %[t1], %[c0] \n\t"
  5010. "and %[t1], %[t1], %[c0] \n\t"
  5011. "packushb %[t0], %[t0], %[t1] \n\t"
  5012. "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
  5013. "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
  5014. "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
  5015. "daddiu %[dst_y], %[dst_y], 8 \n\t"
  5016. "daddiu %[width], %[width], -8 \n\t"
  5017. "bgtz %[width], 1b \n\t"
  5018. "nop \n\t"
  5019. : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
  5020. : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
  5021. [c0] "f"(c0), [shift] "f"(shift)
  5022. : "memory");
  5023. }
  5024. // Blend src_argb0 over src_argb1 and store to dst_argb.
  5025. // dst_argb may be src_argb0 or src_argb1.
  5026. // This code mimics the SSSE3 version for better testability.
  5027. void ARGBBlendRow_MMI(const uint8_t* src_argb0,
  5028. const uint8_t* src_argb1,
  5029. uint8_t* dst_argb,
  5030. int width) {
  5031. uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
  5032. dest_lo;
  5033. const uint64_t mask0 = 0x0;
  5034. const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
  5035. const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
  5036. const uint64_t mask3 = 0xFF;
  5037. const uint64_t mask4 = ~mask1;
  5038. const uint64_t shift = 0x08;
  5039. __asm__ volatile(
  5040. "1: \n\t"
  5041. "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
  5042. "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  5043. "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
  5044. "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
  5045. "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  5046. "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
  5047. "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
  5048. "pshufh %[alpha], %[alpha], %[mask3] \n\t"
  5049. "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
  5050. "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
  5051. "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
  5052. "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
  5053. "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
  5054. "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
  5055. "pshufh %[alpha], %[alpha], %[mask3] \n\t"
  5056. "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
  5057. "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
  5058. "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
  5059. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  5060. "and %[dest], %[dest], %[mask1] \n\t"
  5061. "or %[dest], %[dest], %[mask4] \n\t"
  5062. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5063. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5064. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
  5065. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
  5066. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  5067. "daddi %[width], %[width], -0x02 \n\t"
  5068. "bnez %[width], 1b \n\t"
  5069. : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
  5070. [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
  5071. [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
  5072. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
  5073. : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
  5074. [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
  5075. [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
  5076. [shift] "f"(shift), [width] "r"(width)
  5077. : "memory");
  5078. }
  5079. void BlendPlaneRow_MMI(const uint8_t* src0,
  5080. const uint8_t* src1,
  5081. const uint8_t* alpha,
  5082. uint8_t* dst,
  5083. int width) {
  5084. uint64_t source0, source1, dest, alph;
  5085. uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
  5086. dest_lo;
  5087. uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
  5088. const uint64_t mask0 = 0x0;
  5089. const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
  5090. const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
  5091. const uint64_t shift = 0x08;
  5092. __asm__ volatile(
  5093. "1: \n\t"
  5094. "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
  5095. "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  5096. "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
  5097. "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
  5098. "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
  5099. "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  5100. "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
  5101. "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
  5102. "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
  5103. "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
  5104. "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
  5105. "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
  5106. "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
  5107. "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
  5108. "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
  5109. "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
  5110. "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
  5111. "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
  5112. "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
  5113. "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
  5114. "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
  5115. "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
  5116. "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
  5117. "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
  5118. "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
  5119. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  5120. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5121. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5122. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
  5123. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
  5124. "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
  5125. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  5126. "daddi %[width], %[width], -0x08 \n\t"
  5127. "bnez %[width], 1b \n\t"
  5128. : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
  5129. [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
  5130. [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
  5131. [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
  5132. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  5133. [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
  5134. [alpha_r] "=&f"(alpha_rev)
  5135. : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
  5136. [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
  5137. [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
  5138. : "memory");
  5139. }
  5140. // Multiply source RGB by alpha and store to destination.
  5141. // This code mimics the SSSE3 version for better testability.
  5142. void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
  5143. uint8_t* dst_argb,
  5144. int width) {
  5145. uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
  5146. const uint64_t mask0 = 0xFF;
  5147. const uint64_t mask1 = 0xFF000000FF000000ULL;
  5148. const uint64_t mask2 = ~mask1;
  5149. const uint64_t shift = 0x08;
  5150. __asm__ volatile(
  5151. "1: \n\t"
  5152. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  5153. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  5154. "punpcklbh %[src_lo], %[src], %[src] \n\t"
  5155. "punpckhbh %[src_hi], %[src], %[src] \n\t"
  5156. "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
  5157. "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
  5158. "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
  5159. "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
  5160. "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
  5161. "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
  5162. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  5163. "and %[dest], %[dest], %[mask2] \n\t"
  5164. "and %[src], %[src], %[mask1] \n\t"
  5165. "or %[dest], %[dest], %[src] \n\t"
  5166. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5167. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5168. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  5169. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  5170. "daddi %[width], %[width], -0x02 \n\t"
  5171. "bnez %[width], 1b \n\t"
  5172. : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
  5173. [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
  5174. [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
  5175. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
  5176. [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
  5177. [width] "r"(width)
  5178. : "memory");
  5179. }
  5180. void ComputeCumulativeSumRow_MMI(const uint8_t* row,
  5181. int32_t* cumsum,
  5182. const int32_t* previous_cumsum,
  5183. int width) {
  5184. int64_t row_sum[2] = {0, 0};
  5185. uint64_t src, dest0, dest1, presrc0, presrc1, dest;
  5186. const uint64_t mask = 0x0;
  5187. __asm__ volatile(
  5188. "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
  5189. "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
  5190. "1: \n\t"
  5191. "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
  5192. "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
  5193. "punpcklbh %[src], %[src], %[mask] \n\t"
  5194. "punpcklhw %[dest0], %[src], %[mask] \n\t"
  5195. "punpckhhw %[dest1], %[src], %[mask] \n\t"
  5196. "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
  5197. "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
  5198. "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
  5199. "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
  5200. "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
  5201. "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
  5202. "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
  5203. "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
  5204. "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
  5205. "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
  5206. "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
  5207. "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
  5208. "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
  5209. "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
  5210. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  5211. "daddi %[width], %[width], -0x01 \n\t"
  5212. "bnez %[width], 1b \n\t"
  5213. : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
  5214. [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
  5215. [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
  5216. [presrc1] "=&f"(presrc1)
  5217. : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
  5218. [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
  5219. : "memory");
  5220. }
  5221. // C version 2x2 -> 2x1.
  5222. void InterpolateRow_MMI(uint8_t* dst_ptr,
  5223. const uint8_t* src_ptr,
  5224. ptrdiff_t src_stride,
  5225. int width,
  5226. int source_y_fraction) {
  5227. if (source_y_fraction == 0) {
  5228. __asm__ volatile(
  5229. "1: \n\t"
  5230. "ld $t0, 0x0(%[src_ptr]) \n\t"
  5231. "sd $t0, 0x0(%[dst_ptr]) \n\t"
  5232. "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
  5233. "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
  5234. "daddiu %[width], %[width], -8 \n\t"
  5235. "bgtz %[width], 1b \n\t"
  5236. "nop \n\t"
  5237. :
  5238. : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
  5239. : "memory");
  5240. return;
  5241. }
  5242. if (source_y_fraction == 128) {
  5243. uint64_t uv = 0x0;
  5244. uint64_t uv_stride = 0x0;
  5245. __asm__ volatile(
  5246. "1: \n\t"
  5247. "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
  5248. "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
  5249. "daddu $t0, %[src_ptr], %[stride] \n\t"
  5250. "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
  5251. "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
  5252. "pavgb %[uv], %[uv], %[uv_stride] \n\t"
  5253. "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
  5254. "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
  5255. "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
  5256. "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
  5257. "daddiu %[width], %[width], -8 \n\t"
  5258. "bgtz %[width], 1b \n\t"
  5259. "nop \n\t"
  5260. : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
  5261. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
  5262. [stride] "r"((int64_t)src_stride)
  5263. : "memory");
  5264. return;
  5265. }
  5266. const uint8_t* src_ptr1 = src_ptr + src_stride;
  5267. uint64_t temp;
  5268. uint64_t data[4];
  5269. uint64_t zero = 0x0;
  5270. uint64_t c0 = 0x0080008000800080;
  5271. uint64_t fy0 = 0x0100010001000100;
  5272. uint64_t shift = 0x8;
  5273. __asm__ volatile(
  5274. "pshufh %[fy1], %[fy1], %[zero] \n\t"
  5275. "psubh %[fy0], %[fy0], %[fy1] \n\t"
  5276. "1: \n\t"
  5277. "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
  5278. "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
  5279. "punpcklbh %[d0], %[t0], %[zero] \n\t"
  5280. "punpckhbh %[d1], %[t0], %[zero] \n\t"
  5281. "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
  5282. "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
  5283. "punpcklbh %[d2], %[t0], %[zero] \n\t"
  5284. "punpckhbh %[d3], %[t0], %[zero] \n\t"
  5285. "pmullh %[d0], %[d0], %[fy0] \n\t"
  5286. "pmullh %[d2], %[d2], %[fy1] \n\t"
  5287. "paddh %[d0], %[d0], %[d2] \n\t"
  5288. "paddh %[d0], %[d0], %[c0] \n\t"
  5289. "psrlh %[d0], %[d0], %[shift] \n\t"
  5290. "pmullh %[d1], %[d1], %[fy0] \n\t"
  5291. "pmullh %[d3], %[d3], %[fy1] \n\t"
  5292. "paddh %[d1], %[d1], %[d3] \n\t"
  5293. "paddh %[d1], %[d1], %[c0] \n\t"
  5294. "psrlh %[d1], %[d1], %[shift] \n\t"
  5295. "packushb %[d0], %[d0], %[d1] \n\t"
  5296. "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
  5297. "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
  5298. "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
  5299. "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
  5300. "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
  5301. "daddiu %[width], %[width], -8 \n\t"
  5302. "bgtz %[width], 1b \n\t"
  5303. "nop \n\t"
  5304. : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
  5305. [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
  5306. : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
  5307. [dst_ptr] "r"(dst_ptr), [width] "r"(width),
  5308. [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
  5309. [shift] "f"(shift), [zero] "f"(zero)
  5310. : "memory");
  5311. }
  5312. // Use first 4 shuffler values to reorder ARGB channels.
  5313. void ARGBShuffleRow_MMI(const uint8_t* src_argb,
  5314. uint8_t* dst_argb,
  5315. const uint8_t* shuffler,
  5316. int width) {
  5317. uint64_t source, dest0, dest1, dest;
  5318. const uint64_t mask0 = 0x0;
  5319. const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
  5320. ((shuffler[2] & 0x03) << 4) |
  5321. ((shuffler[3] & 0x03) << 6);
  5322. __asm__ volatile(
  5323. "1: \n\t"
  5324. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  5325. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  5326. "punpcklbh %[dest0], %[src], %[mask0] \n\t"
  5327. "pshufh %[dest0], %[dest0], %[mask1] \n\t"
  5328. "punpckhbh %[dest1], %[src], %[mask0] \n\t"
  5329. "pshufh %[dest1], %[dest1], %[mask1] \n\t"
  5330. "packushb %[dest], %[dest0], %[dest1] \n\t"
  5331. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5332. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5333. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  5334. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  5335. "daddi %[width], %[width], -0x02 \n\t"
  5336. "bnez %[width], 1b \n\t"
  5337. : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
  5338. [dest1] "=&f"(dest1)
  5339. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
  5340. [mask1] "f"(mask1), [width] "r"(width)
  5341. : "memory");
  5342. }
  5343. void I422ToYUY2Row_MMI(const uint8_t* src_y,
  5344. const uint8_t* src_u,
  5345. const uint8_t* src_v,
  5346. uint8_t* dst_frame,
  5347. int width) {
  5348. uint64_t temp[3];
  5349. uint64_t vu = 0x0;
  5350. __asm__ volatile(
  5351. "1: \n\t"
  5352. "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
  5353. "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
  5354. "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
  5355. "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
  5356. "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
  5357. "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
  5358. "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
  5359. "punpcklbh %[tu], %[ty], %[vu] \n\t" // g
  5360. "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
  5361. "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
  5362. "punpckhbh %[tu], %[ty], %[vu] \n\t" // g
  5363. "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
  5364. "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
  5365. "daddiu %[src_y], %[src_y], 8 \n\t"
  5366. "daddiu %[src_u], %[src_u], 4 \n\t"
  5367. "daddiu %[src_v], %[src_v], 4 \n\t"
  5368. "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
  5369. "daddiu %[width], %[width], -8 \n\t"
  5370. "bgtz %[width], 1b \n\t"
  5371. "nop \n\t"
  5372. : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
  5373. [vu] "=&f"(vu)
  5374. : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
  5375. [dst_frame] "r"(dst_frame), [width] "r"(width)
  5376. : "memory");
  5377. }
  5378. void I422ToUYVYRow_MMI(const uint8_t* src_y,
  5379. const uint8_t* src_u,
  5380. const uint8_t* src_v,
  5381. uint8_t* dst_frame,
  5382. int width) {
  5383. uint64_t temp[3];
  5384. uint64_t vu = 0x0;
  5385. __asm__ volatile(
  5386. "1: \n\t"
  5387. "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
  5388. "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
  5389. "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
  5390. "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
  5391. "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
  5392. "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
  5393. "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
  5394. "punpcklbh %[tu], %[vu], %[ty] \n\t" // g
  5395. "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
  5396. "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
  5397. "punpckhbh %[tu], %[vu], %[ty] \n\t" // g
  5398. "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
  5399. "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
  5400. "daddiu %[src_y], %[src_y], 8 \n\t"
  5401. "daddiu %[src_u], %[src_u], 4 \n\t"
  5402. "daddiu %[src_v], %[src_v], 4 \n\t"
  5403. "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
  5404. "daddiu %[width], %[width], -8 \n\t"
  5405. "bgtz %[width], 1b \n\t"
  5406. "nop \n\t"
  5407. : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
  5408. [vu] "=&f"(vu)
  5409. : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
  5410. [dst_frame] "r"(dst_frame), [width] "r"(width)
  5411. : "memory");
  5412. }
  5413. void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  5414. uint64_t source, dest;
  5415. const uint64_t mask0 = 0xff000000ff000000ULL;
  5416. const uint64_t mask1 = ~mask0;
  5417. __asm__ volatile(
  5418. "1: \n\t"
  5419. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  5420. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  5421. "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5422. "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5423. "and %[src], %[src], %[mask0] \n\t"
  5424. "and %[dest], %[dest], %[mask1] \n\t"
  5425. "or %[dest], %[src], %[dest] \n\t"
  5426. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5427. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5428. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  5429. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  5430. "daddi %[width], %[width], -0x02 \n\t"
  5431. "bnez %[width], 1b \n\t"
  5432. : [src] "=&f"(source), [dest] "=&f"(dest)
  5433. : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
  5434. [mask1] "f"(mask1), [width] "r"(width)
  5435. : "memory");
  5436. }
  5437. void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
  5438. uint8_t* dst_a,
  5439. int width) {
  5440. uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
  5441. const uint64_t mask = 0xff000000ff000000ULL;
  5442. const uint64_t shift = 0x18;
  5443. __asm__ volatile(
  5444. "1: \n\t"
  5445. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  5446. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  5447. "and %[dest0], %[src], %[mask] \n\t"
  5448. "psrlw %[dest0], %[dest0], %[shift] \n\t"
  5449. "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
  5450. "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
  5451. "and %[dest1], %[src], %[mask] \n\t"
  5452. "psrlw %[dest1], %[dest1], %[shift] \n\t"
  5453. "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
  5454. "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
  5455. "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
  5456. "and %[dest0], %[src], %[mask] \n\t"
  5457. "psrlw %[dest0], %[dest0], %[shift] \n\t"
  5458. "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
  5459. "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
  5460. "and %[dest1], %[src], %[mask] \n\t"
  5461. "psrlw %[dest1], %[dest1], %[shift] \n\t"
  5462. "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
  5463. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  5464. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5465. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5466. "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
  5467. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  5468. "daddi %[width], %[width], -0x08 \n\t"
  5469. "bnez %[width], 1b \n\t"
  5470. : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
  5471. [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
  5472. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
  5473. [shift] "f"(shift), [width] "r"(width)
  5474. : "memory");
  5475. }
  5476. void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  5477. uint64_t source, dest0, dest1, dest;
  5478. const uint64_t mask0 = 0x0;
  5479. const uint64_t mask1 = 0x00ffffff00ffffffULL;
  5480. __asm__ volatile(
  5481. "1: \n\t"
  5482. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  5483. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  5484. "punpcklbh %[dest0], %[mask0], %[src] \n\t"
  5485. "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
  5486. "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5487. "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5488. "and %[dest], %[dest], %[mask1] \n\t"
  5489. "or %[dest], %[dest], %[dest1] \n\t"
  5490. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  5491. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  5492. "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
  5493. "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  5494. "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  5495. "and %[dest], %[dest], %[mask1] \n\t"
  5496. "or %[dest], %[dest], %[dest1] \n\t"
  5497. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  5498. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  5499. "punpckhbh %[dest0], %[mask0], %[src] \n\t"
  5500. "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
  5501. "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
  5502. "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
  5503. "and %[dest], %[dest], %[mask1] \n\t"
  5504. "or %[dest], %[dest], %[dest1] \n\t"
  5505. "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
  5506. "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
  5507. "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
  5508. "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
  5509. "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
  5510. "and %[dest], %[dest], %[mask1] \n\t"
  5511. "or %[dest], %[dest], %[dest1] \n\t"
  5512. "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
  5513. "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
  5514. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  5515. "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
  5516. "daddi %[width], %[width], -0x08 \n\t"
  5517. "bnez %[width], 1b \n\t"
  5518. : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
  5519. [dest1] "=&f"(dest1)
  5520. : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
  5521. [mask1] "f"(mask1), [width] "r"(width)
  5522. : "memory");
  5523. }
  5524. #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
  5525. #ifdef __cplusplus
  5526. } // extern "C"
  5527. } // namespace libyuv
  5528. #endif