// rotate_mmi.cc — Loongson MMI row-transpose functions for libyuv rotate.
  1. /*
  2. * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/rotate_row.h"
  11. #include "libyuv/row.h"
  12. #ifdef __cplusplus
  13. namespace libyuv {
  14. extern "C" {
  15. #endif
  16. // This module is for Mips MMI.
  17. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
  18. void TransposeWx8_MMI(const uint8_t* src,
  19. int src_stride,
  20. uint8_t* dst,
  21. int dst_stride,
  22. int width) {
  23. uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  24. uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
  25. uint8_t* src_tmp = nullptr;
  26. __asm__ volatile(
  27. "1: \n\t"
  28. "ldc1 %[tmp12], 0x00(%[src]) \n\t"
  29. "dadd %[src_tmp], %[src], %[src_stride] \n\t"
  30. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  31. /* tmp0 = (00 10 01 11 02 12 03 13) */
  32. "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
  33. /* tmp1 = (04 14 05 15 06 16 07 17) */
  34. "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
  35. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  36. "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
  37. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  38. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  39. /* tmp2 = (20 30 21 31 22 32 23 33) */
  40. "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
  41. /* tmp3 = (24 34 25 35 26 36 27 37) */
  42. "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
  43. /* tmp4 = (00 10 20 30 01 11 21 31) */
  44. "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
  45. /* tmp5 = (02 12 22 32 03 13 23 33) */
  46. "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
  47. /* tmp6 = (04 14 24 34 05 15 25 35) */
  48. "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
  49. /* tmp7 = (06 16 26 36 07 17 27 37) */
  50. "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
  51. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  52. "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
  53. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  54. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  55. /* tmp0 = (40 50 41 51 42 52 43 53) */
  56. "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
  57. /* tmp1 = (44 54 45 55 46 56 47 57) */
  58. "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
  59. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  60. "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
  61. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  62. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  63. /* tmp2 = (60 70 61 71 62 72 63 73) */
  64. "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
  65. /* tmp3 = (64 74 65 75 66 76 67 77) */
  66. "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
  67. /* tmp8 = (40 50 60 70 41 51 61 71) */
  68. "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
  69. /* tmp9 = (42 52 62 72 43 53 63 73) */
  70. "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
  71. /* tmp10 = (44 54 64 74 45 55 65 75) */
  72. "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
  73. /* tmp11 = (46 56 66 76 47 57 67 77) */
  74. "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
  75. /* tmp0 = (00 10 20 30 40 50 60 70) */
  76. "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
  77. /* tmp1 = (01 11 21 31 41 51 61 71) */
  78. "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
  79. "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
  80. "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
  81. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  82. "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
  83. "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
  84. /* tmp0 = (02 12 22 32 42 52 62 72) */
  85. "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
  86. /* tmp1 = (03 13 23 33 43 53 63 73) */
  87. "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
  88. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  89. "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
  90. "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
  91. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  92. "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
  93. "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
  94. /* tmp0 = (04 14 24 34 44 54 64 74) */
  95. "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
  96. /* tmp1 = (05 15 25 35 45 55 65 75) */
  97. "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
  98. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  99. "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
  100. "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
  101. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  102. "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
  103. "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
  104. /* tmp0 = (06 16 26 36 46 56 66 76) */
  105. "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
  106. /* tmp1 = (07 17 27 37 47 57 67 77) */
  107. "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
  108. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  109. "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
  110. "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
  111. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  112. "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
  113. "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
  114. "dadd %[dst], %[dst], %[dst_stride] \n\t"
  115. "daddi %[src], %[src], 0x08 \n\t"
  116. "daddi %[width], %[width], -0x08 \n\t"
  117. "bnez %[width], 1b \n\t"
  118. : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
  119. [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
  120. [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
  121. [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
  122. [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
  123. [src_tmp] "+&r"(src_tmp)
  124. : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
  125. [dst_stride] "r"(dst_stride)
  126. : "memory");
  127. }
  128. void TransposeUVWx8_MMI(const uint8_t* src,
  129. int src_stride,
  130. uint8_t* dst_a,
  131. int dst_stride_a,
  132. uint8_t* dst_b,
  133. int dst_stride_b,
  134. int width) {
  135. uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  136. uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
  137. uint8_t* src_tmp = nullptr;
  138. __asm__ volatile(
  139. "1: \n\t"
  140. /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
  141. "ldc1 %[tmp12], 0x00(%[src]) \n\t"
  142. "dadd %[src_tmp], %[src], %[src_stride] \n\t"
  143. /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
  144. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  145. /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
  146. "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
  147. /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
  148. "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
  149. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  150. /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
  151. "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
  152. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  153. /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
  154. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  155. /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
  156. "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
  157. /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
  158. "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
  159. /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
  160. "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
  161. /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
  162. "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
  163. /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
  164. "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
  165. /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
  166. "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
  167. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  168. /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
  169. "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
  170. /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
  171. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  172. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  173. /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
  174. "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
  175. /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
  176. "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
  177. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  178. /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
  179. "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
  180. /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
  181. "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
  182. "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
  183. /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
  184. "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
  185. /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
  186. "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
  187. /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
  188. "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
  189. /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
  190. "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
  191. /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
  192. "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
  193. /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
  194. "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
  195. /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
  196. "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
  197. /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
  198. "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
  199. "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
  200. "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
  201. "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
  202. "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
  203. /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
  204. "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
  205. /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
  206. "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
  207. "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
  208. "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
  209. "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
  210. "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
  211. "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
  212. "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
  213. /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
  214. "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
  215. /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
  216. "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
  217. "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
  218. "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
  219. "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
  220. "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
  221. "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
  222. "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
  223. /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
  224. "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
  225. /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
  226. "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
  227. "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
  228. "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
  229. "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
  230. "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
  231. "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
  232. "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
  233. "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
  234. "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
  235. "daddiu %[src], %[src], 0x08 \n\t"
  236. "daddi %[width], %[width], -0x04 \n\t"
  237. "bnez %[width], 1b \n\t"
  238. : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
  239. [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
  240. [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
  241. [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
  242. [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
  243. [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
  244. : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
  245. [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
  246. : "memory");
  247. }
  248. #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
  249. #ifdef __cplusplus
  250. } // extern "C"
  251. } // namespace libyuv
  252. #endif