2
0

rotate_mips.cc 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. /*
  2. * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #include "libyuv/rotate_row.h"
  12. #include "libyuv/basic_types.h"
  13. #ifdef __cplusplus
  14. namespace libyuv {
  15. extern "C" {
  16. #endif
  17. #if !defined(LIBYUV_DISABLE_MIPS) && \
  18. defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
  19. (_MIPS_SIM == _MIPS_SIM_ABI32)
  20. void TransposeWx8_DSPR2(const uint8* src, int src_stride,
  21. uint8* dst, int dst_stride, int width) {
  22. __asm__ __volatile__ (
  23. ".set push \n"
  24. ".set noreorder \n"
  25. "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
  26. "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
  27. "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
  28. "addu $t3, $t2, %[src_stride] \n"
  29. "addu $t5, $t4, %[src_stride] \n"
  30. "addu $t6, $t2, $t4 \n"
  31. "andi $t0, %[dst], 0x3 \n"
  32. "andi $t1, %[dst_stride], 0x3 \n"
  33. "or $t0, $t0, $t1 \n"
  34. "bnez $t0, 11f \n"
  35. " subu $t7, $t9, %[src_stride] \n"
  36. //dst + dst_stride word aligned
  37. "1: \n"
  38. "lbu $t0, 0(%[src]) \n"
  39. "lbux $t1, %[src_stride](%[src]) \n"
  40. "lbux $t8, $t2(%[src]) \n"
  41. "lbux $t9, $t3(%[src]) \n"
  42. "sll $t1, $t1, 16 \n"
  43. "sll $t9, $t9, 16 \n"
  44. "or $t0, $t0, $t1 \n"
  45. "or $t8, $t8, $t9 \n"
  46. "precr.qb.ph $s0, $t8, $t0 \n"
  47. "lbux $t0, $t4(%[src]) \n"
  48. "lbux $t1, $t5(%[src]) \n"
  49. "lbux $t8, $t6(%[src]) \n"
  50. "lbux $t9, $t7(%[src]) \n"
  51. "sll $t1, $t1, 16 \n"
  52. "sll $t9, $t9, 16 \n"
  53. "or $t0, $t0, $t1 \n"
  54. "or $t8, $t8, $t9 \n"
  55. "precr.qb.ph $s1, $t8, $t0 \n"
  56. "sw $s0, 0(%[dst]) \n"
  57. "addiu %[width], -1 \n"
  58. "addiu %[src], 1 \n"
  59. "sw $s1, 4(%[dst]) \n"
  60. "bnez %[width], 1b \n"
  61. " addu %[dst], %[dst], %[dst_stride] \n"
  62. "b 2f \n"
  63. //dst + dst_stride unaligned
  64. "11: \n"
  65. "lbu $t0, 0(%[src]) \n"
  66. "lbux $t1, %[src_stride](%[src]) \n"
  67. "lbux $t8, $t2(%[src]) \n"
  68. "lbux $t9, $t3(%[src]) \n"
  69. "sll $t1, $t1, 16 \n"
  70. "sll $t9, $t9, 16 \n"
  71. "or $t0, $t0, $t1 \n"
  72. "or $t8, $t8, $t9 \n"
  73. "precr.qb.ph $s0, $t8, $t0 \n"
  74. "lbux $t0, $t4(%[src]) \n"
  75. "lbux $t1, $t5(%[src]) \n"
  76. "lbux $t8, $t6(%[src]) \n"
  77. "lbux $t9, $t7(%[src]) \n"
  78. "sll $t1, $t1, 16 \n"
  79. "sll $t9, $t9, 16 \n"
  80. "or $t0, $t0, $t1 \n"
  81. "or $t8, $t8, $t9 \n"
  82. "precr.qb.ph $s1, $t8, $t0 \n"
  83. "swr $s0, 0(%[dst]) \n"
  84. "swl $s0, 3(%[dst]) \n"
  85. "addiu %[width], -1 \n"
  86. "addiu %[src], 1 \n"
  87. "swr $s1, 4(%[dst]) \n"
  88. "swl $s1, 7(%[dst]) \n"
  89. "bnez %[width], 11b \n"
  90. "addu %[dst], %[dst], %[dst_stride] \n"
  91. "2: \n"
  92. ".set pop \n"
  93. :[src] "+r" (src),
  94. [dst] "+r" (dst),
  95. [width] "+r" (width)
  96. :[src_stride] "r" (src_stride),
  97. [dst_stride] "r" (dst_stride)
  98. : "t0", "t1", "t2", "t3", "t4", "t5",
  99. "t6", "t7", "t8", "t9",
  100. "s0", "s1"
  101. );
  102. }
  103. void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
  104. uint8* dst, int dst_stride, int width) {
  105. __asm__ __volatile__ (
  106. ".set noat \n"
  107. ".set push \n"
  108. ".set noreorder \n"
  109. "beqz %[width], 2f \n"
  110. " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
  111. "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
  112. "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
  113. "addu $t3, $t2, %[src_stride] \n"
  114. "addu $t5, $t4, %[src_stride] \n"
  115. "addu $t6, $t2, $t4 \n"
  116. "srl $AT, %[width], 0x2 \n"
  117. "andi $t0, %[dst], 0x3 \n"
  118. "andi $t1, %[dst_stride], 0x3 \n"
  119. "or $t0, $t0, $t1 \n"
  120. "bnez $t0, 11f \n"
  121. " subu $t7, $t9, %[src_stride] \n"
  122. //dst + dst_stride word aligned
  123. "1: \n"
  124. "lw $t0, 0(%[src]) \n"
  125. "lwx $t1, %[src_stride](%[src]) \n"
  126. "lwx $t8, $t2(%[src]) \n"
  127. "lwx $t9, $t3(%[src]) \n"
  128. // t0 = | 30 | 20 | 10 | 00 |
  129. // t1 = | 31 | 21 | 11 | 01 |
  130. // t8 = | 32 | 22 | 12 | 02 |
  131. // t9 = | 33 | 23 | 13 | 03 |
  132. "precr.qb.ph $s0, $t1, $t0 \n"
  133. "precr.qb.ph $s1, $t9, $t8 \n"
  134. "precrq.qb.ph $s2, $t1, $t0 \n"
  135. "precrq.qb.ph $s3, $t9, $t8 \n"
  136. // s0 = | 21 | 01 | 20 | 00 |
  137. // s1 = | 23 | 03 | 22 | 02 |
  138. // s2 = | 31 | 11 | 30 | 10 |
  139. // s3 = | 33 | 13 | 32 | 12 |
  140. "precr.qb.ph $s4, $s1, $s0 \n"
  141. "precrq.qb.ph $s5, $s1, $s0 \n"
  142. "precr.qb.ph $s6, $s3, $s2 \n"
  143. "precrq.qb.ph $s7, $s3, $s2 \n"
  144. // s4 = | 03 | 02 | 01 | 00 |
  145. // s5 = | 23 | 22 | 21 | 20 |
  146. // s6 = | 13 | 12 | 11 | 10 |
  147. // s7 = | 33 | 32 | 31 | 30 |
  148. "lwx $t0, $t4(%[src]) \n"
  149. "lwx $t1, $t5(%[src]) \n"
  150. "lwx $t8, $t6(%[src]) \n"
  151. "lwx $t9, $t7(%[src]) \n"
  152. // t0 = | 34 | 24 | 14 | 04 |
  153. // t1 = | 35 | 25 | 15 | 05 |
  154. // t8 = | 36 | 26 | 16 | 06 |
  155. // t9 = | 37 | 27 | 17 | 07 |
  156. "precr.qb.ph $s0, $t1, $t0 \n"
  157. "precr.qb.ph $s1, $t9, $t8 \n"
  158. "precrq.qb.ph $s2, $t1, $t0 \n"
  159. "precrq.qb.ph $s3, $t9, $t8 \n"
  160. // s0 = | 25 | 05 | 24 | 04 |
  161. // s1 = | 27 | 07 | 26 | 06 |
  162. // s2 = | 35 | 15 | 34 | 14 |
  163. // s3 = | 37 | 17 | 36 | 16 |
  164. "precr.qb.ph $t0, $s1, $s0 \n"
  165. "precrq.qb.ph $t1, $s1, $s0 \n"
  166. "precr.qb.ph $t8, $s3, $s2 \n"
  167. "precrq.qb.ph $t9, $s3, $s2 \n"
  168. // t0 = | 07 | 06 | 05 | 04 |
  169. // t1 = | 27 | 26 | 25 | 24 |
  170. // t8 = | 17 | 16 | 15 | 14 |
  171. // t9 = | 37 | 36 | 35 | 34 |
  172. "addu $s0, %[dst], %[dst_stride] \n"
  173. "addu $s1, $s0, %[dst_stride] \n"
  174. "addu $s2, $s1, %[dst_stride] \n"
  175. "sw $s4, 0(%[dst]) \n"
  176. "sw $t0, 4(%[dst]) \n"
  177. "sw $s6, 0($s0) \n"
  178. "sw $t8, 4($s0) \n"
  179. "sw $s5, 0($s1) \n"
  180. "sw $t1, 4($s1) \n"
  181. "sw $s7, 0($s2) \n"
  182. "sw $t9, 4($s2) \n"
  183. "addiu $AT, -1 \n"
  184. "addiu %[src], 4 \n"
  185. "bnez $AT, 1b \n"
  186. " addu %[dst], $s2, %[dst_stride] \n"
  187. "b 2f \n"
  188. //dst + dst_stride unaligned
  189. "11: \n"
  190. "lw $t0, 0(%[src]) \n"
  191. "lwx $t1, %[src_stride](%[src]) \n"
  192. "lwx $t8, $t2(%[src]) \n"
  193. "lwx $t9, $t3(%[src]) \n"
  194. // t0 = | 30 | 20 | 10 | 00 |
  195. // t1 = | 31 | 21 | 11 | 01 |
  196. // t8 = | 32 | 22 | 12 | 02 |
  197. // t9 = | 33 | 23 | 13 | 03 |
  198. "precr.qb.ph $s0, $t1, $t0 \n"
  199. "precr.qb.ph $s1, $t9, $t8 \n"
  200. "precrq.qb.ph $s2, $t1, $t0 \n"
  201. "precrq.qb.ph $s3, $t9, $t8 \n"
  202. // s0 = | 21 | 01 | 20 | 00 |
  203. // s1 = | 23 | 03 | 22 | 02 |
  204. // s2 = | 31 | 11 | 30 | 10 |
  205. // s3 = | 33 | 13 | 32 | 12 |
  206. "precr.qb.ph $s4, $s1, $s0 \n"
  207. "precrq.qb.ph $s5, $s1, $s0 \n"
  208. "precr.qb.ph $s6, $s3, $s2 \n"
  209. "precrq.qb.ph $s7, $s3, $s2 \n"
  210. // s4 = | 03 | 02 | 01 | 00 |
  211. // s5 = | 23 | 22 | 21 | 20 |
  212. // s6 = | 13 | 12 | 11 | 10 |
  213. // s7 = | 33 | 32 | 31 | 30 |
  214. "lwx $t0, $t4(%[src]) \n"
  215. "lwx $t1, $t5(%[src]) \n"
  216. "lwx $t8, $t6(%[src]) \n"
  217. "lwx $t9, $t7(%[src]) \n"
  218. // t0 = | 34 | 24 | 14 | 04 |
  219. // t1 = | 35 | 25 | 15 | 05 |
  220. // t8 = | 36 | 26 | 16 | 06 |
  221. // t9 = | 37 | 27 | 17 | 07 |
  222. "precr.qb.ph $s0, $t1, $t0 \n"
  223. "precr.qb.ph $s1, $t9, $t8 \n"
  224. "precrq.qb.ph $s2, $t1, $t0 \n"
  225. "precrq.qb.ph $s3, $t9, $t8 \n"
  226. // s0 = | 25 | 05 | 24 | 04 |
  227. // s1 = | 27 | 07 | 26 | 06 |
  228. // s2 = | 35 | 15 | 34 | 14 |
  229. // s3 = | 37 | 17 | 36 | 16 |
  230. "precr.qb.ph $t0, $s1, $s0 \n"
  231. "precrq.qb.ph $t1, $s1, $s0 \n"
  232. "precr.qb.ph $t8, $s3, $s2 \n"
  233. "precrq.qb.ph $t9, $s3, $s2 \n"
  234. // t0 = | 07 | 06 | 05 | 04 |
  235. // t1 = | 27 | 26 | 25 | 24 |
  236. // t8 = | 17 | 16 | 15 | 14 |
  237. // t9 = | 37 | 36 | 35 | 34 |
  238. "addu $s0, %[dst], %[dst_stride] \n"
  239. "addu $s1, $s0, %[dst_stride] \n"
  240. "addu $s2, $s1, %[dst_stride] \n"
  241. "swr $s4, 0(%[dst]) \n"
  242. "swl $s4, 3(%[dst]) \n"
  243. "swr $t0, 4(%[dst]) \n"
  244. "swl $t0, 7(%[dst]) \n"
  245. "swr $s6, 0($s0) \n"
  246. "swl $s6, 3($s0) \n"
  247. "swr $t8, 4($s0) \n"
  248. "swl $t8, 7($s0) \n"
  249. "swr $s5, 0($s1) \n"
  250. "swl $s5, 3($s1) \n"
  251. "swr $t1, 4($s1) \n"
  252. "swl $t1, 7($s1) \n"
  253. "swr $s7, 0($s2) \n"
  254. "swl $s7, 3($s2) \n"
  255. "swr $t9, 4($s2) \n"
  256. "swl $t9, 7($s2) \n"
  257. "addiu $AT, -1 \n"
  258. "addiu %[src], 4 \n"
  259. "bnez $AT, 11b \n"
  260. " addu %[dst], $s2, %[dst_stride] \n"
  261. "2: \n"
  262. ".set pop \n"
  263. ".set at \n"
  264. :[src] "+r" (src),
  265. [dst] "+r" (dst),
  266. [width] "+r" (width)
  267. :[src_stride] "r" (src_stride),
  268. [dst_stride] "r" (dst_stride)
  269. : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
  270. "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
  271. );
  272. }
  273. void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
  274. uint8* dst_a, int dst_stride_a,
  275. uint8* dst_b, int dst_stride_b,
  276. int width) {
  277. __asm__ __volatile__ (
  278. ".set push \n"
  279. ".set noreorder \n"
  280. "beqz %[width], 2f \n"
  281. " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
  282. "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
  283. "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
  284. "addu $t3, $t2, %[src_stride] \n"
  285. "addu $t5, $t4, %[src_stride] \n"
  286. "addu $t6, $t2, $t4 \n"
  287. "subu $t7, $t9, %[src_stride] \n"
  288. "srl $t1, %[width], 1 \n"
  289. // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
  290. "andi $t0, %[dst_a], 0x3 \n"
  291. "andi $t8, %[dst_b], 0x3 \n"
  292. "or $t0, $t0, $t8 \n"
  293. "andi $t8, %[dst_stride_a], 0x3 \n"
  294. "andi $s5, %[dst_stride_b], 0x3 \n"
  295. "or $t8, $t8, $s5 \n"
  296. "or $t0, $t0, $t8 \n"
  297. "bnez $t0, 11f \n"
  298. " nop \n"
  299. // dst + dst_stride word aligned (both, a & b dst addresses)
  300. "1: \n"
  301. "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
  302. "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
  303. "addu $s5, %[dst_a], %[dst_stride_a] \n"
  304. "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
  305. "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
  306. "addu $s6, %[dst_b], %[dst_stride_b] \n"
  307. "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
  308. "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
  309. "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
  310. "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
  311. "sll $t0, $t0, 16 \n"
  312. "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
  313. "sll $t9, $t9, 16 \n"
  314. "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
  315. "sw $s3, 0($s5) \n"
  316. "sw $s4, 0($s6) \n"
  317. "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
  318. "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
  319. "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
  320. "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
  321. "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
  322. "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
  323. "sw $s3, 0(%[dst_a]) \n"
  324. "sw $s4, 0(%[dst_b]) \n"
  325. "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
  326. "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
  327. "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
  328. "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
  329. "sll $t0, $t0, 16 \n"
  330. "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
  331. "sll $t9, $t9, 16 \n"
  332. "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
  333. "sw $s3, 4($s5) \n"
  334. "sw $s4, 4($s6) \n"
  335. "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
  336. "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
  337. "addiu %[src], 4 \n"
  338. "addiu $t1, -1 \n"
  339. "sll $t0, %[dst_stride_a], 1 \n"
  340. "sll $t8, %[dst_stride_b], 1 \n"
  341. "sw $s3, 4(%[dst_a]) \n"
  342. "sw $s4, 4(%[dst_b]) \n"
  343. "addu %[dst_a], %[dst_a], $t0 \n"
  344. "bnez $t1, 1b \n"
  345. " addu %[dst_b], %[dst_b], $t8 \n"
  346. "b 2f \n"
  347. " nop \n"
  348. // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
  349. "11: \n"
  350. "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
  351. "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
  352. "addu $s5, %[dst_a], %[dst_stride_a] \n"
  353. "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
  354. "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
  355. "addu $s6, %[dst_b], %[dst_stride_b] \n"
  356. "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
  357. "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
  358. "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
  359. "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
  360. "sll $t0, $t0, 16 \n"
  361. "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
  362. "sll $t9, $t9, 16 \n"
  363. "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
  364. "swr $s3, 0($s5) \n"
  365. "swl $s3, 3($s5) \n"
  366. "swr $s4, 0($s6) \n"
  367. "swl $s4, 3($s6) \n"
  368. "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
  369. "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
  370. "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
  371. "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
  372. "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
  373. "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
  374. "swr $s3, 0(%[dst_a]) \n"
  375. "swl $s3, 3(%[dst_a]) \n"
  376. "swr $s4, 0(%[dst_b]) \n"
  377. "swl $s4, 3(%[dst_b]) \n"
  378. "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
  379. "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
  380. "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
  381. "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
  382. "sll $t0, $t0, 16 \n"
  383. "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
  384. "sll $t9, $t9, 16 \n"
  385. "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
  386. "swr $s3, 4($s5) \n"
  387. "swl $s3, 7($s5) \n"
  388. "swr $s4, 4($s6) \n"
  389. "swl $s4, 7($s6) \n"
  390. "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
  391. "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
  392. "addiu %[src], 4 \n"
  393. "addiu $t1, -1 \n"
  394. "sll $t0, %[dst_stride_a], 1 \n"
  395. "sll $t8, %[dst_stride_b], 1 \n"
  396. "swr $s3, 4(%[dst_a]) \n"
  397. "swl $s3, 7(%[dst_a]) \n"
  398. "swr $s4, 4(%[dst_b]) \n"
  399. "swl $s4, 7(%[dst_b]) \n"
  400. "addu %[dst_a], %[dst_a], $t0 \n"
  401. "bnez $t1, 11b \n"
  402. " addu %[dst_b], %[dst_b], $t8 \n"
  403. "2: \n"
  404. ".set pop \n"
  405. : [src] "+r" (src),
  406. [dst_a] "+r" (dst_a),
  407. [dst_b] "+r" (dst_b),
  408. [width] "+r" (width),
  409. [src_stride] "+r" (src_stride)
  410. : [dst_stride_a] "r" (dst_stride_a),
  411. [dst_stride_b] "r" (dst_stride_b)
  412. : "t0", "t1", "t2", "t3", "t4", "t5",
  413. "t6", "t7", "t8", "t9",
  414. "s0", "s1", "s2", "s3",
  415. "s4", "s5", "s6"
  416. );
  417. }
  418. #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
  419. #ifdef __cplusplus
  420. } // extern "C"
  421. } // namespace libyuv
  422. #endif