rotate_win.cc 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. /*
  2. * Copyright 2013 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/rotate_row.h"
  11. #include "libyuv/row.h"
  12. #ifdef __cplusplus
  13. namespace libyuv {
  14. extern "C" {
  15. #endif
  16. // This module is for 32 bit Visual C x86 and clangcl
  17. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  18. __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
  19. int src_stride,
  20. uint8_t* dst,
  21. int dst_stride,
  22. int width) {
  23. __asm {
  24. push edi
  25. push esi
  26. push ebp
  27. mov eax, [esp + 12 + 4] // src
  28. mov edi, [esp + 12 + 8] // src_stride
  29. mov edx, [esp + 12 + 12] // dst
  30. mov esi, [esp + 12 + 16] // dst_stride
  31. mov ecx, [esp + 12 + 20] // width
  32. // Read in the data from the source pointer.
  33. // First round of bit swap.
  34. align 4
  35. convertloop:
  36. movq xmm0, qword ptr [eax]
  37. lea ebp, [eax + 8]
  38. movq xmm1, qword ptr [eax + edi]
  39. lea eax, [eax + 2 * edi]
  40. punpcklbw xmm0, xmm1
  41. movq xmm2, qword ptr [eax]
  42. movdqa xmm1, xmm0
  43. palignr xmm1, xmm1, 8
  44. movq xmm3, qword ptr [eax + edi]
  45. lea eax, [eax + 2 * edi]
  46. punpcklbw xmm2, xmm3
  47. movdqa xmm3, xmm2
  48. movq xmm4, qword ptr [eax]
  49. palignr xmm3, xmm3, 8
  50. movq xmm5, qword ptr [eax + edi]
  51. punpcklbw xmm4, xmm5
  52. lea eax, [eax + 2 * edi]
  53. movdqa xmm5, xmm4
  54. movq xmm6, qword ptr [eax]
  55. palignr xmm5, xmm5, 8
  56. movq xmm7, qword ptr [eax + edi]
  57. punpcklbw xmm6, xmm7
  58. mov eax, ebp
  59. movdqa xmm7, xmm6
  60. palignr xmm7, xmm7, 8
  61. // Second round of bit swap.
  62. punpcklwd xmm0, xmm2
  63. punpcklwd xmm1, xmm3
  64. movdqa xmm2, xmm0
  65. movdqa xmm3, xmm1
  66. palignr xmm2, xmm2, 8
  67. palignr xmm3, xmm3, 8
  68. punpcklwd xmm4, xmm6
  69. punpcklwd xmm5, xmm7
  70. movdqa xmm6, xmm4
  71. movdqa xmm7, xmm5
  72. palignr xmm6, xmm6, 8
  73. palignr xmm7, xmm7, 8
  74. // Third round of bit swap.
  75. // Write to the destination pointer.
  76. punpckldq xmm0, xmm4
  77. movq qword ptr [edx], xmm0
  78. movdqa xmm4, xmm0
  79. palignr xmm4, xmm4, 8
  80. movq qword ptr [edx + esi], xmm4
  81. lea edx, [edx + 2 * esi]
  82. punpckldq xmm2, xmm6
  83. movdqa xmm6, xmm2
  84. palignr xmm6, xmm6, 8
  85. movq qword ptr [edx], xmm2
  86. punpckldq xmm1, xmm5
  87. movq qword ptr [edx + esi], xmm6
  88. lea edx, [edx + 2 * esi]
  89. movdqa xmm5, xmm1
  90. movq qword ptr [edx], xmm1
  91. palignr xmm5, xmm5, 8
  92. punpckldq xmm3, xmm7
  93. movq qword ptr [edx + esi], xmm5
  94. lea edx, [edx + 2 * esi]
  95. movq qword ptr [edx], xmm3
  96. movdqa xmm7, xmm3
  97. palignr xmm7, xmm7, 8
  98. sub ecx, 8
  99. movq qword ptr [edx + esi], xmm7
  100. lea edx, [edx + 2 * esi]
  101. jg convertloop
  102. pop ebp
  103. pop esi
  104. pop edi
  105. ret
  106. }
  107. }
  108. __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
  109. int src_stride,
  110. uint8_t* dst_a,
  111. int dst_stride_a,
  112. uint8_t* dst_b,
  113. int dst_stride_b,
  114. int w) {
  115. __asm {
  116. push ebx
  117. push esi
  118. push edi
  119. push ebp
  120. mov eax, [esp + 16 + 4] // src
  121. mov edi, [esp + 16 + 8] // src_stride
  122. mov edx, [esp + 16 + 12] // dst_a
  123. mov esi, [esp + 16 + 16] // dst_stride_a
  124. mov ebx, [esp + 16 + 20] // dst_b
  125. mov ebp, [esp + 16 + 24] // dst_stride_b
  126. mov ecx, esp
  127. sub esp, 4 + 16
  128. and esp, ~15
  129. mov [esp + 16], ecx
  130. mov ecx, [ecx + 16 + 28] // w
  131. align 4
  132. // Read in the data from the source pointer.
  133. // First round of bit swap.
  134. convertloop:
  135. movdqu xmm0, [eax]
  136. movdqu xmm1, [eax + edi]
  137. lea eax, [eax + 2 * edi]
  138. movdqa xmm7, xmm0 // use xmm7 as temp register.
  139. punpcklbw xmm0, xmm1
  140. punpckhbw xmm7, xmm1
  141. movdqa xmm1, xmm7
  142. movdqu xmm2, [eax]
  143. movdqu xmm3, [eax + edi]
  144. lea eax, [eax + 2 * edi]
  145. movdqa xmm7, xmm2
  146. punpcklbw xmm2, xmm3
  147. punpckhbw xmm7, xmm3
  148. movdqa xmm3, xmm7
  149. movdqu xmm4, [eax]
  150. movdqu xmm5, [eax + edi]
  151. lea eax, [eax + 2 * edi]
  152. movdqa xmm7, xmm4
  153. punpcklbw xmm4, xmm5
  154. punpckhbw xmm7, xmm5
  155. movdqa xmm5, xmm7
  156. movdqu xmm6, [eax]
  157. movdqu xmm7, [eax + edi]
  158. lea eax, [eax + 2 * edi]
  159. movdqu [esp], xmm5 // backup xmm5
  160. neg edi
  161. movdqa xmm5, xmm6 // use xmm5 as temp register.
  162. punpcklbw xmm6, xmm7
  163. punpckhbw xmm5, xmm7
  164. movdqa xmm7, xmm5
  165. lea eax, [eax + 8 * edi + 16]
  166. neg edi
  167. // Second round of bit swap.
  168. movdqa xmm5, xmm0
  169. punpcklwd xmm0, xmm2
  170. punpckhwd xmm5, xmm2
  171. movdqa xmm2, xmm5
  172. movdqa xmm5, xmm1
  173. punpcklwd xmm1, xmm3
  174. punpckhwd xmm5, xmm3
  175. movdqa xmm3, xmm5
  176. movdqa xmm5, xmm4
  177. punpcklwd xmm4, xmm6
  178. punpckhwd xmm5, xmm6
  179. movdqa xmm6, xmm5
  180. movdqu xmm5, [esp] // restore xmm5
  181. movdqu [esp], xmm6 // backup xmm6
  182. movdqa xmm6, xmm5 // use xmm6 as temp register.
  183. punpcklwd xmm5, xmm7
  184. punpckhwd xmm6, xmm7
  185. movdqa xmm7, xmm6
  186. // Third round of bit swap.
  187. // Write to the destination pointer.
  188. movdqa xmm6, xmm0
  189. punpckldq xmm0, xmm4
  190. punpckhdq xmm6, xmm4
  191. movdqa xmm4, xmm6
  192. movdqu xmm6, [esp] // restore xmm6
  193. movlpd qword ptr [edx], xmm0
  194. movhpd qword ptr [ebx], xmm0
  195. movlpd qword ptr [edx + esi], xmm4
  196. lea edx, [edx + 2 * esi]
  197. movhpd qword ptr [ebx + ebp], xmm4
  198. lea ebx, [ebx + 2 * ebp]
  199. movdqa xmm0, xmm2 // use xmm0 as the temp register.
  200. punpckldq xmm2, xmm6
  201. movlpd qword ptr [edx], xmm2
  202. movhpd qword ptr [ebx], xmm2
  203. punpckhdq xmm0, xmm6
  204. movlpd qword ptr [edx + esi], xmm0
  205. lea edx, [edx + 2 * esi]
  206. movhpd qword ptr [ebx + ebp], xmm0
  207. lea ebx, [ebx + 2 * ebp]
  208. movdqa xmm0, xmm1 // use xmm0 as the temp register.
  209. punpckldq xmm1, xmm5
  210. movlpd qword ptr [edx], xmm1
  211. movhpd qword ptr [ebx], xmm1
  212. punpckhdq xmm0, xmm5
  213. movlpd qword ptr [edx + esi], xmm0
  214. lea edx, [edx + 2 * esi]
  215. movhpd qword ptr [ebx + ebp], xmm0
  216. lea ebx, [ebx + 2 * ebp]
  217. movdqa xmm0, xmm3 // use xmm0 as the temp register.
  218. punpckldq xmm3, xmm7
  219. movlpd qword ptr [edx], xmm3
  220. movhpd qword ptr [ebx], xmm3
  221. punpckhdq xmm0, xmm7
  222. sub ecx, 8
  223. movlpd qword ptr [edx + esi], xmm0
  224. lea edx, [edx + 2 * esi]
  225. movhpd qword ptr [ebx + ebp], xmm0
  226. lea ebx, [ebx + 2 * ebp]
  227. jg convertloop
  228. mov esp, [esp + 16]
  229. pop ebp
  230. pop edi
  231. pop esi
  232. pop ebx
  233. ret
  234. }
  235. }
  236. #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  237. #ifdef __cplusplus
  238. } // extern "C"
  239. } // namespace libyuv
  240. #endif