123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
- /*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
- #include "libyuv/rotate_row.h"
- #include "libyuv/row.h"
- #ifdef __cplusplus
- namespace libyuv {
- extern "C" {
- #endif
- // This module is for 32 bit Visual C x86 and clangcl
- #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
- __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- __asm {
- push edi
- push esi
- push ebp
- mov eax, [esp + 12 + 4] // src
- mov edi, [esp + 12 + 8] // src_stride
- mov edx, [esp + 12 + 12] // dst
- mov esi, [esp + 12 + 16] // dst_stride
- mov ecx, [esp + 12 + 20] // width
- // Read in the data from the source pointer.
- // First round of bit swap.
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea ebp, [eax + 8]
- movq xmm1, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm0, xmm1
- movq xmm2, qword ptr [eax]
- movdqa xmm1, xmm0
- palignr xmm1, xmm1, 8
- movq xmm3, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm2, xmm3
- movdqa xmm3, xmm2
- movq xmm4, qword ptr [eax]
- palignr xmm3, xmm3, 8
- movq xmm5, qword ptr [eax + edi]
- punpcklbw xmm4, xmm5
- lea eax, [eax + 2 * edi]
- movdqa xmm5, xmm4
- movq xmm6, qword ptr [eax]
- palignr xmm5, xmm5, 8
- movq xmm7, qword ptr [eax + edi]
- punpcklbw xmm6, xmm7
- mov eax, ebp
- movdqa xmm7, xmm6
- palignr xmm7, xmm7, 8
- // Second round of bit swap.
- punpcklwd xmm0, xmm2
- punpcklwd xmm1, xmm3
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- palignr xmm2, xmm2, 8
- palignr xmm3, xmm3, 8
- punpcklwd xmm4, xmm6
- punpcklwd xmm5, xmm7
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- palignr xmm6, xmm6, 8
- palignr xmm7, xmm7, 8
- // Third round of bit swap.
- // Write to the destination pointer.
- punpckldq xmm0, xmm4
- movq qword ptr [edx], xmm0
- movdqa xmm4, xmm0
- palignr xmm4, xmm4, 8
- movq qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- punpckldq xmm2, xmm6
- movdqa xmm6, xmm2
- palignr xmm6, xmm6, 8
- movq qword ptr [edx], xmm2
- punpckldq xmm1, xmm5
- movq qword ptr [edx + esi], xmm6
- lea edx, [edx + 2 * esi]
- movdqa xmm5, xmm1
- movq qword ptr [edx], xmm1
- palignr xmm5, xmm5, 8
- punpckldq xmm3, xmm7
- movq qword ptr [edx + esi], xmm5
- lea edx, [edx + 2 * esi]
- movq qword ptr [edx], xmm3
- movdqa xmm7, xmm3
- palignr xmm7, xmm7, 8
- sub ecx, 8
- movq qword ptr [edx + esi], xmm7
- lea edx, [edx + 2 * esi]
- jg convertloop
- pop ebp
- pop esi
- pop edi
- ret
- }
- }
- __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int w) {
- __asm {
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp + 16 + 4] // src
- mov edi, [esp + 16 + 8] // src_stride
- mov edx, [esp + 16 + 12] // dst_a
- mov esi, [esp + 16 + 16] // dst_stride_a
- mov ebx, [esp + 16 + 20] // dst_b
- mov ebp, [esp + 16 + 24] // dst_stride_b
- mov ecx, esp
- sub esp, 4 + 16
- and esp, ~15
- mov [esp + 16], ecx
- mov ecx, [ecx + 16 + 28] // w
- align 4
- // Read in the data from the source pointer.
- // First round of bit swap.
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm0 // use xmm7 as temp register.
- punpcklbw xmm0, xmm1
- punpckhbw xmm7, xmm1
- movdqa xmm1, xmm7
- movdqu xmm2, [eax]
- movdqu xmm3, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm2
- punpcklbw xmm2, xmm3
- punpckhbw xmm7, xmm3
- movdqa xmm3, xmm7
- movdqu xmm4, [eax]
- movdqu xmm5, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm4
- punpcklbw xmm4, xmm5
- punpckhbw xmm7, xmm5
- movdqa xmm5, xmm7
- movdqu xmm6, [eax]
- movdqu xmm7, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqu [esp], xmm5 // backup xmm5
- neg edi
- movdqa xmm5, xmm6 // use xmm5 as temp register.
- punpcklbw xmm6, xmm7
- punpckhbw xmm5, xmm7
- movdqa xmm7, xmm5
- lea eax, [eax + 8 * edi + 16]
- neg edi
- // Second round of bit swap.
- movdqa xmm5, xmm0
- punpcklwd xmm0, xmm2
- punpckhwd xmm5, xmm2
- movdqa xmm2, xmm5
- movdqa xmm5, xmm1
- punpcklwd xmm1, xmm3
- punpckhwd xmm5, xmm3
- movdqa xmm3, xmm5
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm6
- punpckhwd xmm5, xmm6
- movdqa xmm6, xmm5
- movdqu xmm5, [esp] // restore xmm5
- movdqu [esp], xmm6 // backup xmm6
- movdqa xmm6, xmm5 // use xmm6 as temp register.
- punpcklwd xmm5, xmm7
- punpckhwd xmm6, xmm7
- movdqa xmm7, xmm6
- // Third round of bit swap.
- // Write to the destination pointer.
- movdqa xmm6, xmm0
- punpckldq xmm0, xmm4
- punpckhdq xmm6, xmm4
- movdqa xmm4, xmm6
- movdqu xmm6, [esp] // restore xmm6
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [ebx], xmm0
- movlpd qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm4
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm2 // use xmm0 as the temp register.
- punpckldq xmm2, xmm6
- movlpd qword ptr [edx], xmm2
- movhpd qword ptr [ebx], xmm2
- punpckhdq xmm0, xmm6
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm1 // use xmm0 as the temp register.
- punpckldq xmm1, xmm5
- movlpd qword ptr [edx], xmm1
- movhpd qword ptr [ebx], xmm1
- punpckhdq xmm0, xmm5
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm3 // use xmm0 as the temp register.
- punpckldq xmm3, xmm7
- movlpd qword ptr [edx], xmm3
- movhpd qword ptr [ebx], xmm3
- punpckhdq xmm0, xmm7
- sub ecx, 8
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- jg convertloop
- mov esp, [esp + 16]
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
- }
- }
- #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
- #ifdef __cplusplus
- } // extern "C"
- } // namespace libyuv
- #endif
|