copy_sse2.asm 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. SECTION .text
  ;-----------------------------------------------------------------------
  ;void vp8_copy32xn_sse2(
  ;    unsigned char *src_ptr,
  ;    int  src_stride,
  ;    unsigned char *dst_ptr,
  ;    int  dst_stride,
  ;    int  height);
  ;
  ; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr. After
  ; each row the source advances by src_stride bytes and the destination
  ; by dst_stride bytes (both strides are sign-extended ints).
  ;
  ; Rows are copied four at a time while at least four remain, then one
  ; at a time for the remainder. A height <= 0 copies nothing.
  ;
  ; Alignment: loads use movdqu, so the source may be unaligned. Stores
  ; use movdqa, so every destination row address (dst_ptr + k*dst_stride)
  ; must be 16-byte aligned or the store faults.
  ;
  ; Register roles after the prolog:
  ;   rsi = current source row
  ;   rdi = current destination row
  ;   rax = src_stride
  ;   rdx = dst_stride
  ;   rcx = rows remaining
  ;
  ; Clobbers: rax, rcx, rdx, xmm0-xmm7, flags.
  ; rbp, rsi and rdi are saved and restored here.
  ; NOTE(review): sym(), arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM
  ; and UNSHADOW_ARGS come from the included ABI-support file; presumably
  ; SAVE_XMM 7 preserves xmm6-xmm7 where the target ABI requires it --
  ; confirm against that file.
  ;-----------------------------------------------------------------------
  global sym(vp8_copy32xn_sse2) PRIVATE
  sym(vp8_copy32xn_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 5
          SAVE_XMM 7
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ;src_ptr
          mov         rdi, arg(2)                     ;dst_ptr

          movsxd      rax, dword ptr arg(1)           ;src_stride
          movsxd      rdx, dword ptr arg(3)           ;dst_stride
          movsxd      rcx, dword ptr arg(4)           ;height

          ; The unrolled loop below always reads and writes four full rows,
          ; so it must only be entered when at least four rows remain.
          cmp         rcx, 4
          jl          .block_copy_sse2_tail

  .block_copy_sse2_loopx4:
          ; Invariant: rcx >= 4 here. Load four source rows (2 x 16 bytes
          ; each) into xmm0-xmm7 before storing anything.
          movdqu      xmm0, XMMWORD PTR [rsi]
          movdqu      xmm1, XMMWORD PTR [rsi + 16]
          movdqu      xmm2, XMMWORD PTR [rsi + rax]
          movdqu      xmm3, XMMWORD PTR [rsi + rax + 16]

          lea         rsi, [rsi + rax*2]              ; src += 2 rows

          movdqu      xmm4, XMMWORD PTR [rsi]
          movdqu      xmm5, XMMWORD PTR [rsi + 16]
          movdqu      xmm6, XMMWORD PTR [rsi + rax]
          movdqu      xmm7, XMMWORD PTR [rsi + rax + 16]

          lea         rsi, [rsi + rax*2]              ; src += 2 rows

          ; Aligned stores of the four rows just loaded.
          movdqa      XMMWORD PTR [rdi], xmm0
          movdqa      XMMWORD PTR [rdi + 16], xmm1
          movdqa      XMMWORD PTR [rdi + rdx], xmm2
          movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3

          lea         rdi, [rdi + rdx*2]              ; dst += 2 rows

          movdqa      XMMWORD PTR [rdi], xmm4
          movdqa      XMMWORD PTR [rdi + 16], xmm5
          movdqa      XMMWORD PTR [rdi + rdx], xmm6
          movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7

          lea         rdi, [rdi + rdx*2]              ; dst += 2 rows

          sub         rcx, 4
          cmp         rcx, 4
          jge         .block_copy_sse2_loopx4

  .block_copy_sse2_tail:
          ; rcx < 4 here. Nothing left to do when it is zero; a negative
          ; height is treated the same way instead of looping until wrap.
          test        rcx, rcx
          jle         .copy_is_done

  .block_copy_sse2_loop:
          ; Invariant: rcx > 0 here. Copy one 32-byte row per iteration.
          movdqu      xmm0, XMMWORD PTR [rsi]
          movdqu      xmm1, XMMWORD PTR [rsi + 16]
          lea         rsi, [rsi + rax]                ; src += 1 row

          movdqa      XMMWORD PTR [rdi], xmm0
          movdqa      XMMWORD PTR [rdi + 16], xmm1
          lea         rdi, [rdi + rdx]                ; dst += 1 row

          sub         rcx, 1
          jne         .block_copy_sse2_loop

  .copy_is_done:
          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret