; copy_sse2.asm
  ;
  ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ;
  ; Use of this source code is governed by a BSD-style license
  ; that can be found in the LICENSE file in the root of the source
  ; tree. An additional intellectual property rights grant can be found
  ; in the file PATENTS. All contributing project authors may
  ; be found in the AUTHORS file in the root of the source tree.
  ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; void vp8_copy32xn_sse2(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *dst_ptr,
  ;     int            dst_stride,
  ;     int            height);
  ;
  ; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr,
  ; advancing the source by src_stride and the destination by dst_stride
  ; after every row.
  ;
  ; Alignment: rows are loaded with movdqu (no source alignment needed)
  ;            but stored with movdqa, so every destination row address
  ;            (dst_ptr + k * dst_stride) must be 16-byte aligned.
  ;
  ; Registers after the prolog:
  ;   rsi = current source row        rax = src_stride (sign-extended)
  ;   rdi = current destination row   rdx = dst_stride (sign-extended)
  ;   rcx = rows still to copy        xmm0-xmm7 = row data in flight
  ;
  ; height <= 0 copies nothing. Heights that are not a multiple of 4 are
  ; finished one row at a time; heights below 4 skip the unrolled loop.
  ;
  ; Argument access and register save/restore go through the macros
  ; (sym, arg, SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM, UNSHADOW_ARGS)
  ; supplied by vpx_ports/x86_abi_support.asm.
  ;-----------------------------------------------------------------------
  global sym(vp8_copy32xn_sse2) PRIVATE
  sym(vp8_copy32xn_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      SAVE_XMM 7                              ; xmm6/xmm7 are used below
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                 ; src_ptr
      mov         rdi, arg(2)                 ; dst_ptr

      movsxd      rax, dword ptr arg(1)       ; src_stride
      movsxd      rdx, dword ptr arg(3)       ; dst_stride
      movsxd      rcx, dword ptr arg(4)       ; height

      ; The unrolled loop always moves four rows, so it may only be
      ; entered while at least four rows remain.
      cmp         rcx, 4
      jl          .block_copy_sse2_tail

  .block_copy_sse2_loopx4:
      ; Load rows 0 and 1.
      movdqu      xmm0, XMMWORD PTR [rsi]
      movdqu      xmm1, XMMWORD PTR [rsi + 16]
      movdqu      xmm2, XMMWORD PTR [rsi + rax]
      movdqu      xmm3, XMMWORD PTR [rsi + rax + 16]

      lea         rsi, [rsi + rax*2]

      ; Load rows 2 and 3.
      movdqu      xmm4, XMMWORD PTR [rsi]
      movdqu      xmm5, XMMWORD PTR [rsi + 16]
      movdqu      xmm6, XMMWORD PTR [rsi + rax]
      movdqu      xmm7, XMMWORD PTR [rsi + rax + 16]

      lea         rsi, [rsi + rax*2]

      ; Store rows 0 and 1.
      movdqa      XMMWORD PTR [rdi], xmm0
      movdqa      XMMWORD PTR [rdi + 16], xmm1
      movdqa      XMMWORD PTR [rdi + rdx], xmm2
      movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3

      lea         rdi, [rdi + rdx*2]

      ; Store rows 2 and 3.
      movdqa      XMMWORD PTR [rdi], xmm4
      movdqa      XMMWORD PTR [rdi + 16], xmm5
      movdqa      XMMWORD PTR [rdi + rdx], xmm6
      movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7

      lea         rdi, [rdi + rdx*2]

      sub         rcx, 4
      cmp         rcx, 4
      jge         .block_copy_sse2_loopx4

  .block_copy_sse2_tail:
      ; rcx is now the 0..3 leftover rows, or the original height when
      ; it was below 4 (possibly zero or negative). Signed test so that
      ; non-positive counts never reach the count-down loop.
      test        rcx, rcx
      jle         .copy_is_done

  .block_copy_sse2_loop:
      ; Invariant: rcx > 0 rows remain.
      movdqu      xmm0, XMMWORD PTR [rsi]
      movdqu      xmm1, XMMWORD PTR [rsi + 16]
      lea         rsi, [rsi + rax]

      movdqa      XMMWORD PTR [rdi], xmm0
      movdqa      XMMWORD PTR [rdi + 16], xmm1
      lea         rdi, [rdi + rdx]

      sub         rcx, 1
      jne         .block_copy_sse2_loop

  .copy_is_done:
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret