copy_sse3.asm 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; STACK_FRAME_CREATE_X3
  ;
  ; Function prologue for routines with the C-level argument list
  ;   (unsigned char *src, int src_stride,
  ;    unsigned char *ref, int ref_stride, <fifth argument>)
  ;
  ; It binds the symbolic names src_ptr, src_stride, ref_ptr, ref_stride,
  ; end_ptr, ret_var, result_ptr, max_sad and height to the register or
  ; stack slot that holds them under the ABI being assembled for.
  ; result_ptr, max_sad and height are three names for the same fifth
  ; argument; a routine uses whichever one matches its prototype.
  ;
  ; On exit src_stride and ref_stride are sign-extended to pointer width
  ; in every ABI, so they can be used directly as address index registers
  ; (negative strides included).
  ;
  ; Must be paired with STACK_FRAME_DESTROY_X3.
  ;-----------------------------------------------------------------------
  %macro STACK_FRAME_CREATE_X3 0
  %if ABI_IS_32BIT
    ; 32-bit: every argument lives on the stack and is reached through
    ; arg(n) (provided by x86_abi_support.asm) relative to the frame
    ; pointer set up below.
    %define     src_ptr       rsi
    %define     src_stride    rax
    %define     ref_ptr       rdi
    %define     ref_stride    rdx
    %define     end_ptr       rcx
    %define     ret_var       rbx
    %define     result_ptr    arg(4)
    %define     max_sad       arg(4)
    %define     height        dword ptr arg(4)

      push        rbp
      mov         rbp,        rsp
      push        rsi                             ; saved here, restored by
      push        rdi                             ; STACK_FRAME_DESTROY_X3
      push        rbx

      mov         rsi,        arg(0)              ; src_ptr
      mov         rdi,        arg(2)              ; ref_ptr

      movsxd      rax,        dword ptr arg(1)    ; src_stride
      movsxd      rdx,        dword ptr arg(3)    ; ref_stride
  %else
    %if LIBVPX_YASM_WIN64
      ; Win64: arguments arrive in rcx, rdx, r8, r9; the fifth is on the
      ; stack above the return address (8) and the 32-byte shadow space
      ; (4*8). SAVE_XMM comes from the include; it presumably spills
      ; xmm6..xmm7 (callee-saved on Win64) and sets xmm_stack_space to the
      ; number of bytes it reserved.
      SAVE_XMM 7, u
      %define     src_ptr     rcx
      %define     src_stride  rdx
      %define     ref_ptr     r8
      %define     ref_stride  r9
      %define     end_ptr     r10
      %define     ret_var     r11
      %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
      %define     max_sad     [rsp+xmm_stack_space+8+4*8]
      %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

      ; The strides are C ints: only the low 32 bits of the argument
      ; registers are defined. Widen them before they are used as
      ; 64-bit index registers.
      movsxd      rdx,        edx                 ; src_stride
      movsxd      r9,         r9d                 ; ref_stride
    %else
      ; System V AMD64: arguments arrive in rdi, rsi, rdx, rcx, r8.
      %define     src_ptr     rdi
      %define     src_stride  rsi
      %define     ref_ptr     rdx
      %define     ref_stride  rcx
      %define     end_ptr     r9
      %define     ret_var     r10
      %define     result_ptr  r8
      %define     max_sad     r8
      ; height is a C int, so operate on the 32-bit view of r8; the
      ; upper half of the register is not guaranteed by the caller.
      %define     height      r8d

      ; Same widening as above for the int strides.
      movsxd      rsi,        esi                 ; src_stride
      movsxd      rcx,        ecx                 ; ref_stride
    %endif
  %endif
  %endmacro
  ;-----------------------------------------------------------------------
  ; STACK_FRAME_DESTROY_X3
  ;
  ; Function epilogue matching STACK_FRAME_CREATE_X3. It blanks the
  ; symbolic argument names so they cannot leak into code that follows,
  ; undoes whatever the prologue saved for the active ABI, and returns
  ; to the caller.
  ;-----------------------------------------------------------------------
  %macro STACK_FRAME_DESTROY_X3 0
    ; Redefine every alias as empty text.
    %define     src_ptr
    %define     ref_ptr
    %define     src_stride
    %define     ref_stride
    %define     end_ptr
    %define     ret_var
    %define     height
    %define     max_sad
    %define     result_ptr

  %if ABI_IS_32BIT
      ; Reverse order of the pushes done in the prologue.
      pop         rbx
      pop         rdi
      pop         rsi
      pop         rbp
  %elif LIBVPX_YASM_WIN64
      ; Counterpart of SAVE_XMM in the prologue.
      RESTORE_XMM
  %endif

      ret
  %endmacro
  SECTION .text

  ;-----------------------------------------------------------------------
  ; void vp8_copy32xn_sse3(
  ;     unsigned char *src_ptr,
  ;     int  src_stride,
  ;     unsigned char *dst_ptr,
  ;     int  dst_stride,
  ;     int  height);
  ;
  ; Copies `height` rows of 32 bytes from src_ptr to dst_ptr, advancing by
  ; src_stride / dst_stride bytes per row. Nothing is copied when
  ; height <= 0.
  ;
  ; Register roles (names bound by STACK_FRAME_CREATE_X3):
  ;   src_ptr / src_stride : current source row and its stride
  ;   ref_ptr / ref_stride : current DESTINATION row and its stride
  ;                          (the prologue macro calls the 3rd/4th
  ;                          arguments "ref")
  ;   end_ptr              : scratch, address of row +2 within a group
  ;   height               : rows still to copy
  ;
  ; Alignment: source rows are read with movdqu (any alignment).
  ; Destination rows are written with movdqa, so every destination row
  ; must be 16-byte aligned or the store faults.
  ;
  ; Clobbers: xmm0-xmm7, end_ptr, flags.
  ;-----------------------------------------------------------------------
  global sym(vp8_copy32xn_sse3) PRIVATE
  sym(vp8_copy32xn_sse3):

      STACK_FRAME_CREATE_X3

          ; Fewer than four rows requested: skip the unrolled loop so it
          ; cannot read or write rows the caller did not ask for.
          cmp             height,     4
          jl              .block_copy_sse3_tail

  ; Main loop: four rows (8 x 16 bytes) per iteration.
  ; Invariant on entry: height >= 4.
  .block_copy_sse3_loopx4:
          lea             end_ptr,    [src_ptr+src_stride*2]  ; source row 2

          movdqu          xmm0,       XMMWORD PTR [src_ptr]
          movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
          movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
          movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
          movdqu          xmm4,       XMMWORD PTR [end_ptr]
          movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
          movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
          movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

          lea             src_ptr,    [src_ptr+src_stride*4]  ; next group

          lea             end_ptr,    [ref_ptr+ref_stride*2]  ; dest row 2

          movdqa          XMMWORD PTR [ref_ptr], xmm0
          movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
          movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
          movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
          movdqa          XMMWORD PTR [end_ptr], xmm4
          movdqa          XMMWORD PTR [end_ptr + 16], xmm5
          movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
          movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

          lea             ref_ptr,    [ref_ptr+ref_stride*4]  ; next group

          sub             height,     4
          cmp             height,     4
          jge             .block_copy_sse3_loopx4

  ; At most three rows can remain here. Check whether any rows are left;
  ; the signed test also rejects a zero or negative row count.
  .block_copy_sse3_tail:
          cmp             height,     0
          jle             .copy_is_done

  ; Tail loop: one row per iteration. Invariant on entry: height > 0.
  .block_copy_sse3_loop:
          movdqu          xmm0,       XMMWORD PTR [src_ptr]
          movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
          lea             src_ptr,    [src_ptr+src_stride]

          movdqa          XMMWORD PTR [ref_ptr], xmm0
          movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
          lea             ref_ptr,    [ref_ptr+ref_stride]

          sub             height,     1
          jne             .block_copy_sse3_loop

  .copy_is_done:
      STACK_FRAME_DESTROY_X3