recon_mmx.asm 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;---------------------------------------------------------------------
  ; void vp8_copy_mem8x8_mmx(
  ;     unsigned char *src,
  ;     int            src_stride,
  ;     unsigned char *dst,
  ;     int            dst_stride
  ; )
  ;
  ; Copies eight rows of eight bytes each from src to dst. Consecutive
  ; source rows are src_stride bytes apart and consecutive destination
  ; rows are dst_stride bytes apart; both strides are sign-extended from
  ; 32 bits before being used in address arithmetic.
  ;
  ; Registers: rsi = source row pointer, rdi = destination row pointer,
  ;            rax = src_stride, rcx = dst_stride, mm0-mm5 = row data.
  ; rsi and rdi are saved and restored here; rax, rcx and mm0-mm5 are
  ; overwritten.
  ;
  ; sym(), PRIVATE, arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come
  ; from the included vpx_ports/x86_abi_support.asm; arg(n) presumably
  ; addresses the n-th C argument -- confirm against that file.
  ;
  ; NOTE(review): no EMMS is executed in this routine; presumably the
  ; caller is responsible for clearing MMX state -- confirm.
  ;---------------------------------------------------------------------
  global sym(vp8_copy_mem8x8_mmx) PRIVATE
  sym(vp8_copy_mem8x8_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      push        rsi
      push        rdi
      ; end prolog

      ; Fetch all four arguments before touching the pixel data.
      mov         rsi, arg(0)                 ; src
      mov         rdi, arg(2)                 ; dst
      movsxd      rax, dword ptr arg(1)       ; src_stride
      movsxd      rcx, dword ptr arg(3)       ; dst_stride

      ; Rows 0-2: load all three, then store all three.
      movq        mm0, [rsi]
      movq        mm1, [rsi+rax]
      movq        mm2, [rsi+rax*2]
      movq        [rdi], mm0
      movq        [rdi+rcx], mm1
      movq        [rdi+rcx*2], mm2

      ; Step both pointers down three rows (2*stride via lea, +1 via add).
      lea         rsi, [rsi+rax*2]
      add         rsi, rax
      lea         rdi, [rdi+rcx*2]
      add         rdi, rcx

      ; Rows 3-5.
      movq        mm3, [rsi]
      movq        mm4, [rsi+rax]
      movq        mm5, [rsi+rax*2]
      movq        [rdi], mm3
      movq        [rdi+rcx], mm4
      movq        [rdi+rcx*2], mm5

      ; Step only two rows, so the pointers sit on row 5 and the final
      ; pair is reached at +stride and +2*stride.
      lea         rsi, [rsi+rax*2]
      lea         rdi, [rdi+rcx*2]

      ; Rows 6-7.
      movq        mm0, [rsi+rax]
      movq        mm1, [rsi+rax*2]
      movq        [rdi+rcx], mm0
      movq        [rdi+rcx*2], mm1

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;---------------------------------------------------------------------
  ; void vp8_copy_mem8x4_mmx(
  ;     unsigned char *src,
  ;     int            src_stride,
  ;     unsigned char *dst,
  ;     int            dst_stride
  ; )
  ;
  ; Copies four rows of eight bytes each from src to dst. Consecutive
  ; source rows are src_stride bytes apart and consecutive destination
  ; rows are dst_stride bytes apart; both strides are sign-extended from
  ; 32 bits before being used in address arithmetic.
  ;
  ; Registers: rsi = source row pointer, rdi = destination row pointer,
  ;            rax = src_stride, rcx = dst_stride, mm0-mm3 = row data.
  ; rsi and rdi are saved and restored here; rax, rcx and mm0-mm3 are
  ; overwritten.
  ;
  ; sym(), PRIVATE, arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come
  ; from the included vpx_ports/x86_abi_support.asm; arg(n) presumably
  ; addresses the n-th C argument -- confirm against that file.
  ;
  ; NOTE(review): no EMMS is executed in this routine; presumably the
  ; caller is responsible for clearing MMX state -- confirm.
  ;---------------------------------------------------------------------
  global sym(vp8_copy_mem8x4_mmx) PRIVATE
  sym(vp8_copy_mem8x4_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      push        rsi
      push        rdi
      ; end prolog

      ; Fetch all four arguments before touching the pixel data.
      mov         rsi, arg(0)                 ; src
      mov         rdi, arg(2)                 ; dst
      movsxd      rax, dword ptr arg(1)       ; src_stride
      movsxd      rcx, dword ptr arg(3)       ; dst_stride

      ; Rows 0-2: load all three, then store all three.
      movq        mm0, [rsi]
      movq        mm1, [rsi+rax]
      movq        mm2, [rsi+rax*2]
      movq        [rdi], mm0
      movq        [rdi+rcx], mm1
      movq        [rdi+rcx*2], mm2

      ; Move both pointers to row 2; row 3 is then one stride further on.
      lea         rsi, [rsi+rax*2]
      lea         rdi, [rdi+rcx*2]

      ; Row 3.
      movq        mm3, [rsi+rax]
      movq        [rdi+rcx], mm3

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;---------------------------------------------------------------------
  ; void vp8_copy_mem16x16_mmx(
  ;     unsigned char *src,
  ;     int            src_stride,
  ;     unsigned char *dst,
  ;     int            dst_stride
  ; )
  ;
  ; Copies sixteen rows of sixteen bytes each from src to dst. Every row
  ; is moved as two 8-byte halves (offsets +0 and +8). Consecutive source
  ; rows are src_stride bytes apart and consecutive destination rows are
  ; dst_stride bytes apart; both strides are sign-extended from 32 bits
  ; before being used in address arithmetic.
  ;
  ; Layout of the copy: five groups of three rows, then one last row
  ; (5 * 3 + 1 = 16).
  ;
  ; Registers: rsi = source row pointer, rdi = destination row pointer,
  ;            rax = src_stride, rcx = dst_stride,
  ;            mm0/mm1/mm2 = left halves, mm3/mm4/mm5 = right halves.
  ; rsi and rdi are saved and restored here; rax, rcx and mm0-mm5 are
  ; overwritten.
  ;
  ; sym(), PRIVATE, arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come
  ; from the included vpx_ports/x86_abi_support.asm; arg(n) presumably
  ; addresses the n-th C argument -- confirm against that file.
  ;
  ; NOTE(review): no EMMS is executed in this routine; presumably the
  ; caller is responsible for clearing MMX state -- confirm.
  ;---------------------------------------------------------------------
  global sym(vp8_copy_mem16x16_mmx) PRIVATE
  sym(vp8_copy_mem16x16_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                 ; src
      movsxd      rax, dword ptr arg(1)       ; src_stride
      mov         rdi, arg(2)                 ; dst
      movsxd      rcx, dword ptr arg(3)       ; dst_stride

      ; Rows 0-14. %rep expands at assembly time into five back-to-back
      ; copies of the three-row group below, so the emitted code is still
      ; straight-line with no run-time loop counter or branch.
  %rep 5
      ; Load three full rows (left half, right half).
      movq        mm0, [rsi]
      movq        mm3, [rsi+8]
      movq        mm1, [rsi+rax]
      movq        mm4, [rsi+rax+8]
      movq        mm2, [rsi+rax*2]
      movq        mm5, [rsi+rax*2+8]

      ; Store them to the destination.
      movq        [rdi], mm0
      movq        [rdi+8], mm3
      movq        [rdi+rcx], mm1
      movq        [rdi+rcx+8], mm4
      movq        [rdi+rcx*2], mm2
      movq        [rdi+rcx*2+8], mm5

      ; Step both pointers down three rows (2*stride via lea, +1 via add).
      lea         rsi, [rsi+rax*2]
      add         rsi, rax
      lea         rdi, [rdi+rcx*2]
      add         rdi, rcx
  %endrep

      ; Row 15: the pointers already address it after the fifth group.
      movq        mm0, [rsi]
      movq        mm3, [rsi+8]
      movq        [rdi], mm0
      movq        [rdi+8], mm3

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret