iwalsh_mmx.asm 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
  ;
  ; 4x4 inverse Walsh transform on 16-bit data using MMX, done as two
  ; add/subtract butterfly passes with a 4x4 word transpose in between.
  ;
  ; In:    arg(0) = input  - 16 shorts, read as four 8-byte rows at
  ;                          input+0, +8, +16 and +24.
  ;        arg(1) = output - result i (i = 0..15, row-major) is stored as a
  ;                          single 16-bit word at byte offset 32*i.
  ;                          Presumably one value per 16-coefficient block;
  ;                          confirm against the caller.
  ; Math:  pass 1 works down the columns with no rounding; pass 2 works along
  ;        the rows and computes (x + 3) >> 3 with an arithmetic shift.
  ; Writes: rax, rcx, rdx, mm0-mm7, flags.
  ; NOTE(review): no emms is issued before ret; presumably callers are
  ;               responsible for resetting the x87/MMX state - confirm.
  ; NOTE(review): SHADOW_ARGS_TO_STACK, arg() and UNSHADOW_ARGS are not defined
  ;               in this file; presumably they come from
  ;               vpx_ports/x86_abi_support.asm - confirm their exact effect.
  12. global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
  13. sym(vp8_short_inv_walsh4x4_mmx):
  14. push rbp
  15. mov rbp, rsp
  16. SHADOW_ARGS_TO_STACK 2
  17. ; end prolog
  18. mov rdx, arg(0) ; rdx = input
  19. mov rax, 30003h ; two 16-bit copies of the rounding constant 3
  20. movq mm0, [rdx + 0] ;ip[0..3]   (row 0)
  21. movq mm1, [rdx + 8] ;ip[4..7]   (row 1)
  ; NOTE(review): movq from a general-purpose register needs a 64-bit GPR;
  ;               only the low 32 bits are consumed by the punpcklwd below.
  ;               Confirm how 32-bit builds assemble this line.
  22. movq mm7, rax ; mm7 = 0000000000030003h
  23. movq mm2, [rdx + 16] ;ip[8..11]  (row 2)
  24. movq mm3, [rdx + 24] ;ip[12..15] (row 3)
  25. punpcklwd mm7, mm7 ;0003000300030003h - rounding term for all four lanes
  26. mov rdx, arg(1) ; rdx = output (input pointer no longer needed)
  ;~~~~~~~~~~~~~~~~~~~~~
  ; Pass 1: each 16-bit lane is one column; combine the four rows.
  27. movq mm4, mm0
  28. movq mm5, mm1
  29. paddw mm4, mm3 ;ip[0] + ip[12] aka a1
  30. paddw mm5, mm2 ;ip[4] + ip[8] aka b1
  31. movq mm6, mm4 ;temp a1
  32. paddw mm4, mm5 ;a1 + b1 -> pass-1 row 0
  33. psubw mm6, mm5 ;a1 - b1 -> pass-1 row 2
  34. psubw mm0, mm3 ;ip[0] - ip[12] aka d1
  35. psubw mm1, mm2 ;ip[4] - ip[8] aka c1
  36. movq mm5, mm0 ;temp d1
  37. paddw mm0, mm1 ;d1 + c1 -> pass-1 row 1
  38. psubw mm5, mm1 ;d1 - c1 -> pass-1 row 3
  ; Transpose the 4x4 block of words. In the diagrams below "rc" means
  ; row r, column c, listed from the highest word to the lowest.
  ; Before the transpose: mm4 = row 0, mm0 = row 1, mm6 = row 2, mm5 = row 3.
  39. ; 03 02 01 00
  40. ; 13 12 11 10
  41. ; 23 22 21 20
  42. ; 33 32 31 30
  43. movq mm3, mm4 ; 03 02 01 00
  44. punpcklwd mm4, mm0 ; 11 01 10 00
  45. punpckhwd mm3, mm0 ; 13 03 12 02
  46. movq mm1, mm6 ; 23 22 21 20
  47. punpcklwd mm6, mm5 ; 31 21 30 20
  48. punpckhwd mm1, mm5 ; 33 23 32 22
  49. movq mm0, mm4 ; 11 01 10 00
  50. movq mm2, mm3 ; 13 03 12 02
  51. punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]  (column 0)
  52. punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]  (column 1)
  53. punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]  (column 2)
  54. punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] (column 3)
  55. ;~~~~~~~~~~~~~~~~~~~~~
  ; Pass 2: same butterfly on the transposed data, so lane k now holds the
  ; result for row k. Each result gets +3 and an arithmetic >> 3.
  56. movq mm1, mm0
  57. movq mm5, mm4
  58. paddw mm1, mm3 ;ip[0] + ip[12] aka a1
  59. paddw mm5, mm2 ;ip[4] + ip[8] aka b1
  60. movq mm6, mm1 ;temp a1
  61. paddw mm1, mm5 ;a1 + b1
  62. psubw mm6, mm5 ;a1 - b1
  63. paddw mm1, mm7 ; + 3
  64. paddw mm6, mm7 ; + 3
  65. psraw mm1, 3 ; mm1 = results 0, 4, 8, 12  (lanes 0..3)
  66. psraw mm6, 3 ; mm6 = results 2, 6, 10, 14 (lanes 0..3)
  67. psubw mm0, mm3 ;ip[0] - ip[12] aka d1
  68. psubw mm4, mm2 ;ip[4] - ip[8] aka c1
  69. movq mm5, mm0 ;temp d1
  70. paddw mm0, mm4 ;d1 + c1
  71. psubw mm5, mm4 ;d1 - c1
  72. paddw mm0, mm7 ; + 3
  73. paddw mm5, mm7 ; + 3
  74. psraw mm0, 3 ; mm0 = results 1, 5, 9, 13  (lanes 0..3)
  75. psraw mm5, 3 ; mm5 = results 3, 7, 11, 15 (lanes 0..3)
  76. ;~~~~~~~~~~~~~~~~~~~~~
  ; Scatter the 16 results: result i goes to the word at output + 32*i.
  ; Each movd pulls two lanes into a GPR; the low half is stored, then
  ; shr 16 exposes the next lane. psrlq 32 brings lanes 2 and 3 down.
  77. movd eax, mm1 ; results 0 (ax) and 4 (upper half)
  78. movd ecx, mm0 ; results 1 (cx) and 5 (upper half)
  79. psrlq mm0, 32 ; expose lanes 2,3 of mm0 for the next movd
  80. psrlq mm1, 32 ; expose lanes 2,3 of mm1 for the next movd
  81. mov word ptr[rdx+32*0], ax
  82. mov word ptr[rdx+32*1], cx
  83. shr eax, 16
  84. shr ecx, 16
  85. mov word ptr[rdx+32*4], ax
  86. mov word ptr[rdx+32*5], cx
  87. movd eax, mm1 ; results 8 (ax) and 12 (upper half)
  88. movd ecx, mm0 ; results 9 (cx) and 13 (upper half)
  89. mov word ptr[rdx+32*8], ax
  90. mov word ptr[rdx+32*9], cx
  91. shr eax, 16
  92. shr ecx, 16
  93. mov word ptr[rdx+32*12], ax
  94. mov word ptr[rdx+32*13], cx
  95. movd eax, mm6 ; results 2 (ax) and 6 (upper half)
  96. movd ecx, mm5 ; results 3 (cx) and 7 (upper half)
  97. psrlq mm5, 32 ; expose lanes 2,3 of mm5 for the next movd
  98. psrlq mm6, 32 ; expose lanes 2,3 of mm6 for the next movd
  99. mov word ptr[rdx+32*2], ax
  100. mov word ptr[rdx+32*3], cx
  101. shr eax, 16
  102. shr ecx, 16
  103. mov word ptr[rdx+32*6], ax
  104. mov word ptr[rdx+32*7], cx
  105. movd eax, mm6 ; results 10 (ax) and 14 (upper half)
  106. movd ecx, mm5 ; results 11 (cx) and 15 (upper half)
  107. mov word ptr[rdx+32*10], ax
  108. mov word ptr[rdx+32*11], cx
  109. shr eax, 16
  110. shr ecx, 16
  111. mov word ptr[rdx+32*14], ax
  112. mov word ptr[rdx+32*15], cx
  113. ; begin epilog
  114. UNSHADOW_ARGS
  115. pop rbp
  116. ret