ssim_opt_x86_64.asm 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ; TABULATE_SSIM - accumulate one batch of samples into the five running sums.
  ;
  ; In:      xmm3 = source samples (s) as 16-bit words
  ;          xmm4 = reference samples (r) as 16-bit words
  ; Updates: xmm15 += s       per-word, unsigned saturating add   (sum_s)
  ;          xmm14 += r       per-word, unsigned saturating add   (sum_r)
  ;          xmm13 += s*s     pmaddwd pair sums, per-dword        (sum_sq_s)
  ;          xmm12 += r*r     pmaddwd pair sums, per-dword        (sum_sq_r)
  ;          xmm11 += s*r     pmaddwd pair sums, per-dword        (sum_sxr)
  ; Scratch: xmm1 and xmm2 (left holding s*s and r*r); xmm3 is overwritten with
  ;          the s*r pair sums. xmm4 is left unchanged.
  %macro TABULATE_SSIM 0
      ; Take working copies first: pmaddwd overwrites its destination.
      movdqa      xmm1,  xmm3
      movdqa      xmm2,  xmm4

      ; Plain sums.
      paddusw     xmm15, xmm3             ; sum_s
      paddusw     xmm14, xmm4             ; sum_r

      ; Products; each dword lane receives the sum of two adjacent word products.
      pmaddwd     xmm1,  xmm1             ; s * s
      pmaddwd     xmm2,  xmm2             ; r * r
      pmaddwd     xmm3,  xmm4             ; s * r

      paddd       xmm13, xmm1             ; sum_sq_s
      paddd       xmm12, xmm2             ; sum_sq_r
      paddd       xmm11, xmm3             ; sum_sxr
  %endmacro
  ; SUM_ACROSS_Q - horizontally add the four 32-bit lanes of register %1.
  ;
  ; Each dword is interleaved with the zero register so that it becomes a
  ; zero-extended qword, and the additions are then carried out at 64-bit width.
  ; On exit the total is in the low qword of %1 and the high qword of %1 is 0.
  ;
  ; Requires: xmm0 == 0 (the callers in this file clear it before the row loop).
  ; Scratch:  xmm2. %1 must therefore be neither xmm0 nor xmm2.
  %macro SUM_ACROSS_Q 1
      movdqa      xmm2, %1
      punpckhdq   xmm2, xmm0              ; dwords 2,3 -> two qwords
      punpckldq   %1,   xmm0              ; dwords 0,1 -> two qwords
      paddq       %1,   xmm2              ; two 64-bit partial sums

      movdqa      xmm2, %1
      punpckhqdq  xmm2, xmm0              ; xmm2 = [upper partial, 0]
      punpcklqdq  %1,   xmm0              ; %1   = [lower partial, 0]
      paddq       %1,   xmm2              ; low qword of %1 = total
  %endmacro
  ; SUM_ACROSS_W - horizontally add the eight 16-bit lanes of register %1.
  ;
  ; The words are interleaved with the zero register to become zero-extended
  ; dwords, the two halves are added, and the remaining four dwords are reduced
  ; by SUM_ACROSS_Q. On exit the total is in the low qword of %1.
  ;
  ; Requires: xmm0 == 0.
  ; Scratch:  xmm1 here, plus xmm2 inside SUM_ACROSS_Q.
  %macro SUM_ACROSS_W 1
      movdqa      xmm1, %1
      punpckhwd   xmm1, xmm0              ; words 4..7 -> four dwords
      punpcklwd   %1,   xmm0              ; words 0..3 -> four dwords
      paddd       %1,   xmm1              ; four 32-bit partial sums
      SUM_ACROSS_Q %1
  %endmacro
  43. SECTION .text
  ;void vpx_ssim_parms_16x16_sse2(
  ;    unsigned char *s,
  ;    int sp,
  ;    unsigned char *r,
  ;    int rp,
  ;    uint32_t *sum_s,
  ;    uint32_t *sum_r,
  ;    uint32_t *sum_sq_s,
  ;    uint32_t *sum_sq_r,
  ;    uint32_t *sum_sxr);
  ;
  ; Reads 16 rows of 16 bytes from s (row stride sp) and from r (row stride rp)
  ; and writes five 32-bit totals through the output pointers:
  ;    *sum_s    = sum of s[i]
  ;    *sum_r    = sum of r[i]
  ;    *sum_sq_s = sum of s[i] * s[i]
  ;    *sum_sq_r = sum of r[i] * r[i]
  ;    *sum_sxr  = sum of s[i] * r[i]
  ; The outputs are overwritten, not accumulated into.
  ;
  ; Register use inside the loop:
  ;    rsi = current s row        rcx = sp (sign-extended to 64 bits)
  ;    rdi = current r row        rax = rp (sign-extended to 64 bits)
  ;    rdx = rows remaining       xmm0 = all zero (unpack / widen helper)
  ;    xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
  ;    xmm1..xmm6   = scratch
  ;
  ; Argument access (arg), SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS and
  ; SAVE_XMM/RESTORE_XMM are provided by vpx_ports/x86_abi_support.asm.
  ;
  ; TODO: Use parm passing through structure, probably don't need the pxors
  ; ( calling app will initialize to 0 ) could easily fit everything in sse2
  ; without too much hassle, and can probably do better estimates with psadbw
  ; or pavgb. At this point this is just meant to be a first pass for
  ; calculating all the parms needed for 16x16 ssim so we can play with dssim
  ; as distortion in mode selection code.
  global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
  sym(vpx_ssim_parms_16x16_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 9
      SAVE_XMM 15
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)             ; s
      ; sp and rp are C ints: only the low 32 bits of their argument slots are
      ; defined, so sign-extend them instead of loading the full 64 bits.
      movsxd      rcx, dword arg(1)       ; sp
      mov         rdi, arg(2)             ; r
      movsxd      rax, dword arg(3)       ; rp

      pxor        xmm0,  xmm0             ; zero, used for unpacking
      pxor        xmm15, xmm15            ; sum_s
      pxor        xmm14, xmm14            ; sum_r
      pxor        xmm13, xmm13            ; sum_sq_s
      pxor        xmm12, xmm12            ; sum_sq_r
      pxor        xmm11, xmm11            ; sum_sxr

      mov         rdx, 16                 ; row counter

  .NextRow:
      ; grab 16 source and 16 reference pixels (no alignment assumed)
      movdqu      xmm5, [rsi]
      movdqu      xmm6, [rdi]

      ; upper 8 pixels of the row, zero-extended to words
      movdqa      xmm3, xmm5
      movdqa      xmm4, xmm6
      punpckhbw   xmm3, xmm0              ; high_s
      punpckhbw   xmm4, xmm0              ; high_r
      TABULATE_SSIM

      ; lower 8 pixels of the row, zero-extended to words
      movdqa      xmm3, xmm5
      movdqa      xmm4, xmm6
      punpcklbw   xmm3, xmm0              ; low_s
      punpcklbw   xmm4, xmm0              ; low_r
      TABULATE_SSIM

      add         rsi, rcx                ; next s row
      add         rdi, rax                ; next r row

      dec         rdx                     ; counter
      jnz         .NextRow

      ; Each word lane of xmm15/xmm14 received 32 values of at most 255, so the
      ; saturating word adds never reached their limit. Reduce every
      ; accumulator to a single total in its low lane.
      SUM_ACROSS_W xmm15
      SUM_ACROSS_W xmm14
      SUM_ACROSS_Q xmm13
      SUM_ACROSS_Q xmm12
      SUM_ACROSS_Q xmm11

      ; store the low 32 bits of each total through the output pointers
      mov         rdi, arg(4)
      movd        [rdi], xmm15            ; *sum_s
      mov         rdi, arg(5)
      movd        [rdi], xmm14            ; *sum_r
      mov         rdi, arg(6)
      movd        [rdi], xmm13            ; *sum_sq_s
      mov         rdi, arg(7)
      movd        [rdi], xmm12            ; *sum_sq_r
      mov         rdi, arg(8)
      movd        [rdi], xmm11            ; *sum_sxr

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vpx_ssim_parms_8x8_sse2(
  ;    unsigned char *s,
  ;    int sp,
  ;    unsigned char *r,
  ;    int rp,
  ;    uint32_t *sum_s,
  ;    uint32_t *sum_r,
  ;    uint32_t *sum_sq_s,
  ;    uint32_t *sum_sq_r,
  ;    uint32_t *sum_sxr);
  ;
  ; Reads 8 rows of 8 bytes from s (row stride sp) and from r (row stride rp)
  ; and writes five 32-bit totals through the output pointers:
  ;    *sum_s    = sum of s[i]
  ;    *sum_r    = sum of r[i]
  ;    *sum_sq_s = sum of s[i] * s[i]
  ;    *sum_sq_r = sum of r[i] * r[i]
  ;    *sum_sxr  = sum of s[i] * r[i]
  ; The outputs are overwritten, not accumulated into.
  ;
  ; Register use inside the loop:
  ;    rsi = current s row        rcx = sp (sign-extended to 64 bits)
  ;    rdi = current r row        rax = rp (sign-extended to 64 bits)
  ;    rdx = rows remaining       xmm0 = all zero (unpack / widen helper)
  ;    xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
  ;    xmm1..xmm4   = scratch
  ;
  ; Argument access (arg), SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS and
  ; SAVE_XMM/RESTORE_XMM are provided by vpx_ports/x86_abi_support.asm.
  ;
  ; TODO: Use parm passing through structure, probably don't need the pxors
  ; ( calling app will initialize to 0 ) could easily fit everything in sse2
  ; without too much hassle, and can probably do better estimates with psadbw
  ; or pavgb. At this point this is just meant to be a first pass for
  ; calculating all the parms needed for 8x8 ssim so we can play with dssim
  ; as distortion in mode selection code.
  global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
  sym(vpx_ssim_parms_8x8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 9
      SAVE_XMM 15
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)             ; s
      ; sp and rp are C ints: only the low 32 bits of their argument slots are
      ; defined, so sign-extend them instead of loading the full 64 bits.
      movsxd      rcx, dword arg(1)       ; sp
      mov         rdi, arg(2)             ; r
      movsxd      rax, dword arg(3)       ; rp

      pxor        xmm0,  xmm0             ; zero, used for unpacking
      pxor        xmm15, xmm15            ; sum_s
      pxor        xmm14, xmm14            ; sum_r
      pxor        xmm13, xmm13            ; sum_sq_s
      pxor        xmm12, xmm12            ; sum_sq_r
      pxor        xmm11, xmm11            ; sum_sxr

      mov         rdx, 8                  ; row counter

  .NextRow:
      ; grab 8 source and 8 reference pixels; movq clears the upper 64 bits
      movq        xmm3, [rsi]
      movq        xmm4, [rdi]
      punpcklbw   xmm3, xmm0              ; low_s, zero-extended to words
      punpcklbw   xmm4, xmm0              ; low_r, zero-extended to words
      TABULATE_SSIM

      add         rsi, rcx                ; next s row
      add         rdi, rax                ; next r row

      dec         rdx                     ; counter
      jnz         .NextRow

      ; Each word lane of xmm15/xmm14 received 8 values of at most 255, so the
      ; saturating word adds never reached their limit. Reduce every
      ; accumulator to a single total in its low lane.
      SUM_ACROSS_W xmm15
      SUM_ACROSS_W xmm14
      SUM_ACROSS_Q xmm13
      SUM_ACROSS_Q xmm12
      SUM_ACROSS_Q xmm11

      ; store the low 32 bits of each total through the output pointers
      mov         rdi, arg(4)
      movd        [rdi], xmm15            ; *sum_s
      mov         rdi, arg(5)
      movd        [rdi], xmm14            ; *sum_r
      mov         rdi, arg(6)
      movd        [rdi], xmm13            ; *sum_sq_s
      mov         rdi, arg(7)
      movd        [rdi], xmm12            ; *sum_sq_r
      mov         rdi, arg(8)
      movd        [rdi], xmm11            ; *sum_sxr

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret