2
0

sad_ssse3.asm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; PROCESS_16X2X3 first
  ;
  ; Consumes two 16-byte rows. Each source row is compared with psadbw
  ; against the reference row read at byte offsets +0, +1 and +2; the
  ; reference rows are fetched with unaligned lddqu loads.
  ;
  ;   %1 == 0 : both rows are added (paddw) into xmm5 / xmm6 / xmm7.
  ;   %1 != 0 : first step of a block - the first row overwrites
  ;             xmm5 / xmm6 / xmm7, the second row is added on top.
  ;
  ; In:    rsi = source row (read with movdqa), rax = source stride
  ;        rdi = reference row,                 rdx = reference stride
  ; Out:   xmm5 / xmm6 / xmm7 = psadbw sums for offsets +0 / +1 / +2
  ;        rsi and rdi advanced by two rows
  ; Clobb: xmm0-xmm3
  ;-----------------------------------------------------------------------
  %macro PROCESS_16X2X3 1
      ; ---- first row of the pair ----
      movdqa      xmm0, XMMWORD PTR [rsi]       ; source row, shared by all three SADs
  %if %1 == 0
      ; accumulate: compute in xmm1-xmm3, then add into the running sums
      lddqu       xmm1, XMMWORD PTR [rdi]
      lddqu       xmm2, XMMWORD PTR [rdi+1]
      lddqu       xmm3, XMMWORD PTR [rdi+2]

      psadbw      xmm1, xmm0
      psadbw      xmm2, xmm0
      psadbw      xmm3, xmm0

      paddw       xmm5, xmm1
      paddw       xmm6, xmm2
      paddw       xmm7, xmm3
  %else
      ; initialise: the SADs land directly in the accumulator registers
      lddqu       xmm5, XMMWORD PTR [rdi]
      lddqu       xmm6, XMMWORD PTR [rdi+1]
      lddqu       xmm7, XMMWORD PTR [rdi+2]

      psadbw      xmm5, xmm0
      psadbw      xmm6, xmm0
      psadbw      xmm7, xmm0
  %endif

      ; ---- second row of the pair (always accumulated) ----
      movdqa      xmm0, XMMWORD PTR [rsi+rax]
      lddqu       xmm1, XMMWORD PTR [rdi+rdx]
      lddqu       xmm2, XMMWORD PTR [rdi+rdx+1]
      lddqu       xmm3, XMMWORD PTR [rdi+rdx+2]

      lea         rsi, [rsi+rax*2]              ; step both pointers two rows down
      lea         rdi, [rdi+rdx*2]

      psadbw      xmm1, xmm0
      psadbw      xmm2, xmm0
      psadbw      xmm3, xmm0

      paddw       xmm5, xmm1
      paddw       xmm6, xmm2
      paddw       xmm7, xmm3
  %endmacro
  ;-----------------------------------------------------------------------
  ; PROCESS_16X2X3_OFFSET first, shift
  ;
  ; Consumes two 16-byte rows. For each row the reference is fetched with
  ; two aligned 16-byte loads ([rdi] and [rdi+16]); the three candidate
  ; rows are then extracted from that 32-byte window with palignr at byte
  ; shifts %2, %2+1 and %2+2 and compared against the source row (psadbw).
  ;
  ;   %1 == 0 : both rows are added (paddw) into xmm5 / xmm6 / xmm7.
  ;   %1 != 0 : first step of a block - the first row overwrites
  ;             xmm5 / xmm6 / xmm7, the second row is added on top.
  ;   %2      : byte shift of the first candidate inside the window.
  ;
  ; In:    rsi = source row (movdqa),              rax = source stride
  ;        rdi = reference window start (movdqa),  rdx = reference stride
  ;        (movdqa requires rsi, rsi+rax, rdi and rdi+rdx to be 16-byte
  ;        aligned)
  ; Out:   xmm5 / xmm6 / xmm7 = psadbw sums for shifts %2 / %2+1 / %2+2
  ;        rsi and rdi advanced by two rows
  ; Clobb: xmm0-xmm4
  ;-----------------------------------------------------------------------
  %macro PROCESS_16X2X3_OFFSET 2
      ; ---- first row of the pair ----
      movdqa      xmm0, XMMWORD PTR [rsi]       ; source row
      movdqa      xmm4, XMMWORD PTR [rdi]       ; low half of the reference window
  %if %1 == 0
      ; accumulate: build the candidates in xmm1-xmm3
      movdqa      xmm3, XMMWORD PTR [rdi+16]    ; high half of the reference window
      movdqa      xmm1, xmm3
      palignr     xmm1, xmm4, %2
      movdqa      xmm2, xmm3
      palignr     xmm2, xmm4, (%2+1)
      palignr     xmm3, xmm4, (%2+2)

      psadbw      xmm1, xmm0
      psadbw      xmm2, xmm0
      psadbw      xmm3, xmm0

      paddw       xmm5, xmm1
      paddw       xmm6, xmm2
      paddw       xmm7, xmm3
  %else
      ; initialise: build the candidates directly in the accumulators
      movdqa      xmm7, XMMWORD PTR [rdi+16]    ; high half of the reference window
      movdqa      xmm5, xmm7
      palignr     xmm5, xmm4, %2
      movdqa      xmm6, xmm7
      palignr     xmm6, xmm4, (%2+1)
      palignr     xmm7, xmm4, (%2+2)

      psadbw      xmm5, xmm0
      psadbw      xmm6, xmm0
      psadbw      xmm7, xmm0
  %endif

      ; ---- second row of the pair (always accumulated) ----
      movdqa      xmm0, XMMWORD PTR [rsi+rax]
      movdqa      xmm4, XMMWORD PTR [rdi+rdx]
      movdqa      xmm3, XMMWORD PTR [rdi+rdx+16]
      movdqa      xmm1, xmm3
      palignr     xmm1, xmm4, %2
      movdqa      xmm2, xmm3
      palignr     xmm2, xmm4, (%2+1)
      palignr     xmm3, xmm4, (%2+2)

      lea         rsi, [rsi+rax*2]              ; step both pointers two rows down
      lea         rdi, [rdi+rdx*2]

      psadbw      xmm1, xmm0
      psadbw      xmm2, xmm0
      psadbw      xmm3, xmm0

      paddw       xmm5, xmm1
      paddw       xmm6, xmm2
      paddw       xmm7, xmm3
  %endmacro
  ;-----------------------------------------------------------------------
  ; PROCESS_16X16X3_OFFSET misalign, label_prefix
  ;
  ; Emits the handler "<label_prefix>_aligned_by_<misalign>" for a 16-row
  ; block: rdi is moved back by %1 bytes, eight PROCESS_16X2X3_OFFSET steps
  ; (two rows each) are run with byte shift %1, and control then jumps to
  ; "<label_prefix>_store_off".
  ;
  ;   %1 : number of bytes subtracted from rdi, also used as palignr shift
  ;   %2 : label prefix of the enclosing function
  ;-----------------------------------------------------------------------
  %macro PROCESS_16X16X3_OFFSET 2
  %2_aligned_by_%1:
      sub         rdi, %1                       ; rewind rdi by %1 bytes

      PROCESS_16X2X3_OFFSET 1, %1               ; rows 0-1: initialise xmm5-xmm7
  %rep 7                                        ; rows 2-15: accumulate
      PROCESS_16X2X3_OFFSET 0, %1
  %endrep

      jmp         %2_store_off
  %endmacro
  ;-----------------------------------------------------------------------
  ; PROCESS_16X8X3_OFFSET misalign, label_prefix
  ;
  ; Emits the handler "<label_prefix>_aligned_by_<misalign>" for an 8-row
  ; block: rdi is moved back by %1 bytes, four PROCESS_16X2X3_OFFSET steps
  ; (two rows each) are run with byte shift %1, and control then jumps to
  ; "<label_prefix>_store_off".
  ;
  ;   %1 : number of bytes subtracted from rdi, also used as palignr shift
  ;   %2 : label prefix of the enclosing function
  ;-----------------------------------------------------------------------
  %macro PROCESS_16X8X3_OFFSET 2
  %2_aligned_by_%1:
      sub         rdi, %1                       ; rewind rdi by %1 bytes

      PROCESS_16X2X3_OFFSET 1, %1               ; rows 0-1: initialise xmm5-xmm7
  %rep 3                                        ; rows 2-7: accumulate
      PROCESS_16X2X3_OFFSET 0, %1
  %endrep

      jmp         %2_store_off
  %endmacro
  113. SECTION .text
  ;-----------------------------------------------------------------------
  ; vpx_sad16x16x3_ssse3
  ;
  ; For a 16-byte-wide, 16-row source block, computes three sums of
  ; absolute differences (psadbw) - against the reference block starting
  ; at ref_ptr+0, ref_ptr+1 and ref_ptr+2 - and stores them as 32-bit
  ; values in results[0], results[1] and results[2].
  ;
  ; Source rows are read with movdqa, so each source row must be 16-byte
  ; aligned. The reference pointer may have any alignment: its low four
  ; bits select one of 16 handlers through a jump table.
  ;
  ; Register roles after the dispatch:
  ;   rsi = source row, rax = src_stride, rdi = reference row,
  ;   rdx = ref_stride, xmm5 / xmm6 / xmm7 = running sums.
  ;
  ; NOTE(review): sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK, SAVE_XMM,
  ; RESTORE_XMM and UNSHADOW_ARGS are not defined in this file; presumably
  ; they come from vpx_ports/x86_abi_support.asm - confirm there.
  ;-----------------------------------------------------------------------
  114. ;void vpx_sad16x16x3_ssse3(
  115. ; unsigned char *src_ptr,
  116. ; int src_stride,
  117. ; unsigned char *ref_ptr,
  118. ; int ref_stride,
  119. ; int *results)
  120. global sym(vpx_sad16x16x3_ssse3) PRIVATE
  121. sym(vpx_sad16x16x3_ssse3):
  122. push rbp
  123. mov rbp, rsp
  124. SHADOW_ARGS_TO_STACK 5
  125. SAVE_XMM 7
  126. push rsi
  127. push rdi
  128. push rcx
  129. ; end prolog
  130. mov rsi, arg(0) ;src_ptr
  131. mov rdi, arg(2) ;ref_ptr
  ; rdx = ref_ptr & 15: index into the jump table below
  132. mov rdx, 0xf
  133. and rdx, rdi
  ; the table is data placed in the instruction stream, so jump over it
  134. jmp .vpx_sad16x16x3_ssse3_skiptable
  ; 16 dword entries: distance of each handler from the do_jump label
  135. .vpx_sad16x16x3_ssse3_jumptable:
  136. dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
  137. dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
  138. dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
  139. dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
  140. dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
  141. dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
  142. dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
  143. dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
  144. dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
  145. dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
  146. dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
  147. dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
  148. dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
  149. dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
  150. dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
  151. dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
  152. .vpx_sad16x16x3_ssse3_skiptable:
  ; call pushes the address of the next instruction (do_jump) and the pop
  ; retrieves it, giving a run-time base address without absolute relocations
  153. call .vpx_sad16x16x3_ssse3_do_jump
  154. .vpx_sad16x16x3_ssse3_do_jump:
  155. pop rcx ; get the address of do_jump
  156. mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
  157. add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
  158. movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
  ; rcx = do_jump + offset = absolute address of the selected handler
  159. add rcx, rax
  ; rax and rdx are free again from here on and are reused for the strides
  160. movsxd rax, dword ptr arg(1) ;src_stride
  161. movsxd rdx, dword ptr arg(3) ;ref_stride
  162. jmp rcx
  ; handlers for misalignments 0..14: each rewinds rdi to the 16-byte
  ; boundary, uses aligned loads + palignr, and ends with jmp to store_off
  163. PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
  164. PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
  165. PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
  166. PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
  167. PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
  168. PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
  169. PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
  170. PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
  171. PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
  172. PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
  173. PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
  174. PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
  175. PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
  176. PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
  177. PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
  ; misalignment 15 uses the unaligned-load variant (8 steps x 2 rows) and
  ; falls through into store_off. Presumably the palignr path is not used
  ; here because the third candidate would need a shift of 17, i.e. a byte
  ; beyond the 32-byte window - confirm.
  178. .vpx_sad16x16x3_ssse3_aligned_by_15:
  179. PROCESS_16X2X3 1
  180. PROCESS_16X2X3 0
  181. PROCESS_16X2X3 0
  182. PROCESS_16X2X3 0
  183. PROCESS_16X2X3 0
  184. PROCESS_16X2X3 0
  185. PROCESS_16X2X3 0
  186. PROCESS_16X2X3 0
  ; common tail: each accumulator holds one partial sum per 64-bit half;
  ; add the two halves and store the low 32 bits of the result
  187. .vpx_sad16x16x3_ssse3_store_off:
  188. mov rdi, arg(4) ;Results
  189. movq xmm0, xmm5
  190. psrldq xmm5, 8
  191. paddw xmm0, xmm5
  192. movd [rdi], xmm0
  193. ;- results[1] (reference offset +1)
  194. movq xmm0, xmm6
  195. psrldq xmm6, 8
  196. paddw xmm0, xmm6
  197. movd [rdi+4], xmm0
  198. ;- results[2] (reference offset +2)
  199. movq xmm0, xmm7
  200. psrldq xmm7, 8
  201. paddw xmm0, xmm7
  202. movd [rdi+8], xmm0
  203. ; begin epilog
  ; pops mirror the prolog pushes in reverse order
  204. pop rcx
  205. pop rdi
  206. pop rsi
  207. RESTORE_XMM
  208. UNSHADOW_ARGS
  209. pop rbp
  210. ret
  ;-----------------------------------------------------------------------
  ; vpx_sad16x8x3_ssse3
  ;
  ; For a 16-byte-wide, 8-row source block, computes three sums of
  ; absolute differences (psadbw) - against the reference block starting
  ; at ref_ptr+0, ref_ptr+1 and ref_ptr+2 - and stores them as 32-bit
  ; values in results[0], results[1] and results[2].
  ;
  ; Source rows are read with movdqa, so each source row must be 16-byte
  ; aligned. The reference pointer may have any alignment: its low four
  ; bits select one of 16 handlers through a jump table.
  ;
  ; Register roles after the dispatch:
  ;   rsi = source row, rax = src_stride, rdi = reference row,
  ;   rdx = ref_stride, xmm5 / xmm6 / xmm7 = running sums.
  ;
  ; NOTE(review): sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK, SAVE_XMM,
  ; RESTORE_XMM and UNSHADOW_ARGS are not defined in this file; presumably
  ; they come from vpx_ports/x86_abi_support.asm - confirm there.
  ;-----------------------------------------------------------------------
  211. ;void vpx_sad16x8x3_ssse3(
  212. ; unsigned char *src_ptr,
  213. ; int src_stride,
  214. ; unsigned char *ref_ptr,
  215. ; int ref_stride,
  216. ; int *results)
  217. global sym(vpx_sad16x8x3_ssse3) PRIVATE
  218. sym(vpx_sad16x8x3_ssse3):
  219. push rbp
  220. mov rbp, rsp
  221. SHADOW_ARGS_TO_STACK 5
  222. SAVE_XMM 7
  223. push rsi
  224. push rdi
  225. push rcx
  226. ; end prolog
  227. mov rsi, arg(0) ;src_ptr
  228. mov rdi, arg(2) ;ref_ptr
  ; rdx = ref_ptr & 15: index into the jump table below
  229. mov rdx, 0xf
  230. and rdx, rdi
  ; the table is data placed in the instruction stream, so jump over it
  231. jmp .vpx_sad16x8x3_ssse3_skiptable
  ; 16 dword entries: distance of each handler from the do_jump label
  232. .vpx_sad16x8x3_ssse3_jumptable:
  233. dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
  234. dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
  235. dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
  236. dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
  237. dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
  238. dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
  239. dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
  240. dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
  241. dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
  242. dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
  243. dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
  244. dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
  245. dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
  246. dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
  247. dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
  248. dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
  249. .vpx_sad16x8x3_ssse3_skiptable:
  ; call pushes the address of the next instruction (do_jump) and the pop
  ; retrieves it, giving a run-time base address without absolute relocations
  250. call .vpx_sad16x8x3_ssse3_do_jump
  251. .vpx_sad16x8x3_ssse3_do_jump:
  252. pop rcx ; get the address of do_jump
  253. mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
  254. add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
  255. movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
  ; rcx = do_jump + offset = absolute address of the selected handler
  256. add rcx, rax
  ; rax and rdx are free again from here on and are reused for the strides
  257. movsxd rax, dword ptr arg(1) ;src_stride
  258. movsxd rdx, dword ptr arg(3) ;ref_stride
  259. jmp rcx
  ; handlers for misalignments 0..14: each rewinds rdi to the 16-byte
  ; boundary, uses aligned loads + palignr, and ends with jmp to store_off
  260. PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
  261. PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
  262. PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
  263. PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
  264. PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
  265. PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
  266. PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
  267. PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
  268. PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
  269. PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
  270. PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
  271. PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
  272. PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
  273. PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
  274. PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
  ; misalignment 15 uses the unaligned-load variant (4 steps x 2 rows) and
  ; falls through into store_off. Presumably the palignr path is not used
  ; here because the third candidate would need a shift of 17, i.e. a byte
  ; beyond the 32-byte window - confirm.
  275. .vpx_sad16x8x3_ssse3_aligned_by_15:
  276. PROCESS_16X2X3 1
  277. PROCESS_16X2X3 0
  278. PROCESS_16X2X3 0
  279. PROCESS_16X2X3 0
  ; common tail: each accumulator holds one partial sum per 64-bit half;
  ; add the two halves and store the low 32 bits of the result
  280. .vpx_sad16x8x3_ssse3_store_off:
  281. mov rdi, arg(4) ;Results
  282. movq xmm0, xmm5
  283. psrldq xmm5, 8
  284. paddw xmm0, xmm5
  285. movd [rdi], xmm0
  286. ;- results[1] (reference offset +1)
  287. movq xmm0, xmm6
  288. psrldq xmm6, 8
  289. paddw xmm0, xmm6
  290. movd [rdi+4], xmm0
  291. ;- results[2] (reference offset +2)
  292. movq xmm0, xmm7
  293. psrldq xmm7, 8
  294. paddw xmm0, xmm7
  295. movd [rdi+8], xmm0
  296. ; begin epilog
  ; pops mirror the prolog pushes in reverse order
  297. pop rcx
  298. pop rdi
  299. pop rsi
  300. RESTORE_XMM
  301. UNSHADOW_ARGS
  302. pop rbp
  303. ret