; vpx_subpixel_bilinear_ssse3.asm
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; GET_PARAM_4 -- argument/constant setup for the 4-pixel-wide kernels.
  ;
  ; Reads:  arg(0) src_ptr         arg(1) pixels_per_line (32-bit, sign-ext.)
  ;         arg(2) output_ptr      arg(3) out_pitch       (32-bit, sign-ext.)
  ;         arg(4) output_height   arg(5) filter ptr
  ; Sets:   rsi = src_ptr, rdi = output_ptr, rax = pixels_per_line,
  ;         rdx = out_pitch, rcx = output_height
  ;         xmm3 = low four words each hold the byte pair (k3, k4)
  ;         xmm2 = 0x0100 in every word (pmulhrsw multiplier)
  ; Notes:  the filter is fetched with movdqa, so arg(5) must be 16-byte
  ;         aligned; packsswb saturates each tap to a signed byte.
  ;-----------------------------------------------------------------------
  %macro GET_PARAM_4 0
      ; Filter taps first; rdx is only a temporary for the filter pointer.
      mov         rdx, arg(5)
      movdqa      xmm3, [rdx]
      psrldq      xmm3, 6                     ; discard words 0..2: word0 = k3, word1 = k4
      packsswb    xmm3, xmm3                  ; words -> signed bytes: k3, k4, ...
      pshuflw     xmm3, xmm3, 0               ; replicate (k3, k4) over the low 4 words

      ; Rounding multiplier; ecx is reused for output_height further down.
      mov         ecx, 0x01000100
      movd        xmm2, ecx
      pshufd      xmm2, xmm2, 0               ; 0x0100 in all eight words

      ; Pointers, strides and the row counter.
      mov         rsi, arg(0)
      mov         rdi, arg(2)
      movsxd      rax, DWORD PTR arg(1)
      movsxd      rdx, DWORD PTR arg(3)
      movsxd      rcx, DWORD PTR arg(4)
  %endmacro
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_4 avg -- filter one row of 4 pixels and step to the next row.
  ;
  ; In:     xmm0 = first-tap source bytes, xmm1 = second-tap source bytes
  ;         xmm3 = (k3, k4) byte pairs, xmm2 = 0x0100 words (see GET_PARAM_4)
  ;         rsi/rdi = current source/destination row, rax/rdx = their strides
  ;         rcx = rows remaining
  ; Param:  %1 != 0 averages the result with the bytes already at [rdi]
  ; Out:    4 bytes written to [rdi]; rsi += rax; rdi += rdx; rcx -= 1
  ;         ZF reflects the decremented rcx (lea does not touch flags)
  ; Clobb:  xmm0, xmm1 (only when %1 != 0), flags
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_4 1
      punpcklbw   xmm0, xmm1                  ; interleave: a0 b0 a1 b1 ...
      pmaddubsw   xmm0, xmm3                  ; a*k3 + b*k4 per pixel (16-bit)
      pmulhrsw    xmm0, xmm2                  ; (x + 64) >> 7
      lea         rsi, [rsi + rax]            ; next source row
      packuswb    xmm0, xmm0                  ; saturate to unsigned bytes
  %if %1
      movd        xmm1, [rdi]                 ; existing destination pixels
      pavgb       xmm0, xmm1                  ; rounded average with them
  %endif
      movd        [rdi], xmm0
      lea         rdi, [rdi + rdx]            ; next destination row
      dec         rcx                         ; caller branches on ZF
  %endmacro
  ;-----------------------------------------------------------------------
  ; GET_PARAM -- argument/constant setup for the 8- and 16-pixel-wide kernels.
  ;
  ; Reads:  arg(0) src_ptr         arg(1) pixels_per_line (32-bit, sign-ext.)
  ;         arg(2) output_ptr      arg(3) out_pitch       (32-bit, sign-ext.)
  ;         arg(4) output_height   arg(5) filter ptr
  ; Sets:   rsi = src_ptr, rdi = output_ptr, rax = pixels_per_line,
  ;         rdx = out_pitch, rcx = output_height
  ;         xmm7 = byte pair (k3, k4) in all eight words
  ;         xmm6 = 0x0100 in all eight words (pmulhrsw multiplier)
  ; Notes:  the filter is fetched with movdqa, so arg(5) must be 16-byte
  ;         aligned; packsswb saturates each tap to a signed byte.
  ;-----------------------------------------------------------------------
  %macro GET_PARAM 0
      ; Filter taps first; rdx is only a temporary for the filter pointer.
      mov         rdx, arg(5)
      movdqa      xmm7, [rdx]
      psrldq      xmm7, 6                     ; discard words 0..2: word0 = k3, word1 = k4
      packsswb    xmm7, xmm7                  ; words -> signed bytes: k3, k4, ...
      pshuflw     xmm7, xmm7, 0               ; (k3, k4) in the low 4 words
      punpcklwd   xmm7, xmm7                  ; ... widened to all 8 words

      ; Rounding multiplier; ecx is reused for output_height further down.
      mov         ecx, 0x01000100
      movd        xmm6, ecx
      pshufd      xmm6, xmm6, 0               ; 0x0100 in all eight words

      ; Pointers, strides and the row counter.
      mov         rsi, arg(0)
      mov         rdi, arg(2)
      movsxd      rax, DWORD PTR arg(1)
      movsxd      rdx, DWORD PTR arg(3)
      movsxd      rcx, DWORD PTR arg(4)
  %endmacro
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_8 avg -- filter one row of 8 pixels and step to the next row.
  ;
  ; In:     xmm0 = first-tap source bytes, xmm1 = second-tap source bytes
  ;         (low 8 bytes of each are used)
  ;         xmm7 = (k3, k4) byte pairs, xmm6 = 0x0100 words (see GET_PARAM)
  ;         rsi/rdi = current source/destination row, rax/rdx = their strides
  ;         rcx = rows remaining
  ; Param:  %1 != 0 averages the result with the bytes already at [rdi]
  ; Out:    8 bytes written to [rdi]; rsi += rax; rdi += rdx; rcx -= 1
  ;         ZF reflects the decremented rcx (lea does not touch flags)
  ; Clobb:  xmm0, xmm1 (only when %1 != 0), flags
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_8 1
      punpcklbw   xmm0, xmm1                  ; interleave: a0 b0 a1 b1 ...
      pmaddubsw   xmm0, xmm7                  ; a*k3 + b*k4 per pixel (16-bit)
      pmulhrsw    xmm0, xmm6                  ; (x + 64) >> 7
      lea         rsi, [rsi + rax]            ; next source row
      packuswb    xmm0, xmm0                  ; saturate to unsigned bytes
  %if %1
      movq        xmm1, [rdi]                 ; existing destination pixels
      pavgb       xmm0, xmm1                  ; rounded average with them
  %endif
      movq        [rdi], xmm0
      lea         rdi, [rdi + rdx]            ; next destination row
      dec         rcx                         ; caller branches on ZF
  %endmacro
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_16 avg -- filter one row of 16 pixels and step to the next row.
  ;
  ; In:     xmm0 = first-tap source bytes, xmm2 = copy of xmm0
  ;         xmm1 = second-tap source bytes
  ;         xmm7 = (k3, k4) byte pairs, xmm6 = 0x0100 words (see GET_PARAM)
  ;         rsi/rdi = current source/destination row, rax/rdx = their strides
  ;         rcx = rows remaining
  ; Param:  %1 != 0 averages the result with the bytes already at [rdi]
  ; Out:    16 bytes written to [rdi] (unaligned store); rsi += rax;
  ;         rdi += rdx; rcx -= 1; ZF reflects the decremented rcx
  ; Clobb:  xmm0, xmm2, xmm1 (only when %1 != 0), flags
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_16 1
      ; Low eight pixels in xmm0, high eight in xmm2.
      punpcklbw   xmm0, xmm1
      punpckhbw   xmm2, xmm1

      pmaddubsw   xmm0, xmm7                  ; a*k3 + b*k4 per pixel (16-bit)
      pmaddubsw   xmm2, xmm7

      pmulhrsw    xmm0, xmm6                  ; (x + 64) >> 7
      pmulhrsw    xmm2, xmm6

      lea         rsi, [rsi + rax]            ; next source row
      packuswb    xmm0, xmm2                  ; both halves back to 16 bytes
  %if %1
      movdqu      xmm1, [rdi]                 ; existing destination pixels
      pavgb       xmm0, xmm1                  ; rounded average with them
  %endif
      movdqu      [rdi], xmm0
      lea         rdi, [rdi + rdx]            ; next destination row
      dec         rcx                         ; caller branches on ZF
  %endmacro
  SECTION .text

  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_v2_ssse3
  ; Vertical 2-tap filter, 4 pixels per row.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Each output row combines the source row at rsi with the row one
  ; pixels_per_line below it, then overwrites 4 bytes at the destination.
  ; The row counter is tested only after a row has been processed, so
  ; output_height must be non-zero.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
  sym(vpx_filter_block1d4_v2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4

  .next_row:
      movd        xmm1, [rsi + rax]           ; row below (second tap)
      movd        xmm0, [rsi]                 ; current row (first tap)
      APPLY_FILTER_4 0                        ; store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_v2_ssse3
  ; Vertical 2-tap filter, 8 pixels per row.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Each output row combines the source row at rsi with the row one
  ; pixels_per_line below it, then overwrites 8 bytes at the destination.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
  sym(vpx_filter_block1d8_v2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movq        xmm1, [rsi + rax]           ; row below (second tap)
      movq        xmm0, [rsi]                 ; current row (first tap)
      APPLY_FILTER_8 0                        ; store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_v2_ssse3
  ; Vertical 2-tap filter, 16 pixels per row.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Each output row combines the source row at rsi with the row one
  ; pixels_per_line below it, then overwrites 16 bytes at the destination.
  ; Source and destination rows are accessed with unaligned moves.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
  sym(vpx_filter_block1d16_v2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; current row (first tap)
      movdqa      xmm2, xmm0                  ; copy for the high-half unpack
      movdqu      xmm1, [rsi + rax]           ; row below (second tap)
      APPLY_FILTER_16 0                       ; store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_v2_avg_ssse3
  ; Vertical 2-tap filter, 4 pixels per row, averaged into the destination.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Each filtered row (source row at rsi plus the row one pixels_per_line
  ; below it) is combined with the 4 bytes already at the destination via
  ; pavgb before being stored.
  ; The row counter is tested only after a row has been processed, so
  ; output_height must be non-zero.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d4_v2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4

  .next_row:
      movd        xmm1, [rsi + rax]           ; row below (second tap)
      movd        xmm0, [rsi]                 ; current row (first tap)
      APPLY_FILTER_4 1                        ; average, store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_v2_avg_ssse3
  ; Vertical 2-tap filter, 8 pixels per row, averaged into the destination.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Each filtered row (source row at rsi plus the row one pixels_per_line
  ; below it) is combined with the 8 bytes already at the destination via
  ; pavgb before being stored.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d8_v2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movq        xmm1, [rsi + rax]           ; row below (second tap)
      movq        xmm0, [rsi]                 ; current row (first tap)
      APPLY_FILTER_8 1                        ; average, store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_v2_avg_ssse3
  ; Vertical 2-tap filter, 16 pixels per row, averaged into the destination.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Each filtered row (source row at rsi plus the row one pixels_per_line
  ; below it) is combined with the 16 bytes already at the destination via
  ; pavgb before being stored. Rows are accessed with unaligned moves.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d16_v2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; current row (first tap)
      movdqa      xmm2, xmm0                  ; copy for the high-half unpack
      movdqu      xmm1, [rsi + rax]           ; row below (second tap)
      APPLY_FILTER_16 1                       ; average, store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_h2_ssse3
  ; Horizontal 2-tap filter, 4 pixels per row.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Output pixel i combines source bytes i and i+1 of the same row, so a
  ; row consumes source bytes 0..4. Only those 5 bytes are read: two
  ; 4-byte loads at [rsi] and [rsi + 1] replace a 16-byte load plus byte
  ; shift, which touched 11 bytes beyond what the stored pixels depend on.
  ; The 4 stored bytes are unchanged, because APPLY_FILTER_4 stores only
  ; the low 4 result bytes and those derive from the low 4 bytes of each
  ; operand.
  ; The row counter is tested only after a row has been processed, so
  ; output_height must be non-zero.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
  sym(vpx_filter_block1d4_h2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4

  .next_row:
      movd        xmm0, [rsi]                 ; src[0..3] (first tap)
      movd        xmm1, [rsi + 1]             ; src[1..4] (second tap)
      APPLY_FILTER_4 0                        ; store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_h2_ssse3
  ; Horizontal 2-tap filter, 8 pixels per row.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Output pixel i combines source bytes i and i+1 of the same row, so a
  ; row consumes source bytes 0..8. Only those 9 bytes are read: two
  ; 8-byte loads at [rsi] and [rsi + 1] replace a 16-byte load plus byte
  ; shift, which touched 7 bytes beyond what the stored pixels depend on.
  ; The 8 stored bytes are unchanged, because APPLY_FILTER_8 uses only
  ; the low 8 bytes of each operand.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
  sym(vpx_filter_block1d8_h2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movq        xmm0, [rsi]                 ; src[0..7] (first tap)
      movq        xmm1, [rsi + 1]             ; src[1..8] (second tap)
      APPLY_FILTER_8 0                        ; store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_h2_ssse3
  ; Horizontal 2-tap filter, 16 pixels per row.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Output pixel i combines source bytes i and i+1 of the same row; the
  ; two unaligned 16-byte loads at [rsi] and [rsi + 1] read 17 source
  ; bytes per row. 16 bytes are overwritten at the destination.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
  sym(vpx_filter_block1d16_h2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; src[0..15] (first tap)
      movdqa      xmm2, xmm0                  ; copy for the high-half unpack
      movdqu      xmm1, [rsi + 1]             ; src[1..16] (second tap)
      APPLY_FILTER_16 0                       ; store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_h2_avg_ssse3
  ; Horizontal 2-tap filter, 4 pixels per row, averaged into the destination.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Output pixel i combines source bytes i and i+1 of the same row, so a
  ; row consumes source bytes 0..4. Only those 5 bytes are read: two
  ; 4-byte loads at [rsi] and [rsi + 1] replace a 16-byte load plus byte
  ; shift, which touched 11 bytes beyond what the stored pixels depend on.
  ; The 4 stored bytes are unchanged, because APPLY_FILTER_4 stores only
  ; the low 4 result bytes and those derive from the low 4 bytes of each
  ; operand. The filtered row is combined with the 4 bytes already at the
  ; destination via pavgb before being stored.
  ; The row counter is tested only after a row has been processed, so
  ; output_height must be non-zero.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d4_h2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4

  .next_row:
      movd        xmm0, [rsi]                 ; src[0..3] (first tap)
      movd        xmm1, [rsi + 1]             ; src[1..4] (second tap)
      APPLY_FILTER_4 1                        ; average, store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_h2_avg_ssse3
  ; Horizontal 2-tap filter, 8 pixels per row, averaged into the destination.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Output pixel i combines source bytes i and i+1 of the same row, so a
  ; row consumes source bytes 0..8. Only those 9 bytes are read: two
  ; 8-byte loads at [rsi] and [rsi + 1] replace a 16-byte load plus byte
  ; shift, which touched 7 bytes beyond what the stored pixels depend on.
  ; The 8 stored bytes are unchanged, because APPLY_FILTER_8 uses only
  ; the low 8 bytes of each operand. The filtered row is combined with
  ; the 8 bytes already at the destination via pavgb before being stored.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d8_h2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movq        xmm0, [rsi]                 ; src[0..7] (first tap)
      movq        xmm1, [rsi + 1]             ; src[1..8] (second tap)
      APPLY_FILTER_8 1                        ; average, store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_h2_avg_ssse3
  ; Horizontal 2-tap filter, 16 pixels per row, averaged into the destination.
  ;
  ; Args:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
  ; Output pixel i combines source bytes i and i+1 of the same row; the
  ; two unaligned 16-byte loads at [rsi] and [rsi + 1] read 17 source
  ; bytes per row. The filtered row is combined with the 16 bytes already
  ; at the destination via pavgb before being stored.
  ; xmm6/xmm7 hold the constants, hence SAVE_XMM 7 / RESTORE_XMM (both
  ; come from x86_abi_support.asm). output_height must be non-zero: the
  ; counter is tested only after a row has been processed.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d16_h2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; src[0..15] (first tap)
      movdqa      xmm2, xmm0                  ; copy for the high-half unpack
      movdqu      xmm1, [rsi + 1]             ; src[1..16] (second tap)
      APPLY_FILTER_16 1                       ; average, store, advance, dec rcx
      jnz         .next_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret