vpx_subpixel_bilinear_sse2.asm 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. ;
  2. ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; GET_PARAM_4
  ; Fetches the arguments and builds the constants used by the
  ; 4-pixel-wide kernels.  State on exit:
  ;   rsi  = arg(0)  source pointer
  ;   rdi  = arg(2)  destination pointer
  ;   rax  = arg(1)  source stride   (dword, widened with movsxd)
  ;   rdx  = arg(3)  output stride   (dword, widened with movsxd)
  ;   rcx  = arg(4)  number of rows  (dword, widened with movsxd)
  ;   xmm2 = all zero (used to widen bytes to words)
  ;   xmm3 = 0x0040 in each 16-bit lane (rounding term for the >> 7)
  ;   xmm4 = filter word 3 in lanes 0-3, filter word 4 in lanes 4-7
  ; The table at arg(5) is read with movdqa, so it must be 16-byte
  ; aligned.  rdx and rcx are used as temporaries before they receive
  ; their final values, hence the ordering below.
  ;-----------------------------------------------------------------------
  %macro GET_PARAM_4 0
      mov         rsi, arg(0)                 ; source pointer
      mov         rdi, arg(2)                 ; destination pointer

      mov         rdx, arg(5)                 ; filter table (temporary use of rdx)
      movdqa      xmm3, [rdx]                 ; all eight 16-bit filter words
      movsxd      rax, DWORD PTR arg(1)       ; source stride
      movsxd      rdx, DWORD PTR arg(3)       ; output stride (table already loaded)

      pxor        xmm2, xmm2                  ; zero for byte -> word unpack

      pshuflw     xmm4, xmm3, 0xff            ; lanes 0-3 = filter word 3
      psrldq      xmm3, 8                     ; bring words 4-7 into the low half
      pshuflw     xmm3, xmm3, 0               ; lanes 0-3 = filter word 4
      punpcklqdq  xmm4, xmm3                  ; word 3 x4 | word 4 x4

      mov         rcx, 0x00400040             ; two packed words of 64
      movq        xmm3, rcx
      pshufd      xmm3, xmm3, 0               ; replicate to all eight lanes
      movsxd      rcx, DWORD PTR arg(4)       ; row counter (constant already copied)
  %endm
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_4 avg
  ; Filters one row of four pixels and advances to the next row.
  ;   %1   : non-zero -> blend the result with the four bytes already at
  ;          [rdi] using pavgb; zero -> plain store.
  ; In    : low dword of xmm0 = first tap's four source bytes,
  ;         low dword of xmm1 = second tap's four source bytes,
  ;         xmm2 = 0, xmm3 = rounding words, xmm4 = tap words
  ;         (as left by GET_PARAM_4).
  ; Out   : four bytes written at [rdi]; rsi += rax; rdi += rdx;
  ;         rcx -= 1.  The flags from "dec rcx" are left for the caller's
  ;         conditional jump, so nothing that writes flags may follow it.
  ; Clobb : xmm0, xmm1
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_4 1
      lea         rsi, [rsi + rax]            ; source is already in xmm0/xmm1

      punpckldq   xmm0, xmm1                  ; low 8 bytes: tap-0 pixels, tap-1 pixels
      punpcklbw   xmm0, xmm2                  ; widen those 8 bytes to words
      pmullw      xmm0, xmm4                  ; scale each half by its tap

      movdqa      xmm1, xmm0
      psrldq      xmm1, 8                     ; tap-1 products down to lanes 0-3
      paddsw      xmm0, xmm1                  ; sum of the two taps
      paddsw      xmm0, xmm3                  ; + rounding
      psraw       xmm0, 7                     ; scale back down
      packuswb    xmm0, xmm0                  ; saturate to unsigned bytes

  %if %1
      movd        xmm1, [rdi]                 ; current destination pixels
      pavgb       xmm0, xmm1                  ; rounded average with them
  %endif
      movd        [rdi], xmm0

      lea         rdi, [rdi + rdx]            ; lea leaves flags untouched
      dec         rcx                         ; ZF consumed by the caller
  %endm
  ;-----------------------------------------------------------------------
  ; GET_PARAM
  ; Fetches the arguments and builds the constants used by the 8- and
  ; 16-pixel-wide kernels.  State on exit:
  ;   rsi  = arg(0)  source pointer
  ;   rdi  = arg(2)  destination pointer
  ;   rax  = arg(1)  source stride   (dword, widened with movsxd)
  ;   rdx  = arg(3)  output stride   (dword, widened with movsxd)
  ;   rcx  = arg(4)  number of rows  (dword, widened with movsxd)
  ;   xmm4 = 0x0040 in each 16-bit lane (rounding term for the >> 7)
  ;   xmm5 = all zero (used to widen bytes to words)
  ;   xmm6 = filter word 3 in all eight lanes
  ;   xmm7 = filter word 4 in all eight lanes
  ; The table at arg(5) is read with movdqa, so it must be 16-byte
  ; aligned.  rdx and rcx serve as temporaries before receiving their
  ; final values, hence the ordering below.
  ;-----------------------------------------------------------------------
  %macro GET_PARAM 0
      mov         rsi, arg(0)                 ; source pointer
      mov         rdi, arg(2)                 ; destination pointer

      mov         rdx, arg(5)                 ; filter table (temporary use of rdx)
      movdqa      xmm7, [rdx]                 ; all eight 16-bit filter words
      movsxd      rax, DWORD PTR arg(1)       ; source stride
      movsxd      rdx, DWORD PTR arg(3)       ; output stride (table already loaded)

      pxor        xmm5, xmm5                  ; zero for byte -> word unpack

      pshuflw     xmm6, xmm7, 0xff            ; lanes 0-3 = filter word 3
      punpcklwd   xmm6, xmm6                  ; spread to all eight lanes
      pshufhw     xmm7, xmm7, 0               ; lanes 4-7 = filter word 4
      punpckhwd   xmm7, xmm7                  ; spread to all eight lanes

      mov         rcx, 0x00400040             ; two packed words of 64
      movq        xmm4, rcx
      pshufd      xmm4, xmm4, 0               ; replicate to all eight lanes
      movsxd      rcx, DWORD PTR arg(4)       ; row counter (constant already copied)
  %endm
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_8 avg
  ; Filters one row of eight pixels and advances to the next row.
  ;   %1   : non-zero -> blend the result with the eight bytes already at
  ;          [rdi] using pavgb; zero -> plain store.
  ; In    : low 8 bytes of xmm0 = first tap's source bytes,
  ;         low 8 bytes of xmm1 = second tap's source bytes,
  ;         xmm4 = rounding words, xmm5 = 0, xmm6/xmm7 = tap words
  ;         (as left by GET_PARAM).
  ; Out   : eight bytes written at [rdi]; rsi += rax; rdi += rdx;
  ;         rcx -= 1.  The flags from "dec rcx" are left for the caller's
  ;         conditional jump, so nothing that writes flags may follow it.
  ; Clobb : xmm0, xmm1
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_8 1
      lea         rsi, [rsi + rax]            ; source is already in xmm0/xmm1

      punpcklbw   xmm0, xmm5                  ; first-tap bytes -> words
      pmullw      xmm0, xmm6
      punpcklbw   xmm1, xmm5                  ; second-tap bytes -> words
      pmullw      xmm1, xmm7

      paddsw      xmm0, xmm1                  ; sum of the two taps
      paddsw      xmm0, xmm4                  ; + rounding
      psraw       xmm0, 7                     ; scale back down
      packuswb    xmm0, xmm0                  ; saturate to unsigned bytes

  %if %1
      movq        xmm1, [rdi]                 ; current destination pixels
      pavgb       xmm0, xmm1                  ; rounded average with them
  %endif
      movq        [rdi], xmm0

      lea         rdi, [rdi + rdx]            ; lea leaves flags untouched
      dec         rcx                         ; ZF consumed by the caller
  %endm
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_16 avg
  ; Filters one row of sixteen pixels and advances to the next row.
  ;   %1   : non-zero -> blend the result with the sixteen bytes already
  ;          at [rdi] using pavgb; zero -> plain store.
  ; In    : xmm0 and xmm2 both hold the first tap's 16 source bytes,
  ;         xmm1 and xmm3 both hold the second tap's 16 source bytes
  ;         (the low halves of xmm0/xmm1 and the high halves of
  ;         xmm2/xmm3 are the parts actually consumed),
  ;         xmm4 = rounding words, xmm5 = 0, xmm6/xmm7 = tap words
  ;         (as left by GET_PARAM).
  ; Out   : sixteen bytes written at [rdi] with an unaligned store;
  ;         rsi += rax; rdi += rdx; rcx -= 1.  The flags from "dec rcx"
  ;         are left for the caller's conditional jump.
  ; Clobb : xmm0, xmm1, xmm2, xmm3
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_16 1
      lea         rsi, [rsi + rax]            ; source is already in registers

      ; pixels 0-7
      punpcklbw   xmm0, xmm5
      pmullw      xmm0, xmm6
      punpcklbw   xmm1, xmm5
      pmullw      xmm1, xmm7
      paddsw      xmm0, xmm1                  ; sum of the two taps
      paddsw      xmm0, xmm4                  ; + rounding
      psraw       xmm0, 7

      ; pixels 8-15
      punpckhbw   xmm2, xmm5
      pmullw      xmm2, xmm6
      punpckhbw   xmm3, xmm5
      pmullw      xmm3, xmm7
      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm4
      psraw       xmm2, 7

      packuswb    xmm0, xmm2                  ; both halves -> 16 saturated bytes

  %if %1
      movdqu      xmm1, [rdi]                 ; current destination pixels
      pavgb       xmm0, xmm1                  ; rounded average with them
  %endif
      movdqu      [rdi], xmm0

      lea         rdi, [rdi + rdx]            ; lea leaves flags untouched
      dec         rcx                         ; ZF consumed by the caller
  %endm
  SECTION .text

  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_v2_sse2
  ; Two-tap vertical filter, 4 pixels per row.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; Each output byte is ((row[x] * w3 + next_row[x] * w4 + 64) >> 7),
  ; saturated to 0..255, where w3/w4 are filter words 3 and 4.
  ; The loop tests the counter only after a row is written, so
  ; output_height must be at least 1.
  ; Uses xmm0-xmm4 only; rsi and rdi are saved and restored here.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
  sym(vpx_filter_block1d4_v2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi

      GET_PARAM_4

  .next_row:
      movd        xmm1, [rsi + rax]           ; row below
      movd        xmm0, [rsi]                 ; current row
      APPLY_FILTER_4 0                        ; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_v2_sse2
  ; Two-tap vertical filter, 8 pixels per row.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; Each output byte is ((row[x] * w3 + next_row[x] * w4 + 64) >> 7),
  ; saturated to 0..255, where w3/w4 are filter words 3 and 4.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
  sym(vpx_filter_block1d8_v2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movq        xmm1, [rsi + rax]           ; row below
      movq        xmm0, [rsi]                 ; current row
      APPLY_FILTER_8 0                        ; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_v2_sse2
  ; Two-tap vertical filter, 16 pixels per row.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; Each output byte is ((row[x] * w3 + next_row[x] * w4 + 64) >> 7),
  ; saturated to 0..255, where w3/w4 are filter words 3 and 4.
  ; Source and destination rows are accessed with unaligned moves.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
  sym(vpx_filter_block1d16_v2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; current row
      movdqa      xmm2, xmm0                  ; copy for the upper 8 pixels
      movdqu      xmm1, [rsi + rax]           ; row below
      movdqa      xmm3, xmm1                  ; copy for the upper 8 pixels
      APPLY_FILTER_16 0                       ; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_v2_avg_sse2
  ; Two-tap vertical filter, 4 pixels per row, averaged into the
  ; destination.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; The filtered byte ((row[x] * w3 + next_row[x] * w4 + 64) >> 7,
  ; saturated to 0..255) is combined with the byte already stored at
  ; the destination using pavgb, and the result is written back.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm4 only; rsi and rdi are saved and restored here.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
  sym(vpx_filter_block1d4_v2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi

      GET_PARAM_4

  .next_row:
      movd        xmm1, [rsi + rax]           ; row below
      movd        xmm0, [rsi]                 ; current row
      APPLY_FILTER_4 1                        ; averaging store; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_v2_avg_sse2
  ; Two-tap vertical filter, 8 pixels per row, averaged into the
  ; destination.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; The filtered byte ((row[x] * w3 + next_row[x] * w4 + 64) >> 7,
  ; saturated to 0..255) is combined with the byte already stored at
  ; the destination using pavgb, and the result is written back.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
  sym(vpx_filter_block1d8_v2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movq        xmm1, [rsi + rax]           ; row below
      movq        xmm0, [rsi]                 ; current row
      APPLY_FILTER_8 1                        ; averaging store; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_v2_avg_sse2
  ; Two-tap vertical filter, 16 pixels per row, averaged into the
  ; destination.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; The filtered byte ((row[x] * w3 + next_row[x] * w4 + 64) >> 7,
  ; saturated to 0..255) is combined with the byte already stored at
  ; the destination using pavgb, and the result is written back.
  ; Source and destination rows are accessed with unaligned moves.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
  sym(vpx_filter_block1d16_v2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; current row
      movdqa      xmm2, xmm0                  ; copy for the upper 8 pixels
      movdqu      xmm1, [rsi + rax]           ; row below
      movdqa      xmm3, xmm1                  ; copy for the upper 8 pixels
      APPLY_FILTER_16 1                       ; averaging store; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_h2_sse2
  ; Two-tap horizontal filter, 4 pixels per row.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; Each output byte is ((src[x] * w3 + src[x + 1] * w4 + 64) >> 7),
  ; saturated to 0..255, where w3/w4 are filter words 3 and 4.
  ; Note: every row is fetched with a full 16-byte unaligned load from
  ; the source even though only the first five bytes feed the result.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm4 only; rsi and rdi are saved and restored here.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
  sym(vpx_filter_block1d4_h2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi

      GET_PARAM_4

  .next_row:
      movdqu      xmm1, [rsi]                 ; 16 source bytes
      movdqa      xmm0, xmm1                  ; xmm0 = pixels x
      psrldq      xmm1, 1                     ; xmm1 = pixels x + 1
      APPLY_FILTER_4 0                        ; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_h2_sse2
  ; Two-tap horizontal filter, 8 pixels per row.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; Each output byte is ((src[x] * w3 + src[x + 1] * w4 + 64) >> 7),
  ; saturated to 0..255, where w3/w4 are filter words 3 and 4.
  ; Note: every row is fetched with a full 16-byte unaligned load from
  ; the source even though only the first nine bytes feed the result.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
  sym(vpx_filter_block1d8_h2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movdqu      xmm1, [rsi]                 ; 16 source bytes
      movdqa      xmm0, xmm1                  ; xmm0 = pixels x
      psrldq      xmm1, 1                     ; xmm1 = pixels x + 1
      APPLY_FILTER_8 0                        ; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_h2_sse2
  ; Two-tap horizontal filter, 16 pixels per row.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; Each output byte is ((src[x] * w3 + src[x + 1] * w4 + 64) >> 7),
  ; saturated to 0..255, where w3/w4 are filter words 3 and 4.
  ; Two overlapping 16-byte unaligned loads are issued per row, so 17
  ; source bytes are read from each row.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
  sym(vpx_filter_block1d16_h2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; pixels x
      movdqa      xmm2, xmm0                  ; copy for the upper 8 pixels
      movdqu      xmm1, [rsi + 1]             ; pixels x + 1
      movdqa      xmm3, xmm1                  ; copy for the upper 8 pixels
      APPLY_FILTER_16 0                       ; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_h2_avg_sse2
  ; Two-tap horizontal filter, 4 pixels per row, averaged into the
  ; destination.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; The filtered byte ((src[x] * w3 + src[x + 1] * w4 + 64) >> 7,
  ; saturated to 0..255) is combined with the byte already stored at
  ; the destination using pavgb, and the result is written back.
  ; Note: every row is fetched with a full 16-byte unaligned load from
  ; the source even though only the first five bytes feed the result.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm4 only; rsi and rdi are saved and restored here.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
  sym(vpx_filter_block1d4_h2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi

      GET_PARAM_4

  .next_row:
      movdqu      xmm1, [rsi]                 ; 16 source bytes
      movdqa      xmm0, xmm1                  ; xmm0 = pixels x
      psrldq      xmm1, 1                     ; xmm1 = pixels x + 1
      APPLY_FILTER_4 1                        ; averaging store; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_h2_avg_sse2
  ; Two-tap horizontal filter, 8 pixels per row, averaged into the
  ; destination.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; The filtered byte ((src[x] * w3 + src[x + 1] * w4 + 64) >> 7,
  ; saturated to 0..255) is combined with the byte already stored at
  ; the destination using pavgb, and the result is written back.
  ; Note: every row is fetched with a full 16-byte unaligned load from
  ; the source even though only the first nine bytes feed the result.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
  sym(vpx_filter_block1d8_h2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movdqu      xmm1, [rsi]                 ; 16 source bytes
      movdqa      xmm0, xmm1                  ; xmm0 = pixels x
      psrldq      xmm1, 1                     ; xmm1 = pixels x + 1
      APPLY_FILTER_8 1                        ; averaging store; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_h2_avg_sse2
  ; Two-tap horizontal filter, 16 pixels per row, averaged into the
  ; destination.
  ; Args (via the arg() macro of the included ABI helper file):
  ;   arg(0) src_ptr         arg(1) pixels_per_line (source stride)
  ;   arg(2) output_ptr      arg(3) out_pitch
  ;   arg(4) output_height   arg(5) filter ptr (16-byte aligned)
  ; The filtered byte ((src[x] * w3 + src[x + 1] * w4 + 64) >> 7,
  ; saturated to 0..255) is combined with the byte already stored at
  ; the destination using pavgb, and the result is written back.
  ; Two overlapping 16-byte unaligned loads are issued per row, so 17
  ; source bytes are read from each row.
  ; output_height must be at least 1 (the counter is tested after the
  ; first row has been written).
  ; Uses xmm0-xmm7; SAVE_XMM / RESTORE_XMM come from the include and
  ; presumably preserve xmm6-xmm7 where the ABI needs it - confirm there.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
  sym(vpx_filter_block1d16_h2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi

      GET_PARAM

  .next_row:
      movdqu      xmm0, [rsi]                 ; pixels x
      movdqa      xmm2, xmm0                  ; copy for the upper 8 pixels
      movdqu      xmm1, [rsi + 1]             ; pixels x + 1
      movdqa      xmm3, xmm1                  ; copy for the upper 8 pixels
      APPLY_FILTER_16 1                       ; averaging store; ends with dec rcx
      jnz         .next_row

      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret