; vpx_high_subpixel_bilinear_sse2.asm
  1. ;
  2. ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;---------------------------------------------------------------------
  ; HIGH_GET_PARAM_4
  ; Loads the arguments and constants used by the 4-pixel-wide kernels.
  ;
  ; Reads:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;          arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr,
  ;          arg(6) bd.  The filter is fetched with movdqa, so the
  ;          pointer must be 16-byte aligned.
  ; Leaves:  rsi = src, rdi = dst, rax = src pitch, rdx = dst pitch,
  ;          rcx = row count
  ;          xmm4 = filter words 3 and 4 interleaved (k3 k4 k3 k4 ...)
  ;          xmm3 = 64 in every dword (rounding term)
  ;          xmm5 = (1 << bd) - 1 in every word (upper clamp)
  ;          xmm2 = 0 (lower clamp)
  ; Scratch: xmm1
  ;---------------------------------------------------------------------
  %macro HIGH_GET_PARAM_4 0
      ; pointers
      mov         rsi, arg(0)                 ; src_ptr
      mov         rdi, arg(2)                 ; output_ptr

      ; filter taps: broadcast word 3 and word 4, then interleave them
      mov         rdx, arg(5)                 ; filter ptr
      movdqa      xmm3, [rdx]                 ; all eight filter words
      pshuflw     xmm4, xmm3, 0xff            ; low four words = k3
      psrldq      xmm3, 8                     ; bring k4 down to word 0
      pshuflw     xmm3, xmm3, 0               ; low four words = k4
      punpcklwd   xmm4, xmm3                  ; k3 k4 pairs in every dword

      ; rounding term, replaces the filter copy in xmm3
      mov         rcx, 64
      movq        xmm3, rcx
      pshufd      xmm3, xmm3, 0               ; 64 in every dword

      ; clamp bounds derived from the bit depth
      movsxd      rcx, DWORD PTR arg(6)       ; bd
      movq        xmm2, rcx                   ; shift count for psllw
      mov         rdx, 0x10001                ; two words of 1
      movq        xmm5, rdx
      pshufd      xmm5, xmm5, 0               ; 1 in every word
      movdqa      xmm1, xmm5                  ; keep the ones for the subtract
      psllw       xmm5, xmm2                  ; 1 << bd
      psubw       xmm5, xmm1                  ; max value (for clamping)
      pxor        xmm2, xmm2                  ; min value (for clamping)

      ; strides and row count
      movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
      movsxd      rcx, DWORD PTR arg(4)       ; output_height
  %endmacro
  ;---------------------------------------------------------------------
  ; HIGH_APPLY_FILTER_4 <avg>
  ; Filters four 16-bit samples and stores them at [rdi].
  ;
  ; In:   xmm0 = first tap inputs, xmm1 = second tap inputs (low 4 words)
  ;       xmm4 = interleaved taps, xmm3 = rounding, xmm5 / xmm2 = clamp
  ;       bounds, as prepared by HIGH_GET_PARAM_4
  ; %1:   non-zero -> average the result with what is already at [rdi]
  ;       (pavgw) before storing
  ; Out:  rsi and rdi advanced by one pitch (pitches counted in 16-bit
  ;       units), rcx decremented; the flags from that dec are left for
  ;       the caller's loop branch
  ; Clobbers: xmm0, xmm1
  ;---------------------------------------------------------------------
  %macro HIGH_APPLY_FILTER_4 1
      punpcklwd   xmm0, xmm1                  ; pair each sample with its neighbour
  %if %1
      movq        xmm1, [rdi]                 ; xmm1 is free now: fetch current dst
  %endif
      pmaddwd     xmm0, xmm4                  ; a*k3 + b*k4 per dword
      paddd       xmm0, xmm3                  ; + 64
      psrad       xmm0, 7                     ; >> 7
      packssdw    xmm0, xmm0                  ; back to words

      ; clamp: upper bound first, then lower bound
      pminsw      xmm0, xmm5
      pmaxsw      xmm0, xmm2
  %if %1
      pavgw       xmm0, xmm1                  ; rounded average with dst
  %endif
      movq        [rdi], xmm0                 ; write 4 samples

      lea         rsi, [rsi + 2*rax]          ; next source row
      lea         rdi, [rdi + 2*rdx]          ; next destination row
      dec         rcx                         ; sets ZF for the caller's jnz
  %endmacro
  54. %if ARCH_X86_64
  ;---------------------------------------------------------------------
  ; HIGH_GET_PARAM
  ; Loads the arguments and constants used by the 8- and 16-pixel-wide
  ; kernels (x86-64 only, since it uses xmm8).
  ;
  ; Reads:   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
  ;          arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr,
  ;          arg(6) bd.  The filter is fetched with movdqa, so the
  ;          pointer must be 16-byte aligned.
  ; Leaves:  rsi = src, rdi = dst, rax = src pitch, rdx = dst pitch,
  ;          rcx = row count
  ;          xmm7 = k3k4k3k4k3k4k3k4
  ;          xmm4 = 64 in every dword (rounding term)
  ;          xmm8 = (1 << bd) - 1 in every word (upper clamp)
  ;          xmm5 = 0 (lower clamp)
  ; Scratch: xmm1, xmm6
  ;---------------------------------------------------------------------
  %macro HIGH_GET_PARAM 0
      ; pointers
      mov         rsi, arg(0)                 ; src_ptr
      mov         rdi, arg(2)                 ; output_ptr

      ; filter taps
      mov         rdx, arg(5)                 ; filter ptr
      movdqa      xmm6, [rdx]                 ; all eight filter words
      pshuflw     xmm7, xmm6, 0xff            ; low four words = k3
      psrldq      xmm6, 8                     ; bring k4 down to word 0
      pshuflw     xmm6, xmm6, 0               ; low four words = k4
      punpcklwd   xmm7, xmm6                  ; k3k4k3k4k3k4k3k4

      ; rounding term
      mov         rcx, 64
      movq        xmm4, rcx
      pshufd      xmm4, xmm4, 0               ; 64 in every dword

      ; clamp bounds derived from the bit depth
      movsxd      rcx, DWORD PTR arg(6)       ; bd
      movq        xmm5, rcx                   ; shift count for psllw
      mov         rdx, 0x10001                ; two words of 1
      movq        xmm8, rdx
      pshufd      xmm8, xmm8, 0               ; 1 in every word
      movdqa      xmm1, xmm8                  ; keep the ones for the subtract
      psllw       xmm8, xmm5                  ; 1 << bd
      psubw       xmm8, xmm1                  ; max value (for clamping)
      pxor        xmm5, xmm5                  ; min value (for clamping)

      ; strides and row count
      movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
      movsxd      rcx, DWORD PTR arg(4)       ; output_height
  %endmacro
  ;---------------------------------------------------------------------
  ; HIGH_APPLY_FILTER_8 <avg>
  ; Filters eight 16-bit samples and stores them at [rdi].
  ;
  ; In:   xmm0 = first tap inputs, xmm1 = second tap inputs (8 words each)
  ;       xmm7 = interleaved taps, xmm4 = rounding, xmm8 / xmm5 = clamp
  ;       bounds, as prepared by HIGH_GET_PARAM
  ; %1:   non-zero -> average the result with what is already at [rdi]
  ;       (pavgw) before storing
  ; Out:  rsi and rdi advanced by one pitch (pitches counted in 16-bit
  ;       units), rcx decremented; the flags from that dec are left for
  ;       the caller's loop branch
  ; Clobbers: xmm0, xmm6 (and xmm1 in the averaging variant)
  ;---------------------------------------------------------------------
  %macro HIGH_APPLY_FILTER_8 1
      movdqa      xmm6, xmm0                  ; copy before xmm0 is overwritten
      punpcklwd   xmm0, xmm1                  ; sample pairs 0..3
      punpckhwd   xmm6, xmm1                  ; sample pairs 4..7

      pmaddwd     xmm0, xmm7                  ; a*k3 + b*k4 per dword
      pmaddwd     xmm6, xmm7
      paddd       xmm0, xmm4                  ; + 64
      paddd       xmm6, xmm4
      psrad       xmm0, 7                     ; >> 7
      psrad       xmm6, 7
      packssdw    xmm0, xmm6                  ; back to eight words

      ; clamp: upper bound first, then lower bound
      pminsw      xmm0, xmm8
      pmaxsw      xmm0, xmm5
  %if %1
      movdqu      xmm1, [rdi]                 ; current destination samples
      pavgw       xmm0, xmm1                  ; rounded average with dst
  %endif
      movdqu      [rdi], xmm0                 ; write 8 samples

      lea         rsi, [rsi + 2*rax]          ; next source row
      lea         rdi, [rdi + 2*rdx]          ; next destination row
      dec         rcx                         ; sets ZF for the caller's jnz
  %endmacro
  ;---------------------------------------------------------------------
  ; HIGH_APPLY_FILTER_16 <avg>
  ; Filters sixteen 16-bit samples and stores them at [rdi .. rdi+31].
  ;
  ; In:   xmm0 / xmm1 = first / second tap inputs for samples 0..7
  ;       xmm2 / xmm3 = first / second tap inputs for samples 8..15
  ;       xmm7 = interleaved taps, xmm4 = rounding, xmm8 / xmm5 = clamp
  ;       bounds, as prepared by HIGH_GET_PARAM
  ; %1:   non-zero -> average the result with what is already at [rdi]
  ;       (pavgw) before storing
  ; Out:  rsi and rdi advanced by one pitch (pitches counted in 16-bit
  ;       units), rcx decremented; the flags from that dec are left for
  ;       the caller's loop branch
  ; Clobbers: xmm0, xmm2, xmm6, xmm9 (and xmm1, xmm3 when averaging)
  ;---------------------------------------------------------------------
  %macro HIGH_APPLY_FILTER_16 1
      ; ---- samples 0..7 -> xmm0 ----
      movdqa      xmm9, xmm0                  ; copy before xmm0 is overwritten
      punpckhwd   xmm9, xmm1                  ; sample pairs 4..7
      punpcklwd   xmm0, xmm1                  ; sample pairs 0..3
      pmaddwd     xmm9, xmm7                  ; a*k3 + b*k4 per dword
      pmaddwd     xmm0, xmm7
      paddd       xmm9, xmm4                  ; + 64
      paddd       xmm0, xmm4
      psrad       xmm9, 7                     ; >> 7
      psrad       xmm0, 7
      packssdw    xmm0, xmm9                  ; back to eight words
      pminsw      xmm0, xmm8                  ; clamp to max
      pmaxsw      xmm0, xmm5                  ; clamp to 0

      ; ---- samples 8..15 -> xmm2 ----
      movdqa      xmm6, xmm2
      punpckhwd   xmm6, xmm3                  ; sample pairs 12..15
      punpcklwd   xmm2, xmm3                  ; sample pairs 8..11
      pmaddwd     xmm6, xmm7
      pmaddwd     xmm2, xmm7
      paddd       xmm6, xmm4
      paddd       xmm2, xmm4
      psrad       xmm6, 7
      psrad       xmm2, 7
      packssdw    xmm2, xmm6
      pminsw      xmm2, xmm8
      pmaxsw      xmm2, xmm5

  %if %1
      movdqu      xmm1, [rdi]                 ; current destination samples
      movdqu      xmm3, [rdi + 16]
      pavgw       xmm0, xmm1                  ; rounded average with dst
      pavgw       xmm2, xmm3
  %endif
      movdqu      [rdi], xmm0                 ; write samples 0..7
      movdqu      [rdi + 16], xmm2            ; write samples 8..15

      lea         rsi, [rsi + 2*rax]          ; next source row
      lea         rdi, [rdi + 2*rdx]          ; next destination row
      dec         rcx                         ; sets ZF for the caller's jnz
  %endmacro
  141. %endif
  142. SECTION .text
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_v2_sse2
  ; Vertical 2-tap filter, 4 samples of 16 bits per row.  Each output
  ; row combines a source row with the row one source pitch below it.
  ;
  ; Args by position (as read by HIGH_GET_PARAM_4):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_v2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM_4
  .next_row:
      movq        xmm1, [rsi + 2*rax]         ; row below
      movq        xmm0, [rsi]                 ; current row
      HIGH_APPLY_FILTER_4 0                   ; filter, store, advance, dec rcx
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  163. %if ARCH_X86_64
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_v2_sse2
  ; Vertical 2-tap filter, 8 samples of 16 bits per row.  Each output
  ; row combines a source row with the row one source pitch below it.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_v2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 8
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      movdqu      xmm1, [rsi + 2*rax]         ; row below
      movdqu      xmm0, [rsi]                 ; current row
      HIGH_APPLY_FILTER_8 0                   ; filter, store, advance, dec rcx
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_v2_sse2
  ; Vertical 2-tap filter, 16 samples of 16 bits per row.  Each output
  ; row combines a source row with the row one source pitch below it.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_v2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 9
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      ; current row, samples 0..7 and 8..15
      movdqu      xmm0, [rsi]
      movdqu      xmm2, [rsi + 16]
      ; row below, same two halves
      movdqu      xmm1, [rsi + 2*rax]
      movdqu      xmm3, [rsi + 2*rax + 16]
      HIGH_APPLY_FILTER_16 0                  ; filter, store, advance, dec rcx
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  210. %endif
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_v2_avg_sse2
  ; Vertical 2-tap filter, 4 samples of 16 bits per row, with the
  ; filtered row averaged (pavgw) against the samples already present
  ; in the destination.
  ;
  ; Args by position (as read by HIGH_GET_PARAM_4):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM_4
  .next_row:
      movq        xmm1, [rsi + 2*rax]         ; row below
      movq        xmm0, [rsi]                 ; current row
      HIGH_APPLY_FILTER_4 1                   ; filter, average, store, advance
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  231. %if ARCH_X86_64
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_v2_avg_sse2
  ; Vertical 2-tap filter, 8 samples of 16 bits per row, with the
  ; filtered row averaged (pavgw) against the samples already present
  ; in the destination.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 8
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      movdqu      xmm1, [rsi + 2*rax]         ; row below
      movdqu      xmm0, [rsi]                 ; current row
      HIGH_APPLY_FILTER_8 1                   ; filter, average, store, advance
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_v2_avg_sse2
  ; Vertical 2-tap filter, 16 samples of 16 bits per row, with the
  ; filtered row averaged (pavgw) against the samples already present
  ; in the destination.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 9
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      ; current row, samples 0..7 and 8..15
      movdqu      xmm0, [rsi]
      movdqu      xmm2, [rsi + 16]
      ; row below, same two halves
      movdqu      xmm1, [rsi + 2*rax]
      movdqu      xmm3, [rsi + 2*rax + 16]
      HIGH_APPLY_FILTER_16 1                  ; filter, average, store, advance
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  278. %endif
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_h2_sse2
  ; Horizontal 2-tap filter, 4 samples of 16 bits per row.  Each output
  ; sample combines a source sample with its right-hand neighbour.
  ;
  ; Args by position (as read by HIGH_GET_PARAM_4):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; Note: every row is fetched with one 16-byte load from [rsi], although
  ; only the first five samples take part in the result.
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_h2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM_4
  .next_row:
      movdqu      xmm0, [rsi]                 ; samples x .. x+7
      movdqa      xmm1, xmm0
      psrldq      xmm1, 2                     ; shifted by one sample: x+1 ..
      HIGH_APPLY_FILTER_4 0                   ; filter, store, advance, dec rcx
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  300. %if ARCH_X86_64
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_h2_sse2
  ; Horizontal 2-tap filter, 8 samples of 16 bits per row.  Each output
  ; sample combines a source sample with its right-hand neighbour.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_h2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 8
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      movdqu      xmm1, [rsi + 2]             ; samples x+1 .. x+8
      movdqu      xmm0, [rsi]                 ; samples x   .. x+7
      HIGH_APPLY_FILTER_8 0                   ; filter, store, advance, dec rcx
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_h2_sse2
  ; Horizontal 2-tap filter, 16 samples of 16 bits per row.  Each output
  ; sample combines a source sample with its right-hand neighbour.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_h2_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 9
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      ; first tap inputs: samples x .. x+15
      movdqu      xmm0, [rsi]
      movdqu      xmm2, [rsi + 16]
      ; second tap inputs: the same span shifted right by one sample
      movdqu      xmm1, [rsi + 2]
      movdqu      xmm3, [rsi + 18]
      HIGH_APPLY_FILTER_16 0                  ; filter, store, advance, dec rcx
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  347. %endif
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_h2_avg_sse2
  ; Horizontal 2-tap filter, 4 samples of 16 bits per row, with the
  ; filtered row averaged (pavgw) against the samples already present
  ; in the destination.
  ;
  ; Args by position (as read by HIGH_GET_PARAM_4):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; Note: every row is fetched with one 16-byte load from [rsi], although
  ; only the first five samples take part in the result.
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM_4
  .next_row:
      movdqu      xmm0, [rsi]                 ; samples x .. x+7
      movdqa      xmm1, xmm0
      psrldq      xmm1, 2                     ; shifted by one sample: x+1 ..
      HIGH_APPLY_FILTER_4 1                   ; filter, average, store, advance
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  369. %if ARCH_X86_64
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_h2_avg_sse2
  ; Horizontal 2-tap filter, 8 samples of 16 bits per row, with the
  ; filtered row averaged (pavgw) against the samples already present
  ; in the destination.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 8
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      movdqu      xmm1, [rsi + 2]             ; samples x+1 .. x+8
      movdqu      xmm0, [rsi]                 ; samples x   .. x+7
      HIGH_APPLY_FILTER_8 1                   ; filter, average, store, advance
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;---------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_h2_avg_sse2
  ; Horizontal 2-tap filter, 16 samples of 16 bits per row, with the
  ; filtered row averaged (pavgw) against the samples already present
  ; in the destination.
  ;
  ; Args by position (as read by HIGH_GET_PARAM):
  ;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
  ;   4 output_height, 5 filter ptr, 6 bd
  ; The row count is decremented before it is tested, so output_height
  ; must be non-zero.
  ;---------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 9
      push        rsi
      push        rdi
      ; end prolog

      HIGH_GET_PARAM
  .next_row:
      ; first tap inputs: samples x .. x+15
      movdqu      xmm0, [rsi]
      movdqu      xmm2, [rsi + 16]
      ; second tap inputs: the same span shifted right by one sample
      movdqu      xmm1, [rsi + 2]
      movdqu      xmm3, [rsi + 18]
      HIGH_APPLY_FILTER_16 1                  ; filter, average, store, advance
      jnz         .next_row                   ; until the row count hits zero

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  416. %endif