vpx_high_subpixel_8t_sse2.asm 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964
  1. ;
  2. ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. ;Note: tap3 and tap4 have to be applied and added after other taps to avoid
  12. ;overflow.
  ;-----------------------------------------------------------------------
  ; HIGH_GET_FILTERS_4
  ; Builds the constants consumed by HIGH_APPLY_FILTER_4 and stores them in
  ; the 16-byte slots the caller has %define'd: k0k6, k1k7, k2k5, k3k4,
  ; krd, max and min.
  ;   arg(5) - pointer to eight 16-bit taps; read with movdqa
  ;   arg(6) - bd, used as the shift count for the clamp ceiling
  ; Overwrites rcx, rdx and xmm0-xmm7.
  ;-----------------------------------------------------------------------
  %macro HIGH_GET_FILTERS_4 0
      mov         rdx, arg(5)                 ;filter ptr
      movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

      ;replicate taps 0-3 through the low four words of xmm0-xmm3
      pshuflw     xmm0, xmm7, 00000000b       ;k0
      pshuflw     xmm1, xmm7, 01010101b       ;k1
      pshuflw     xmm2, xmm7, 10101010b       ;k2
      pshuflw     xmm3, xmm7, 11111111b       ;k3

      ;bring taps 4-7 down to the low quadword, then replicate them
      psrldq      xmm7, 8
      pshuflw     xmm4, xmm7, 00000000b       ;k4
      pshuflw     xmm5, xmm7, 01010101b       ;k5
      pshuflw     xmm6, xmm7, 10101010b       ;k6
      pshuflw     xmm7, xmm7, 11111111b       ;k7 (overwrites the source last)

      ;interleave the tap pairs so pmaddwd can apply two taps at once
      punpcklwd   xmm0, xmm6
      movdqa      k0k6, xmm0
      punpcklwd   xmm2, xmm5
      movdqa      k2k5, xmm2
      punpcklwd   xmm3, xmm4
      movdqa      k3k4, xmm3
      punpcklwd   xmm1, xmm7
      movdqa      k1k7, xmm1

      ;rounding term: 64 in every dword
      mov         rcx, 0x00000040
      movq        xmm6, rcx
      pshufd      xmm6, xmm6, 0
      movdqa      krd, xmm6

      ;clamp limits: max = (1 << bd) - 1 in every word, min = 0
      mov         rdx, 0x00010001
      movsxd      rcx, DWORD PTR arg(6)       ;bd
      movq        xmm0, rdx
      movq        xmm1, rcx
      pshufd      xmm0, xmm0, 0b              ;1 in every word
      movdqa      xmm2, xmm0
      psllw       xmm0, xmm1                  ;1 << bd
      psubw       xmm0, xmm2                  ;(1 << bd) - 1
      pxor        xmm1, xmm1
      movdqa      max, xmm0                   ;upper clamp value
      movdqa      min, xmm1                   ;lower clamp value
  %endm
  ;-----------------------------------------------------------------------
  ; HIGH_APPLY_FILTER_4 avg
  ; Filters four 16-bit pixels and writes them to [rdi].
  ;   In:  low four words of xmm0..xmm7 = samples for taps 0..7
  ;        k0k6/k1k7/k2k5/k3k4/krd/max/min as set by HIGH_GET_FILTERS_4
  ;   %1:  non-zero -> pavgw the result with the four words already at [rdi]
  ; Overwrites xmm0-xmm3.
  ;-----------------------------------------------------------------------
  %macro HIGH_APPLY_FILTER_4 1
      ;taps 0 and 6
      punpcklwd   xmm0, xmm6                  ;pair the two sample rows word by word
      pmaddwd     xmm0, k0k6                  ;multiply by the taps, add each pair

      ;taps 1 and 7
      punpcklwd   xmm1, xmm7
      pmaddwd     xmm1, k1k7
      paddd       xmm0, xmm1

      ;taps 2 and 5
      punpcklwd   xmm2, xmm5
      pmaddwd     xmm2, k2k5
      paddd       xmm0, xmm2

      ;taps 3 and 4
      punpcklwd   xmm3, xmm4
      pmaddwd     xmm3, k3k4
      paddd       xmm0, xmm3

      paddd       xmm0, krd                   ;add the rounding term
      psrad       xmm0, 7                     ;scale back down
      packssdw    xmm0, xmm0                  ;dwords -> words (signed saturation)

      ;limit to [min, max]
      pminsw      xmm0, max
      pmaxsw      xmm0, min
  %if %1
      movq        xmm1, [rdi]                 ;current destination pixels
      pavgw       xmm0, xmm1
  %endif
      movq        [rdi], xmm0
  %endm
  ;-----------------------------------------------------------------------
  ; HIGH_GET_FILTERS
  ; Setup shared by the 8- and 16-pixel-wide routines. Loads rsi/rdi and
  ; stores the constants used by HIGH_APPLY_FILTER_8 in the 16-byte slots
  ; the caller has %define'd: k0k1, k6k7, k2k5, k3k4, krd, max and min.
  ;   arg(0) - source pointer      -> rsi
  ;   arg(2) - destination pointer -> rdi
  ;   arg(5) - pointer to eight 16-bit taps; read with movdqa
  ;   arg(6) - bd, used as the shift count for the clamp ceiling
  ; Overwrites rcx, rdx and xmm0-xmm7.
  ;-----------------------------------------------------------------------
  %macro HIGH_GET_FILTERS 0
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr
      mov         rdx, arg(5)                 ;filter ptr
      movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

      ;taps 0-3 replicated through the low four words
      pshuflw     xmm0, xmm7, 00000000b       ;k0
      pshuflw     xmm1, xmm7, 01010101b       ;k1
      pshuflw     xmm2, xmm7, 10101010b       ;k2
      pshuflw     xmm3, xmm7, 11111111b       ;k3

      ;taps 4-7 replicated through the high four words
      pshufhw     xmm4, xmm7, 00000000b       ;k4
      pshufhw     xmm5, xmm7, 01010101b       ;k5
      pshufhw     xmm6, xmm7, 10101010b       ;k6
      pshufhw     xmm7, xmm7, 11111111b       ;k7 (overwrites the source last)

      ;build the interleaved tap pairs and park them on the stack
      punpcklwd   xmm0, xmm1                  ;k0 k1 k0 k1 ...
      movdqa      k0k1, xmm0
      punpckhwd   xmm6, xmm7                  ;k6 k7 k6 k7 ...
      movdqa      k6k7, xmm6
      punpcklqdq  xmm2, xmm2                  ;copy k2 into the high quadword
      punpckhwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
      movdqa      k2k5, xmm2
      punpcklqdq  xmm3, xmm3                  ;copy k3 into the high quadword
      punpckhwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
      movdqa      k3k4, xmm3

      ;rounding term: 64 in every dword
      mov         rcx, 0x00000040
      movq        xmm6, rcx
      pshufd      xmm6, xmm6, 0
      movdqa      krd, xmm6

      ;clamp limits: max = (1 << bd) - 1 in every word, min = 0
      mov         rdx, 0x00010001
      movsxd      rcx, DWORD PTR arg(6)       ;bd
      movq        xmm0, rdx
      movq        xmm1, rcx
      pshufd      xmm0, xmm0, 0b              ;1 in every word
      movdqa      xmm2, xmm0
      psllw       xmm0, xmm1                  ;1 << bd
      psubw       xmm0, xmm2                  ;(1 << bd) - 1
      pxor        xmm1, xmm1
      movdqa      max, xmm0                   ;upper clamp value
      movdqa      min, xmm1                   ;lower clamp value
  %endm
  ;-----------------------------------------------------------------------
  ; LOAD_VERT_8 offset
  ; Loads eight consecutive rows (16 bytes each, unaligned) into xmm0..xmm7,
  ; row n landing in xmmn, and leaves rsi advanced by one row.
  ;   %1  - byte offset added to every row address
  ;   rax - row pitch in bytes
  ;   rdx - three times the row pitch (callers set it with lea from rax)
  ;-----------------------------------------------------------------------
  %macro LOAD_VERT_8 1
      movdqu      xmm0, [rsi + %1]            ;row 0
      movdqu      xmm6, [rsi + rdx * 2 + %1]  ;row 6
      lea         rsi, [rsi + rax]            ;step down one row
      ;addresses below are relative to the advanced rsi (old row 1)
      movdqu      xmm1, [rsi + %1]            ;row 1
      movdqu      xmm2, [rsi + rax + %1]      ;row 2
      movdqu      xmm3, [rsi + rax * 2 + %1]  ;row 3
      movdqu      xmm4, [rsi + rdx + %1]      ;row 4
      movdqu      xmm5, [rsi + rax * 4 + %1]  ;row 5
      movdqu      xmm7, [rsi + rdx * 2 + %1]  ;row 7
  %endm
  ;-----------------------------------------------------------------------
  ; HIGH_APPLY_FILTER_8 avg, offset
  ; Filters eight 16-bit pixels and writes them to [rdi + offset].
  ;   In:  xmm0..xmm7 = eight samples each for taps 0..7
  ;        k0k1/k6k7/k2k5/k3k4/krd/max/min as set by HIGH_GET_FILTERS
  ;        temp = 16-byte scratch slot
  ;   %1:  non-zero -> pavgw the result with the words already at the
  ;        destination
  ;   %2:  byte offset from rdi for the destination
  ; Overwrites xmm0-xmm7 and temp.
  ;-----------------------------------------------------------------------
  %macro HIGH_APPLY_FILTER_8 2
      ;All eight registers are live on entry, so tap 4 is spilled to free
      ;a register for the high halves.
      movdqu      temp, xmm4                  ;spill tap-4 samples

      movdqa      xmm4, xmm0
      punpcklwd   xmm0, xmm1                  ;taps 0/1, pixels 0-3
      punpckhwd   xmm4, xmm1                  ;taps 0/1, pixels 4-7

      movdqa      xmm1, xmm6
      punpcklwd   xmm6, xmm7                  ;taps 6/7, pixels 0-3
      punpckhwd   xmm1, xmm7                  ;taps 6/7, pixels 4-7

      movdqa      xmm7, xmm2
      punpcklwd   xmm2, xmm5                  ;taps 2/5, pixels 0-3
      punpckhwd   xmm7, xmm5                  ;taps 2/5, pixels 4-7

      movdqu      xmm5, temp                  ;reload tap-4 samples
      movdqu      temp, xmm4                  ;spill taps 0/1 high half

      movdqa      xmm4, xmm3
      punpcklwd   xmm3, xmm5                  ;taps 3/4, pixels 0-3
      punpckhwd   xmm4, xmm5                  ;taps 3/4, pixels 4-7

      movdqu      xmm5, temp                  ;taps 0/1, pixels 4-7

      ;pixels 0-3 accumulate in xmm0
      pmaddwd     xmm0, k0k1
      pmaddwd     xmm6, k6k7
      paddd       xmm0, xmm6
      pmaddwd     xmm2, k2k5
      paddd       xmm0, xmm2
      pmaddwd     xmm3, k3k4
      paddd       xmm0, xmm3

      ;pixels 4-7 accumulate in xmm5
      pmaddwd     xmm5, k0k1
      pmaddwd     xmm1, k6k7
      paddd       xmm5, xmm1
      pmaddwd     xmm7, k2k5
      paddd       xmm5, xmm7
      pmaddwd     xmm4, k3k4
      paddd       xmm5, xmm4

      paddd       xmm0, krd                   ;add the rounding term
      paddd       xmm5, krd
      psrad       xmm0, 7                     ;scale back down
      psrad       xmm5, 7
      packssdw    xmm0, xmm5                  ;dwords -> words (signed saturation)

      ;limit to [min, max]
      pminsw      xmm0, max
      pmaxsw      xmm0, min
  %if %1
      movdqu      xmm1, [rdi + %2]            ;current destination pixels
      pavgw       xmm0, xmm1
  %endif
      movdqu      [rdi + %2], xmm0
  %endm
  170. SECTION .text
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_v8_sse2
  ; Vertical 8-tap filter, 4 pixels (16-bit words) per output row.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - row supplying tap 0 of the first output row
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_v8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 7                 ;seven 16-byte constant slots
      %define k0k6 [rsp + 16 * 0]
      %define k2k5 [rsp + 16 * 1]
      %define k3k4 [rsp + 16 * 2]
      %define k1k7 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define max  [rsp + 16 * 5]
      %define min  [rsp + 16 * 6]

      HIGH_GET_FILTERS_4                      ;overwrites rcx/rdx, so load them after

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rax + rax * 2]        ;three source lines
      lea         rbx, [rbx + rbx]            ;destination bytes per line

  .next_row:
      movq        xmm0, [rsi]                 ;row 0
      movq        xmm6, [rsi + rdx * 2]       ;row 6
      lea         rsi, [rsi + rax]            ;next source row
      movq        xmm1, [rsi]                 ;row 1
      movq        xmm2, [rsi + rax]           ;row 2
      movq        xmm3, [rsi + rax * 2]       ;row 3
      movq        xmm4, [rsi + rdx]           ;row 4
      movq        xmm5, [rsi + rax * 4]       ;row 5
      movq        xmm7, [rsi + rdx * 2]       ;row 7

      HIGH_APPLY_FILTER_4 0

      add         rdi, rbx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 7
      pop         rsp                         ;pairs with ALIGN_STACK above
      pop         rbx
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_v8_sse2
  ; Vertical 8-tap filter, 8 pixels (16-bit words) per output row.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - row supplying tap 0 of the first output row
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_v8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rax + rax * 2]        ;three source lines
      lea         rbx, [rbx + rbx]            ;destination bytes per line

  .next_row:
      LOAD_VERT_8 0                           ;also moves rsi down one row
      HIGH_APPLY_FILTER_8 0, 0

      add         rdi, rbx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      pop         rbx
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_v8_sse2
  ; Vertical 8-tap filter, 16 pixels (16-bit words) per output row, done as
  ; two 8-pixel halves at byte offsets 0 and 16.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - row supplying tap 0 of the first output row
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_v8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rax + rax * 2]        ;three source lines
      lea         rbx, [rbx + rbx]            ;destination bytes per line

  .next_row:
      ;left eight pixels
      LOAD_VERT_8 0                           ;also moves rsi down one row
      HIGH_APPLY_FILTER_8 0, 0

      ;right eight pixels: step rsi back up so the same rows are read
      sub         rsi, rax
      LOAD_VERT_8 16
      HIGH_APPLY_FILTER_8 0, 16

      add         rdi, rbx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      pop         rbx
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_v8_avg_sse2
  ; Vertical 8-tap filter, 4 pixels (16-bit words) per output row; each
  ; result is averaged (pavgw) with the pixels already in the destination.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - row supplying tap 0 of the first output row
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr     - read and written
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 7                 ;seven 16-byte constant slots
      %define k0k6 [rsp + 16 * 0]
      %define k2k5 [rsp + 16 * 1]
      %define k3k4 [rsp + 16 * 2]
      %define k1k7 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define max  [rsp + 16 * 5]
      %define min  [rsp + 16 * 6]

      HIGH_GET_FILTERS_4                      ;overwrites rcx/rdx, so load them after

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rax + rax * 2]        ;three source lines
      lea         rbx, [rbx + rbx]            ;destination bytes per line

  .next_row:
      movq        xmm0, [rsi]                 ;row 0
      movq        xmm6, [rsi + rdx * 2]       ;row 6
      lea         rsi, [rsi + rax]            ;next source row
      movq        xmm1, [rsi]                 ;row 1
      movq        xmm2, [rsi + rax]           ;row 2
      movq        xmm3, [rsi + rax * 2]       ;row 3
      movq        xmm4, [rsi + rdx]           ;row 4
      movq        xmm5, [rsi + rax * 4]       ;row 5
      movq        xmm7, [rsi + rdx * 2]       ;row 7

      HIGH_APPLY_FILTER_4 1                   ;1 = average with destination

      add         rdi, rbx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 7
      pop         rsp                         ;pairs with ALIGN_STACK above
      pop         rbx
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_v8_avg_sse2
  ; Vertical 8-tap filter, 8 pixels (16-bit words) per output row; each
  ; result is averaged (pavgw) with the pixels already in the destination.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - row supplying tap 0 of the first output row
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr     - read and written
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rax + rax * 2]        ;three source lines
      lea         rbx, [rbx + rbx]            ;destination bytes per line

  .next_row:
      LOAD_VERT_8 0                           ;also moves rsi down one row
      HIGH_APPLY_FILTER_8 1, 0                ;1 = average with destination

      add         rdi, rbx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      pop         rbx
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_v8_avg_sse2
  ; Vertical 8-tap filter, 16 pixels (16-bit words) per output row, done as
  ; two 8-pixel halves at byte offsets 0 and 16; each result is averaged
  ; (pavgw) with the pixels already in the destination.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - row supplying tap 0 of the first output row
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr     - read and written
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rax + rax * 2]        ;three source lines
      lea         rbx, [rbx + rbx]            ;destination bytes per line

  .next_row:
      ;left eight pixels
      LOAD_VERT_8 0                           ;also moves rsi down one row
      HIGH_APPLY_FILTER_8 1, 0                ;1 = average with destination

      ;right eight pixels: step rsi back up so the same rows are read
      sub         rsi, rax
      LOAD_VERT_8 16
      HIGH_APPLY_FILTER_8 1, 16

      add         rdi, rbx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      pop         rbx
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_h8_sse2
  ; Horizontal 8-tap filter, 4 pixels (16-bit words) per output row.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - pixel lined up with tap 3; reads start three
  ;                           pixels (6 bytes) before it
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_h8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 7                 ;seven 16-byte constant slots
      %define k0k6 [rsp + 16 * 0]
      %define k2k5 [rsp + 16 * 1]
      %define k3k4 [rsp + 16 * 2]
      %define k1k7 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define max  [rsp + 16 * 5]
      %define min  [rsp + 16 * 6]

      HIGH_GET_FILTERS_4                      ;overwrites rcx/rdx, so load them after

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rdx + rdx]            ;destination bytes per line

  .next_row:
      ;taps 0-3 come from the vector starting three pixels to the left
      movdqu      xmm0, [rsi - 6]             ;tap 0
      movdqa      xmm1, xmm0
      psrldq      xmm1, 2                     ;tap 1
      movdqa      xmm2, xmm0
      psrldq      xmm2, 4                     ;tap 2
      movdqa      xmm3, xmm0
      psrldq      xmm3, 6                     ;tap 3

      ;taps 4-7 come from the vector starting one pixel to the right
      movdqu      xmm4, [rsi + 2]             ;tap 4
      movdqa      xmm5, xmm4
      psrldq      xmm5, 2                     ;tap 5
      movdqa      xmm6, xmm4
      psrldq      xmm6, 4                     ;tap 6
      movdqa      xmm7, xmm4
      psrldq      xmm7, 6                     ;tap 7

      HIGH_APPLY_FILTER_4 0

      lea         rsi, [rsi + rax]            ;next source row
      add         rdi, rdx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 7
      pop         rsp                         ;pairs with ALIGN_STACK above
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_h8_sse2
  ; Horizontal 8-tap filter, 8 pixels (16-bit words) per output row.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - pixel lined up with tap 3; reads start three
  ;                           pixels (6 bytes) before it
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_h8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rdx + rdx]            ;destination bytes per line

  .next_row:
      ;one unaligned load per tap, each shifted by one pixel (2 bytes)
      movdqu      xmm0, [rsi - 6]             ;tap 0
      movdqu      xmm1, [rsi - 4]             ;tap 1
      movdqu      xmm2, [rsi - 2]             ;tap 2
      movdqu      xmm3, [rsi]                 ;tap 3
      movdqu      xmm4, [rsi + 2]             ;tap 4
      movdqu      xmm5, [rsi + 4]             ;tap 5
      movdqu      xmm6, [rsi + 6]             ;tap 6
      movdqu      xmm7, [rsi + 8]             ;tap 7

      HIGH_APPLY_FILTER_8 0, 0

      lea         rsi, [rsi + rax]            ;next source row
      add         rdi, rdx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_h8_sse2
  ; Horizontal 8-tap filter, 16 pixels (16-bit words) per output row, done
  ; as two 8-pixel halves 16 bytes apart.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - pixel lined up with tap 3; reads start three
  ;                           pixels (6 bytes) before it
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_h8_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rdx + rdx]            ;destination bytes per line

  .next_row:
      ;left eight pixels: one unaligned load per tap
      movdqu      xmm0, [rsi - 6]             ;tap 0
      movdqu      xmm1, [rsi - 4]             ;tap 1
      movdqu      xmm2, [rsi - 2]             ;tap 2
      movdqu      xmm3, [rsi]                 ;tap 3
      movdqu      xmm4, [rsi + 2]             ;tap 4
      movdqu      xmm5, [rsi + 4]             ;tap 5
      movdqu      xmm6, [rsi + 6]             ;tap 6
      movdqu      xmm7, [rsi + 8]             ;tap 7
      HIGH_APPLY_FILTER_8 0, 0

      ;right eight pixels: the same loads 16 bytes further along
      movdqu      xmm0, [rsi + 10]            ;tap 0
      movdqu      xmm1, [rsi + 12]            ;tap 1
      movdqu      xmm2, [rsi + 14]            ;tap 2
      movdqu      xmm3, [rsi + 16]            ;tap 3
      movdqu      xmm4, [rsi + 18]            ;tap 4
      movdqu      xmm5, [rsi + 20]            ;tap 5
      movdqu      xmm6, [rsi + 22]            ;tap 6
      movdqu      xmm7, [rsi + 24]            ;tap 7
      HIGH_APPLY_FILTER_8 0, 16

      lea         rsi, [rsi + rax]            ;next source row
      add         rdi, rdx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d4_h8_avg_sse2
  ; Horizontal 8-tap filter, 4 pixels (16-bit words) per output row; each
  ; result is averaged (pavgw) with the pixels already in the destination.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - pixel lined up with tap 3; reads start three
  ;                           pixels (6 bytes) before it
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr     - read and written
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 7                 ;seven 16-byte constant slots
      %define k0k6 [rsp + 16 * 0]
      %define k2k5 [rsp + 16 * 1]
      %define k3k4 [rsp + 16 * 2]
      %define k1k7 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define max  [rsp + 16 * 5]
      %define min  [rsp + 16 * 6]

      HIGH_GET_FILTERS_4                      ;overwrites rcx/rdx, so load them after

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rdx + rdx]            ;destination bytes per line

  .next_row:
      ;taps 0-3 come from the vector starting three pixels to the left
      movdqu      xmm0, [rsi - 6]             ;tap 0
      movdqa      xmm1, xmm0
      psrldq      xmm1, 2                     ;tap 1
      movdqa      xmm2, xmm0
      psrldq      xmm2, 4                     ;tap 2
      movdqa      xmm3, xmm0
      psrldq      xmm3, 6                     ;tap 3

      ;taps 4-7 come from the vector starting one pixel to the right
      movdqu      xmm4, [rsi + 2]             ;tap 4
      movdqa      xmm5, xmm4
      psrldq      xmm5, 2                     ;tap 5
      movdqa      xmm6, xmm4
      psrldq      xmm6, 4                     ;tap 6
      movdqa      xmm7, xmm4
      psrldq      xmm7, 6                     ;tap 7

      HIGH_APPLY_FILTER_4 1                   ;1 = average with destination

      lea         rsi, [rsi + rax]            ;next source row
      add         rdi, rdx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 7
      pop         rsp                         ;pairs with ALIGN_STACK above
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d8_h8_avg_sse2
  ; Horizontal 8-tap filter, 8 pixels (16-bit words) per output row; each
  ; result is averaged (pavgw) with the pixels already in the destination.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - pixel lined up with tap 3; reads start three
  ;                           pixels (6 bytes) before it
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr     - read and written
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rdx + rdx]            ;destination bytes per line

  .next_row:
      ;one unaligned load per tap, each shifted by one pixel (2 bytes)
      movdqu      xmm0, [rsi - 6]             ;tap 0
      movdqu      xmm1, [rsi - 4]             ;tap 1
      movdqu      xmm2, [rsi - 2]             ;tap 2
      movdqu      xmm3, [rsi]                 ;tap 3
      movdqu      xmm4, [rsi + 2]             ;tap 4
      movdqu      xmm5, [rsi + 4]             ;tap 5
      movdqu      xmm6, [rsi + 6]             ;tap 6
      movdqu      xmm7, [rsi + 8]             ;tap 7

      HIGH_APPLY_FILTER_8 1, 0                ;1 = average with destination

      lea         rsi, [rsi + rax]            ;next source row
      add         rdi, rdx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_highbd_filter_block1d16_h8_avg_sse2
  ; Horizontal 8-tap filter, 16 pixels (16-bit words) per output row, done
  ; as two 8-pixel halves 16 bytes apart; each result is averaged (pavgw)
  ; with the pixels already in the destination.
  ; Arguments, as read through arg():
  ;   arg(0) src_ptr        - pixel lined up with tap 3; reads start three
  ;                           pixels (6 bytes) before it
  ;   arg(1) src_pitch      - in pixels; doubled below to get bytes
  ;   arg(2) output_ptr     - read and written
  ;   arg(3) out_pitch      - in pixels; doubled below to get bytes
  ;   arg(4) output_height  - row count; tested after the loop body, so it
  ;                           must be at least 1
  ;   arg(5) filter         - eight 16-bit taps, loaded with movdqa
  ;   arg(6) bd             - shift count giving the clamp ceiling
  ;                           (1 << bd) - 1
  ;-----------------------------------------------------------------------
  global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
  sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      ALIGN_STACK 16, rax
      sub         rsp, 16 * 8                 ;eight 16-byte slots
      %define k0k1 [rsp + 16 * 0]
      %define k6k7 [rsp + 16 * 1]
      %define k2k5 [rsp + 16 * 2]
      %define k3k4 [rsp + 16 * 3]
      %define krd  [rsp + 16 * 4]
      %define temp [rsp + 16 * 5]
      %define max  [rsp + 16 * 6]
      %define min  [rsp + 16 * 7]

      HIGH_GET_FILTERS                        ;also loads rsi/rdi; overwrites rcx/rdx

      movsxd      rcx, DWORD PTR arg(4)       ;output_height
      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      lea         rax, [rax + rax]            ;source bytes per line
      lea         rdx, [rdx + rdx]            ;destination bytes per line

  .next_row:
      ;left eight pixels: one unaligned load per tap
      movdqu      xmm0, [rsi - 6]             ;tap 0
      movdqu      xmm1, [rsi - 4]             ;tap 1
      movdqu      xmm2, [rsi - 2]             ;tap 2
      movdqu      xmm3, [rsi]                 ;tap 3
      movdqu      xmm4, [rsi + 2]             ;tap 4
      movdqu      xmm5, [rsi + 4]             ;tap 5
      movdqu      xmm6, [rsi + 6]             ;tap 6
      movdqu      xmm7, [rsi + 8]             ;tap 7
      HIGH_APPLY_FILTER_8 1, 0                ;1 = average with destination

      ;right eight pixels: the same loads 16 bytes further along
      movdqu      xmm0, [rsi + 10]            ;tap 0
      movdqu      xmm1, [rsi + 12]            ;tap 1
      movdqu      xmm2, [rsi + 14]            ;tap 2
      movdqu      xmm3, [rsi + 16]            ;tap 3
      movdqu      xmm4, [rsi + 18]            ;tap 4
      movdqu      xmm5, [rsi + 20]            ;tap 5
      movdqu      xmm6, [rsi + 22]            ;tap 6
      movdqu      xmm7, [rsi + 24]            ;tap 7
      HIGH_APPLY_FILTER_8 1, 16

      lea         rsi, [rsi + rax]            ;next source row
      add         rdi, rdx                    ;next destination row
      dec         rcx
      jnz         .next_row

      add         rsp, 16 * 8
      pop         rsp                         ;pairs with ALIGN_STACK above
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret