; vpx_subpixel_8t_sse2.asm
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.
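;(The middle taps of these interpolation filters typically carry the largest
; coefficients, so deferring them keeps the intermediate saturating 16-bit
; sums in range.)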

%macro GET_FILTERS_4 0
    mov       rdx, arg(5)              ;filter ptr
    mov       rcx, 0x0400040

    movdqa    xmm7, [rdx]              ;load filters
    pshuflw   xmm0, xmm7, 0b           ;k0
    pshuflw   xmm1, xmm7, 01010101b    ;k1
    pshuflw   xmm2, xmm7, 10101010b    ;k2
    pshuflw   xmm3, xmm7, 11111111b    ;k3
    psrldq    xmm7, 8
    pshuflw   xmm4, xmm7, 0b           ;k4
    pshuflw   xmm5, xmm7, 01010101b    ;k5
    pshuflw   xmm6, xmm7, 10101010b    ;k6
    pshuflw   xmm7, xmm7, 11111111b    ;k7

    punpcklqdq xmm0, xmm1
    punpcklqdq xmm2, xmm3
    punpcklqdq xmm5, xmm4
    punpcklqdq xmm6, xmm7

    movdqa    k0k1, xmm0
    movdqa    k2k3, xmm2
    movdqa    k5k4, xmm5
    movdqa    k6k7, xmm6

    movq      xmm6, rcx
    pshufd    xmm6, xmm6, 0
    movdqa    krd, xmm6

    pxor      xmm7, xmm7
    movdqa    zero, xmm7
%endm
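
; GET_FILTERS_4 leaves the coefficients in the aligned stack scratch area,
; assuming arg(5) points at the eight 16-bit taps k0..k7:
;   k0k1 = k0 x4 | k1 x4 (words)      k2k3 = k2 x4 | k3 x4
;   k5k4 = k5 x4 | k4 x4              k6k7 = k6 x4 | k7 x4
;   krd  = 64 in every word (rounding ahead of the >> 7 shift)
;   zero = all zeros (for widening bytes to words)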

%macro APPLY_FILTER_4 1
    punpckldq xmm0, xmm1               ;two rows in one register
    punpckldq xmm6, xmm7
    punpckldq xmm2, xmm3
    punpckldq xmm5, xmm4

    punpcklbw xmm0, zero               ;unpack to word
    punpcklbw xmm6, zero
    punpcklbw xmm2, zero
    punpcklbw xmm5, zero

    pmullw    xmm0, k0k1               ;multiply the filter factors
    pmullw    xmm6, k6k7
    pmullw    xmm2, k2k3
    pmullw    xmm5, k5k4

    paddsw    xmm0, xmm6               ;sum
    movdqa    xmm1, xmm0
    psrldq    xmm1, 8
    paddsw    xmm0, xmm1
    paddsw    xmm0, xmm2
    psrldq    xmm2, 8
    paddsw    xmm0, xmm5
    psrldq    xmm5, 8
    paddsw    xmm0, xmm2
    paddsw    xmm0, xmm5

    paddsw    xmm0, krd                ;rounding
    psraw     xmm0, 7                  ;shift
    packuswb  xmm0, xmm0               ;pack to byte

%if %1
    movd      xmm1, [rdi]
    pavgb     xmm0, xmm1
%endif
    movd      [rdi], xmm0
%endm
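
; For each of the four output pixels, APPLY_FILTER_4 computes
;   out = clip_u8((k0*p0 + k1*p1 + ... + k7*p7 + 64) >> 7)
; using saturating 16-bit adds.  If %1 is nonzero the result is additionally
; averaged (pavgb) with the bytes already at [rdi], for the *_avg variants.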

%macro GET_FILTERS 0
    mov       rdx, arg(5)              ;filter ptr
    mov       rsi, arg(0)              ;src_ptr
    mov       rdi, arg(2)              ;output_ptr
    mov       rcx, 0x0400040

    movdqa    xmm7, [rdx]              ;load filters
    pshuflw   xmm0, xmm7, 0b           ;k0
    pshuflw   xmm1, xmm7, 01010101b    ;k1
    pshuflw   xmm2, xmm7, 10101010b    ;k2
    pshuflw   xmm3, xmm7, 11111111b    ;k3
    pshufhw   xmm4, xmm7, 0b           ;k4
    pshufhw   xmm5, xmm7, 01010101b    ;k5
    pshufhw   xmm6, xmm7, 10101010b    ;k6
    pshufhw   xmm7, xmm7, 11111111b    ;k7

    punpcklwd xmm0, xmm0
    punpcklwd xmm1, xmm1
    punpcklwd xmm2, xmm2
    punpcklwd xmm3, xmm3
    punpckhwd xmm4, xmm4
    punpckhwd xmm5, xmm5
    punpckhwd xmm6, xmm6
    punpckhwd xmm7, xmm7

    movdqa    k0, xmm0                 ;store filter factors on stack
    movdqa    k1, xmm1
    movdqa    k2, xmm2
    movdqa    k3, xmm3
    movdqa    k4, xmm4
    movdqa    k5, xmm5
    movdqa    k6, xmm6
    movdqa    k7, xmm7

    movq      xmm6, rcx
    pshufd    xmm6, xmm6, 0
    movdqa    krd, xmm6                ;rounding

    pxor      xmm7, xmm7
    movdqa    zero, xmm7
%endm
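
; GET_FILTERS broadcasts each 16-bit tap to all eight words of its own
; 16-byte stack slot (k0..k7), loads rsi/rdi from src_ptr/output_ptr, and
; sets krd = 64 per word (rounding) and zero = 0 (for byte-to-word unpacking).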

%macro LOAD_VERT_8 1
    movq      xmm0, [rsi + %1]             ;0
    movq      xmm1, [rsi + rax + %1]       ;1
    movq      xmm6, [rsi + rdx * 2 + %1]   ;6
    lea       rsi,  [rsi + rax]
    movq      xmm7, [rsi + rdx * 2 + %1]   ;7
    movq      xmm2, [rsi + rax + %1]       ;2
    movq      xmm3, [rsi + rax * 2 + %1]   ;3
    movq      xmm4, [rsi + rdx + %1]       ;4
    movq      xmm5, [rsi + rax * 4 + %1]   ;5
%endm
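
; LOAD_VERT_8 reads rows 0..7 of an 8-byte-wide column at byte offset %1,
; assuming rax = src_pitch and rdx = src_pitch * 3:
;   xmm0=row0  xmm1=row1  xmm2=row2  xmm3=row3
;   xmm4=row4  xmm5=row5  xmm6=row6  xmm7=row7
; It leaves rsi advanced by one src_pitch, which is exactly the per-row step
; the vertical loops need.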

%macro APPLY_FILTER_8 2
    punpcklbw xmm0, zero
    punpcklbw xmm1, zero
    punpcklbw xmm6, zero
    punpcklbw xmm7, zero
    punpcklbw xmm2, zero
    punpcklbw xmm5, zero
    punpcklbw xmm3, zero
    punpcklbw xmm4, zero

    pmullw    xmm0, k0
    pmullw    xmm1, k1
    pmullw    xmm6, k6
    pmullw    xmm7, k7
    pmullw    xmm2, k2
    pmullw    xmm5, k5
    pmullw    xmm3, k3
    pmullw    xmm4, k4

    paddsw    xmm0, xmm1
    paddsw    xmm0, xmm6
    paddsw    xmm0, xmm7
    paddsw    xmm0, xmm2
    paddsw    xmm0, xmm5
    paddsw    xmm0, xmm3
    paddsw    xmm0, xmm4

    paddsw    xmm0, krd                ;rounding
    psraw     xmm0, 7                  ;shift
    packuswb  xmm0, xmm0               ;pack back to byte

%if %1
    movq      xmm1, [rdi + %2]
    pavgb     xmm0, xmm1
%endif
    movq      [rdi + %2], xmm0
%endm
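
; APPLY_FILTER_8 produces eight output pixels at once:
;   out = clip_u8((k0*p0 + k1*p1 + ... + k7*p7 + 64) >> 7)
; %1 nonzero averages the result with the destination (pavgb) for the *_avg
; variants; %2 is the byte offset into the output row (0, or 8 for the upper
; half in the 16-wide functions).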

SECTION .text

;void vpx_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
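; arg(n), defined in x86_abi_support.asm, refers to the n-th function
; argument.  In the vertical loops below, rax holds src_pitch, rbx out_pitch,
; rdx src_pitch * 3, and rcx the remaining output_height.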
global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    push      rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov       rsi, arg(0)              ;src_ptr
    mov       rdi, arg(2)              ;output_ptr
    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rbx, DWORD PTR arg(3)    ;out_pitch
    lea       rdx, [rax + rax * 2]
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movd      xmm0, [rsi]              ;load src: row 0
    movd      xmm1, [rsi + rax]        ;1
    movd      xmm6, [rsi + rdx * 2]    ;6
    lea       rsi,  [rsi + rax]
    movd      xmm7, [rsi + rdx * 2]    ;7
    movd      xmm2, [rsi + rax]        ;2
    movd      xmm3, [rsi + rax * 2]    ;3
    movd      xmm4, [rsi + rdx]        ;4
    movd      xmm5, [rsi + rax * 4]    ;5

    APPLY_FILTER_4 0

    lea       rdi, [rdi + rbx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 6
    pop       rsp
    pop       rbx
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;void vpx_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    push      rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rbx, DWORD PTR arg(3)    ;out_pitch
    lea       rdx, [rax + rax * 2]
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0

    lea       rdi, [rdi + rbx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    pop       rbx
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;void vpx_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    push      rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rbx, DWORD PTR arg(3)    ;out_pitch
    lea       rdx, [rax + rax * 2]
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub       rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 0, 8

    add       rdi, rbx
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    pop       rbx
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
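
; The *_avg variants that follow are identical to the functions above except
; that APPLY_FILTER_4/APPLY_FILTER_8 is invoked with 1, so each filtered
; result is averaged (pavgb) with the pixels already in the destination.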

global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    push      rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov       rsi, arg(0)              ;src_ptr
    mov       rdi, arg(2)              ;output_ptr
    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rbx, DWORD PTR arg(3)    ;out_pitch
    lea       rdx, [rax + rax * 2]
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movd      xmm0, [rsi]              ;load src: row 0
    movd      xmm1, [rsi + rax]        ;1
    movd      xmm6, [rsi + rdx * 2]    ;6
    lea       rsi,  [rsi + rax]
    movd      xmm7, [rsi + rdx * 2]    ;7
    movd      xmm2, [rsi + rax]        ;2
    movd      xmm3, [rsi + rax * 2]    ;3
    movd      xmm4, [rsi + rdx]        ;4
    movd      xmm5, [rsi + rax * 4]    ;5

    APPLY_FILTER_4 1

    lea       rdi, [rdi + rbx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 6
    pop       rsp
    pop       rbx
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    push      rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rbx, DWORD PTR arg(3)    ;out_pitch
    lea       rdx, [rax + rax * 2]
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0

    lea       rdi, [rdi + rbx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    pop       rbx
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    push      rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rbx, DWORD PTR arg(3)    ;out_pitch
    lea       rdx, [rax + rax * 2]
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub       rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 1, 8

    add       rdi, rbx
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    pop       rbx
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
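
; The horizontal filters below load 16 unaligned source bytes starting 3
; pixels to the left of the current position and build the eight filter
; phases with psrldq byte shifts; the 16-wide versions repeat the step for
; the upper half starting at src + 5 (= 8 - 3).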

;void vpx_filter_block1d4_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov       rsi, arg(0)              ;src_ptr
    mov       rdi, arg(2)              ;output_ptr
    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)    ;out_pitch
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movdqu    xmm0, [rsi - 3]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm3, 3
    psrldq    xmm5, 5
    psrldq    xmm4, 4

    APPLY_FILTER_4 0

    lea       rsi, [rsi + rax]
    lea       rdi, [rdi + rdx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 6
    pop       rsp
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;void vpx_filter_block1d8_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)    ;out_pitch
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movdqu    xmm0, [rsi - 3]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm5, 5
    psrldq    xmm3, 3
    psrldq    xmm4, 4

    APPLY_FILTER_8 0, 0

    lea       rsi, [rsi + rax]
    lea       rdi, [rdi + rdx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;void vpx_filter_block1d16_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)    ;out_pitch
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movdqu    xmm0, [rsi - 3]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm5, 5
    psrldq    xmm3, 3
    psrldq    xmm4, 4

    APPLY_FILTER_8 0, 0

    movdqu    xmm0, [rsi + 5]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm5, 5
    psrldq    xmm3, 3
    psrldq    xmm4, 4

    APPLY_FILTER_8 0, 8

    lea       rsi, [rsi + rax]
    lea       rdi, [rdi + rdx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov       rsi, arg(0)              ;src_ptr
    mov       rdi, arg(2)              ;output_ptr
    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)    ;out_pitch
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movdqu    xmm0, [rsi - 3]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm3, 3
    psrldq    xmm5, 5
    psrldq    xmm4, 4

    APPLY_FILTER_4 1

    lea       rsi, [rsi + rax]
    lea       rdi, [rdi + rdx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 6
    pop       rsp
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)    ;out_pitch
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movdqu    xmm0, [rsi - 3]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm5, 5
    psrldq    xmm3, 3
    psrldq    xmm4, 4

    APPLY_FILTER_8 1, 0

    lea       rsi, [rsi + rax]
    lea       rdi, [rdi + rdx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push      rsi
    push      rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub       rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd    rax, DWORD PTR arg(1)    ;pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)    ;out_pitch
    movsxd    rcx, DWORD PTR arg(4)    ;output_height

.loop:
    movdqu    xmm0, [rsi - 3]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm5, 5
    psrldq    xmm3, 3
    psrldq    xmm4, 4

    APPLY_FILTER_8 1, 0

    movdqu    xmm0, [rsi + 5]          ;load src

    movdqa    xmm1, xmm0
    movdqa    xmm6, xmm0
    movdqa    xmm7, xmm0
    movdqa    xmm2, xmm0
    movdqa    xmm5, xmm0
    movdqa    xmm3, xmm0
    movdqa    xmm4, xmm0

    psrldq    xmm1, 1
    psrldq    xmm6, 6
    psrldq    xmm7, 7
    psrldq    xmm2, 2
    psrldq    xmm5, 5
    psrldq    xmm3, 3
    psrldq    xmm4, 4

    APPLY_FILTER_8 1, 8

    lea       rsi, [rsi + rax]
    lea       rdi, [rdi + rdx]
    dec       rcx
    jnz       .loop

    add       rsp, 16 * 10
    pop       rsp
    ; begin epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret