; subpixel_sse2.asm
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. %define BLOCK_HEIGHT_WIDTH 4
  12. %define VP8_FILTER_WEIGHT 128
  13. %define VP8_FILTER_SHIFT 7
  14. SECTION .text
  ;/************************************************************************************
  ; void vp8_filter_block1d8_h6_sse2
  ; (
  ;     unsigned char  *src_ptr,              arg(0)
  ;     unsigned short *output_ptr,           arg(1)
  ;     unsigned int    src_pixels_per_line,  arg(2)
  ;     unsigned int    pixel_step,           arg(3)  (not read by this routine)
  ;     unsigned int    output_height,        arg(4)
  ;     unsigned int    output_width,         arg(5)
  ;     short          *vp8_filter            arg(6)
  ; )
  ;
  ; Horizontal 6-tap filter, 8 outputs per row, one row per loop iteration.
  ; For each row the routine
  ;   - gathers source bytes src[-2 .. 13] into a single XMM register,
  ;   - builds six byte-shifted views, widens the low 8 bytes of each view to
  ;     words and multiplies them by the coefficient vectors located at
  ;     vp8_filter + 0/16/32/48/64/80,
  ;   - accumulates the products with saturating adds, adds the rounding
  ;     term 'rd', shifts right by VP8_FILTER_SHIFT and clamps to 0..255,
  ;   - writes the 8 results as 16-bit words using an aligned 16-byte store
  ;     (movdqa), so every output row must be 16-byte aligned.
  ;
  ; output_width is added to the output pointer unscaled after each row.
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; NOTE(review): the second 8-byte load covers src[6 .. 13] although only
  ; src[6 .. 10] contribute to the result.
  ;
  ; Loop registers:
  ;   rsi = source row        rdi = output row       rax = source pitch
  ;   rcx = rows remaining    rdx = coefficients     r8  = output_width (64-bit ABI)
  ;   xmm0 = all zero
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
  sym(vp8_filter_block1d8_h6_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(1)                     ; output_ptr
        mov         rdx, arg(6)                     ; vp8_filter
        movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line
        movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(5)            ; output_width
  %endif
        pxor        xmm0, xmm0                      ; zero source for byte->word unpack

  .h6x8_row:
        ; ---- assemble src[-2 .. 13] in xmm1 --------------------------------
        movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
        movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]
        prefetcht2  [rsi+rax-2]                     ; hint: next source row
        pslldq      xmm1, 8
        por         xmm1, xmm3

        ; ---- one working copy per tap (tap 1 reuses xmm3, tap 6 xmm1) ------
        movdqa      xmm4, xmm1
        movdqa      xmm5, xmm1
        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm1

        ; tap 1: src[-2 .. 5]
        punpcklbw   xmm3, xmm0
        pmullw      xmm3, XMMWORD PTR [rdx]

        ; tap 2: src[-1 .. 6]
        psrldq      xmm4, 1
        punpcklbw   xmm4, xmm0
        pmullw      xmm4, XMMWORD PTR [rdx+16]

        ; tap 3: src[0 .. 7]
        psrldq      xmm5, 2
        punpcklbw   xmm5, xmm0
        pmullw      xmm5, [rdx+32]

        ; tap 4: src[1 .. 8]
        psrldq      xmm6, 3
        punpcklbw   xmm6, xmm0
        pmullw      xmm6, [rdx+48]

        ; tap 5: src[2 .. 9]
        psrldq      xmm7, 4
        punpcklbw   xmm7, xmm0
        pmullw      xmm7, [rdx+64]

        ; tap 6: src[3 .. 10]
        psrldq      xmm1, 5
        punpcklbw   xmm1, xmm0
        pmullw      xmm1, [rdx+80]

        ; ---- saturating accumulation; the order is significant -------------
        paddsw      xmm4, xmm7
        paddsw      xmm4, xmm5
        paddsw      xmm4, xmm3
        paddsw      xmm4, xmm6
        paddsw      xmm4, xmm1
        paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
        psraw       xmm4, VP8_FILTER_SHIFT

        ; clamp to 0..255 by packing to bytes, then widen back to words
        packuswb    xmm4, xmm0
        punpcklbw   xmm4, xmm0
        movdqa      XMMWORD Ptr [rdi], xmm4

        ; ---- advance to the next row ---------------------------------------
        add         rsi, rax
  %if ABI_IS_32BIT
        add         rdi, DWORD Ptr arg(5)           ; output_width
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .h6x8_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_filter_block1d16_h6_sse2
  ; (
  ;     unsigned char  *src_ptr,              arg(0)
  ;     unsigned short *output_ptr,           arg(1)
  ;     unsigned int    src_pixels_per_line,  arg(2)
  ;     unsigned int    pixel_step,           arg(3)  (not read by this routine)
  ;     unsigned int    output_height,        arg(4)
  ;     unsigned int    output_width,         arg(5)
  ;     short          *vp8_filter            arg(6)
  ; )
  ;
  ; Horizontal 6-tap filter, 16 outputs per row, one row per loop iteration.
  ; Each row is processed as two independent 8-pixel halves:
  ;   left  half: built from src[-2 .. 13], stored to [output_ptr]
  ;   right half: built from src[ 6 .. 18], stored to [output_ptr + 16]
  ; Per half, six byte-shifted views are widened to words, multiplied by the
  ; coefficient vectors at vp8_filter + 0/16/32/48/64/80, accumulated with
  ; saturating adds, rounded with 'rd', shifted right by VP8_FILTER_SHIFT and
  ; clamped to 0..255.  Results are stored as 16-bit words with aligned
  ; 16-byte stores (movdqa), so every output row must be 16-byte aligned.
  ;
  ; output_width is added to the output pointer unscaled after each row.
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; Loop registers:
  ;   rsi = source row        rdi = output row       rax = source pitch
  ;   rcx = rows remaining    rdx = coefficients     r8  = output_width (64-bit ABI)
  ;   xmm0 = all zero         xmm2 = right-half source bytes
  ;*************************************************************************************/
  global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
  sym(vp8_filter_block1d16_h6_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(1)                     ; output_ptr
        mov         rdx, arg(6)                     ; vp8_filter
        movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line
        movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(5)            ; output_width
  %endif
        pxor        xmm0, xmm0                      ; zero source for byte->word unpack

  .h6x16_row:
        movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
        movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

        ; The last 8 bytes are fetched from offset 11 (src[11 .. 18]) instead
        ; of offset 14 so the load does not run past src[18].  After the
        ; 5-byte shift the three overlapping bytes coincide with identical
        ; values already in xmm1, so OR-ing without masking is safe.
        movq        xmm2, MMWORD PTR [rsi +11]
        pslldq      xmm2, 5
        por         xmm2, xmm1                      ; bytes 0..12 = src[6 .. 18]

        prefetcht2  [rsi+rax-2]                     ; hint: next source row

        pslldq      xmm1, 8
        por         xmm1, xmm3                      ; bytes 0..15 = src[-2 .. 13]

        ; ================= left half: outputs 0 .. 7 ========================
        movdqa      xmm4, xmm1
        movdqa      xmm5, xmm1
        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm1

        ; tap 1: src[-2 .. 5]
        punpcklbw   xmm3, xmm0
        pmullw      xmm3, XMMWORD PTR [rdx]

        ; tap 2: src[-1 .. 6]
        psrldq      xmm4, 1
        punpcklbw   xmm4, xmm0
        pmullw      xmm4, XMMWORD PTR [rdx+16]

        ; tap 3: src[0 .. 7]
        psrldq      xmm5, 2
        punpcklbw   xmm5, xmm0
        pmullw      xmm5, [rdx+32]

        ; tap 4: src[1 .. 8]
        psrldq      xmm6, 3
        punpcklbw   xmm6, xmm0
        pmullw      xmm6, [rdx+48]

        ; tap 5: src[2 .. 9]
        psrldq      xmm7, 4
        punpcklbw   xmm7, xmm0
        pmullw      xmm7, [rdx+64]

        ; tap 6: src[3 .. 10]
        psrldq      xmm1, 5
        punpcklbw   xmm1, xmm0
        pmullw      xmm1, [rdx+80]

        ; saturating accumulation; the order is significant
        paddsw      xmm4, xmm7
        paddsw      xmm4, xmm5
        paddsw      xmm4, xmm3
        paddsw      xmm4, xmm6
        paddsw      xmm4, xmm1
        paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
        psraw       xmm4, VP8_FILTER_SHIFT

        packuswb    xmm4, xmm0                      ; clamp to 0..255
        punpcklbw   xmm4, xmm0                      ; back to words
        movdqa      XMMWORD Ptr [rdi], xmm4

        ; ================= right half: outputs 8 .. 15 ======================
        movdqa      xmm3, xmm2
        movdqa      xmm4, xmm2
        movdqa      xmm5, xmm2
        movdqa      xmm6, xmm2
        movdqa      xmm7, xmm2

        ; tap 1: src[6 .. 13]
        punpcklbw   xmm3, xmm0
        pmullw      xmm3, XMMWORD PTR [rdx]

        ; tap 2: src[7 .. 14]
        psrldq      xmm4, 1
        punpcklbw   xmm4, xmm0
        pmullw      xmm4, XMMWORD PTR [rdx+16]

        ; tap 3: src[8 .. 15]
        psrldq      xmm5, 2
        punpcklbw   xmm5, xmm0
        pmullw      xmm5, [rdx+32]

        ; tap 4: src[9 .. 16]
        psrldq      xmm6, 3
        punpcklbw   xmm6, xmm0
        pmullw      xmm6, [rdx+48]

        ; tap 5: src[10 .. 17]
        psrldq      xmm7, 4
        punpcklbw   xmm7, xmm0
        pmullw      xmm7, [rdx+64]

        ; tap 6: src[11 .. 18]
        psrldq      xmm2, 5
        punpcklbw   xmm2, xmm0
        pmullw      xmm2, [rdx+80]

        ; saturating accumulation; the order is significant
        paddsw      xmm4, xmm7
        paddsw      xmm4, xmm5
        paddsw      xmm4, xmm3
        paddsw      xmm4, xmm6
        paddsw      xmm4, xmm2
        paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
        psraw       xmm4, VP8_FILTER_SHIFT

        packuswb    xmm4, xmm0                      ; clamp to 0..255
        punpcklbw   xmm4, xmm0                      ; back to words
        movdqa      XMMWORD Ptr [rdi+16], xmm4

        ; ---- advance to the next row ---------------------------------------
        add         rsi, rax
  %if ABI_IS_32BIT
        add         rdi, DWORD Ptr arg(5)           ; output_width
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .h6x16_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_filter_block1d8_v6_sse2
  ; (
  ;     short          *src_ptr,          arg(0)
  ;     unsigned char  *output_ptr,       arg(1)
  ;     int             dst_pitch,        arg(2)
  ;     unsigned int    pixels_per_line,  arg(3)
  ;     unsigned int    pixel_step,       arg(4)  (not read by this routine)
  ;     unsigned int    output_height,    arg(5)
  ;     unsigned int    output_width,     arg(6)  (not read by this routine)
  ;     short          *vp8_filter        arg(7)
  ; )
  ;
  ; Vertical 6-tap filter over rows of eight 16-bit samples, one output row
  ; per loop iteration.  The source pointer is first moved back by
  ; 2 * pixels_per_line; pixels_per_line is applied to the pointer unscaled.
  ; For each output row, six consecutive input rows are multiplied by the
  ; coefficient vectors at vp8_filter + 0/16/32/48/64/80, accumulated with
  ; saturating adds, rounded with 'rd', shifted right by VP8_FILTER_SHIFT,
  ; packed to unsigned bytes with saturation, and 8 bytes are stored.
  ;
  ; Input rows are read with movdqa and therefore must be 16-byte aligned.
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; Loop registers:
  ;   rsi = top input row     rdi = output row       rdx = input row stride
  ;   rcx = rows remaining    rax = coefficients     r8  = dst_pitch (64-bit ABI)
  ;   xmm0 = all zero         xmm7 = rounding term
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
  sym(vp8_filter_block1d8_v6_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 8
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(1)                     ; output_ptr
        mov         rax, arg(7)                     ; vp8_filter
        movsxd      rdx, dword ptr arg(3)           ; pixels_per_line
        movsxd      rcx, DWORD PTR arg(5)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(2)            ; dst_pitch
  %endif

        ; start two rows above the row being produced
        sub         rsi, rdx
        sub         rsi, rdx

        pxor        xmm0, xmm0
        movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]  ; rounding term

  .v6x8_row:
        ; Rows 0, 1, 2 and 4 are addressable directly; rsi is then bumped by
        ; one row, which both exposes rows 3 and 5 and leaves rsi positioned
        ; for the next iteration.
        movdqa      xmm1, XMMWORD PTR [rsi]             ; row 0
        movdqa      xmm2, XMMWORD PTR [rsi + rdx]       ; row 1
        movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; row 2
        movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; row 4
        add         rsi, rdx
        movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2]   ; row 3
        movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4]   ; row 5

        pmullw      xmm1, [rax]                     ; tap 1
        pmullw      xmm2, [rax + 16]                ; tap 2
        pmullw      xmm3, [rax + 32]                ; tap 3
        pmullw      xmm4, [rax + 48]                ; tap 4
        pmullw      xmm5, [rax + 64]                ; tap 5
        pmullw      xmm6, [rax + 80]                ; tap 6

        ; saturating accumulation; the order is significant
        paddsw      xmm2, xmm5
        paddsw      xmm2, xmm3
        paddsw      xmm2, xmm1
        paddsw      xmm2, xmm4
        paddsw      xmm2, xmm6
        paddsw      xmm2, xmm7                      ; rounding term
        psraw       xmm2, VP8_FILTER_SHIFT

        packuswb    xmm2, xmm0                      ; words -> bytes, clamped to 0..255
        movq        QWORD PTR [rdi], xmm2           ; 8 output bytes

  %if ABI_IS_32BIT
        add         rdi, DWORD PTR arg(2)           ; dst_pitch
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .v6x8_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_filter_block1d16_v6_sse2
  ; (
  ;     unsigned short *src_ptr,          arg(0)
  ;     unsigned char  *output_ptr,       arg(1)
  ;     int             dst_pitch,        arg(2)
  ;     unsigned int    pixels_per_line,  arg(3)
  ;     unsigned int    pixel_step,       arg(4)  (not read by this routine)
  ;     unsigned int    output_height,    arg(5)
  ;     unsigned int    output_width,     arg(6)  (not read by this routine)
  ;     const short    *vp8_filter        arg(7)
  ; )
  ;
  ; Vertical 6-tap filter over rows of sixteen 16-bit samples, one output row
  ; per loop iteration.  Each input row is handled as two 8-word columns
  ; (offset 0 and offset 16 bytes); xmm1 accumulates the left column and xmm2
  ; the right one.  The source pointer is first moved back by
  ; 2 * pixels_per_line; pixels_per_line is applied to the pointer unscaled.
  ; Products are accumulated with saturating adds in the row order
  ; 2, 5, 3, 1, 4, 6, then rounded with 'rd', shifted right by
  ; VP8_FILTER_SHIFT, packed to 16 unsigned bytes with saturation and stored
  ; with an aligned 16-byte store.
  ;
  ; Both the input rows and the output rows are accessed with movdqa and must
  ; be 16-byte aligned.  The loop is do/while shaped: output_height must be
  ; non-zero on entry.
  ;
  ; Loop registers:
  ;   rsi = top input row     rdi = output row       rdx = input row stride
  ;   rcx = rows remaining    rax = coefficients     r8  = dst_pitch (64-bit ABI)
  ;*************************************************************************************/
  global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
  sym(vp8_filter_block1d16_v6_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 8
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(1)                     ; output_ptr
        mov         rax, arg(7)                     ; vp8_filter
        movsxd      rdx, dword ptr arg(3)           ; pixels_per_line
        movsxd      rcx, DWORD PTR arg(5)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(2)            ; dst_pitch
  %endif

        ; start two rows above the row being produced
        sub         rsi, rdx
        sub         rsi, rdx

  .v6x16_row:
        ; ---- phase 1: rows 2, 5, 3, 1 (left column / right column) ---------
        movdqa      xmm1, XMMWORD PTR [rsi + rdx]            ; row 2
        movdqa      xmm2, XMMWORD PTR [rsi + rdx + 16]
        pmullw      xmm1, [rax + 16]
        pmullw      xmm2, [rax + 16]

        movdqa      xmm3, XMMWORD PTR [rsi + rdx * 4]        ; row 5
        movdqa      xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
        pmullw      xmm3, [rax + 64]
        pmullw      xmm4, [rax + 64]

        movdqa      xmm5, XMMWORD PTR [rsi + rdx * 2]        ; row 3
        movdqa      xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
        pmullw      xmm5, [rax + 32]
        pmullw      xmm6, [rax + 32]

        movdqa      xmm7, XMMWORD PTR [rsi]                  ; row 1
        movdqa      xmm0, XMMWORD PTR [rsi + 16]
        pmullw      xmm7, [rax]
        pmullw      xmm0, [rax]

        ; left column partial sum (saturating; order is significant)
        paddsw      xmm1, xmm3
        paddsw      xmm1, xmm5
        paddsw      xmm1, xmm7
        ; right column partial sum
        paddsw      xmm2, xmm4
        paddsw      xmm2, xmm6
        paddsw      xmm2, xmm0

        ; ---- phase 2: rows 4 and 6, reached after stepping down one row ----
        ; (this step also positions rsi for the next iteration)
        add         rsi, rdx

        movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]        ; row 4
        movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
        pmullw      xmm3, [rax + 48]
        pmullw      xmm4, [rax + 48]

        movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]        ; row 6
        movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
        pmullw      xmm5, [rax + 80]
        pmullw      xmm6, [rax + 80]

        ; xmm7 is reused as a data register above, so the rounding term is
        ; reloaded on every iteration.
        movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]
        pxor        xmm0, xmm0                      ; cleared here; reloaded at the top of the loop

        ; left column
        paddsw      xmm1, xmm3
        paddsw      xmm1, xmm5
        paddsw      xmm1, xmm7
        psraw       xmm1, VP8_FILTER_SHIFT
        ; right column
        paddsw      xmm2, xmm4
        paddsw      xmm2, xmm6
        paddsw      xmm2, xmm7
        psraw       xmm2, VP8_FILTER_SHIFT

        packuswb    xmm1, xmm2                      ; 16 bytes, each clamped to 0..255
        movdqa      XMMWORD PTR [rdi], xmm1

  %if ABI_IS_32BIT
        add         rdi, DWORD PTR arg(2)           ; dst_pitch
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .v6x16_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_filter_block1d8_h6_only_sse2
  ; (
  ;     unsigned char  *src_ptr,              arg(0)
  ;     unsigned int    src_pixels_per_line,  arg(1)
  ;     unsigned char  *output_ptr,           arg(2)
  ;     int             dst_pitch,            arg(3)
  ;     unsigned int    output_height,        arg(4)
  ;     const short    *vp8_filter            arg(5)
  ; )
  ;
  ; First-pass filter only, used when yoffset == 0.
  ;
  ; Horizontal 6-tap filter, 8 outputs per row, one row per loop iteration,
  ; writing final 8-bit pixels.  For each row, source bytes src[-2 .. 13] are
  ; gathered, six byte-shifted views are widened to words and multiplied by
  ; the coefficient vectors at vp8_filter + 0/16/32/48/64/80, the products are
  ; accumulated with saturating adds, rounded with 'rd', shifted right by
  ; VP8_FILTER_SHIFT, packed to unsigned bytes with saturation, and 8 bytes
  ; are stored.
  ;
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; NOTE(review): the second 8-byte load covers src[6 .. 13] although only
  ; src[6 .. 10] contribute to the result.
  ;
  ; Loop registers:
  ;   rsi = source row        rdi = output row       rax = source pitch
  ;   rcx = rows remaining    rdx = coefficients     r8  = dst_pitch (64-bit ABI)
  ;   xmm0 = all zero
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
  sym(vp8_filter_block1d8_h6_only_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(2)                     ; output_ptr
        mov         rdx, arg(5)                     ; vp8_filter
        movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
        movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(3)            ; dst_pitch
  %endif
        pxor        xmm0, xmm0                      ; zero source for byte->word unpack

  .h6x8_only_row:
        ; ---- assemble src[-2 .. 13] in xmm1 --------------------------------
        movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
        movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]
        prefetcht2  [rsi+rax-2]                     ; hint: next source row
        pslldq      xmm1, 8
        por         xmm1, xmm3

        ; ---- one working copy per tap (tap 1 reuses xmm3, tap 6 xmm1) ------
        movdqa      xmm4, xmm1
        movdqa      xmm5, xmm1
        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm1

        ; tap 1: src[-2 .. 5]
        punpcklbw   xmm3, xmm0
        pmullw      xmm3, XMMWORD PTR [rdx]

        ; tap 2: src[-1 .. 6]
        psrldq      xmm4, 1
        punpcklbw   xmm4, xmm0
        pmullw      xmm4, XMMWORD PTR [rdx+16]

        ; tap 3: src[0 .. 7]
        psrldq      xmm5, 2
        punpcklbw   xmm5, xmm0
        pmullw      xmm5, [rdx+32]

        ; tap 4: src[1 .. 8]
        psrldq      xmm6, 3
        punpcklbw   xmm6, xmm0
        pmullw      xmm6, [rdx+48]

        ; tap 5: src[2 .. 9]
        psrldq      xmm7, 4
        punpcklbw   xmm7, xmm0
        pmullw      xmm7, [rdx+64]

        ; tap 6: src[3 .. 10]
        psrldq      xmm1, 5
        punpcklbw   xmm1, xmm0
        pmullw      xmm1, [rdx+80]

        ; ---- saturating accumulation; the order is significant -------------
        paddsw      xmm4, xmm7
        paddsw      xmm4, xmm5
        paddsw      xmm4, xmm3
        paddsw      xmm4, xmm6
        paddsw      xmm4, xmm1
        paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
        psraw       xmm4, VP8_FILTER_SHIFT

        packuswb    xmm4, xmm0                      ; words -> bytes, clamped to 0..255
        movq        QWORD PTR [rdi], xmm4           ; 8 output bytes

        ; ---- advance to the next row ---------------------------------------
        add         rsi, rax
  %if ABI_IS_32BIT
        add         rdi, DWORD Ptr arg(3)           ; dst_pitch
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .h6x8_only_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_filter_block1d16_h6_only_sse2
  ; (
  ;     unsigned char  *src_ptr,              arg(0)
  ;     unsigned int    src_pixels_per_line,  arg(1)
  ;     unsigned char  *output_ptr,           arg(2)
  ;     int             dst_pitch,            arg(3)
  ;     unsigned int    output_height,        arg(4)
  ;     const short    *vp8_filter            arg(5)
  ; )
  ;
  ; First-pass filter only, used when yoffset == 0.
  ;
  ; Horizontal 6-tap filter, 16 outputs per row, one row per loop iteration,
  ; writing final 8-bit pixels.  Each row is processed as two 8-pixel halves:
  ;   left  half: built from src[-2 .. 13], stored to [output_ptr]
  ;   right half: built from src[ 6 .. 18], stored to [output_ptr + 8]
  ; Per half, six byte-shifted views are widened to words, multiplied by the
  ; coefficient vectors at vp8_filter + 0/16/32/48/64/80, accumulated with
  ; saturating adds, rounded with 'rd', shifted right by VP8_FILTER_SHIFT and
  ; packed to unsigned bytes with saturation.
  ;
  ; The highest source byte read per row is src[18], which is exactly the
  ; last byte the filter needs for output 15.
  ;
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; Loop registers:
  ;   rsi = source row        rdi = output row       rax = source pitch
  ;   rcx = rows remaining    rdx = coefficients     r8  = dst_pitch (64-bit ABI)
  ;   xmm0 = all zero         xmm2 = right-half source bytes
  ;*************************************************************************************/
  global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
  sym(vp8_filter_block1d16_h6_only_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rdx, arg(5)                     ; vp8_filter
        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(2)                     ; output_ptr
        movsxd      rcx, dword ptr arg(4)           ; output_height
        movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(3)            ; dst_pitch
  %endif
        pxor        xmm0, xmm0                      ; zero source for byte->word unpack

  .filter_block1d16_h6_only_sse2_rowloop:
        movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
        movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

        ; Load the tail from offset 11 (src[11 .. 18]) rather than offset 14
        ; (src[14 .. 21]): the filter never consumes anything past src[18], so
        ; this avoids reading 3 bytes beyond the data that is actually needed.
        ; After shifting left by 5 bytes, the three bytes that overlap xmm1
        ; hold the same values as xmm1 does, so no masking is required before
        ; the OR.  Bytes 0..12 of xmm2 (the only ones used below) are
        ; identical to what a load from offset 14 shifted by 8 would produce.
        movq        xmm2, MMWORD PTR [rsi +11]
        pslldq      xmm2, 5
        por         xmm2, xmm1                      ; bytes 0..12 = src[6 .. 18]

        prefetcht2  [rsi+rax-2]                     ; hint: next source row

        pslldq      xmm1, 8
        por         xmm1, xmm3                      ; bytes 0..15 = src[-2 .. 13]

        ; ================= left half: outputs 0 .. 7 ========================
        movdqa      xmm4, xmm1
        movdqa      xmm5, xmm1
        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm1

        punpcklbw   xmm3, xmm0                      ; src[-2 .. 5] as words
        psrldq      xmm4, 1
        pmullw      xmm3, XMMWORD PTR [rdx]         ; tap 1

        punpcklbw   xmm4, xmm0                      ; src[-1 .. 6]
        psrldq      xmm5, 2
        pmullw      xmm4, XMMWORD PTR [rdx+16]      ; tap 2

        punpcklbw   xmm5, xmm0                      ; src[0 .. 7]
        psrldq      xmm6, 3
        pmullw      xmm5, [rdx+32]                  ; tap 3

        punpcklbw   xmm6, xmm0                      ; src[1 .. 8]
        psrldq      xmm7, 4
        pmullw      xmm6, [rdx+48]                  ; tap 4

        punpcklbw   xmm7, xmm0                      ; src[2 .. 9]
        psrldq      xmm1, 5
        pmullw      xmm7, [rdx+64]                  ; tap 5

        punpcklbw   xmm1, xmm0                      ; src[3 .. 10]
        pmullw      xmm1, [rdx+80]                  ; tap 6

        ; saturating accumulation; the order is significant
        paddsw      xmm4, xmm7
        paddsw      xmm4, xmm5
        paddsw      xmm4, xmm3
        paddsw      xmm4, xmm6
        paddsw      xmm4, xmm1
        paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
        psraw       xmm4, 7

        packuswb    xmm4, xmm0                      ; lower 8 bytes, clamped to 0..255
        movq        QWORD Ptr [rdi], xmm4           ; store outputs 0 .. 7

        ; ================= right half: outputs 8 .. 15 ======================
        movdqa      xmm3, xmm2
        movdqa      xmm4, xmm2
        movdqa      xmm5, xmm2
        movdqa      xmm6, xmm2
        movdqa      xmm7, xmm2

        punpcklbw   xmm3, xmm0                      ; src[6 .. 13] as words
        psrldq      xmm4, 1
        pmullw      xmm3, XMMWORD PTR [rdx]         ; tap 1

        punpcklbw   xmm4, xmm0                      ; src[7 .. 14]
        psrldq      xmm5, 2
        pmullw      xmm4, XMMWORD PTR [rdx+16]      ; tap 2

        punpcklbw   xmm5, xmm0                      ; src[8 .. 15]
        psrldq      xmm6, 3
        pmullw      xmm5, [rdx+32]                  ; tap 3

        punpcklbw   xmm6, xmm0                      ; src[9 .. 16]
        psrldq      xmm7, 4
        pmullw      xmm6, [rdx+48]                  ; tap 4

        punpcklbw   xmm7, xmm0                      ; src[10 .. 17]
        psrldq      xmm2, 5
        pmullw      xmm7, [rdx+64]                  ; tap 5

        punpcklbw   xmm2, xmm0                      ; src[11 .. 18]
        pmullw      xmm2, [rdx+80]                  ; tap 6

        ; saturating accumulation; the order is significant
        paddsw      xmm4, xmm7
        paddsw      xmm4, xmm5
        paddsw      xmm4, xmm3
        paddsw      xmm4, xmm6
        paddsw      xmm4, xmm2
        paddsw      xmm4, [GLOBAL(rd)]              ; rounding term
        psraw       xmm4, 7

        packuswb    xmm4, xmm0                      ; higher 8 bytes, clamped to 0..255
        movq        QWORD Ptr [rdi+8], xmm4         ; store outputs 8 .. 15

        ; ---- advance to the next row ---------------------------------------
        lea         rsi, [rsi + rax]
  %if ABI_IS_32BIT
        add         rdi, DWORD Ptr arg(3)           ; dst_pitch
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .filter_block1d16_h6_only_sse2_rowloop ; next row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_filter_block1d8_v6_only_sse2
  ; (
  ;     unsigned char  *src_ptr,              arg(0)
  ;     unsigned int    src_pixels_per_line,  arg(1)
  ;     unsigned char  *output_ptr,           arg(2)
  ;     int             dst_pitch,            arg(3)
  ;     unsigned int    output_height,        arg(4)
  ;     const short    *vp8_filter            arg(5)
  ; )
  ;
  ; Second-pass filter only, used when xoffset == 0.
  ;
  ; Vertical 6-tap filter operating directly on 8-bit source rows, 8 outputs
  ; per row, one row per loop iteration.  Six consecutive source rows starting
  ; at src_ptr are read (8 bytes each), widened to words, multiplied by the
  ; coefficient vectors at vp8_filter + 0/16/32/48/64/80, accumulated with
  ; saturating adds, rounded with 'rd', shifted right by VP8_FILTER_SHIFT,
  ; packed to unsigned bytes with saturation, and 8 bytes are stored.
  ;
  ; Unlike the two-pass vertical filters in this file, src_ptr is not moved
  ; back by two rows here - presumably the caller passes a pointer that is
  ; already offset; verify against the call site.
  ;
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; Loop registers:
  ;   rsi = top source row    rdi = output row       rdx = source pitch
  ;   rcx = rows remaining    rax = coefficients     r8  = dst_pitch (64-bit ABI)
  ;   xmm0 = all zero         xmm7 = rounding term
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
  sym(vp8_filter_block1d8_v6_only_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(2)                     ; output_ptr
        mov         rax, arg(5)                     ; vp8_filter
        movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line
        movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(3)            ; dst_pitch
  %endif
        pxor        xmm0, xmm0                      ; zero source for byte->word unpack
        movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]  ; rounding term

  .v6x8_only_row:
        ; Rows 0, 1, 2 and 4 are addressable directly; rsi is then bumped by
        ; one row, which both exposes rows 3 and 5 and leaves rsi positioned
        ; for the next iteration.
        movq        xmm1, MMWORD PTR [rsi]              ; row 0
        movq        xmm2, MMWORD PTR [rsi + rdx]        ; row 1
        movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ; row 2
        movq        xmm5, MMWORD PTR [rsi + rdx * 4]    ; row 4
        add         rsi, rdx
        movq        xmm4, MMWORD PTR [rsi + rdx * 2]    ; row 3
        movq        xmm6, MMWORD PTR [rsi + rdx * 4]    ; row 5

        ; widen every row to words
        punpcklbw   xmm1, xmm0
        punpcklbw   xmm2, xmm0
        punpcklbw   xmm3, xmm0
        punpcklbw   xmm4, xmm0
        punpcklbw   xmm5, xmm0
        punpcklbw   xmm6, xmm0

        ; apply one coefficient vector per row
        pmullw      xmm1, [rax]                     ; tap 1
        pmullw      xmm2, [rax + 16]                ; tap 2
        pmullw      xmm3, [rax + 32]                ; tap 3
        pmullw      xmm4, [rax + 48]                ; tap 4
        pmullw      xmm5, [rax + 64]                ; tap 5
        pmullw      xmm6, [rax + 80]                ; tap 6

        ; saturating accumulation; the order is significant
        paddsw      xmm2, xmm5
        paddsw      xmm2, xmm3
        paddsw      xmm2, xmm1
        paddsw      xmm2, xmm4
        paddsw      xmm2, xmm6
        paddsw      xmm2, xmm7                      ; rounding term
        psraw       xmm2, VP8_FILTER_SHIFT

        packuswb    xmm2, xmm0                      ; words -> bytes, clamped to 0..255
        movq        QWORD PTR [rdi], xmm2           ; 8 output bytes

  %if ABI_IS_32BIT
        add         rdi, DWORD PTR arg(3)           ; dst_pitch
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .v6x8_only_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  ;/************************************************************************************
  ; void vp8_unpack_block1d16_h6_sse2
  ; (
  ;     unsigned char  *src_ptr,              arg(0)
  ;     unsigned short *output_ptr,           arg(1)
  ;     unsigned int    src_pixels_per_line,  arg(2)
  ;     unsigned int    output_height,        arg(3)
  ;     unsigned int    output_width          arg(4)
  ; )
  ;
  ; Widens 16 source bytes per row to sixteen zero-extended 16-bit words; no
  ; filtering is applied.  Each row is read as two 8-byte halves (src[0 .. 7]
  ; and src[8 .. 15]) and written as two aligned 16-byte stores (movdqa), so
  ; every output row must be 16-byte aligned.
  ;
  ; output_width is added to the output pointer unscaled after each row.
  ; The loop is do/while shaped: output_height must be non-zero on entry.
  ;
  ; Only xmm0, xmm1 and xmm3 are touched, hence there is no SAVE_XMM /
  ; RESTORE_XMM pair in this routine.
  ;
  ; Loop registers:
  ;   rsi = source row        rdi = output row       rax = source pitch
  ;   rcx = rows remaining    r8  = output_width (64-bit ABI)
  ;   xmm0 = all zero
  ;*************************************************************************************/
  global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
  sym(vp8_unpack_block1d16_h6_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(1)                     ; output_ptr
        movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line
        movsxd      rcx, dword ptr arg(3)           ; output_height
  %if ABI_IS_32BIT=0
        movsxd      r8, dword ptr arg(4)            ; output_width
  %endif
        pxor        xmm0, xmm0                      ; zero source for byte->word unpack

  .unpack16_row:
        ; low half: src[0 .. 7] -> words -> [rdi]
        movq        xmm1, MMWORD PTR [rsi]
        punpcklbw   xmm1, xmm0
        movdqa      XMMWORD Ptr [rdi], xmm1

        ; high half: src[8 .. 15] -> words -> [rdi + 16]
        movq        xmm3, MMWORD PTR [rsi+8]
        punpcklbw   xmm3, xmm0
        movdqa      XMMWORD Ptr [rdi + 16], xmm3

        ; ---- advance to the next row ---------------------------------------
        add         rsi, rax
  %if ABI_IS_32BIT
        add         rdi, DWORD Ptr arg(4)           ; output_width
  %else
        add         rdi, r8
  %endif
        dec         rcx
        jnz         .unpack16_row

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        UNSHADOW_ARGS
        pop         rbp
        ret
  SECTION_RODATA
  align 16
  ; Rounding term added to every filter sum before the arithmetic right shift
  ; by VP8_FILTER_SHIFT: eight 16-bit lanes, each holding half of
  ; VP8_FILTER_WEIGHT (0x40).  Kept 16-byte aligned because it is read both
  ; with movdqa and as a direct SSE memory operand.
  rd:
        times 8 dw (VP8_FILTER_WEIGHT >> 1)