2
0

subpixel_ssse3.asm 42 KB


  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. %define BLOCK_HEIGHT_WIDTH 4
  12. %define VP8_FILTER_WEIGHT 128
  13. %define VP8_FILTER_SHIFT 7
  ;/************************************************************************************
  ; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels.
  ; The input pixel array has output_height rows. This function handles 8 pixels in the
  ; horizontal direction and calculates ONE row per loop iteration, using 128-bit
  ; operations.
  ;
  ; The original note says output_height is assumed to be even; the row loops below
  ; step one row at a time and only need output_height >= 1 (they are do-while
  ; loops, so a height of 0 would wrap the row counter).
  ;
  ; This is an implementation of some of the SSE optimizations first seen in ffvp8
  ;
  ;*************************************************************************************/
  ;void vp8_filter_block1d8_h6_ssse3
  ;(
  ;    unsigned char  *src_ptr,
  ;    unsigned int    src_pixels_per_line,
  ;    unsigned char  *output_ptr,
  ;    unsigned int    output_pitch,
  ;    unsigned int    output_height,
  ;    unsigned int    vp8_filter_index
  ;)
  ;
  ; vp8_filter_index selects a 16-byte entry (index << 4) relative to k0_k5; the
  ; k1_k3 and k2_k4 tap pairs are read 128 and 256 bytes after that entry.
  ; When the first dword of the selected k0_k5 entry is zero, the routine takes
  ; the 4-tap path, which skips the k0_k5 multiply entirely.
  ;
  ; Register roles inside both row loops:
  ;   rsi = current source row          rax = src_pixels_per_line
  ;   rdi = current output row          rdx = output_pitch
  ;   rcx = rows remaining              xmm7 = rounding constant (rd)
  ; Each row reads source bytes [rsi - 2, rsi + 11) and stores 8 bytes at [rdi].
  global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
  sym(vp8_filter_block1d8_h6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ; esi = 0 for the zero-tap test below
      shl         rdx, 4                          ; 16 bytes per table entry

      movdqa      xmm7, [GLOBAL(rd)]
      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx
      mov         rdi, arg(2)                     ;output_ptr

      cmp         esi, DWORD PTR [rax]            ; leading k0_k5 bytes all zero?
      je          .vp8_filter_block1d8_h4_ssse3   ; yes: 4-tap path

      movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

      ; rdi is advanced by one pitch *before* each store, so start one row early.
      sub         rdi, rdx
  ;xmm3 free
  .filter_block1d8_h6_rowloop_ssse3:
      movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
      movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
      punpcklbw   xmm0, xmm2                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

      movdqa      xmm1, xmm0
      pmaddubsw   xmm0, xmm4                      ; outer tap pair (k0_k5)

      movdqa      xmm2, xmm1
      pshufb      xmm1, [GLOBAL(shuf2bfrom1)]     ; byte pairs for the k2_k4 multiply
      pshufb      xmm2, [GLOBAL(shuf3bfrom1)]     ; byte pairs for the k1_k3 multiply

      pmaddubsw   xmm1, xmm5
      lea         rdi, [rdi + rdx]
      pmaddubsw   xmm2, xmm6

      lea         rsi, [rsi + rax]
      dec         rcx                             ; ZF consumed by jnz below; the SSE
                                                  ; ops in between leave EFLAGS alone
      paddsw      xmm0, xmm1
      paddsw      xmm2, xmm7                      ; + rounding
      paddsw      xmm0, xmm2

      psraw       xmm0, VP8_FILTER_SHIFT
      packuswb    xmm0, xmm0                      ; saturate to unsigned bytes

      movq        MMWORD Ptr [rdi], xmm0
      jnz         .filter_block1d8_h6_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path. Entered with rax = selected k0_k5 entry, rdi = output_ptr and
  ; xmm7 = rd. The label is function-local, matching the sibling routines.
  .vp8_filter_block1d8_h4_ssse3:
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      ; No k0_k5 register is needed here, so the shuffle masks live in xmm3/xmm4.
      movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
      movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

      sub         rdi, rdx                        ; pre-decrement, see 6-tap path

  .filter_block1d8_h4_rowloop_ssse3:
      movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
      movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
      punpcklbw   xmm0, xmm1                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

      movdqa      xmm2, xmm0
      pshufb      xmm0, xmm3
      pshufb      xmm2, xmm4

      pmaddubsw   xmm0, xmm5                      ; k2_k4
      lea         rdi, [rdi + rdx]
      pmaddubsw   xmm2, xmm6                      ; k1_k3

      lea         rsi, [rsi + rax]
      dec         rcx                             ; ZF consumed by jnz below

      paddsw      xmm0, xmm7                      ; + rounding
      paddsw      xmm0, xmm2
      psraw       xmm0, VP8_FILTER_SHIFT
      packuswb    xmm0, xmm0

      movq        MMWORD Ptr [rdi], xmm0
      jnz         .filter_block1d8_h4_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_filter_block1d16_h6_ssse3
  ;(
  ;    unsigned char  *src_ptr,
  ;    unsigned int    src_pixels_per_line,
  ;    unsigned char  *output_ptr,
  ;    unsigned int    output_pitch,
  ;    unsigned int    output_height,
  ;    unsigned int    vp8_filter_index
  ;)
  ;
  ; Horizontal 6-tap filter producing 16 output pixels per row, one row per
  ; loop iteration. Unlike the 8- and 4-wide variants there is no 4-tap
  ; shortcut: all three tap pairs are always applied.
  ;
  ; Register roles inside the row loop:
  ;   rsi = current source row          rax = src_pixels_per_line
  ;   rdi = current output row          rdx = output_pitch
  ;   rcx = rows remaining (must be >= 1 on entry; do-while loop)
  ;   xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3 tap pairs
  ;   xmm7 is scratch here, so the rounding constant rd is added from memory.
  ; Each row reads source bytes [rsi - 2, rsi + 19) and stores 16 bytes with
  ; movdqa, so every output row address must be 16-byte aligned.
  global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
  sym(vp8_filter_block1d16_h6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      shl         rdx, 4                          ; 16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      mov         rdi, arg(2)                     ;output_ptr
      mov         rsi, arg(0)                     ;src_ptr

      movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

  .filter_block1d16_h6_rowloop_ssse3:
      ; ---- left 8 output pixels ----
      movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
      movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
      punpcklbw   xmm0, xmm3                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

      movdqa      xmm1, xmm0
      pmaddubsw   xmm0, xmm4                      ; k0_k5

      movdqa      xmm2, xmm1
      pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
      pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

      ; The loads for the right half (same pattern, 8 bytes further along)
      ; are interleaved with the left-half arithmetic.
      movq        xmm3, MMWORD PTR [rsi + 6]
      pmaddubsw   xmm1, xmm5                      ; k2_k4
      movq        xmm7, MMWORD PTR [rsi + 11]
      pmaddubsw   xmm2, xmm6                      ; k1_k3

      punpcklbw   xmm3, xmm7

      paddsw      xmm0, xmm1
      movdqa      xmm1, xmm3
      pmaddubsw   xmm3, xmm4                      ; right half, k0_k5

      paddsw      xmm0, xmm2
      movdqa      xmm2, xmm1
      paddsw      xmm0, [GLOBAL(rd)]              ; + rounding

      pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
      pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

      psraw       xmm0, VP8_FILTER_SHIFT
      pmaddubsw   xmm1, xmm5                      ; right half, k2_k4
      pmaddubsw   xmm2, xmm6                      ; right half, k1_k3

      packuswb    xmm0, xmm0                      ; left 8 results in the low qword

      lea         rsi, [rsi + rax]

      ; ---- finish right 8 output pixels ----
      paddsw      xmm3, xmm1
      paddsw      xmm3, xmm2
      paddsw      xmm3, [GLOBAL(rd)]              ; + rounding
      psraw       xmm3, VP8_FILTER_SHIFT
      packuswb    xmm3, xmm3

      punpcklqdq  xmm0, xmm3                      ; left | right -> 16 bytes

      movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store

      lea         rdi, [rdi + rdx]
      dec         rcx
      jnz         .filter_block1d16_h6_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_filter_block1d4_h6_ssse3
  ;(
  ;    unsigned char  *src_ptr,
  ;    unsigned int    src_pixels_per_line,
  ;    unsigned char  *output_ptr,
  ;    unsigned int    output_pitch,
  ;    unsigned int    output_height,
  ;    unsigned int    vp8_filter_index
  ;)
  ;
  ; Horizontal 6-tap filter producing 4 output pixels per row, one row per
  ; loop iteration. When the first dword of the selected k0_k5 entry is zero
  ; the 4-tap path is taken, which skips the k0_k5 multiply.
  ;
  ; Register roles inside both row loops:
  ;   rsi = current source row          rax = src_pixels_per_line
  ;   rdi = current output row          rdx = output_pitch
  ;   rcx = rows remaining (must be >= 1 on entry; do-while loop)
  ;   xmm7 = rounding constant (rd)
  ; Each row is fetched with one unaligned 16-byte load from [rsi - 2], so
  ; 16 source bytes starting at rsi - 2 must be readable; 4 bytes are stored.
  global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
  sym(vp8_filter_block1d4_h6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ; esi = 0 for the zero-tap test below
      shl         rdx, 4                          ; 16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx
      movdqa      xmm7, [GLOBAL(rd)]

      cmp         esi, DWORD PTR [rax]            ; leading k0_k5 bytes all zero?
      je          .vp8_filter_block1d4_h4_ssse3   ; yes: 4-tap path

      movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      mov         rdi, arg(2)                     ;output_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

  ;xmm3 free
  .filter_block1d4_h6_rowloop_ssse3:
      movdqu      xmm0, XMMWORD PTR [rsi - 2]

      movdqa      xmm1, xmm0
      pshufb      xmm0, [GLOBAL(shuf1b)]          ; byte pairs for the k0_k5 multiply

      movdqa      xmm2, xmm1
      pshufb      xmm1, [GLOBAL(shuf2b)]          ; byte pairs for the k2_k4 multiply
      pmaddubsw   xmm0, xmm4
      pshufb      xmm2, [GLOBAL(shuf3b)]          ; byte pairs for the k1_k3 multiply
      pmaddubsw   xmm1, xmm5

      pmaddubsw   xmm2, xmm6

      lea         rsi, [rsi + rax]

      ; (The original cleared xmm1 here; the value was never read again before
      ; being overwritten at the top of the loop, so the pxor was dropped.)
      paddsw      xmm0, xmm1
      paddsw      xmm0, xmm7                      ; + rounding
      paddsw      xmm0, xmm2

      psraw       xmm0, VP8_FILTER_SHIFT
      packuswb    xmm0, xmm0                      ; saturate to unsigned bytes

      movd        DWORD PTR [rdi], xmm0           ; 4 output pixels

      add         rdi, rdx
      dec         rcx
      jnz         .filter_block1d4_h6_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path. Entered with rax = selected k0_k5 entry and xmm7 = rd.
  .vp8_filter_block1d4_h4_ssse3:
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      ; Shuffle masks are kept in registers for the whole loop.
      movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
      movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

      mov         rsi, arg(0)                     ;src_ptr
      mov         rdi, arg(2)                     ;output_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

  .filter_block1d4_h4_rowloop_ssse3:
      movdqu      xmm1, XMMWORD PTR [rsi - 2]

      movdqa      xmm2, xmm1
      pshufb      xmm1, xmm0                      ;;[GLOBAL(shuf2b)]
      pshufb      xmm2, xmm3                      ;;[GLOBAL(shuf3b)]
      pmaddubsw   xmm1, xmm5                      ; k2_k4

      pmaddubsw   xmm2, xmm6                      ; k1_k3

      lea         rsi, [rsi + rax]

      paddsw      xmm1, xmm7                      ; + rounding
      paddsw      xmm1, xmm2
      psraw       xmm1, VP8_FILTER_SHIFT
      packuswb    xmm1, xmm1

      movd        DWORD PTR [rdi], xmm1           ; 4 output pixels

      add         rdi, rdx
      dec         rcx
      jnz         .filter_block1d4_h4_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_filter_block1d16_v6_ssse3
  ;(
  ;    unsigned char *src_ptr,
  ;    unsigned int   src_pitch,
  ;    unsigned char *output_ptr,
  ;    unsigned int   out_pitch,
  ;    unsigned int   output_height,
  ;    unsigned int   vp8_filter_index
  ;)
  ;
  ; Vertical 6-tap filter over a 16-pixel-wide column, one output row per
  ; loop iteration. Six consecutive source rows A..F, starting at src_ptr,
  ; feed each output row; they are paired as (A,F), (B,D), (C,E) and
  ; multiplied by the k0_k5, k1_k3 and k2_k4 tap pairs respectively.
  ; When the first dword of the selected k0_k5 entry is zero, a 4-tap path
  ; that only uses rows B..E is taken.
  ;
  ; Register roles inside the loops:
  ;   rsi = row A                       rdx = src_pitch
  ;   rax = rsi + src_pitch (row B); used with *2 / *4 scaling for rows D / F
  ;   rdi = current output row          rcx = rows remaining (>= 1 on entry)
  ;   r8  = out_pitch (64-bit builds only; 32-bit builds re-read arg(3))
  ; The 6-tap path stores each row as two 8-byte halves; the 4-tap path
  ; stores 16 bytes with movdqa and therefore needs 16-byte aligned output rows.
  global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
  sym(vp8_filter_block1d16_v6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ; esi = 0 for the zero-tap test below
      shl         rdx, 4                          ; 16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      cmp         esi, DWORD PTR [rax]            ; leading k0_k5 bytes all zero?
      je          .four_tap

      ; ---------------------------- 6-tap path ----------------------------
      movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr

  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ;out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;output_height
      lea         rax, [rsi + rdx]                ; rax = second source row

  .six_tap_row:
      ; ---- left 8 columns ----
      movq        xmm1, MMWORD PTR [rsi]                  ;A
      movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

      punpcklbw   xmm2, xmm4                              ;B D
      punpcklbw   xmm3, xmm0                              ;C E

      movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

      pmaddubsw   xmm3, xmm6                              ; (C,E) * k2_k4
      punpcklbw   xmm1, xmm0                              ;A F
      pmaddubsw   xmm2, xmm7                              ; (B,D) * k1_k3
      pmaddubsw   xmm1, xmm5                              ; (A,F) * k0_k5

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm1
      paddsw      xmm2, [GLOBAL(rd)]                      ; + rounding
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi], xmm2                  ;store the results

      ; ---- right 8 columns ----
      movq        xmm1, MMWORD PTR [rsi + 8]              ;A
      movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

      punpcklbw   xmm2, xmm4                              ;B D
      punpcklbw   xmm3, xmm0                              ;C E

      movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
      pmaddubsw   xmm3, xmm6
      punpcklbw   xmm1, xmm0                              ;A F
      pmaddubsw   xmm2, xmm7
      pmaddubsw   xmm1, xmm5

      add         rsi, rdx                                ; slide the row window
      add         rax, rdx                                ; down by one source row

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm1
      paddsw      xmm2, [GLOBAL(rd)]                      ; + rounding
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi+8], xmm2

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)                   ;out_pitch
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .six_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

      ; ---------------------------- 4-tap path ----------------------------
      ; Entered with rax = selected k0_k5 entry. Rows A and F are not read.
  .four_tap:
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr

  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ;out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;output_height
      lea         rax, [rsi + rdx]                ; rax = second source row

  .four_tap_row:
      ; ---- left 8 columns ----
      movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

      punpcklbw   xmm2, xmm4                              ;B D
      punpcklbw   xmm3, xmm0                              ;C E

      pmaddubsw   xmm3, xmm6                              ; (C,E) * k2_k4
      pmaddubsw   xmm2, xmm7                              ; (B,D) * k1_k3

      ; Right-half loads are issued before the left half is finished.
      movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
      movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

      paddsw      xmm2, [GLOBAL(rd)]                      ; + rounding
      paddsw      xmm2, xmm3
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2                              ; left 8 results

      ; ---- right 8 columns ----
      punpcklbw   xmm5, xmm4                              ;B D
      punpcklbw   xmm1, xmm0                              ;C E

      pmaddubsw   xmm1, xmm6
      pmaddubsw   xmm5, xmm7

      movdqa      xmm4, [GLOBAL(rd)]

      add         rsi, rdx                                ; slide the row window
      add         rax, rdx                                ; down by one source row

      paddsw      xmm5, xmm1
      paddsw      xmm5, xmm4                              ; + rounding
      psraw       xmm5, VP8_FILTER_SHIFT
      packuswb    xmm5, xmm5

      punpcklqdq  xmm2, xmm5                              ; left | right -> 16 bytes

      movdqa      XMMWORD PTR [rdi], xmm2                 ; aligned 16-byte store

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)                   ;out_pitch
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .four_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_filter_block1d8_v6_ssse3
  ;(
  ;    unsigned char *src_ptr,
  ;    unsigned int   src_pitch,
  ;    unsigned char *output_ptr,
  ;    unsigned int   out_pitch,
  ;    unsigned int   output_height,
  ;    unsigned int   vp8_filter_index
  ;)
  ;
  ; Vertical 6-tap filter over an 8-pixel-wide column, one output row per
  ; loop iteration. Six consecutive source rows A..F, starting at src_ptr,
  ; feed each output row; they are paired as (A,F), (B,D), (C,E) and
  ; multiplied by the k0_k5, k1_k3 and k2_k4 tap pairs respectively.
  ; When the first dword of the selected k0_k5 entry is zero, a 4-tap path
  ; that only uses rows B..E is taken.
  ;
  ; Register roles inside the loops:
  ;   rsi = row A                       rdx = src_pitch
  ;   rax = rsi + src_pitch (row B); used with *2 / *4 scaling for rows D / F
  ;   rdi = current output row          rcx = rows remaining (>= 1 on entry)
  ;   r8  = out_pitch (64-bit builds only; 32-bit builds re-read arg(3))
  global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
  sym(vp8_filter_block1d8_v6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ; esi = 0 for the zero-tap test below
      shl         rdx, 4                          ; 16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      ; Arguments shared by both paths are fetched before branching.
      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr
  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ; out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;[output_height]

      cmp         esi, DWORD PTR [rax]            ; leading k0_k5 bytes all zero?
      je          .four_tap

      ; ---------------------------- 6-tap path ----------------------------
      movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ; rax = second source row

  .six_tap_row:
      movq        xmm1, MMWORD PTR [rsi]                  ;A
      movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

      punpcklbw   xmm2, xmm4                              ;B D
      punpcklbw   xmm3, xmm0                              ;C E

      movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
      movdqa      xmm4, [GLOBAL(rd)]                      ; rounding constant

      pmaddubsw   xmm3, xmm6                              ; (C,E) * k2_k4
      punpcklbw   xmm1, xmm0                              ;A F
      pmaddubsw   xmm2, xmm7                              ; (B,D) * k1_k3
      pmaddubsw   xmm1, xmm5                              ; (A,F) * k0_k5

      add         rsi, rdx                                ; slide the row window
      add         rax, rdx                                ; down by one source row

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm1
      paddsw      xmm2, xmm4                              ; + rounding
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2                              ; saturate to unsigned bytes

      movq        MMWORD PTR [rdi], xmm2                  ; 8 output pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)                   ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .six_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

      ; ---------------------------- 4-tap path ----------------------------
      ; Entered with rax = selected k0_k5 entry and rdx, rdi, rcx (and r8 on
      ; 64-bit builds) already loaded. Rows A and F are not read.
  .four_tap:
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
      movdqa      xmm5, [GLOBAL(rd)]              ; xmm5 is free here: holds rounding

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ; rax = second source row

  .four_tap_row:
      movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

      punpcklbw   xmm2, xmm4                              ;B D
      punpcklbw   xmm3, xmm0                              ;C E

      pmaddubsw   xmm3, xmm6                              ; (C,E) * k2_k4
      pmaddubsw   xmm2, xmm7                              ; (B,D) * k1_k3

      add         rsi, rdx                                ; slide the row window
      add         rax, rdx                                ; down by one source row

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm5                              ; + rounding
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi], xmm2                  ; 8 output pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)                   ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .four_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_filter_block1d4_v6_ssse3
  ;(
  ;    unsigned char *src_ptr,
  ;    unsigned int   src_pitch,
  ;    unsigned char *output_ptr,
  ;    unsigned int   out_pitch,
  ;    unsigned int   output_height,
  ;    unsigned int   vp8_filter_index
  ;)
  ;
  ; Vertical 6-tap filter over a 4-pixel-wide column, one output row per
  ; loop iteration. This variant works in the 64-bit MMX registers
  ; (mm0-mm7), so no XMM save/restore is done and only the low 8 bytes of
  ; each tap-table entry are loaded.
  ; Six consecutive source rows A..F, starting at src_ptr, feed each output
  ; row; they are paired as (A,F), (B,D), (C,E) and multiplied by the k0_k5,
  ; k1_k3 and k2_k4 tap pairs respectively. When the first dword of the
  ; selected k0_k5 entry is zero, a 4-tap path using only rows B..E is taken.
  ;
  ; NOTE(review): no emms is executed before returning; presumably the
  ; caller clears the MMX state before using x87 code - confirm.
  ;
  ; Register roles inside the loops:
  ;   rsi = row A                       rdx = src_pitch
  ;   rax = rsi + src_pitch (row B); used with *2 / *4 scaling for rows D / F
  ;   rdi = current output row          rcx = rows remaining (>= 1 on entry)
  ;   r8  = out_pitch (64-bit builds only; 32-bit builds re-read arg(3))
  global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
  sym(vp8_filter_block1d4_v6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ; esi = 0 for the zero-tap test below
      shl         rdx, 4                          ; 16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      ; Arguments shared by both paths are fetched before branching.
      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr
  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ; out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;[output_height]

      cmp         esi, DWORD PTR [rax]            ; leading k0_k5 bytes all zero?
      je          .four_tap

      ; ---------------------------- 6-tap path ----------------------------
      movq        mm5, MMWORD PTR [rax]           ;k0_k5
      movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
      movq        mm7, MMWORD PTR [rax+128]       ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ; rax = second source row

  .six_tap_row:
      movd        mm1, DWORD PTR [rsi]                    ;A
      movd        mm2, DWORD PTR [rsi + rdx]              ;B
      movd        mm3, DWORD PTR [rsi + rdx * 2]          ;C
      movd        mm4, DWORD PTR [rax + rdx * 2]          ;D
      movd        mm0, DWORD PTR [rsi + rdx * 4]          ;E

      punpcklbw   mm2, mm4                                ;B D
      punpcklbw   mm3, mm0                                ;C E

      movd        mm0, DWORD PTR [rax + rdx * 4]          ;F
      movq        mm4, [GLOBAL(rd)]                       ; rounding constant

      pmaddubsw   mm3, mm6                                ; (C,E) * k2_k4
      punpcklbw   mm1, mm0                                ;A F
      pmaddubsw   mm2, mm7                                ; (B,D) * k1_k3
      pmaddubsw   mm1, mm5                                ; (A,F) * k0_k5

      add         rsi, rdx                                ; slide the row window
      add         rax, rdx                                ; down by one source row

      paddsw      mm2, mm3
      paddsw      mm2, mm1
      paddsw      mm2, mm4                                ; + rounding
      psraw       mm2, VP8_FILTER_SHIFT
      packuswb    mm2, mm2                                ; saturate to unsigned bytes

      movd        DWORD PTR [rdi], mm2                    ; 4 output pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)                   ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .six_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret

      ; ---------------------------- 4-tap path ----------------------------
      ; Entered with rax = selected k0_k5 entry and rdx, rdi, rcx (and r8 on
      ; 64-bit builds) already loaded. Rows A and F are not read.
  .four_tap:
      movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
      movq        mm7, MMWORD PTR [rax+128]       ;k1_k3
      movq        mm5, MMWORD PTR [GLOBAL(rd)]    ; mm5 is free here: holds rounding

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ; rax = second source row

  .four_tap_row:
      movd        mm2, DWORD PTR [rsi + rdx]              ;B
      movd        mm3, DWORD PTR [rsi + rdx * 2]          ;C
      movd        mm4, DWORD PTR [rax + rdx * 2]          ;D
      movd        mm0, DWORD PTR [rsi + rdx * 4]          ;E

      punpcklbw   mm2, mm4                                ;B D
      punpcklbw   mm3, mm0                                ;C E

      pmaddubsw   mm3, mm6                                ; (C,E) * k2_k4
      pmaddubsw   mm2, mm7                                ; (B,D) * k1_k3

      add         rsi, rdx                                ; slide the row window
      add         rax, rdx                                ; down by one source row

      paddsw      mm2, mm3
      paddsw      mm2, mm5                                ; + rounding
      psraw       mm2, VP8_FILTER_SHIFT
      packuswb    mm2, mm2

      movd        DWORD PTR [rdi], mm2                    ; 4 output pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)                   ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .four_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_bilinear_predict16x16_ssse3
  ;(
  ;    unsigned char  *src_ptr,
  ;    int   src_pixels_per_line,
  ;    int   xoffset,
  ;    int   yoffset,
  ;    unsigned char *dst_ptr,
  ;    int dst_pitch
  ;)
  ;
  ; 16x16 bilinear prediction. xoffset and yoffset each select a 16-byte
  ; entry (offset << 4) of vp8_bilinear_filters_ssse3. Four paths exist:
  ;   xoffset != 0, yoffset != 0 : horizontal pass then vertical pass
  ;   xoffset != 0, yoffset == 0 : horizontal pass only (.b16x16_fp_only)
  ;   xoffset == 0, yoffset != 0 : vertical pass only   (.b16x16_sp_only)
  ;   xoffset == 0, yoffset == 0 : plain 16x16 copy      (.b16x16_copy)
  ;
  ; The copy path exists because pmaddubsw treats its source operand (the
  ; filter) as SIGNED bytes. Running the vertical-only path with the filter
  ; entry for offset 0 would therefore go wrong if that entry holds the
  ; weight 128 (read as -128): the negative products saturate to 0 in
  ; packuswb. (The table itself is defined outside this routine.)
  ;
  ; Every path stores rows with movdqa, so each destination row must be
  ; 16-byte aligned. All loops end when rdi reaches dst_ptr + 16 * dst_pitch.
  ; The horizontal passes read 17 bytes per source row; the two-pass and
  ; vertical-only paths read 17 source rows.
  global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
  sym(vp8_bilinear_predict16x16_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
      movsxd      rax, dword ptr arg(2)       ; xoffset

      test        rax, rax                    ; skip first_pass filter if xoffset=0
      je          .b16x16_sp_only

      shl         rax, 4
      lea         rax, [rax + rcx]            ; HFilter

      mov         rdi, arg(4)                 ; dst_ptr
      mov         rsi, arg(0)                 ; src_ptr
      movsxd      rdx, dword ptr arg(5)       ; dst_pitch

      movdqa      xmm1, [rax]                 ; xmm1 = HFilter

      movsxd      rax, dword ptr arg(3)       ; yoffset

      test        rax, rax                    ; skip second_pass filter if yoffset=0
      je          .b16x16_fp_only

      shl         rax, 4
      lea         rax, [rax + rcx]            ; VFilter

      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end)
      movsxd      rdx, dword ptr arg(1)       ; src_pixels_per_line

      movdqa      xmm2, [rax]                 ; xmm2 = VFilter

  %if ABI_IS_32BIT=0
      movsxd      r8, dword ptr arg(5)        ; dst_pitch
  %endif
      ; Horizontal pass over row 0; its packed result is carried in xmm7.
      movq        xmm3, [rsi]                 ; 00 01 02 03 04 05 06 07
      movq        xmm5, [rsi+1]               ; 01 02 03 04 05 06 07 08

      punpcklbw   xmm3, xmm5                  ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
      movq        xmm4, [rsi+8]               ; 08 09 10 11 12 13 14 15

      movq        xmm5, [rsi+9]               ; 09 10 11 12 13 14 15 16

      lea         rsi, [rsi + rdx]            ; next line

      pmaddubsw   xmm3, xmm1                  ; 00 02 04 06 08 10 12 14

      punpcklbw   xmm4, xmm5                  ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
      pmaddubsw   xmm4, xmm1                  ; 01 03 05 07 09 11 13 15

      paddw       xmm3, [GLOBAL(rd)]          ; xmm3 += round value
      psraw       xmm3, VP8_FILTER_SHIFT      ; xmm3 /= 128

      paddw       xmm4, [GLOBAL(rd)]          ; xmm4 += round value
      psraw       xmm4, VP8_FILTER_SHIFT      ; xmm4 /= 128

      movdqa      xmm7, xmm3
      packuswb    xmm7, xmm4                  ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

  .next_row:
      ; Horizontal pass over the next source row -> xmm6.
      movq        xmm6, [rsi]                 ; 00 01 02 03 04 05 06 07
      movq        xmm5, [rsi+1]               ; 01 02 03 04 05 06 07 08

      punpcklbw   xmm6, xmm5
      movq        xmm4, [rsi+8]               ; 08 09 10 11 12 13 14 15

      movq        xmm5, [rsi+9]               ; 09 10 11 12 13 14 15 16
      lea         rsi, [rsi + rdx]            ; next line

      pmaddubsw   xmm6, xmm1

      punpcklbw   xmm4, xmm5
      pmaddubsw   xmm4, xmm1

      paddw       xmm6, [GLOBAL(rd)]          ; xmm6 += round value
      psraw       xmm6, VP8_FILTER_SHIFT      ; xmm6 /= 128

      paddw       xmm4, [GLOBAL(rd)]          ; xmm4 += round value
      psraw       xmm4, VP8_FILTER_SHIFT      ; xmm4 /= 128

      packuswb    xmm6, xmm4

      ; Vertical pass between the previous (xmm7) and current (xmm6) rows.
      movdqa      xmm5, xmm7

      punpcklbw   xmm5, xmm6
      pmaddubsw   xmm5, xmm2

      punpckhbw   xmm7, xmm6
      pmaddubsw   xmm7, xmm2

      paddw       xmm5, [GLOBAL(rd)]          ; xmm5 += round value
      psraw       xmm5, VP8_FILTER_SHIFT      ; xmm5 /= 128

      paddw       xmm7, [GLOBAL(rd)]          ; xmm7 += round value
      psraw       xmm7, VP8_FILTER_SHIFT      ; xmm7 /= 128

      packuswb    xmm5, xmm7
      movdqa      xmm7, xmm6                  ; current row becomes the previous row

      movdqa      [rdi], xmm5                 ; store the results in the destination
  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(5)       ; dst_pitch
  %else
      add         rdi, r8
  %endif

      cmp         rdi, rcx
      jne         .next_row

      jmp         .done

  .b16x16_sp_only:
      movsxd      rax, dword ptr arg(3)       ; yoffset
      test        rax, rax                    ; both offsets zero?
      je          .b16x16_copy                ; yes: nothing to filter, just copy

      shl         rax, 4
      lea         rax, [rax + rcx]            ; VFilter

      mov         rdi, arg(4)                 ; dst_ptr
      mov         rsi, arg(0)                 ; src_ptr
      movsxd      rdx, dword ptr arg(5)       ; dst_pitch

      movdqa      xmm1, [rax]                 ; VFilter

      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end)
      movsxd      rax, dword ptr arg(1)       ; src_pixels_per_line

      ; get the first horizontal line done
      movq        xmm4, [rsi]                 ; load row 0
      movq        xmm2, [rsi + 8]             ; load row 0

      lea         rsi, [rsi + rax]            ; next line

  .next_row_sp:
      ; Two output rows per iteration; xmm4/xmm2 carry the last loaded row.
      movq        xmm3, [rsi]                 ; load row + 1
      movq        xmm5, [rsi + 8]             ; load row + 1

      punpcklbw   xmm4, xmm3
      punpcklbw   xmm2, xmm5

      pmaddubsw   xmm4, xmm1
      movq        xmm7, [rsi + rax]           ; load row + 2

      pmaddubsw   xmm2, xmm1
      movq        xmm6, [rsi + rax + 8]       ; load row + 2

      punpcklbw   xmm3, xmm7
      punpcklbw   xmm5, xmm6

      pmaddubsw   xmm3, xmm1
      paddw       xmm4, [GLOBAL(rd)]

      pmaddubsw   xmm5, xmm1
      paddw       xmm2, [GLOBAL(rd)]

      psraw       xmm4, VP8_FILTER_SHIFT
      psraw       xmm2, VP8_FILTER_SHIFT

      packuswb    xmm4, xmm2
      paddw       xmm3, [GLOBAL(rd)]

      movdqa      [rdi], xmm4                 ; store row 0
      paddw       xmm5, [GLOBAL(rd)]

      psraw       xmm3, VP8_FILTER_SHIFT
      psraw       xmm5, VP8_FILTER_SHIFT

      packuswb    xmm3, xmm5
      movdqa      xmm4, xmm7

      movdqa      [rdi + rdx],xmm3            ; store row 1
      lea         rsi, [rsi + 2*rax]

      movdqa      xmm2, xmm6
      lea         rdi, [rdi + 2*rdx]

      cmp         rdi, rcx
      jne         .next_row_sp

      jmp         .done

  .b16x16_copy:
      ; xoffset == 0 && yoffset == 0: the prediction is the source block.
      mov         rdi, arg(4)                 ; dst_ptr
      mov         rsi, arg(0)                 ; src_ptr
      movsxd      rdx, dword ptr arg(5)       ; dst_pitch
      movsxd      rax, dword ptr arg(1)       ; src_pixels_per_line

      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end)

  .next_row_copy:
      movdqu      xmm0, [rsi]                 ; source rows may be unaligned
      lea         rsi, [rsi + rax]            ; next line
      movdqa      [rdi], xmm0                 ; same aligned store as the other paths
      lea         rdi, [rdi + rdx]            ; dst_pitch

      cmp         rdi, rcx
      jne         .next_row_copy

      jmp         .done

  .b16x16_fp_only:
      ; Entered with rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch,
      ; xmm1 = HFilter.
      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end)
      movsxd      rax, dword ptr arg(1)       ; src_pixels_per_line

  .next_row_fp:
      ; Two rows per iteration, with the second row's loads interleaved.
      movq        xmm2, [rsi]                 ; 00 01 02 03 04 05 06 07
      movq        xmm4, [rsi+1]               ; 01 02 03 04 05 06 07 08

      punpcklbw   xmm2, xmm4
      movq        xmm3, [rsi+8]               ; 08 09 10 11 12 13 14 15

      pmaddubsw   xmm2, xmm1
      movq        xmm4, [rsi+9]               ; 09 10 11 12 13 14 15 16

      lea         rsi, [rsi + rax]            ; next line
      punpcklbw   xmm3, xmm4

      pmaddubsw   xmm3, xmm1
      movq        xmm5, [rsi]

      paddw       xmm2, [GLOBAL(rd)]
      movq        xmm7, [rsi+1]

      movq        xmm6, [rsi+8]
      psraw       xmm2, VP8_FILTER_SHIFT

      punpcklbw   xmm5, xmm7
      movq        xmm7, [rsi+9]

      paddw       xmm3, [GLOBAL(rd)]
      pmaddubsw   xmm5, xmm1

      psraw       xmm3, VP8_FILTER_SHIFT
      punpcklbw   xmm6, xmm7

      packuswb    xmm2, xmm3
      pmaddubsw   xmm6, xmm1

      movdqa      [rdi], xmm2                 ; store the results in the destination
      paddw       xmm5, [GLOBAL(rd)]

      lea         rdi, [rdi + rdx]            ; dst_pitch
      psraw       xmm5, VP8_FILTER_SHIFT

      paddw       xmm6, [GLOBAL(rd)]
      psraw       xmm6, VP8_FILTER_SHIFT

      packuswb    xmm5, xmm6
      lea         rsi, [rsi + rax]            ; next line

      movdqa      [rdi], xmm5                 ; store the results in the destination
      lea         rdi, [rdi + rdx]            ; dst_pitch

      cmp         rdi, rcx
      jne         .next_row_fp

  .done:
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  875. ;void vp8_bilinear_predict8x8_ssse3
  876. ;(
  877. ; unsigned char *src_ptr,
  878. ; int src_pixels_per_line,
  879. ; int xoffset,
  880. ; int yoffset,
  881. ; unsigned char *dst_ptr,
  882. ; int dst_pitch
  883. ;)
  884. global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
  885. sym(vp8_bilinear_predict8x8_ssse3):
  886. push rbp
  887. mov rbp, rsp
  888. SHADOW_ARGS_TO_STACK 6
  889. SAVE_XMM 7
  890. GET_GOT rbx
  891. push rsi
  892. push rdi
  893. ; end prolog
  894. ALIGN_STACK 16, rax
  895. sub rsp, 144 ; reserve 144 bytes
  896. lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
  897. mov rsi, arg(0) ;src_ptr
  898. movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
  899. ;Read 9-line unaligned data in and put them on stack. This gives a big
  900. ;performance boost.
  901. movdqu xmm0, [rsi]
  902. lea rax, [rdx + rdx*2]
  903. movdqu xmm1, [rsi+rdx]
  904. movdqu xmm2, [rsi+rdx*2]
  905. add rsi, rax
  906. movdqu xmm3, [rsi]
  907. movdqu xmm4, [rsi+rdx]
  908. movdqu xmm5, [rsi+rdx*2]
  909. add rsi, rax
  910. movdqu xmm6, [rsi]
  911. movdqu xmm7, [rsi+rdx]
  912. movdqa XMMWORD PTR [rsp], xmm0
  913. movdqu xmm0, [rsi+rdx*2]
  914. movdqa XMMWORD PTR [rsp+16], xmm1
  915. movdqa XMMWORD PTR [rsp+32], xmm2
  916. movdqa XMMWORD PTR [rsp+48], xmm3
  917. movdqa XMMWORD PTR [rsp+64], xmm4
  918. movdqa XMMWORD PTR [rsp+80], xmm5
  919. movdqa XMMWORD PTR [rsp+96], xmm6
  920. movdqa XMMWORD PTR [rsp+112], xmm7
  921. movdqa XMMWORD PTR [rsp+128], xmm0
  922. movsxd rax, dword ptr arg(2) ; xoffset
  923. cmp rax, 0 ; skip first_pass filter if xoffset=0
  924. je .b8x8_sp_only
  925. shl rax, 4
  926. add rax, rcx ; HFilter
  927. mov rdi, arg(4) ; dst_ptr
  928. movsxd rdx, dword ptr arg(5) ; dst_pitch
  929. movdqa xmm0, [rax]
  930. movsxd rax, dword ptr arg(3) ; yoffset
  931. cmp rax, 0 ; skip second_pass filter if yoffset=0
  932. je .b8x8_fp_only
  933. shl rax, 4
  934. lea rax, [rax + rcx] ; VFilter
  935. lea rcx, [rdi+rdx*8]
  936. movdqa xmm1, [rax]
  937. ; get the first horizontal line done
  938. movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  939. movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
  940. psrldq xmm5, 1
  941. lea rsp, [rsp + 16] ; next line
  942. punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
  943. pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
  944. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  945. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  946. movdqa xmm7, xmm3
  947. packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  948. .next_row:
  949. movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  950. lea rsp, [rsp + 16] ; next line
  951. movdqa xmm5, xmm6
  952. psrldq xmm5, 1
  953. punpcklbw xmm6, xmm5
  954. pmaddubsw xmm6, xmm0
  955. paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
  956. psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
  957. packuswb xmm6, xmm6
  958. punpcklbw xmm7, xmm6
  959. pmaddubsw xmm7, xmm1
  960. paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
  961. psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
  962. packuswb xmm7, xmm7
  963. movq [rdi], xmm7 ; store the results in the destination
  964. lea rdi, [rdi + rdx]
  965. movdqa xmm7, xmm6
  966. cmp rdi, rcx
  967. jne .next_row
  968. jmp .done8x8
  969. .b8x8_sp_only:
  970. movsxd rax, dword ptr arg(3) ; yoffset
  971. shl rax, 4
  972. lea rax, [rax + rcx] ; VFilter
  973. mov rdi, arg(4) ;dst_ptr
  974. movsxd rdx, dword ptr arg(5) ; dst_pitch
  975. movdqa xmm0, [rax] ; VFilter
  976. movq xmm1, XMMWORD PTR [rsp]
  977. movq xmm2, XMMWORD PTR [rsp+16]
  978. movq xmm3, XMMWORD PTR [rsp+32]
  979. punpcklbw xmm1, xmm2
  980. movq xmm4, XMMWORD PTR [rsp+48]
  981. punpcklbw xmm2, xmm3
  982. movq xmm5, XMMWORD PTR [rsp+64]
  983. punpcklbw xmm3, xmm4
  984. movq xmm6, XMMWORD PTR [rsp+80]
  985. punpcklbw xmm4, xmm5
  986. movq xmm7, XMMWORD PTR [rsp+96]
  987. punpcklbw xmm5, xmm6
  988. ; Because the source register (xmm0) is always treated as signed by
  989. ; pmaddubsw, the constant '128' is treated as '-128'.
  990. pmaddubsw xmm1, xmm0
  991. pmaddubsw xmm2, xmm0
  992. pmaddubsw xmm3, xmm0
  993. pmaddubsw xmm4, xmm0
  994. pmaddubsw xmm5, xmm0
  995. punpcklbw xmm6, xmm7
  996. pmaddubsw xmm6, xmm0
  997. paddw xmm1, [GLOBAL(rd)]
  998. paddw xmm2, [GLOBAL(rd)]
  999. psraw xmm1, VP8_FILTER_SHIFT
  1000. paddw xmm3, [GLOBAL(rd)]
  1001. psraw xmm2, VP8_FILTER_SHIFT
  1002. paddw xmm4, [GLOBAL(rd)]
  1003. psraw xmm3, VP8_FILTER_SHIFT
  1004. paddw xmm5, [GLOBAL(rd)]
  1005. psraw xmm4, VP8_FILTER_SHIFT
  1006. paddw xmm6, [GLOBAL(rd)]
  1007. psraw xmm5, VP8_FILTER_SHIFT
  1008. psraw xmm6, VP8_FILTER_SHIFT
  1009. ; Having multiplied everything by '-128' and obtained negative
  1010. ; numbers, the unsigned saturation truncates those values to 0,
  1011. ; resulting in incorrect handling of xoffset == 0 && yoffset == 0
  1012. packuswb xmm1, xmm1
  1013. packuswb xmm2, xmm2
  1014. movq [rdi], xmm1
  1015. packuswb xmm3, xmm3
  1016. movq [rdi+rdx], xmm2
  1017. packuswb xmm4, xmm4
  1018. movq xmm1, XMMWORD PTR [rsp+112]
  1019. lea rdi, [rdi + 2*rdx]
  1020. movq xmm2, XMMWORD PTR [rsp+128]
  1021. packuswb xmm5, xmm5
  1022. movq [rdi], xmm3
  1023. packuswb xmm6, xmm6
  1024. movq [rdi+rdx], xmm4
  1025. lea rdi, [rdi + 2*rdx]
  1026. punpcklbw xmm7, xmm1
  1027. movq [rdi], xmm5
  1028. pmaddubsw xmm7, xmm0
  1029. movq [rdi+rdx], xmm6
  1030. punpcklbw xmm1, xmm2
  1031. pmaddubsw xmm1, xmm0
  1032. paddw xmm7, [GLOBAL(rd)]
  1033. psraw xmm7, VP8_FILTER_SHIFT
  1034. paddw xmm1, [GLOBAL(rd)]
  1035. psraw xmm1, VP8_FILTER_SHIFT
  1036. packuswb xmm7, xmm7
  1037. packuswb xmm1, xmm1
  1038. lea rdi, [rdi + 2*rdx]
  1039. movq [rdi], xmm7
  1040. movq [rdi+rdx], xmm1
  1041. lea rsp, [rsp + 144]
  1042. jmp .done8x8
  1043. .b8x8_fp_only:
  1044. lea rcx, [rdi+rdx*8]
  1045. .next_row_fp:
  1046. movdqa xmm1, XMMWORD PTR [rsp]
  1047. movdqa xmm3, XMMWORD PTR [rsp+16]
  1048. movdqa xmm2, xmm1
  1049. movdqa xmm5, XMMWORD PTR [rsp+32]
  1050. psrldq xmm2, 1
  1051. movdqa xmm7, XMMWORD PTR [rsp+48]
  1052. movdqa xmm4, xmm3
  1053. psrldq xmm4, 1
  1054. movdqa xmm6, xmm5
  1055. psrldq xmm6, 1
  1056. punpcklbw xmm1, xmm2
  1057. pmaddubsw xmm1, xmm0
  1058. punpcklbw xmm3, xmm4
  1059. pmaddubsw xmm3, xmm0
  1060. punpcklbw xmm5, xmm6
  1061. pmaddubsw xmm5, xmm0
  1062. movdqa xmm2, xmm7
  1063. psrldq xmm2, 1
  1064. punpcklbw xmm7, xmm2
  1065. pmaddubsw xmm7, xmm0
  1066. paddw xmm1, [GLOBAL(rd)]
  1067. psraw xmm1, VP8_FILTER_SHIFT
  1068. paddw xmm3, [GLOBAL(rd)]
  1069. psraw xmm3, VP8_FILTER_SHIFT
  1070. paddw xmm5, [GLOBAL(rd)]
  1071. psraw xmm5, VP8_FILTER_SHIFT
  1072. paddw xmm7, [GLOBAL(rd)]
  1073. psraw xmm7, VP8_FILTER_SHIFT
  1074. packuswb xmm1, xmm1
  1075. packuswb xmm3, xmm3
  1076. packuswb xmm5, xmm5
  1077. movq [rdi], xmm1
  1078. packuswb xmm7, xmm7
  1079. movq [rdi+rdx], xmm3
  1080. lea rdi, [rdi + 2*rdx]
  1081. movq [rdi], xmm5
  1082. lea rsp, [rsp + 4*16]
  1083. movq [rdi+rdx], xmm7
  1084. lea rdi, [rdi + 2*rdx]
  1085. cmp rdi, rcx
  1086. jne .next_row_fp
  1087. lea rsp, [rsp + 16]
  1088. .done8x8:
  1089. ;add rsp, 144
  1090. pop rsp
  1091. ; begin epilog
  1092. pop rdi
  1093. pop rsi
  1094. RESTORE_GOT
  1095. RESTORE_XMM
  1096. UNSHADOW_ARGS
  1097. pop rbp
  1098. ret
  1099. SECTION_RODATA
  ; Constant tables follow. SECTION_RODATA is not defined in this file;
  ; presumably it is a macro from the included vpx_ports/x86_abi_support.asm.
  1100. align 16
  ; shuf1b / shuf2b / shuf3b: three 16-byte tables of byte indices. Because
  ; each one is exactly 16 bytes, the single "align 16" above keeps all three
  ; 16-byte aligned.
  ; NOTE(review): the names suggest pshufb control masks; their uses are
  ; outside this part of the file -- confirm against the six-tap routines.
  ;
  ; shuf1b: entry pair i (i = 0..7) is (i, i+5).
  1101. shuf1b:
  1102. db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
  ; shuf2b: entry pair i (i = 0..7) is (i+2, i+4).
  1103. shuf2b:
  1104. db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
  ; shuf3b: entry pair i (i = 0..7) is (i+1, i+3).
  1105. shuf3b:
  1106. db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
  1107. align 16
  ; shuf2bfrom1: byte indices into data that is already in shuf1b order.
  ; Composing the two tables reproduces shuf2b:
  ;   shuf1b[shuf2bfrom1[j]] == shuf2b[j]   for every j in 0..15
  ; e.g. j = 0: shuf1b[4] = 2 = shuf2b[0];  j = 3: shuf1b[1] = 5 = shuf2b[3].
  ; NOTE(review): presumably this lets a routine derive the shuf2b layout
  ; from a register it has already shuffled with shuf1b -- the use is not
  ; visible here, confirm.
  1108. shuf2bfrom1:
  1109. db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
  1110. align 16
  ; shuf3bfrom1: byte indices into data that is already in shuf1b order.
  ; Composing the two tables reproduces shuf3b:
  ;   shuf1b[shuf3bfrom1[j]] == shuf3b[j]   for every j in 0..15
  ; e.g. j = 0: shuf1b[2] = 1 = shuf3b[0];  j = 1: shuf1b[6] = 3 = shuf3b[1].
  ; NOTE(review): the use of this table is outside this part of the file --
  ; confirm it is applied only to shuf1b-ordered data.
  1111. shuf3bfrom1:
  1112. db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
  1113. align 16
  ; rd: rounding term, eight 16-bit words of 0x40 (64), which is half of
  ; VP8_FILTER_WEIGHT (128). The filter code adds it with
  ; "paddw xmmN, [GLOBAL(rd)]" right before "psraw xmmN, VP8_FILTER_SHIFT",
  ; so the arithmetic shift by 7 rounds to nearest instead of truncating.
  ; It is also loaded with movdqa, which needs the 16-byte alignment above.
  1114. rd:
  1115. times 8 dw 0x40
  1116. align 16
  ; k0_k5: eight rows of 16 bytes; each row is one byte pair repeated 8 times.
  ; vp8_filter_block1d8_h6_ssse3 addresses a row as
  ;   k0_k5 + (vp8_filter_index << 4)
  ; and compares the first dword of that row against zero (rows 0, 1, 3, 5
  ; and 7 are all-zero).
  ; NOTE(review): the name suggests each pair is (tap 0, tap 5) of the
  ; six-tap filter, and the zero test presumably selects a cheaper path when
  ; both outer taps are zero -- the branch target is not visible here, confirm.
  ; The table is exactly 8 * 16 = 128 bytes, so the k1_k3 and k2_k4 tables
  ; that follow without their own align directive remain 16-byte aligned.
  1117. k0_k5:
  1118. times 8 db 0, 0 ;placeholder
  1119. times 8 db 0, 0
  1120. times 8 db 2, 1
  1121. times 8 db 0, 0
  1122. times 8 db 3, 3
  1123. times 8 db 0, 0
  1124. times 8 db 1, 2
  1125. times 8 db 0, 0
  1126. k1_k3:
  ; Same layout as k0_k5: eight rows of 16 bytes, one signed byte pair
  ; repeated 8 times per row. There is no align directive here; the label is
  ; 16-byte aligned only because k0_k5 is aligned and exactly 128 bytes long.
  ; For rows 1..7 the pair in this table plus the pairs in the same row of
  ; k0_k5 and k2_k4 sum to 128 (VP8_FILTER_WEIGHT).
  ; NOTE(review): the name suggests each pair is (tap 1, tap 3) of the
  ; six-tap filter; the code that reads this table is outside this part of
  ; the file -- confirm.
  1127. times 8 db 0, 0 ;placeholder
  1128. times 8 db -6, 12
  1129. times 8 db -11, 36
  1130. times 8 db -9, 50
  1131. times 8 db -16, 77
  1132. times 8 db -6, 93
  1133. times 8 db -8, 108
  1134. times 8 db -1, 123
  1135. k2_k4:
  ; Same layout as k0_k5 and k1_k3: eight rows of 16 bytes, one byte pair
  ; repeated 8 times per row. There is no align directive here; the label is
  ; 16-byte aligned only because the two preceding tables are 128 bytes each.
  ; For rows 1..7 the pair in this table plus the pairs in the same row of
  ; k0_k5 and k1_k3 sum to 128 (VP8_FILTER_WEIGHT).
  ; Row 0 stores 128, which does not fit in a signed byte: the assembler
  ; emits 0x80, which reads back as -128 wherever the byte is treated as
  ; signed.
  ; NOTE(review): the name suggests each pair is (tap 2, tap 4) of the
  ; six-tap filter, and row 0 is marked as a placeholder; whether row 0 is
  ; ever read is not visible here -- confirm.
  1136. times 8 db 128, 0 ;placeholder
  1137. times 8 db 123, -1
  1138. times 8 db 108, -8
  1139. times 8 db 93, -6
  1140. times 8 db 77, -16
  1141. times 8 db 50, -9
  1142. times 8 db 36, -11
  1143. times 8 db 12, -6
  1144. align 16
  ; vp8_bilinear_filters_ssse3: eight rows of 16 bytes; each row is one
  ; two-tap byte pair repeated 8 times. Row n holds (128 - 16*n, 16*n), so
  ; every pair sums to 128 (VP8_FILTER_WEIGHT).
  ; The 8x8 bilinear code forms a filter address as base + (offset << 4)
  ; (e.g. "shl rax, 4" / "lea rax, [rax + rcx] ; VFilter") and loads the row
  ; with movdqa, which matches this 16-byte row stride and alignment.
  ; NOTE(review): the instruction that loads the table base into rcx is
  ; outside this part of the file -- confirm it is this table.
  ; Row 0 stores 128 as the byte 0x80. As the existing comments in
  ; .b8x8_sp_only point out, pmaddubsw treats the filter register as signed,
  ; so that 128 acts as -128 and the xoffset == 0 && yoffset == 0 case is
  ; not handled correctly by that path.
  1145. vp8_bilinear_filters_ssse3:
  1146. times 8 db 128, 0
  1147. times 8 db 112, 16
  1148. times 8 db 96, 32
  1149. times 8 db 80, 48
  1150. times 8 db 64, 64
  1151. times 8 db 48, 80
  1152. times 8 db 32, 96
  1153. times 8 db 16, 112