deblock_sse2.asm 12 KB


  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. ;macro in deblock functions
  ;-----------------------------------------------------------------------------
  ; FIRST_2_ROWS
  ;
  ; First half of the 5-tap deblock step, on 16 bytes at a time.
  ;
  ; In:    xmm0 = centre pixels
  ;        xmm1 = first neighbour, xmm3 = second neighbour
  ;        flimit = must be %define'd at the expansion site as a 16-byte
  ;                 aligned memory operand (it is read with movdqa)
  ; Out:   xmm5 = pavgb(neighbour1, neighbour2)
  ;        xmm7 = per-byte mask, 0xFF where |centre - neighbour| >= flimit
  ;               for either neighbour
  ;        xmm0 = unchanged
  ; Clobb: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm6
  ;-----------------------------------------------------------------------------
  %macro FIRST_2_ROWS 0
          ; xmm5 = rounded average of the two neighbours (taken before they
          ; are destroyed by the absolute-difference computation below)
          movdqa      xmm5, xmm1
          pavgb       xmm5, xmm3

          ; xmm4 = |centre - neighbour1|: one of the two saturating
          ; differences is always zero, so their saturating sum is the
          ; absolute difference
          movdqa      xmm4, xmm0
          psubusb     xmm4, xmm1
          psubusb     xmm1, xmm0
          paddusb     xmm4, xmm1

          ; xmm6 = |centre - neighbour2|
          movdqa      xmm6, xmm0
          psubusb     xmm6, xmm3
          psubusb     xmm3, xmm0
          paddusb     xmm6, xmm3

          ; fetch the threshold once and duplicate it
          movdqa      xmm7, flimit
          movdqa      xmm2, xmm7
          pxor        xmm1, xmm1

          ; (flimit - |diff|) saturates to 0 exactly when |diff| >= flimit;
          ; comparing against zero turns that into a 0xFF byte mask
          psubusb     xmm2, xmm4
          pcmpeqb     xmm2, xmm1
          psubusb     xmm7, xmm6
          pcmpeqb     xmm7, xmm1
          por         xmm7, xmm2
  %endmacro
  ;-----------------------------------------------------------------------------
  ; SECOND_2_ROWS
  ;
  ; Second half of the 5-tap deblock step; must follow FIRST_2_ROWS, whose
  ; xmm5 (partial average) and xmm7 (mask) it consumes.
  ;
  ; In:    xmm0 = centre pixels
  ;        xmm1 = third neighbour, xmm3 = fourth neighbour
  ;        xmm5 = pavgb of the first two neighbours
  ;        xmm7 = mask accumulated so far
  ;        flimit = 16-byte aligned memory operand (read with movdqa)
  ; Out:   xmm0 = centre where any |centre - neighbour| >= flimit,
  ;               otherwise pavgb(pavgb(avg12, avg34), centre)
  ; Clobb: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
  ;-----------------------------------------------------------------------------
  %macro SECOND_2_ROWS 0
          ; fold the average of this neighbour pair into the running average
          movdqa      xmm2, xmm1                  ; keep neighbour3 for the diff
          pavgb       xmm1, xmm3
          pavgb       xmm5, xmm1

          ; xmm6 = |centre - neighbour3|
          movdqa      xmm6, xmm0
          psubusb     xmm6, xmm2
          psubusb     xmm2, xmm0
          paddusb     xmm6, xmm2

          ; xmm4 = |centre - neighbour4|
          movdqa      xmm4, xmm0
          psubusb     xmm4, xmm3
          psubusb     xmm3, xmm0
          paddusb     xmm4, xmm3

          ; xmm5 = filtered candidate (centre is still unmodified here)
          pavgb       xmm5, xmm0

          ; threshold both differences and merge into the mask
          movdqa      xmm2, flimit
          movdqa      xmm3, xmm2
          pxor        xmm1, xmm1

          psubusb     xmm2, xmm6
          pcmpeqb     xmm2, xmm1
          por         xmm7, xmm2

          psubusb     xmm3, xmm4
          pcmpeqb     xmm3, xmm1
          por         xmm7, xmm3

          ; select: original pixel where the mask is set, filtered otherwise
          pand        xmm0, xmm7
          pandn       xmm7, xmm5
          paddusb     xmm0, xmm7
  %endmacro
  ;-----------------------------------------------------------------------------
  ; UPDATE_FLIMIT
  ;
  ; Copies the next 16 threshold bytes from the rbx cursor into the stack
  ; slot at [rsp] and advances the cursor by 16.
  ;
  ; In:    rbx   = pointer to the next 16 threshold bytes
  ;        [rsp] = 16 writable bytes
  ; Out:   rbx advanced by 16
  ; Clobb: xmm2, flags
  ;-----------------------------------------------------------------------------
  %macro UPDATE_FLIMIT 0
          add         rbx, 16
          movdqu      xmm2, XMMWORD PTR [rbx - 16]
          movdqu      [rsp], xmm2
  %endmacro
  70. SECTION .text
  ;-----------------------------------------------------------------------------
  ; void vpx_post_proc_down_and_across_mb_row_sse2
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned char *dst_ptr,
  ;     int src_pixels_per_line,
  ;     int dst_pixels_per_line,
  ;     int cols,
  ;     int *flimits,
  ;     int size
  ; )
  ;
  ; For each of `size` rows:
  ;   1. "down" pass: filters src vertically (rows -2, -1, +1, +2 around the
  ;      current row) 16 columns at a time and writes the result to dst.
  ;      Columns are processed in whole groups of 16 until col >= cols.
  ;   2. "across" pass: filters the freshly written dst row horizontally
  ;      (columns -2, -1, +1, +2) in place. Stores are delayed by one
  ;      16-byte group through mm0/mm1 so that neighbours are read before
  ;      they are overwritten.
  ;
  ; The threshold data at `flimits` is consumed as 16 bytes per 16-column
  ; group (see UPDATE_FLIMIT) and compared per byte.
  ;
  ; Memory touched outside [0, cols) of a dst row: the across pass writes
  ; 8 bytes at dst[-8] and 8 bytes at dst[cols], and reads/rewrites
  ; dst[-16 .. -1]; the caller must provide that border.
  ;
  ; Register roles inside the row loop:
  ;   rsi = src cursor         rdi = dst cursor
  ;   rax = src stride (temporarily negated to reach the rows above)
  ;   rbx = flimits cursor     rcx = rows remaining
  ;   rdx = column offset
  ;
  ; NOTE(review): mm0/mm1 are used and no emms is issued here; presumably
  ; the caller clears the MMX state - confirm.
  ;-----------------------------------------------------------------------------
  global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
  sym(vpx_post_proc_down_and_across_mb_row_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 7
          SAVE_XMM 7
          push        rbx
          push        rsi
          push        rdi
          ; end prolog

          ; 16-byte scratch slot for the current thresholds; it is read with
          ; movdqa inside the filter macros, hence the stack alignment
          ALIGN_STACK 16, rax
          sub         rsp, 16

          ; put flimit on stack
          mov         rbx, arg(5)                 ;flimits ptr
          UPDATE_FLIMIT

  %define flimit [rsp]

          mov         rsi, arg(0)                 ;src_ptr
          mov         rdi, arg(1)                 ;dst_ptr

          movsxd      rax, DWORD PTR arg(2)       ;src_pixels_per_line
          movsxd      rcx, DWORD PTR arg(6)       ;rows in a macroblock

  .nextrow:
          xor         rdx, rdx                    ;col

  .nextcol:
          ;load current and next 2 rows
          movdqu      xmm0, XMMWORD PTR [rsi]
          movdqu      xmm1, XMMWORD PTR [rsi + rax]
          movdqu      xmm3, XMMWORD PTR [rsi + 2*rax]

          FIRST_2_ROWS

          ;load above 2 rows
          neg         rax
          movdqu      xmm1, XMMWORD PTR [rsi + 2*rax]
          movdqu      xmm3, XMMWORD PTR [rsi + rax]

          SECOND_2_ROWS

          movdqu      XMMWORD PTR [rdi], xmm0

          neg         rax                         ; positive stride
          add         rsi, 16
          add         rdi, 16

          add         rdx, 16
          cmp         edx, dword arg(4)           ;cols
          jge         .downdone
          UPDATE_FLIMIT
          jmp         .nextcol

  .downdone:
          ; done with all the cols, start the across filtering in place
          sub         rsi, rdx                    ; rewind both cursors to
          sub         rdi, rdx                    ; the start of the row

          mov         rbx, arg(5)                 ; flimits
          UPDATE_FLIMIT

          ; dup the first byte into the left border 8 times
          movq        mm1, [rdi]
          punpcklbw   mm1, mm1
          punpcklwd   mm1, mm1
          punpckldq   mm1, mm1
          mov         rdx, -8
          movq        [rdi+rdx], mm1

          ; dup the last byte into the right border
          movsxd      rdx, dword arg(4)
          movq        mm1, [rdi + rdx + -1]
          punpcklbw   mm1, mm1
          punpcklwd   mm1, mm1
          punpckldq   mm1, mm1
          movq        [rdi+rdx], mm1

          ; prime the delayed-store registers with what is already in
          ; memory, so the first iteration's store is a no-op rewrite
          xor         rdx, rdx
          movq        mm0, QWORD PTR [rdi-16];
          movq        mm1, QWORD PTR [rdi-8];

  .acrossnextcol:
          movdqu      xmm0, XMMWORD PTR [rdi + rdx]
          movdqu      xmm1, XMMWORD PTR [rdi + rdx -2]
          movdqu      xmm3, XMMWORD PTR [rdi + rdx -1]

          FIRST_2_ROWS

          movdqu      xmm1, XMMWORD PTR [rdi + rdx +1]
          movdqu      xmm3, XMMWORD PTR [rdi + rdx +2]

          SECOND_2_ROWS

          movq        QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
          movq        QWORD PTR [rdi+rdx-8], mm1  ; store previous 8 bytes
          movdq2q     mm0, xmm0                   ; hold this group's result
          psrldq      xmm0, 8                     ; until the next iteration
          movdq2q     mm1, xmm0

          add         rdx, 16
          cmp         edx, dword arg(4)           ;cols
          jge         .acrossdone
          UPDATE_FLIMIT
          jmp         .acrossnextcol

  .acrossdone:
          ; last 16 pixels
          movq        QWORD PTR [rdi+rdx-16], mm0

          ; the upper 8 bytes are only stored when the loop ended exactly
          ; on cols; otherwise they lie past the end of the row
          cmp         edx, dword arg(4)
          jne         .throw_last_8
          movq        QWORD PTR [rdi+rdx-8], mm1

  .throw_last_8:
          ; done with this row
          add         rsi, rax                    ;next src line
          ; Reload both strides sign-extended, matching the movsxd used on
          ; entry, so every row uses the same 64-bit stride value.
          movsxd      rax, dword arg(3)           ;dst_pixels_per_line
          add         rdi, rax                    ;next destination
          movsxd      rax, dword arg(2)           ;src_pixels_per_line

          mov         rbx, arg(5)                 ;flimits
          UPDATE_FLIMIT

          dec         rcx                         ;decrement count
          jnz         .nextrow                    ;next row

          add         rsp, 16
          pop         rsp
          ; begin epilog
          pop         rdi
          pop         rsi
          pop         rbx
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  %undef flimit
  ;-----------------------------------------------------------------------------
  ; void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
  ;                                     int pitch, int rows, int cols,
  ;                                     int flimit)
  ;
  ; In-place horizontal post filter. For every row a running sum and sum of
  ; squares over a 15-pixel window (s[c-7] .. s[c+7]) is maintained. A pixel
  ; is replaced by (s[c] + sum + 8) >> 4 when sumsq*15 - sum*sum < flimit,
  ; otherwise it is left unchanged.
  ;
  ; Four columns are processed per iteration. Results are written back two
  ; iterations (8 columns) late through mm0/mm1, so the window always reads
  ; unfiltered pixels.
  ;
  ; Memory touched outside [0, cols) of a row: 8 bytes are written at s[-8]
  ; and at s[cols]; the main loop reads 4-byte groups up to s[cols + 14].
  ; The caller must provide that border.
  ;
  ; The shadowed copies of src (arg 0) and rows (arg 2) are updated in place
  ; as the row loop advances.
  ;
  ; NOTE(review): mm0/mm1 are used and no emms is issued here; presumably
  ; the caller clears the MMX state - confirm.
  ;-----------------------------------------------------------------------------
  global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
  sym(vpx_mbpost_proc_across_ip_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 5
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          ALIGN_STACK 16, rax
          sub         rsp, 16

          ; create flimit4 at [rsp]: flimit replicated into four dwords
          mov         eax, dword ptr arg(4)       ;flimit
          mov         [rsp], eax
          mov         [rsp+4], eax
          mov         [rsp+8], eax
          mov         [rsp+12], eax
  %define flimit4 [rsp]

          ;for(r=0;r<rows;r++)
  .ip_row_loop:
          xor         rdx, rdx                    ;sumsq=0;
          xor         rcx, rcx                    ;sum=0;
          mov         rsi, arg(0)                 ;s

          ; dup the first byte into the left border 8 times
          movq        mm1, [rsi]
          punpcklbw   mm1, mm1
          punpcklwd   mm1, mm1
          punpckldq   mm1, mm1
          mov         rdi, -8
          movq        [rsi+rdi], mm1

          ; dup the last byte into the right border
          ; cols is held in rax (dead here) rather than rdx: rdx is the
          ; sumsq accumulator and must still be zero when .ip_var_loop
          ; starts. Loading cols into rdx made sumsq start at cols.
          movsxd      rax, dword arg(3)
          movq        mm1, [rsi + rax + -1]
          punpcklbw   mm1, mm1
          punpcklwd   mm1, mm1
          punpckldq   mm1, mm1
          movq        [rsi+rax], mm1

  .ip_var_loop:
          ;for(i=-8;i<=6;i++)
          ;{
          ;   sumsq += s[i]*s[i];
          ;   sum   += s[i];
          ;}
          movzx       eax, byte [rsi+rdi]
          add         ecx, eax
          mul         al                          ; ax = al*al, upper eax is 0
          add         edx, eax
          add         rdi, 1
          cmp         rdi, 6
          jle         .ip_var_loop

          movd        xmm7, edx                   ; xmm7 lane0 = sumsq
          movd        xmm6, ecx                   ; xmm6 lane0 = sum

          mov         rsi, arg(0)                 ;s
          xor         rcx, rcx                    ; column offset
          movsxd      rdx, dword arg(3)           ;cols
          add         rdx, 8                      ; run 8 columns past cols to
                                                  ; flush the delayed stores

          pxor        mm0, mm0
          pxor        mm1, mm1
          pxor        xmm0, xmm0                  ; zero, for unpacking

  .nextcol4:
          movd        xmm1, DWORD PTR [rsi+rcx-8] ; leaving:  s[c-8 .. c-5]
          movd        xmm2, DWORD PTR [rsi+rcx+7] ; entering: s[c+7 .. c+10]
          punpcklbw   xmm1, xmm0                  ; expanding
          punpcklbw   xmm2, xmm0                  ; expanding
          punpcklwd   xmm1, xmm0                  ; expanding to dwords
          punpcklwd   xmm2, xmm0                  ; expanding to dwords

          psubd       xmm2, xmm1                  ; d  = in - out
          paddd       xmm1, xmm1                  ; 2*out
          paddd       xmm1, xmm2                  ; in + out
          pmaddwd     xmm1, xmm2                  ; dq = in^2 - out^2

          ; turn the four per-column deltas into prefix sums on top of the
          ; running totals carried in lane 0
          paddd       xmm6, xmm2
          paddd       xmm7, xmm1
          pshufd      xmm6, xmm6, 0               ; broadcast sum   + d0
          pshufd      xmm7, xmm7, 0               ; broadcast sumsq + dq0

          psrldq      xmm1, 4                     ; dq1 dq2 dq3 0
          psrldq      xmm2, 4                     ; d1  d2  d3  0

          pshufd      xmm3, xmm1, 3               ; 0 dq1 dq1 dq1
          pshufd      xmm4, xmm2, 3               ; 0 d1  d1  d1
          paddd       xmm6, xmm4
          paddd       xmm7, xmm3

          pshufd      xmm3, xmm1, 01011111b       ; 0 0 dq2 dq2
          pshufd      xmm4, xmm2, 01011111b       ; 0 0 d2  d2
          paddd       xmm7, xmm3
          paddd       xmm6, xmm4

          pshufd      xmm3, xmm1, 10111111b       ; 0 0 0 dq3
          pshufd      xmm4, xmm2, 10111111b       ; 0 0 0 d3
          paddd       xmm7, xmm3
          paddd       xmm6, xmm4

          ; mask = (sumsq*15 - sum*sum - flimit) < 0
          movdqa      xmm3, xmm6
          pmaddwd     xmm3, xmm3                  ; sum*sum

          movdqa      xmm5, xmm7
          pslld       xmm5, 4
          psubd       xmm5, xmm7                  ; sumsq*15
          psubd       xmm5, xmm3
          psubd       xmm5, flimit4
          psrad       xmm5, 31                    ; sign -> all-ones mask

          packssdw    xmm5, xmm0
          packsswb    xmm5, xmm0                  ; one mask byte per column

          ; filtered = (s[c] + sum + 8) >> 4
          movd        xmm1, DWORD PTR [rsi+rcx]
          movq        xmm2, xmm1                  ; keep the original bytes

          punpcklbw   xmm1, xmm0
          punpcklwd   xmm1, xmm0

          paddd       xmm1, xmm6
          paddd       xmm1, [GLOBAL(four8s)]

          psrad       xmm1, 4
          packssdw    xmm1, xmm0
          packuswb    xmm1, xmm0

          ; select filtered where the mask is set, original elsewhere
          pand        xmm1, xmm5
          pandn       xmm5, xmm2
          por         xmm5, xmm1

          ; two-stage delay line: store the result from two iterations ago
          movd        [rsi+rcx-8], mm0
          movq        mm0, mm1
          movdq2q     mm1, xmm5

          ; carry the last column's totals into lane 0 for the next group
          psrldq      xmm7, 12
          psrldq      xmm6, 12

          add         rcx, 4
          cmp         rcx, rdx
          jl          .nextcol4

          ;s+=pitch;
          movsxd      rax, dword arg(1)
          add         arg(0), rax

          sub         dword arg(2), 1             ;rows-=1
          cmp         dword arg(2), 0
          jg          .ip_row_loop

          add         rsp, 16
          pop         rsp

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  %undef flimit4
  SECTION_RODATA
  align 16
  ; Four dwords of 8: the rounding term added before the ">> 4" in
  ; vpx_mbpost_proc_across_ip_sse2.
  four8s:
          dd 8, 8, 8, 8