subpixel_sse2.asm 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. extern sym(vp8_bilinear_filters_x86_8)
  12. %define BLOCK_HEIGHT_WIDTH 4
  13. %define VP8_FILTER_WEIGHT 128
  14. %define VP8_FILTER_SHIFT 7
  ;/************************************************************************************
  ; void vp8_filter_block1d8_h6_sse2
  ; (
  ;     unsigned char  *src_ptr,
  ;     unsigned short *output_ptr,
  ;     unsigned int    src_pixels_per_line,
  ;     unsigned int    pixel_step,             (never read by this routine)
  ;     unsigned int    output_height,
  ;     unsigned int    output_width,
  ;     short          *vp8_filter
  ; )
  ;
  ; Horizontal 6-tap filter: 8 output pixels per row, one row per loop pass.
  ;
  ; Each pass loads the 16 source bytes src[-2 .. 13], builds six 8-pixel
  ; windows shifted by 0..5 bytes, widens each window to 16-bit lanes and
  ; multiplies window k by the 8 words stored at vp8_filter + 16*k.  The six
  ; products are combined with saturating adds, the constant `rd` (defined
  ; outside this routine) is added, and the sum is shifted right arithmetically
  ; by 7.  packuswb clamps the result to 0..255; it is then widened back to
  ; words and written with one aligned 16-byte store, so every output row
  ; address must be 16-byte aligned.
  ;
  ; src_ptr advances by src_pixels_per_line and output_ptr by output_width per
  ; row; both values are added to the pointers unscaled (byte counts).
  ; The loop is bottom-tested, so output_height must be at least 1.
  ;
  ; Loop register roles:
  ;   rsi = source row       rdi = destination row    rdx = filter table
  ;   rax = source pitch     rcx = rows remaining     r8  = output_width (64-bit)
  ;   xmm0 = all zero (supplies the high bytes for punpcklbw)
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
  sym(vp8_filter_block1d8_h6_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 7
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(1)                     ; output_ptr
          mov         rdx, arg(6)                     ; vp8_filter
          movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line
          movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(5)            ; output_width
  %endif
          pxor        xmm0, xmm0                      ; zero for byte->word unpack

  .h6_8_row:
          movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
          movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]
          prefetcht2  [rsi + rax - 2]                 ; first byte the next row will load
          pslldq      xmm1, 8
          por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

          ; tap 1: src[-2 .. 5]
          punpcklbw   xmm3, xmm0
          pmullw      xmm3, XMMWORD PTR [rdx]

          ; tap 2: src[-1 .. 6]
          movdqa      xmm4, xmm1
          psrldq      xmm4, 1
          punpcklbw   xmm4, xmm0
          pmullw      xmm4, XMMWORD PTR [rdx + 16]

          ; tap 3: src[0 .. 7]
          movdqa      xmm5, xmm1
          psrldq      xmm5, 2
          punpcklbw   xmm5, xmm0
          pmullw      xmm5, [rdx + 32]

          ; tap 4: src[1 .. 8]
          movdqa      xmm6, xmm1
          psrldq      xmm6, 3
          punpcklbw   xmm6, xmm0
          pmullw      xmm6, [rdx + 48]

          ; tap 5: src[2 .. 9]
          movdqa      xmm7, xmm1
          psrldq      xmm7, 4
          punpcklbw   xmm7, xmm0
          pmullw      xmm7, [rdx + 64]

          ; tap 6: src[3 .. 10] (consumes xmm1 in place, so it comes last)
          psrldq      xmm1, 5
          punpcklbw   xmm1, xmm0
          pmullw      xmm1, [rdx + 80]

          ; Saturating adds are order-sensitive: keep 2,5,3,1,4,6 then rounding.
          paddsw      xmm4, xmm7
          paddsw      xmm4, xmm5
          paddsw      xmm4, xmm3
          paddsw      xmm4, xmm6
          paddsw      xmm4, xmm1
          paddsw      xmm4, [GLOBAL(rd)]
          psraw       xmm4, 7

          packuswb    xmm4, xmm0                      ; clamp to 0..255
          punpcklbw   xmm4, xmm0                      ; back to 8 words
          movdqa      XMMWORD Ptr [rdi], xmm4

          lea         rsi, [rsi + rax]                ; next source row
  %if ABI_IS_32BIT
          add         rdi, DWORD Ptr arg(5)           ; output_width
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .h6_8_row

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_filter_block1d16_h6_sse2
  ; (
  ;     unsigned char  *src_ptr,
  ;     unsigned short *output_ptr,
  ;     unsigned int    src_pixels_per_line,
  ;     unsigned int    pixel_step,             (never read by this routine)
  ;     unsigned int    output_height,
  ;     unsigned int    output_width,
  ;     short          *vp8_filter
  ; )
  ;
  ; Horizontal 6-tap filter: 16 output pixels per row, one row per loop pass,
  ; computed as two independent 8-pixel halves.
  ;
  ; Per row the bytes src[-2 .. 18] are read with three 8-byte loads.  For each
  ; half, six 8-pixel windows shifted by 0..5 bytes are widened to words and
  ; multiplied by the 8 words at vp8_filter + 16*k (k = 0..5).  The products
  ; are summed with saturating adds, `rd` (defined outside this routine) is
  ; added, and the sum is shifted right arithmetically by 7.  Each half is
  ; clamped to 0..255, widened back to words and stored with an aligned
  ; 16-byte store at [output_ptr] and [output_ptr + 16].
  ;
  ; src_ptr advances by src_pixels_per_line and output_ptr by output_width per
  ; row; both are added unscaled (byte counts).  The loop is bottom-tested, so
  ; output_height must be at least 1.
  ;
  ; Loop register roles:
  ;   rsi = source row       rdi = destination row    rdx = filter table
  ;   rax = source pitch     rcx = rows remaining     r8  = output_width (64-bit)
  ;   xmm0 = all zero        xmm2 = source bytes for the second half
  ;*************************************************************************************/
  global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
  sym(vp8_filter_block1d16_h6_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 7
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(1)                     ; output_ptr
          mov         rdx, arg(6)                     ; vp8_filter
          movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line
          movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(5)            ; output_width
  %endif
          pxor        xmm0, xmm0                      ; zero for byte->word unpack

  .h6_16_row:
          movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
          movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

          ; The last load starts at +11 rather than +14 so that nothing past
          ; src[18] is touched.  After shifting left by 5 bytes its three lowest
          ; bytes overlap src[11..13] already present in xmm1; they hold the
          ; same values, so OR-ing without masking is safe.
          movq        xmm2, MMWORD PTR [rsi +11]      ; src[11 .. 18]
          pslldq      xmm2, 5
          por         xmm2, xmm1                      ; xmm2 = src[ 6 .. 18]

          prefetcht2  [rsi + rax - 2]                 ; first byte the next row will load
          pslldq      xmm1, 8
          por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

          ;---- pixels 0..7 ----------------------------------------------------
          ; tap 1: src[-2 .. 5]
          punpcklbw   xmm3, xmm0
          pmullw      xmm3, XMMWORD PTR [rdx]

          ; tap 2: src[-1 .. 6]
          movdqa      xmm4, xmm1
          psrldq      xmm4, 1
          punpcklbw   xmm4, xmm0
          pmullw      xmm4, XMMWORD PTR [rdx + 16]

          ; tap 3: src[0 .. 7]
          movdqa      xmm5, xmm1
          psrldq      xmm5, 2
          punpcklbw   xmm5, xmm0
          pmullw      xmm5, [rdx + 32]

          ; tap 4: src[1 .. 8]
          movdqa      xmm6, xmm1
          psrldq      xmm6, 3
          punpcklbw   xmm6, xmm0
          pmullw      xmm6, [rdx + 48]

          ; tap 5: src[2 .. 9]
          movdqa      xmm7, xmm1
          psrldq      xmm7, 4
          punpcklbw   xmm7, xmm0
          pmullw      xmm7, [rdx + 64]

          ; tap 6: src[3 .. 10] (consumes xmm1 in place)
          psrldq      xmm1, 5
          punpcklbw   xmm1, xmm0
          pmullw      xmm1, [rdx + 80]

          ; Saturating adds are order-sensitive: keep 2,5,3,1,4,6 then rounding.
          paddsw      xmm4, xmm7
          paddsw      xmm4, xmm5
          paddsw      xmm4, xmm3
          paddsw      xmm4, xmm6
          paddsw      xmm4, xmm1
          paddsw      xmm4, [GLOBAL(rd)]
          psraw       xmm4, 7

          packuswb    xmm4, xmm0                      ; clamp to 0..255
          punpcklbw   xmm4, xmm0                      ; back to 8 words
          movdqa      XMMWORD Ptr [rdi], xmm4

          ;---- pixels 8..15 ---------------------------------------------------
          ; tap 1: src[6 .. 13]
          movdqa      xmm3, xmm2
          punpcklbw   xmm3, xmm0
          pmullw      xmm3, XMMWORD PTR [rdx]

          ; tap 2: src[7 .. 14]
          movdqa      xmm4, xmm2
          psrldq      xmm4, 1
          punpcklbw   xmm4, xmm0
          pmullw      xmm4, XMMWORD PTR [rdx + 16]

          ; tap 3: src[8 .. 15]
          movdqa      xmm5, xmm2
          psrldq      xmm5, 2
          punpcklbw   xmm5, xmm0
          pmullw      xmm5, [rdx + 32]

          ; tap 4: src[9 .. 16]
          movdqa      xmm6, xmm2
          psrldq      xmm6, 3
          punpcklbw   xmm6, xmm0
          pmullw      xmm6, [rdx + 48]

          ; tap 5: src[10 .. 17]
          movdqa      xmm7, xmm2
          psrldq      xmm7, 4
          punpcklbw   xmm7, xmm0
          pmullw      xmm7, [rdx + 64]

          ; tap 6: src[11 .. 18] (consumes xmm2 in place)
          psrldq      xmm2, 5
          punpcklbw   xmm2, xmm0
          pmullw      xmm2, [rdx + 80]

          paddsw      xmm4, xmm7
          paddsw      xmm4, xmm5
          paddsw      xmm4, xmm3
          paddsw      xmm4, xmm6
          paddsw      xmm4, xmm2
          paddsw      xmm4, [GLOBAL(rd)]
          psraw       xmm4, 7

          packuswb    xmm4, xmm0                      ; clamp to 0..255
          punpcklbw   xmm4, xmm0                      ; back to 8 words
          movdqa      XMMWORD Ptr [rdi+16], xmm4

          lea         rsi, [rsi + rax]                ; next source row
  %if ABI_IS_32BIT
          add         rdi, DWORD Ptr arg(5)           ; output_width
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .h6_16_row

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_filter_block1d8_v6_sse2
  ; (
  ;     short         *src_ptr,
  ;     unsigned char *output_ptr,
  ;     int            dst_pitch,
  ;     unsigned int   pixels_per_line,
  ;     unsigned int   pixel_step,              (never read by this routine)
  ;     unsigned int   output_height,
  ;     unsigned int   output_width,            (never read by this routine)
  ;     short         *vp8_filter
  ; )
  ;
  ; Vertical 6-tap filter over 16-bit input: 8 output pixels per row.
  ;
  ; The read pointer starts two strides before src_ptr.  Every pass loads six
  ; consecutive rows of 8 words with aligned 16-byte loads (so each row address
  ; must be 16-byte aligned), multiplies row k by the 8 words at
  ; vp8_filter + 16*k, sums the products with saturating adds, adds `rd`
  ; (defined outside this routine), shifts right arithmetically by 7 and packs
  ; to unsigned bytes with saturation.  Eight bytes are written per row.
  ;
  ; pixels_per_line is added to the source pointer unscaled, i.e. it is used as
  ; a byte stride; dst_pitch likewise steps output_ptr.  The loop is
  ; bottom-tested, so output_height must be at least 1.
  ;
  ; Loop register roles:
  ;   rsi = top row of the 6-row window    rdx = source stride
  ;   rdi = destination row                rax = filter table
  ;   rcx = rows remaining                 r8  = dst_pitch (64-bit only)
  ;   xmm0 = all zero                      xmm7 = rounding constant `rd`
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
  sym(vp8_filter_block1d8_v6_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 8
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(1)                     ; output_ptr
          mov         rax, arg(7)                     ; vp8_filter
          movsxd      rdx, dword ptr arg(3)           ; pixels_per_line
          movsxd      rcx, DWORD PTR arg(5)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(2)            ; dst_pitch
  %endif

          ; The window for output row 0 begins two rows above src_ptr.
          sub         rsi, rdx
          sub         rsi, rdx

          pxor        xmm0, xmm0
          movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]

  .v6_8_row:
          ; Fetch the six rows.  Only 1/2/4 index scales exist, so rows 3 and 5
          ; are reached by first advancing rsi one stride; that advance also
          ; serves as the step to the next output row.
          movdqa      xmm1, XMMWORD PTR [rsi]             ; row 0
          movdqa      xmm2, XMMWORD PTR [rsi + rdx]       ; row 1
          movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; row 2
          movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; row 4
          add         rsi, rdx
          movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2]   ; row 3
          movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4]   ; row 5

          pmullw      xmm1, [rax]                     ; tap 1
          pmullw      xmm2, [rax + 16]                ; tap 2
          pmullw      xmm3, [rax + 32]                ; tap 3
          pmullw      xmm4, [rax + 48]                ; tap 4
          pmullw      xmm5, [rax + 64]                ; tap 5
          pmullw      xmm6, [rax + 80]                ; tap 6

          ; Saturating adds are order-sensitive: keep 2,5,3,1,4,6 then rounding.
          paddsw      xmm2, xmm5
          paddsw      xmm2, xmm3
          paddsw      xmm2, xmm1
          paddsw      xmm2, xmm4
          paddsw      xmm2, xmm6
          paddsw      xmm2, xmm7
          psraw       xmm2, 7

          packuswb    xmm2, xmm0                      ; clamp to 0..255
          movq        QWORD PTR [rdi], xmm2           ; 8 output pixels

  %if ABI_IS_32BIT
          add         rdi, DWORD PTR arg(2)           ; dst_pitch
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .v6_8_row

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_filter_block1d16_v6_sse2
  ; (
  ;     unsigned short *src_ptr,
  ;     unsigned char  *output_ptr,
  ;     int             dst_pitch,
  ;     unsigned int    pixels_per_line,
  ;     unsigned int    pixel_step,             (never read by this routine)
  ;     unsigned int    output_height,
  ;     unsigned int    output_width,           (never read by this routine)
  ;     const short    *vp8_filter
  ; )
  ;
  ; Vertical 6-tap filter over 16-bit input: 16 output pixels per row, handled
  ; as two 8-word halves (accumulators xmm1 = pixels 0..7, xmm2 = pixels 8..15).
  ;
  ; The read pointer starts two strides before src_ptr.  Every pass loads six
  ; consecutive rows of 16 words with aligned 16-byte loads, multiplies row k
  ; by the 8 words at vp8_filter + 16*k, sums with saturating adds, adds `rd`
  ; (defined outside this routine), shifts right arithmetically by 7 and packs
  ; both halves to 16 unsigned bytes, written with one aligned 16-byte store.
  ; Source rows and destination rows must therefore be 16-byte aligned.
  ;
  ; pixels_per_line is added to the source pointer unscaled (byte stride);
  ; dst_pitch likewise steps output_ptr.  The loop is bottom-tested, so
  ; output_height must be at least 1.
  ;
  ; All eight XMM registers are needed for row data, so the rounding constant
  ; is reloaded into xmm7 on every pass instead of being kept live.
  ;
  ; Loop register roles:
  ;   rsi = top row of the 6-row window    rdx = source stride
  ;   rdi = destination row                rax = filter table
  ;   rcx = rows remaining                 r8  = dst_pitch (64-bit only)
  ;*************************************************************************************/
  global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
  sym(vp8_filter_block1d16_v6_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 8
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rax, arg(7)                     ; vp8_filter
          movsxd      rdx, dword ptr arg(3)           ; pixels_per_line
          mov         rdi, arg(1)                     ; output_ptr
          mov         rsi, arg(0)                     ; src_ptr

          ; The window for output row 0 begins two rows above src_ptr.
          sub         rsi, rdx
          sub         rsi, rdx

          movsxd      rcx, DWORD PTR arg(5)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(2)            ; dst_pitch
  %endif

  .vp8_filter_block1d16_v6_sse2_loop:
          ; Taps are accumulated in the order 2 5 3 1 4 6 (saturating adds are
          ; order-sensitive), so rows are read in that order too.
          movdqa      xmm1, XMMWORD PTR [rsi + rdx]           ; row 1 -> tap 2
          movdqa      xmm2, XMMWORD PTR [rsi + rdx + 16]
          pmullw      xmm1, [rax + 16]
          pmullw      xmm2, [rax + 16]

          movdqa      xmm3, XMMWORD PTR [rsi + rdx * 4]       ; row 4 -> tap 5
          movdqa      xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
          pmullw      xmm3, [rax + 64]
          pmullw      xmm4, [rax + 64]

          movdqa      xmm5, XMMWORD PTR [rsi + rdx * 2]       ; row 2 -> tap 3
          movdqa      xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
          pmullw      xmm5, [rax + 32]
          pmullw      xmm6, [rax + 32]

          movdqa      xmm7, XMMWORD PTR [rsi]                 ; row 0 -> tap 1
          movdqa      xmm0, XMMWORD PTR [rsi + 16]
          pmullw      xmm7, [rax]
          pmullw      xmm0, [rax]

          paddsw      xmm1, xmm3
          paddsw      xmm2, xmm4
          paddsw      xmm1, xmm5
          paddsw      xmm2, xmm6
          paddsw      xmm1, xmm7
          paddsw      xmm2, xmm0

          ; Advance one stride: reaches rows 3 and 5 with the 2x/4x index
          ; scales and doubles as the step to the next output row.
          add         rsi, rdx

          movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]       ; row 3 -> tap 4
          movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
          pmullw      xmm3, [rax + 48]
          pmullw      xmm4, [rax + 48]

          movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]       ; row 5 -> tap 6
          movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
          pmullw      xmm5, [rax + 80]
          pmullw      xmm6, [rax + 80]

          movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]          ; rounding constant

          ; (A pxor xmm0, xmm0 used to sit here.  xmm0 is not read again in
          ; this pass and is reloaded at the top of the next one, so the clear
          ; was dead work in the hot loop and has been dropped.)

          paddsw      xmm1, xmm3
          paddsw      xmm2, xmm4
          paddsw      xmm1, xmm5
          paddsw      xmm2, xmm6

          paddsw      xmm1, xmm7
          paddsw      xmm2, xmm7

          psraw       xmm1, 7
          psraw       xmm2, 7

          packuswb    xmm1, xmm2                      ; clamp both halves to 0..255
          movdqa      XMMWORD PTR [rdi], xmm1         ; 16 output pixels

  %if ABI_IS_32BIT
          add         rdi, DWORD PTR arg(2)           ; dst_pitch
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .vp8_filter_block1d16_v6_sse2_loop

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_filter_block1d8_h6_only_sse2
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pixels_per_line,
  ;     unsigned char *output_ptr,
  ;     int            dst_pitch,
  ;     unsigned int   output_height,
  ;     const short   *vp8_filter
  ; )
  ;
  ; First-pass (horizontal) filter used on its own when yoffset == 0.
  ;
  ; Horizontal 6-tap filter, 8 output pixels per row, written directly as
  ; bytes.  Each pass loads src[-2 .. 13], builds six 8-pixel windows shifted
  ; by 0..5 bytes, widens them to words and multiplies window k by the 8 words
  ; at vp8_filter + 16*k.  The products are summed with saturating adds, `rd`
  ; (defined outside this routine) is added, the sum is shifted right
  ; arithmetically by 7 and packed to unsigned bytes with saturation.  Eight
  ; bytes are stored per row (movq, no alignment requirement).
  ;
  ; src_ptr advances by src_pixels_per_line and output_ptr by dst_pitch per
  ; row.  The loop is bottom-tested, so output_height must be at least 1.
  ;
  ; Loop register roles:
  ;   rsi = source row       rdi = destination row    rdx = filter table
  ;   rax = source pitch     rcx = rows remaining     r8  = dst_pitch (64-bit)
  ;   xmm0 = all zero
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
  sym(vp8_filter_block1d8_h6_only_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 6
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(2)                     ; output_ptr
          mov         rdx, arg(5)                     ; vp8_filter
          movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
          movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(3)            ; dst_pitch
  %endif
          pxor        xmm0, xmm0                      ; zero for byte->word unpack

  .h6_only_8_row:
          movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
          movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]
          prefetcht2  [rsi + rax - 2]                 ; first byte the next row will load
          pslldq      xmm1, 8
          por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

          ; tap 1: src[-2 .. 5]
          punpcklbw   xmm3, xmm0
          pmullw      xmm3, XMMWORD PTR [rdx]

          ; tap 2: src[-1 .. 6]
          movdqa      xmm4, xmm1
          psrldq      xmm4, 1
          punpcklbw   xmm4, xmm0
          pmullw      xmm4, XMMWORD PTR [rdx + 16]

          ; tap 3: src[0 .. 7]
          movdqa      xmm5, xmm1
          psrldq      xmm5, 2
          punpcklbw   xmm5, xmm0
          pmullw      xmm5, [rdx + 32]

          ; tap 4: src[1 .. 8]
          movdqa      xmm6, xmm1
          psrldq      xmm6, 3
          punpcklbw   xmm6, xmm0
          pmullw      xmm6, [rdx + 48]

          ; tap 5: src[2 .. 9]
          movdqa      xmm7, xmm1
          psrldq      xmm7, 4
          punpcklbw   xmm7, xmm0
          pmullw      xmm7, [rdx + 64]

          ; tap 6: src[3 .. 10] (consumes xmm1 in place, so it comes last)
          psrldq      xmm1, 5
          punpcklbw   xmm1, xmm0
          pmullw      xmm1, [rdx + 80]

          ; Saturating adds are order-sensitive: keep 2,5,3,1,4,6 then rounding.
          paddsw      xmm4, xmm7
          paddsw      xmm4, xmm5
          paddsw      xmm4, xmm3
          paddsw      xmm4, xmm6
          paddsw      xmm4, xmm1
          paddsw      xmm4, [GLOBAL(rd)]
          psraw       xmm4, 7

          packuswb    xmm4, xmm0                      ; clamp to 0..255
          movq        QWORD PTR [rdi], xmm4           ; 8 output pixels

          lea         rsi, [rsi + rax]                ; next source row
  %if ABI_IS_32BIT
          add         rdi, DWORD Ptr arg(3)           ; dst_pitch
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .h6_only_8_row

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_filter_block1d16_h6_only_sse2
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pixels_per_line,
  ;     unsigned char *output_ptr,
  ;     int            dst_pitch,
  ;     unsigned int   output_height,
  ;     const short   *vp8_filter
  ; )
  ;
  ; First-pass (horizontal) filter used on its own when yoffset == 0.
  ;
  ; Horizontal 6-tap filter, 16 output pixels per row, computed as two
  ; 8-pixel halves and written directly as bytes.  For each half, six 8-pixel
  ; windows shifted by 0..5 bytes are widened to words and multiplied by the
  ; 8 words at vp8_filter + 16*k (k = 0..5).  The products are summed with
  ; saturating adds, `rd` (defined outside this routine) is added, the sum is
  ; shifted right arithmetically by 7 and packed to unsigned bytes with
  ; saturation.  Each half is stored with an 8-byte movq (no alignment
  ; requirement).
  ;
  ; Source footprint per row is exactly src[-2 .. 18]: the widest window of
  ; the second half is src[11 .. 18], and nothing beyond it is read.
  ;
  ; src_ptr advances by src_pixels_per_line and output_ptr by dst_pitch per
  ; row.  The loop is bottom-tested, so output_height must be at least 1.
  ;
  ; Loop register roles:
  ;   rsi = source row       rdi = destination row    rdx = filter table
  ;   rax = source pitch     rcx = rows remaining     r8  = dst_pitch (64-bit)
  ;   xmm0 = all zero        xmm2 = source bytes for the second half
  ;*************************************************************************************/
  global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
  sym(vp8_filter_block1d16_h6_only_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 6
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rdx, arg(5)                     ; vp8_filter
          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(2)                     ; output_ptr
          movsxd      rcx, dword ptr arg(4)           ; output_height
          movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(3)            ; dst_pitch
  %endif
          pxor        xmm0, xmm0                      ; zero for byte->word unpack

  .filter_block1d16_h6_only_sse2_rowloop:
          movq        xmm3, MMWORD PTR [rsi - 2]      ; src[-2 ..  5]
          movq        xmm1, MMWORD PTR [rsi + 6]      ; src[ 6 .. 13]

          ; Load from 11 (not 14) to avoid reading out of bounds: a load at +14
          ; would touch src[19..21], which no tap ever uses.
          movq        xmm2, MMWORD PTR [rsi +11]      ; src[11 .. 18]
          ; The lower bits are not cleared before 'or'ing with xmm1, but that
          ; is OK because the values in the overlapping positions (src[11..13])
          ; are already equal to the ones in xmm1.
          pslldq      xmm2, 5
          por         xmm2, xmm1                      ; xmm2 = src[ 6 .. 18]

          prefetcht2  [rsi+rax-2]                     ; first byte the next row will load

          pslldq      xmm1, 8
          por         xmm1, xmm3                      ; xmm1 = src[-2 .. 13]

          ;---- pixels 0..7 ----------------------------------------------------
          movdqa      xmm4, xmm1
          movdqa      xmm5, xmm1
          movdqa      xmm6, xmm1
          movdqa      xmm7, xmm1

          punpcklbw   xmm3, xmm0                      ; src[-2 .. 5] as words
          psrldq      xmm4, 1
          pmullw      xmm3, XMMWORD PTR [rdx]         ; tap 1

          punpcklbw   xmm4, xmm0                      ; src[-1 .. 6]
          psrldq      xmm5, 2
          pmullw      xmm4, XMMWORD PTR [rdx+16]      ; tap 2

          punpcklbw   xmm5, xmm0                      ; src[0 .. 7]
          psrldq      xmm6, 3
          pmullw      xmm5, [rdx+32]                  ; tap 3

          punpcklbw   xmm6, xmm0                      ; src[1 .. 8]
          psrldq      xmm7, 4
          pmullw      xmm6, [rdx+48]                  ; tap 4

          punpcklbw   xmm7, xmm0                      ; src[2 .. 9]
          psrldq      xmm1, 5
          pmullw      xmm7, [rdx+64]                  ; tap 5

          punpcklbw   xmm1, xmm0                      ; src[3 .. 10]
          pmullw      xmm1, [rdx+80]                  ; tap 6

          ; Saturating adds are order-sensitive: keep 2,5,3,1,4,6 then rounding.
          paddsw      xmm4, xmm7
          paddsw      xmm4, xmm5
          paddsw      xmm4, xmm3
          paddsw      xmm4, xmm6
          paddsw      xmm4, xmm1
          paddsw      xmm4, [GLOBAL(rd)]
          psraw       xmm4, 7

          packuswb    xmm4, xmm0                      ; lower 8 bytes
          movq        QWORD Ptr [rdi], xmm4           ; pixels 0..7

          ;---- pixels 8..15 ---------------------------------------------------
          movdqa      xmm3, xmm2
          movdqa      xmm4, xmm2
          movdqa      xmm5, xmm2
          movdqa      xmm6, xmm2
          movdqa      xmm7, xmm2

          punpcklbw   xmm3, xmm0                      ; src[6 .. 13] as words
          psrldq      xmm4, 1
          pmullw      xmm3, XMMWORD PTR [rdx]         ; tap 1

          punpcklbw   xmm4, xmm0                      ; src[7 .. 14]
          psrldq      xmm5, 2
          pmullw      xmm4, XMMWORD PTR [rdx+16]      ; tap 2

          punpcklbw   xmm5, xmm0                      ; src[8 .. 15]
          psrldq      xmm6, 3
          pmullw      xmm5, [rdx+32]                  ; tap 3

          punpcklbw   xmm6, xmm0                      ; src[9 .. 16]
          psrldq      xmm7, 4
          pmullw      xmm6, [rdx+48]                  ; tap 4

          punpcklbw   xmm7, xmm0                      ; src[10 .. 17]
          psrldq      xmm2, 5
          pmullw      xmm7, [rdx+64]                  ; tap 5

          punpcklbw   xmm2, xmm0                      ; src[11 .. 18]
          pmullw      xmm2, [rdx+80]                  ; tap 6

          paddsw      xmm4, xmm7
          paddsw      xmm4, xmm5
          paddsw      xmm4, xmm3
          paddsw      xmm4, xmm6
          paddsw      xmm4, xmm2
          paddsw      xmm4, [GLOBAL(rd)]
          psraw       xmm4, 7

          packuswb    xmm4, xmm0                      ; higher 8 bytes
          movq        QWORD Ptr [rdi+8], xmm4         ; pixels 8..15

          lea         rsi, [rsi + rax]                ; next source row
  %if ABI_IS_32BIT
          add         rdi, DWORD Ptr arg(3)           ; dst_pitch
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .filter_block1d16_h6_only_sse2_rowloop

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_filter_block1d8_v6_only_sse2
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pixels_per_line,
  ;     unsigned char *output_ptr,
  ;     int            dst_pitch,
  ;     unsigned int   output_height,
  ;     const short   *vp8_filter
  ; )
  ;
  ; Second-pass (vertical) filter used on its own when xoffset == 0.
  ;
  ; Vertical 6-tap filter over 8-bit input, 8 output pixels per row.  Every
  ; pass loads 8 bytes from each of six consecutive rows beginning at the
  ; current read pointer, widens them to words, multiplies row k by the
  ; 8 words at vp8_filter + 16*k, sums with saturating adds, adds `rd`
  ; (defined outside this routine), shifts right arithmetically by 7 and packs
  ; to unsigned bytes with saturation.  Eight bytes are stored per row.
  ;
  ; Unlike the 16-bit vertical filters in this file, no two-row back-off is
  ; applied to src_ptr here: the first window starts at src_ptr itself.
  ; NOTE(review): presumably the caller passes a pointer already moved up by
  ; two rows - confirm against the call site.
  ;
  ; The loop is bottom-tested, so output_height must be at least 1.
  ;
  ; Loop register roles:
  ;   rsi = top row of the 6-row window    rdx = source stride
  ;   rdi = destination row                rax = filter table
  ;   rcx = rows remaining                 r8  = dst_pitch (64-bit only)
  ;   xmm0 = all zero                      xmm7 = rounding constant `rd`
  ;*************************************************************************************/
  global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
  sym(vp8_filter_block1d8_v6_only_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 6
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(2)                     ; output_ptr
          mov         rax, arg(5)                     ; vp8_filter
          movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line
          movsxd      rcx, dword ptr arg(4)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(3)            ; dst_pitch
  %endif

          pxor        xmm0, xmm0                      ; zero for byte->word unpack
          movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]

  .v6_only_8_row:
          ; row 0 -> tap 1
          movq        xmm1, MMWORD PTR [rsi]
          punpcklbw   xmm1, xmm0
          pmullw      xmm1, [rax]

          ; row 1 -> tap 2
          movq        xmm2, MMWORD PTR [rsi + rdx]
          punpcklbw   xmm2, xmm0
          pmullw      xmm2, [rax + 16]

          ; row 2 -> tap 3
          movq        xmm3, MMWORD PTR [rsi + rdx * 2]
          punpcklbw   xmm3, xmm0
          pmullw      xmm3, [rax + 32]

          ; row 4 -> tap 5
          movq        xmm5, MMWORD PTR [rsi + rdx * 4]
          punpcklbw   xmm5, xmm0
          pmullw      xmm5, [rax + 64]

          ; Advance one stride: reaches rows 3 and 5 with the 2x/4x index
          ; scales and doubles as the step to the next output row.
          add         rsi, rdx

          ; row 3 -> tap 4
          movq        xmm4, MMWORD PTR [rsi + rdx * 2]
          punpcklbw   xmm4, xmm0
          pmullw      xmm4, [rax + 48]

          ; row 5 -> tap 6
          movq        xmm6, MMWORD PTR [rsi + rdx * 4]
          punpcklbw   xmm6, xmm0
          pmullw      xmm6, [rax + 80]

          ; Saturating adds are order-sensitive: keep 2,5,3,1,4,6 then rounding.
          paddsw      xmm2, xmm5
          paddsw      xmm2, xmm3
          paddsw      xmm2, xmm1
          paddsw      xmm2, xmm4
          paddsw      xmm2, xmm6
          paddsw      xmm2, xmm7
          psraw       xmm2, 7

          packuswb    xmm2, xmm0                      ; clamp to 0..255
          movq        QWORD PTR [rdi], xmm2           ; 8 output pixels

  %if ABI_IS_32BIT
          add         rdi, DWORD PTR arg(3)           ; dst_pitch
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .v6_only_8_row

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  ;/************************************************************************************
  ; void vp8_unpack_block1d16_h6_sse2
  ; (
  ;     unsigned char  *src_ptr,
  ;     unsigned short *output_ptr,
  ;     unsigned int    src_pixels_per_line,
  ;     unsigned int    output_height,
  ;     unsigned int    output_width
  ; )
  ;
  ; Widens a 16-pixel-wide block from bytes to 16-bit words without filtering.
  ;
  ; Each pass reads src[0 .. 15] with two 8-byte loads, zero-extends every byte
  ; to a word and writes the 16 words with two aligned 16-byte stores at
  ; [output_ptr] and [output_ptr + 16]; destination rows must therefore be
  ; 16-byte aligned.
  ;
  ; src_ptr advances by src_pixels_per_line and output_ptr by output_width per
  ; row; both are added unscaled (byte counts).  The loop is bottom-tested, so
  ; output_height must be at least 1.
  ;
  ; Only xmm0, xmm1 and xmm3 are used, and no SAVE_XMM is issued.
  ; NOTE(review): nothing here goes through GLOBAL(), so the GET_GOT /
  ; RESTORE_GOT pair looks unnecessary - confirm against the ABI macro
  ; definitions before removing it.
  ;
  ; Loop register roles:
  ;   rsi = source row       rdi = destination row    rax = source pitch
  ;   rcx = rows remaining   r8  = output_width (64-bit only)
  ;   xmm0 = all zero
  ;*************************************************************************************/
  global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
  sym(vp8_unpack_block1d16_h6_sse2):
          push        rbp
          mov         rbp, rsp
          SHADOW_ARGS_TO_STACK 5
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          mov         rsi, arg(0)                     ; src_ptr
          mov         rdi, arg(1)                     ; output_ptr
          movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line
          movsxd      rcx, dword ptr arg(3)           ; output_height
  %if ABI_IS_32BIT=0
          movsxd      r8, dword ptr arg(4)            ; output_width
  %endif
          pxor        xmm0, xmm0                      ; zero for byte->word unpack

  .unpack16_row:
          ; pixels 0..7
          movq        xmm1, MMWORD PTR [rsi]          ; src[0 .. 7]
          punpcklbw   xmm1, xmm0
          movdqa      XMMWORD Ptr [rdi], xmm1

          ; pixels 8..15
          movq        xmm3, MMWORD PTR [rsi+8]        ; src[8 .. 15]
          punpcklbw   xmm3, xmm0
          movdqa      XMMWORD Ptr [rdi + 16], xmm3

          lea         rsi, [rsi + rax]                ; next source row
  %if ABI_IS_32BIT
          add         rdi, DWORD Ptr arg(4)           ; output_width
  %else
          add         rdi, r8
  %endif
          dec         rcx
          jnz         .unpack16_row

          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          UNSHADOW_ARGS
          pop         rbp
          ret
  717. ;void vp8_bilinear_predict16x16_sse2
  718. ;(
  719. ; unsigned char *src_ptr,
  720. ; int src_pixels_per_line,
  721. ; int xoffset,
  722. ; int yoffset,
  723. ; unsigned char *dst_ptr,
  724. ; int dst_pitch
  725. ;)
       ;
       ; Writes 16 rows of 16 bytes to dst_ptr (rows dst_pitch bytes apart),
       ; each byte a 2-tap horizontal blend of a source pixel with its right
       ; neighbour followed by a 2-tap vertical blend with the row below.
       ; Every blend is (a*tap0 + b*tap1 + rd) >> VP8_FILTER_SHIFT on 16-bit
       ; lanes, packed back to bytes with unsigned saturation.
       ;
       ; Filter table: the entry for an offset starts at offset*32 bytes
       ; (shl by 5); its first 16 bytes multiply the current sample and the
       ; following 16 bytes multiply the neighbouring sample.
       ;
       ; Paths:
       ;   xoffset == 0               -> .b16x16_sp_only (vertical blend only)
       ;   xoffset != 0, yoffset == 0 -> .b16x16_fp_only (horizontal blend only)
       ;   otherwise                  -> both passes, one row per iteration
       ; NOTE(review): xoffset == 0 && yoffset == 0 also takes the sp_only
       ; path and still applies table entry 0 - presumably an identity
       ; filter; confirm against the table definition / callers.
       ;
       ; Memory access:
       ;   - Destination rows are written with movdqa, so every row address
       ;     (dst_ptr + n*dst_pitch) must be 16-byte aligned.
       ;   - Source rows are read with movdqu (no alignment requirement).
       ;     The horizontal pass loads 16 bytes at [rsi] and at [rsi+1], i.e.
       ;     17 bytes per row; the two-pass and sp_only paths read 17 rows,
       ;     the fp_only path reads 16 rows.
       ;   - Each loop ends when rdi reaches dst_ptr + 16*dst_pitch.
  726. extern sym(vp8_bilinear_filters_x86_8)
  727. global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
  728. sym(vp8_bilinear_predict16x16_sse2):
  729. push rbp
  730. mov rbp, rsp
  731. SHADOW_ARGS_TO_STACK 6
  732. SAVE_XMM 7
  733. GET_GOT rbx
  734. push rsi
  735. push rdi
  736. ; end prolog
  737. ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
  738. ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
  739. lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] ; rcx = filter table base
  740. movsxd rax, dword ptr arg(2) ;xoffset
  741. cmp rax, 0 ;skip first_pass filter if xoffset=0
  742. je .b16x16_sp_only
  743. shl rax, 5 ; table entries are 32 bytes apart
  744. add rax, rcx ;HFilter
  745. mov rdi, arg(4) ;dst_ptr
  746. mov rsi, arg(0) ;src_ptr
  747. movsxd rdx, dword ptr arg(5) ;dst_pitch
  748. movdqa xmm1, [rax] ; xmm1 = horizontal tap applied to pixel x
  749. movdqa xmm2, [rax+16] ; xmm2 = horizontal tap applied to pixel x+1
  750. movsxd rax, dword ptr arg(3) ;yoffset
  751. cmp rax, 0 ;skip second_pass filter if yoffset=0
  752. je .b16x16_fp_only
  753. shl rax, 5 ; table entries are 32 bytes apart
  754. add rax, rcx ;VFilter
       ; rcx is no longer needed as the table base; reuse it as the loop
       ; sentinel: rcx = dst_ptr + 16*dst_pitch (two steps of 8*pitch).
  755. lea rcx, [rdi+rdx*8]
  756. lea rcx, [rcx+rdx*8]
       ; rdx switches from dst_pitch to the source pitch from here on; the
       ; destination pitch is taken from r8 (64-bit) or arg(5) (32-bit).
  757. movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
  758. pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
  759. %if ABI_IS_32BIT=0
  760. movsxd r8, dword ptr arg(5) ;dst_pitch
  761. %endif
  762. ; get the first horizontal line done
  763. movdqu xmm3, [rsi] ; 16 source bytes starting at pixel x
  764. movdqa xmm4, xmm3 ; make a copy of current line
  765. punpcklbw xmm3, xmm0 ; low 8 pixels as words
  766. punpckhbw xmm4, xmm0 ; high 8 pixels as words
  767. pmullw xmm3, xmm1
  768. pmullw xmm4, xmm1
  769. movdqu xmm5, [rsi+1] ; same row shifted by one pixel (x+1)
  770. movdqa xmm6, xmm5
  771. punpcklbw xmm5, xmm0
  772. punpckhbw xmm6, xmm0
  773. pmullw xmm5, xmm2
  774. pmullw xmm6, xmm2
  775. paddw xmm3, xmm5
  776. paddw xmm4, xmm6
  777. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  778. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  779. paddw xmm4, [GLOBAL(rd)]
  780. psraw xmm4, VP8_FILTER_SHIFT
       ; xmm7 carries the previous row's horizontal result, packed to bytes,
       ; into the next loop iteration.
  781. movdqa xmm7, xmm3
  782. packuswb xmm7, xmm4
  783. add rsi, rdx ; next line
       ; Two-pass loop. Per iteration: horizontally filter the row at rsi,
       ; blend it vertically with the previous row held in xmm7, store one
       ; destination row.
  784. .next_row:
  785. movdqu xmm3, [rsi] ; 16 source bytes starting at pixel x
  786. movdqa xmm4, xmm3 ; make a copy of current line
  787. punpcklbw xmm3, xmm0 ; low 8 pixels as words
  788. punpckhbw xmm4, xmm0 ; high 8 pixels as words
  789. pmullw xmm3, xmm1
  790. pmullw xmm4, xmm1
  791. movdqu xmm5, [rsi+1] ; same row shifted by one pixel (x+1)
  792. movdqa xmm6, xmm5
  793. punpcklbw xmm5, xmm0
  794. punpckhbw xmm6, xmm0
  795. pmullw xmm5, xmm2
  796. pmullw xmm6, xmm2
  797. paddw xmm3, xmm5
  798. paddw xmm4, xmm6
       ; xmm5/xmm6 are free again: widen the previous row and apply the
       ; first vertical tap to it.
  799. movdqa xmm5, xmm7
  800. movdqa xmm6, xmm7
  801. punpcklbw xmm5, xmm0
  802. punpckhbw xmm6, xmm0
  803. pmullw xmm5, [rax]
  804. pmullw xmm6, [rax]
  805. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  806. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  807. paddw xmm4, [GLOBAL(rd)]
  808. psraw xmm4, VP8_FILTER_SHIFT
       ; Save this row's horizontal result (as bytes) for the next iteration
       ; before xmm3/xmm4 are consumed by the vertical pass.
  809. movdqa xmm7, xmm3
  810. packuswb xmm7, xmm4
  811. pmullw xmm3, [rax+16] ; second vertical tap applied to the current row
  812. pmullw xmm4, [rax+16]
  813. paddw xmm3, xmm5
  814. paddw xmm4, xmm6
  815. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  816. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  817. paddw xmm4, [GLOBAL(rd)]
  818. psraw xmm4, VP8_FILTER_SHIFT
  819. packuswb xmm3, xmm4
  820. movdqa [rdi], xmm3 ; store the results in the destination (aligned store)
  821. add rsi, rdx ; next line
  822. %if ABI_IS_32BIT
  823. add rdi, DWORD PTR arg(5) ;dst_pitch
  824. %else
  825. add rdi, r8
  826. %endif
  827. cmp rdi, rcx ; stop once rdi == dst_ptr + 16*dst_pitch
  828. jne .next_row
  829. jmp .done
       ; Vertical-only path (xoffset == 0). Entered with rcx = filter table
       ; base and no other registers set up yet.
  830. .b16x16_sp_only:
  831. movsxd rax, dword ptr arg(3) ;yoffset
  832. shl rax, 5 ; table entries are 32 bytes apart
  833. add rax, rcx ;VFilter
  834. mov rdi, arg(4) ;dst_ptr
  835. mov rsi, arg(0) ;src_ptr
  836. movsxd rdx, dword ptr arg(5) ;dst_pitch
  837. movdqa xmm1, [rax] ; xmm1 = vertical tap applied to the upper row
  838. movdqa xmm2, [rax+16] ; xmm2 = vertical tap applied to the lower row
  839. lea rcx, [rdi+rdx*8]
  840. lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16*dst_pitch (loop sentinel)
  841. movsxd rax, dword ptr arg(1) ;src_pixels_per_line
  842. pxor xmm0, xmm0
  843. ; load the first source row; no horizontal filtering on this path
  844. movdqu xmm7, [rsi] ; xmm7 = upper row (raw source bytes)
  845. add rsi, rax ; next line
  846. .next_row_spo:
  847. movdqu xmm3, [rsi] ; lower row (raw source bytes)
  848. movdqa xmm5, xmm7
  849. movdqa xmm6, xmm7
  850. movdqa xmm4, xmm3 ; make a copy of current line
  851. movdqa xmm7, xmm3 ; current row becomes the upper row next iteration
  852. punpcklbw xmm5, xmm0
  853. punpckhbw xmm6, xmm0
  854. punpcklbw xmm3, xmm0 ; low 8 pixels as words
  855. punpckhbw xmm4, xmm0 ; high 8 pixels as words
  856. pmullw xmm5, xmm1
  857. pmullw xmm6, xmm1
  858. pmullw xmm3, xmm2
  859. pmullw xmm4, xmm2
  860. paddw xmm3, xmm5
  861. paddw xmm4, xmm6
  862. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  863. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  864. paddw xmm4, [GLOBAL(rd)]
  865. psraw xmm4, VP8_FILTER_SHIFT
  866. packuswb xmm3, xmm4
  867. movdqa [rdi], xmm3 ; store the results in the destination (aligned store)
  868. add rsi, rax ; next line
  869. add rdi, rdx ;dst_pitch
  870. cmp rdi, rcx
  871. jne .next_row_spo
  872. jmp .done
       ; Horizontal-only path (yoffset == 0). Entered with rdi = dst_ptr,
       ; rsi = src_ptr, rdx = dst_pitch and xmm1/xmm2 = horizontal taps
       ; already loaded above.
  873. .b16x16_fp_only:
  874. lea rcx, [rdi+rdx*8]
  875. lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16*dst_pitch (loop sentinel)
  876. movsxd rax, dword ptr arg(1) ;src_pixels_per_line
  877. pxor xmm0, xmm0
  878. .next_row_fpo:
  879. movdqu xmm3, [rsi] ; 16 source bytes starting at pixel x
  880. movdqa xmm4, xmm3 ; make a copy of current line
  881. punpcklbw xmm3, xmm0 ; low 8 pixels as words
  882. punpckhbw xmm4, xmm0 ; high 8 pixels as words
  883. pmullw xmm3, xmm1
  884. pmullw xmm4, xmm1
  885. movdqu xmm5, [rsi+1] ; same row shifted by one pixel (x+1)
  886. movdqa xmm6, xmm5
  887. punpcklbw xmm5, xmm0
  888. punpckhbw xmm6, xmm0
  889. pmullw xmm5, xmm2
  890. pmullw xmm6, xmm2
  891. paddw xmm3, xmm5
  892. paddw xmm4, xmm6
  893. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  894. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  895. paddw xmm4, [GLOBAL(rd)]
  896. psraw xmm4, VP8_FILTER_SHIFT
  897. packuswb xmm3, xmm4
  898. movdqa [rdi], xmm3 ; store the results in the destination (aligned store)
  899. add rsi, rax ; next line
  900. add rdi, rdx ; dst_pitch
  901. cmp rdi, rcx
  902. jne .next_row_fpo
       ; All three paths converge here.
  903. .done:
  904. ; begin epilog
  905. pop rdi
  906. pop rsi
  907. RESTORE_GOT
  908. RESTORE_XMM
  909. UNSHADOW_ARGS
  910. pop rbp
  911. ret
  912. ;void vp8_bilinear_predict8x8_sse2
  913. ;(
  914. ; unsigned char *src_ptr,
  915. ; int src_pixels_per_line,
  916. ; int xoffset,
  917. ; int yoffset,
  918. ; unsigned char *dst_ptr,
  919. ; int dst_pitch
  920. ;)
       ;
       ; Writes 8 rows of 8 bytes to dst_ptr (rows dst_pitch bytes apart).
       ; Each output byte is a 2-tap horizontal blend of a source pixel with
       ; its right neighbour, followed by a 2-tap vertical blend with the
       ; row below; every blend is (a*tap0 + b*tap1 + rd) >> VP8_FILTER_SHIFT.
       ; Unlike the 16x16 routine there is no shortcut for a zero xoffset or
       ; yoffset: both passes always run.
       ;
       ; Filter table: the entry for an offset starts at offset*32 bytes; the
       ; first 16 bytes multiply the current sample, the next 16 bytes the
       ; neighbouring sample.
       ;
       ; Memory access:
       ;   - Nine source rows are loaded with movdqu, 16 bytes each, although
       ;     only the first 9 bytes of a row feed the filter (pixels 0..7 and
       ;     their right neighbours 1..8).
       ;   - The rows are staged in a 144-byte (9 x 16) stack buffer, and rsp
       ;     itself is advanced 16 bytes at a time as the read pointer.
       ;   - Destination rows are written with movq (8 bytes), so no
       ;     destination alignment is required.
  921. global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
  922. sym(vp8_bilinear_predict8x8_sse2):
  923. push rbp
  924. mov rbp, rsp
  925. SHADOW_ARGS_TO_STACK 6
  926. SAVE_XMM 7
  927. GET_GOT rbx
  928. push rsi
  929. push rdi
  930. ; end prolog
       ; NOTE(review): ALIGN_STACK is defined elsewhere; presumably it aligns
       ; rsp to 16 bytes and pushes the old rsp, which the `pop rsp` before
       ; the epilogue restores - confirm against the macro definition. The
       ; movdqa accesses to [rsp] below rely on that alignment.
  931. ALIGN_STACK 16, rax
  932. sub rsp, 144 ; reserve 144 bytes
  933. ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
  934. ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
  935. lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] ; rcx = filter table base
  936. mov rsi, arg(0) ;src_ptr
  937. movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
  938. ;Read 9-line unaligned data in and put them on stack. This gives a big
  939. ;performance boost.
  940. movdqu xmm0, [rsi] ; row 0
  941. lea rax, [rdx + rdx*2] ; rax = 3 * source pitch
  942. movdqu xmm1, [rsi+rdx] ; row 1
  943. movdqu xmm2, [rsi+rdx*2] ; row 2
  944. add rsi, rax
  945. movdqu xmm3, [rsi] ; row 3
  946. movdqu xmm4, [rsi+rdx] ; row 4
  947. movdqu xmm5, [rsi+rdx*2] ; row 5
  948. add rsi, rax
  949. movdqu xmm6, [rsi] ; row 6
  950. movdqu xmm7, [rsi+rdx] ; row 7
       ; Row 0 is spilled first so that xmm0 can be reused to load row 8.
  951. movdqa XMMWORD PTR [rsp], xmm0
  952. movdqu xmm0, [rsi+rdx*2] ; row 8
  953. movdqa XMMWORD PTR [rsp+16], xmm1
  954. movdqa XMMWORD PTR [rsp+32], xmm2
  955. movdqa XMMWORD PTR [rsp+48], xmm3
  956. movdqa XMMWORD PTR [rsp+64], xmm4
  957. movdqa XMMWORD PTR [rsp+80], xmm5
  958. movdqa XMMWORD PTR [rsp+96], xmm6
  959. movdqa XMMWORD PTR [rsp+112], xmm7
  960. movdqa XMMWORD PTR [rsp+128], xmm0
  961. movsxd rax, dword ptr arg(2) ;xoffset
  962. shl rax, 5 ; table entries are 32 bytes apart
  963. add rax, rcx ;HFilter
  964. mov rdi, arg(4) ;dst_ptr
  965. movsxd rdx, dword ptr arg(5) ;dst_pitch
  966. movdqa xmm1, [rax] ; xmm1 = horizontal tap applied to pixel x
  967. movdqa xmm2, [rax+16] ; xmm2 = horizontal tap applied to pixel x+1
  968. movsxd rax, dword ptr arg(3) ;yoffset
  969. shl rax, 5 ; table entries are 32 bytes apart
  970. add rax, rcx ;VFilter
  971. lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8*dst_pitch (loop sentinel)
  972. movdqa xmm5, [rax] ; xmm5 = vertical tap applied to the upper row
  973. movdqa xmm6, [rax+16] ; xmm6 = vertical tap applied to the lower row
  974. pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
  975. ; get the first horizontal line done
  976. movdqa xmm3, XMMWORD PTR [rsp]
  977. movdqa xmm4, xmm3 ; make a copy of current line
  978. psrldq xmm4, 1 ; shift right one byte so lane i holds pixel i+1
  979. punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
  980. punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
  981. pmullw xmm3, xmm1
  982. pmullw xmm4, xmm2
  983. paddw xmm3, xmm4
  984. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  985. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  986. movdqa xmm7, xmm3 ; xmm7 = previous row's horizontal result (kept as words)
  987. add rsp, 16 ; next line
       ; Per iteration: horizontally filter the staged row at [rsp], blend it
       ; vertically with the previous row in xmm7, store 8 output bytes.
  988. .next_row8x8:
  989. movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  990. movdqa xmm4, xmm3 ; make a copy of current line
  991. psrldq xmm4, 1 ; shift right one byte so lane i holds pixel i+1
  992. punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
  993. punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
  994. pmullw xmm3, xmm1
  995. pmullw xmm4, xmm2
  996. paddw xmm3, xmm4
  997. pmullw xmm7, xmm5 ; previous row * first vertical tap
  998. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  999. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  1000. movdqa xmm4, xmm3 ; keep this row's horizontal result for the next iteration
  1001. pmullw xmm3, xmm6 ; current row * second vertical tap
  1002. paddw xmm3, xmm7
  1003. movdqa xmm7, xmm4 ; current row becomes the previous row
  1004. paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
  1005. psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
  1006. packuswb xmm3, xmm0 ; 8 result bytes in the low half, zeros above
  1007. movq [rdi], xmm3 ; store the results in the destination
  1008. add rsp, 16 ; next line
  1009. add rdi, rdx
  1010. cmp rdi, rcx ; stop once rdi == dst_ptr + 8*dst_pitch
  1011. jne .next_row8x8
        ; rsp has already advanced 16 (first row) + 8*16 (loop) = 144 bytes,
        ; which releases the scratch buffer; hence no explicit add here.
  1012. ;add rsp, 144
  1013. pop rsp
  1014. ; begin epilog
  1015. pop rdi
  1016. pop rsi
  1017. RESTORE_GOT
  1018. RESTORE_XMM
  1019. UNSHADOW_ARGS
  1020. pop rbp
  1021. ret
  1022. SECTION_RODATA
        ; rd: rounding term added to each filtered sum before the
        ; VP8_FILTER_SHIFT right shift in the bilinear predictors above.
        ; Eight 16-bit words of 0x40 (64), one per lane of an XMM register.
        ; It is used directly as a paddw memory operand, which requires
        ; 16-byte alignment - hence the align directive.
  1023. align 16
  1024. rd:
  1025. times 8 dw 0x40