subpixel_ssse3.asm 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. %define BLOCK_HEIGHT_WIDTH 4
  12. %define VP8_FILTER_WEIGHT 128
  13. %define VP8_FILTER_SHIFT 7
  14. SECTION .text
  15. ;/************************************************************************************
  16. ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
  17. ; input pixel array has output_height rows. This routine assumes that output_height is an
  18. ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
  19. ; row each iteration to take advantage of the 128-bit operations.
  20. ;
  21. ; This is an implementation of some of the SSE optimizations first seen in ffvp8
  22. ;
  23. ;*************************************************************************************/
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d8_h6_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pixels_per_line,
  ;     unsigned char *output_ptr,
  ;     unsigned int   output_pitch,
  ;     unsigned int   output_height,
  ;     unsigned int   vp8_filter_index
  ; )
  ;
  ; Horizontal filter that writes 8 output bytes per row, one row per loop
  ; iteration. vp8_filter_index * 16 selects a 16-byte entry in k0_k5; the
  ; matching k1_k3 and k2_k4 entries are read at +128 and +256 from it.
  ; When the first dword of the k0_k5 entry is zero the outer-tap multiply
  ; is skipped and the 4-tap loop below is used instead.
  ;
  ; Register roles inside both loops:
  ;   rsi = current source row       rax  = src_pixels_per_line
  ;   rdi = current output row       rdx  = output_pitch
  ;   rcx = rows still to produce    xmm7 = rd (round value)
  ;
  ; The 4-tap entry point is a local label (like the sibling functions in
  ; this file) so no extra non-local symbol is created mid-function.
  ;
  ; NOTE(review): output_height == 0 is not checked; both loops are
  ; dec/jnz shaped, so a zero height would not terminate promptly.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
  sym(vp8_filter_block1d8_h6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ;esi = 0 for the tap test below
      shl         rdx, 4                          ;16 bytes per table entry

      movdqa      xmm7, [GLOBAL(rd)]

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx
      mov         rdi, arg(2)                     ;output_ptr

      cmp         esi, DWORD PTR [rax]            ;outer taps all zero?
      je          .filter_block1d8_h4_ssse3       ;yes: 4-tap path

      movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

      sub         rdi, rdx                        ;loop advances rdi before storing
      ;xmm3 free

  .filter_block1d8_h6_rowloop_ssse3:
      movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
      movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
      punpcklbw   xmm0, xmm2                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

      movdqa      xmm1, xmm0
      pmaddubsw   xmm0, xmm4                      ;k0_k5 products

      movdqa      xmm2, xmm1
      pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
      pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
      pmaddubsw   xmm1, xmm5                      ;k2_k4 products

      lea         rdi, [rdi + rdx]                ;next output row
      pmaddubsw   xmm2, xmm6                      ;k1_k3 products

      lea         rsi, [rsi + rax]                ;next source row
      dec         rcx                             ;ZF consumed by jnz below; the
                                                  ;SSE ops in between leave flags alone

      paddsw      xmm0, xmm1
      paddsw      xmm2, xmm7                      ;+ round value
      paddsw      xmm0, xmm2
      psraw       xmm0, VP8_FILTER_SHIFT
      packuswb    xmm0, xmm0                      ;saturate to unsigned bytes

      movq        MMWORD Ptr [rdi], xmm0          ;store 8 pixels
      jnz         .filter_block1d8_h6_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path: rax still points at the k0_k5 entry, rdi holds output_ptr
  ; and xmm7 holds rd, all set up before the branch above.
  .filter_block1d8_h4_ssse3:
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
      movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

      sub         rdi, rdx                        ;loop advances rdi before storing

  .filter_block1d8_h4_rowloop_ssse3:
      movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
      movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
      punpcklbw   xmm0, xmm1                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

      movdqa      xmm2, xmm0
      pshufb      xmm0, xmm3
      pshufb      xmm2, xmm4
      pmaddubsw   xmm0, xmm5                      ;k2_k4 products

      lea         rdi, [rdi + rdx]                ;next output row
      pmaddubsw   xmm2, xmm6                      ;k1_k3 products

      lea         rsi, [rsi + rax]                ;next source row
      dec         rcx                             ;ZF consumed by jnz below

      paddsw      xmm0, xmm7                      ;+ round value
      paddsw      xmm0, xmm2
      psraw       xmm0, VP8_FILTER_SHIFT
      packuswb    xmm0, xmm0                      ;saturate to unsigned bytes

      movq        MMWORD Ptr [rdi], xmm0          ;store 8 pixels
      jnz         .filter_block1d8_h4_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d16_h6_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pixels_per_line,
  ;     unsigned char *output_ptr,
  ;     unsigned int   output_pitch,
  ;     unsigned int   output_height,
  ;     unsigned int   vp8_filter_index
  ; )
  ;
  ; Horizontal 6-tap filter that writes 16 output bytes per row. Each row
  ; is computed as two 8-pixel halves (source offsets -2/+3 and +6/+11)
  ; which are packed and joined into one 16-byte store.
  ;
  ; Unlike the 8- and 4-wide horizontal routines in this file there is no
  ; 4-tap shortcut here, so no zeroed compare register is needed.
  ;
  ; Register roles inside the loop:
  ;   rsi = current source row       rax = src_pixels_per_line
  ;   rdi = current output row       rdx = output_pitch
  ;   rcx = rows still to produce
  ;   xmm4 = k0_k5, xmm5 = k2_k4, xmm6 = k1_k3
  ;
  ; The row store uses movdqa, so every output row address must be
  ; 16-byte aligned.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
  sym(vp8_filter_block1d16_h6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      shl         rdx, 4                          ;16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      mov         rdi, arg(2)                     ;output_ptr
      mov         rsi, arg(0)                     ;src_ptr

      movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

  .filter_block1d16_h6_rowloop_ssse3:
      ; ---- left 8 pixels ----
      movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
      movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
      punpcklbw   xmm0, xmm3                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

      movdqa      xmm1, xmm0
      pmaddubsw   xmm0, xmm4                      ;k0_k5 products

      movdqa      xmm2, xmm1
      pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
      pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

      ; ---- right 8 pixels are loaded while the left half finishes ----
      movq        xmm3, MMWORD PTR [rsi + 6]
      pmaddubsw   xmm1, xmm5                      ;k2_k4 products (left)
      movq        xmm7, MMWORD PTR [rsi + 11]
      pmaddubsw   xmm2, xmm6                      ;k1_k3 products (left)

      punpcklbw   xmm3, xmm7

      paddsw      xmm0, xmm1
      movdqa      xmm1, xmm3
      pmaddubsw   xmm3, xmm4                      ;k0_k5 products (right)

      paddsw      xmm0, xmm2
      movdqa      xmm2, xmm1
      paddsw      xmm0, [GLOBAL(rd)]              ;+ round value

      pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
      pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

      psraw       xmm0, VP8_FILTER_SHIFT
      pmaddubsw   xmm1, xmm5                      ;k2_k4 products (right)
      pmaddubsw   xmm2, xmm6                      ;k1_k3 products (right)

      packuswb    xmm0, xmm0                      ;left half as bytes
      lea         rsi, [rsi + rax]                ;next source row

      paddsw      xmm3, xmm1
      paddsw      xmm3, xmm2
      paddsw      xmm3, [GLOBAL(rd)]              ;+ round value
      psraw       xmm3, VP8_FILTER_SHIFT
      packuswb    xmm3, xmm3                      ;right half as bytes

      punpcklqdq  xmm0, xmm3                      ;left | right

      movdqa      XMMWORD Ptr [rdi], xmm0         ;store 16 pixels (aligned)

      lea         rdi, [rdi + rdx]                ;next output row
      dec         rcx
      jnz         .filter_block1d16_h6_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d4_h6_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pixels_per_line,
  ;     unsigned char *output_ptr,
  ;     unsigned int   output_pitch,
  ;     unsigned int   output_height,
  ;     unsigned int   vp8_filter_index
  ; )
  ;
  ; Horizontal filter that writes 4 output bytes per row. The coefficient
  ; entry is selected by vp8_filter_index * 16 from k0_k5 (k1_k3 at +128,
  ; k2_k4 at +256). If the first dword of the k0_k5 entry is zero the
  ; 4-tap loop is used and the k0_k5 multiply is skipped.
  ;
  ; Register roles inside both loops:
  ;   rsi = current source row       rax  = src_pixels_per_line
  ;   rdi = current output row       rdx  = output_pitch
  ;   rcx = rows still to produce    xmm7 = rd (round value)
  ;
  ; Each row is fetched with one unaligned 16-byte load starting at
  ; src - 2, regardless of the 4-pixel output width.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
  sym(vp8_filter_block1d4_h6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ;esi = 0 for the tap test below
      shl         rdx, 4                          ;16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx
      movdqa      xmm7, [GLOBAL(rd)]

      cmp         esi, DWORD PTR [rax]            ;outer taps all zero?
      je          .vp8_filter_block1d4_h4_ssse3   ;yes: 4-tap path

      movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      mov         rdi, arg(2)                     ;output_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch
      ;xmm3 free

  .filter_block1d4_h6_rowloop_ssse3:
      movdqu      xmm0, XMMWORD PTR [rsi - 2]

      movdqa      xmm1, xmm0
      pshufb      xmm0, [GLOBAL(shuf1b)]

      movdqa      xmm2, xmm1
      pshufb      xmm1, [GLOBAL(shuf2b)]
      pmaddubsw   xmm0, xmm4                      ;k0_k5 products
      pshufb      xmm2, [GLOBAL(shuf3b)]
      pmaddubsw   xmm1, xmm5                      ;k2_k4 products

      pmaddubsw   xmm2, xmm6                      ;k1_k3 products
      lea         rsi, [rsi + rax]                ;next source row

      paddsw      xmm0, xmm1
      paddsw      xmm0, xmm7                      ;+ round value
      paddsw      xmm0, xmm2
      psraw       xmm0, VP8_FILTER_SHIFT
      packuswb    xmm0, xmm0                      ;saturate to unsigned bytes

      movd        DWORD PTR [rdi], xmm0           ;store 4 pixels

      add         rdi, rdx                        ;next output row
      dec         rcx
      jnz         .filter_block1d4_h6_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path: rax still points at the k0_k5 entry and xmm7 holds rd.
  .vp8_filter_block1d4_h4_ssse3:
      movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
      movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
      movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

      mov         rsi, arg(0)                     ;src_ptr
      mov         rdi, arg(2)                     ;output_ptr
      movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
      movsxd      rcx, dword ptr arg(4)           ;output_height
      movsxd      rdx, dword ptr arg(3)           ;output_pitch

  .filter_block1d4_h4_rowloop_ssse3:
      movdqu      xmm1, XMMWORD PTR [rsi - 2]

      movdqa      xmm2, xmm1
      pshufb      xmm1, xmm0                      ;shuf2b kept in a register
      pshufb      xmm2, xmm3                      ;shuf3b kept in a register
      pmaddubsw   xmm1, xmm5                      ;k2_k4 products

      pmaddubsw   xmm2, xmm6                      ;k1_k3 products
      lea         rsi, [rsi + rax]                ;next source row

      paddsw      xmm1, xmm7                      ;+ round value
      paddsw      xmm1, xmm2
      psraw       xmm1, VP8_FILTER_SHIFT
      packuswb    xmm1, xmm1                      ;saturate to unsigned bytes

      movd        DWORD PTR [rdi], xmm1           ;store 4 pixels

      add         rdi, rdx                        ;next output row
      dec         rcx
      jnz         .filter_block1d4_h4_rowloop_ssse3

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d16_v6_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pitch,
  ;     unsigned char *output_ptr,
  ;     unsigned int   out_pitch,
  ;     unsigned int   output_height,
  ;     unsigned int   vp8_filter_index
  ; )
  ;
  ; Vertical filter over a 16-pixel-wide column, one output row per loop
  ; iteration. Rows A..F are read at src + 0..5 * src_pitch:
  ;   rsi -> row A,  rax = rsi + pitch, so [rax + 2*rdx] is row D and
  ;   [rax + 4*rdx] is row F.
  ; Row pairs (A,F), (B,D) and (C,E) are interleaved and multiplied by
  ; k0_k5, k1_k3 and k2_k4 respectively. If the first dword of the k0_k5
  ; entry is zero the 4-tap loop (rows B..E only) is used.
  ;
  ; out_pitch lives in r8 on 64-bit builds and is re-read from the stack
  ; argument on 32-bit builds.
  ;
  ; The 6-tap loop stores each row as two 8-byte halves; the 4-tap loop
  ; stores one 16-byte row with movdqa and therefore needs every output
  ; row address to be 16-byte aligned.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
  sym(vp8_filter_block1d16_v6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ;esi = 0 for the tap test below
      shl         rdx, 4                          ;16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      cmp         esi, DWORD PTR [rax]            ;outer taps all zero?
      je          .four_tap

      movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr

  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ;out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;output_height
      lea         rax, [rsi + rdx]                ;rax = src + pitch (odd rows)

  .six_tap_row:
      ; ---- columns 0..7 ----
      movq        xmm1, MMWORD PTR [rsi]              ;A
      movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

      punpcklbw   xmm2, xmm4                          ;B D
      punpcklbw   xmm3, xmm0                          ;C E

      movq        xmm0, MMWORD PTR [rax + rdx * 4]    ;F

      pmaddubsw   xmm3, xmm6
      punpcklbw   xmm1, xmm0                          ;A F
      pmaddubsw   xmm2, xmm7
      pmaddubsw   xmm1, xmm5

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm1
      paddsw      xmm2, [GLOBAL(rd)]                  ;+ round value
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi], xmm2              ;store the results

      ; ---- columns 8..15 ----
      movq        xmm1, MMWORD PTR [rsi + 8]              ;A
      movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

      punpcklbw   xmm2, xmm4                              ;B D
      punpcklbw   xmm3, xmm0                              ;C E

      movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
      pmaddubsw   xmm3, xmm6
      punpcklbw   xmm1, xmm0                              ;A F
      pmaddubsw   xmm2, xmm7
      pmaddubsw   xmm1, xmm5

      add         rsi, rdx                        ;slide the tap window down
      add         rax, rdx

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm1
      paddsw      xmm2, [GLOBAL(rd)]              ;+ round value
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi+8], xmm2

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)           ;out_pitch
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .six_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path: rax still points at the k0_k5 entry selected above.
  .four_tap:
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr

  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ;out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;output_height
      lea         rax, [rsi + rdx]                ;rax = src + pitch (odd rows)

  .four_tap_row:
      ; ---- columns 0..7 ----
      movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

      punpcklbw   xmm2, xmm4                          ;B D
      punpcklbw   xmm3, xmm0                          ;C E

      pmaddubsw   xmm3, xmm6
      pmaddubsw   xmm2, xmm7

      ; ---- columns 8..15 are loaded while the first half finishes ----
      movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
      movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

      paddsw      xmm2, [GLOBAL(rd)]              ;+ round value
      paddsw      xmm2, xmm3
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      punpcklbw   xmm5, xmm4                      ;B D
      punpcklbw   xmm1, xmm0                      ;C E

      pmaddubsw   xmm1, xmm6
      pmaddubsw   xmm5, xmm7

      movdqa      xmm4, [GLOBAL(rd)]
      add         rsi, rdx                        ;slide the tap window down
      add         rax, rdx

      paddsw      xmm5, xmm1
      paddsw      xmm5, xmm4                      ;+ round value
      psraw       xmm5, VP8_FILTER_SHIFT
      packuswb    xmm5, xmm5

      punpcklqdq  xmm2, xmm5                      ;columns 0..7 | 8..15

      movdqa      XMMWORD PTR [rdi], xmm2         ;store 16 pixels (aligned)

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)           ;out_pitch
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .four_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d8_v6_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pitch,
  ;     unsigned char *output_ptr,
  ;     unsigned int   out_pitch,
  ;     unsigned int   output_height,
  ;     unsigned int   vp8_filter_index
  ; )
  ;
  ; Vertical filter over an 8-pixel-wide column, one output row per loop
  ; iteration. Rows A..F are read at src + 0..5 * src_pitch using
  ;   rsi -> row A  and  rax = rsi + pitch (rows D and F via rax).
  ; If the first dword of the selected k0_k5 entry is zero, rows A and F
  ; are skipped and only the 4-tap sum over B..E is computed.
  ;
  ; pitch (rdx), output_ptr (rdi), out_pitch (r8, 64-bit only) and the row
  ; count (rcx) are loaded before the tap test and shared by both paths.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
  sym(vp8_filter_block1d8_v6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ;esi = 0 for the tap test below
      shl         rdx, 4                          ;16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr
  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ; out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;[output_height]

      cmp         esi, DWORD PTR [rax]            ;outer taps all zero?
      je          .four_tap

      movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ;rax = src + pitch (odd rows)

  .six_tap_row:
      movq        xmm1, MMWORD PTR [rsi]              ;A
      movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

      punpcklbw   xmm2, xmm4                          ;B D
      punpcklbw   xmm3, xmm0                          ;C E

      movq        xmm0, MMWORD PTR [rax + rdx * 4]    ;F
      movdqa      xmm4, [GLOBAL(rd)]                  ;xmm4 is free again: round value

      pmaddubsw   xmm3, xmm6
      punpcklbw   xmm1, xmm0                          ;A F
      pmaddubsw   xmm2, xmm7
      pmaddubsw   xmm1, xmm5

      add         rsi, rdx                        ;slide the tap window down
      add         rax, rdx

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm1
      paddsw      xmm2, xmm4                      ;+ round value
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi], xmm2          ;store 8 pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)           ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .six_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path: rax still points at the k0_k5 entry; rdx, rdi, rcx (and
  ; r8 on 64-bit) were loaded before the branch.
  .four_tap:
      movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
      movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
      movdqa      xmm5, [GLOBAL(rd)]              ;round value stays in a register

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ;rax = src + pitch (odd rows)

  .four_tap_row:
      movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
      movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
      movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
      movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

      punpcklbw   xmm2, xmm4                          ;B D
      punpcklbw   xmm3, xmm0                          ;C E

      pmaddubsw   xmm3, xmm6
      pmaddubsw   xmm2, xmm7

      add         rsi, rdx                        ;slide the tap window down
      add         rax, rdx

      paddsw      xmm2, xmm3
      paddsw      xmm2, xmm5                      ;+ round value
      psraw       xmm2, VP8_FILTER_SHIFT
      packuswb    xmm2, xmm2

      movq        MMWORD PTR [rdi], xmm2          ;store 8 pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)           ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .four_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d4_v6_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     unsigned int   src_pitch,
  ;     unsigned char *output_ptr,
  ;     unsigned int   out_pitch,
  ;     unsigned int   output_height,
  ;     unsigned int   vp8_filter_index
  ; )
  ;
  ; Vertical filter over a 4-pixel-wide column, one output row per loop
  ; iteration, using the 64-bit MMX registers (mm0-mm7) with the SSSE3
  ; pmaddubsw instruction. Only the low 8 bytes of each coefficient table
  ; entry are loaded. Rows A..F are read at src + 0..5 * src_pitch using
  ;   rsi -> row A  and  rax = rsi + pitch (rows D and F via rax).
  ; If the first dword of the selected k0_k5 entry is zero the 4-tap loop
  ; over rows B..E is used.
  ;
  ; No xmm registers are touched, so there is no SAVE_XMM / RESTORE_XMM.
  ;
  ; NOTE(review): no emms is issued before returning; presumably callers
  ; clear the MMX state before running x87 code -- confirm.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
  sym(vp8_filter_block1d4_v6_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      movsxd      rdx, DWORD PTR arg(5)           ;table index
      xor         rsi, rsi                        ;esi = 0 for the tap test below
      shl         rdx, 4                          ;16 bytes per table entry

      lea         rax, [GLOBAL(k0_k5)]
      add         rax, rdx

      movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
      mov         rdi, arg(2)                     ;output_ptr
  %if ABI_IS_32BIT=0
      movsxd      r8, DWORD PTR arg(3)            ; out_pitch
  %endif
      movsxd      rcx, DWORD PTR arg(4)           ;[output_height]

      cmp         esi, DWORD PTR [rax]            ;outer taps all zero?
      je          .four_tap

      movq        mm5, MMWORD PTR [rax]           ;k0_k5
      movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
      movq        mm7, MMWORD PTR [rax+128]       ;k1_k3

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ;rax = src + pitch (odd rows)

  .six_tap_row:
      movd        mm1, DWORD PTR [rsi]                ;A
      movd        mm2, DWORD PTR [rsi + rdx]          ;B
      movd        mm3, DWORD PTR [rsi + rdx * 2]      ;C
      movd        mm4, DWORD PTR [rax + rdx * 2]      ;D
      movd        mm0, DWORD PTR [rsi + rdx * 4]      ;E

      punpcklbw   mm2, mm4                            ;B D
      punpcklbw   mm3, mm0                            ;C E

      movd        mm0, DWORD PTR [rax + rdx * 4]      ;F
      movq        mm4, [GLOBAL(rd)]                   ;mm4 is free again: round value

      pmaddubsw   mm3, mm6
      punpcklbw   mm1, mm0                            ;A F
      pmaddubsw   mm2, mm7
      pmaddubsw   mm1, mm5

      add         rsi, rdx                        ;slide the tap window down
      add         rax, rdx

      paddsw      mm2, mm3
      paddsw      mm2, mm1
      paddsw      mm2, mm4                        ;+ round value
      psraw       mm2, VP8_FILTER_SHIFT
      packuswb    mm2, mm2

      movd        DWORD PTR [rdi], mm2            ;store 4 pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)           ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .six_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret

  ; 4-tap path: rax still points at the k0_k5 entry; rdx, rdi, rcx (and
  ; r8 on 64-bit) were loaded before the branch.
  .four_tap:
      movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
      movq        mm7, MMWORD PTR [rax+128]       ;k1_k3
      movq        mm5, MMWORD PTR [GLOBAL(rd)]    ;round value stays in a register

      mov         rsi, arg(0)                     ;src_ptr
      lea         rax, [rsi + rdx]                ;rax = src + pitch (odd rows)

  .four_tap_row:
      movd        mm2, DWORD PTR [rsi + rdx]          ;B
      movd        mm3, DWORD PTR [rsi + rdx * 2]      ;C
      movd        mm4, DWORD PTR [rax + rdx * 2]      ;D
      movd        mm0, DWORD PTR [rsi + rdx * 4]      ;E

      punpcklbw   mm2, mm4                            ;B D
      punpcklbw   mm3, mm0                            ;C E

      pmaddubsw   mm3, mm6
      pmaddubsw   mm2, mm7

      add         rsi, rdx                        ;slide the tap window down
      add         rax, rdx

      paddsw      mm2, mm3
      paddsw      mm2, mm5                        ;+ round value
      psraw       mm2, VP8_FILTER_SHIFT
      packuswb    mm2, mm2

      movd        DWORD PTR [rdi], mm2            ;store 4 pixels

  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(3)           ;[out_pitch]
  %else
      add         rdi, r8
  %endif
      dec         rcx
      jnz         .four_tap_row

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_bilinear_predict16x16_ssse3
  ; (
  ;     unsigned char *src_ptr,
  ;     int            src_pixels_per_line,
  ;     int            xoffset,
  ;     int            yoffset,
  ;     unsigned char *dst_ptr,
  ;     int            dst_pitch
  ; )
  ;
  ; 16x16 bilinear prediction. xoffset and yoffset each select a 16-byte
  ; entry (offset * 16) in vp8_bilinear_filters_ssse3. Three code paths:
  ;   * both offsets non-zero : horizontal pass, then vertical pass on the
  ;                             previous/current filtered rows (.both_row)
  ;   * xoffset == 0          : vertical pass only, two rows per iteration
  ;                             (.vert_only); yoffset is not tested here
  ;   * yoffset == 0          : horizontal pass only, two rows per
  ;                             iteration (.horiz_only)
  ; In every path rcx = dst_ptr + 16 * dst_pitch marks the end of output
  ; and the loop runs until rdi reaches it.
  ;
  ; All row stores use movdqa, so every destination row address must be
  ; 16-byte aligned.
  ;-----------------------------------------------------------------------
  global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
  sym(vp8_bilinear_predict16x16_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
      movsxd      rax, dword ptr arg(2)           ; xoffset

      test        rax, rax                        ; xoffset == 0: no horizontal pass
      je          .vert_only

      shl         rax, 4
      lea         rax, [rax + rcx]                ; HFilter

      mov         rdi, arg(4)                     ; dst_ptr
      mov         rsi, arg(0)                     ; src_ptr
      movsxd      rdx, dword ptr arg(5)           ; dst_pitch

      movdqa      xmm1, [rax]                     ; xmm1 = horizontal taps

      movsxd      rax, dword ptr arg(3)           ; yoffset

      test        rax, rax                        ; yoffset == 0: no vertical pass
      je          .horiz_only

      shl         rax, 4
      lea         rax, [rax + rcx]                ; VFilter

      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]                ; rcx = dst + 16 * dst_pitch
      movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line

      movdqa      xmm2, [rax]                     ; xmm2 = vertical taps

  %if ABI_IS_32BIT=0
      movsxd      r8, dword ptr arg(5)            ; dst_pitch
  %endif

      ; Horizontally filter source row 0; the packed result is kept in
      ; xmm7 as the "previous row" for the vertical pass.
      movq        xmm3, [rsi]                     ; 00 01 02 03 04 05 06 07
      movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

      punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
      movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

      movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16

      lea         rsi, [rsi + rdx]                ; next line

      pmaddubsw   xmm3, xmm1                      ; filtered pixels 0..7 (words)

      punpcklbw   xmm4, xmm5                      ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
      pmaddubsw   xmm4, xmm1                      ; filtered pixels 8..15 (words)

      paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
      psraw       xmm3, VP8_FILTER_SHIFT          ; xmm3 /= 128

      paddw       xmm4, [GLOBAL(rd)]              ; xmm4 += round value
      psraw       xmm4, VP8_FILTER_SHIFT          ; xmm4 /= 128

      movdqa      xmm7, xmm3
      packuswb    xmm7, xmm4                      ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

  .both_row:
      ; horizontal pass on the current source row -> xmm6
      movq        xmm6, [rsi]                     ; 00 01 02 03 04 05 06 07
      movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

      punpcklbw   xmm6, xmm5
      movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

      movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16
      lea         rsi, [rsi + rdx]                ; next line

      pmaddubsw   xmm6, xmm1

      punpcklbw   xmm4, xmm5
      pmaddubsw   xmm4, xmm1

      paddw       xmm6, [GLOBAL(rd)]              ; xmm6 += round value
      psraw       xmm6, VP8_FILTER_SHIFT          ; xmm6 /= 128

      paddw       xmm4, [GLOBAL(rd)]              ; xmm4 += round value
      psraw       xmm4, VP8_FILTER_SHIFT          ; xmm4 /= 128

      packuswb    xmm6, xmm4

      ; vertical pass: interleave previous (xmm7) and current (xmm6) rows
      movdqa      xmm5, xmm7

      punpcklbw   xmm5, xmm6
      pmaddubsw   xmm5, xmm2

      punpckhbw   xmm7, xmm6
      pmaddubsw   xmm7, xmm2

      paddw       xmm5, [GLOBAL(rd)]              ; xmm5 += round value
      psraw       xmm5, VP8_FILTER_SHIFT          ; xmm5 /= 128

      paddw       xmm7, [GLOBAL(rd)]              ; xmm7 += round value
      psraw       xmm7, VP8_FILTER_SHIFT          ; xmm7 /= 128

      packuswb    xmm5, xmm7
      movdqa      xmm7, xmm6                      ; current row becomes previous row

      movdqa      [rdi], xmm5                     ; store the results in the destination
  %if ABI_IS_32BIT
      add         rdi, DWORD PTR arg(5)           ; dst_pitch
  %else
      add         rdi, r8
  %endif

      cmp         rdi, rcx
      jne         .both_row

      jmp         .finish

  ; xoffset == 0: rcx still holds the filter table base.
  .vert_only:
      movsxd      rax, dword ptr arg(3)           ; yoffset
      shl         rax, 4
      lea         rax, [rax + rcx]                ; VFilter

      mov         rdi, arg(4)                     ; dst_ptr
      mov         rsi, arg(0)                     ; src_ptr
      movsxd      rdx, dword ptr arg(5)           ; dst_pitch

      movdqa      xmm1, [rax]                     ; VFilter

      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]                ; rcx = dst + 16 * dst_pitch
      movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line

      ; prime the loop with source row 0 (left half in xmm4, right in xmm2)
      movq        xmm4, [rsi]                     ; load row 0
      movq        xmm2, [rsi + 8]                 ; load row 0

      lea         rsi, [rsi + rax]                ; next line

  .vert_row_pair:
      movq        xmm3, [rsi]                     ; load row + 1
      movq        xmm5, [rsi + 8]                 ; load row + 1

      punpcklbw   xmm4, xmm3                      ; row / row+1, left half
      punpcklbw   xmm2, xmm5                      ; row / row+1, right half

      pmaddubsw   xmm4, xmm1
      movq        xmm7, [rsi + rax]               ; load row + 2

      pmaddubsw   xmm2, xmm1
      movq        xmm6, [rsi + rax + 8]           ; load row + 2

      punpcklbw   xmm3, xmm7                      ; row+1 / row+2, left half
      punpcklbw   xmm5, xmm6                      ; row+1 / row+2, right half

      pmaddubsw   xmm3, xmm1
      paddw       xmm4, [GLOBAL(rd)]

      pmaddubsw   xmm5, xmm1
      paddw       xmm2, [GLOBAL(rd)]

      psraw       xmm4, VP8_FILTER_SHIFT
      psraw       xmm2, VP8_FILTER_SHIFT

      packuswb    xmm4, xmm2
      paddw       xmm3, [GLOBAL(rd)]

      movdqa      [rdi], xmm4                     ; store row 0
      paddw       xmm5, [GLOBAL(rd)]

      psraw       xmm3, VP8_FILTER_SHIFT
      psraw       xmm5, VP8_FILTER_SHIFT

      packuswb    xmm3, xmm5
      movdqa      xmm4, xmm7                      ; row+2 primes the next iteration

      movdqa      [rdi + rdx],xmm3                ; store row 1
      lea         rsi, [rsi + 2*rax]

      movdqa      xmm2, xmm6
      lea         rdi, [rdi + 2*rdx]

      cmp         rdi, rcx
      jne         .vert_row_pair

      jmp         .finish

  ; yoffset == 0: rdi, rsi, rdx (dst_pitch) and xmm1 (HFilter) are set.
  .horiz_only:
      lea         rcx, [rdi+rdx*8]
      lea         rcx, [rcx+rdx*8]                ; rcx = dst + 16 * dst_pitch
      movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line

  .horiz_row_pair:
      ; first row of the pair -> xmm2 / xmm3
      movq        xmm2, [rsi]                     ; 00 01 02 03 04 05 06 07
      movq        xmm4, [rsi+1]                   ; 01 02 03 04 05 06 07 08

      punpcklbw   xmm2, xmm4
      movq        xmm3, [rsi+8]                   ; 08 09 10 11 12 13 14 15

      pmaddubsw   xmm2, xmm1
      movq        xmm4, [rsi+9]                   ; 09 10 11 12 13 14 15 16

      lea         rsi, [rsi + rax]                ; next line
      punpcklbw   xmm3, xmm4

      pmaddubsw   xmm3, xmm1

      ; second row of the pair -> xmm5 / xmm6, interleaved with the first
      movq        xmm5, [rsi]

      paddw       xmm2, [GLOBAL(rd)]
      movq        xmm7, [rsi+1]

      movq        xmm6, [rsi+8]
      psraw       xmm2, VP8_FILTER_SHIFT

      punpcklbw   xmm5, xmm7
      movq        xmm7, [rsi+9]

      paddw       xmm3, [GLOBAL(rd)]
      pmaddubsw   xmm5, xmm1

      psraw       xmm3, VP8_FILTER_SHIFT
      punpcklbw   xmm6, xmm7

      packuswb    xmm2, xmm3
      pmaddubsw   xmm6, xmm1

      movdqa      [rdi], xmm2                     ; store the results in the destination
      paddw       xmm5, [GLOBAL(rd)]

      lea         rdi, [rdi + rdx]                ; dst_pitch
      psraw       xmm5, VP8_FILTER_SHIFT

      paddw       xmm6, [GLOBAL(rd)]
      psraw       xmm6, VP8_FILTER_SHIFT

      packuswb    xmm5, xmm6
      lea         rsi, [rsi + rax]                ; next line

      movdqa      [rdi], xmm5                     ; store the results in the destination
      lea         rdi, [rdi + rdx]                ; dst_pitch

      cmp         rdi, rcx
      jne         .horiz_row_pair

  .finish:
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ;void vp8_bilinear_predict8x8_ssse3
  ;(
  ;    unsigned char *src_ptr,
  ;    int            src_pixels_per_line,
  ;    int            xoffset,
  ;    int            yoffset,
  ;    unsigned char *dst_ptr,
  ;    int            dst_pitch
  ;)
  ;
  ; Writes an 8x8 block (8 rows of 8 bytes, rows dst_pitch apart) to dst_ptr.
  ; Each output pixel is a 2-tap horizontal and/or 2-tap vertical blend of
  ; source pixels, with the tap pairs taken from vp8_bilinear_filters_ssse3
  ; at byte offset (xoffset * 16) / (yoffset * 16).
  ;
  ; Source access: 16 bytes are read (movdqu) from each of 9 consecutive
  ; source rows and parked in a 144-byte, 16-byte-aligned stack buffer.
  ;
  ; Paths:
  ;   xoffset != 0, yoffset != 0 : horizontal pass then vertical pass
  ;   xoffset != 0, yoffset == 0 : horizontal pass only  (.b8x8_fp_only)
  ;   xoffset == 0, yoffset != 0 : vertical pass only    (.b8x8_sp_only)
  ;   xoffset == 0, yoffset == 0 : straight copy         (.b8x8_copy)
  ;
  ; The copy path exists because filter row 0 is (128, 0) and pmaddubsw
  ; treats the filter bytes as signed, i.e. 128 is read as -128; running
  ; that row through the filter code saturates every output byte to 0.
  ;
  ; Register roles:
  ;   rsi  = source cursor (only while filling the stack buffer)
  ;   rdx  = src_pixels_per_line while loading, dst_pitch afterwards
  ;   rcx  = filter table base, later the end-of-destination pointer
  ;   rdi  = destination cursor
  ;   rsp  = read cursor into the stack buffer (see invariant below)
  ;   xmm0 = horizontal taps (vertical taps in .b8x8_sp_only)
  ;   xmm1 = vertical taps in the two-pass path
  ;
  ; Invariant: rsp is used as the buffer cursor, so EVERY path must have
  ; advanced rsp by exactly 144 bytes when it reaches .done8x8, where
  ; `pop rsp` is executed.
  ; NOTE(review): ALIGN_STACK is defined outside this file view; the
  ; `pop rsp` implies it leaves the pre-alignment rsp on top of the stack.
  ;-----------------------------------------------------------------------
  global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
  sym(vp8_bilinear_predict8x8_ssse3):
          push        rbp
          mov         rbp,        rsp
          SHADOW_ARGS_TO_STACK 6
          SAVE_XMM 7
          GET_GOT     rbx
          push        rsi
          push        rdi
          ; end prolog

          ALIGN_STACK 16, rax
          sub         rsp,        144                 ; 9 rows * 16 bytes

          lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

          mov         rsi,        arg(0)              ; src_ptr
          movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

          ; Read 9 lines of unaligned data and put them on the stack so
          ; that every later access is an aligned load. This gives a big
          ; performance boost.
          movdqu      xmm0,       [rsi]
          lea         rax,        [rdx + rdx*2]       ; rax = 3 * stride
          movdqu      xmm1,       [rsi+rdx]
          movdqu      xmm2,       [rsi+rdx*2]
          add         rsi,        rax
          movdqu      xmm3,       [rsi]
          movdqu      xmm4,       [rsi+rdx]
          movdqu      xmm5,       [rsi+rdx*2]
          add         rsi,        rax
          movdqu      xmm6,       [rsi]
          movdqu      xmm7,       [rsi+rdx]

          movdqa      XMMWORD PTR [rsp],      xmm0
          movdqu      xmm0,       [rsi+rdx*2]         ; 9th row reuses xmm0

          movdqa      XMMWORD PTR [rsp+16],   xmm1
          movdqa      XMMWORD PTR [rsp+32],   xmm2
          movdqa      XMMWORD PTR [rsp+48],   xmm3
          movdqa      XMMWORD PTR [rsp+64],   xmm4
          movdqa      XMMWORD PTR [rsp+80],   xmm5
          movdqa      XMMWORD PTR [rsp+96],   xmm6
          movdqa      XMMWORD PTR [rsp+112],  xmm7
          movdqa      XMMWORD PTR [rsp+128],  xmm0

          movsxd      rax,        dword ptr arg(2)    ; xoffset
          test        rax,        rax                 ; skip first_pass filter if xoffset=0
          jz          .b8x8_sp_only

          shl         rax,        4                   ; 16 bytes per filter row
          add         rax,        rcx                 ; HFilter

          mov         rdi,        arg(4)              ; dst_ptr
          movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

          movdqa      xmm0,       [rax]

          movsxd      rax,        dword ptr arg(3)    ; yoffset
          test        rax,        rax                 ; skip second_pass filter if yoffset=0
          jz          .b8x8_fp_only

          shl         rax,        4
          lea         rax,        [rax + rcx]         ; VFilter

          lea         rcx,        [rdi+rdx*8]         ; rcx = dst + 8 rows (loop end)

          movdqa      xmm1,       [rax]

          ; get the first horizontal line done
          movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
          movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

          psrldq      xmm5,       1
          lea         rsp,        [rsp + 16]          ; next line

          punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
          pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14

          paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
          psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

          movdqa      xmm7,       xmm3
          packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

  .next_row:
          ; xmm7 = previous horizontally filtered row (packed bytes).
          movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
          lea         rsp,        [rsp + 16]          ; next line

          movdqa      xmm5,       xmm6

          psrldq      xmm5,       1

          punpcklbw   xmm6,       xmm5
          pmaddubsw   xmm6,       xmm0

          paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
          psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

          packuswb    xmm6,       xmm6

          punpcklbw   xmm7,       xmm6                ; interleave previous / current row
          pmaddubsw   xmm7,       xmm1

          paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
          psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

          packuswb    xmm7,       xmm7

          movq        [rdi],      xmm7                ; store the results in the destination
          lea         rdi,        [rdi + rdx]

          movdqa      xmm7,       xmm6                ; current row becomes previous row

          cmp         rdi,        rcx
          jne         .next_row

          ; rsp has advanced 16 + 8*16 = 144 bytes here.
          jmp         .done8x8

  .b8x8_sp_only:
          ; xoffset == 0: vertical filter only (or a copy if yoffset == 0).
          mov         rdi,        arg(4)              ; dst_ptr
          movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

          movsxd      rax,        dword ptr arg(3)    ; yoffset
          test        rax,        rax                 ; both offsets zero -> plain copy
          jz          .b8x8_copy

          shl         rax,        4
          lea         rax,        [rax + rcx]         ; VFilter

          movdqa      xmm0,       [rax]               ; VFilter

          movq        xmm1,       XMMWORD PTR [rsp]
          movq        xmm2,       XMMWORD PTR [rsp+16]

          movq        xmm3,       XMMWORD PTR [rsp+32]
          punpcklbw   xmm1,       xmm2                ; rows 0/1 interleaved

          movq        xmm4,       XMMWORD PTR [rsp+48]
          punpcklbw   xmm2,       xmm3                ; rows 1/2

          movq        xmm5,       XMMWORD PTR [rsp+64]
          punpcklbw   xmm3,       xmm4                ; rows 2/3

          movq        xmm6,       XMMWORD PTR [rsp+80]
          punpcklbw   xmm4,       xmm5                ; rows 3/4

          movq        xmm7,       XMMWORD PTR [rsp+96]
          punpcklbw   xmm5,       xmm6                ; rows 4/5

          ; pmaddubsw treats the filter register (xmm0) as signed bytes.
          ; The only tap that does not fit in a signed byte is the 128 of
          ; filter row 0, which never reaches this point because
          ; yoffset == 0 was diverted to .b8x8_copy above.
          pmaddubsw   xmm1,       xmm0
          pmaddubsw   xmm2,       xmm0

          pmaddubsw   xmm3,       xmm0
          pmaddubsw   xmm4,       xmm0

          pmaddubsw   xmm5,       xmm0
          punpcklbw   xmm6,       xmm7                ; rows 5/6

          pmaddubsw   xmm6,       xmm0
          paddw       xmm1,       [GLOBAL(rd)]

          paddw       xmm2,       [GLOBAL(rd)]
          psraw       xmm1,       VP8_FILTER_SHIFT

          paddw       xmm3,       [GLOBAL(rd)]
          psraw       xmm2,       VP8_FILTER_SHIFT

          paddw       xmm4,       [GLOBAL(rd)]
          psraw       xmm3,       VP8_FILTER_SHIFT

          paddw       xmm5,       [GLOBAL(rd)]
          psraw       xmm4,       VP8_FILTER_SHIFT

          paddw       xmm6,       [GLOBAL(rd)]
          psraw       xmm5,       VP8_FILTER_SHIFT

          psraw       xmm6,       VP8_FILTER_SHIFT

          packuswb    xmm1,       xmm1

          packuswb    xmm2,       xmm2
          movq        [rdi],      xmm1                ; dst row 0

          packuswb    xmm3,       xmm3
          movq        [rdi+rdx],  xmm2                ; dst row 1

          packuswb    xmm4,       xmm4
          movq        xmm1,       XMMWORD PTR [rsp+112]

          lea         rdi,        [rdi + 2*rdx]
          movq        xmm2,       XMMWORD PTR [rsp+128]

          packuswb    xmm5,       xmm5
          movq        [rdi],      xmm3                ; dst row 2

          packuswb    xmm6,       xmm6
          movq        [rdi+rdx],  xmm4                ; dst row 3

          lea         rdi,        [rdi + 2*rdx]
          punpcklbw   xmm7,       xmm1                ; rows 6/7

          movq        [rdi],      xmm5                ; dst row 4
          pmaddubsw   xmm7,       xmm0

          movq        [rdi+rdx],  xmm6                ; dst row 5
          punpcklbw   xmm1,       xmm2                ; rows 7/8

          pmaddubsw   xmm1,       xmm0
          paddw       xmm7,       [GLOBAL(rd)]

          psraw       xmm7,       VP8_FILTER_SHIFT
          paddw       xmm1,       [GLOBAL(rd)]

          psraw       xmm1,       VP8_FILTER_SHIFT
          packuswb    xmm7,       xmm7

          packuswb    xmm1,       xmm1
          lea         rdi,        [rdi + 2*rdx]

          movq        [rdi],      xmm7                ; dst row 6

          movq        [rdi+rdx],  xmm1                ; dst row 7
          lea         rsp,        [rsp + 144]         ; skip the whole buffer

          jmp         .done8x8

  .b8x8_copy:
          ; xoffset == 0 && yoffset == 0: no filtering, copy the first 8
          ; buffered rows (8 bytes each) straight to the destination.
          ; rdi = dst_ptr and rdx = dst_pitch were set in .b8x8_sp_only.
          movq        xmm0,       XMMWORD PTR [rsp]
          movq        xmm1,       XMMWORD PTR [rsp+16]
          movq        xmm2,       XMMWORD PTR [rsp+32]
          movq        xmm3,       XMMWORD PTR [rsp+48]
          movq        xmm4,       XMMWORD PTR [rsp+64]
          movq        xmm5,       XMMWORD PTR [rsp+80]
          movq        xmm6,       XMMWORD PTR [rsp+96]
          movq        xmm7,       XMMWORD PTR [rsp+112]

          movq        [rdi],      xmm0                ; dst row 0
          movq        [rdi+rdx],  xmm1                ; dst row 1
          lea         rdi,        [rdi + 2*rdx]

          movq        [rdi],      xmm2                ; dst row 2
          movq        [rdi+rdx],  xmm3                ; dst row 3
          lea         rdi,        [rdi + 2*rdx]

          movq        [rdi],      xmm4                ; dst row 4
          movq        [rdi+rdx],  xmm5                ; dst row 5
          lea         rdi,        [rdi + 2*rdx]

          movq        [rdi],      xmm6                ; dst row 6
          movq        [rdi+rdx],  xmm7                ; dst row 7

          lea         rsp,        [rsp + 144]         ; skip the whole buffer
          jmp         .done8x8

  .b8x8_fp_only:
          ; yoffset == 0: horizontal filter only, four rows per iteration.
          lea         rcx,        [rdi+rdx*8]         ; rcx = dst + 8 rows (loop end)

  .next_row_fp:
          movdqa      xmm1,       XMMWORD PTR [rsp]
          movdqa      xmm3,       XMMWORD PTR [rsp+16]

          movdqa      xmm2,       xmm1
          movdqa      xmm5,       XMMWORD PTR [rsp+32]

          psrldq      xmm2,       1                   ; row shifted left by one pixel
          movdqa      xmm7,       XMMWORD PTR [rsp+48]

          movdqa      xmm4,       xmm3
          psrldq      xmm4,       1

          movdqa      xmm6,       xmm5
          psrldq      xmm6,       1

          punpcklbw   xmm1,       xmm2
          pmaddubsw   xmm1,       xmm0

          punpcklbw   xmm3,       xmm4
          pmaddubsw   xmm3,       xmm0

          punpcklbw   xmm5,       xmm6
          pmaddubsw   xmm5,       xmm0

          movdqa      xmm2,       xmm7
          psrldq      xmm2,       1

          punpcklbw   xmm7,       xmm2
          pmaddubsw   xmm7,       xmm0

          paddw       xmm1,       [GLOBAL(rd)]
          psraw       xmm1,       VP8_FILTER_SHIFT

          paddw       xmm3,       [GLOBAL(rd)]
          psraw       xmm3,       VP8_FILTER_SHIFT

          paddw       xmm5,       [GLOBAL(rd)]
          psraw       xmm5,       VP8_FILTER_SHIFT

          paddw       xmm7,       [GLOBAL(rd)]
          psraw       xmm7,       VP8_FILTER_SHIFT

          packuswb    xmm1,       xmm1
          packuswb    xmm3,       xmm3

          packuswb    xmm5,       xmm5
          movq        [rdi],      xmm1

          packuswb    xmm7,       xmm7
          movq        [rdi+rdx],  xmm3

          lea         rdi,        [rdi + 2*rdx]
          movq        [rdi],      xmm5

          lea         rsp,        [rsp + 4*16]        ; consumed four buffered rows
          movq        [rdi+rdx],  xmm7

          lea         rdi,        [rdi + 2*rdx]
          cmp         rdi,        rcx

          jne         .next_row_fp

          ; Two iterations consumed 128 bytes; skip the unused 9th row so
          ; that rsp has advanced the full 144 bytes.
          lea         rsp,        [rsp + 16]

  .done8x8:
          ; rsp is now just past the 144-byte buffer (equivalent to
          ; `add rsp, 144` from the buffer base).
          pop         rsp
          ; begin epilog
          pop         rdi
          pop         rsi
          RESTORE_GOT
          RESTORE_XMM
          UNSHADOW_ARGS
          pop         rbp
          ret
  SECTION_RODATA

  ; ---------------------------------------------------------------------
  ; Byte index tables, 16 bytes each.
  ; NOTE(review): presumably pshufb masks for the six-tap code earlier in
  ; the file; their users are outside this view - confirm before editing.
  ; shuf2b and shuf3b directly follow a 16-byte table, so they inherit the
  ; 16-byte alignment established for shuf1b.
  ; ---------------------------------------------------------------------
  align 16
  shuf1b:
          db  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7, 12
  shuf2b:
          db  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11
  shuf3b:
          db  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10

  align 16
  shuf2bfrom1:
          db  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11,  9, 13

  align 16
  shuf3bfrom1:
          db  2,  6,  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11

  ; ---------------------------------------------------------------------
  ; Rounding term: eight words of 64 (0x40). The bilinear code adds it
  ; with paddw right before `psraw ..., VP8_FILTER_SHIFT`.
  ; ---------------------------------------------------------------------
  align 16
  rd:
          times 8 dw 64

  ; ---------------------------------------------------------------------
  ; k0_k5 / k1_k3 / k2_k4: three tables of eight 16-byte rows, each row a
  ; signed byte pair repeated 8 times.
  ; NOTE(review): the names suggest six-tap coefficient pairs (taps 0&5,
  ; 1&3, 2&4) indexed by sub-pixel offset; the code that reads them is
  ; outside this view - confirm. Row 0 of each table is marked as a
  ; placeholder in the original source.
  ; ---------------------------------------------------------------------
  align 16
  k0_k5:
          times 16 db 0                   ; row 0 (placeholder)
          times 16 db 0                   ; row 1
          times 8  db 2, 1                ; row 2
          times 16 db 0                   ; row 3
          times 8  db 3, 3                ; row 4
          times 16 db 0                   ; row 5
          times 8  db 1, 2                ; row 6
          times 16 db 0                   ; row 7
  k1_k3:
          times 16 db 0                   ; row 0 (placeholder)
          times 8  db  -6,  12            ; row 1
          times 8  db -11,  36            ; row 2
          times 8  db  -9,  50            ; row 3
          times 8  db -16,  77            ; row 4
          times 8  db  -6,  93            ; row 5
          times 8  db  -8, 108            ; row 6
          times 8  db  -1, 123            ; row 7
  k2_k4:
          times 8  db 128,   0            ; row 0 (placeholder)
          times 8  db 123,  -1            ; row 1
          times 8  db 108,  -8            ; row 2
          times 8  db  93,  -6            ; row 3
          times 8  db  77, -16            ; row 4
          times 8  db  50,  -9            ; row 5
          times 8  db  36, -11            ; row 6
          times 8  db  12,  -6            ; row 7

  ; ---------------------------------------------------------------------
  ; Bilinear tap pairs, one 16-byte row per offset. The bilinear routines
  ; index this table with (offset << 4). Each row is the pair
  ; (128 - 16*n, 16*n) repeated 8 times, laid out for pmaddubsw.
  ; Row 0 holds 128, which pmaddubsw interprets as -128 because it reads
  ; these bytes as signed.
  ; ---------------------------------------------------------------------
  align 16
  vp8_bilinear_filters_ssse3:
          times 8  db 128,   0            ; offset 0
          times 8  db 112,  16            ; offset 1
          times 8  db  96,  32            ; offset 2
          times 8  db  80,  48            ; offset 3
          times 8  db  64,  64            ; offset 4
          times 8  db  48,  80            ; offset 5
          times 8  db  32,  96            ; offset 6
          times 8  db  16, 112            ; offset 7