idctllm_sse2.asm 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. ;void vp8_idct_dequant_0_2x_sse2 -- DC-only path for two side-by-side blocks: per block, (qcoeff[0] * dequant[0] + 4) >> 3 is added to its 4-pixel-wide half of 4 rows of dst
  12. ; (
  13. ; short *qcoeff - 0 (in/out): block 0 at byte offset 0, block 1 at byte offset 32; the first dword of each block is zeroed
  14. ; short *dequant - 1 (in): only the first dword is read; its first word is the factor used for both blocks
  15. ; unsigned char *dst - 2 (in/out): prediction pixels, 8 bytes per row, 4 rows; overwritten with the unsigned-saturated sum
  16. ; int dst_stride - 3 (in): byte distance between consecutive dst rows (sign-extended before use)
  17. ; )
  18. global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
  19. sym(vp8_idct_dequant_0_2x_sse2):
  20. push rbp
  21. mov rbp, rsp
  22. SHADOW_ARGS_TO_STACK 4 ; macro from the included x86_abi_support.asm (not visible here); presumably makes the 4 arguments reachable via arg(n)
  23. GET_GOT rbx ; macro from the include; presumably sets up what GLOBAL() needs -- paired with RESTORE_GOT below
  24. ; end prolog
  25. mov rdx, arg(1) ; dequant
  26. mov rax, arg(0) ; qcoeff
  27. movd xmm4, [rax] ; words 0-1 = first two coefficients of block 0
  28. movd xmm5, [rdx] ; words 0-1 = first two dequant factors
  29. pinsrw xmm4, [rax+32], 4 ; word 4 = first coefficient of block 1 (32 bytes = 16 shorts past block 0)
  30. pinsrw xmm5, [rdx], 4 ; word 4 = dequant[0] again, so both blocks use the same factor
  31. pmullw xmm4, xmm5 ; word 0 = dequantized DC of block 0, word 4 = dequantized DC of block 1
  32. ; Zero out xmm5, for use unpacking
  33. pxor xmm5, xmm5
  34. ; clear coeffs
  35. movd [rax], xmm5 ; zero the dword that was read from block 0
  36. movd [rax+32], xmm5 ; zero the first dword of block 1
  37. ; NOTE(review): the original note here said only "pshufb" -- presumably a reminder that the two-step broadcast below could be done with one pshufb (not available in SSE2)
  38. mov rax, arg(2) ; dst
  39. movsxd rdx, dword ptr arg(3) ; dst_stride
  40. pshuflw xmm4, xmm4, 00000000b ; broadcast word 0 (block 0 DC) into words 0-3
  41. pshufhw xmm4, xmm4, 00000000b ; broadcast word 4 (block 1 DC) into words 4-7
  42. lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride (offset of row 3)
  43. paddw xmm4, [GLOBAL(fours)] ; +4 rounding bias for the >>3 that follows
  44. psraw xmm4, 3
  45. movq xmm0, [rax] ; row 0: 8 prediction bytes
  46. movq xmm1, [rax+rdx] ; row 1
  47. movq xmm2, [rax+2*rdx] ; row 2
  48. movq xmm3, [rax+rcx] ; row 3
  49. punpcklbw xmm0, xmm5 ; widen bytes to words (xmm5 is zero)
  50. punpcklbw xmm1, xmm5
  51. punpcklbw xmm2, xmm5
  52. punpcklbw xmm3, xmm5
  53. ; Add to predict buffer
  54. paddw xmm0, xmm4
  55. paddw xmm1, xmm4
  56. paddw xmm2, xmm4
  57. paddw xmm3, xmm4
  58. ; pack up before storing
  59. packuswb xmm0, xmm5 ; words -> bytes with unsigned saturation; result is in the low 8 bytes
  60. packuswb xmm1, xmm5
  61. packuswb xmm2, xmm5
  62. packuswb xmm3, xmm5
  63. ; store blocks back out
  64. movq [rax], xmm0
  65. movq [rax + rdx], xmm1
  66. lea rax, [rax + 2*rdx] ; advance to row 2 so rows 2 and 3 can be addressed without rcx
  67. movq [rax], xmm2
  68. movq [rax + rdx], xmm3
  69. ; begin epilog
  70. RESTORE_GOT
  71. UNSHADOW_ARGS
  72. pop rbp
  73. ret
  74. ;void vp8_idct_dequant_full_2x_sse2 -- dequantize two 16-coefficient blocks, run a two-pass inverse transform on both at once, and add the result to 4 rows of 8 pixels at dst
  75. ; (
  76. ; short *qcoeff - 0 (in/out): 64 bytes, read with aligned loads (block 0 at offset 0, block 1 at offset 32) and then zeroed
  77. ; short *dequant - 1 (in): 16 factors (32 bytes), read with aligned loads and applied to both blocks
  78. ; unsigned char *dst - 2 (in/out): prediction pixels, 8 bytes per row, 4 rows; overwritten with the unsigned-saturated sum
  79. ; int dst_stride - 3 (in): byte distance between consecutive dst rows (sign-extended before use)
  80. ; )
  81. global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
  82. sym(vp8_idct_dequant_full_2x_sse2):
  83. push rbp
  84. mov rbp, rsp
  85. SHADOW_ARGS_TO_STACK 4 ; macro from the included x86_abi_support.asm (not visible here); presumably makes the 4 arguments reachable via arg(n)
  86. SAVE_XMM 7 ; macro from the include; presumably preserves xmm registers up to xmm7 where the ABI requires it (this routine uses xmm0-xmm7)
  87. GET_GOT rbx ; macro from the include; presumably sets up what GLOBAL() needs
  88. push rsi ; NOTE(review): rsi is not referenced anywhere between this push and its pop
  89. push rdi
  90. ; end prolog
  91. ; Full path: every coefficient of both blocks is loaded, dequantized and
  92. ; inverse transformed (the earlier "no need to load qcoeff" remark did not apply to this routine)
  93. mov rax, arg(0) ; qcoeff
  94. mov rdx, arg(1) ; dequant
  95. mov rdi, arg(2) ; dst
  96. ; Zero out xmm7, for use unpacking
  97. pxor xmm7, xmm7
  98. ; note the transpose of xmm1 and xmm2, necessary for shuffle
  99. ; to spit out sensible data
  100. movdqa xmm0, [rax] ; block 0, coefficients 0-7
  101. movdqa xmm2, [rax+16] ; block 0, coefficients 8-15
  102. movdqa xmm1, [rax+32] ; block 1, coefficients 0-7
  103. movdqa xmm3, [rax+48] ; block 1, coefficients 8-15
  104. ; Clear out coeffs
  105. movdqa [rax], xmm7
  106. movdqa [rax+16], xmm7
  107. movdqa [rax+32], xmm7
  108. movdqa [rax+48], xmm7
  109. ; dequantize qcoeff buffer
  110. pmullw xmm0, [rdx] ; factors 0-7 for coefficients 0-7 of block 0
  111. pmullw xmm2, [rdx+16] ; factors 8-15 for coefficients 8-15 of block 0
  112. pmullw xmm1, [rdx] ; same factors for block 1
  113. pmullw xmm3, [rdx+16]
  114. movsxd rdx, dword ptr arg(3) ; dst_stride
  115. ; repack so block 0 row x and block 1 row x are together
  116. movdqa xmm4, xmm0
  117. punpckldq xmm0, xmm1
  118. punpckhdq xmm4, xmm1
  119. pshufd xmm0, xmm0, 11011000b ; xmm0 = row 0 of block 0 | row 0 of block 1
  120. pshufd xmm1, xmm4, 11011000b ; xmm1 = row 1 of block 0 | row 1 of block 1
  121. movdqa xmm4, xmm2
  122. punpckldq xmm2, xmm3
  123. punpckhdq xmm4, xmm3
  124. pshufd xmm2, xmm2, 11011000b ; xmm2 = row 2 of block 0 | row 2 of block 1
  125. pshufd xmm3, xmm4, 11011000b ; xmm3 = row 3 of block 0 | row 3 of block 1
  126. ; first pass
  127. psubw xmm0, xmm2 ; b1 = 0-2
  128. paddw xmm2, xmm2 ; 2 * row 2, so that adding b1 below gives row 0 + row 2
  129. movdqa xmm5, xmm1
  130. paddw xmm2, xmm0 ; a1 = 0+2
  131. pmulhw xmm5, [GLOBAL(x_s1sqr2)] ; high word of the signed product; completed by the paddw of the input below
  132. lea rcx, [rdx + rdx*2] ;dst_stride * 3
  133. paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
  134. movdqa xmm7, xmm3
  135. pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] ; constant holds the factor minus 1; the paddw below adds the 1x term back
  136. paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
  137. psubw xmm7, xmm5 ; c1 (computed here as the ip3 term minus the ip1 term)
  138. movdqa xmm5, xmm1
  139. movdqa xmm4, xmm3 ; keep ip3 for the paddw after the in-place pmulhw on xmm3
  140. pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
  141. paddw xmm5, xmm1 ; ip1 * cos(pi/8) * sqrt(2)
  142. pmulhw xmm3, [GLOBAL(x_s1sqr2)]
  143. paddw xmm3, xmm4 ; ip3 * sin(pi/8) * sqrt(2)
  144. paddw xmm3, xmm5 ; d1
  145. movdqa xmm6, xmm2 ; a1
  146. movdqa xmm4, xmm0 ; b1
  147. paddw xmm2, xmm3 ;0
  148. paddw xmm4, xmm7 ;1
  149. psubw xmm0, xmm7 ;2
  150. psubw xmm6, xmm3 ;3
  151. ; transpose for the second pass
  152. movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
  153. punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
  154. punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
  155. movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
  156. punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
  157. punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
  158. movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
  159. punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
  160. punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
  161. movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
  162. punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
  163. punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
  164. movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
  165. punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
  166. punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
  167. movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
  168. punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
  169. punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
  170. pshufd xmm0, xmm2, 11011000b ; swap the two middle dwords so block 0 is in the low half and block 1 in the high half
  171. pshufd xmm2, xmm1, 11011000b
  172. pshufd xmm1, xmm5, 11011000b
  173. pshufd xmm3, xmm7, 11011000b
  174. ; second pass
  175. psubw xmm0, xmm2 ; b1 = 0-2
  176. paddw xmm2, xmm2
  177. movdqa xmm5, xmm1
  178. paddw xmm2, xmm0 ; a1 = 0+2
  179. pmulhw xmm5, [GLOBAL(x_s1sqr2)]
  180. paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
  181. movdqa xmm7, xmm3
  182. pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
  183. paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
  184. psubw xmm7, xmm5 ; c1
  185. movdqa xmm5, xmm1
  186. movdqa xmm4, xmm3
  187. pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
  188. paddw xmm5, xmm1
  189. pmulhw xmm3, [GLOBAL(x_s1sqr2)]
  190. paddw xmm3, xmm4
  191. paddw xmm3, xmm5 ; d1
  192. paddw xmm0, [GLOBAL(fours)] ; add the +4 rounding bias to b1 ...
  193. paddw xmm2, [GLOBAL(fours)] ; ... and to a1, so all four outputs below carry it into the >>3
  194. movdqa xmm6, xmm2 ; a1
  195. movdqa xmm4, xmm0 ; b1
  196. paddw xmm2, xmm3 ;0
  197. paddw xmm4, xmm7 ;1
  198. psubw xmm0, xmm7 ;2
  199. psubw xmm6, xmm3 ;3
  200. psraw xmm2, 3
  201. psraw xmm0, 3
  202. psraw xmm4, 3
  203. psraw xmm6, 3
  204. ; transpose to save
  205. movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
  206. punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
  207. punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
  208. movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
  209. punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
  210. punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
  211. movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
  212. punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
  213. punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
  214. movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
  215. punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
  216. punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
  217. movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
  218. punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
  219. punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
  220. movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
  221. punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
  222. punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
  223. pshufd xmm0, xmm2, 11011000b ; xmm0..xmm3 are what gets added to dst rows 0..3 below
  224. pshufd xmm2, xmm1, 11011000b
  225. pshufd xmm1, xmm5, 11011000b
  226. pshufd xmm3, xmm7, 11011000b
  227. pxor xmm7, xmm7 ; xmm7 was used as scratch above; zero it again for the unpack/pack steps
  228. ; Load up predict blocks
  229. movq xmm4, [rdi] ; row 0: 8 prediction bytes
  230. movq xmm5, [rdi+rdx] ; row 1
  231. punpcklbw xmm4, xmm7 ; widen bytes to words
  232. punpcklbw xmm5, xmm7
  233. paddw xmm0, xmm4
  234. paddw xmm1, xmm5
  235. movq xmm4, [rdi+2*rdx] ; row 2
  236. movq xmm5, [rdi+rcx] ; row 3 (rcx = 3 * dst_stride)
  237. punpcklbw xmm4, xmm7
  238. punpcklbw xmm5, xmm7
  239. paddw xmm2, xmm4
  240. paddw xmm3, xmm5
  241. .finish: ; NOTE(review): no jump to this label appears in the visible code; execution simply falls through
  242. ; pack up before storing
  243. packuswb xmm0, xmm7 ; words -> bytes with unsigned saturation; result is in the low 8 bytes
  244. packuswb xmm1, xmm7
  245. packuswb xmm2, xmm7
  246. packuswb xmm3, xmm7
  247. ; store blocks back out
  248. movq [rdi], xmm0
  249. movq [rdi + rdx], xmm1
  250. movq [rdi + rdx*2], xmm2
  251. movq [rdi + rcx], xmm3
  252. ; begin epilog
  253. pop rdi
  254. pop rsi
  255. RESTORE_GOT
  256. RESTORE_XMM
  257. UNSHADOW_ARGS
  258. pop rbp
  259. ret
  260. ;void vp8_idct_dequant_dc_0_2x_sse2 -- DC-only path with externally supplied DC values: per block, (dc[i] + 4) >> 3 is added to its 4-pixel-wide half of 4 rows of dst
  261. ; (
  262. ; short *qcoeff - 0: pointer is loaded into rax but nothing is read or written through it in this routine
  263. ; short *dequant - 1: not read by this routine (the dc values are used without any multiplication)
  264. ; unsigned char *dst - 2 (in/out): prediction pixels, 8 bytes per row, 4 rows; overwritten with the unsigned-saturated sum
  265. ; int dst_stride - 3 (in): byte distance between consecutive dst rows (sign-extended before use)
  266. ; short *dc - 4 (in): two consecutive words -- dc[0] for block 0, dc[1] for block 1
  267. ; )
  268. global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
  269. sym(vp8_idct_dequant_dc_0_2x_sse2):
  270. push rbp
  271. mov rbp, rsp
  272. SHADOW_ARGS_TO_STACK 5 ; macro from the included x86_abi_support.asm (not visible here); presumably makes the 5 arguments reachable via arg(n)
  273. GET_GOT rbx ; macro from the include; presumably sets up what GLOBAL() needs
  274. push rdi
  275. ; end prolog
  276. ; special case when 2 blocks have 0 or 1 coeffs
  277. ; dc is set as first coeff, so no need to load qcoeff
  278. mov rax, arg(0) ; qcoeff -- NOTE(review): rax is not referenced again in the visible body, so this load looks dead
  279. mov rdi, arg(2) ; dst
  280. mov rdx, arg(4) ; dc
  281. ; Zero out xmm5, for use unpacking
  282. pxor xmm5, xmm5
  283. ; load up 2 dc words here == 2*16 = doubleword
  284. movd xmm4, [rdx] ; word 0 = dc[0], word 1 = dc[1]
  285. movsxd rdx, dword ptr arg(3) ; dst_stride
  286. lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride (offset of row 3)
  287. ; Load up predict blocks
  288. movq xmm0, [rdi]
  289. movq xmm1, [rdi+rdx*1]
  290. movq xmm2, [rdi+rdx*2]
  291. movq xmm3, [rdi+rcx]
  292. ; Duplicate and expand dc across
  293. punpcklwd xmm4, xmm4 ; low words become dc0 dc0 dc1 dc1
  294. punpckldq xmm4, xmm4 ; words 0-3 = dc0, words 4-7 = dc1
  295. ; Rounding to dequant and downshift
  296. paddw xmm4, [GLOBAL(fours)] ; +4 rounding bias for the >>3 that follows
  297. psraw xmm4, 3
  298. ; Predict buffer needs to be expanded from bytes to words
  299. punpcklbw xmm0, xmm5
  300. punpcklbw xmm1, xmm5
  301. punpcklbw xmm2, xmm5
  302. punpcklbw xmm3, xmm5
  303. ; Add to predict buffer
  304. paddw xmm0, xmm4
  305. paddw xmm1, xmm4
  306. paddw xmm2, xmm4
  307. paddw xmm3, xmm4
  308. ; pack up before storing
  309. packuswb xmm0, xmm5 ; words -> bytes with unsigned saturation; result is in the low 8 bytes
  310. packuswb xmm1, xmm5
  311. packuswb xmm2, xmm5
  312. packuswb xmm3, xmm5
  313. ; store blocks back out
  314. movq [rdi], xmm0
  315. movq [rdi + rdx], xmm1
  316. movq [rdi + rdx*2], xmm2
  317. movq [rdi + rcx], xmm3
  318. ; begin epilog
  319. pop rdi
  320. RESTORE_GOT
  321. UNSHADOW_ARGS
  322. pop rbp
  323. ret
  324. ;void vp8_idct_dequant_dc_full_2x_sse2 -- dequantize two 16-coefficient blocks, replace each block's first coefficient with a supplied DC value, run a two-pass inverse transform on both at once, and add the result to 4 rows of 8 pixels at dst
  325. ; (
  326. ; short *qcoeff - 0 (in/out): 64 bytes, read with aligned loads (block 0 at offset 0, block 1 at offset 32) and then zeroed
  327. ; short *dequant - 1 (in): 16 factors (32 bytes), read with aligned loads and applied to both blocks
  328. ; unsigned char *dst - 2 (in/out): prediction pixels, 8 bytes per row, 4 rows; overwritten with the unsigned-saturated sum
  329. ; int dst_stride - 3 (in): byte distance between consecutive dst rows (sign-extended before use)
  330. ; short *dc - 4 (in): two consecutive words -- dc[0] for block 0, dc[1] for block 1; inserted after dequantization, so not multiplied by dequant
  331. ; )
  332. global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
  333. sym(vp8_idct_dequant_dc_full_2x_sse2):
  334. push rbp
  335. mov rbp, rsp
  336. SHADOW_ARGS_TO_STACK 5 ; macro from the included x86_abi_support.asm (not visible here); presumably makes the 5 arguments reachable via arg(n)
  337. SAVE_XMM 7 ; macro from the include; presumably preserves xmm registers up to xmm7 where the ABI requires it (this routine uses xmm0-xmm7)
  338. GET_GOT rbx ; macro from the include; presumably sets up what GLOBAL() needs
  339. push rdi
  340. ; end prolog
  341. ; Full path: every coefficient of both blocks is loaded and dequantized; only the
  342. ; first coefficient of each block is then overwritten from dc[] (see "insert DC component")
  343. mov rax, arg(0) ; qcoeff
  344. mov rdx, arg(1) ; dequant
  345. mov rdi, arg(2) ; dst
  346. ; Zero out xmm7, for use unpacking
  347. pxor xmm7, xmm7
  348. ; note the transpose of xmm1 and xmm2, necessary for shuffle
  349. ; to spit out sensible data
  350. movdqa xmm0, [rax] ; block 0, coefficients 0-7
  351. movdqa xmm2, [rax+16] ; block 0, coefficients 8-15
  352. movdqa xmm1, [rax+32] ; block 1, coefficients 0-7
  353. movdqa xmm3, [rax+48] ; block 1, coefficients 8-15
  354. ; Clear out coeffs
  355. movdqa [rax], xmm7
  356. movdqa [rax+16], xmm7
  357. movdqa [rax+32], xmm7
  358. movdqa [rax+48], xmm7
  359. ; dequantize qcoeff buffer
  360. pmullw xmm0, [rdx] ; factors 0-7 for coefficients 0-7 of block 0
  361. pmullw xmm2, [rdx+16] ; factors 8-15 for coefficients 8-15 of block 0
  362. pmullw xmm1, [rdx] ; same factors for block 1
  363. pmullw xmm3, [rdx+16]
  364. ; DC component
  365. mov rdx, arg(4) ; rdx = dc pointer (dequant is no longer needed)
  366. ; repack so block 0 row x and block 1 row x are together
  367. movdqa xmm4, xmm0
  368. punpckldq xmm0, xmm1
  369. punpckhdq xmm4, xmm1
  370. pshufd xmm0, xmm0, 11011000b ; xmm0 = row 0 of block 0 | row 0 of block 1
  371. pshufd xmm1, xmm4, 11011000b ; xmm1 = row 1 of block 0 | row 1 of block 1
  372. movdqa xmm4, xmm2
  373. punpckldq xmm2, xmm3
  374. punpckhdq xmm4, xmm3
  375. pshufd xmm2, xmm2, 11011000b ; xmm2 = row 2 of block 0 | row 2 of block 1
  376. pshufd xmm3, xmm4, 11011000b ; xmm3 = row 3 of block 0 | row 3 of block 1
  377. ; insert DC component
  378. pinsrw xmm0, [rdx], 0 ; word 0 (first coefficient of block 0) = dc[0]
  379. pinsrw xmm0, [rdx+2], 4 ; word 4 (first coefficient of block 1) = dc[1]
  380. ; first pass
  381. psubw xmm0, xmm2 ; b1 = 0-2
  382. paddw xmm2, xmm2 ; 2 * row 2, so that adding b1 below gives row 0 + row 2
  383. movdqa xmm5, xmm1
  384. paddw xmm2, xmm0 ; a1 = 0+2
  385. pmulhw xmm5, [GLOBAL(x_s1sqr2)] ; high word of the signed product; completed by the paddw of the input below
  386. paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
  387. movdqa xmm7, xmm3
  388. pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] ; constant holds the factor minus 1; the paddw below adds the 1x term back
  389. paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
  390. psubw xmm7, xmm5 ; c1 (computed here as the ip3 term minus the ip1 term)
  391. movdqa xmm5, xmm1
  392. movdqa xmm4, xmm3 ; keep ip3 for the paddw after the in-place pmulhw on xmm3
  393. pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
  394. paddw xmm5, xmm1 ; ip1 * cos(pi/8) * sqrt(2)
  395. pmulhw xmm3, [GLOBAL(x_s1sqr2)]
  396. paddw xmm3, xmm4 ; ip3 * sin(pi/8) * sqrt(2)
  397. paddw xmm3, xmm5 ; d1
  398. movdqa xmm6, xmm2 ; a1
  399. movdqa xmm4, xmm0 ; b1
  400. paddw xmm2, xmm3 ;0
  401. paddw xmm4, xmm7 ;1
  402. psubw xmm0, xmm7 ;2
  403. psubw xmm6, xmm3 ;3
  404. ; transpose for the second pass
  405. movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
  406. punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
  407. punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
  408. movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
  409. punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
  410. punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
  411. movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
  412. punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
  413. punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
  414. movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
  415. punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
  416. punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
  417. movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
  418. punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
  419. punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
  420. movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
  421. punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
  422. punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
  423. pshufd xmm0, xmm2, 11011000b ; swap the two middle dwords so block 0 is in the low half and block 1 in the high half
  424. pshufd xmm2, xmm1, 11011000b
  425. pshufd xmm1, xmm5, 11011000b
  426. pshufd xmm3, xmm7, 11011000b
  427. ; second pass
  428. psubw xmm0, xmm2 ; b1 = 0-2
  429. paddw xmm2, xmm2
  430. movdqa xmm5, xmm1
  431. paddw xmm2, xmm0 ; a1 = 0+2
  432. pmulhw xmm5, [GLOBAL(x_s1sqr2)]
  433. paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
  434. movdqa xmm7, xmm3
  435. pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
  436. paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
  437. psubw xmm7, xmm5 ; c1
  438. movdqa xmm5, xmm1
  439. movdqa xmm4, xmm3
  440. pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
  441. paddw xmm5, xmm1
  442. pmulhw xmm3, [GLOBAL(x_s1sqr2)]
  443. paddw xmm3, xmm4
  444. paddw xmm3, xmm5 ; d1
  445. paddw xmm0, [GLOBAL(fours)] ; add the +4 rounding bias to b1 ...
  446. paddw xmm2, [GLOBAL(fours)] ; ... and to a1, so all four outputs below carry it into the >>3
  447. movdqa xmm6, xmm2 ; a1
  448. movdqa xmm4, xmm0 ; b1
  449. paddw xmm2, xmm3 ;0
  450. paddw xmm4, xmm7 ;1
  451. psubw xmm0, xmm7 ;2
  452. psubw xmm6, xmm3 ;3
  453. psraw xmm2, 3
  454. psraw xmm0, 3
  455. psraw xmm4, 3
  456. psraw xmm6, 3
  457. ; transpose to save
  458. movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
  459. punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
  460. punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
  461. movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
  462. punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
  463. punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
  464. movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
  465. punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
  466. punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
  467. movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
  468. punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
  469. punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
  470. movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
  471. punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
  472. punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
  473. movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
  474. punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
  475. punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
  476. pshufd xmm0, xmm2, 11011000b ; xmm0..xmm3 are what gets added to dst rows 0..3 below
  477. pshufd xmm2, xmm1, 11011000b
  478. pshufd xmm1, xmm5, 11011000b
  479. pshufd xmm3, xmm7, 11011000b
  480. pxor xmm7, xmm7 ; xmm7 was used as scratch above; zero it again for the unpack/pack steps
  481. ; Load up predict blocks
  482. movsxd rdx, dword ptr arg(3) ; dst_stride
  483. movq xmm4, [rdi] ; row 0: 8 prediction bytes
  484. movq xmm5, [rdi+rdx] ; row 1
  485. lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride (offset of row 3)
  486. punpcklbw xmm4, xmm7 ; widen bytes to words
  487. punpcklbw xmm5, xmm7
  488. paddw xmm0, xmm4
  489. paddw xmm1, xmm5
  490. movq xmm4, [rdi+rdx*2] ; row 2
  491. movq xmm5, [rdi+rcx] ; row 3
  492. punpcklbw xmm4, xmm7
  493. punpcklbw xmm5, xmm7
  494. paddw xmm2, xmm4
  495. paddw xmm3, xmm5
  496. .finish: ; NOTE(review): no jump to this label appears in the visible code; execution simply falls through
  497. ; pack up before storing
  498. packuswb xmm0, xmm7 ; words -> bytes with unsigned saturation; result is in the low 8 bytes
  499. packuswb xmm1, xmm7
  500. packuswb xmm2, xmm7
  501. packuswb xmm3, xmm7
  502. ; Load destination stride before writing out,
  503. ; doesn't need to persist. NOTE(review): rdx already holds dst_stride from the load before the predict reads and is not written in between, so this reload looks redundant
  504. movsxd rdx, dword ptr arg(3) ; dst_stride
  505. ; store blocks back out
  506. movq [rdi], xmm0
  507. movq [rdi + rdx], xmm1
  508. lea rdi, [rdi + 2*rdx] ; advance to row 2 so rows 2 and 3 can be addressed without rcx
  509. movq [rdi], xmm2
  510. movq [rdi + rdx], xmm3
  511. ; begin epilog
  512. pop rdi
  513. RESTORE_GOT
  514. RESTORE_XMM
  515. UNSHADOW_ARGS
  516. pop rbp
  517. ret
  518. SECTION_RODATA
  519. align 16 ; 16-byte alignment: these tables are used as 128-bit memory operands (paddw/pmulhw) by the routines above
  520. fours: ; rounding bias: added before every psraw-by-3 in this file (4 = half of 1 << 3)
  521. times 8 dw 0x0004
  522. align 16
  523. x_s1sqr2: ; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536
  524. times 8 dw 0x8A8C ; negative as a signed word (35468 - 65536), so pmulhw gives x*0.5412 - x; the paddw of x that follows each use restores x*0.5412
  525. align 16
  526. x_c1sqr2less1: ; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536
  527. times 8 dw 0x4E7B ; factor minus 1 fits in a positive signed word; pmulhw followed by paddw of x gives x * cos(pi/8) * sqrt(2)