; idctllm_sse2.asm (24 KB)
  ;
  ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ;
  ; Use of this source code is governed by a BSD-style license
  ; that can be found in the LICENSE file in the root of the source
  ; tree. An additional intellectual property rights grant can be found
  ; in the file PATENTS. All contributing project authors may
  ; be found in the AUTHORS file in the root of the source tree.
  ;
  %include "vpx_ports/x86_abi_support.asm"
  ;void vp8_idct_dequant_0_2x_sse2
  ; (
  ;   short *qcoeff       - 0
  ;   short *dequant      - 1
  ;   unsigned char *dst  - 2
  ;   int dst_stride      - 3
  ; )
  ;
  ; DC-only reconstruction for a pair of blocks handled in one call.
  ;   * dc0 = qcoeff[0] * dequant[0]          (first block)
  ;   * dc1 = qcoeff[16] * dequant[0]         (block at byte offset 32)
  ;   * each is rounded with (dc + 4) >> 3 (arithmetic shift)
  ;   * dc0 is added to pixels 0..3 and dc1 to pixels 4..7 of four
  ;     consecutive dst rows; results saturate to unsigned bytes and are
  ;     written back in place.
  ;   * the first two coefficients (4 bytes) of each block are zeroed.
  ; Touches rax, rcx, rdx and xmm0-xmm5.
  ; sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK, GET_GOT and friends are
  ; macros from vpx_ports/x86_abi_support.asm (not shown in this file).
  SECTION .text
  global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
  sym(vp8_idct_dequant_0_2x_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      GET_GOT     rbx
      ; end prolog

      mov         rdx, arg(1)             ; dequant
      mov         rax, arg(0)             ; qcoeff

      ; word 0 <- first block's DC, word 4 <- second block's DC,
      ; both paired with dequant[0]
      movd        xmm4, [rax]
      movd        xmm5, [rdx]
      pinsrw      xmm4, [rax+32], 4
      pinsrw      xmm5, [rdx], 4
      pmullw      xmm4, xmm5

      ; Zero out xmm5, for use unpacking
      pxor        xmm5, xmm5

      ; clear coeffs
      movd        [rax], xmm5
      movd        [rax+32], xmm5

      mov         rax, arg(2)             ; dst
      movsxd      rdx, dword ptr arg(3)   ; dst_stride

      ; broadcast word 0 over the low half and word 4 over the high half
      pshuflw     xmm4, xmm4, 00000000b
      pshufhw     xmm4, xmm4, 00000000b

      lea         rcx, [rdx + rdx*2]      ; dst_stride * 3

      ; round and downshift: (dc + 4) >> 3
      paddw       xmm4, [GLOBAL(fours)]
      psraw       xmm4, 3

      ; Load up predict rows (8 pixels each)
      movq        xmm0, [rax]
      movq        xmm1, [rax+rdx]
      movq        xmm2, [rax+2*rdx]
      movq        xmm3, [rax+rcx]

      ; Predict buffer needs to be expanded from bytes to words
      punpcklbw   xmm0, xmm5
      punpcklbw   xmm1, xmm5
      punpcklbw   xmm2, xmm5
      punpcklbw   xmm3, xmm5

      ; Add to predict buffer
      paddw       xmm0, xmm4
      paddw       xmm1, xmm4
      paddw       xmm2, xmm4
      paddw       xmm3, xmm4

      ; pack up before storing (unsigned saturation)
      packuswb    xmm0, xmm5
      packuswb    xmm1, xmm5
      packuswb    xmm2, xmm5
      packuswb    xmm3, xmm5

      ; store blocks back out
      movq        [rax], xmm0
      movq        [rax + rdx], xmm1
      movq        [rax + rdx*2], xmm2
      movq        [rax + rcx], xmm3

      ; begin epilog
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_idct_dequant_full_2x_sse2
  ; (
  ;   short *qcoeff       - 0
  ;   short *dequant      - 1
  ;   unsigned char *dst  - 2
  ;   int dst_stride      - 3
  ; )
  ;
  ; Full dequantize + two-pass inverse transform for a pair of blocks.
  ;   * reads 64 bytes of qcoeff with aligned loads (16 coefficients for
  ;     each of two blocks) and zeroes them afterwards
  ;   * multiplies both blocks by the same 16 dequant factors
  ;     (32 bytes, aligned memory operands)
  ;   * runs the transform with both blocks packed side by side in each
  ;     register (first block in the low 4 words, second in the high 4)
  ;   * rounds with (x + 4) >> 3, adds the result to four 8-pixel rows of
  ;     dst, saturates to unsigned bytes and stores in place.
  ; Touches rax, rcx, rdx, rdi (saved/restored) and xmm0-xmm7 (SAVE_XMM 7).
  ; sym(), arg(), GLOBAL(), SAVE_XMM and the other prolog/epilog helpers
  ; are macros from vpx_ports/x86_abi_support.asm (not shown in this file).
  global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
  sym(vp8_idct_dequant_full_2x_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      SAVE_XMM 7
      GET_GOT     rbx
      push        rdi
      ; end prolog

      mov         rax, arg(0)             ; qcoeff
      mov         rdx, arg(1)             ; dequant
      mov         rdi, arg(2)             ; dst

      ; Zero out xmm7, for use unpacking
      pxor        xmm7, xmm7

      ; note the transpose of xmm1 and xmm2, necessary for shuffle
      ; to spit out sensible data
      movdqa      xmm0, [rax]
      movdqa      xmm2, [rax+16]
      movdqa      xmm1, [rax+32]
      movdqa      xmm3, [rax+48]

      ; Clear out coeffs
      movdqa      [rax], xmm7
      movdqa      [rax+16], xmm7
      movdqa      [rax+32], xmm7
      movdqa      [rax+48], xmm7

      ; dequantize qcoeff buffer
      pmullw      xmm0, [rdx]
      pmullw      xmm2, [rdx+16]
      pmullw      xmm1, [rdx]
      pmullw      xmm3, [rdx+16]

      movsxd      rdx, dword ptr arg(3)   ; dst_stride

      ; repack so block 0 row x and block 1 row x are together
      movdqa      xmm4, xmm0
      punpckldq   xmm0, xmm1
      punpckhdq   xmm4, xmm1
      pshufd      xmm0, xmm0, 11011000b
      pshufd      xmm1, xmm4, 11011000b
      movdqa      xmm4, xmm2
      punpckldq   xmm2, xmm3
      punpckhdq   xmm4, xmm3
      pshufd      xmm2, xmm2, 11011000b
      pshufd      xmm3, xmm4, 11011000b

      ; first pass
      psubw       xmm0, xmm2              ; b1 = 0-2
      paddw       xmm2, xmm2
      movdqa      xmm5, xmm1
      paddw       xmm2, xmm0              ; a1 = 0+2
      pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
      lea         rcx, [rdx + rdx*2]      ; dst_stride * 3
      paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)
      movdqa      xmm7, xmm3
      pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
      psubw       xmm7, xmm5              ; xmm7 = ip3 term - ip1 term
      movdqa      xmm5, xmm1
      movdqa      xmm4, xmm3
      pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)
      pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
      paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)
      paddw       xmm3, xmm5              ; d1
      movdqa      xmm6, xmm2              ; a1
      movdqa      xmm4, xmm0              ; b1
      paddw       xmm2, xmm3              ; xmm2 = a1 + d1
      paddw       xmm4, xmm7              ; xmm4 = b1 + xmm7
      psubw       xmm0, xmm7              ; xmm0 = b1 - xmm7
      psubw       xmm6, xmm3              ; xmm6 = a1 - d1

      ; transpose for the second pass
      movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
      punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
      punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100
      movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
      punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
      punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108
      movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
      punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
      punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002
      movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
      punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
      punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102
      movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
      punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
      punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001
      movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
      punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
      punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003
      pshufd      xmm0, xmm2, 11011000b
      pshufd      xmm2, xmm1, 11011000b
      pshufd      xmm1, xmm5, 11011000b
      pshufd      xmm3, xmm7, 11011000b

      ; second pass
      psubw       xmm0, xmm2              ; b1 = 0-2
      paddw       xmm2, xmm2
      movdqa      xmm5, xmm1
      paddw       xmm2, xmm0              ; a1 = 0+2
      pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
      paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)
      movdqa      xmm7, xmm3
      pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
      psubw       xmm7, xmm5              ; xmm7 = ip3 term - ip1 term
      movdqa      xmm5, xmm1
      movdqa      xmm4, xmm3
      pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)
      pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
      paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)
      paddw       xmm3, xmm5              ; d1

      ; rounding bias goes into a1 and b1 so all four outputs carry it
      paddw       xmm0, [GLOBAL(fours)]
      paddw       xmm2, [GLOBAL(fours)]

      movdqa      xmm6, xmm2              ; a1
      movdqa      xmm4, xmm0              ; b1
      paddw       xmm2, xmm3              ; xmm2 = a1 + d1
      paddw       xmm4, xmm7              ; xmm4 = b1 + xmm7
      psubw       xmm0, xmm7              ; xmm0 = b1 - xmm7
      psubw       xmm6, xmm3              ; xmm6 = a1 - d1

      psraw       xmm2, 3
      psraw       xmm0, 3
      psraw       xmm4, 3
      psraw       xmm6, 3

      ; transpose to save
      movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
      punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
      punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100
      movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
      punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
      punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108
      movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
      punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
      punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002
      movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
      punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
      punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102
      movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
      punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
      punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001
      movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
      punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
      punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003
      pshufd      xmm0, xmm2, 11011000b
      pshufd      xmm2, xmm1, 11011000b
      pshufd      xmm1, xmm5, 11011000b
      pshufd      xmm3, xmm7, 11011000b

      ; xmm7 was used as scratch above; zero it again for unpack/pack
      pxor        xmm7, xmm7

      ; Load up predict blocks
      movq        xmm4, [rdi]
      movq        xmm5, [rdi+rdx]
      punpcklbw   xmm4, xmm7
      punpcklbw   xmm5, xmm7
      paddw       xmm0, xmm4
      paddw       xmm1, xmm5
      movq        xmm4, [rdi+2*rdx]
      movq        xmm5, [rdi+rcx]
      punpcklbw   xmm4, xmm7
      punpcklbw   xmm5, xmm7
      paddw       xmm2, xmm4
      paddw       xmm3, xmm5

      ; pack up before storing (unsigned saturation)
      packuswb    xmm0, xmm7
      packuswb    xmm1, xmm7
      packuswb    xmm2, xmm7
      packuswb    xmm3, xmm7

      ; store blocks back out
      movq        [rdi], xmm0
      movq        [rdi + rdx], xmm1
      movq        [rdi + rdx*2], xmm2
      movq        [rdi + rcx], xmm3

      ; begin epilog
      pop         rdi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_idct_dequant_dc_0_2x_sse2
  ; (
  ;   short *qcoeff       - 0
  ;   short *dequant      - 1
  ;   unsigned char *dst  - 2
  ;   int dst_stride      - 3
  ;   short *dc           - 4
  ; )
  ;
  ; DC-only reconstruction for a pair of blocks whose DC values are
  ; supplied separately: dc[0] applies to pixels 0..3 and dc[1] to pixels
  ; 4..7 of four consecutive dst rows.
  ;   * each DC is rounded with (dc + 4) >> 3 (arithmetic shift)
  ;   * results saturate to unsigned bytes and are stored in place
  ;   * qcoeff and dequant are accepted for signature compatibility but
  ;     are not read or written by this routine.
  ; Touches rcx, rdx, rdi (saved/restored) and xmm0-xmm5.
  ; sym(), arg(), GLOBAL() and the prolog/epilog helpers are macros from
  ; vpx_ports/x86_abi_support.asm (not shown in this file).
  global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
  sym(vp8_idct_dequant_dc_0_2x_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      GET_GOT     rbx
      push        rdi
      ; end prolog

      ; special case when 2 blocks have 0 or 1 coeffs
      ; dc is passed in separately, so no need to load qcoeff
      mov         rdi, arg(2)             ; dst
      mov         rdx, arg(4)             ; dc

      ; Zero out xmm5, for use unpacking
      pxor        xmm5, xmm5

      ; load both 16-bit dc values with one 32-bit load
      movd        xmm4, [rdx]

      movsxd      rdx, dword ptr arg(3)   ; dst_stride
      lea         rcx, [rdx + rdx*2]      ; dst_stride * 3

      ; Load up predict blocks
      movq        xmm0, [rdi]
      movq        xmm1, [rdi+rdx*1]
      movq        xmm2, [rdi+rdx*2]
      movq        xmm3, [rdi+rcx]

      ; Duplicate and expand dc across:
      ; words 0..3 <- dc[0], words 4..7 <- dc[1]
      punpcklwd   xmm4, xmm4
      punpckldq   xmm4, xmm4

      ; round and downshift: (dc + 4) >> 3
      paddw       xmm4, [GLOBAL(fours)]
      psraw       xmm4, 3

      ; Predict buffer needs to be expanded from bytes to words
      punpcklbw   xmm0, xmm5
      punpcklbw   xmm1, xmm5
      punpcklbw   xmm2, xmm5
      punpcklbw   xmm3, xmm5

      ; Add to predict buffer
      paddw       xmm0, xmm4
      paddw       xmm1, xmm4
      paddw       xmm2, xmm4
      paddw       xmm3, xmm4

      ; pack up before storing (unsigned saturation)
      packuswb    xmm0, xmm5
      packuswb    xmm1, xmm5
      packuswb    xmm2, xmm5
      packuswb    xmm3, xmm5

      ; store blocks back out
      movq        [rdi], xmm0
      movq        [rdi + rdx], xmm1
      movq        [rdi + rdx*2], xmm2
      movq        [rdi + rcx], xmm3

      ; begin epilog
      pop         rdi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_idct_dequant_dc_full_2x_sse2
  ; (
  ;   short *qcoeff       - 0
  ;   short *dequant      - 1
  ;   unsigned char *dst  - 2
  ;   int dst_stride      - 3
  ;   short *dc           - 4
  ; )
  ;
  ; Full dequantize + two-pass inverse transform for a pair of blocks,
  ; with each block's DC coefficient taken from the dc array instead of
  ; from qcoeff * dequant.
  ;   * reads 64 bytes of qcoeff with aligned loads (16 coefficients for
  ;     each of two blocks) and zeroes them afterwards
  ;   * multiplies both blocks by the same 16 dequant factors
  ;   * overwrites coefficient 0 of the first block with dc[0] and
  ;     coefficient 0 of the second block with dc[1] (no dequant applied)
  ;   * runs the transform with both blocks packed side by side in each
  ;     register (first block in the low 4 words, second in the high 4)
  ;   * rounds with (x + 4) >> 3, adds the result to four 8-pixel rows of
  ;     dst, saturates to unsigned bytes and stores in place.
  ; Touches rax, rcx, rdx, rdi (saved/restored) and xmm0-xmm7 (SAVE_XMM 7).
  ; sym(), arg(), GLOBAL(), SAVE_XMM and the other prolog/epilog helpers
  ; are macros from vpx_ports/x86_abi_support.asm (not shown in this file).
  global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
  sym(vp8_idct_dequant_dc_full_2x_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      SAVE_XMM 7
      GET_GOT     rbx
      push        rdi
      ; end prolog

      mov         rax, arg(0)             ; qcoeff
      mov         rdx, arg(1)             ; dequant
      mov         rdi, arg(2)             ; dst

      ; Zero out xmm7, for use unpacking
      pxor        xmm7, xmm7

      ; note the transpose of xmm1 and xmm2, necessary for shuffle
      ; to spit out sensible data
      movdqa      xmm0, [rax]
      movdqa      xmm2, [rax+16]
      movdqa      xmm1, [rax+32]
      movdqa      xmm3, [rax+48]

      ; Clear out coeffs
      movdqa      [rax], xmm7
      movdqa      [rax+16], xmm7
      movdqa      [rax+32], xmm7
      movdqa      [rax+48], xmm7

      ; dequantize qcoeff buffer
      pmullw      xmm0, [rdx]
      pmullw      xmm2, [rdx+16]
      pmullw      xmm1, [rdx]
      pmullw      xmm3, [rdx+16]

      ; DC component
      mov         rdx, arg(4)             ; dc

      ; repack so block 0 row x and block 1 row x are together
      movdqa      xmm4, xmm0
      punpckldq   xmm0, xmm1
      punpckhdq   xmm4, xmm1
      pshufd      xmm0, xmm0, 11011000b
      pshufd      xmm1, xmm4, 11011000b
      movdqa      xmm4, xmm2
      punpckldq   xmm2, xmm3
      punpckhdq   xmm4, xmm3
      pshufd      xmm2, xmm2, 11011000b
      pshufd      xmm3, xmm4, 11011000b

      ; insert DC component: word 0 is the first block's coefficient 0,
      ; word 4 the second block's
      pinsrw      xmm0, [rdx], 0
      pinsrw      xmm0, [rdx+2], 4

      ; first pass
      psubw       xmm0, xmm2              ; b1 = 0-2
      paddw       xmm2, xmm2
      movdqa      xmm5, xmm1
      paddw       xmm2, xmm0              ; a1 = 0+2
      pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
      paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)
      movdqa      xmm7, xmm3
      pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
      psubw       xmm7, xmm5              ; xmm7 = ip3 term - ip1 term
      movdqa      xmm5, xmm1
      movdqa      xmm4, xmm3
      pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)
      pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
      paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)
      paddw       xmm3, xmm5              ; d1
      movdqa      xmm6, xmm2              ; a1
      movdqa      xmm4, xmm0              ; b1
      paddw       xmm2, xmm3              ; xmm2 = a1 + d1
      paddw       xmm4, xmm7              ; xmm4 = b1 + xmm7
      psubw       xmm0, xmm7              ; xmm0 = b1 - xmm7
      psubw       xmm6, xmm3              ; xmm6 = a1 - d1

      ; transpose for the second pass
      movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
      punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
      punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100
      movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
      punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
      punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108
      movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
      punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
      punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002
      movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
      punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
      punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102
      movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
      punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
      punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001
      movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
      punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
      punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003
      pshufd      xmm0, xmm2, 11011000b
      pshufd      xmm2, xmm1, 11011000b
      pshufd      xmm1, xmm5, 11011000b
      pshufd      xmm3, xmm7, 11011000b

      ; second pass
      psubw       xmm0, xmm2              ; b1 = 0-2
      paddw       xmm2, xmm2
      movdqa      xmm5, xmm1
      paddw       xmm2, xmm0              ; a1 = 0+2
      pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
      paddw       xmm5, xmm1              ; ip1 * sin(pi/8) * sqrt(2)
      movdqa      xmm7, xmm3
      pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm7, xmm3              ; ip3 * cos(pi/8) * sqrt(2)
      psubw       xmm7, xmm5              ; xmm7 = ip3 term - ip1 term
      movdqa      xmm5, xmm1
      movdqa      xmm4, xmm3
      pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
      paddw       xmm5, xmm1              ; ip1 * cos(pi/8) * sqrt(2)
      pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
      paddw       xmm3, xmm4              ; ip3 * sin(pi/8) * sqrt(2)
      paddw       xmm3, xmm5              ; d1

      ; rounding bias goes into a1 and b1 so all four outputs carry it
      paddw       xmm0, [GLOBAL(fours)]
      paddw       xmm2, [GLOBAL(fours)]

      movdqa      xmm6, xmm2              ; a1
      movdqa      xmm4, xmm0              ; b1
      paddw       xmm2, xmm3              ; xmm2 = a1 + d1
      paddw       xmm4, xmm7              ; xmm4 = b1 + xmm7
      psubw       xmm0, xmm7              ; xmm0 = b1 - xmm7
      psubw       xmm6, xmm3              ; xmm6 = a1 - d1

      psraw       xmm2, 3
      psraw       xmm0, 3
      psraw       xmm4, 3
      psraw       xmm6, 3

      ; transpose to save
      movdqa      xmm7, xmm2              ; 103 102 101 100 003 002 001 000
      punpcklwd   xmm2, xmm0              ; 007 003 006 002 005 001 004 000
      punpckhwd   xmm7, xmm0              ; 107 103 106 102 105 101 104 100
      movdqa      xmm5, xmm4              ; 111 110 109 108 011 010 009 008
      punpcklwd   xmm4, xmm6              ; 015 011 014 010 013 009 012 008
      punpckhwd   xmm5, xmm6              ; 115 111 114 110 113 109 112 108
      movdqa      xmm1, xmm2              ; 007 003 006 002 005 001 004 000
      punpckldq   xmm2, xmm4              ; 013 009 005 001 012 008 004 000
      punpckhdq   xmm1, xmm4              ; 015 011 007 003 014 010 006 002
      movdqa      xmm6, xmm7              ; 107 103 106 102 105 101 104 100
      punpckldq   xmm7, xmm5              ; 113 109 105 101 112 108 104 100
      punpckhdq   xmm6, xmm5              ; 115 111 107 103 114 110 106 102
      movdqa      xmm5, xmm2              ; 013 009 005 001 012 008 004 000
      punpckldq   xmm2, xmm7              ; 112 108 012 008 104 100 004 000
      punpckhdq   xmm5, xmm7              ; 113 109 013 009 105 101 005 001
      movdqa      xmm7, xmm1              ; 015 011 007 003 014 010 006 002
      punpckldq   xmm1, xmm6              ; 114 110 014 010 106 102 006 002
      punpckhdq   xmm7, xmm6              ; 115 111 015 011 107 103 007 003
      pshufd      xmm0, xmm2, 11011000b
      pshufd      xmm2, xmm1, 11011000b
      pshufd      xmm1, xmm5, 11011000b
      pshufd      xmm3, xmm7, 11011000b

      ; xmm7 was used as scratch above; zero it again for unpack/pack
      pxor        xmm7, xmm7

      ; Load up predict blocks
      ; rdx held the dc pointer until now; switch it to the stride
      movsxd      rdx, dword ptr arg(3)   ; dst_stride
      movq        xmm4, [rdi]
      movq        xmm5, [rdi+rdx]
      lea         rcx, [rdx + rdx*2]      ; dst_stride * 3
      punpcklbw   xmm4, xmm7
      punpcklbw   xmm5, xmm7
      paddw       xmm0, xmm4
      paddw       xmm1, xmm5
      movq        xmm4, [rdi+rdx*2]
      movq        xmm5, [rdi+rcx]
      punpcklbw   xmm4, xmm7
      punpcklbw   xmm5, xmm7
      paddw       xmm2, xmm4
      paddw       xmm3, xmm5

      ; pack up before storing (unsigned saturation)
      packuswb    xmm0, xmm7
      packuswb    xmm1, xmm7
      packuswb    xmm2, xmm7
      packuswb    xmm3, xmm7

      ; store blocks back out (rdx and rcx still hold stride and 3*stride)
      movq        [rdi], xmm0
      movq        [rdi + rdx], xmm1
      movq        [rdi + rdx*2], xmm2
      movq        [rdi + rcx], xmm3

      ; begin epilog
      pop         rdi
      RESTORE_GOT
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ; Read-only constants shared by the routines in this file. Each is a
  ; 16-byte-aligned vector of eight identical 16-bit words so it can be
  ; used directly as an SSE2 memory operand.
  SECTION_RODATA

  ; Rounding bias added before the final arithmetic shift right by 3.
  align 16
  fours:
      times 8 dw 0x0004

  ; 0x8A8C = 35468, about sqrt(2) * sin(pi/8) * 65536. pmulhw treats it as
  ; the signed value 35468 - 65536, so the code adds the operand back
  ; after the multiply to get x * 35468 / 65536.
  align 16
  x_s1sqr2:
      times 8 dw 0x8A8C

  ; 0x4E7B = 20091, about (sqrt(2) * cos(pi/8) - 1) * 65536. The code adds
  ; the operand back after pmulhw to get x * (1 + 20091 / 65536).
  align 16
  x_c1sqr2less1:
      times 8 dw 0x4E7B