intrapred_ssse3.asm 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "third_party/x86inc/x86inc.asm"
  ; ---------------------------------------------------------------------------
  ; Read-only constants.
  ;
  ; pb_1 is a vector of sixteen 1-bytes; the 3-tap filter uses it to isolate
  ; the low bit of x^z.
  ;
  ; Every sh_b* table is a 16-byte pshufb control mask.  The hex digits in the
  ; name spell the source byte index selected for each output byte.  Tables
  ; whose name spells fewer than 16 digits are padded with index 0.
  ; ---------------------------------------------------------------------------
  SECTION_RODATA

  pb_1:                     times 16 db 1

  ; advance by one / two bytes inside an 8-byte vector, repeating byte 7
  sh_b12345677:             db  1,  2,  3,  4,  5,  6,  7,  7
                            times 8 db 0
  sh_b23456777:             db  2,  3,  4,  5,  6,  7,  7,  7
                            times 8 db 0

  ; same idea, but byte 7 is repeated through the whole upper half
  sh_b0123456777777777:     db  0,  1,  2,  3,  4,  5,  6,  7
                            times 8 db 7
  sh_b1234567777777777:     db  1,  2,  3,  4,  5,  6,  7,  7
                            times 8 db 7
  sh_b2345677777777777:     db  2,  3,  4,  5,  6,  7,  7,  7
                            times 8 db 7

  ; advance by one / two bytes inside a 16-byte vector, repeating byte 15
  sh_b123456789abcdeff:     db  1,  2,  3,  4,  5,  6,  7,  8
                            db  9, 10, 11, 12, 13, 14, 15, 15
  sh_b23456789abcdefff:     db  2,  3,  4,  5,  6,  7,  8,  9
                            db 10, 11, 12, 13, 14, 15, 15, 15

  ; reorder / interleave masks used by the d153 predictors
  sh_b32104567:             db  3,  2,  1,  0,  4,  5,  6,  7
                            times 8 db 0
  sh_b8091a2b345:           db  8,  0,  9,  1, 10,  2, 11,  3
                            db  4,  5
                            times 6 db 0
  sh_b76543210:             db  7,  6,  5,  4,  3,  2,  1,  0
                            times 8 db 0
  sh_b65432108:             db  6,  5,  4,  3,  2,  1,  0,  8
                            times 8 db 0
  sh_b54321089:             db  5,  4,  3,  2,  1,  0,  8,  9
                            times 8 db 0
  sh_b89abcdef:             db  8,  9, 10, 11, 12, 13, 14, 15
                            times 8 db 0

  ; full 16-byte reversal
  sh_bfedcba9876543210:     db 15, 14, 13, 12, 11, 10,  9,  8
                            db  7,  6,  5,  4,  3,  2,  1,  0

  SECTION .text
  ; ---------------------------------------------------------------------------
  ; d45_predictor_16x16(dst, stride, above)
  ;
  ; Reads the 16 bytes at [aboveq], smooths them with the 3-tap filter
  ; (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 (indices clamped to 15) and writes a
  ; 16x16 block in which every row is the previous row advanced by one byte,
  ; with the last byte repeated.
  ;
  ; Rows 0-7 are stored in full.  The left 8 columns of rows 8-15 equal the
  ; right 8 columns of rows 0-7, so they are stored from the same register.
  ; The remaining bottom-right 8x8 is a constant fill.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
      GET_GOT     goffsetq

      mova        m0, [aboveq]                          ; a[i]
      ; 'above' has been consumed; its slot is now named stride3
      DEFINE_ARGS dst, stride, stride3, dst8, line
      mova        m1, [GLOBAL(sh_b123456789abcdeff)]    ; advance-by-one mask
      lea         dst8q, [dstq+strideq*8]               ; row 8
      lea         stride3q, [strideq*3]

      ; inline 3-tap filter: avg(avg(x,z) - ((x^z)&1), y)
      pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; a[i+2]
      pavgb       m3, m2, m0
      pxor        m2, m0
      pshufb      m0, m1                                ; a[i+1]
      pand        m2, [GLOBAL(pb_1)]
      psubb       m3, m2
      pavgb       m0, m3                                ; filtered row 0

      ; two passes of four rows: rows 0-7 in full, rows 8-15 left half
      mov         lined, 2
  .four_rows:
      mova        [dstq], m0
      movhps      [dst8q], m0
      pshufb      m0, m1

      mova        [dstq+strideq], m0
      movhps      [dst8q+strideq], m0
      pshufb      m0, m1

      mova        [dstq+strideq*2], m0
      movhps      [dst8q+strideq*2], m0
      pshufb      m0, m1

      mova        [dstq+stride3q], m0
      movhps      [dst8q+stride3q], m0
      pshufb      m0, m1

      lea         dstq, [dstq+strideq*4]
      lea         dst8q, [dst8q+strideq*4]
      dec         lined
      jnz .four_rows

      ; dstq is at row 8 now; the upper half of m0 is a single repeated byte.
      ; Fill columns 8-15 of rows 8-11 ...
      movhps      [dstq+8], m0
      movhps      [dstq+strideq+8], m0
      movhps      [dstq+strideq*2+8], m0
      movhps      [dstq+stride3q+8], m0
      lea         dstq, [dstq+strideq*4]
      ; ... and of rows 12-15
      movhps      [dstq+8], m0
      movhps      [dstq+strideq+8], m0
      movhps      [dstq+strideq*2+8], m0
      movhps      [dstq+stride3q+8], m0

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d45_predictor_32x32(dst, stride, above)
  ;
  ; 32-wide version of the d45 predictor.  Reads the 32 bytes at [aboveq],
  ; applies the 3-tap filter (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 with indices
  ; clamped to 31, and writes 32 rows, each the previous row advanced by one
  ; byte.
  ;
  ; Register roles once the filter has run:
  ;   m5 / m3  low 16 bytes of the current row (they alternate)
  ;   m4       high 16 bytes of the current row
  ;   m1       advance-by-one-byte shuffle mask
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
      GET_GOT     goffsetq

      mova        m0, [aboveq]                          ; a[0..15]
      mova        m4, [aboveq+16]                       ; a[16..31]
      ; 'above' has been consumed; its slot is now named stride3
      DEFINE_ARGS dst, stride, stride3, dst16, line
      mova        m1, [GLOBAL(sh_b123456789abcdeff)]
      lea         stride3q, [strideq*3]
      lea         dst16q, [dstq+strideq*8]
      lea         dst16q, [dst16q+strideq*8]            ; row 16

      ; neighbours of the low half, taken before m4 is modified
      palignr     m5, m4, m0, 1                         ; a[1..16]
      palignr     m6, m4, m0, 2                         ; a[2..17]

      ; 3-tap filter, high half -> m4
      pshufb      m2, m4, [GLOBAL(sh_b23456789abcdefff)]
      pavgb       m3, m2, m4
      pxor        m2, m4
      pshufb      m4, m1
      pand        m2, [GLOBAL(pb_1)]
      psubb       m3, m2
      pavgb       m4, m3

      ; 3-tap filter, low half -> m5
      pavgb       m3, m0, m6
      pxor        m0, m6
      pand        m0, [GLOBAL(pb_1)]
      psubb       m3, m0
      pavgb       m5, m3

      ; Rows 0-15 in full.  The left half of row r+16 equals the right half
      ; of row r, so it is written in the same pass.
      mov         lined, 4
  .four_rows:
      mova        [dstq], m5
      mova        [dstq+16], m4
      mova        [dst16q], m4
      palignr     m3, m4, m5, 1
      pshufb      m4, m1

      mova        [dstq+strideq], m3
      mova        [dstq+strideq+16], m4
      mova        [dst16q+strideq], m4
      palignr     m5, m4, m3, 1
      pshufb      m4, m1

      mova        [dstq+strideq*2], m5
      mova        [dstq+strideq*2+16], m4
      mova        [dst16q+strideq*2], m4
      palignr     m3, m4, m5, 1
      pshufb      m4, m1

      mova        [dstq+stride3q], m3
      mova        [dstq+stride3q+16], m4
      mova        [dst16q+stride3q], m4
      palignr     m5, m4, m3, 1
      pshufb      m4, m1

      lea         dstq, [dstq+strideq*4]
      lea         dst16q, [dst16q+strideq*4]
      dec         lined
      jnz .four_rows

      ; dstq is at row 16.  m4 has been advanced 16 times and is now one
      ; repeated byte: fill the right half of rows 16-31 with it.
  %rep 3
      mova        [dstq+16], m4
      mova        [dstq+strideq+16], m4
      mova        [dstq+strideq*2+16], m4
      mova        [dstq+stride3q+16], m4
      lea         dstq, [dstq+strideq*4]
  %endrep
      mova        [dstq+16], m4
      mova        [dstq+strideq+16], m4
      mova        [dstq+strideq*2+16], m4
      mova        [dstq+stride3q+16], m4

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
  ;
  ; Per byte:  result = (x + 2*y + z + 2) >> 2
  ;
  ; Done without widening (trick from pascal):
  ;   t      = avg(x, z)           ; rounds up: (x + z + 1) >> 1
  ;   t     -= (x ^ z) & 1         ; remove the round-up: (x + z) >> 1
  ;   result = avg(t, y)
  ;
  ; Register contract, as the instructions below show:
  ;   %1 (x), %2 (y)  read only
  ;   %3 (z)          DESTROYED (left holding (x ^ z) & 1)
  ;   %4 (result)     written; must be a different register from %1, %2, %3
  ; ---------------------------------------------------------------------------
  %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
      pavgb       %4, %1, %3            ; t = avg(x, z)
      pxor        %3, %1                ; z ^= x
      pand        %3, [GLOBAL(pb_1)]    ; keep only the low bit
      psubb       %4, %3                ; t = floor((x + z) / 2)
      pavgb       %4, %2                ; result = avg(t, y)
  %endmacro
  ; ---------------------------------------------------------------------------
  ; d63_predictor_4x4(dst, stride, above)
  ;
  ; Loads 8 bytes from [aboveq] and builds two filtered rows from them:
  ;   m3 = 2-tap  avg(a[i], a[i+1])
  ;   m4 = 3-tap  (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
  ; Output rows 0 and 1 are m3 and m4; rows 2 and 3 are the same two vectors
  ; advanced by one byte.  Each store writes 4 bytes.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
      GET_GOT     goffsetq

      movq        m3, [aboveq]                          ; a[i]
      pshufb      m2, m3, [GLOBAL(sh_b12345677)]        ; a[i+1]
      pshufb      m1, m3, [GLOBAL(sh_b23456777)]        ; a[i+2]

      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4      ; m4 = 3-tap
      pavgb       m3, m2                                ; m3 = 2-tap

      ; rows 0 and 1
      movd        [dstq], m3
      movd        [dstq+strideq], m4

      ; rows 2 and 3: same vectors, one byte further along
      psrldq      m3, 1
      psrldq      m4, 1
      lea         dstq, [dstq+strideq*2]
      movd        [dstq], m3
      movd        [dstq+strideq], m4

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d63_predictor_8x8(dst, stride, above)
  ;
  ; Loads 8 bytes from [aboveq] and builds two filtered vectors, with the last
  ; input byte repeated past index 7:
  ;   m3 = 2-tap  avg(a[i], a[i+1])
  ;   m4 = 3-tap  (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
  ; Even output rows come from m3 and odd rows from m4; each pair of rows is
  ; the previous pair advanced by one byte.  Each store writes 8 bytes.
  ;
  ; The x operand of the filter and the 2-tap source are the same vector, so
  ; it is shuffled once (in place, in m3) rather than being produced twice
  ; with the same table.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
      GET_GOT     goffsetq

      movq        m3, [aboveq]
      ; 'above' has been consumed; its slot is now named stride3
      DEFINE_ARGS dst, stride, stride3
      lea         stride3q, [strideq*3]

      ; m1/m2 must be derived before m3 is shuffled in place
      pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]    ; a[i+2]
      pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]    ; a[i+1]
      pshufb      m3, [GLOBAL(sh_b0123456777777777)]        ; a[i], byte 7 repeated

      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4          ; m4 = 3-tap (m3 preserved)
      pavgb       m3, m2                                    ; m3 = 2-tap

      ; rows 0-3
      movq        [dstq], m3
      movq        [dstq+strideq], m4
      psrldq      m3, 1
      psrldq      m4, 1
      movq        [dstq+strideq*2], m3
      movq        [dstq+stride3q], m4
      lea         dstq, [dstq+strideq*4]
      psrldq      m3, 1
      psrldq      m4, 1

      ; rows 4-7
      movq        [dstq], m3
      movq        [dstq+strideq], m4
      psrldq      m3, 1
      psrldq      m4, 1
      movq        [dstq+strideq*2], m3
      movq        [dstq+stride3q], m4

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d63_predictor_16x16(dst, stride, above)
  ;
  ; Loads 16 bytes from [aboveq] and builds
  ;   m0 = 2-tap  avg(a[i], a[i+1])
  ;   m4 = 3-tap  (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
  ; with indices clamped to 15.  Even rows are written from m0, odd rows from
  ; m4, and after each pair both vectors are advanced by one byte (last byte
  ; repeated) using the mask held in m1.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
      GET_GOT     goffsetq

      mova        m0, [aboveq]                          ; a[i]
      ; 'above' has been consumed; its slot is now named stride3
      DEFINE_ARGS dst, stride, stride3, line
      mova        m1, [GLOBAL(sh_b123456789abcdeff)]    ; advance-by-one mask
      lea         stride3q, [strideq*3]

      pshufb      m3, m0, m1                            ; a[i+1]
      pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; a[i+2]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4      ; m4 = 3-tap
      pavgb       m0, m3                                ; m0 = 2-tap

      ; four passes of four rows
      mov         lined, 4
  .four_rows:
      mova        [dstq], m0
      mova        [dstq+strideq], m4
      pshufb      m0, m1
      pshufb      m4, m1

      mova        [dstq+strideq*2], m0
      mova        [dstq+stride3q], m4
      pshufb      m0, m1
      pshufb      m4, m1

      lea         dstq, [dstq+strideq*4]
      dec         lined
      jnz .four_rows

      RESTORE_GOT
      REP_RET
  ; ---------------------------------------------------------------------------
  ; d63_predictor_32x32(dst, stride, above)
  ;
  ; 32-wide version of the d63 predictor.  Reads the 32 bytes at [aboveq].
  ;
  ; Register roles when the store loop is entered:
  ;   m0 | m7   2-tap row  avg(a[i], a[i+1])              (low | high 16)
  ;   m2 | m4   3-tap row  (a[i]+2*a[i+1]+a[i+2]+2) >> 2  (low | high 16)
  ;   m1        advance-by-one-byte mask, last byte repeated
  ; Each pass writes rows r..r+3: 2-tap, 3-tap, then both advanced one byte.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
      GET_GOT     goffsetq

      mova        m0, [aboveq]                          ; a[0..15]
      mova        m7, [aboveq+16]                       ; a[16..31]
      ; 'above' has been consumed; its slot is now named stride3
      DEFINE_ARGS dst, stride, stride3, line
      lea         stride3q, [strideq*3]
      mova        m1, [GLOBAL(sh_b123456789abcdeff)]

      ; high half
      pshufb      m2, m7, [GLOBAL(sh_b23456789abcdefff)] ; a[i+2]
      pshufb      m3, m7, m1                            ; a[i+1]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4      ; m4 = 3-tap high

      ; low half: neighbours come from across the 16-byte boundary, so they
      ; are taken before m7 is overwritten
      palignr     m6, m7, m0, 1                         ; a[1..16]
      palignr     m5, m7, m0, 2                         ; a[2..17]
      pavgb       m7, m3                                ; m7 = 2-tap high
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2      ; m2 = 3-tap low
      pavgb       m0, m6                                ; m0 = 2-tap low

      ; eight passes of four rows
      mov         lined, 8
  .four_rows:
      mova        [dstq], m0
      mova        [dstq+16], m7
      mova        [dstq+strideq], m2
      mova        [dstq+strideq+16], m4

      ; advance both rows by one byte (low halves into m3 / m5)
      palignr     m3, m7, m0, 1
      palignr     m5, m4, m2, 1
      pshufb      m7, m1
      pshufb      m4, m1

      mova        [dstq+strideq*2], m3
      mova        [dstq+strideq*2+16], m7
      mova        [dstq+stride3q], m5
      mova        [dstq+stride3q+16], m4

      ; advance again (low halves back into m0 / m2)
      palignr     m0, m7, m3, 1
      palignr     m2, m4, m5, 1
      pshufb      m7, m1
      pshufb      m4, m1

      lea         dstq, [dstq+strideq*4]
      dec         lined
      jnz .four_rows

      RESTORE_GOT
      REP_RET
  ; ---------------------------------------------------------------------------
  ; d153_predictor_4x4(dst, stride, above, left)
  ;
  ; Reads 4 bytes at [leftq] (l1..l4) and 4 bytes at [aboveq-1] (tl, t1..t3).
  ;
  ; Output layout (A = 2-tap average, B/C/D = 3-tap average):
  ;   row 0:  A1 B1 C1 D1
  ;   row 1:  A2 B2 A1 B1
  ;   row 2:  A3 B3 A2 B2
  ;   row 3:  A4 B4 A3 B3
  ; The code assembles the byte string A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 and
  ; stores a sliding 4-byte window of it, bottom row first.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
      GET_GOT     goffsetq

      movd        m0, [leftq]                       ; l1 l2 l3 l4
      movd        m1, [aboveq-1]                    ; tl t1 t2 t3
      ; both pointers have been consumed; slot 3 is now named stride3
      DEFINE_ARGS dst, stride, stride3
      lea         stride3q, [strideq*3]

      punpckldq   m0, m1                            ; l1 l2 l3 l4 tl t1 t2 t3
      pshufb      m0, [GLOBAL(sh_b32104567)]        ; l4 l3 l2 l1 tl t1 t2 t3
      psrldq      m1, m0, 1                         ; l3 l2 l1 tl t1 t2 t3
      psrldq      m2, m0, 2                         ; l2 l1 tl t1 t2 t3

      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; B4 B3 B2 B1 C1 D1
      pavgb       m1, m0                            ; A4 A3 A2 A1
      punpcklqdq  m3, m1                            ; 3-tap in low qword, 2-tap in high
      pshufb      m3, [GLOBAL(sh_b8091a2b345)]      ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1

      ; store bottom-up, dropping one (A, B) pair per row
      movd        [dstq+stride3q], m3               ; A4 B4 A3 B3
      psrldq      m3, 2
      movd        [dstq+strideq*2], m3              ; A3 B3 A2 B2
      psrldq      m3, 2
      movd        [dstq+strideq], m3                ; A2 B2 A1 B1
      psrldq      m3, 2
      movd        [dstq], m3                        ; A1 B1 C1 D1

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d153_predictor_8x8(dst, stride, above, left)
  ;
  ; Reads 8 bytes at [leftq] (l1..l8) and 8 bytes at [aboveq-1] (tl, t1..t7).
  ;
  ; Output layout (A = 2-tap average, B..H = 3-tap average):
  ;   row 0:  A1 B1 C1 D1 E1 F1 G1 H1
  ;   row 1:  A2 B2 A1 B1 C1 D1 E1 F1
  ;   row 2:  A3 B3 A2 B2 A1 B1 C1 D1
  ;   row 3:  A4 B4 A3 B3 A2 B2 A1 B1
  ;   row 4:  A5 B5 A4 B4 A3 B3 A2 B2
  ;   row 5:  A6 B6 A5 B5 A4 B4 A3 B3
  ;   row 6:  A7 B7 A6 B6 A5 B5 A4 B4
  ;   row 7:  A8 B8 A7 B7 A6 B6 A5 B5
  ;
  ; m6 ends up holding A8 B8 A7 B7 ... A1 B1 and m7 starts with C1..H1; every
  ; row is an 8-byte window of the concatenation m6 | m7.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
      GET_GOT     goffsetq

      movq        m0, [leftq]                           ; bytes 0-7  = l1..l8
      movhps      m0, [aboveq-1]                        ; bytes 8-15 = tl, t1..t7
      ; both pointers have been consumed; slot 3 is now named stride3
      DEFINE_ARGS dst, stride, stride3
      lea         stride3q, [strideq*3]

      ; Only the low 8 bytes of each shuffled vector matter below.
      pshufb      m1, m0, [GLOBAL(sh_b76543210)]        ; l8 l7 .. l1
      pshufb      m2, m0, [GLOBAL(sh_b65432108)]        ; l7 .. l1 tl
      pshufb      m3, m0, [GLOBAL(sh_b54321089)]        ; l6 .. l1 tl t1
      pshufb      m0, [GLOBAL(sh_b89abcdef)]            ; tl t1 .. t7
      psrldq      m4, m0, 1                             ; t1 .. t7
      psrldq      m5, m0, 2                             ; t2 .. t7

      pavgb       m6, m1, m2                            ; A8 .. A1
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7      ; C1 .. H1
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0      ; B8 .. B1
      punpcklbw   m6, m0                                ; A8 B8 A7 B7 .. A1 B1

      ; rows 3..0: windows that reach into the above-derived bytes of m7
      movhps      [dstq+stride3q], m6                   ; A4 B4 .. A1 B1
      palignr     m0, m7, m6, 10                        ; A3 B3 A2 B2 A1 B1 C1 ..
      movq        [dstq+strideq*2], m0
      psrldq      m0, 2
      movq        [dstq+strideq], m0
      psrldq      m0, 2
      movq        [dstq], m0                            ; A1 B1 C1 .. H1

      ; rows 7..4: windows that lie entirely inside m6
      lea         dstq, [dstq+strideq*4]
      movq        [dstq+stride3q], m6                   ; A8 B8 .. A5 B5
      psrldq      m6, 2
      movq        [dstq+strideq*2], m6
      psrldq      m6, 2
      movq        [dstq+strideq], m6
      psrldq      m6, 2
      movq        [dstq], m6                            ; A5 B5 .. A2 B2

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d153_predictor_16x16(dst, stride, above, left)
  ;
  ; Reads 16 bytes at [leftq] and 16 bytes at [aboveq-1] (top-left first).
  ;
  ; Output layout (A = 2-tap average of neighbouring left pixels, B = 3-tap
  ; average along the left edge, C.. = 3-tap average along the top edge;
  ; the row index 1..g is written in hex-like digits):
  ;   row 0:  A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
  ;   row 1:  A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
  ;   row 2:  A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
  ;   ...
  ;   row 7:  A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
  ;   row 8:  A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
  ;   ...
  ;   row 15: Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
  ;
  ; Three vectors are built and every row is a 16-byte palignr window of the
  ; concatenation m4 | m6 | m1:
  ;   m4 = Ag Bg .. A9 B9     m6 = A8 B8 .. A1 B1     m1 = C1 D1 ..
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
      GET_GOT     goffsetq

      mova        m0, [leftq]                           ; l1 .. l16
      movu        m7, [aboveq-1]                        ; tl t1 .. t15

      ; left-edge neighbours: prepend the reversed top row so tl and t1
      ; can be shifted in ahead of l1
      pshufb      m6, m7, [GLOBAL(sh_bfedcba9876543210)] ; t15 .. t1 tl
      palignr     m5, m0, m6, 15                        ; tl l1 .. l15
      palignr     m3, m0, m6, 14                        ; t1 tl l1 .. l14
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4      ; B1 .. Bg
      pavgb       m5, m0                                ; A1 .. Ag
      punpcklbw   m0, m4, m5                            ; rows 1-8, (B, A) byte pairs
      punpckhbw   m4, m5                                ; rows 9-g, (B, A) byte pairs

      ; top-edge 3-tap row
      pshufb      m3, m7, [GLOBAL(sh_b123456789abcdeff)]
      pshufb      m5, m7, [GLOBAL(sh_b23456789abcdefff)]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1      ; C1 .. P1

      pshufb      m6, m0, [GLOBAL(sh_bfedcba9876543210)] ; A8 B8 .. A1 B1

      ; both pointers have been consumed; slot 3 is now named stride3
      DEFINE_ARGS dst, stride, stride3
      lea         stride3q, [strideq*3]

      ; rows 0-7: windows of m6 | m1
      palignr     m2, m1, m6, 14
      mova        [dstq], m2
      palignr     m2, m1, m6, 12
      mova        [dstq+strideq], m2
      palignr     m2, m1, m6, 10
      mova        [dstq+strideq*2], m2
      palignr     m2, m1, m6, 8
      mova        [dstq+stride3q], m2
      lea         dstq, [dstq+strideq*4]
      palignr     m2, m1, m6, 6
      mova        [dstq], m2
      palignr     m2, m1, m6, 4
      mova        [dstq+strideq], m2
      palignr     m2, m1, m6, 2
      mova        [dstq+strideq*2], m2
      pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]    ; Ag Bg .. A9 B9
      mova        [dstq+stride3q], m6
      lea         dstq, [dstq+strideq*4]

      ; rows 8-15: windows of m4 | m6
      palignr     m2, m6, m4, 14
      mova        [dstq], m2
      palignr     m2, m6, m4, 12
      mova        [dstq+strideq], m2
      palignr     m2, m6, m4, 10
      mova        [dstq+strideq*2], m2
      palignr     m2, m6, m4, 8
      mova        [dstq+stride3q], m2
      lea         dstq, [dstq+strideq*4]
      palignr     m2, m6, m4, 6
      mova        [dstq], m2
      palignr     m2, m6, m4, 4
      mova        [dstq+strideq], m2
      palignr     m2, m6, m4, 2
      mova        [dstq+strideq*2], m2
      mova        [dstq+stride3q], m4

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d153_predictor_32x32(dst, stride, above, left)
  ;
  ; 32-wide version of the d153 predictor.  Reads 32 bytes at [leftq] and the
  ; 32 bytes starting at [aboveq-1] (top-left pixel first).
  ;
  ; Row r of the output is a 32-byte window into one long byte string:
  ;   ... (A, B) pairs for left rows, bottom to top ... | 3-tap top row
  ; where A is the 2-tap and B the 3-tap average along the left edge.  Each
  ; row down moves the window back by one (A, B) pair, i.e. two bytes.
  ;
  ; The string is kept in 16-byte pieces (highest row numbers first):
  ;   m2' m3' | m4 m6 | m1 m2
  ;   m1, m2   3-tap top row, low / high 16 bytes
  ;   m6, m4   (A, B) pairs for left rows 0-7 / 8-15
  ;   m3', m2' (A, B) pairs for left rows 16-23 / 24-31; these are computed
  ;            only after rows 0-15 have been stored, reusing m3 and m2
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
      GET_GOT     goffsetq

      mova        m0, [leftq]                           ; left[0..15]
      movu        m7, [aboveq-1]                        ; above[-1..14]
      movu        m1, [aboveq+15]                       ; above[15..30]

      ; 3-tap along the top edge, high 16 bytes -> m2
      pshufb      m4, m1, [GLOBAL(sh_b123456789abcdeff)]
      pshufb      m6, m1, [GLOBAL(sh_b23456789abcdefff)]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2

      ; 3-tap along the top edge, low 16 bytes -> m1
      palignr     m3, m1, m7, 1                         ; above[0..15]
      palignr     m5, m1, m7, 2                         ; above[1..16]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1

      ; left rows 0-15: reverse the top row so the top-left pixel and
      ; above[0] can be shifted in ahead of left[0]
      pshufb      m7, [GLOBAL(sh_bfedcba9876543210)]
      palignr     m5, m0, m7, 15                        ; above[-1], left[0..14]
      palignr     m3, m0, m7, 14                        ; above[0], above[-1], left[0..13]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4      ; B for rows 0-15
      pavgb       m5, m0                                ; A for rows 0-15
      punpcklbw   m6, m4, m5                            ; rows 0-7,  (B, A) byte pairs
      punpckhbw   m4, m5                                ; rows 8-15, (B, A) byte pairs
      pshufb      m6, [GLOBAL(sh_bfedcba9876543210)]    ; -> A7 B7 .. A0 B0 order
      pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]

      ; 'above' has been consumed and slot 3 is now named stride3; 'left' is
      ; kept because it is read again below.  No further names are declared,
      ; so nothing aliases the register GET_GOT set up.
      DEFINE_ARGS dst, stride, stride3, left
      lea         stride3q, [strideq*3]

      ; ---- rows 0-7: low = window of m6|m1, high = window of m1|m2 ----
      palignr     m5, m2, m1, 14
      palignr     m7, m1, m6, 14
      mova        [dstq], m7
      mova        [dstq+16], m5
      palignr     m5, m2, m1, 12
      palignr     m7, m1, m6, 12
      mova        [dstq+strideq], m7
      mova        [dstq+strideq+16], m5
      palignr     m5, m2, m1, 10
      palignr     m7, m1, m6, 10
      mova        [dstq+strideq*2], m7
      mova        [dstq+strideq*2+16], m5
      palignr     m5, m2, m1, 8
      palignr     m7, m1, m6, 8
      mova        [dstq+stride3q], m7
      mova        [dstq+stride3q+16], m5
      lea         dstq, [dstq+strideq*4]
      palignr     m5, m2, m1, 6
      palignr     m7, m1, m6, 6
      mova        [dstq], m7
      mova        [dstq+16], m5
      palignr     m5, m2, m1, 4
      palignr     m7, m1, m6, 4
      mova        [dstq+strideq], m7
      mova        [dstq+strideq+16], m5
      palignr     m5, m2, m1, 2
      palignr     m7, m1, m6, 2
      mova        [dstq+strideq*2], m7
      mova        [dstq+strideq*2+16], m5
      mova        [dstq+stride3q], m6
      mova        [dstq+stride3q+16], m1
      lea         dstq, [dstq+strideq*4]

      ; ---- rows 8-15: low = window of m4|m6, high = window of m6|m1 ----
      palignr     m5, m1, m6, 14
      palignr     m3, m6, m4, 14
      mova        [dstq], m3
      mova        [dstq+16], m5
      palignr     m5, m1, m6, 12
      palignr     m3, m6, m4, 12
      mova        [dstq+strideq], m3
      mova        [dstq+strideq+16], m5
      palignr     m5, m1, m6, 10
      palignr     m3, m6, m4, 10
      mova        [dstq+strideq*2], m3
      mova        [dstq+strideq*2+16], m5
      palignr     m5, m1, m6, 8
      palignr     m3, m6, m4, 8
      mova        [dstq+stride3q], m3
      mova        [dstq+stride3q+16], m5
      lea         dstq, [dstq+strideq*4]
      palignr     m5, m1, m6, 6
      palignr     m3, m6, m4, 6
      mova        [dstq], m3
      mova        [dstq+16], m5
      palignr     m5, m1, m6, 4
      palignr     m3, m6, m4, 4
      mova        [dstq+strideq], m3
      mova        [dstq+strideq+16], m5
      palignr     m5, m1, m6, 2
      palignr     m3, m6, m4, 2
      mova        [dstq+strideq*2], m3
      mova        [dstq+strideq*2+16], m5
      mova        [dstq+stride3q], m4
      mova        [dstq+stride3q+16], m6
      lea         dstq, [dstq+strideq*4]

      ; ---- left rows 16-31: (A, B) pairs -> m3 (rows 16-23), m2 (24-31) ----
      mova        m7, [leftq]                           ; left[0..15]
      mova        m3, [leftq+16]                        ; left[16..31]
      palignr     m5, m3, m7, 15                        ; left[15..30]
      palignr     m0, m3, m7, 14                        ; left[14..29]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2      ; B for rows 16-31
      pavgb       m5, m3                                ; A for rows 16-31
      punpcklbw   m3, m2, m5                            ; rows 16-23, (B, A) byte pairs
      punpckhbw   m2, m5                                ; rows 24-31, (B, A) byte pairs
      pshufb      m3, [GLOBAL(sh_bfedcba9876543210)]
      pshufb      m2, [GLOBAL(sh_bfedcba9876543210)]

      ; ---- rows 16-23: low = window of m3|m4, high = window of m4|m6 ----
      palignr     m7, m6, m4, 14
      palignr     m0, m4, m3, 14
      mova        [dstq], m0
      mova        [dstq+16], m7
      palignr     m7, m6, m4, 12
      palignr     m0, m4, m3, 12
      mova        [dstq+strideq], m0
      mova        [dstq+strideq+16], m7
      palignr     m7, m6, m4, 10
      palignr     m0, m4, m3, 10
      mova        [dstq+strideq*2], m0
      mova        [dstq+strideq*2+16], m7
      palignr     m7, m6, m4, 8
      palignr     m0, m4, m3, 8
      mova        [dstq+stride3q], m0
      mova        [dstq+stride3q+16], m7
      lea         dstq, [dstq+strideq*4]
      palignr     m7, m6, m4, 6
      palignr     m0, m4, m3, 6
      mova        [dstq], m0
      mova        [dstq+16], m7
      palignr     m7, m6, m4, 4
      palignr     m0, m4, m3, 4
      mova        [dstq+strideq], m0
      mova        [dstq+strideq+16], m7
      palignr     m7, m6, m4, 2
      palignr     m0, m4, m3, 2
      mova        [dstq+strideq*2], m0
      mova        [dstq+strideq*2+16], m7
      mova        [dstq+stride3q], m3
      mova        [dstq+stride3q+16], m4
      lea         dstq, [dstq+strideq*4]

      ; ---- rows 24-31: low = window of m2|m3, high = window of m3|m4 ----
      palignr     m7, m4, m3, 14
      palignr     m0, m3, m2, 14
      mova        [dstq], m0
      mova        [dstq+16], m7
      palignr     m7, m4, m3, 12
      palignr     m0, m3, m2, 12
      mova        [dstq+strideq], m0
      mova        [dstq+strideq+16], m7
      palignr     m7, m4, m3, 10
      palignr     m0, m3, m2, 10
      mova        [dstq+strideq*2], m0
      mova        [dstq+strideq*2+16], m7
      palignr     m7, m4, m3, 8
      palignr     m0, m3, m2, 8
      mova        [dstq+stride3q], m0
      mova        [dstq+stride3q+16], m7
      lea         dstq, [dstq+strideq*4]
      palignr     m7, m4, m3, 6
      palignr     m0, m3, m2, 6
      mova        [dstq], m0
      mova        [dstq+16], m7
      palignr     m7, m4, m3, 4
      palignr     m0, m3, m2, 4
      mova        [dstq+strideq], m0
      mova        [dstq+strideq+16], m7
      palignr     m7, m4, m3, 2
      palignr     m0, m3, m2, 2
      mova        [dstq+strideq*2], m0
      mova        [dstq+strideq*2+16], m7
      mova        [dstq+stride3q], m2
      mova        [dstq+stride3q+16], m3

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d207_predictor_8x8(dst, stride, <unused>, left)
  ;
  ; Only [leftq] is read (8 bytes, called a..h below).  The third argument is
  ; never read; its register slot is named stride3 and holds 3*stride.
  ;
  ; With the last pixel h repeated past the end, the code builds
  ;   2-tap  avg(l[i], l[i+1])                       ("ab", "bc", ...)
  ;   3-tap  (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2     ("a2bc", "b2cd", ...)
  ; and interleaves them into (2-tap, 3-tap) byte pairs.  Each output row is
  ; the previous one advanced by one pair; the pair produced for the last
  ; pixel is (h, h) and is what gets repeated in the lower rows.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
      GET_GOT     goffsetq

      lea         stride3q, [strideq*3]
      movq        m3, [leftq]                               ; a b c d e f g h
      pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]    ; l[i],   h repeated
      pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]    ; l[i+1]
      pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]    ; l[i+2]

      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3          ; m3 = 3-tap
      pavgb       m0, m2                                    ; m0 = 2-tap
      punpcklbw   m0, m3                                    ; (2-tap, 3-tap) pairs

      ; rows 0-3
      movq        [dstq], m0
      psrldq      m0, 2
      movq        [dstq+strideq], m0
      psrldq      m0, 2
      movq        [dstq+strideq*2], m0
      psrldq      m0, 2
      movq        [dstq+stride3q], m0
      lea         dstq, [dstq+strideq*4]

      ; Refill the upper four words with the final pair so the remaining
      ; byte shifts bring in that pair instead of zeros.
      pshufhw     m0, m0, q0000

      ; rows 4-7
      psrldq      m0, 2
      movq        [dstq], m0
      psrldq      m0, 2
      movq        [dstq+strideq], m0
      psrldq      m0, 2
      movq        [dstq+strideq*2], m0
      psrldq      m0, 2
      movq        [dstq+stride3q], m0

      RESTORE_GOT
      RET
  ; ---------------------------------------------------------------------------
  ; d207_predictor_16x16(dst, stride, <unused>, left)
  ;
  ; Only [leftq] is read (16 bytes, a..p).  The third argument is never read;
  ; its register slot is named stride3 and holds 3*stride.
  ;
  ; Builds 16 (2-tap, 3-tap) byte pairs from the left column, with the last
  ; pixel p repeated past the end:
  ;   m1 = pairs 0-7      m4 = pairs 8-15
  ; Row r is the 16-byte window starting at pair r.  Rows 0-7 are palignr
  ; windows of m4 | m1; rows 8-15 start at m4 and advance by one pair per
  ; row, repeating the final byte.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
      GET_GOT     goffsetq

      mova        m0, [leftq]                               ; a b c .. p
      lea         stride3q, [strideq*3]
      pshufb      m1, m0, [GLOBAL(sh_b123456789abcdeff)]    ; b c d .. p p
      pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]    ; c d e .. p p p

      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3          ; m3 = 3-tap
      pavgb       m1, m0                                    ; m1 = 2-tap: ab bc .. op pp
      punpckhbw   m4, m1, m3                                ; pairs 8-15
      punpcklbw   m1, m3                                    ; pairs 0-7

      ; rows 0-7
      mova        [dstq], m1
      palignr     m3, m4, m1, 2
      mova        [dstq+strideq], m3
      palignr     m3, m4, m1, 4
      mova        [dstq+strideq*2], m3
      palignr     m3, m4, m1, 6
      mova        [dstq+stride3q], m3
      lea         dstq, [dstq+strideq*4]
      palignr     m3, m4, m1, 8
      mova        [dstq], m3
      palignr     m3, m4, m1, 10
      mova        [dstq+strideq], m3
      palignr     m3, m4, m1, 12
      mova        [dstq+strideq*2], m3
      palignr     m3, m4, m1, 14
      mova        [dstq+stride3q], m3

      ; rows 8-15: 'left' has been consumed; its slot becomes the counter
      DEFINE_ARGS dst, stride, stride3, line
      mov         lined, 2
      mova        m0, [GLOBAL(sh_b23456789abcdefff)]        ; advance-by-one-pair mask
  .four_rows:
      lea         dstq, [dstq+strideq*4]
      mova        [dstq], m4
      pshufb      m4, m0
      mova        [dstq+strideq], m4
      pshufb      m4, m0
      mova        [dstq+strideq*2], m4
      pshufb      m4, m0
      mova        [dstq+stride3q], m4
      pshufb      m4, m0
      dec         lined
      jnz .four_rows

      RESTORE_GOT
      REP_RET
  ; ---------------------------------------------------------------------------
  ; d207_predictor_32x32(dst, stride, <unused>, left)
  ;
  ; 32-wide version of the d207 predictor.  Only [leftq] is read (32 bytes);
  ; the third argument is never read and its register slot holds 3*stride.
  ;
  ; Builds 32 (2-tap, 3-tap) byte pairs from the left column, with the last
  ; pixel repeated past the end, in four registers:
  ;   m1 = pairs 0-7   m6 = pairs 8-15   m2 = pairs 16-23   m7 = pairs 24-31
  ; Row r is the 32-byte window starting at pair r of m1 | m6 | m2 | m7.
  ;
  ; The left half of row r+8 is identical to the right half of row r, so each
  ; computed 16-byte window is stored twice: at [dstq + 16] and at [dst8q],
  ; where dst8q stays eight rows below dstq.
  ; ---------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
      GET_GOT     goffsetq

      mova        m1, [leftq]                               ; left[0..15]
      mova        m2, [leftq+16]                            ; left[16..31]
      lea         stride3q, [strideq*3]

      ; high 16 pixels
      pshufb      m0, m2, [GLOBAL(sh_b23456789abcdefff)]    ; l[i+2]
      pshufb      m4, m2, [GLOBAL(sh_b123456789abcdeff)]    ; l[i+1]
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3          ; m3 = 3-tap high

      ; low 16 pixels: neighbours come from across the 16-byte boundary, so
      ; they are taken before m2 is overwritten
      palignr     m6, m2, m1, 1                             ; left[1..16]
      palignr     m5, m2, m1, 2                             ; left[2..17]
      pavgb       m2, m4                                    ; m2 = 2-tap high
      X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0          ; m0 = 3-tap low
      pavgb       m1, m6                                    ; m1 = 2-tap low

      punpckhbw   m6, m1, m0                                ; pairs 8-15
      punpcklbw   m1, m0                                    ; pairs 0-7
      punpckhbw   m7, m2, m3                                ; pairs 24-31
      punpcklbw   m2, m3                                    ; pairs 16-23

      ; 'left' has been consumed; its slot is now named dst8
      DEFINE_ARGS dst, stride, stride3, dst8
      lea         dst8q, [dstq+strideq*8]

      ; ---- rows 0-7 in full, plus the left half of rows 8-15 ----
      mova        [dstq], m1
      mova        [dstq+16], m6
      mova        [dst8q], m6
      palignr     m0, m6, m1, 2
      palignr     m4, m2, m6, 2
      mova        [dstq+strideq], m0
      mova        [dstq+strideq+16], m4
      mova        [dst8q+strideq], m4
      palignr     m0, m6, m1, 4
      palignr     m4, m2, m6, 4
      mova        [dstq+strideq*2], m0
      mova        [dstq+strideq*2+16], m4
      mova        [dst8q+strideq*2], m4
      palignr     m0, m6, m1, 6
      palignr     m4, m2, m6, 6
      mova        [dstq+stride3q], m0
      mova        [dstq+stride3q+16], m4
      mova        [dst8q+stride3q], m4
      lea         dstq, [dstq+strideq*4]
      lea         dst8q, [dst8q+strideq*4]
      palignr     m0, m6, m1, 8
      palignr     m4, m2, m6, 8
      mova        [dstq], m0
      mova        [dstq+16], m4
      mova        [dst8q], m4
      palignr     m0, m6, m1, 10
      palignr     m4, m2, m6, 10
      mova        [dstq+strideq], m0
      mova        [dstq+strideq+16], m4
      mova        [dst8q+strideq], m4
      palignr     m0, m6, m1, 12
      palignr     m4, m2, m6, 12
      mova        [dstq+strideq*2], m0
      mova        [dstq+strideq*2+16], m4
      mova        [dst8q+strideq*2], m4
      palignr     m0, m6, m1, 14
      palignr     m4, m2, m6, 14
      mova        [dstq+stride3q], m0
      mova        [dstq+stride3q+16], m4
      mova        [dst8q+stride3q], m4
      lea         dstq, [dstq+strideq*4]
      lea         dst8q, [dst8q+strideq*4]

      ; ---- right half of rows 8-15, left half of rows 16-23 ----
      mova        [dstq+16], m2
      mova        [dst8q], m2
      palignr     m4, m7, m2, 2
      mova        [dstq+strideq+16], m4
      mova        [dst8q+strideq], m4
      palignr     m4, m7, m2, 4
      mova        [dstq+strideq*2+16], m4
      mova        [dst8q+strideq*2], m4
      palignr     m4, m7, m2, 6
      mova        [dstq+stride3q+16], m4
      mova        [dst8q+stride3q], m4
      lea         dstq, [dstq+strideq*4]
      lea         dst8q, [dst8q+strideq*4]
      palignr     m4, m7, m2, 8
      mova        [dstq+16], m4
      mova        [dst8q], m4
      palignr     m4, m7, m2, 10
      mova        [dstq+strideq+16], m4
      mova        [dst8q+strideq], m4
      palignr     m4, m7, m2, 12
      mova        [dstq+strideq*2+16], m4
      mova        [dst8q+strideq*2], m4
      palignr     m4, m7, m2, 14
      mova        [dstq+stride3q+16], m4
      mova        [dst8q+stride3q], m4
      lea         dstq, [dstq+strideq*4]
      lea         dst8q, [dst8q+strideq*4]

      ; ---- right half of rows 16-23, left half of rows 24-31 ----
      ; Past pair 31 there is nothing left to shift in: advance m7 by one
      ; pair per row, repeating its final byte.
      mova        m0, [GLOBAL(sh_b23456789abcdefff)]
      mova        [dstq+16], m7
      mova        [dst8q], m7
      pshufb      m7, m0
      mova        [dstq+strideq+16], m7
      mova        [dst8q+strideq], m7
      pshufb      m7, m0
      mova        [dstq+strideq*2+16], m7
      mova        [dst8q+strideq*2], m7
      pshufb      m7, m0
      mova        [dstq+stride3q+16], m7
      mova        [dst8q+stride3q], m7
      pshufb      m7, m0
      lea         dstq, [dstq+strideq*4]
      lea         dst8q, [dst8q+strideq*4]
      mova        [dstq+16], m7
      mova        [dst8q], m7
      pshufb      m7, m0
      mova        [dstq+strideq+16], m7
      mova        [dst8q+strideq], m7
      pshufb      m7, m0
      mova        [dstq+strideq*2+16], m7
      mova        [dst8q+strideq*2], m7
      pshufb      m7, m0
      mova        [dstq+stride3q+16], m7
      mova        [dst8q+stride3q], m7
      pshufb      m7, m0
      lea         dstq, [dstq+strideq*4]

      ; ---- right half of rows 24-31: m7 is now one repeated byte ----
      mova        [dstq+16], m7
      mova        [dstq+strideq+16], m7
      mova        [dstq+strideq*2+16], m7
      mova        [dstq+stride3q+16], m7
      lea         dstq, [dstq+strideq*4]
      mova        [dstq+16], m7
      mova        [dstq+strideq+16], m7
      mova        [dstq+strideq*2+16], m7
      mova        [dstq+stride3q+16], m7

      RESTORE_GOT
      RET