; intrapred_sse2.asm
  ;
  ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ;
  ; Use of this source code is governed by a BSD-style license
  ; that can be found in the LICENSE file in the root of the source
  ; tree. An additional intellectual property rights grant can be found
  ; in the file PATENTS. All contributing project authors may
  ; be found in the AUTHORS file in the root of the source tree.
  ;
  10. %include "third_party/x86inc/x86inc.asm"
  ; ---------------------------------------------------------------------------
  ; Read-only constants. Every table is 16 bytes long (one full XMM register).
  ; ---------------------------------------------------------------------------
  SECTION_RODATA

  ; Byte mask with the low bit of each lane set (used by the 3-tap filter macro).
  pb_1:    times 16 db 1

  ; Rounding terms added before the right shift in dc_predictor_NxN
  ; (above + left are summed, i.e. 2*N pixels, so the rounding term is N).
  pw_4:    times 8 dw 4
  pw_8:    times 8 dw 8
  pw_16:   times 8 dw 16
  pw_32:   times 8 dw 32

  ; Constant fill value stored by the dc_128 predictors.
  dc_128:  times 16 db 128

  ; Rounding terms used by dc_top/dc_left_predictor_NxN
  ; (only N pixels are summed, so the rounding term is N/2).
  pw2_4:   times 8 dw 2
  pw2_8:   times 8 dw 4
  pw2_16:  times 8 dw 8
  pw2_32:  times 8 dw 16

  SECTION .text
  ; ---------------------------------------------------------------------------
  ; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2  x, y, z, result
  ;
  ; Per-byte 3-tap filter, result = (x + 2*y + z + 2) >> 2, evaluated entirely
  ; in byte lanes (trick from pascal):
  ;
  ;   result  = avg(x, z)          ; pavgb rounds up: (x + z + 1) >> 1
  ;   result -= xor(x, z) & 1      ; cancel the round-up -> (x + z) >> 1
  ;   result  = avg(result, y)     ; (((x + z) >> 1) + y + 1) >> 1
  ;
  ; Operands:
  ;   %1 = x       read only
  ;   %2 = y       read only
  ;   %3 = z       CLOBBERED: holds (x ^ z) & 1 on exit
  ;   %4 = result  written
  ;
  ; The pb_1 mask is fetched through GLOBAL().
  ; ---------------------------------------------------------------------------
  %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
    pavgb         %4, %1, %3
    pxor          %3, %1
    pand          %3, [GLOBAL(pb_1)]
    psubb         %4, %3
    pavgb         %4, %2
  %endmacro
  ; ---------------------------------------------------------------------------
  ; d45_predictor_4x4 (dst, stride, above)
  ;
  ; Let a[0..7] be the 8 bytes read from above and
  ;   f[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2.
  ; Row r of the 4x4 block receives f[r .. r+3]; afterwards the last pixel of
  ; row 3 is overwritten with a[7].
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
    GET_GOT       goffsetq

    movq          m0, [aboveq]              ; a0 a1 ... a7
    DEFINE_ARGS dst, stride, tail           ; above is no longer read; reuse its register
    psrldq        m1, m0, 1                 ; a1 a2 ... a7 0
    psrldq        m2, m0, 2                 ; a2 a3 ... a7 0 0
    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; m3 = f0 f1 f2 ...

    ; rows 0 and 1
    movd          [dstq], m3
    psrlq         m3, 8                     ; advance by one pixel
    movd          [dstq+strideq], m3
    lea           dstq, [dstq+strideq*2]

    ; rows 2 and 3
    psrlq         m3, 8
    movd          [dstq], m3
    psrlq         m3, 8
    movd          [dstq+strideq], m3

    ; bottom-right pixel <- a7
    psrlq         m0, 56
    movd          taild, m0
    mov           [dstq+strideq+3], tailb

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; d45_predictor_8x8 (dst, stride, above)
  ;
  ; Reads 16 bytes a[0..15] from above (unaligned load) and forms
  ;   f[i] = (a[i-1] + 2*a[i] + a[i+1] + 2) >> 2      (byte 0 uses a[-1] = 0)
  ; The working vector is f[0..7] followed by eight copies of a[7]; row r of
  ; the block is bytes r+1 .. r+8 of that vector.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
    GET_GOT       goffsetq

    movu          m1, [aboveq]              ; y = a0 .. a15
    pslldq        m0, m1, 1                 ; x = 0 a0 .. a14
    psrldq        m2, m1, 1                 ; z = a1 .. a15 0
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3

    ; Broadcast a7 (byte 8 of the shifted copy in m0) over 8 bytes.
    punpckhbw     m0, m0                    ; 7 7
    punpcklwd     m0, m0                    ; 7 7 7 7
    punpckldq     m0, m0                    ; 7 7 7 7 7 7 7 7
    punpcklqdq    m3, m0                    ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7

    ; rows 0-3: shift one byte further for every row
    psrldq        m3, 1
    movq          [dstq], m3
    psrldq        m3, 1
    movq          [dstq+strideq], m3
    psrldq        m3, 1
    movq          [dstq+strideq*2], m3
    psrldq        m3, 1
    movq          [dstq+stride3q], m3
    lea           dstq, [dstq+strideq*4]

    ; rows 4-7
    psrldq        m3, 1
    movq          [dstq], m3
    psrldq        m3, 1
    movq          [dstq+strideq], m3
    psrldq        m3, 1
    movq          [dstq+strideq*2], m3
    psrldq        m3, 1
    movq          [dstq+stride3q], m3

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; d207_predictor_4x4 (dst, stride, unused, left)
  ;
  ; With left = a b c d, interleaves the 2-tap averages (ab, bc, cd, d) with
  ; the 3-tap values (a2bc, b2cd, c3d, d). Rows 0-2 take four bytes of that
  ; sequence starting at offsets 0, 2 and 4; row 3 is d d d d.
  ; The third argument is never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
    GET_GOT       goffsetq

    movd          m0, [leftq]               ; abcd [byte]

    ; m4 <- dddd, m0 <- abcd dddd
    punpcklbw     m4, m0, m0                ; aabb ccdd
    punpcklwd     m4, m4                    ; aaaa bbbb cccc dddd
    psrldq        m4, 12                    ; dddd
    punpckldq     m0, m4                    ; abcd dddd

    psrldq        m1, m0, 1                 ; bcdd
    psrldq        m2, m0, 2                 ; cddd
    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; a2bc b2cd c3d d
    pavgb         m1, m0                    ; ab, bc, cd, d [byte]
    punpcklbw     m1, m3                    ; ab, a2bc, bc, b2cd, cd, c3d, d, d

    movd          [dstq], m1
    psrlq         m1, 16                    ; bc, b2cd, cd, c3d, d, d
    movd          [dstq+strideq], m1
    lea           dstq, [dstq+strideq*2]
    psrlq         m1, 16                    ; cd, c3d, d, d
    movd          [dstq], m1
    movd          [dstq+strideq], m4        ; d, d, d, d

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_predictor_4x4 (dst, stride, above, left)
  ;
  ; Fills the 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    movd          m2, [leftq]
    movd          m0, [aboveq]
    pxor          m1, m1
    punpckldq     m0, m2                    ; above[0..3] | left[0..3]
    psadbw        m0, m1                    ; SAD against zero = sum of the 8 bytes
    paddw         m0, [GLOBAL(pw_4)]
    psraw         m0, 3
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    packuswb      m0, m0                    ; words -> bytes

    movd          [dstq], m0
    movd          [dstq+strideq], m0
    lea           dstq, [dstq+strideq*2]
    movd          [dstq], m0
    movd          [dstq+strideq], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_left_predictor_4x4 (dst, stride, above, left)
  ;
  ; Fills the 4x4 block with (sum(left[0..3]) + 2) >> 2. above is never read.
  ; cglobal is asked to load only the first two arguments, so left is fetched
  ; explicitly with movifnidn.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
    movifnidn     leftq, leftmp
    GET_GOT       goffsetq

    pxor          m1, m1
    movd          m0, [leftq]
    psadbw        m0, m1                    ; sum of the 4 left bytes
    paddw         m0, [GLOBAL(pw2_4)]
    psraw         m0, 2
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    packuswb      m0, m0                    ; words -> bytes

    movd          [dstq], m0
    movd          [dstq+strideq], m0
    lea           dstq, [dstq+strideq*2]
    movd          [dstq], m0
    movd          [dstq+strideq], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_top_predictor_4x4 (dst, stride, above, left)
  ;
  ; Fills the 4x4 block with (sum(above[0..3]) + 2) >> 2. left is never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    movd          m0, [aboveq]
    psadbw        m0, m1                    ; sum of the 4 above bytes
    paddw         m0, [GLOBAL(pw2_4)]
    psraw         m0, 2
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    packuswb      m0, m0                    ; words -> bytes

    movd          [dstq], m0
    movd          [dstq+strideq], m0
    lea           dstq, [dstq+strideq*2]
    movd          [dstq], m0
    movd          [dstq+strideq], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_predictor_8x8 (dst, stride, above, left)
  ;
  ; Fills the 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    movq          m0, [aboveq]
    movq          m2, [leftq]
    DEFINE_ARGS dst, stride, stride3        ; above/left already loaded
    lea           stride3q, [strideq*3]
    psadbw        m0, m1                    ; sum(above)
    psadbw        m2, m1                    ; sum(left)
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw_8)]
    psraw         m0, 4
    punpcklbw     m0, m0                    ; duplicate the DC byte into a word
    pshuflw       m0, m0, 0x0               ; ... and that word across 8 bytes

    ; rows 0-3
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    ; rows 4-7
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_top_predictor_8x8 (dst, stride, above, left)
  ;
  ; Fills the 8x8 block with (sum(above[0..7]) + 4) >> 3. left is never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    movq          m0, [aboveq]
    DEFINE_ARGS dst, stride, stride3        ; above already loaded
    lea           stride3q, [strideq*3]
    psadbw        m0, m1                    ; sum(above)
    paddw         m0, [GLOBAL(pw2_8)]
    psraw         m0, 3
    punpcklbw     m0, m0                    ; duplicate the DC byte into a word
    pshuflw       m0, m0, 0x0               ; ... and that word across 8 bytes

    ; rows 0-3
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    ; rows 4-7
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_left_predictor_8x8 (dst, stride, above, left)
  ;
  ; Fills the 8x8 block with (sum(left[0..7]) + 4) >> 3. above is never read.
  ; cglobal is asked to load only the first two arguments, so left is fetched
  ; explicitly with movifnidn.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
    movifnidn     leftq, leftmp
    GET_GOT       goffsetq

    pxor          m1, m1
    movq          m0, [leftq]
    DEFINE_ARGS dst, stride, stride3        ; left already loaded
    lea           stride3q, [strideq*3]
    psadbw        m0, m1                    ; sum(left)
    paddw         m0, [GLOBAL(pw2_8)]
    psraw         m0, 3
    punpcklbw     m0, m0                    ; duplicate the DC byte into a word
    pshuflw       m0, m0, 0x0               ; ... and that word across 8 bytes

    ; rows 0-3
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    ; rows 4-7
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_128_predictor_4x4 (dst, stride, above, left)
  ;
  ; Fills the 4x4 block with the constant 128. above and left are never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
    movd          m0, [GLOBAL(dc_128)]      ; four bytes of 128

    movd          [dstq], m0
    movd          [dstq+strideq], m0
    movd          [dstq+strideq*2], m0
    movd          [dstq+stride3q], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_128_predictor_8x8 (dst, stride, above, left)
  ;
  ; Fills the 8x8 block with the constant 128. above and left are never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
    movq          m0, [GLOBAL(dc_128)]      ; eight bytes of 128

    ; rows 0-3
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    ; rows 4-7
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0

    RESTORE_GOT
    RET
  ; ---------------------------------------------------------------------------
  ; dc_predictor_16x16 (dst, stride, above, left)
  ;
  ; Fills the 16x16 block with
  ;   (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
  ; above, left and every dst row are accessed with aligned (mova) moves.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    mova          m0, [aboveq]
    mova          m2, [leftq]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above/left registers are reused
    lea           stride3q, [strideq*3]
    mov           rows4d, 4                 ; 4 iterations x 4 rows
    psadbw        m0, m1                    ; two partial sums (one per qword)
    psadbw        m2, m1
    paddw         m0, m2
    movhlps       m2, m0                    ; fold the high partial sum down
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw_16)]
    psraw         m0, 5
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    punpcklqdq    m0, m0
    packuswb      m0, m0                    ; 16 bytes of DC

  .store4:
    mova          [dstq], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .store4

    RESTORE_GOT
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_top_predictor_16x16 (dst, stride, above, left)
  ;
  ; Fills the 16x16 block with (sum(above[0..15]) + 8) >> 4.
  ; left is loaded by cglobal but never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    mova          m0, [aboveq]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above/left registers are reused
    lea           stride3q, [strideq*3]
    mov           rows4d, 4                 ; 4 iterations x 4 rows
    psadbw        m0, m1                    ; two partial sums (one per qword)
    movhlps       m2, m0                    ; fold the high partial sum down
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw2_16)]
    psraw         m0, 4
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    punpcklqdq    m0, m0
    packuswb      m0, m0                    ; 16 bytes of DC

  .store4:
    mova          [dstq], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .store4

    RESTORE_GOT
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_left_predictor_16x16 (dst, stride, above, left)
  ;
  ; Fills the 16x16 block with (sum(left[0..15]) + 8) >> 4.
  ; above is loaded by cglobal but never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    mova          m0, [leftq]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above/left registers are reused
    lea           stride3q, [strideq*3]
    mov           rows4d, 4                 ; 4 iterations x 4 rows
    psadbw        m0, m1                    ; two partial sums (one per qword)
    movhlps       m2, m0                    ; fold the high partial sum down
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw2_16)]
    psraw         m0, 4
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    punpcklqdq    m0, m0
    packuswb      m0, m0                    ; 16 bytes of DC

  .store4:
    mova          [dstq], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .store4

    RESTORE_GOT
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_128_predictor_16x16 (dst, stride, above, left)
  ;
  ; Fills the 16x16 block with the constant 128. above and left are never
  ; read, so only dst and stride are loaded and a single XMM register is
  ; declared (same register budget as dc_128_predictor_4x4 / _8x8).
  ; Every dst row is written with an aligned (mova) store.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea           stride3q, [strideq*3]
    mov           lines4d, 4                ; 4 iterations x 4 rows
    mova          m0, [GLOBAL(dc_128)]      ; 16 bytes of 128

  .loop:
    mova          [dstq], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    dec           lines4d
    jnz           .loop

    RESTORE_GOT
    ; The return may directly follow the loop branch, so use REP_RET like the
    ; other looping predictors in this file.
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_predictor_32x32 (dst, stride, above, left)
  ;
  ; Fills the 32x32 block with
  ;   (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
  ; above, left and every dst row are accessed with aligned (mova) moves.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    mova          m0, [aboveq]
    mova          m2, [aboveq+16]
    mova          m3, [leftq]
    mova          m4, [leftq+16]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above/left registers are reused
    lea           stride3q, [strideq*3]
    mov           rows4d, 8                 ; 8 iterations x 4 rows
    psadbw        m0, m1                    ; partial sums, two per register
    psadbw        m2, m1
    psadbw        m3, m1
    psadbw        m4, m1
    paddw         m0, m2
    paddw         m0, m3
    paddw         m0, m4
    movhlps       m2, m0                    ; fold the high partial sum down
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw_32)]
    psraw         m0, 6
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    punpcklqdq    m0, m0
    packuswb      m0, m0                    ; 16 bytes of DC

  .store4:
    mova          [dstq], m0
    mova          [dstq+16], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq+16], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+strideq*2+16], m0
    mova          [dstq+stride3q], m0
    mova          [dstq+stride3q+16], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .store4

    RESTORE_GOT
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_top_predictor_32x32 (dst, stride, above, left)
  ;
  ; Fills the 32x32 block with (sum(above[0..31]) + 16) >> 5.
  ; left is loaded by cglobal but never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    mova          m0, [aboveq]
    mova          m2, [aboveq+16]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above/left registers are reused
    lea           stride3q, [strideq*3]
    mov           rows4d, 8                 ; 8 iterations x 4 rows
    psadbw        m0, m1                    ; partial sums, two per register
    psadbw        m2, m1
    paddw         m0, m2
    movhlps       m2, m0                    ; fold the high partial sum down
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw2_32)]
    psraw         m0, 5
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    punpcklqdq    m0, m0
    packuswb      m0, m0                    ; 16 bytes of DC

  .store4:
    mova          [dstq], m0
    mova          [dstq+16], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq+16], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+strideq*2+16], m0
    mova          [dstq+stride3q], m0
    mova          [dstq+stride3q+16], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .store4

    RESTORE_GOT
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_left_predictor_32x32 (dst, stride, above, left)
  ;
  ; Fills the 32x32 block with (sum(left[0..31]) + 16) >> 5.
  ; above is loaded by cglobal but never read.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    pxor          m1, m1
    mova          m0, [leftq]
    mova          m2, [leftq+16]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above/left registers are reused
    lea           stride3q, [strideq*3]
    mov           rows4d, 8                 ; 8 iterations x 4 rows
    psadbw        m0, m1                    ; partial sums, two per register
    psadbw        m2, m1
    paddw         m0, m2
    movhlps       m2, m0                    ; fold the high partial sum down
    paddw         m0, m2
    paddw         m0, [GLOBAL(pw2_32)]
    psraw         m0, 5
    pshuflw       m0, m0, 0x0               ; replicate the DC word
    punpcklqdq    m0, m0
    packuswb      m0, m0                    ; 16 bytes of DC

  .store4:
    mova          [dstq], m0
    mova          [dstq+16], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq+16], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+strideq*2+16], m0
    mova          [dstq+stride3q], m0
    mova          [dstq+stride3q+16], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .store4

    RESTORE_GOT
    REP_RET
  ; ---------------------------------------------------------------------------
  ; dc_128_predictor_32x32 (dst, stride, above, left)
  ;
  ; Fills the 32x32 block with the constant 128. above and left are never
  ; read, so only dst and stride are loaded and a single XMM register is
  ; declared (same register budget as dc_128_predictor_4x4 / _8x8).
  ; Every dst row is written with two aligned (mova) stores.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal dc_128_predictor_32x32, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT       goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea           stride3q, [strideq*3]
    mov           lines4d, 8                ; 8 iterations x 4 rows
    mova          m0, [GLOBAL(dc_128)]      ; 16 bytes of 128

  .loop:
    mova          [dstq], m0
    mova          [dstq+16], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq+16], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+strideq*2+16], m0
    mova          [dstq+stride3q], m0
    mova          [dstq+stride3q+16], m0
    lea           dstq, [dstq+strideq*4]
    dec           lines4d
    jnz           .loop

    RESTORE_GOT
    ; The return may directly follow the loop branch, so use REP_RET like the
    ; other looping predictors in this file.
    REP_RET
  ; ---------------------------------------------------------------------------
  ; v_predictor_4x4 (dst, stride, above)
  ;
  ; Copies above[0..3] into each of the 4 rows of dst.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd          m0, [aboveq]

    movd          [dstq], m0
    movd          [dstq+strideq], m0
    lea           dstq, [dstq+strideq*2]
    movd          [dstq], m0
    movd          [dstq+strideq], m0
    RET
  ; ---------------------------------------------------------------------------
  ; v_predictor_8x8 (dst, stride, above)
  ;
  ; Copies above[0..7] into each of the 8 rows of dst.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq          m0, [aboveq]
    DEFINE_ARGS dst, stride, stride3        ; above already loaded
    lea           stride3q, [strideq*3]

    ; rows 0-3
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    ; rows 4-7
    movq          [dstq], m0
    movq          [dstq+strideq], m0
    movq          [dstq+strideq*2], m0
    movq          [dstq+stride3q], m0
    RET
  ; ---------------------------------------------------------------------------
  ; v_predictor_16x16 (dst, stride, above)
  ;
  ; Copies above[0..15] into each of the 16 rows of dst.
  ; above and every dst row are accessed with aligned (mova) moves.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova          m0, [aboveq]
    DEFINE_ARGS dst, stride, stride3, rows4 ; above already loaded
    lea           stride3q, [strideq*3]
    mov           rows4d, 4                 ; 4 iterations x 4 rows

  .copy4:
    mova          [dstq], m0
    mova          [dstq+strideq], m0
    mova          [dstq+strideq*2], m0
    mova          [dstq+stride3q], m0
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .copy4
    REP_RET
  ; ---------------------------------------------------------------------------
  ; v_predictor_32x32 (dst, stride, above)
  ;
  ; Copies above[0..31] into each of the 32 rows of dst.
  ; above and every dst row are accessed with aligned (mova) moves.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova          m0, [aboveq]              ; left  half of the row
    mova          m1, [aboveq+16]           ; right half of the row
    DEFINE_ARGS dst, stride, stride3, rows4 ; above already loaded
    lea           stride3q, [strideq*3]
    mov           rows4d, 8                 ; 8 iterations x 4 rows

  .copy4:
    mova          [dstq], m0
    mova          [dstq+16], m1
    mova          [dstq+strideq], m0
    mova          [dstq+strideq+16], m1
    mova          [dstq+strideq*2], m0
    mova          [dstq+strideq*2+16], m1
    mova          [dstq+stride3q], m0
    mova          [dstq+stride3q+16], m1
    lea           dstq, [dstq+strideq*4]
    dec           rows4d
    jnz           .copy4
    REP_RET
  ; ---------------------------------------------------------------------------
  ; h_predictor_4x4 (dst, stride, line, left)
  ;
  ; Row r of the 4x4 block is filled with left[r]. The third argument is
  ; never read. cglobal loads only the first two arguments, so left is
  ; fetched explicitly with movifnidn.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
    movifnidn     leftq, leftmp
    movd          m0, [leftq]               ; l1 l2 l3 l4
    punpcklbw     m0, m0                    ; each byte x2
    punpcklbw     m0, m0                    ; each byte x4: one dword per row

    pshufd        m1, m0, 0x1               ; dword 0 <- l2 x4
    movd          [dstq], m0
    movd          [dstq+strideq], m1
    pshufd        m2, m0, 0x2               ; dword 0 <- l3 x4
    lea           dstq, [dstq+strideq*2]
    pshufd        m3, m0, 0x3               ; dword 0 <- l4 x4
    movd          [dstq], m2
    movd          [dstq+strideq], m3
    RET
  ; ---------------------------------------------------------------------------
  ; h_predictor_8x8 (dst, stride, line, left)
  ;
  ; Row r of the 8x8 block is filled with left[r]. The register of the third
  ; argument serves as a loop counter running from -2 up to 0; each iteration
  ; writes 4 rows.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
    movifnidn     leftq, leftmp
    mov           lineq, -2
    DEFINE_ARGS dst, stride, line, left, stride3
    lea           stride3q, [strideq*3]
    movq          m0, [leftq]
    punpcklbw     m0, m0                    ; l1 l1 l2 l2 ... l8 l8

  .next4:
    pshuflw       m1, m0, 0x0               ; l1 l1 l1 l1 l1 l1 l1 l1
    pshuflw       m2, m0, 0x55              ; l2 l2 l2 l2 l2 l2 l2 l2
    movq          [dstq], m1
    movq          [dstq+strideq], m2
    pshuflw       m1, m0, 0xaa              ; l3 x8
    pshuflw       m2, m0, 0xff              ; l4 x8
    movq          [dstq+strideq*2], m1
    movq          [dstq+stride3q], m2
    pshufd        m0, m0, 0xe               ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
    inc           lineq
    lea           dstq, [dstq+strideq*4]    ; lea leaves the flags from inc intact
    jnz           .next4
    REP_RET
  ; ---------------------------------------------------------------------------
  ; h_predictor_16x16 (dst, stride, line, left)
  ;
  ; Row r of the 16x16 block is filled with left[r]. The counter runs from -4
  ; up to 0; each iteration consumes 4 bytes of left and writes 4 rows with
  ; aligned (mova) stores.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
    movifnidn     leftq, leftmp
    mov           lineq, -4
    DEFINE_ARGS dst, stride, line, left, stride3
    lea           stride3q, [strideq*3]

  .next4:
    movd          m0, [leftq]
    punpcklbw     m0, m0
    punpcklbw     m0, m0                    ; l1 to l4 each repeated 4 times
    pshufd        m1, m0, 0x0               ; l1 repeated 16 times
    pshufd        m2, m0, 0x55              ; l2 repeated 16 times
    mova          [dstq], m1
    mova          [dstq+strideq], m2
    pshufd        m1, m0, 0xaa              ; l3 repeated 16 times
    pshufd        m2, m0, 0xff              ; l4 repeated 16 times
    mova          [dstq+strideq*2], m1
    mova          [dstq+stride3q], m2
    inc           lineq
    lea           leftq, [leftq+4]          ; lea leaves the flags from inc intact
    lea           dstq, [dstq+strideq*4]
    jnz           .next4
    REP_RET
  ; ---------------------------------------------------------------------------
  ; h_predictor_32x32 (dst, stride, line, left)
  ;
  ; Row r of the 32x32 block is filled with left[r]. The counter runs from -8
  ; up to 0; each iteration consumes 4 bytes of left and writes 4 rows, two
  ; aligned 16-byte stores per row.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
    movifnidn     leftq, leftmp
    mov           lineq, -8
    DEFINE_ARGS dst, stride, line, left, stride3
    lea           stride3q, [strideq*3]

  .next4:
    movd          m0, [leftq]
    punpcklbw     m0, m0
    punpcklbw     m0, m0                    ; l1 to l4 each repeated 4 times
    pshufd        m1, m0, 0x0               ; l1 repeated 16 times
    pshufd        m2, m0, 0x55              ; l2 repeated 16 times
    mova          [dstq], m1
    mova          [dstq+16], m1
    mova          [dstq+strideq], m2
    mova          [dstq+strideq+16], m2
    pshufd        m1, m0, 0xaa              ; l3 repeated 16 times
    pshufd        m2, m0, 0xff              ; l4 repeated 16 times
    mova          [dstq+strideq*2], m1
    mova          [dstq+strideq*2+16], m1
    mova          [dstq+stride3q], m2
    mova          [dstq+stride3q+16], m2
    inc           lineq
    lea           leftq, [leftq+4]          ; lea leaves the flags from inc intact
    lea           dstq, [dstq+strideq*4]
    jnz           .next4
    REP_RET
  ; ---------------------------------------------------------------------------
  ; tm_predictor_4x4 (dst, stride, above, left)
  ;
  ; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for a 4x4 block.
  ; The arithmetic is done in 16-bit words; packuswb performs the clip.
  ; Note: the movq load reads 8 bytes starting at above-1.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
    pxor          m1, m1
    movq          m0, [aboveq-1]            ; [63:0] tl t1 t2 t3 t4 x x x
    punpcklbw     m0, m1                    ; bytes -> words
    pshuflw       m2, m0, 0x0               ; [63:0] tl tl tl tl [word]
    psrldq        m0, 2                     ; drop tl
    psubw         m0, m2                    ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
    movd          m2, [leftq]
    punpcklbw     m2, m1                    ; l1 l2 l3 l4 [word]

    ; rows 0 and 1
    pshuflw       m4, m2, 0x0               ; [63:0] l1 l1 l1 l1 [word]
    pshuflw       m3, m2, 0x55              ; [63:0] l2 l2 l2 l2 [word]
    paddw         m4, m0
    paddw         m3, m0
    packuswb      m4, m4
    packuswb      m3, m3
    movd          [dstq], m4
    movd          [dstq+strideq], m3
    lea           dstq, [dstq+strideq*2]

    ; rows 2 and 3
    pshuflw       m4, m2, 0xaa              ; [63:0] l3 l3 l3 l3 [word]
    pshuflw       m3, m2, 0xff              ; [63:0] l4 l4 l4 l4 [word]
    paddw         m4, m0
    paddw         m3, m0
    packuswb      m4, m4
    packuswb      m3, m3
    movd          [dstq], m4
    movd          [dstq+strideq], m3
    RET
  ; ---------------------------------------------------------------------------
  ; tm_predictor_8x8 (dst, stride, above, left)
  ;
  ; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for an 8x8 block.
  ; The loop counter runs from -4 up to 0; each iteration produces two rows
  ; and shifts the left vector down by two words.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
    pxor          m1, m1
    movd          m2, [aboveq-1]            ; low byte = tl
    movq          m0, [aboveq]
    punpcklbw     m2, m1
    punpcklbw     m0, m1                    ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
    pshuflw       m2, m2, 0x0               ; [63:0] tl tl tl tl [word]
    DEFINE_ARGS dst, stride, line, left     ; above register becomes the counter
    mov           lineq, -4
    punpcklqdq    m2, m2                    ; tl tl tl tl tl tl tl tl [word]
    psubw         m0, m2                    ; t1-tl t2-tl ... t8-tl [word]
    movq          m2, [leftq]
    punpcklbw     m2, m1                    ; l1 l2 l3 l4 l5 l6 l7 l8 [word]

  .next2:
    pshuflw       m4, m2, 0x0               ; [63:0] l1 l1 l1 l1 [word]
    pshuflw       m3, m2, 0x55              ; [63:0] l2 l2 l2 l2 [word]
    punpcklqdq    m4, m4                    ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
    punpcklqdq    m3, m3                    ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
    paddw         m4, m0
    paddw         m3, m0
    packuswb      m4, m3                    ; low 8 bytes: first row, high 8: second
    movq          [dstq], m4
    movhps        [dstq+strideq], m4
    lea           dstq, [dstq+strideq*2]
    psrldq        m2, 4                     ; next pair of left pixels
    inc           lineq
    jnz           .next2
    REP_RET
  ; ---------------------------------------------------------------------------
  ; tm_predictor_16x16 (dst, stride, above, left)
  ;
  ; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for a 16x16 block.
  ; tl (above[-1]) is taken from the last byte of the aligned 16-byte load at
  ; above-16. The loop counter runs from -8 up to 0; iteration k writes row k
  ; (from the low half of left) and row k+8 (from the high half of left).
  ; above, left and every dst row are accessed with aligned (mova) moves.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
    pxor          m1, m1
    mova          m2, [aboveq-16]           ; byte 15 = tl
    mova          m0, [aboveq]              ; t1 t2 ... t16 [byte]
    punpckhbw     m2, m1                    ; [127:112] tl [word]
    punpckhbw     m4, m0, m1
    punpcklbw     m0, m1                    ; m0:m4 t1 t2 ... t16 [word]
    DEFINE_ARGS dst, stride, line, left, stride8  ; above register becomes the counter
    mov           lineq, -8
    pshufhw       m2, m2, 0xff              ; tl across the high four words
    mova          m3, [leftq]               ; l1 l2 ... l16 [byte]
    punpckhqdq    m2, m2                    ; tl repeated 8 times [word]
    psubw         m0, m2
    psubw         m4, m2                    ; m0:m4 t1-tl t2-tl ... t16-tl [word]
    punpckhbw     m5, m3, m1
    punpcklbw     m3, m1                    ; m3:m5 l1 l2 ... l16 [word]
    lea           stride8q, [strideq*8]

  .next_pair:
    pshuflw       m6, m3, 0x0
    pshuflw       m7, m5, 0x0
    punpcklqdq    m6, m6                    ; l1 repeated 8 times [word]
    punpcklqdq    m7, m7                    ; l9 repeated 8 times [word]

    ; row k (m1 is free to be reused as a temporary from here on)
    paddw         m1, m6, m0
    paddw         m6, m4                    ; m1:m6 ti-tl+l1 [i=1,16] [word]
    psrldq        m5, 2                     ; advance the high half of left
    packuswb      m1, m6
    mova          [dstq], m1

    ; row k+8
    paddw         m1, m7, m0
    paddw         m7, m4                    ; m1:m7 ti-tl+l9 [i=1,16] [word]
    psrldq        m3, 2                     ; advance the low half of left
    packuswb      m1, m7
    mova          [dstq+stride8q], m1

    inc           lineq
    lea           dstq, [dstq+strideq]      ; lea leaves the flags from inc intact
    jnz           .next_pair
    REP_RET
  ; ---------------------------------------------------------------------------
  ; tm_predictor_32x32 (dst, stride, above, left)
  ;
  ; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for a 32x32 block.
  ; m0/m3 hold above[0..15] - tl and m4/m5 hold above[16..31] - tl as words.
  ; left is advanced by 32 and indexed with a counter running from -16 up to
  ; 0; each iteration consumes two left pixels and writes two rows.
  ;
  ; NOTE(review): the movd in the loop loads 4 bytes although only the low two
  ; are used, so the last iteration (counter = -1, address left+30) also reads
  ; left[32..33]. Confirm that the caller's buffer makes this safe.
  ; ---------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
    pxor          m1, m1
    movd          m2, [aboveq-1]            ; low byte = tl
    mova          m0, [aboveq]
    mova          m4, [aboveq+16]
    punpcklbw     m2, m1
    punpckhbw     m3, m0, m1                ; above[8..15]  [word]
    punpckhbw     m5, m4, m1                ; above[24..31] [word]
    punpcklbw     m0, m1                    ; above[0..7]   [word]
    punpcklbw     m4, m1                    ; above[16..23] [word]
    pshuflw       m2, m2, 0x0
    DEFINE_ARGS dst, stride, line, left     ; above register becomes the counter
    mov           lineq, -16
    punpcklqdq    m2, m2                    ; tl repeated 8 times [word]
    add           leftq, 32                 ; point past the end; index is negative
    psubw         m0, m2
    psubw         m3, m2
    psubw         m4, m2
    psubw         m5, m2                    ; m0:m3:m4:m5 = above[0..31] - tl

  .next2:
    movd          m2, [leftq+lineq*2]       ; two left pixels in the low 2 bytes
    pxor          m1, m1                    ; m1 is reused as a temporary below
    punpcklbw     m2, m1
    pshuflw       m7, m2, 0x55              ; second pixel
    pshuflw       m2, m2, 0x0               ; first pixel
    punpcklqdq    m2, m2                    ; first  pixel repeated 8 times [word]
    punpcklqdq    m7, m7                    ; second pixel repeated 8 times [word]

    ; first row, columns 0-15
    paddw         m6, m2, m3
    paddw         m1, m2, m0
    packuswb      m1, m6
    mova          [dstq], m1
    ; first row, columns 16-31
    paddw         m6, m2, m5
    paddw         m1, m2, m4
    packuswb      m1, m6
    mova          [dstq+16], m1

    ; second row, columns 0-15
    paddw         m6, m7, m3
    paddw         m1, m7, m0
    packuswb      m1, m6
    mova          [dstq+strideq], m1
    ; second row, columns 16-31
    paddw         m6, m7, m5
    paddw         m1, m7, m4
    packuswb      m1, m6
    mova          [dstq+strideq+16], m1

    lea           dstq, [dstq+strideq*2]
    inc           lineq
    jnz           .next2
    REP_RET