highbd_intrapred_sse2.asm 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. ;
  2. ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "third_party/x86inc/x86inc.asm"
  SECTION_RODATA

  ; Rounding terms added before the final right shift of each DC average.
  ; pw_4 and pw_8 hold eight 16-bit lanes. pw_16 and pw_32 keep the "pw_"
  ; prefix but hold four 32-bit lanes; the 16x16 and 32x32 DC predictors
  ; add them with paddd.
  pw_4:   dw 4, 4, 4, 4, 4, 4, 4, 4
  pw_8:   dw 8, 8, 8, 8, 8, 8, 8, 8
  pw_16:  dd 16, 16, 16, 16
  pw_32:  dd 32, 32, 32, 32

  SECTION .text
  ;---------------------------------------------------------------------
  ; highbd_dc_predictor_4x4(dst, stride, above, left)
  ;
  ; Fills a 4x4 block of 16-bit samples with one value:
  ;   dc = (above[0..3] + left[0..3] summed + 4) >> 3
  ; Consecutive rows are written strideq*2 bytes apart.
  ; goffset is a scratch register used only by GET_GOT / RESTORE_GOT.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m2, [leftq]                 ; l0 l1 l2 l3
    movq        m0, [aboveq]                ; a0 a1 a2 a3
    paddw       m0, m2                      ; s_i = a_i + l_i
    pshuflw     m1, m0, 0xe                 ; s2 s3 s0 s0
    paddw       m0, m1                      ; word0 = s0+s2, word1 = s1+s3
    pshuflw     m1, m0, 0x1                 ; move word1 into word0
    paddw       m0, m1                      ; word0 = s0+s1+s2+s3
    paddw       m0, [GLOBAL(pw_4)]          ; rounding term
    psraw       m0, 3
    pshuflw     m0, m0, 0x0                 ; dc in the four low words

    movq        [dstq], m0                  ; row 0
    movq        [dstq+strideq*2], m0        ; row 1
    lea         dstq, [dstq+strideq*4]      ; advance two rows
    movq        [dstq], m0                  ; row 2
    movq        [dstq+strideq*2], m0        ; row 3

    RESTORE_GOT
    RET
  ;---------------------------------------------------------------------
  ; highbd_dc_predictor_8x8(dst, stride, above, left)
  ;
  ; Fills an 8x8 block of 16-bit samples with
  ;   dc = (above[0..7] + left[0..7] summed + 8) >> 4
  ; above and left are read with aligned 16-byte loads (mova).
  ; Consecutive rows are written strideq*2 bytes apart.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]
    mova        m2, [leftq]
    pxor        m1, m1                      ; zero, used when narrowing
    ; The above/left pointers are dead from here on; reuse their registers.
    DEFINE_ARGS dst, stride, stride3, ones
    lea         stride3q, [strideq*3]
    mov         onesd, 0x00010001
    movd        m3, onesd
    pshufd      m3, m3, 0x0                 ; 1 in every 16-bit lane

    ; Horizontal sum: pmaddwd by all-ones adds adjacent word pairs into
    ; dwords, packssdw against zero narrows them back to words.
    paddw       m0, m2                      ; a_i + l_i
    pmaddwd     m0, m3                      ; 4 pair sums
    packssdw    m0, m1
    pmaddwd     m0, m3                      ; 2 sums
    packssdw    m0, m1
    pmaddwd     m0, m3                      ; total in dword 0

    paddw       m0, [GLOBAL(pw_8)]          ; rounding term
    psrlw       m0, 4
    pshuflw     m0, m0, 0x0
    punpcklqdq  m0, m0                      ; dc in all eight words

    ; rows 0-3
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0
    lea         dstq, [dstq+strideq*8]      ; advance four rows
    ; rows 4-7
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0

    RESTORE_GOT
    RET
  ;---------------------------------------------------------------------
  ; highbd_dc_predictor_16x16(dst, stride, above, left)
  ;
  ; Fills a 16x16 block of 16-bit samples with
  ;   dc = (above[0..15] + left[0..15] summed + 16) >> 5
  ; above and left are read with aligned 16-byte loads (mova).
  ; Four rows are stored per loop iteration, four iterations in total.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]
    mova        m3, [aboveq+16]
    mova        m2, [leftq]
    mova        m4, [leftq+16]
    pxor        m1, m1                      ; zero for widening
    ; The above/left pointers are dead from here on; reuse their registers.
    DEFINE_ARGS dst, stride, stride3, cnt
    mov         cntd, 4                     ; 4 iterations x 4 rows
    lea         stride3q, [strideq*3]

    paddw       m0, m2
    paddw       m0, m3
    paddw       m0, m4                      ; 8 words, each a sum of 4 inputs
    movhlps     m2, m0
    paddw       m0, m2                      ; fold high half onto low: 4 words
    punpcklwd   m0, m1                      ; zero-extend to 4 dwords
    movhlps     m2, m0
    paddd       m0, m2                      ; 2 dword sums
    punpckldq   m0, m1                      ; d0 0 d1 0
    movhlps     m2, m0
    paddd       m0, m2                      ; total in dword 0
    paddd       m0, [GLOBAL(pw_16)]         ; rounding term (dword lanes)
    psrad       m0, 5
    pshuflw     m0, m0, 0x0
    punpcklqdq  m0, m0                      ; dc in all eight words

  .store_rows:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m0
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m0
    lea         dstq, [dstq+strideq*8]      ; advance four rows
    dec         cntd
    jnz         .store_rows

    RESTORE_GOT
    REP_RET
  ;---------------------------------------------------------------------
  ; highbd_dc_predictor_32x32(dst, stride, above, left)
  ;
  ; Fills a 32x32 block of 16-bit samples with
  ;   dc = (above[0..31] + left[0..31] summed + 32) >> 6
  ; above and left are read with aligned 16-byte loads (mova).
  ; Four rows are stored per loop iteration, eight iterations in total.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    ; Sum the 32 above samples into eight word lanes.
    mova        m0, [aboveq]
    mova        m2, [aboveq+16]
    mova        m3, [aboveq+32]
    mova        m4, [aboveq+48]
    paddw       m0, m2
    paddw       m3, m4
    paddw       m0, m3
    ; Same for the 32 left samples.
    mova        m2, [leftq]
    mova        m4, [leftq+16]
    mova        m5, [leftq+32]
    mova        m6, [leftq+48]
    paddw       m2, m4
    paddw       m5, m6
    paddw       m2, m5
    paddw       m0, m2                      ; 8 words, each a sum of 8 inputs
    pxor        m1, m1                      ; zero for widening

    ; The above/left pointers are dead from here on; reuse their registers.
    DEFINE_ARGS dst, stride, stride3, cnt
    mov         cntd, 8                     ; 8 iterations x 4 rows
    lea         stride3q, [strideq*3]

    movhlps     m2, m0
    paddw       m0, m2                      ; fold high half onto low: 4 words
    punpcklwd   m0, m1                      ; zero-extend to 4 dwords
    movhlps     m2, m0
    paddd       m0, m2                      ; 2 dword sums
    punpckldq   m0, m1                      ; d0 0 d1 0
    movhlps     m2, m0
    paddd       m0, m2                      ; total in dword 0
    paddd       m0, [GLOBAL(pw_32)]         ; rounding term (dword lanes)
    psrad       m0, 6
    pshuflw     m0, m0, 0x0
    punpcklqdq  m0, m0                      ; dc in all eight words

  .store_rows:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+32], m0
    mova        [dstq+48], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+strideq*2+32], m0
    mova        [dstq+strideq*2+48], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m0
    mova        [dstq+strideq*4+32], m0
    mova        [dstq+strideq*4+48], m0
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m0
    mova        [dstq+stride3q*2+32], m0
    mova        [dstq+stride3q*2+48], m0
    lea         dstq, [dstq+strideq*8]      ; advance four rows
    dec         cntd
    jnz         .store_rows

    RESTORE_GOT
    REP_RET
  ;---------------------------------------------------------------------
  ; highbd_v_predictor_4x4(dst, stride, above)
  ;
  ; Vertical prediction: copies the four 16-bit samples at [above] into
  ; each of the four rows of dst. Rows are strideq*2 bytes apart.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]                ; a0 a1 a2 a3
    movq        [dstq], m0                  ; row 0
    movq        [dstq+strideq*2], m0        ; row 1
    lea         dstq, [dstq+strideq*4]      ; advance two rows
    movq        [dstq], m0                  ; row 2
    movq        [dstq+strideq*2], m0        ; row 3
    RET
  ;---------------------------------------------------------------------
  ; highbd_v_predictor_8x8(dst, stride, above)
  ;
  ; Vertical prediction: copies the eight 16-bit samples at [above]
  ; (aligned 16-byte load) into each of the eight rows of dst.
  ; Rows are strideq*2 bytes apart.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
    mova        m0, [aboveq]
    ; The above pointer is dead from here on; its register holds 3*stride.
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    ; rows 0-3
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0
    lea         dstq, [dstq+strideq*8]      ; advance four rows
    ; rows 4-7
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0
    RET
  ;---------------------------------------------------------------------
  ; highbd_v_predictor_16x16(dst, stride, above)
  ;
  ; Vertical prediction: copies the sixteen 16-bit samples at [above]
  ; (two aligned 16-byte loads) into each of the sixteen rows of dst.
  ; Four rows are stored per loop iteration, four iterations in total.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; columns 0-7
    mova        m1, [aboveq+16]             ; columns 8-15
    ; The above pointer is dead from here on; its register holds 3*stride.
    DEFINE_ARGS dst, stride, stride3, cnt
    mov         cntd, 4                     ; 4 iterations x 4 rows
    lea         stride3q, [strideq*3]

  .store_rows:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m1
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m1
    lea         dstq, [dstq+strideq*8]      ; advance four rows
    dec         cntd
    jnz         .store_rows
    REP_RET
  ;---------------------------------------------------------------------
  ; highbd_v_predictor_32x32(dst, stride, above)
  ;
  ; Vertical prediction: copies the thirty-two 16-bit samples at [above]
  ; (four aligned 16-byte loads) into each of the thirty-two rows of dst.
  ; Four rows are stored per loop iteration, eight iterations in total.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
    mova        m0, [aboveq]                ; columns 0-7
    mova        m1, [aboveq+16]             ; columns 8-15
    mova        m2, [aboveq+32]             ; columns 16-23
    mova        m3, [aboveq+48]             ; columns 24-31
    ; The above pointer is dead from here on; its register holds 3*stride.
    DEFINE_ARGS dst, stride, stride3, cnt
    mov         cntd, 8                     ; 8 iterations x 4 rows
    lea         stride3q, [strideq*3]

  .store_rows:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+32], m2
    mova        [dstq+48], m3
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+strideq*2+32], m2
    mova        [dstq+strideq*2+48], m3
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m1
    mova        [dstq+strideq*4+32], m2
    mova        [dstq+strideq*4+48], m3
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m1
    mova        [dstq+stride3q*2+32], m2
    mova        [dstq+stride3q*2+48], m3
    lea         dstq, [dstq+strideq*8]      ; advance four rows
    dec         cntd
    jnz         .store_rows
    REP_RET
  ;---------------------------------------------------------------------
  ; highbd_tm_predictor_4x4(dst, stride, above, left, bd)
  ;
  ; TM prediction for a 4x4 block of 16-bit samples:
  ;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bd) - 1)
  ; The top-left sample is read from [aboveq-2].
  ; Rows are written strideq*2 bytes apart; each register holds two rows.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
    movd        m1, [aboveq-2]              ; word 0 = top-left sample
    movq        m0, [aboveq]
    pshuflw     m1, m1, 0x0
    movlhps     m0, m0                      ; t1 t2 t3 t4 t1 t2 t3 t4
    movlhps     m1, m1                      ; tl tl tl tl tl tl tl tl
    ; Clamp bounds: max = ~(0xffff << bd) = (1 << bd) - 1, min = 0.
    pcmpeqw     m3, m3
    movd        m4, bdd
    psubw       m0, m1                      ; t1-tl t2-tl t3-tl t4-tl (twice)
    psllw       m3, m4
    pcmpeqw     m2, m2
    pxor        m4, m4                      ; min possible value
    pxor        m3, m2                      ; max possible value
    ; Only four left samples are consumed (every shuffle below reads the
    ; low quadword), so load exactly 8 bytes. A 16-byte mova would read
    ; past the fourth sample and fault on a pointer that is not
    ; 16-byte aligned.
    movq        m1, [leftq]                 ; l1 l2 l3 l4

    ; rows 0 and 1
    pshuflw     m2, m1, 0x0
    pshuflw     m5, m1, 0x55
    movlhps     m2, m5                      ; l1 l1 l1 l1 l2 l2 l2 l2
    paddw       m2, m0
    pminsw      m2, m3                      ; clamp to the bit depth
    pmaxsw      m2, m4
    movq        [dstq], m2
    movhpd      [dstq+strideq*2], m2
    lea         dstq, [dstq+strideq*4]      ; advance two rows

    ; rows 2 and 3
    pshuflw     m2, m1, 0xaa
    pshuflw     m5, m1, 0xff
    movlhps     m2, m5                      ; l3 l3 l3 l3 l4 l4 l4 l4
    paddw       m2, m0
    pminsw      m2, m3                      ; clamp to the bit depth
    pmaxsw      m2, m4
    movq        [dstq], m2
    movhpd      [dstq+strideq*2], m2
    RET
  ;---------------------------------------------------------------------
  ; highbd_tm_predictor_8x8(dst, stride, above, left, bd)
  ;
  ; TM prediction for an 8x8 block of 16-bit samples:
  ;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bd) - 1)
  ; above is read with an aligned 16-byte load; the top-left sample is
  ; read from [aboveq-2]. Two rows are produced per loop iteration.
  ; "one" is a scratch register, not an argument.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
    movd        m1, [aboveq-2]              ; word 0 = top-left sample
    mova        m0, [aboveq]
    pshuflw     m1, m1, 0x0
    ; Clamp bounds: max = (1 << bd) - 1, min = 0.
    mov         oned, 1
    pxor        m3, m3
    pxor        m4, m4
    pinsrw      m3, oned, 0
    pinsrw      m4, bdd, 0                  ; shift count
    pshuflw     m3, m3, 0x0
    ; above, bd and one are dead from here on; above's register becomes
    ; the loop counter.
    DEFINE_ARGS dst, stride, line, left
    punpcklqdq  m3, m3                      ; 1 in every word
    mov         lineq, -4                   ; 4 iterations x 2 rows
    mova        m2, m3
    punpcklqdq  m1, m1                      ; tl in every word
    psllw       m3, m4                      ; 1 << bd
    add         leftq, 16                   ; end of left[]; indexed by line < 0
    psubw       m3, m2                      ; max possible value
    pxor        m4, m4                      ; min possible value
    psubw       m0, m1                      ; above[c] - tl

  .loop:
    ; One 4-byte load fetches both left samples for this row pair. A
    ; second movd at +2 would read 2 bytes past left[7] on the final
    ; iteration, so the odd sample is selected with a shuffle instead.
    movd        m2, [leftq+lineq*4]         ; l(2k) l(2k+1)
    pshuflw     m1, m2, 0x0
    pshuflw     m2, m2, 0x55
    punpcklqdq  m1, m1                      ; l(2k)   in every word
    punpcklqdq  m2, m2                      ; l(2k+1) in every word
    paddw       m1, m0
    paddw       m2, m0
    ; Clamp to the bit depth.
    pminsw      m1, m3
    pminsw      m2, m3
    pmaxsw      m1, m4
    pmaxsw      m2, m4
    ; Store the two rows.
    mova        [dstq], m1
    mova        [dstq+strideq*2], m2
    lea         dstq, [dstq+strideq*4]      ; advance two rows
    inc         lineq
    jnz         .loop
    REP_RET
  ;---------------------------------------------------------------------
  ; highbd_tm_predictor_16x16(dst, stride, above, left, bd)
  ;
  ; TM prediction for a 16x16 block of 16-bit samples:
  ;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bd) - 1)
  ; above is read with aligned 16-byte loads; the top-left sample is
  ; read from [aboveq-2]. Two rows are produced per loop iteration.
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
    mova        m0, [aboveq]                ; columns 0-7
    mova        m1, [aboveq+16]             ; columns 8-15
    movd        m2, [aboveq-2]              ; word 0 = top-left sample
    pshuflw     m2, m2, 0x0
    punpcklqdq  m2, m2                      ; tl in every word
    ; Clamp bounds: max = ~(0xffff << bd) = (1 << bd) - 1, min = 0.
    pcmpeqw     m3, m3
    movd        m4, bdd
    psllw       m3, m4
    pcmpeqw     m5, m5
    pxor        m4, m4                      ; min possible value
    pxor        m3, m5                      ; max possible value
    ; above and bd are dead from here on; above's register becomes the
    ; loop counter.
    DEFINE_ARGS dst, stride, cnt, left
    mov         cntq, -8                    ; 8 iterations x 2 rows
    psubw       m0, m2                      ; above[0..7]  - tl
    psubw       m1, m2                      ; above[8..15] - tl

  .row_pair:
    movd        m7, [leftq]                 ; two left samples
    pshuflw     m5, m7, 0x0
    pshuflw     m2, m7, 0x55
    punpcklqdq  m5, m5                      ; first left sample, every word
    punpcklqdq  m2, m2                      ; second left sample, every word

    ; first row of the pair
    paddw       m6, m5, m0                  ; columns 0-7
    paddw       m5, m1                      ; columns 8-15
    pminsw      m6, m3
    pminsw      m5, m3
    pmaxsw      m6, m4                      ; clamp to the bit depth
    pmaxsw      m5, m4
    mova        [dstq], m6
    mova        [dstq+16], m5

    ; second row of the pair
    paddw       m6, m2, m0
    paddw       m2, m1
    pminsw      m6, m3
    pminsw      m2, m3
    pmaxsw      m6, m4
    pmaxsw      m2, m4
    mova        [dstq+strideq*2], m6
    mova        [dstq+strideq*2+16], m2

    lea         dstq, [dstq+strideq*4]      ; advance two rows
    inc         cntq
    lea         leftq, [leftq+4]            ; lea keeps the flags from inc
    jnz         .row_pair
    REP_RET
  ;---------------------------------------------------------------------
  ; highbd_tm_predictor_32x32(dst, stride, above, left, bd)
  ;
  ; TM prediction for a 32x32 block of 16-bit samples:
  ;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bd) - 1)
  ; above is read with aligned 16-byte loads; the top-left sample is
  ; read from [aboveq-2]. Two rows are produced per loop iteration.
  ; Register roles inside the loop:
  ;   m1-m4 = above[c] - tl for columns 0-7 / 8-15 / 16-23 / 24-31
  ;   m5 = max, m6 = min, m7 = broadcast left sample, m0 = scratch
  ;---------------------------------------------------------------------
  INIT_XMM sse2
  cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
    movd        m0, [aboveq-2]              ; word 0 = top-left sample
    mova        m1, [aboveq]
    mova        m2, [aboveq+16]
    mova        m3, [aboveq+32]
    mova        m4, [aboveq+48]
    pshuflw     m0, m0, 0x0
    ; Clamp bounds: max = ~(0xffff << bd) = (1 << bd) - 1, min = 0.
    pcmpeqw     m5, m5
    movd        m6, bdd
    psllw       m5, m6
    pcmpeqw     m7, m7
    pxor        m6, m6                      ; min possible value
    pxor        m5, m7                      ; max possible value
    punpcklqdq  m0, m0                      ; tl in every word
    ; above and bd are dead from here on; above's register becomes the
    ; loop counter.
    DEFINE_ARGS dst, stride, line, left
    mov         lineq, -16                  ; 16 iterations x 2 rows
    psubw       m1, m0
    psubw       m2, m0
    psubw       m3, m0
    psubw       m4, m0

  .loop:
    ; ---- first row of the pair ----
    movd        m7, [leftq]
    pshuflw     m7, m7, 0x0
    punpcklqdq  m7, m7                      ; l1 l1 l1 l1 l1 l1 l1 l1
    paddw       m0, m7, m1
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq], m0
    paddw       m0, m7, m2
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+16], m0
    paddw       m0, m7, m3
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+32], m0
    paddw       m0, m7, m4
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+48], m0

    ; ---- second row of the pair ----
    ; Reload the same 4 bytes and broadcast word 1. A movd from
    ; [leftq+2] would read 2 bytes past left[31] on the final iteration.
    movd        m7, [leftq]
    pshuflw     m7, m7, 0x55
    punpcklqdq  m7, m7                      ; l2 l2 l2 l2 l2 l2 l2 l2
    paddw       m0, m7, m1
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+strideq*2], m0
    paddw       m0, m7, m2
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+strideq*2+16], m0
    paddw       m0, m7, m3
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+strideq*2+32], m0
    paddw       m0, m7, m4
    pminsw      m0, m5
    pmaxsw      m0, m6
    mova        [dstq+strideq*2+48], m0

    lea         dstq, [dstq+strideq*4]      ; advance two rows
    lea         leftq, [leftq+4]            ; advance two left samples
    inc         lineq
    jnz         .loop
    REP_RET