  1. ;
  2. ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "third_party/x86inc/x86inc.asm"
  11. SECTION_RODATA
  12. pw_64: times 8 dw 64
  13. ; %define USE_PMULHRSW
  14. ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
  15. ; when using this instruction.
  16. ;
  17. ; The add order below (based on ffvp9) must be followed to prevent outranges.
  18. ; x = k0k1 + k4k5
  19. ; y = k2k3 + k6k7
  20. ; z = signed SAT(x + y)
  21. SECTION .text
  22. %define LOCAL_VARS_SIZE 16*6
  23. %macro SETUP_LOCAL_VARS 0
  24. ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
  25. ; pmaddubsw has a higher latency on some platforms, this might be eased by
  26. ; interleaving the instructions.
  27. %define k0k1 [rsp + 16*0]
  28. %define k2k3 [rsp + 16*1]
  29. %define k4k5 [rsp + 16*2]
  30. %define k6k7 [rsp + 16*3]
  31. packsswb m4, m4
  32. ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
  33. ; some platforms.
  34. pshuflw m0, m4, 0b ;k0_k1
  35. pshuflw m1, m4, 01010101b ;k2_k3
  36. pshuflw m2, m4, 10101010b ;k4_k5
  37. pshuflw m3, m4, 11111111b ;k6_k7
  38. punpcklqdq m0, m0
  39. punpcklqdq m1, m1
  40. punpcklqdq m2, m2
  41. punpcklqdq m3, m3
  42. mova k0k1, m0
  43. mova k2k3, m1
  44. mova k4k5, m2
  45. mova k6k7, m3
  46. %if ARCH_X86_64
  47. %define krd m12
  48. %define tmp0 [rsp + 16*4]
  49. %define tmp1 [rsp + 16*5]
  50. mova krd, [GLOBAL(pw_64)]
  51. %else
  52. %define krd [rsp + 16*4]
  53. %if CONFIG_PIC=0
  54. mova m6, [GLOBAL(pw_64)]
  55. %else
  56. ; build constants without accessing global memory
  57. pcmpeqb m6, m6 ;all ones
  58. psrlw m6, 15
  59. psllw m6, 6 ;aka pw_64
  60. %endif
  61. mova krd, m6
  62. %endif
  63. %endm
  64. ;-------------------------------------------------------------------------------
  65. %if ARCH_X86_64
  66. %define LOCAL_VARS_SIZE_H4 0
  67. %else
  68. %define LOCAL_VARS_SIZE_H4 16*4
  69. %endif
  70. %macro SUBPIX_HFILTER4 1
  71. cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
  72. src, sstride, dst, dstride, height, filter
  73. mova m4, [filterq]
  74. packsswb m4, m4
  75. %if ARCH_X86_64
  76. %define k0k1k4k5 m8
  77. %define k2k3k6k7 m9
  78. %define krd m10
  79. mova krd, [GLOBAL(pw_64)]
  80. pshuflw k0k1k4k5, m4, 0b ;k0_k1
  81. pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
  82. pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
  83. pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
  84. %else
  85. %define k0k1k4k5 [rsp + 16*0]
  86. %define k2k3k6k7 [rsp + 16*1]
  87. %define krd [rsp + 16*2]
  88. pshuflw m6, m4, 0b ;k0_k1
  89. pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
  90. pshuflw m7, m4, 01010101b ;k2_k3
  91. pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
  92. %if CONFIG_PIC=0
  93. mova m1, [GLOBAL(pw_64)]
  94. %else
  95. ; build constants without accessing global memory
  96. pcmpeqb m1, m1 ;all ones
  97. psrlw m1, 15
  98. psllw m1, 6 ;aka pw_64
  99. %endif
  100. mova k0k1k4k5, m6
  101. mova k2k3k6k7, m7
  102. mova krd, m1
  103. %endif
  104. dec heightd
  105. .loop:
  106. ;Do two rows at once
  107. movu m4, [srcq - 3]
  108. movu m5, [srcq + sstrideq - 3]
  109. punpckhbw m1, m4, m4
  110. punpcklbw m4, m4
  111. punpckhbw m3, m5, m5
  112. punpcklbw m5, m5
  113. palignr m0, m1, m4, 1
  114. pmaddubsw m0, k0k1k4k5
  115. palignr m1, m4, 5
  116. pmaddubsw m1, k2k3k6k7
  117. palignr m2, m3, m5, 1
  118. pmaddubsw m2, k0k1k4k5
  119. palignr m3, m5, 5
  120. pmaddubsw m3, k2k3k6k7
  121. punpckhqdq m4, m0, m2
  122. punpcklqdq m0, m2
  123. punpckhqdq m5, m1, m3
  124. punpcklqdq m1, m3
  125. paddsw m0, m4
  126. paddsw m1, m5
  127. %ifidn %1, h8_avg
  128. movd m4, [dstq]
  129. movd m5, [dstq + dstrideq]
  130. %endif
  131. paddsw m0, m1
  132. paddsw m0, krd
  133. psraw m0, 7
  134. packuswb m0, m0
  135. psrldq m1, m0, 4
  136. %ifidn %1, h8_avg
  137. pavgb m0, m4
  138. pavgb m1, m5
  139. %endif
  140. movd [dstq], m0
  141. movd [dstq + dstrideq], m1
  142. lea srcq, [srcq + sstrideq ]
  143. prefetcht0 [srcq + 4 * sstrideq - 3]
  144. lea srcq, [srcq + sstrideq ]
  145. lea dstq, [dstq + 2 * dstrideq ]
  146. prefetcht0 [srcq + 2 * sstrideq - 3]
  147. sub heightd, 2
  148. jg .loop
  149. ; Do last row if output_height is odd
  150. jne .done
  151. movu m4, [srcq - 3]
  152. punpckhbw m1, m4, m4
  153. punpcklbw m4, m4
  154. palignr m0, m1, m4, 1
  155. palignr m1, m4, 5
  156. pmaddubsw m0, k0k1k4k5
  157. pmaddubsw m1, k2k3k6k7
  158. psrldq m2, m0, 8
  159. psrldq m3, m1, 8
  160. paddsw m0, m2
  161. paddsw m1, m3
  162. paddsw m0, m1
  163. paddsw m0, krd
  164. psraw m0, 7
  165. packuswb m0, m0
  166. %ifidn %1, h8_avg
  167. movd m4, [dstq]
  168. pavgb m0, m4
  169. %endif
  170. movd [dstq], m0
  171. .done:
  172. REP_RET
  173. %endm
  174. ;-------------------------------------------------------------------------------
  175. %macro SUBPIX_HFILTER8 1
  176. cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
  177. src, sstride, dst, dstride, height, filter
  178. mova m4, [filterq]
  179. SETUP_LOCAL_VARS
  180. dec heightd
  181. .loop:
  182. ;Do two rows at once
  183. movu m0, [srcq - 3]
  184. movu m4, [srcq + sstrideq - 3]
  185. punpckhbw m1, m0, m0
  186. punpcklbw m0, m0
  187. palignr m5, m1, m0, 13
  188. pmaddubsw m5, k6k7
  189. palignr m2, m1, m0, 5
  190. palignr m3, m1, m0, 9
  191. palignr m1, m0, 1
  192. pmaddubsw m1, k0k1
  193. punpckhbw m6, m4, m4
  194. punpcklbw m4, m4
  195. pmaddubsw m2, k2k3
  196. pmaddubsw m3, k4k5
  197. palignr m7, m6, m4, 13
  198. palignr m0, m6, m4, 5
  199. pmaddubsw m7, k6k7
  200. paddsw m1, m3
  201. paddsw m2, m5
  202. paddsw m1, m2
  203. %ifidn %1, h8_avg
  204. movh m2, [dstq]
  205. movhps m2, [dstq + dstrideq]
  206. %endif
  207. palignr m5, m6, m4, 9
  208. palignr m6, m4, 1
  209. pmaddubsw m0, k2k3
  210. pmaddubsw m6, k0k1
  211. paddsw m1, krd
  212. pmaddubsw m5, k4k5
  213. psraw m1, 7
  214. paddsw m0, m7
  215. paddsw m6, m5
  216. paddsw m6, m0
  217. paddsw m6, krd
  218. psraw m6, 7
  219. packuswb m1, m6
  220. %ifidn %1, h8_avg
  221. pavgb m1, m2
  222. %endif
  223. movh [dstq], m1
  224. movhps [dstq + dstrideq], m1
  225. lea srcq, [srcq + sstrideq ]
  226. prefetcht0 [srcq + 4 * sstrideq - 3]
  227. lea srcq, [srcq + sstrideq ]
  228. lea dstq, [dstq + 2 * dstrideq ]
  229. prefetcht0 [srcq + 2 * sstrideq - 3]
  230. sub heightd, 2
  231. jg .loop
  232. ; Do last row if output_height is odd
  233. jne .done
  234. movu m0, [srcq - 3]
  235. punpckhbw m3, m0, m0
  236. punpcklbw m0, m0
  237. palignr m1, m3, m0, 1
  238. palignr m2, m3, m0, 5
  239. palignr m4, m3, m0, 13
  240. palignr m3, m0, 9
  241. pmaddubsw m1, k0k1
  242. pmaddubsw m2, k2k3
  243. pmaddubsw m3, k4k5
  244. pmaddubsw m4, k6k7
  245. paddsw m1, m3
  246. paddsw m4, m2
  247. paddsw m1, m4
  248. paddsw m1, krd
  249. psraw m1, 7
  250. packuswb m1, m1
  251. %ifidn %1, h8_avg
  252. movh m0, [dstq]
  253. pavgb m1, m0
  254. %endif
  255. movh [dstq], m1
  256. .done:
  257. REP_RET
  258. %endm
  259. ;-------------------------------------------------------------------------------
  260. %macro SUBPIX_HFILTER16 1
  261. cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
  262. src, sstride, dst, dstride, height, filter
  263. mova m4, [filterq]
  264. SETUP_LOCAL_VARS
  265. .loop:
  266. prefetcht0 [srcq + 2 * sstrideq -3]
  267. movu m0, [srcq - 3]
  268. movu m4, [srcq - 2]
  269. pmaddubsw m0, k0k1
  270. pmaddubsw m4, k0k1
  271. movu m1, [srcq - 1]
  272. movu m5, [srcq + 0]
  273. pmaddubsw m1, k2k3
  274. pmaddubsw m5, k2k3
  275. movu m2, [srcq + 1]
  276. movu m6, [srcq + 2]
  277. pmaddubsw m2, k4k5
  278. pmaddubsw m6, k4k5
  279. movu m3, [srcq + 3]
  280. movu m7, [srcq + 4]
  281. pmaddubsw m3, k6k7
  282. pmaddubsw m7, k6k7
  283. paddsw m0, m2
  284. paddsw m1, m3
  285. paddsw m0, m1
  286. paddsw m4, m6
  287. paddsw m5, m7
  288. paddsw m4, m5
  289. paddsw m0, krd
  290. paddsw m4, krd
  291. psraw m0, 7
  292. psraw m4, 7
  293. packuswb m0, m0
  294. packuswb m4, m4
  295. punpcklbw m0, m4
  296. %ifidn %1, h8_avg
  297. pavgb m0, [dstq]
  298. %endif
  299. lea srcq, [srcq + sstrideq]
  300. mova [dstq], m0
  301. lea dstq, [dstq + dstrideq]
  302. dec heightd
  303. jnz .loop
  304. REP_RET
  305. %endm
  306. INIT_XMM ssse3
  307. SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3
  308. SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3
  309. SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3
  310. SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3
  311. SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3
  312. SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3
  313. ;-------------------------------------------------------------------------------
  314. ; TODO(Linfeng): Detect cpu type and choose the code with better performance.
  315. %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
  316. %if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
  317. %define NUM_GENERAL_REG_USED 9
  318. %else
  319. %define NUM_GENERAL_REG_USED 6
  320. %endif
  321. %macro SUBPIX_VFILTER 2
  322. cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
  323. src, sstride, dst, dstride, height, filter
  324. mova m4, [filterq]
  325. SETUP_LOCAL_VARS
  326. %ifidn %2, 8
  327. %define movx movh
  328. %else
  329. %define movx movd
  330. %endif
  331. dec heightd
  332. %if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
  333. %if ARCH_X86_64
  334. %define src1q r7
  335. %define sstride6q r8
  336. %define dst_stride dstrideq
  337. %else
  338. %define src1q filterq
  339. %define sstride6q dstrideq
  340. %define dst_stride dstridemp
  341. %endif
  342. mov src1q, srcq
  343. add src1q, sstrideq
  344. lea sstride6q, [sstrideq + sstrideq * 4]
  345. add sstride6q, sstrideq ;pitch * 6
  346. .loop:
  347. ;Do two rows at once
  348. movx m0, [srcq ] ;A
  349. movx m1, [src1q ] ;B
  350. punpcklbw m0, m1 ;A B
  351. movx m2, [srcq + sstrideq * 2 ] ;C
  352. pmaddubsw m0, k0k1
  353. mova m6, m2
  354. movx m3, [src1q + sstrideq * 2] ;D
  355. punpcklbw m2, m3 ;C D
  356. pmaddubsw m2, k2k3
  357. movx m4, [srcq + sstrideq * 4 ] ;E
  358. mova m7, m4
  359. movx m5, [src1q + sstrideq * 4] ;F
  360. punpcklbw m4, m5 ;E F
  361. pmaddubsw m4, k4k5
  362. punpcklbw m1, m6 ;A B next iter
  363. movx m6, [srcq + sstride6q ] ;G
  364. punpcklbw m5, m6 ;E F next iter
  365. punpcklbw m3, m7 ;C D next iter
  366. pmaddubsw m5, k4k5
  367. movx m7, [src1q + sstride6q ] ;H
  368. punpcklbw m6, m7 ;G H
  369. pmaddubsw m6, k6k7
  370. pmaddubsw m3, k2k3
  371. pmaddubsw m1, k0k1
  372. paddsw m0, m4
  373. paddsw m2, m6
  374. movx m6, [srcq + sstrideq * 8 ] ;H next iter
  375. punpcklbw m7, m6
  376. pmaddubsw m7, k6k7
  377. paddsw m0, m2
  378. paddsw m0, krd
  379. psraw m0, 7
  380. paddsw m1, m5
  381. packuswb m0, m0
  382. paddsw m3, m7
  383. paddsw m1, m3
  384. paddsw m1, krd
  385. psraw m1, 7
  386. lea srcq, [srcq + sstrideq * 2 ]
  387. lea src1q, [src1q + sstrideq * 2]
  388. packuswb m1, m1
  389. %ifidn %1, v8_avg
  390. movx m2, [dstq]
  391. pavgb m0, m2
  392. %endif
  393. movx [dstq], m0
  394. add dstq, dst_stride
  395. %ifidn %1, v8_avg
  396. movx m3, [dstq]
  397. pavgb m1, m3
  398. %endif
  399. movx [dstq], m1
  400. add dstq, dst_stride
  401. sub heightd, 2
  402. jg .loop
  403. ; Do last row if output_height is odd
  404. jne .done
  405. movx m0, [srcq ] ;A
  406. movx m1, [srcq + sstrideq ] ;B
  407. movx m6, [srcq + sstride6q ] ;G
  408. punpcklbw m0, m1 ;A B
  409. movx m7, [src1q + sstride6q ] ;H
  410. pmaddubsw m0, k0k1
  411. movx m2, [srcq + sstrideq * 2 ] ;C
  412. punpcklbw m6, m7 ;G H
  413. movx m3, [src1q + sstrideq * 2] ;D
  414. pmaddubsw m6, k6k7
  415. movx m4, [srcq + sstrideq * 4 ] ;E
  416. punpcklbw m2, m3 ;C D
  417. movx m5, [src1q + sstrideq * 4] ;F
  418. punpcklbw m4, m5 ;E F
  419. pmaddubsw m2, k2k3
  420. pmaddubsw m4, k4k5
  421. paddsw m2, m6
  422. paddsw m0, m4
  423. paddsw m0, m2
  424. paddsw m0, krd
  425. psraw m0, 7
  426. packuswb m0, m0
  427. %ifidn %1, v8_avg
  428. movx m1, [dstq]
  429. pavgb m0, m1
  430. %endif
  431. movx [dstq], m0
  432. %else
  433. ; ARCH_X86_64
  434. movx m0, [srcq ] ;A
  435. movx m1, [srcq + sstrideq ] ;B
  436. lea srcq, [srcq + sstrideq * 2 ]
  437. movx m2, [srcq] ;C
  438. movx m3, [srcq + sstrideq] ;D
  439. lea srcq, [srcq + sstrideq * 2 ]
  440. movx m4, [srcq] ;E
  441. movx m5, [srcq + sstrideq] ;F
  442. lea srcq, [srcq + sstrideq * 2 ]
  443. movx m6, [srcq] ;G
  444. punpcklbw m0, m1 ;A B
  445. punpcklbw m1, m2 ;A B next iter
  446. punpcklbw m2, m3 ;C D
  447. punpcklbw m3, m4 ;C D next iter
  448. punpcklbw m4, m5 ;E F
  449. punpcklbw m5, m6 ;E F next iter
  450. .loop:
  451. ;Do two rows at once
  452. movx m7, [srcq + sstrideq] ;H
  453. lea srcq, [srcq + sstrideq * 2 ]
  454. movx m14, [srcq] ;H next iter
  455. punpcklbw m6, m7 ;G H
  456. punpcklbw m7, m14 ;G H next iter
  457. pmaddubsw m8, m0, k0k1
  458. pmaddubsw m9, m1, k0k1
  459. mova m0, m2
  460. mova m1, m3
  461. pmaddubsw m10, m2, k2k3
  462. pmaddubsw m11, m3, k2k3
  463. mova m2, m4
  464. mova m3, m5
  465. pmaddubsw m4, k4k5
  466. pmaddubsw m5, k4k5
  467. paddsw m8, m4
  468. paddsw m9, m5
  469. mova m4, m6
  470. mova m5, m7
  471. pmaddubsw m6, k6k7
  472. pmaddubsw m7, k6k7
  473. paddsw m10, m6
  474. paddsw m11, m7
  475. paddsw m8, m10
  476. paddsw m9, m11
  477. mova m6, m14
  478. paddsw m8, krd
  479. paddsw m9, krd
  480. psraw m8, 7
  481. psraw m9, 7
  482. %ifidn %2, 4
  483. packuswb m8, m8
  484. packuswb m9, m9
  485. %else
  486. packuswb m8, m9
  487. %endif
  488. %ifidn %1, v8_avg
  489. movx m7, [dstq]
  490. %ifidn %2, 4
  491. movx m10, [dstq + dstrideq]
  492. pavgb m9, m10
  493. %else
  494. movhpd m7, [dstq + dstrideq]
  495. %endif
  496. pavgb m8, m7
  497. %endif
  498. movx [dstq], m8
  499. %ifidn %2, 4
  500. movx [dstq + dstrideq], m9
  501. %else
  502. movhpd [dstq + dstrideq], m8
  503. %endif
  504. lea dstq, [dstq + dstrideq * 2 ]
  505. sub heightd, 2
  506. jg .loop
  507. ; Do last row if output_height is odd
  508. jne .done
  509. movx m7, [srcq + sstrideq] ;H
  510. punpcklbw m6, m7 ;G H
  511. pmaddubsw m0, k0k1
  512. pmaddubsw m2, k2k3
  513. pmaddubsw m4, k4k5
  514. pmaddubsw m6, k6k7
  515. paddsw m0, m4
  516. paddsw m2, m6
  517. paddsw m0, m2
  518. paddsw m0, krd
  519. psraw m0, 7
  520. packuswb m0, m0
  521. %ifidn %1, v8_avg
  522. movx m1, [dstq]
  523. pavgb m0, m1
  524. %endif
  525. movx [dstq], m0
  526. %endif ; ARCH_X86_64
  527. .done:
  528. REP_RET
  529. %endm
  530. ;-------------------------------------------------------------------------------
  531. %macro SUBPIX_VFILTER16 1
  532. cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
  533. src, sstride, dst, dstride, height, filter
  534. mova m4, [filterq]
  535. SETUP_LOCAL_VARS
  536. %if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
  537. %if ARCH_X86_64
  538. %define src1q r7
  539. %define sstride6q r8
  540. %define dst_stride dstrideq
  541. %else
  542. %define src1q filterq
  543. %define sstride6q dstrideq
  544. %define dst_stride dstridemp
  545. %endif
  546. lea src1q, [srcq + sstrideq]
  547. lea sstride6q, [sstrideq + sstrideq * 4]
  548. add sstride6q, sstrideq ;pitch * 6
  549. .loop:
  550. movh m0, [srcq ] ;A
  551. movh m1, [src1q ] ;B
  552. movh m2, [srcq + sstrideq * 2 ] ;C
  553. movh m3, [src1q + sstrideq * 2] ;D
  554. movh m4, [srcq + sstrideq * 4 ] ;E
  555. movh m5, [src1q + sstrideq * 4] ;F
  556. punpcklbw m0, m1 ;A B
  557. movh m6, [srcq + sstride6q] ;G
  558. punpcklbw m2, m3 ;C D
  559. movh m7, [src1q + sstride6q] ;H
  560. punpcklbw m4, m5 ;E F
  561. pmaddubsw m0, k0k1
  562. movh m3, [srcq + 8] ;A
  563. pmaddubsw m2, k2k3
  564. punpcklbw m6, m7 ;G H
  565. movh m5, [srcq + sstrideq + 8] ;B
  566. pmaddubsw m4, k4k5
  567. punpcklbw m3, m5 ;A B
  568. movh m7, [srcq + sstrideq * 2 + 8] ;C
  569. pmaddubsw m6, k6k7
  570. movh m5, [src1q + sstrideq * 2 + 8] ;D
  571. punpcklbw m7, m5 ;C D
  572. paddsw m2, m6
  573. pmaddubsw m3, k0k1
  574. movh m1, [srcq + sstrideq * 4 + 8] ;E
  575. paddsw m0, m4
  576. pmaddubsw m7, k2k3
  577. movh m6, [src1q + sstrideq * 4 + 8] ;F
  578. punpcklbw m1, m6 ;E F
  579. paddsw m0, m2
  580. paddsw m0, krd
  581. movh m2, [srcq + sstride6q + 8] ;G
  582. pmaddubsw m1, k4k5
  583. movh m5, [src1q + sstride6q + 8] ;H
  584. psraw m0, 7
  585. punpcklbw m2, m5 ;G H
  586. pmaddubsw m2, k6k7
  587. paddsw m7, m2
  588. paddsw m3, m1
  589. paddsw m3, m7
  590. paddsw m3, krd
  591. psraw m3, 7
  592. packuswb m0, m3
  593. add srcq, sstrideq
  594. add src1q, sstrideq
  595. %ifidn %1, v8_avg
  596. pavgb m0, [dstq]
  597. %endif
  598. mova [dstq], m0
  599. add dstq, dst_stride
  600. dec heightd
  601. jnz .loop
  602. REP_RET
  603. %else
  604. ; ARCH_X86_64
  605. dec heightd
  606. movu m1, [srcq ] ;A
  607. movu m3, [srcq + sstrideq ] ;B
  608. lea srcq, [srcq + sstrideq * 2]
  609. punpcklbw m0, m1, m3 ;A B
  610. punpckhbw m1, m3 ;A B
  611. movu m5, [srcq] ;C
  612. punpcklbw m2, m3, m5 ;A B next iter
  613. punpckhbw m3, m5 ;A B next iter
  614. mova tmp0, m2 ;store to stack
  615. mova tmp1, m3 ;store to stack
  616. movu m7, [srcq + sstrideq] ;D
  617. lea srcq, [srcq + sstrideq * 2]
  618. punpcklbw m4, m5, m7 ;C D
  619. punpckhbw m5, m7 ;C D
  620. movu m9, [srcq] ;E
  621. punpcklbw m6, m7, m9 ;C D next iter
  622. punpckhbw m7, m9 ;C D next iter
  623. movu m11, [srcq + sstrideq] ;F
  624. lea srcq, [srcq + sstrideq * 2]
  625. punpcklbw m8, m9, m11 ;E F
  626. punpckhbw m9, m11 ;E F
  627. movu m2, [srcq] ;G
  628. punpcklbw m10, m11, m2 ;E F next iter
  629. punpckhbw m11, m2 ;E F next iter
  630. .loop:
  631. ;Do two rows at once
  632. pmaddubsw m13, m0, k0k1
  633. mova m0, m4
  634. pmaddubsw m14, m8, k4k5
  635. pmaddubsw m15, m4, k2k3
  636. mova m4, m8
  637. paddsw m13, m14
  638. movu m3, [srcq + sstrideq] ;H
  639. lea srcq, [srcq + sstrideq * 2]
  640. punpcklbw m14, m2, m3 ;G H
  641. mova m8, m14
  642. pmaddubsw m14, k6k7
  643. paddsw m15, m14
  644. paddsw m13, m15
  645. paddsw m13, krd
  646. psraw m13, 7
  647. pmaddubsw m14, m1, k0k1
  648. pmaddubsw m1, m9, k4k5
  649. pmaddubsw m15, m5, k2k3
  650. paddsw m14, m1
  651. mova m1, m5
  652. mova m5, m9
  653. punpckhbw m2, m3 ;G H
  654. mova m9, m2
  655. pmaddubsw m2, k6k7
  656. paddsw m15, m2
  657. paddsw m14, m15
  658. paddsw m14, krd
  659. psraw m14, 7
  660. packuswb m13, m14
  661. %ifidn %1, v8_avg
  662. pavgb m13, [dstq]
  663. %endif
  664. mova [dstq], m13
  665. ; next iter
  666. pmaddubsw m15, tmp0, k0k1
  667. pmaddubsw m14, m10, k4k5
  668. pmaddubsw m13, m6, k2k3
  669. paddsw m15, m14
  670. mova tmp0, m6
  671. mova m6, m10
  672. movu m2, [srcq] ;G next iter
  673. punpcklbw m14, m3, m2 ;G H next iter
  674. mova m10, m14
  675. pmaddubsw m14, k6k7
  676. paddsw m13, m14
  677. paddsw m15, m13
  678. paddsw m15, krd
  679. psraw m15, 7
  680. pmaddubsw m14, tmp1, k0k1
  681. mova tmp1, m7
  682. pmaddubsw m13, m7, k2k3
  683. mova m7, m11
  684. pmaddubsw m11, k4k5
  685. paddsw m14, m11
  686. punpckhbw m3, m2 ;G H next iter
  687. mova m11, m3
  688. pmaddubsw m3, k6k7
  689. paddsw m13, m3
  690. paddsw m14, m13
  691. paddsw m14, krd
  692. psraw m14, 7
  693. packuswb m15, m14
  694. %ifidn %1, v8_avg
  695. pavgb m15, [dstq + dstrideq]
  696. %endif
  697. mova [dstq + dstrideq], m15
  698. lea dstq, [dstq + dstrideq * 2]
  699. sub heightd, 2
  700. jg .loop
  701. ; Do last row if output_height is odd
  702. jne .done
  703. movu m3, [srcq + sstrideq] ;H
  704. punpcklbw m6, m2, m3 ;G H
  705. punpckhbw m2, m3 ;G H
  706. pmaddubsw m0, k0k1
  707. pmaddubsw m1, k0k1
  708. pmaddubsw m4, k2k3
  709. pmaddubsw m5, k2k3
  710. pmaddubsw m8, k4k5
  711. pmaddubsw m9, k4k5
  712. pmaddubsw m6, k6k7
  713. pmaddubsw m2, k6k7
  714. paddsw m0, m8
  715. paddsw m1, m9
  716. paddsw m4, m6
  717. paddsw m5, m2
  718. paddsw m0, m4
  719. paddsw m1, m5
  720. paddsw m0, krd
  721. paddsw m1, krd
  722. psraw m0, 7
  723. psraw m1, 7
  724. packuswb m0, m1
  725. %ifidn %1, v8_avg
  726. pavgb m0, [dstq]
  727. %endif
  728. mova [dstq], m0
  729. .done:
  730. REP_RET
  731. %endif ; ARCH_X86_64
  732. %endm
  733. INIT_XMM ssse3
  734. SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3
  735. SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3
  736. SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3
  737. SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3
  738. SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3
  739. SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3