;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 10
                     times 8 dw 6
                     times 16 dw 8
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 2
                     times 8 dw 14

bilin_filter_m_ssse3: times 8 db 16, 0
                      times 8 db 14, 2
                      times 8 db 12, 4
                      times 8 db 10, 6
                      times 16 db 8
                      times 8 db 6, 10
                      times 8 db 4, 12
                      times 8 db 2, 14
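
; Layout note: each table holds the bilinear tap pair (16-2k, 2k) for the
; eight subpel steps k = 0..7. The sse2 table stores each pair as two full
; 8-word vectors (32 bytes per step, hence filter_idx_shift = 5) for use
; with pmullw; the ssse3 table interleaves the same taps as bytes (16 bytes
; per step, filter_idx_shift = 4) so a single pmaddubsw applies both taps.
; Both variants round with pw_8 and shift right by 4, since the taps sum
; to 16.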
SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *ref, ptrdiff_t ref_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) as its return value and
; stores the sum of squared errors (SSE) in the given pointer.
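;
; Usage sketch (hypothetical caller, not part of this file): a wrapper can
; derive the variance from the two outputs in the usual way, e.g.
;
;   uint32_t sse;
;   int se = vpx_sub_pixel_variance16xh(src, src_stride, xoff, yoff,
;                                       ref, ref_stride, h, &sse);
;   uint32_t var = sse - (uint32_t)(((int64_t)se * se) / (16 * h));
;
; where xoff/yoff are the subpel offsets in eighths (0..7) that index the
; bilinear filter tables above.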
%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw %3, %4
  psubw %1, %2
  paddw %5, %3
  pmaddwd %3, %3
  paddw %5, %1
  pmaddwd %1, %1
  paddd %6, %3
  paddd %6, %1
%endmacro
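
; Per 16-bit lane, SUM_SSE computes (scalar sketch of the same arithmetic):
;
;   d1 = src1 - ref1;  d2 = src2 - ref2;   // psubw
;   sum += d1 + d2;                        // paddw (word accumulator, m6)
;   sse += d1 * d1 + d2 * d2;              // pmaddwd + paddd (dwords, m7)
;
; pmaddwd squares each word and adds adjacent pairs, so m7 holds four dword
; partial SSEs while m6 holds eight word partial sums.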
%macro STORE_AND_RET 1
%if %1 > 4
  ; If H=64 and W=16, each of the 8 word lanes of m6 accumulates up to
  ; 2 (1 bit) x 64 (6 bits) x 9-bit diffs = 16 bits, i.e. the sum _exactly_
  ; fits in a signed word per word in the xmm reg. We have to sign-extend it
  ; before adding the words within the register and outputting to a dword.
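  ; Worked example: a lane sum of -3 sits in m6 as 0xFFFD. pcmpgtw against
  ; the zero register yields the all-ones mask 0xFFFF for negative lanes,
  ; and punpck{l,h}wd pairs each word with its mask, producing the dword
  ; 0xFFFFFFFD = -3, so the horizontal paddd reductions below stay correct
  ; for negative sums.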
  pcmpgtw m5, m6 ; mask for 0 > x
  movhlps m3, m7
  punpcklwd m4, m6, m5
  punpckhwd m6, m5 ; sign-extend m6 word->dword
  paddd m7, m3
  paddd m6, m4
  pshufd m3, m7, 0x1
  movhlps m4, m6
  paddd m7, m3
  paddd m6, m4
  mov r1, ssem ; r1 = unsigned int *sse
  pshufd m4, m6, 0x1
  movd [r1], m7 ; store sse
  paddd m6, m4
  movd raxd, m6 ; store sum as return value
%else ; 4xh
  pshuflw m4, m6, 0xe
  pshuflw m3, m7, 0xe
  paddw m6, m4
  paddd m7, m3
  pcmpgtw m5, m6 ; mask for 0 > x
  mov r1, ssem ; r1 = unsigned int *sse
  punpcklwd m6, m5 ; sign-extend m6 word->dword
  movd [r1], m7 ; store sse
  pshuflw m4, m6, 0xe
  paddd m6, m4
  movd raxd, m6 ; store sum as return value
%endif
  RET
%endmacro
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add srcq, src_stridemp
%else
  add srcq, src_strideq
%endif
%endmacro
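
; On 32-bit PIC builds the register that normally carries src_stride is
; repurposed below (see the .x_nonhalf_y_nonhalf prologue, where tempq is
; aliased to src_strideq), so this macro reloads the stride from its stack
; slot (src_stridemp) instead of trusting the register.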
%macro SUBPEL_VARIANCE 1-2 0 ; W[, avg]
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif

; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%if ARCH_X86_64
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                    x_offset, y_offset, ref, ref_stride, \
                                    second_pred, second_stride, height, sse
%define second_str second_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                x_offset, y_offset, ref, ref_stride, \
                                height, sse
%endif
%define block_height heightd
%define bilin_filter sseq
%else
%if CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, ref, ref_stride, \
                                    second_pred, second_stride, height, sse
%define block_height dword heightm
%define second_str second_stridemp
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                x_offset, y_offset, ref, ref_stride, \
                                height, sse
%define block_height heightd
%endif

; reuse argument stack space
%define g_bilin_filterm x_offsetm
%define g_pw_8m y_offsetm

; Store the bilin_filter and pw_8 locations on the stack.
%if GET_GOT_DEFINED == 1
  GET_GOT eax
  add esp, 4 ; restore esp
%endif
  lea ecx, [GLOBAL(bilin_filter_m)]
  mov g_bilin_filterm, ecx
  lea ecx, [GLOBAL(pw_8)]
  mov g_pw_8m, ecx
  LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, second_pred, second_stride, \
                                    height, sse
%define block_height dword heightm
%define second_str second_stridemp
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                x_offset, y_offset, ref, ref_stride, \
                                height, sse
%define block_height heightd
%endif
%define bilin_filter bilin_filter_m
%endif
%endif

%if %1 == 4
%define movx movd
%else
%define movx movh
%endif

  ASSERT %1 <= 16 ; m6 overflows if w > 16
  pxor m6, m6 ; sum
  pxor m7, m7 ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor m5, m5 ; dedicated zero register
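  ; For W < 16, the loops below consume two rows per iteration, so the row
  ; counter is halved here and, in the avg variant, the second-pred stride
  ; is doubled to match.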
%if %1 < 16
  sar block_height, 1
%if %2 == 1 ; avg
  shl second_str, 1
%endif
%endif
; FIXME(rbultje) replace by jumptable?
  test x_offsetd, x_offsetd
  jnz .x_nonzero
; x_offset == 0
  test y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  mova m1, [refq]
%if %2 == 1 ; avg
  pavgb m0, [second_predq]
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
%endif
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%if %2 == 0 ; !avg
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps m0, [srcq+src_strideq]
%else ; 4xh
  movx m1, [srcq+src_strideq]
  punpckldq m0, m1
%endif
%else ; !avg
  movx m2, [srcq+src_strideq]
%endif
  movx m1, [refq]
  movx m3, [refq+ref_strideq]
%if %2 == 1 ; avg
%if %1 > 4
  pavgb m0, [second_predq]
%else
  movh m2, [second_predq]
  pavgb m0, m2
%endif
  punpcklbw m3, m5
  punpcklbw m1, m5
%if %1 > 4
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; 4xh
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%else ; !avg
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET %1

.x_zero_y_nonzero:
  cmp y_offsetd, 4
  jne .x_zero_y_nonhalf

; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+src_strideq]
  mova m1, [refq]
  pavgb m0, m4
  punpckhbw m3, m1, m5
%if %2 == 1 ; avg
  pavgb m0, [second_predq]
%endif
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps m2, [srcq+src_strideq*2]
%else ; 4xh
  movx m1, [srcq+src_strideq*2]
  punpckldq m2, m1
%endif
  movx m1, [refq]
%if %1 > 4
  movlhps m0, m2
%else ; 4xh
  punpckldq m0, m2
%endif
  movx m3, [refq+ref_strideq]
  pavgb m0, m2
  punpcklbw m1, m5
%if %1 > 4
  pavgb m0, [second_predq]
  punpcklbw m3, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; 4xh
  movh m4, [second_predq]
  pavgb m0, m4
  punpcklbw m3, m5
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%else ; !avg
  movx m4, [srcq+src_strideq*2]
  movx m1, [refq]
  pavgb m0, m2
  movx m3, [refq+ref_strideq]
  pavgb m2, m4
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET %1

.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+y_offsetq+16]
%endif
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+src_strideq]
  mova m1, [refq]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_y_a
  pmaddubsw m0, filter_y_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m4, m5
  punpcklbw m0, m5
  punpcklbw m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
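  ; Sketch of that identity: (num-x)*in1 + x*in2 = num*in1 + x*(in2-in1),
  ; and num*in1 has its low log2(num) bits clear, so the rounded shift
  ; splits exactly:
  ;   ((num-x)*in1 + x*in2 + rnd) >> log2(num)
  ;     == in1 + ((x*(in2-in1) + rnd) >> log2(num))
  ; leaving only the x*(in2-in1) product to compute with pmullw.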
  pmullw m2, filter_y_a
  pmullw m3, filter_y_b
  paddw m2, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m2, m3
  paddw m0, m4
%endif
  psraw m2, 4
  psraw m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m2, [srcq+src_strideq]
  movx m4, [srcq+src_strideq*2]
  movx m3, [refq+ref_strideq]
%if cpuflag(ssse3)
  movx m1, [refq]
  punpcklbw m0, m2
  punpcklbw m2, m4
  pmaddubsw m0, filter_y_a
  pmaddubsw m2, filter_y_a
  punpcklbw m3, m5
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m4, m5
  pmullw m0, filter_y_a
  pmullw m1, m2, filter_y_b
  punpcklbw m3, m5
  paddw m0, filter_rnd
  pmullw m2, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, m1
  paddw m2, filter_rnd
  movx m1, [refq]
  paddw m2, m4
%endif
  psraw m0, 4
  psraw m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps m0, m2
%endif
  packuswb m0, m2
%if %1 > 4
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; 4xh
  movh m2, [second_predq]
  pavgb m0, m2
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1
.x_nonzero:
  cmp x_offsetd, 4
  jne .x_nonhalf

; x_offset == 0.5
  test y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+1]
  mova m1, [refq]
  pavgb m0, m4
  punpckhbw m3, m1, m5
%if %2 == 1 ; avg
  pavgb m0, [second_predq]
%endif
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m4, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps m0, [srcq+src_strideq]
  movhps m4, [srcq+src_strideq+1]
%else ; 4xh
  movx m1, [srcq+src_strideq]
  punpckldq m0, m1
  movx m2, [srcq+src_strideq+1]
  punpckldq m4, m2
%endif
  movx m1, [refq]
  movx m3, [refq+ref_strideq]
  pavgb m0, m4
  punpcklbw m3, m5
%if %1 > 4
  pavgb m0, [second_predq]
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; 4xh
  movh m2, [second_predq]
  pavgb m0, m2
  punpcklbw m1, m5
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%else ; !avg
  movx m2, [srcq+src_strideq]
  movx m1, [refq]
  pavgb m0, m4
  movx m4, [srcq+src_strideq+1]
  movx m3, [refq+ref_strideq]
  pavgb m2, m4
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET %1

.x_half_y_nonzero:
  cmp y_offsetd, 4
  jne .x_half_y_nonhalf

; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu m0, [srcq]
  movu m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
.x_half_y_half_loop:
  movu m4, [srcq]
  movu m3, [srcq+1]
  mova m1, [refq]
  pavgb m4, m3
  punpckhbw m3, m1, m5
  pavgb m0, m4
%if %2 == 1 ; avg
  punpcklbw m1, m5
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
.x_half_y_half_loop:
  movx m2, [srcq]
  movx m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps m2, [srcq+src_strideq]
  movhps m3, [srcq+src_strideq+1]
%else
  movx m1, [srcq+src_strideq]
  punpckldq m2, m1
  movx m1, [srcq+src_strideq+1]
  punpckldq m3, m1
%endif
  pavgb m2, m3
%if %1 > 4
  movlhps m0, m2
  movhlps m4, m2
%else ; 4xh
  punpckldq m0, m2
  pshuflw m4, m2, 0xe
%endif
  movx m1, [refq]
  pavgb m0, m2
  movx m3, [refq+ref_strideq]
%if %1 > 4
  pavgb m0, [second_predq]
%else
  movh m2, [second_predq]
  pavgb m0, m2
%endif
  punpcklbw m3, m5
  punpcklbw m1, m5
%if %1 > 4
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%else ; !avg
  movx m4, [srcq+src_strideq]
  movx m1, [srcq+src_strideq+1]
  pavgb m2, m3
  pavgb m4, m1
  pavgb m0, m2
  pavgb m2, m4
  movx m1, [refq]
  movx m3, [refq+ref_strideq]
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_half_y_half_loop
  STORE_AND_RET %1
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+y_offsetq+16]
%endif
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
.x_half_y_other_loop:
  movu m4, [srcq]
  movu m2, [srcq+1]
  mova m1, [refq]
  pavgb m4, m2
%if cpuflag(ssse3)
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_y_a
  pmaddubsw m0, filter_y_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
  psraw m2, 4
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m4, m5
  pmullw m2, filter_y_a
  pmullw m3, filter_y_b
  paddw m2, filter_rnd
  punpcklbw m0, m5
  paddw m2, m3
  punpcklbw m3, m4, m5
  pmullw m0, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, filter_rnd
  psraw m2, 4
  paddw m0, m3
%endif
  punpckhbw m3, m1, m5
  psraw m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
%if notcpuflag(ssse3)
  punpcklbw m0, m5
%endif
.x_half_y_other_loop:
  movx m2, [srcq]
  movx m1, [srcq+1]
  movx m4, [srcq+src_strideq]
  movx m3, [srcq+src_strideq+1]
  pavgb m2, m1
  pavgb m4, m3
  movx m3, [refq+ref_strideq]
%if cpuflag(ssse3)
  movx m1, [refq]
  punpcklbw m0, m2
  punpcklbw m2, m4
  pmaddubsw m0, filter_y_a
  pmaddubsw m2, filter_y_a
  punpcklbw m3, m5
  paddw m0, filter_rnd
  paddw m2, filter_rnd
%else
  punpcklbw m2, m5
  punpcklbw m4, m5
  pmullw m0, filter_y_a
  pmullw m1, m2, filter_y_b
  punpcklbw m3, m5
  paddw m0, filter_rnd
  pmullw m2, filter_y_a
  paddw m0, m1
  pmullw m1, m4, filter_y_b
  paddw m2, filter_rnd
  paddw m2, m1
  movx m1, [refq]
%endif
  psraw m0, 4
  psraw m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps m0, m2
%endif
  packuswb m0, m2
%if %1 > 4
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  movh m2, [second_predq]
  pavgb m0, m2
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1
.x_nonhalf:
  test y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

; x_offset == bilin interpolation && y_offset == 0
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+x_offsetq+16]
%endif
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+1]
  mova m1, [refq]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_x_a
  pmaddubsw m0, filter_x_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m4, m5
  punpcklbw m0, m5
  punpcklbw m4, m5
  pmullw m2, filter_x_a
  pmullw m3, filter_x_b
  paddw m2, filter_rnd
  pmullw m0, filter_x_a
  pmullw m4, filter_x_b
  paddw m0, filter_rnd
  paddw m2, m3
  paddw m0, m4
%endif
  psraw m2, 4
  psraw m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m1, [srcq+1]
  movx m2, [srcq+src_strideq]
  movx m4, [srcq+src_strideq+1]
  movx m3, [refq+ref_strideq]
%if cpuflag(ssse3)
  punpcklbw m0, m1
  movx m1, [refq]
  punpcklbw m2, m4
  pmaddubsw m0, filter_x_a
  pmaddubsw m2, filter_x_a
  punpcklbw m3, m5
  paddw m0, filter_rnd
  paddw m2, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m1, m5
  punpcklbw m2, m5
  punpcklbw m4, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  punpcklbw m3, m5
  paddw m0, filter_rnd
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m0, m1
  paddw m2, filter_rnd
  movx m1, [refq]
  paddw m2, m4
%endif
  psraw m0, 4
  psraw m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps m0, m2
%endif
  packuswb m0, m2
%if %1 > 4
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  movh m2, [second_predq]
  pavgb m0, m2
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1
.x_nonhalf_y_nonzero:
  cmp y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

; x_offset == bilin interpolation && y_offset == 0.5
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+x_offsetq+16]
%endif
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m1
  punpcklbw m0, m1
  pmaddubsw m2, filter_x_a
  pmaddubsw m0, filter_x_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m1, m5
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  pmullw m2, filter_x_a
  pmullw m3, filter_x_b
  paddw m2, filter_rnd
  paddw m0, m1
  paddw m2, m3
%endif
  psraw m0, 4
  psraw m2, 4
  add srcq, src_strideq
  packuswb m0, m2
.x_other_y_half_loop:
  movu m4, [srcq]
  movu m3, [srcq+1]
%if cpuflag(ssse3)
  mova m1, [refq]
  punpckhbw m2, m4, m3
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  paddw m2, filter_rnd
  paddw m4, filter_rnd
  psraw m2, 4
  psraw m4, 4
  packuswb m4, m2
  pavgb m0, m4
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
%else
  punpckhbw m2, m4, m5
  punpckhbw m1, m3, m5
  punpcklbw m4, m5
  punpcklbw m3, m5
  pmullw m4, filter_x_a
  pmullw m3, filter_x_b
  paddw m4, filter_rnd
  pmullw m2, filter_x_a
  pmullw m1, filter_x_b
  paddw m2, filter_rnd
  paddw m4, m3
  paddw m2, m1
  mova m1, [refq]
  psraw m4, 4
  psraw m2, 4
  punpckhbw m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb m4, m2
  punpcklbw m1, m5
  pavgb m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb m0, [second_predq]
%endif
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  add srcq, src_strideq
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw m0, m1
  pmaddubsw m0, filter_x_a
  paddw m0, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m1
%endif
  add srcq, src_strideq
  psraw m0, 4
.x_other_y_half_loop:
  movx m2, [srcq]
  movx m1, [srcq+1]
  movx m4, [srcq+src_strideq]
  movx m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw m2, m1
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  movx m1, [refq]
  movx m3, [refq+ref_strideq]
  paddw m2, filter_rnd
  paddw m4, filter_rnd
%else
  punpcklbw m2, m5
  punpcklbw m1, m5
  punpcklbw m4, m5
  punpcklbw m3, m5
  pmullw m2, filter_x_a
  pmullw m1, filter_x_b
  paddw m2, filter_rnd
  pmullw m4, filter_x_a
  pmullw m3, filter_x_b
  paddw m4, filter_rnd
  paddw m2, m1
  movx m1, [refq]
  paddw m4, m3
  movx m3, [refq+ref_strideq]
%endif
  psraw m2, 4
  psraw m4, 4
  pavgw m0, m2
  pavgw m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps m0, m2
%endif
  packuswb m0, m2
%if %1 > 4
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  movh m2, [second_predq]
  pavgb m0, m2
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%endif
  punpcklbw m3, m5
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1
.x_nonhalf_y_nonhalf:
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+x_offsetq+16]
%endif
  mova m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m11, [bilin_filter+y_offsetq+16]
%endif
  mova m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register, so we repurpose the src_stride
; register; src_stride then has to be reloaded from the stack whenever it is
; needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
  add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m1
  punpcklbw m0, m1
  pmaddubsw m2, filter_x_a
  pmaddubsw m0, filter_x_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m1, m5
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  pmullw m2, filter_x_a
  pmullw m3, filter_x_b
  paddw m2, filter_rnd
  paddw m0, m1
  paddw m2, m3
%endif
  psraw m0, 4
  psraw m2, 4
  INC_SRC_BY_SRC_STRIDE
  packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu m4, [srcq]
  movu m3, [srcq+1]
  mova m1, [refq]
  punpckhbw m2, m4, m3
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  punpckhbw m3, m1, m5
  paddw m2, filter_rnd
  paddw m4, filter_rnd
  psraw m2, 4
  psraw m4, 4
  packuswb m4, m2
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_y_a
  pmaddubsw m0, filter_y_a
  punpcklbw m1, m5
  paddw m2, filter_rnd
  paddw m0, filter_rnd
  psraw m2, 4
  psraw m0, 4
%else
  movu m3, [srcq]
  movu m4, [srcq+1]
  punpckhbw m1, m3, m5
  punpckhbw m2, m4, m5
  punpcklbw m3, m5
  punpcklbw m4, m5
  pmullw m3, filter_x_a
  pmullw m4, filter_x_b
  paddw m3, filter_rnd
  pmullw m1, filter_x_a
  pmullw m2, filter_x_b
  paddw m1, filter_rnd
  paddw m3, m4
  paddw m1, m2
  psraw m3, 4
  psraw m1, 4
  packuswb m4, m3, m1
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  pmullw m2, filter_y_a
  pmullw m1, filter_y_b
  paddw m2, filter_rnd
  pmullw m0, filter_y_a
  pmullw m3, filter_y_b
  paddw m2, m1
  mova m1, [refq]
  paddw m0, filter_rnd
  psraw m2, 4
  paddw m0, m3
  punpckhbw m3, m1, m5
  psraw m0, 4
  punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  INC_SRC_BY_SRC_STRIDE
  add refq, ref_strideq
%else ; %1 < 16
  movx m0, [srcq]
  movx m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw m0, m1
  pmaddubsw m0, filter_x_a
  paddw m0, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m1
%endif
  psraw m0, 4
%if cpuflag(ssse3)
  packuswb m0, m0
%endif
  INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
  movx m2, [srcq]
  movx m1, [srcq+1]
  INC_SRC_BY_SRC_STRIDE
  movx m4, [srcq]
  movx m3, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw m2, m1
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  movx m3, [refq+ref_strideq]
  movx m1, [refq]
  paddw m2, filter_rnd
  paddw m4, filter_rnd
  psraw m2, 4
  psraw m4, 4
  packuswb m2, m2
  packuswb m4, m4
  punpcklbw m0, m2
  punpcklbw m2, m4
  pmaddubsw m0, filter_y_a
  pmaddubsw m2, filter_y_a
  punpcklbw m3, m5
  paddw m0, filter_rnd
  paddw m2, filter_rnd
  psraw m0, 4
  psraw m2, 4
  punpcklbw m1, m5
%else
  punpcklbw m2, m5
  punpcklbw m1, m5
  punpcklbw m4, m5
  punpcklbw m3, m5
  pmullw m2, filter_x_a
  pmullw m1, filter_x_b
  paddw m2, filter_rnd
  pmullw m4, filter_x_a
  pmullw m3, filter_x_b
  paddw m4, filter_rnd
  paddw m2, m1
  paddw m4, m3
  psraw m2, 4
  psraw m4, 4
  pmullw m0, filter_y_a
  pmullw m3, m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m2, filter_y_a
  pmullw m1, m4, filter_y_b
  paddw m2, filter_rnd
  paddw m0, m3
  movx m3, [refq+ref_strideq]
  paddw m2, m1
  movx m1, [refq]
  psraw m0, 4
  psraw m2, 4
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps m0, m2
%endif
  packuswb m0, m2
%if %1 > 4
  pavgb m0, [second_predq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  movh m2, [second_predq]
  pavgb m0, m2
  punpcklbw m0, m5
  movhlps m2, m0
%endif
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4
  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq+ref_strideq*2]
%endif
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
  dec block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
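
; Each SUBPEL_VARIANCE W[, avg] instantiation below emits one function per
; INIT_XMM target (the exact symbol prefix/suffix comes from the x86inc
; configuration), e.g. something like vpx_sub_pixel_variance16xh_sse2 or
; vpx_sub_pixel_avg_variance8xh_ssse3; the avg variants additionally blend
; the filtered source with second_pred via pavgb before the diff.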
INIT_XMM sse2
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM ssse3
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1

INIT_XMM ssse3
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1