;*****************************************************************************
;* x86-optimized functions for removegrain filter
;*
;* Copyright (C) 2015 James Darnley
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;*****************************************************************************
; column: -1  0 +1
; row -1: a1 a2 a3
; row  0: a4  c a5
; row +1: a6 a7 a8

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pw_4:    times 16 dw 4
pw_8:    times 16 dw 8
pw_div9: times 16 dw ((1<<16)+4)/9

SECTION .text
;*** Preprocessor helpers

%define a1 srcq+stride_n-1
%define a2 srcq+stride_n
%define a3 srcq+stride_n+1
%define a4 srcq-1
%define c  srcq
%define a5 srcq+1
%define a6 srcq+stride_p-1
%define a7 srcq+stride_p
%define a8 srcq+stride_p+1
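; stride_n and stride_p are defined by each function below as the negated and
; the original stride, so a1..a3 address the previous row and a6..a8 the next.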

; %1 dest simd register
; %2 source memory location
; %3 zero location (simd register/memory)
%macro LOAD 3
    movh      %1, %2
    punpcklbw %1, %3
%endmacro

%macro LOAD_SQUARE 0
    movu m1, [a1]
    movu m2, [a2]
    movu m3, [a3]
    movu m4, [a4]
    movu m0, [c]
    movu m5, [a5]
    movu m6, [a6]
    movu m7, [a7]
    movu m8, [a8]
%endmacro

; %1 zero location (simd register/memory)
%macro LOAD_SQUARE_16 1
    LOAD m1, [a1], %1
    LOAD m2, [a2], %1
    LOAD m3, [a3], %1
    LOAD m4, [a4], %1
    LOAD m0, [c], %1
    LOAD m5, [a5], %1
    LOAD m6, [a6], %1
    LOAD m7, [a7], %1
    LOAD m8, [a8], %1
%endmacro

; %1 data type
; %2 simd register to hold minimums
; %3 simd register to hold maximums
; %4 temp location (simd register/memory)
%macro SORT_PAIR 4
    mova   %4, %2
    pmin%1 %2, %3
    pmax%1 %3, %4
%endmacro

%macro SORT_AXIS 0
    SORT_PAIR ub, m1, m8, m9
    SORT_PAIR ub, m2, m7, m10
    SORT_PAIR ub, m3, m6, m11
    SORT_PAIR ub, m4, m5, m12
%endmacro

%macro SORT_AXIS_16 0
    SORT_PAIR sw, m1, m8, m9
    SORT_PAIR sw, m2, m7, m10
    SORT_PAIR sw, m3, m6, m11
    SORT_PAIR sw, m4, m5, m12
%endmacro

; The loop doesn't need to do all the iterations. It could stop when the right
; pixels are in the right registers.
%macro SORT_SQUARE 0
    %assign k 7
    %rep 7
        %assign i 1
        %assign j 2
        %rep k
            SORT_PAIR ub, m %+ i , m %+ j , m9
            %assign i i+1
            %assign j j+1
        %endrep
        %assign k k-1
    %endrep
%endmacro
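; SORT_SQUARE is a plain bubble sort over m1..m8 (the center, m0, is left
; out), so afterwards m1 holds the smallest neighbour and m8 the largest.
; Modes 2-4 then clip the center against the 2nd, 3rd and 4th ranked values.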

; %1 dest simd register
; %2 source (simd register/memory)
; %3 temp simd register
%macro ABS_DIFF 3
    mova    %3, %2
    psubusb %3, %1
    psubusb %1, %2
    por     %1, %3
%endmacro

; %1 dest simd register
; %2 source (simd register/memory)
; %3 temp simd register
%macro ABS_DIFF_W 3
    mova    %3, %2
    psubusw %3, %1
    psubusw %1, %2
    por     %1, %3
%endmacro
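; Both compute |%1 - %2| with the usual unsigned-saturation trick:
; (a -us b) | (b -us a) == |a - b|, since whichever difference would be
; negative saturates to zero.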

; %1 simd register that holds the "false" values and will hold the result
; %2 simd register that holds the "true" values
; %3 location (simd register/memory) that holds the mask
%macro BLEND 3
%if cpuflag(avx2)
    vpblendvb %1, %1, %2, %3
%else
    pand  %2, %3
    pandn %3, %1
    por   %3, %2
    SWAP  %1, %3
%endif
%endmacro
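; i.e. %1 = (%2 & %3) | (%1 & ~%3); the mask must be all-ones or all-zeros
; per element, which is what the pcmpeq* instructions below produce.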

;*** Functions

INIT_XMM sse2
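; Mode 1: clip the center pixel to the [min, max] range of its 8 neighbours.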
cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    movu m0, [a1]
    mova m1, m0
    movu m2, [a2]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [a3]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [a4]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [a5]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [a6]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [a7]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [a8]
    pmaxub m0, m2
    pminub m1, m2
    movu m2, [c]
    pminub m2, m0
    pmaxub m2, m1
    movu [dstq], m2
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

%if ARCH_X86_64
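; Modes 2-4: sort the 8 neighbours and clip the center pixel to the range
; between the Nth smallest and Nth largest value (N = 2, 3 and 4 respectively;
; mode 4 clips to the two median values).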
cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    SORT_SQUARE
    CLIPUB m0, m2, m7
    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    SORT_SQUARE
    CLIPUB m0, m3, m6
    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    SORT_SQUARE
    CLIPUB m0, m4, m5
    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

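; Mode 5: clip the center against each opposing pair of neighbours
; ((a1,a8), (a2,a7), (a3,a6), (a4,a5)) and return the clipped value that
; changes the center the least.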
cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    SORT_AXIS
    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPUB m9, m1, m8
    CLIPUB m10, m2, m7
    CLIPUB m11, m3, m6
    CLIPUB m12, m4, m5
    mova m8, m9           ; clip1
    mova m7, m10          ; clip2
    mova m6, m11          ; clip3
    mova m5, m12          ; clip4
    ABS_DIFF m9, m0, m1   ; c1
    ABS_DIFF m10, m0, m2  ; c2
    ABS_DIFF m11, m0, m3  ; c3
    ABS_DIFF m12, m0, m4  ; c4
    pminub m9, m10
    pminub m9, m11
    pminub m9, m12        ; mindiff
    pcmpeqb m10, m9
    pcmpeqb m11, m9
    pcmpeqb m12, m9
    ; Notice the order here: c1, c3, c2, c4
    BLEND m8, m6, m11
    BLEND m8, m7, m10
    BLEND m8, m5, m12
    movu [dstq], m8
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

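; Modes 6-8: like mode 5, but each pair is ranked by a weighted sum of the
; clipping change |clip - c| and the pair's own range (max - min). Mode 6
; weights them 2:1, mode 7 1:1 and mode 8 1:2; the work is done on unpacked
; words so the sums cannot overflow.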
cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    ; Some register saving suggestions: the zero can be somewhere other than a
    ; register, the center pixels could be on the stack.
    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16
    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPW m9, m1, m8   ; clip1
    CLIPW m10, m2, m7  ; clip2
    CLIPW m11, m3, m6  ; clip3
    CLIPW m12, m4, m5  ; clip4
    psubw m8, m1       ; d1
    psubw m7, m2       ; d2
    psubw m6, m3       ; d3
    psubw m5, m4       ; d4
    mova m1, m9
    mova m2, m10
    mova m3, m11
    mova m4, m12
    ABS_DIFF_W m1, m0, m13
    ABS_DIFF_W m2, m0, m14
    ABS_DIFF_W m3, m0, m13
    ABS_DIFF_W m4, m0, m14
    psllw m1, 1
    psllw m2, 1
    psllw m3, 1
    psllw m4, 1
    paddw m1, m8       ; c1
    paddw m2, m7       ; c2
    paddw m3, m6       ; c3
    paddw m4, m5       ; c4
    ; As the differences (d1..d4) can only be positive, there is no need to
    ; clip to zero. Also, the maximum positive value is less than 768.
    pminsw m1, m2
    pminsw m1, m3
    pminsw m1, m4
    pcmpeqw m2, m1
    pcmpeqw m3, m1
    pcmpeqw m4, m1
    BLEND m9, m11, m3
    BLEND m9, m10, m2
    BLEND m9, m12, m4
    packuswb m9, m9
    movh [dstq], m9
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

; This is just copy-pasted straight from mode 6 with the left shifts removed.
cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    ; Can this be done without unpacking?
    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16
    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPW m9, m1, m8   ; clip1
    CLIPW m10, m2, m7  ; clip2
    CLIPW m11, m3, m6  ; clip3
    CLIPW m12, m4, m5  ; clip4
    psubw m8, m1       ; d1
    psubw m7, m2       ; d2
    psubw m6, m3       ; d3
    psubw m5, m4       ; d4
    mova m1, m9
    mova m2, m10
    mova m3, m11
    mova m4, m12
    ABS_DIFF_W m1, m0, m13
    ABS_DIFF_W m2, m0, m14
    ABS_DIFF_W m3, m0, m13
    ABS_DIFF_W m4, m0, m14
    paddw m1, m8       ; c1
    paddw m2, m7       ; c2
    paddw m3, m6       ; c3
    paddw m4, m5       ; c4
    pminsw m1, m2
    pminsw m1, m3
    pminsw m1, m4
    pcmpeqw m2, m1
    pcmpeqw m3, m1
    pcmpeqw m4, m1
    BLEND m9, m11, m3
    BLEND m9, m10, m2
    BLEND m9, m12, m4
    packuswb m9, m9
    movh [dstq], m9
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

; This is just copy-pasted straight from mode 6 with a few changes.
cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16
    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPW m9, m1, m8   ; clip1
    CLIPW m10, m2, m7  ; clip2
    CLIPW m11, m3, m6  ; clip3
    CLIPW m12, m4, m5  ; clip4
    psubw m8, m1       ; d1
    psubw m7, m2       ; d2
    psubw m6, m3       ; d3
    psubw m5, m4       ; d4
    psllw m8, 1
    psllw m7, 1
    psllw m6, 1
    psllw m5, 1
    mova m1, m9
    mova m2, m10
    mova m3, m11
    mova m4, m12
    ABS_DIFF_W m1, m0, m13
    ABS_DIFF_W m2, m0, m14
    ABS_DIFF_W m3, m0, m13
    ABS_DIFF_W m4, m0, m14
    paddw m1, m8       ; c1
    paddw m2, m7       ; c2
    paddw m3, m6       ; c3
    paddw m4, m5       ; c4
    ; As the differences (d1..d4) can only be positive, there is no need to
    ; clip to zero. Also, the maximum positive value is less than 768.
    pminsw m1, m2
    pminsw m1, m3
    pminsw m1, m4
    pcmpeqw m2, m1
    pcmpeqw m3, m1
    pcmpeqw m4, m1
    BLEND m9, m11, m3
    BLEND m9, m10, m2
    BLEND m9, m12, m4
    packuswb m9, m9
    movh [dstq], m9
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

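; Mode 9: clip the center to the opposing pair whose two pixels are closest
; to each other (smallest max - min).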
cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    SORT_AXIS
    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPUB m9, m1, m8   ; clip1
    CLIPUB m10, m2, m7  ; clip2
    CLIPUB m11, m3, m6  ; clip3
    CLIPUB m12, m4, m5  ; clip4
    psubb m8, m1        ; d1
    psubb m7, m2        ; d2
    psubb m6, m3        ; d3
    psubb m5, m4        ; d4
    pminub m8, m7
    pminub m8, m6
    pminub m8, m5
    pcmpeqb m7, m8
    pcmpeqb m6, m8
    pcmpeqb m5, m8
    BLEND m9, m11, m6
    BLEND m9, m10, m7
    BLEND m9, m12, m5
    movu [dstq], m9
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
%endif

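; Mode 10: replace the center with whichever neighbour is closest to it in
; value (smallest absolute difference).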
cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    movu m0, [c]

    movu m1, [a4]
    mova m2, m1
    ABS_DIFF m1, m0, m7

    movu m3, [a5]        ; load pixel
    mova m4, m3
    ABS_DIFF m4, m0, m7  ; absolute difference from center
    pminub m1, m4        ; mindiff
    pcmpeqb m4, m1       ; if (difference == mindiff)
    BLEND m2, m3, m4     ; return pixel

    movu m5, [a1]
    mova m6, m5
    ABS_DIFF m6, m0, m7
    pminub m1, m6
    pcmpeqb m6, m1
    BLEND m2, m5, m6

    movu m3, [a3]
    mova m4, m3
    ABS_DIFF m4, m0, m7
    pminub m1, m4
    pcmpeqb m4, m1
    BLEND m2, m3, m4

    movu m5, [a2]
    mova m6, m5
    ABS_DIFF m6, m0, m7
    pminub m1, m6
    pcmpeqb m6, m1
    BLEND m2, m5, m6

    movu m3, [a6]
    mova m4, m3
    ABS_DIFF m4, m0, m7
    pminub m1, m4
    pcmpeqb m4, m1
    BLEND m2, m3, m4

    movu m5, [a8]
    mova m6, m5
    ABS_DIFF m6, m0, m7
    pminub m1, m6
    pcmpeqb m6, m1
    BLEND m2, m5, m6

    movu m3, [a7]
    mova m4, m3
    ABS_DIFF m4, m0, m7
    pminub m1, m4
    pcmpeqb m4, m1
    BLEND m2, m3, m4

    movu [dstq], m2
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

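; Modes 11 and 12 (identical): 3x3 weighted blur with the kernel
; [1 2 1; 2 4 2; 1 2 1] / 16, rounded to nearest.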
cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m0, m0
.loop:
    LOAD m1, [c], m0
    LOAD m2, [a2], m0
    LOAD m3, [a4], m0
    LOAD m4, [a5], m0
    LOAD m5, [a7], m0
    psllw m1, 2
    paddw m2, m3
    paddw m4, m5
    paddw m2, m4
    psllw m2, 1
    LOAD m3, [a1], m0
    LOAD m4, [a3], m0
    LOAD m5, [a6], m0
    LOAD m6, [a8], m0
    paddw m1, m2
    paddw m3, m4
    paddw m5, m6
    paddw m1, m3
    paddw m1, m5
    paddw m1, [pw_8]
    psraw m1, 4
    packuswb m1, m1
    movh [dstq], m1
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

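; Modes 13 and 14: replace the center with the average of the opposing pair
; ((a1,a8), (a3,a6) or (a2,a7)) that has the smallest absolute difference;
; the horizontal pair (a4,a5) is intentionally not considered.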
cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    movu m1, [a1]
    movu m2, [a8]
    mova m0, m1
    pavgb m1, m2
    ABS_DIFF m0, m2, m6
    movu m3, [a3]
    movu m4, [a6]
    mova m5, m3
    pavgb m3, m4
    ABS_DIFF m5, m4, m7
    pminub m0, m5
    pcmpeqb m5, m0
    BLEND m1, m3, m5
    movu m2, [a2]
    movu m3, [a7]
    mova m4, m2
    pavgb m2, m3
    ABS_DIFF m4, m3, m6
    pminub m0, m4
    pcmpeqb m4, m0
    BLEND m1, m2, m4
    movu [dstq], m1
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

%if ARCH_X86_64
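; Modes 15 and 16: compute the weighted average
; (a1 + 2*a2 + a3 + a6 + 2*a7 + a8 + 4) / 8 and clip it to the opposing pair
; ((a1,a8), (a2,a7) or (a3,a6)) with the smallest absolute difference.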
cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    mova m9, m1
    mova m10, m2
    mova m11, m3
    ABS_DIFF_W m9, m8, m12
    ABS_DIFF_W m10, m7, m13
    ABS_DIFF_W m11, m6, m14
    pminsw m9, m10
    pminsw m9, m11
    pcmpeqw m10, m9
    pcmpeqw m11, m9
    mova m12, m2
    mova m13, m1
    mova m14, m6
    paddw m12, m7
    psllw m12, 1
    paddw m13, m3
    paddw m14, m8
    paddw m12, [pw_4]
    paddw m13, m14
    paddw m12, m13
    psrlw m12, 3
    SORT_PAIR ub, m1, m8, m0
    SORT_PAIR ub, m2, m7, m9
    SORT_PAIR ub, m3, m6, m14
    mova m4, m12
    mova m5, m12
    CLIPW m4, m1, m8
    CLIPW m5, m2, m7
    CLIPW m12, m3, m6
    BLEND m4, m12, m11
    BLEND m4, m5, m10
    packuswb m4, m4
    movh [dstq], m4
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

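; Mode 17: clip the center to the interval between the largest of the four
; pair minimums and the smallest of the four pair maximums (the two bounds
; are swapped first if they end up out of order).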
cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    SORT_AXIS
    pmaxub m1, m2
    pmaxub m3, m4
    pminub m8, m7
    pminub m5, m6
    pmaxub m1, m3
    pminub m8, m5
    mova m2, m1
    pminub m1, m8
    pmaxub m8, m2
    CLIPUB m0, m1, m8
    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

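; Mode 18: for each opposing pair take the larger of the two distances to the
; center, then clip the center to the pair with the smallest such distance.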
cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    LOAD_SQUARE
    mova m9, m1
    mova m10, m8
    ABS_DIFF m9, m0, m11
    ABS_DIFF m10, m0, m12
    pmaxub m9, m10        ; m9 = d1
    mova m10, m2
    mova m11, m7
    ABS_DIFF m10, m0, m12
    ABS_DIFF m11, m0, m13
    pmaxub m10, m11       ; m10 = d2
    mova m11, m3
    mova m12, m6
    ABS_DIFF m11, m0, m13
    ABS_DIFF m12, m0, m14
    pmaxub m11, m12       ; m11 = d3
    mova m12, m4
    mova m13, m5
    ABS_DIFF m12, m0, m14
    ABS_DIFF m13, m0, m15
    pmaxub m12, m13       ; m12 = d4
    mova m13, m9
    pminub m13, m10
    pminub m13, m11
    pminub m13, m12       ; m13 = mindiff
    pcmpeqb m10, m13
    pcmpeqb m11, m13
    pcmpeqb m12, m13
    mova m14, m1
    pminub m1, m8
    pmaxub m8, m14
    mova m13, m0
    mova m14, m1
    pminub m1, m8
    pmaxub m8, m14
    CLIPUB m13, m1, m8    ; m13 = ret...d1
    mova m14, m0
    mova m15, m3
    pminub m3, m6
    pmaxub m6, m15
    CLIPUB m14, m3, m6
    pand m14, m11
    pandn m11, m13
    por m14, m11          ; m14 = ret...d3
    mova m15, m0
    mova m1, m2
    pminub m2, m7
    pmaxub m7, m1
    CLIPUB m15, m2, m7
    pand m15, m10
    pandn m10, m14
    por m15, m10          ; m15 = ret...d2
    mova m1, m0
    mova m2, m4
    pminub m4, m5
    pmaxub m5, m2
    CLIPUB m1, m4, m5
    pand m1, m12
    pandn m12, m15
    por m1, m12           ; m1 = ret...d4
    movu [dstq], m1
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
%endif

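; Mode 19: mean of the 8 neighbours (the center itself is not included),
; rounded to nearest: (a1 + ... + a8 + 4) >> 3.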
cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m0, m0
.loop:
    LOAD m1, [a1], m0
    LOAD m2, [a2], m0
    paddw m1, m2
    LOAD m3, [a3], m0
    LOAD m4, [a4], m0
    paddw m3, m4
    LOAD m5, [a5], m0
    LOAD m6, [a6], m0
    paddw m5, m6
    LOAD m2, [a7], m0
    LOAD m4, [a8], m0
    paddw m2, m4
    paddw m1, m3
    paddw m2, m5
    paddw m1, m2
    paddw m1, [pw_4]
    psraw m1, 3
    packuswb m1, m1
    movh [dstq], m1
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

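; Mode 20: mean of all 9 pixels, rounded: (sum + 4) / 9. The division is a
; fixed-point multiply-high by pw_div9 = ((1<<16)+4)/9, which is exact for
; this input range (sum + 4 <= 2299).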
cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m0, m0
.loop:
    LOAD m1, [a1], m0
    LOAD m2, [a2], m0
    paddw m1, m2
    LOAD m3, [a3], m0
    LOAD m4, [a4], m0
    paddw m3, m4
    LOAD m5, [a5], m0
    LOAD m6, [a6], m0
    paddw m5, m6
    LOAD m2, [a7], m0
    LOAD m4, [a8], m0
    paddw m2, m4
    LOAD m6, [c], m0
    paddw m1, m3
    paddw m2, m5
    paddw m6, [pw_4]
    paddw m1, m2
    paddw m1, m6
    pmulhuw m1, [pw_div9]
    packuswb m1, m1
    movh [dstq], m1
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

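; Mode 21: average each opposing pair twice, once truncating ((a+b)>>1) and
; once rounding up (pavgb), then clip the center to
; [min of the truncated averages, max of the rounded averages].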
cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m0, m0
.loop:
    movu m1, [a1]
    movu m2, [a8]
    pavgb m7, m1, m2
    punpckhbw m3, m1, m0
    punpcklbw m1, m0
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    paddw m3, m4
    paddw m1, m2
    psrlw m3, 1
    psrlw m1, 1
    packuswb m1, m3

    movu m2, [a2]
    movu m3, [a7]
    pavgb m6, m2, m3
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    punpckhbw m5, m3, m0
    punpcklbw m3, m0
    paddw m4, m5
    paddw m2, m3
    psrlw m4, 1
    psrlw m2, 1
    packuswb m2, m4
    pminub m1, m2
    pmaxub m7, m6

    movu m2, [a3]
    movu m3, [a6]
    pavgb m6, m2, m3
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    punpckhbw m5, m3, m0
    punpcklbw m3, m0
    paddw m4, m5
    paddw m2, m3
    psrlw m4, 1
    psrlw m2, 1
    packuswb m2, m4
    pminub m1, m2
    pmaxub m7, m6

    movu m2, [a4]
    movu m3, [a5]
    pavgb m6, m2, m3
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    punpckhbw m5, m3, m0
    punpcklbw m3, m0
    paddw m4, m5
    paddw m2, m3
    psrlw m4, 1
    psrlw m2, 1
    packuswb m2, m4
    pminub m1, m2
    pmaxub m7, m6

    movu m3, [c]
    CLIPUB m3, m1, m7
    movu [dstq], m3
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

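; Mode 22: same as mode 21, but both bounds come from the rounded (pavgb)
; pair averages, which is cheaper.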
cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
.loop:
    movu m0, [a1]
    movu m1, [a8]
    pavgb m0, m1
    movu m2, [a2]
    movu m3, [a7]
    pavgb m2, m3
    movu m4, [a3]
    movu m5, [a6]
    pavgb m4, m5
    movu m6, [a4]
    movu m7, [a5]
    pavgb m6, m7
    mova m1, m0
    mova m3, m2
    mova m5, m4
    mova m7, m6
    pminub m0, m2
    pminub m4, m6
    pmaxub m1, m3
    pmaxub m5, m7
    pminub m0, m4
    pmaxub m1, m5
    movu m2, [c]
    CLIPUB m2, m0, m1
    movu [dstq], m2
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

%if ARCH_X86_64
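; Mode 23: soft shrinking of edge overshoot. For each opposing pair the center
; may be raised by at most min(pair_min - c, pair_range) and lowered by at
; most min(c - pair_max, pair_range); the largest of each (clamped to >= 0)
; gives d and u, and the result is c + d - u.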
cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16
    mova m9, m8
    mova m10, m7
    mova m11, m6
    mova m12, m5
    psubw m9, m1    ; linediff1
    psubw m10, m2   ; linediff2
    psubw m11, m3   ; linediff3
    psubw m12, m4   ; linediff4
    psubw m1, m0
    psubw m2, m0
    psubw m3, m0
    psubw m4, m0
    pminsw m1, m9   ; d1
    pminsw m2, m10  ; d2
    pminsw m3, m11  ; d3
    pminsw m4, m12  ; d4
    pmaxsw m1, m2
    pmaxsw m3, m4
    pmaxsw m1, m3
    pmaxsw m1, m15  ; d
    mova m13, m0
    mova m14, m0
    mova m2, m0
    mova m4, m0
    psubw m13, m8
    psubw m14, m7
    psubw m2, m6
    psubw m4, m5
    pminsw m9, m13  ; u1
    pminsw m10, m14 ; u2
    pminsw m11, m2  ; u3
    pminsw m12, m4  ; u4
    pmaxsw m9, m10
    pmaxsw m11, m12
    pmaxsw m9, m11
    pmaxsw m9, m15  ; u
    paddw m0, m1
    psubw m0, m9
    packuswb m0, m0
    movh [dstq], m0
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

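; Mode 24: like mode 23, but each candidate correction t is further limited to
; min(t, pair_range - t), making it more conservative. The center pixel is
; kept on the stack ([rsp]) because all 16 XMM registers are in use.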
cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q
    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    mova [rsp], m0
    SORT_AXIS_16
    mova m9, m8
    mova m10, m7
    mova m11, m6
    mova m12, m5
    psubw m9, m1     ; linediff1
    psubw m10, m2    ; linediff2
    psubw m11, m3    ; linediff3
    psubw m12, m4    ; linediff4
    psubw m1, [rsp]  ; td1
    psubw m2, [rsp]  ; td2
    psubw m3, [rsp]  ; td3
    psubw m4, [rsp]  ; td4
    mova m0, m9
    mova m13, m10
    mova m14, m11
    mova m15, m12
    psubw m0, m1
    psubw m13, m2
    psubw m14, m3
    psubw m15, m4
    pminsw m1, m0    ; d1
    pminsw m2, m13   ; d2
    pminsw m3, m14   ; d3
    pminsw m4, m15   ; d4
    pmaxsw m1, m2
    pmaxsw m3, m4
    mova m0, [rsp]
    mova m13, [rsp]
    mova m14, [rsp]
    mova m15, [rsp]
    psubw m0, m8     ; tu1
    psubw m13, m7    ; tu2
    psubw m14, m6    ; tu3
    psubw m15, m5    ; tu4
    psubw m9, m0
    psubw m10, m13
    psubw m11, m14
    psubw m12, m15
    pminsw m9, m0    ; u1
    pminsw m10, m13  ; u2
    pminsw m11, m14  ; u3
    pminsw m12, m15  ; u4
    pmaxsw m9, m10
    pmaxsw m11, m12
    pmaxsw m1, m3    ; d without max(d,0)
    pmaxsw m9, m11   ; u without max(u,0)
    pxor m15, m15
    pmaxsw m1, m15
    pmaxsw m9, m15
    mova m0, [rsp]
    paddw m0, m1
    psubw m0, m9
    packuswb m0, m0
    movh [dstq], m0
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
%endif