123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234 |
- /*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
- #include "libyuv/row.h"
- // This module is for Visual C 32/64 bit and clangcl 32 bit
- #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
- #if defined(_M_X64)
- #include <emmintrin.h>
- #include <tmmintrin.h> // For _mm_maddubs_epi16
- #endif
- #ifdef __cplusplus
- namespace libyuv {
- extern "C" {
- #endif
- // 64 bit
- #if defined(_M_X64)
- // Read 4 UV from 422, upsample to 8 UV.
- #define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8;
- // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
- #define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
- a_buf += 8;
- // Convert 8 pixels: 8 UV and 8 Y.
- #define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
- // Store 8 ARGB values.
- #define STOREARGB \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
- _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
- _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
- dst_argb += 32;
- #if defined(HAS_I422TOARGBROW_SSSE3)
- void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
- while (width > 0) {
- READYUV422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
- }
- #endif
- #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
- void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
- const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
- while (width > 0) {
- READYUVA422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
- }
- #endif
- // 32 bit
- #else // defined(_M_X64)
- #ifdef HAS_ARGBTOYROW_SSSE3
- // Constants for ARGB.
- static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
- // JPeg full range.
- static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
- static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
- 112, -74, -38, 0, 112, -74, -38, 0};
- static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
- 127, -84, -43, 0, 127, -84, -43, 0};
- static const vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
- };
- static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
- -20, -107, 127, 0, -20, -107, 127, 0};
- // vpshufb for vphaddw + vpackuswb packed to shorts.
- static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
- // Constants for BGRA.
- static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
- static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
- 0, -38, -74, 112, 0, -38, -74, 112};
- static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
- 0, 112, -94, -18, 0, 112, -94, -18};
- // Constants for ABGR.
- static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
- static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
- -38, -74, 112, 0, -38, -74, 112, 0};
- static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
- 112, -94, -18, 0, 112, -94, -18, 0};
- // Constants for RGBA.
- static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
- static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
- 0, 112, -74, -38, 0, 112, -74, -38};
- static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
- 0, -18, -94, 112, 0, -18, -94, 112};
- static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
- // 7 bit fixed point 0.5.
- static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
- static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
- static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
- // Shuffle table for converting RGB24 to ARGB.
- static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
- // Shuffle table for converting RAW to ARGB.
- static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
- 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
- // Shuffle table for converting RAW to RGB24. First 8.
- static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
- // Shuffle table for converting RAW to RGB24. Middle 8.
- static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
- // Shuffle table for converting RAW to RGB24. Last 8.
- static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
- // Shuffle table for converting ARGB to RGB24.
- static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
- // Shuffle table for converting ARGB to RAW.
- static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
- // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
- static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
- // YUY2 shuf 16 Y to 32 Y.
- static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
- 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
- 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
- // YUY2 shuf 8 UV to 16 UV.
- static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
- 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
- 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
- // UYVY shuf 16 Y to 32 Y.
- static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
- 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
- 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
- // UYVY shuf 8 UV to 16 UV.
- static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
- 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
- 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
- // NV21 shuf 8 VU to 16 UV.
- static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- };
- // Duplicates gray value 3 times and fills in alpha opaque.
- __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #ifdef HAS_J400TOARGBROW_AVX2
- // Duplicates gray value 3 times and fills in alpha opaque.
- __declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
- convertloop:
- vmovdqu xmm0, [eax]
- lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8
- vpunpcklbw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- vpunpckhwd ymm1, ymm0, ymm0
- vpunpcklwd ymm0, ymm0, ymm0
- vpor ymm0, ymm0, ymm5
- vpor ymm1, ymm1, ymm5
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_J400TOARGBROW_AVX2
- __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_rgb24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqu [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqu [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqu [edx + 16], xmm1
- por xmm3, xmm5
- movdqu [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqu [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqu [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqu [edx + 16], xmm1
- por xmm3, xmm5
- movdqu [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
- uint8_t* dst_rgb24,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_rgb24
- mov ecx, [esp + 12] // width
- movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
- movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 4]
- movdqu xmm2, [eax + 8]
- lea eax, [eax + 24]
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- // pmul method to replicate bits.
- // Math to replicate bits:
- // (v << 8) | (v << 3)
- // v * 256 + v * 8
- // v * (256 + 8)
- // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
- // 20 instructions.
- __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
- psllw xmm4, 10
- psrlw xmm4, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgr565
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- pand xmm1, xmm3 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pmulhuw xmm1, xmm5 // * (256 + 8)
- pmulhuw xmm2, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- pand xmm0, xmm4 // G in middle 6 bits
- pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
- por xmm0, xmm7 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #ifdef HAS_RGB565TOARGBROW_AVX2
- // pmul method to replicate bits.
- // Math to replicate bits:
- // (v << 8) | (v << 3)
- // v * 256 + v * 8
- // v * (256 + 8)
- // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
- __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- vmovd xmm5, eax
- vbroadcastss ymm5, xmm5
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
- vpsllw ymm3, ymm3, 11
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
- vpsllw ymm4, ymm4, 10
- vpsrlw ymm4, ymm4, 5
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
- vpsllw ymm7, ymm7, 8
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
- vpand ymm1, ymm0, ymm3 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpand ymm0, ymm0, ymm4 // G in middle 6 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
- vpor ymm0, ymm0, ymm7 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm1, ymm1, 0xd8
- vpunpckhbw ymm2, ymm1, ymm0
- vpunpcklbw ymm1, ymm1, ymm0
- vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_RGB565TOARGBROW_AVX2
- #ifdef HAS_ARGB1555TOARGBROW_AVX2
- __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- vmovd xmm5, eax
- vbroadcastss ymm5, xmm5
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
- vpsllw ymm3, ymm3, 11
- vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
- vpsllw ymm7, ymm7, 8
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
- vpsllw ymm1, ymm0, 1 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpand ymm1, ymm1, ymm3
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpsraw ymm2, ymm0, 8 // A
- vpand ymm0, ymm0, ymm4 // G in middle 5 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
- vpand ymm2, ymm2, ymm7
- vpor ymm0, ymm0, ymm2 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm1, ymm1, 0xd8
- vpunpckhbw ymm2, ymm1, ymm0
- vpunpcklbw ymm1, ymm1, ymm0
- vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGB1555TOARGBROW_AVX2
- #ifdef HAS_ARGB4444TOARGBROW_AVX2
- __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
- vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
- vpand ymm2, ymm0, ymm5 // mask high nibbles
- vpand ymm0, ymm0, ymm4 // mask low nibbles
- vpsrlw ymm3, ymm2, 4
- vpsllw ymm1, ymm0, 4
- vpor ymm2, ymm2, ymm3
- vpor ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm2, ymm2, 0xd8
- vpunpckhbw ymm1, ymm0, ymm2
- vpunpcklbw ymm0, ymm0, ymm2
- vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGB4444TOARGBROW_AVX2
- // 24 instructions
- __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
- psrlw xmm4, 6
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of 1555
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- psllw xmm1, 1 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pand xmm1, xmm3
- pmulhuw xmm2, xmm5 // * (256 + 8)
- pmulhuw xmm1, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // G in middle 5 bits
- psraw xmm2, 8 // A
- pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
- pand xmm2, xmm7
- por xmm0, xmm2 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- // 18 instructions.
- __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- movd xmm4, eax
- pshufd xmm4, xmm4, 0
- movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
- pslld xmm5, 4
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // mask low nibbles
- pand xmm2, xmm5 // mask high nibbles
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- psllw xmm1, 4
- psrlw xmm3, 4
- por xmm0, xmm1
- por xmm2, xmm3
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- movd xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- punpcklbw xmm6, xmm6 // make dither 16 bytes
- movdqa xmm7, xmm6
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- paddusb xmm0, xmm6 // add dither
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
- __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- vbroadcastss xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
- vpermq ymm6, ymm6, 0xd8
- vpunpcklwd ymm6, ymm6, ymm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
- vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
- vpsrld ymm4, ymm4, 26
- vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpaddusb ymm0, ymm0, ymm6 // add dither
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
- vpackusdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTORGB565DITHERROW_AVX2
- // TODO(fbarchard): Improve sign extension/packing.
- __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
- psrld xmm4, 27
- movdqa xmm5, xmm4 // generate mask 0x000003e0
- pslld xmm5, 5
- movdqa xmm6, xmm4 // generate mask 0x00007c00
- pslld xmm6, 10
- pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
- pslld xmm7, 15
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- movdqa xmm3, xmm0 // R
- psrad xmm0, 16 // A
- psrld xmm1, 3 // B
- psrld xmm2, 6 // G
- psrld xmm3, 9 // R
- pand xmm0, xmm7 // A
- pand xmm1, xmm4 // B
- pand xmm2, xmm5 // G
- pand xmm3, xmm6 // R
- por xmm0, xmm1 // BA
- por xmm2, xmm3 // GR
- por xmm0, xmm2 // BGRA
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
- psllw xmm4, 12
- movdqa xmm3, xmm4 // generate mask 0x00f000f0
- psrlw xmm3, 8
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0
- pand xmm0, xmm3 // low nibble
- pand xmm1, xmm4 // high nibble
- psrld xmm0, 4
- psrld xmm1, 8
- por xmm0, xmm1
- packuswb xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #ifdef HAS_ARGBTORGB565ROW_AVX2
- __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
- vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
- vpsrld ymm4, ymm4, 26
- vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
- vpackusdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTORGB565ROW_AVX2
- #ifdef HAS_ARGBTOARGB1555ROW_AVX2
- __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4
- vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
- vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
- vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
- vpslld ymm7, ymm7, 15
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm3, ymm0, 9 // R
- vpsrld ymm2, ymm0, 6 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrad ymm0, ymm0, 16 // A
- vpand ymm3, ymm3, ymm6 // R
- vpand ymm2, ymm2, ymm5 // G
- vpand ymm1, ymm1, ymm4 // B
- vpand ymm0, ymm0, ymm7 // A
- vpor ymm0, ymm0, ymm1 // BA
- vpor ymm2, ymm2, ymm3 // GR
- vpor ymm0, ymm0, ymm2 // BGRA
- vpackssdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOARGB1555ROW_AVX2
- #ifdef HAS_ARGBTOARGB4444ROW_AVX2
- __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
- vpsllw ymm4, ymm4, 12
- vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpand ymm1, ymm0, ymm4 // high nibble
- vpand ymm0, ymm0, ymm3 // low nibble
- vpsrld ymm1, ymm1, 8
- vpsrld ymm0, ymm0, 4
- vpor ymm0, ymm0, ymm1
- vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOARGB4444ROW_AVX2
- // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
- __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
- // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
- __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5 // Add .5 for rounding.
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- #ifdef HAS_ARGBTOYROW_AVX2
- // vpermd for vphaddw + vpackuswb vpermd.
- static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
- // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
- __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToY
- vbroadcastf128 ymm5, xmmword ptr kAddY16
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vpaddb ymm0, ymm0, ymm5 // add 16 for Y
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOYROW_AVX2
- #ifdef HAS_ARGBTOYJROW_AVX2
- // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
- __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
- vbroadcastf128 ymm5, xmmword ptr kAddYJ64
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
- vpaddw ymm2, ymm2, ymm5
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOYJROW_AVX2
- __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kBGRAToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kABGRToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kRGBAToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
- movdqa xmm6, xmmword ptr kARGBToVJ
- movdqa xmm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #ifdef HAS_ARGBTOUVROW_AVX2
- __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- vpaddb ymm0, ymm0, ymm5 // -> unsigned
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOUVROW_AVX2
- #ifdef HAS_ARGBTOUVJROW_AVX2
- __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
- vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
- vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
- vpaddw ymm0, ymm0, ymm5
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOUVJROW_AVX2
- __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
- convertloop:
- /* convert to U and V */
- movdqu xmm0, [eax] // U
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- movdqu xmm0, [eax] // V
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqu [edx + edi], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kBGRAToV
- movdqa xmm7, xmmword ptr kBGRAToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kABGRToV
- movdqa xmm7, xmmword ptr kABGRToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kRGBAToV
- movdqa xmm7, xmmword ptr kRGBAToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBTOYROW_SSSE3
- // Read 16 UV from 444
- #define READYUV444_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
- // Read 8 UV from 422, upsample to 16 UV.
- #define READYUV422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
- // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
- #define READYUVA422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
- __asm vpermq ymm5, ymm5, 0xd8 \
- __asm lea ebp, [ebp + 16]}
- // Read 8 UV from NV12, upsample to 16 UV.
- #define READNV12_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
- // Read 8 UV from NV21, upsample to 16 UV.
- #define READNV21_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
- // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
- #define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 32]}
- // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
- #define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 32]}
- // Convert 16 pixels: 16 UV and 16 Y.
- #define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
- __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
- __asm vpsraw ymm0, ymm0, 6 \
- __asm vpsraw ymm1, ymm1, 6 \
- __asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
- // Store 16 ARGB values.
- #define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
- __asm vmovdqu 0[edx], ymm1 \
- __asm vmovdqu 32[edx], ymm0 \
- __asm lea edx, [edx + 64]}
- // Store 16 RGBA values.
- #define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
- __asm vmovdqu [edx], ymm0 \
- __asm vmovdqu [edx + 32], ymm1 \
- __asm lea edx, [edx + 64]}
- #ifdef HAS_I422TOARGBROW_AVX2
- // 16 pixels
- // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked) void I422ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I422TOARGBROW_AVX2
- #ifdef HAS_I422ALPHATOARGBROW_AVX2
- // 16 pixels
- // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
- __declspec(naked) void I422AlphaToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
- convertloop:
- READYUVA422_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebp
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I422ALPHATOARGBROW_AVX2
- #ifdef HAS_I444TOARGBROW_AVX2
- // 16 pixels
- // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked) void I444ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV444_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I444TOARGBROW_AVX2
- #ifdef HAS_NV12TOARGBROW_AVX2
- // 16 pixels.
- // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked) void NV12ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READNV12_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_NV12TOARGBROW_AVX2
- #ifdef HAS_NV21TOARGBROW_AVX2
- // 16 pixels.
- // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked) void NV21ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READNV21_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_NV21TOARGBROW_AVX2
- #ifdef HAS_YUY2TOARGBROW_AVX2
- // 16 pixels.
- // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
- __declspec(naked) void YUY2ToARGBRow_AVX2(
- const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUY2_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- vzeroupper
- ret
- }
- }
- #endif // HAS_YUY2TOARGBROW_AVX2
- #ifdef HAS_UYVYTOARGBROW_AVX2
- // 16 pixels.
- // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
- __declspec(naked) void UYVYToARGBRow_AVX2(
- const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READUYVY_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- vzeroupper
- ret
- }
- }
- #endif // HAS_UYVYTOARGBROW_AVX2
- #ifdef HAS_I422TORGBAROW_AVX2
- // 16 pixels
- // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
- __declspec(naked) void I422ToRGBARow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // abgr
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STORERGBA_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I422TORGBAROW_AVX2
- #if defined(HAS_I422TOARGBROW_SSSE3)
- // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
- // Allows a conversion with half size scaling.
- // Read 8 UV from 444.
- #define READYUV444 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
- // Read 4 UV from 422, upsample to 8 UV.
- #define READYUV422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
- // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
- #define READYUVA422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
- __asm lea ebp, [ebp + 8]}
- // Read 4 UV from NV12, upsample to 8 UV.
- #define READNV12 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
- // Read 4 VU from NV21, upsample to 8 UV.
- #define READNV21 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
- // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
- #define READYUY2 \
- __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
- __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 16]}
- // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
- #define READUYVY \
- __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
- __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 16]}
- // Convert 8 pixels: 8 UV and 8 Y.
- #define YUVTORGB(YuvConstants) \
- __asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
- __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
- // Store 8 ARGB values.
- #define STOREARGB \
- __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
- __asm movdqu 0[edx], xmm0 \
- __asm movdqu 16[edx], xmm1 \
- __asm lea edx, [edx + 32]}
- // Store 8 BGRA values.
- #define STOREBGRA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
- __asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
- __asm movdqu 0[edx], xmm5 \
- __asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32]}
- // Store 8 RGBA values.
- #define STORERGBA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
- __asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
- __asm movdqu 0[edx], xmm5 \
- __asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32]}
- // Store 8 RGB24 values.
- #define STORERGB24 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
- __asm lea edx, [edx + 24]}
- // Store 8 RGB565 values.
- #define STORERGB565 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
- __asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
- __asm lea edx, [edx + 16]}
- // 8 pixels.
- // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked) void I444ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUV444
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
- __declspec(naked) void I422ToRGB24Row_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGB24
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
- __declspec(naked) void I422ToRGB565Row_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* rgb565_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
- psrld xmm5, 27
- pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
- psrld xmm6, 26
- pslld xmm6, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
- pslld xmm7, 11
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGB565
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked) void I422ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
- __declspec(naked) void I422AlphaToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
- convertloop:
- READYUVA422
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked) void NV12ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READNV12
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked) void NV21ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READNV21
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
- __declspec(naked) void YUY2ToARGBRow_SSSE3(
- const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUY2
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- ret
- }
- }
- // 8 pixels.
- // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
- __declspec(naked) void UYVYToARGBRow_SSSE3(
- const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READUYVY
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- ret
- }
- }
- __declspec(naked) void I422ToRGBARow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGBA
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_I422TOARGBROW_SSSE3
- #ifdef HAS_I400TOARGBROW_SSE2
- // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
- __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
- uint8_t* rgb_buf,
- int width) {
- __asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
- movd xmm2, eax
- pshufd xmm2, xmm2,0
- mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
- movd xmm3, eax
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
- convertloop:
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0 // Y.Y
- pmulhuw xmm0, xmm2
- psubusw xmm0, xmm3
- psrlw xmm0, 6
- packuswb xmm0, xmm0 // G
- // Step 2: Weave into ARGB
- punpcklbw xmm0, xmm0 // GG
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0 // BGRA first 4 pixels
- punpckhwd xmm1, xmm1 // BGRA next 4 pixels
- por xmm0, xmm4
- por xmm1, xmm4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_I400TOARGBROW_SSE2
- #ifdef HAS_I400TOARGBROW_AVX2
- // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
- // note: vpunpcklbw mutates and vpackuswb unmutates.
- __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
- uint8_t* rgb_buf,
- int width) {
- __asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
- vmovd xmm2, eax
- vbroadcastss ymm2, xmm2
- mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
- vmovd xmm3, eax
- vbroadcastss ymm3, xmm3
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
- vpslld ymm4, ymm4, 24
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
- convertloop:
- // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
- vmovdqu xmm0, [eax]
- lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
- vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
- vpmulhuw ymm0, ymm0, ymm2
- vpsubusw ymm0, ymm0, ymm3
- vpsrlw ymm0, ymm0, 6
- vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
- // TODO(fbarchard): Weave alpha with unpack.
- // Step 2: Weave into ARGB
- vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
- vpermq ymm1, ymm1, 0xd8
- vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
- vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
- vpor ymm0, ymm0, ymm4
- vpor ymm1, ymm1, ymm4
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_I400TOARGBROW_AVX2
- #ifdef HAS_MIRRORROW_SSSE3
- // Shuffle table for reversing the bytes.
- static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
- // TODO(fbarchard): Replace lea with -16 offset.
- __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- movdqa xmm5, xmmword ptr kShuffleMirror
- convertloop:
- movdqu xmm0, [eax - 16 + ecx]
- pshufb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- #endif // HAS_MIRRORROW_SSSE3
- #ifdef HAS_MIRRORROW_AVX2
- __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
- convertloop:
- vmovdqu ymm0, [eax - 32 + ecx]
- vpshufb ymm0, ymm0, ymm5
- vpermq ymm0, ymm0, 0x4e // swap high and low halfs
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_MIRRORROW_AVX2
- #ifdef HAS_MIRRORUVROW_SSSE3
- // Shuffle table for reversing the bytes of UV channels.
- static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
- __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm1, xmmword ptr kShuffleMirrorUV
- lea eax, [eax + ecx * 2 - 16]
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm1
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [edx + edi], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_MIRRORUVROW_SSSE3
- #ifdef HAS_ARGBMIRRORROW_SSE2
- __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
- convertloop:
- movdqu xmm0, [eax]
- lea eax, [eax - 16]
- pshufd xmm0, xmm0, 0x1b
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBMIRRORROW_SSE2
- #ifdef HAS_ARGBMIRRORROW_AVX2
- // Shuffle table for reversing the bytes.
- static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
- __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
- convertloop:
- vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBMIRRORROW_AVX2
- #ifdef HAS_SPLITUVROW_SSE2
- __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [edx], xmm0
- movdqu [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_SPLITUVROW_SSE2
- #ifdef HAS_SPLITUVROW_AVX2
- __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm2, ymm0, 8 // odd bytes
- vpsrlw ymm3, ymm1, 8
- vpand ymm0, ymm0, ymm5 // even bytes
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpackuswb ymm2, ymm2, ymm3
- vpermq ymm0, ymm0, 0xd8
- vpermq ymm2, ymm2, 0xd8
- vmovdqu [edx], ymm0
- vmovdqu [edx + edi], ymm2
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- #endif // HAS_SPLITUVROW_AVX2
- #ifdef HAS_MERGEUVROW_SSE2
- __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // read 16 U's
- movdqu xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_MERGEUVROW_SSE2
- #ifdef HAS_MERGEUVROW_AVX2
- __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // read 32 U's
- vmovdqu ymm1, [eax + edx] // and 32 V's
- lea eax, [eax + 32]
- vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
- vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vextractf128 [edi], ymm2, 0 // bytes 0..15
- vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
- vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
- vextractf128 [edi + 48], ymm0, 1 // bytes 47..63
- lea edi, [edi + 64]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- #endif // HAS_MERGEUVROW_AVX2
- #ifdef HAS_COPYROW_SSE2
- // CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
- __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- test eax, 15
- jne convertloopu
- test edx, 15
- jne convertloopu
- convertloopa:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopa
- ret
- convertloopu:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopu
- ret
- }
- }
- #endif // HAS_COPYROW_SSE2
- #ifdef HAS_COPYROW_AVX
- // CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
- __declspec(naked) void CopyRow_AVX(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 64
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_COPYROW_AVX
- // Multiple of 1.
- __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, esi
- mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- rep movsb
- mov edi, edx
- mov esi, eax
- ret
- }
- }
- #ifdef HAS_ARGBCOPYALPHAROW_SSE2
- // width in pixels
- __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
- convertloop:
- movdqu xmm2, [eax]
- movdqu xmm3, [eax + 16]
- lea eax, [eax + 32]
- movdqu xmm4, [edx]
- movdqu xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBCOPYALPHAROW_SSE2
- #ifdef HAS_ARGBCOPYALPHAROW_AVX2
- // width in pixels
- __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- convertloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + 32]
- lea eax, [eax + 64]
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBCOPYALPHAROW_AVX2
- #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
- // width in pixels
- __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
- mov ecx, [esp + 12] // width
- extractloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm0, 24
- psrld xmm1, 24
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg extractloop
- ret
- }
- }
- #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
- #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
- // width in pixels
- __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
- mov ecx, [esp + 12] // width
- vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
- extractloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpsrld ymm0, ymm0, 24
- vpsrld ymm1, ymm1, 24
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- lea eax, [eax + 128]
- vpackssdw ymm0, ymm0, ymm1 // mutates
- vpsrld ymm2, ymm2, 24
- vpsrld ymm3, ymm3, 24
- vpackssdw ymm2, ymm2, ymm3 // mutates
- vpackuswb ymm0, ymm0, ymm2 // mutates
- vpermd ymm0, ymm4, ymm0 // unmutate
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg extractloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
- #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
- // width in pixels
- __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
- convertloop:
- movq xmm2, qword ptr [eax] // 8 Y's
- lea eax, [eax + 8]
- punpcklbw xmm2, xmm2
- punpckhwd xmm3, xmm2
- punpcklwd xmm2, xmm2
- movdqu xmm4, [edx]
- movdqu xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
- #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
- // width in pixels
- __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- convertloop:
- vpmovzxbd ymm1, qword ptr [eax]
- vpmovzxbd ymm2, qword ptr [eax + 8]
- lea eax, [eax + 16]
- vpslld ymm1, ymm1, 24
- vpslld ymm2, ymm2, 24
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
- #ifdef HAS_SETROW_X86
- // Write 'width' bytes using an 8 bit value repeated.
- // width should be multiple of 4.
- __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
- __asm {
- movzx eax, byte ptr [esp + 8] // v8
- mov edx, 0x01010101 // Duplicate byte to all bytes.
- mul edx // overwrites edx with upper part of result.
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov ecx, [esp + 12] // width
- shr ecx, 2
- rep stosd
- mov edi, edx
- ret
- }
- }
- // Write 'width' bytes using an 8 bit value repeated.
- __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v8
- mov ecx, [esp + 12] // width
- rep stosb
- mov edi, edx
- ret
- }
- }
- // Write 'width' 32 bit values.
- __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
- uint32_t v32,
- int width) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
- mov ecx, [esp + 12] // width
- rep stosd
- mov edi, edx
- ret
- }
- }
- #endif // HAS_SETROW_X86
- #ifdef HAS_YUY2TOYROW_AVX2
- __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // even bytes are Y
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // odd bytes are Y
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- #endif // HAS_YUY2TOYROW_AVX2
- #ifdef HAS_YUY2TOYROW_SSE2
- __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_YUY2TOYROW_SSE2
- #ifdef HAS_BLENDPLANEROW_SSSE3
- // Blend 8 pixels at a time.
- // unsigned version of math
- // =((A2*C2)+(B2*(255-C2))+255)/256
- // signed version of math
- // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
- __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- __asm {
- push esi
- push edi
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm6, eax
- pshufd xmm6, xmm6, 0x00
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- movd xmm7, eax
- pshufd xmm7, xmm7, 0x00
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
- // 8 pixel loop.
- convertloop8:
- movq xmm0, qword ptr [esi] // alpha
- punpcklbw xmm0, xmm0
- pxor xmm0, xmm5 // a, 255-a
- movq xmm1, qword ptr [eax + esi] // src0
- movq xmm2, qword ptr [edx + esi] // src1
- punpcklbw xmm1, xmm2
- psubb xmm1, xmm6 // bias src0/1 - 128
- pmaddubsw xmm0, xmm1
- paddw xmm0, xmm7 // unbias result - 32768 and round.
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- movq qword ptr [edi + esi], xmm0
- lea esi, [esi + 8]
- sub ecx, 8
- jg convertloop8
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_BLENDPLANEROW_SSSE3
- #ifdef HAS_BLENDPLANEROW_AVX2
- // Blend 32 pixels at a time.
- // unsigned version of math
- // =((A2*C2)+(B2*(255-C2))+255)/256
- // signed version of math
- // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
- __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- __asm {
- push esi
- push edi
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
- vpsllw ymm5, ymm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- vmovd xmm7, eax
- vbroadcastss ymm7, xmm7
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
- // 32 pixel loop.
- convertloop32:
- vmovdqu ymm0, [esi] // alpha
- vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
- vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
- vpxor ymm3, ymm3, ymm5 // a, 255-a
- vpxor ymm0, ymm0, ymm5 // a, 255-a
- vmovdqu ymm1, [eax + esi] // src0
- vmovdqu ymm2, [edx + esi] // src1
- vpunpckhbw ymm4, ymm1, ymm2
- vpunpcklbw ymm1, ymm1, ymm2
- vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
- vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
- vpmaddubsw ymm3, ymm3, ymm4
- vpmaddubsw ymm0, ymm0, ymm1
- vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
- vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
- vpsrlw ymm3, ymm3, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm3
- vmovdqu [edi + esi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg convertloop32
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_BLENDPLANEROW_AVX2
- #ifdef HAS_ARGBBLENDROW_SSSE3
- // Shuffle table for isolating alpha.
- static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
- // Blend 8 pixels at a time.
- __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 0x0001
- psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
- psrlw xmm6, 8
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- sub ecx, 4
- jl convertloop4b // less than 4 pixels?
- // 4 pixel loop.
- convertloop4:
- movdqu xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jge convertloop4
- convertloop4b:
- add ecx, 4 - 1
- jl convertloop1b
- // 1 pixel loop.
- convertloop1:
- movd xmm3, [eax] // src argb
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge convertloop1
- convertloop1b:
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBBLENDROW_SSSE3
- #ifdef HAS_ARGBATTENUATEROW_SSSE3
- // Shuffle table duplicating alpha.
- static const uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
- };
- static const uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
- };
- __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0xff000000
- pslld xmm3, 24
- movdqa xmm4, xmmword ptr kShuffleAlpha0
- movdqa xmm5, xmmword ptr kShuffleAlpha1
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqu xmm1, [eax] // read 4 pixels
- punpcklbw xmm1, xmm1 // first 2 pixel rgbs
- pmulhuw xmm0, xmm1 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqu xmm2, [eax] // read 4 pixels
- punpckhbw xmm2, xmm2 // next 2 pixel rgbs
- pmulhuw xmm1, xmm2 // rgb * a
- movdqu xmm2, [eax] // mask original alpha
- lea eax, [eax + 16]
- pand xmm2, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- por xmm0, xmm2 // copy original alpha
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBATTENUATEROW_SSSE3
- #ifdef HAS_ARGBATTENUATEROW_AVX2
- // Shuffle table duplicating alpha.
- static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
- 128u, 128u, 14u, 15u, 14u, 15u,
- 14u, 15u, 128u, 128u};
- __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpshufb ymm2, ymm0, ymm4 // low 4 alphas
- vpshufb ymm3, ymm1, ymm4 // high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * a
- vpmulhuw ymm1, ymm1, ymm3 // rgb * a
- vpand ymm6, ymm6, ymm5 // isolate alpha
- vpsrlw ymm0, ymm0, 8
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vpor ymm0, ymm0, ymm6 // copy original alpha
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBATTENUATEROW_AVX2
- #ifdef HAS_ARGBUNATTENUATEROW_SSE2
- // Unattenuate 4 pixels at a time.
- __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- lea ebx, fixed_invtbl8
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 3] // first alpha
- movzx edi, byte ptr [eax + 7] // second alpha
- punpcklbw xmm0, xmm0 // first 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm0, xmm2 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 11] // third alpha
- movzx edi, byte ptr [eax + 15] // forth alpha
- punpckhbw xmm1, xmm1 // next 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm1, xmm2 // rgb * a
- lea eax, [eax + 16]
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- pop edi
- pop esi
- pop ebx
- ret
- }
- }
- #endif // HAS_ARGBUNATTENUATEROW_SSE2
- #ifdef HAS_ARGBUNATTENUATEROW_AVX2
- // Shuffle table duplicating alpha.
- static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
- // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
- // USE_GATHER is not on by default, due to being a slow instruction.
- #ifdef USE_GATHER
- __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
- vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #else // USE_GATHER
- __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- sub edx, eax
- lea ebx, fixed_invtbl8
- vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
- convertloop:
- // replace VPGATHER
- movzx esi, byte ptr [eax + 3] // alpha0
- movzx edi, byte ptr [eax + 7] // alpha1
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
- movzx esi, byte ptr [eax + 11] // alpha2
- movzx edi, byte ptr [eax + 15] // alpha3
- vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
- movzx esi, byte ptr [eax + 19] // alpha4
- movzx edi, byte ptr [eax + 23] // alpha5
- vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
- movzx esi, byte ptr [eax + 27] // alpha6
- movzx edi, byte ptr [eax + 31] // alpha7
- vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
- vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
- vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
- vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
- vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
- // end of VPGATHER
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- pop edi
- pop esi
- pop ebx
- vzeroupper
- ret
- }
- }
- #endif // USE_GATHER
- #endif // HAS_ARGBATTENUATEROW_AVX2
- #ifdef HAS_ARGBGRAYROW_SSSE3
- // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
- __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
- convertloop:
- movdqu xmm0, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm0, xmm1
- paddw xmm0, xmm5 // Add .5 for rounding.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 G bytes
- movdqu xmm2, [eax] // A
- movdqu xmm3, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm2, 24
- psrld xmm3, 24
- packuswb xmm2, xmm3
- packuswb xmm2, xmm2 // 8 A bytes
- movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
- punpcklbw xmm0, xmm0 // 8 GG words
- punpcklbw xmm3, xmm2 // 8 GA words
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm3 // GGGA first 4
- punpckhwd xmm1, xmm3 // GGGA next 4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBGRAYROW_SSSE3
- #ifdef HAS_ARGBSEPIAROW_SSSE3
- // b = (r * 35 + g * 68 + b * 17) >> 7
- // g = (r * 45 + g * 88 + b * 22) >> 7
- // r = (r * 50 + g * 98 + b * 24) >> 7
- // Constant for ARGB color to sepia tone.
- static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
- 17, 68, 35, 0, 17, 68, 35, 0};
- static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
- 22, 88, 45, 0, 22, 88, 45, 0};
- static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
- 24, 98, 50, 0, 24, 98, 50, 0};
- // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
- __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- mov ecx, [esp + 8] /* width */
- movdqa xmm2, xmmword ptr kARGBToSepiaB
- movdqa xmm3, xmmword ptr kARGBToSepiaG
- movdqa xmm4, xmmword ptr kARGBToSepiaR
- convertloop:
- movdqu xmm0, [eax] // B
- movdqu xmm6, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm6, xmm2
- phaddw xmm0, xmm6
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 B values
- movdqu xmm5, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm5, xmm3
- pmaddubsw xmm1, xmm3
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
- movdqu xmm5, [eax] // R
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm5, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
- movdqu xmm6, [eax] // A
- movdqu xmm1, [eax + 16]
- psrld xmm6, 24
- psrld xmm1, 24
- packuswb xmm6, xmm1
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm5, xmm6 // 8 RA values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
- movdqu [eax], xmm0
- movdqu [eax + 16], xmm1
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBSEPIAROW_SSSE3
- #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
- // Tranform 8 ARGB pixels (32 bytes) with color matrix.
- // Same as Sepia except matrix is provided.
- // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
- // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
- __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* matrix_argb */
- movdqu xmm5, [ecx]
- pshufd xmm2, xmm5, 0x00
- pshufd xmm3, xmm5, 0x55
- pshufd xmm4, xmm5, 0xaa
- pshufd xmm5, xmm5, 0xff
- mov ecx, [esp + 16] /* width */
- convertloop:
- movdqu xmm0, [eax] // B
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm7, xmm2
- movdqu xmm6, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm6, xmm3
- pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm7 // B
- phaddsw xmm6, xmm1 // G
- psraw xmm0, 6 // B
- psraw xmm6, 6 // G
- packuswb xmm0, xmm0 // 8 B values
- packuswb xmm6, xmm6 // 8 G values
- punpcklbw xmm0, xmm6 // 8 BG values
- movdqu xmm1, [eax] // R
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm7, xmm4
- phaddsw xmm1, xmm7 // R
- movdqu xmm6, [eax] // A
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm6, xmm5
- pmaddubsw xmm7, xmm5
- phaddsw xmm6, xmm7 // A
- psraw xmm1, 6 // R
- psraw xmm6, 6 // A
- packuswb xmm1, xmm1 // 8 R values
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm1, xmm6 // 8 RA values
- movdqa xmm6, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm1 // BGRA first 4
- punpckhwd xmm6, xmm1 // BGRA next 4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm6
- lea eax, [eax + 32]
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
- #ifdef HAS_ARGBQUANTIZEROW_SSE2
- // Quantize 4 ARGB pixels (16 bytes).
- __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- movd xmm2, [esp + 8] /* scale */
- movd xmm3, [esp + 12] /* interval_size */
- movd xmm4, [esp + 16] /* interval_offset */
- mov ecx, [esp + 20] /* width */
- pshuflw xmm2, xmm2, 040h
- pshufd xmm2, xmm2, 044h
- pshuflw xmm3, xmm3, 040h
- pshufd xmm3, xmm3, 044h
- pshuflw xmm4, xmm4, 040h
- pshufd xmm4, xmm4, 044h
- pxor xmm5, xmm5 // constant 0
- pcmpeqb xmm6, xmm6 // generate mask 0xff000000
- pslld xmm6, 24
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm5 // first 2 pixels
- pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqu xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm5 // next 2 pixels
- pmulhuw xmm1, xmm2
- pmullw xmm0, xmm3 // * interval_size
- movdqu xmm7, [eax] // read 4 pixels
- pmullw xmm1, xmm3
- pand xmm7, xmm6 // mask alpha
- paddw xmm0, xmm4 // + interval_size / 2
- paddw xmm1, xmm4
- packuswb xmm0, xmm1
- por xmm0, xmm7
- movdqu [eax], xmm0
- lea eax, [eax + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBQUANTIZEROW_SSE2
- #ifdef HAS_ARGBSHADEROW_SSE2
- // Shade 4 pixels at a time by specified value.
- __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- movd xmm2, [esp + 16] // value
- punpcklbw xmm2, xmm2
- punpcklqdq xmm2, xmm2
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBSHADEROW_SSE2
- #ifdef HAS_ARGBMULTIPLYROW_SSE2
- // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
- __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pxor xmm5, xmm5 // constant 0
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm2, [esi] // read 4 pixels from src_argb1
- movdqu xmm1, xmm0
- movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
- lea eax, [eax + 16]
- lea esi, [esi + 16]
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBMULTIPLYROW_SSE2
- #ifdef HAS_ARGBADDROW_SSE2
- // Add 2 rows of ARGB pixels together, 4 pixels at a time.
- // TODO(fbarchard): Port this to posix, neon and other math functions.
- __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub ecx, 4
- jl convertloop49
- convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jge convertloop4
- convertloop49:
- add ecx, 4 - 1
- jl convertloop19
- convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
- lea eax, [eax + 4]
- movd xmm1, [esi] // read 1 pixels from src_argb1
- lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge convertloop1
- convertloop19:
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBADDROW_SSE2
- #ifdef HAS_ARGBSUBTRACTROW_SSE2
- // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
- __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBSUBTRACTROW_SSE2
- #ifdef HAS_ARGBMULTIPLYROW_AVX2
- // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
- __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
- convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
- lea esi, [esi + 32]
- vpunpcklbw ymm0, ymm1, ymm1 // low 4
- vpunpckhbw ymm1, ymm1, ymm1 // high 4
- vpunpcklbw ymm2, ymm3, ymm5 // low 4
- vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
- vpackuswb ymm0, ymm0, ymm1
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBMULTIPLYROW_AVX2
- #ifdef HAS_ARGBADDROW_AVX2
- // Add 2 rows of ARGB pixels together, 8 pixels at a time.
- __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBADDROW_AVX2
- #ifdef HAS_ARGBSUBTRACTROW_AVX2
- // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
- __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBSUBTRACTROW_AVX2
- #ifdef HAS_SOBELXROW_SSE2
- // SobelX as a matrix is
- // -1 0 1
- // -2 0 2
- // -1 0 1
- __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y0
- mov esi, [esp + 8 + 8] // src_y1
- mov edi, [esp + 8 + 12] // src_y2
- mov edx, [esp + 8 + 16] // dst_sobelx
- mov ecx, [esp + 8 + 20] // width
- sub esi, eax
- sub edi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
- movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- sub ecx, 8
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELXROW_SSE2
- #ifdef HAS_SOBELYROW_SSE2
- // SobelY as a matrix is
- // -1 -2 -1
- // 0 0 0
- // 1 2 1
- __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_y0
- mov esi, [esp + 4 + 8] // src_y1
- mov edx, [esp + 4 + 12] // dst_sobely
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
- movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- sub ecx, 8
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELYROW_SSE2
- #ifdef HAS_SOBELROW_SSE2
- // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
- // A = 255
- // R = Sobel
- // G = Sobel
- // B = Sobel
- __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- pslld xmm5, 24 // 0xff000000
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqa xmm2, xmm0 // GG
- punpcklbw xmm2, xmm0 // First 8
- punpckhbw xmm0, xmm0 // Next 8
- movdqa xmm1, xmm2 // GGGG
- punpcklwd xmm1, xmm2 // First 4
- punpckhwd xmm2, xmm2 // Next 4
- por xmm1, xmm5 // GGGA
- por xmm2, xmm5
- movdqa xmm3, xmm0 // GGGG
- punpcklwd xmm3, xmm0 // Next 4
- punpckhwd xmm0, xmm0 // Last 4
- por xmm3, xmm5 // GGGA
- por xmm0, xmm5
- movdqu [edx], xmm1
- movdqu [edx + 16], xmm2
- movdqu [edx + 32], xmm3
- movdqu [edx + 48], xmm0
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELROW_SSE2
- #ifdef HAS_SOBELTOPLANEROW_SSE2
- // Adds Sobel X and Sobel Y and stores Sobel into a plane.
- __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELTOPLANEROW_SSE2
- #ifdef HAS_SOBELXYROW_SSE2
- // Mixes Sobel X, Sobel Y and Sobel into ARGB.
- // A = 255
- // R = Sobel X
- // G = Sobel
- // B = Sobel Y
- __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- paddusb xmm2, xmm1 // sobel = sobelx + sobely
- movdqa xmm3, xmm0 // XA
- punpcklbw xmm3, xmm5
- punpckhbw xmm0, xmm5
- movdqa xmm4, xmm1 // YS
- punpcklbw xmm4, xmm2
- punpckhbw xmm1, xmm2
- movdqa xmm6, xmm4 // YSXA
- punpcklwd xmm6, xmm3 // First 4
- punpckhwd xmm4, xmm3 // Next 4
- movdqa xmm7, xmm1 // YSXA
- punpcklwd xmm7, xmm0 // Next 4
- punpckhwd xmm1, xmm0 // Last 4
- movdqu [edx], xmm6
- movdqu [edx + 16], xmm4
- movdqu [edx + 32], xmm7
- movdqu [edx + 48], xmm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELXYROW_SSE2
- #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
- // Consider float CumulativeSum.
- // Consider calling CumulativeSum one row at time as needed.
- // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
- // Convert cumulative sum for an area to an average for 1 pixel.
- // topleft is pointer to top left of CumulativeSum buffer for area.
- // botleft is pointer to bottom left of CumulativeSum buffer.
- // width is offset from left to right of area in CumulativeSum buffer measured
- // in number of ints.
- // area is the number of pixels in the area being averaged.
- // dst points to pixel to store result to.
- // count is number of averaged pixels to produce.
- // Does 4 pixels at a time.
- // This function requires alignment on accumulation buffer pointers.
- void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
- const int32_t* botleft,
- int width,
- int area,
- uint8_t* dst,
- int count) {
- __asm {
- mov eax, topleft // eax topleft
- mov esi, botleft // esi botleft
- mov edx, width
- movd xmm5, area
- mov edi, dst
- mov ecx, count
- cvtdq2ps xmm5, xmm5
- rcpss xmm4, xmm5 // 1.0f / area
- pshufd xmm4, xmm4, 0
- sub ecx, 4
- jl l4b
- cmp area, 128 // 128 pixels will not overflow 15 bits.
- ja l4
- pshufd xmm5, xmm5, 0 // area
- pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
- psrld xmm6, 16
- cvtdq2ps xmm6, xmm6
- addps xmm5, xmm6 // (65536.0 + area - 1)
- mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
- cvtps2dq xmm5, xmm5 // 0.16 fixed point
- packssdw xmm5, xmm5 // 16 bit shorts
- // 4 pixel loop small blocks.
- s4:
- // top left
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
- packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
- packssdw xmm2, xmm3
- pmulhuw xmm0, xmm5
- pmulhuw xmm2, xmm5
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge s4
- jmp l4b
- // 4 pixel loop
- l4:
- // top left
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
- cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
- cvtdq2ps xmm1, xmm1
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- cvtdq2ps xmm2, xmm2
- cvtdq2ps xmm3, xmm3
- mulps xmm2, xmm4
- mulps xmm3, xmm4
- cvtps2dq xmm0, xmm0
- cvtps2dq xmm1, xmm1
- cvtps2dq xmm2, xmm2
- cvtps2dq xmm3, xmm3
- packssdw xmm0, xmm1
- packssdw xmm2, xmm3
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge l4
- l4b:
- add ecx, 4 - 1
- jl l1b
- // 1 pixel loop
- l1:
- movdqu xmm0, [eax]
- psubd xmm0, [eax + edx * 4]
- lea eax, [eax + 16]
- psubd xmm0, [esi]
- paddd xmm0, [esi + edx * 4]
- lea esi, [esi + 16]
- cvtdq2ps xmm0, xmm0
- mulps xmm0, xmm4
- cvtps2dq xmm0, xmm0
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
- sub ecx, 1
- jge l1
- l1b:
- }
- }
- #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
- #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
- // Creates a table of cumulative sums where each value is a sum of all values
- // above and to the left of the value.
- void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- __asm {
- mov eax, row
- mov edx, cumsum
- mov esi, previous_cumsum
- mov ecx, width
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- sub ecx, 4
- jl l4b
- test edx, 15
- jne l4b
- // 4 pixel loop
- l4:
- movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
- lea eax, [eax + 16]
- movdqa xmm4, xmm2
- punpcklbw xmm2, xmm1
- movdqa xmm3, xmm2
- punpcklwd xmm2, xmm1
- punpckhwd xmm3, xmm1
- punpckhbw xmm4, xmm1
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm1
- punpckhwd xmm5, xmm1
- paddd xmm0, xmm2
- movdqu xmm2, [esi] // previous row above.
- paddd xmm2, xmm0
- paddd xmm0, xmm3
- movdqu xmm3, [esi + 16]
- paddd xmm3, xmm0
- paddd xmm0, xmm4
- movdqu xmm4, [esi + 32]
- paddd xmm4, xmm0
- paddd xmm0, xmm5
- movdqu xmm5, [esi + 48]
- lea esi, [esi + 64]
- paddd xmm5, xmm0
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- movdqu [edx + 32], xmm4
- movdqu [edx + 48], xmm5
- lea edx, [edx + 64]
- sub ecx, 4
- jge l4
- l4b:
- add ecx, 4 - 1
- jl l1b
- // 1 pixel loop
- l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel
- lea eax, [eax + 4]
- punpcklbw xmm2, xmm1
- punpcklwd xmm2, xmm1
- paddd xmm0, xmm2
- movdqu xmm2, [esi]
- lea esi, [esi + 16]
- paddd xmm2, xmm0
- movdqu [edx], xmm2
- lea edx, [edx + 16]
- sub ecx, 1
- jge l1
- l1b:
- }
- }
- #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
- #ifdef HAS_ARGBAFFINEROW_SSE2
- // Copy ARGB pixels from source image with slope to a row of destination.
- __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
- int src_argb_stride,
- uint8_t* dst_argb,
- const float* uv_dudv,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 12] // src_argb
- mov esi, [esp + 16] // stride
- mov edx, [esp + 20] // dst_argb
- mov ecx, [esp + 24] // pointer to uv_dudv
- movq xmm2, qword ptr [ecx] // uv
- movq xmm7, qword ptr [ecx + 8] // dudv
- mov ecx, [esp + 28] // width
- shl esi, 16 // 4, stride
- add esi, 4
- movd xmm5, esi
- sub ecx, 4
- jl l4b
- // setup for 4 pixel loop
- pshufd xmm7, xmm7, 0x44 // dup dudv
- pshufd xmm5, xmm5, 0 // dup 4, stride
- movdqa xmm0, xmm2 // x0, y0, x1, y1
- addps xmm0, xmm7
- movlhps xmm2, xmm0
- movdqa xmm4, xmm7
- addps xmm4, xmm4 // dudv *= 2
- movdqa xmm3, xmm2 // x2, y2, x3, y3
- addps xmm3, xmm4
- addps xmm4, xmm4 // dudv *= 4
- // 4 pixel loop
- l4:
- cvttps2dq xmm0, xmm2 // x, y float to int first 2
- cvttps2dq xmm1, xmm3 // x, y float to int next 2
- packssdw xmm0, xmm1 // x, y as 8 shorts
- pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd xmm1, [eax + esi] // read pixel 0
- movd xmm6, [eax + edi] // read pixel 1
- punpckldq xmm1, xmm6 // combine pixel 0 and 1
- addps xmm2, xmm4 // x, y += dx, dy first 2
- movq qword ptr [edx], xmm1
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- movd xmm6, [eax + esi] // read pixel 2
- movd xmm0, [eax + edi] // read pixel 3
- punpckldq xmm6, xmm0 // combine pixel 2 and 3
- addps xmm3, xmm4 // x, y += dx, dy next 2
- movq qword ptr 8[edx], xmm6
- lea edx, [edx + 16]
- sub ecx, 4
- jge l4
- l4b:
- add ecx, 4 - 1
- jl l1b
- // 1 pixel loop
- l1:
- cvttps2dq xmm0, xmm2 // x, y float to int
- packssdw xmm0, xmm0 // x, y as shorts
- pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
- addps xmm2, xmm7 // x, y += dx, dy
- movd esi, xmm0
- movd xmm0, [eax + esi] // copy a pixel
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge l1
- l1b:
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBAFFINEROW_SSE2
- #ifdef HAS_INTERPOLATEROW_AVX2
- // Bilinear filter 32x2 -> 32x1
- __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- sub edi, esi
- cmp eax, 128
- je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
- vmovd xmm0, eax // high fraction 0..255
- neg eax
- add eax, 256
- vmovd xmm5, eax // low fraction 256..1
- vpunpcklbw xmm5, xmm5, xmm0
- vpunpcklwd xmm5, xmm5, xmm5
- vbroadcastss ymm5, xmm5
- mov eax, 0x80808080 // 128b for bias and rounding.
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
- xloop:
- vmovdqu ymm0, [esi]
- vmovdqu ymm2, [esi + edx]
- vpunpckhbw ymm1, ymm0, ymm2 // mutates
- vpunpcklbw ymm0, ymm0, ymm2
- vpsubb ymm1, ymm1, ymm4 // bias to signed image
- vpsubb ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm5, ymm1
- vpmaddubsw ymm0, ymm5, ymm0
- vpaddw ymm1, ymm1, ymm4 // unbias and round
- vpaddw ymm0, ymm0, ymm4
- vpsrlw ymm1, ymm1, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutates
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg xloop
- jmp xloop99
- // Blend 50 / 50.
- xloop50:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg xloop50
- jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
- xloop100:
- rep movsb
- xloop99:
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_INTERPOLATEROW_AVX2
- // Bilinear filter 16x2 -> 16x1
- // TODO(fbarchard): Consider allowing 256 using memcpy.
- __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 /256. Blend 100 / 0.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- movd xmm0, eax // high fraction 0..255
- neg eax
- add eax, 256
- movd xmm5, eax // low fraction 255..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm4, eax
- pshufd xmm4, xmm4, 0x00
- xloop:
- movdqu xmm0, [esi]
- movdqu xmm2, [esi + edx]
- movdqu xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- psubb xmm0, xmm4 // bias image by -128
- psubb xmm1, xmm4
- movdqa xmm2, xmm5
- movdqa xmm3, xmm5
- pmaddubsw xmm2, xmm0
- pmaddubsw xmm3, xmm1
- paddw xmm2, xmm4
- paddw xmm3, xmm4
- psrlw xmm2, 8
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [esi + edi], xmm2
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop
- jmp xloop99
- // Blend 50 / 50.
- xloop50:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop50
- jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
- xloop100:
- movdqu xmm0, [esi]
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop100
- xloop99:
- pop edi
- pop esi
- ret
- }
- }
- // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
- __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqu xmm5, [ecx]
- mov ecx, [esp + 16] // width
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg wloop
- ret
- }
- }
- #ifdef HAS_ARGBSHUFFLEROW_AVX2
- __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
- mov ecx, [esp + 16] // width
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpshufb ymm0, ymm0, ymm5
- vpshufb ymm1, ymm1, ymm5
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg wloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBSHUFFLEROW_AVX2
- // YUY2 - Macro-pixel = 2 image pixels
- // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
- // UYVY - Macro-pixel = 2 image pixels
- // U0Y0V0Y1
- __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2 // YUYV
- punpckhbw xmm1, xmm2
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm1
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- movdqa xmm1, xmm2
- lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
- punpckhbw xmm2, xmm0
- movdqu [edi], xmm1
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
- __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* src_argb */
- mov edx, [esp + 4 + 8] /* dst_argb */
- mov esi, [esp + 4 + 12] /* poly */
- mov ecx, [esp + 4 + 16] /* width */
- pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
- // 2 pixel loop.
- convertloop:
- // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
- // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
- movq xmm0, qword ptr [eax] // BGRABGRA
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm3
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm3 // pixel 0
- punpckhwd xmm4, xmm3 // pixel 1
- cvtdq2ps xmm0, xmm0 // 4 floats
- cvtdq2ps xmm4, xmm4
- movdqa xmm1, xmm0 // X
- movdqa xmm5, xmm4
- mulps xmm0, [esi + 16] // C1 * X
- mulps xmm4, [esi + 16]
- addps xmm0, [esi] // result = C0 + C1 * X
- addps xmm4, [esi]
- movdqa xmm2, xmm1
- movdqa xmm6, xmm5
- mulps xmm2, xmm1 // X * X
- mulps xmm6, xmm5
- mulps xmm1, xmm2 // X * X * X
- mulps xmm5, xmm6
- mulps xmm2, [esi + 32] // C2 * X * X
- mulps xmm6, [esi + 32]
- mulps xmm1, [esi + 48] // C3 * X * X * X
- mulps xmm5, [esi + 48]
- addps xmm0, xmm2 // result += C2 * X * X
- addps xmm4, xmm6
- addps xmm0, xmm1 // result += C3 * X * X * X
- addps xmm4, xmm5
- cvttps2dq xmm0, xmm0
- cvttps2dq xmm4, xmm4
- packuswb xmm0, xmm4
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 2
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBPOLYNOMIALROW_SSE2
- #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
- __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* poly */
- vbroadcastf128 ymm4, [ecx] // C0
- vbroadcastf128 ymm5, [ecx + 16] // C1
- vbroadcastf128 ymm6, [ecx + 32] // C2
- vbroadcastf128 ymm7, [ecx + 48] // C3
- mov ecx, [esp + 16] /* width */
- // 2 pixel loop.
- convertloop:
- vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
- lea eax, [eax + 8]
- vcvtdq2ps ymm0, ymm0 // X 8 floats
- vmulps ymm2, ymm0, ymm0 // X * X
- vmulps ymm3, ymm0, ymm7 // C3 * X
- vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
- vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
- vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
- vcvttps2dq ymm0, ymm0
- vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
- vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
- vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
- vmovq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 2
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBPOLYNOMIALROW_AVX2
- #ifdef HAS_HALFFLOATROW_SSE2
- static float kExpBias = 1.9259299444e-34f;
- __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
- movd xmm4, dword ptr [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
- mulss xmm4, kExpBias
- pshufd xmm4, xmm4, 0
- pxor xmm5, xmm5
- sub edx, eax
- // 8 pixel loop.
- convertloop:
- movdqu xmm2, xmmword ptr [eax] // 8 shorts
- add eax, 16
- movdqa xmm3, xmm2
- punpcklwd xmm2, xmm5
- cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
- punpckhwd xmm3, xmm5
- cvtdq2ps xmm3, xmm3
- mulps xmm2, xmm4
- mulps xmm3, xmm4
- psrld xmm2, 13
- psrld xmm3, 13
- packssdw xmm2, xmm3
- movdqu [eax + edx - 16], xmm2
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_HALFFLOATROW_SSE2
- #ifdef HAS_HALFFLOATROW_AVX2
- __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
- movd xmm4, dword ptr [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
- vmulss xmm4, xmm4, kExpBias
- vbroadcastss ymm4, xmm4
- vpxor ymm5, ymm5, ymm5
- sub edx, eax
- // 16 pixel loop.
- convertloop:
- vmovdqu ymm2, [eax] // 16 shorts
- add eax, 32
- vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
- vpunpcklwd ymm2, ymm2, ymm5
- vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
- vcvtdq2ps ymm2, ymm2
- vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
- vmulps ymm2, ymm2, ymm4
- vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
- vpsrld ymm2, ymm2, 13
- vpackssdw ymm2, ymm2, ymm3
- vmovdqu [eax + edx - 32], ymm2
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_HALFFLOATROW_AVX2
- #ifdef HAS_HALFFLOATROW_F16C
- __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
- vbroadcastss ymm4, [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
- sub edx, eax
- // 16 pixel loop.
- convertloop:
- vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
- vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
- add eax, 32
- vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
- vcvtdq2ps ymm3, ymm3
- vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
- vmulps ymm3, ymm3, ymm4
- vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
- vcvtps2ph xmm3, ymm3, 3
- vmovdqu [eax + edx + 32], xmm2
- vmovdqu [eax + edx + 32 + 16], xmm3
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_HALFFLOATROW_F16C
- #ifdef HAS_ARGBCOLORTABLEROW_X86
- // Tranform ARGB pixels with color table.
- __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
- // 1 pixel loop.
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- movzx edx, byte ptr [eax - 4 + 3]
- movzx edx, byte ptr [esi + edx * 4 + 3]
- mov byte ptr [eax - 4 + 3], dl
- dec ecx
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBCOLORTABLEROW_X86
- #ifdef HAS_RGBCOLORTABLEROW_X86
- // Tranform RGB pixels with color table.
- __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
- // 1 pixel loop.
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- dec ecx
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_RGBCOLORTABLEROW_X86
- #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
- // Tranform RGB pixels with luma table.
- __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- const uint8_t* luma,
- uint32_t lumacoeff) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] /* src_argb */
- mov edi, [esp + 8 + 8] /* dst_argb */
- mov ecx, [esp + 8 + 12] /* width */
- movd xmm2, dword ptr [esp + 8 + 16] // luma table
- movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
- pshufd xmm2, xmm2, 0
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
- psllw xmm4, 8
- pxor xmm5, xmm5
- // 4 pixel loop.
- convertloop:
- movdqu xmm0, xmmword ptr [eax] // generate luma ptr
- pmaddubsw xmm0, xmm3
- phaddw xmm0, xmm0
- pand xmm0, xmm4 // mask out low bits
- punpcklwd xmm0, xmm5
- paddd xmm0, xmm2 // add table base
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
- movzx edx, byte ptr [eax]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi], dl
- movzx edx, byte ptr [eax + 1]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 1], dl
- movzx edx, byte ptr [eax + 2]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 2], dl
- movzx edx, byte ptr [eax + 3] // copy alpha.
- mov byte ptr [edi + 3], dl
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
- movzx edx, byte ptr [eax + 4]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 4], dl
- movzx edx, byte ptr [eax + 5]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 5], dl
- movzx edx, byte ptr [eax + 6]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 6], dl
- movzx edx, byte ptr [eax + 7] // copy alpha.
- mov byte ptr [edi + 7], dl
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
- movzx edx, byte ptr [eax + 8]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 8], dl
- movzx edx, byte ptr [eax + 9]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 9], dl
- movzx edx, byte ptr [eax + 10]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 10], dl
- movzx edx, byte ptr [eax + 11] // copy alpha.
- mov byte ptr [edi + 11], dl
- movd esi, xmm0
- movzx edx, byte ptr [eax + 12]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 12], dl
- movzx edx, byte ptr [eax + 13]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 13], dl
- movzx edx, byte ptr [eax + 14]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 14], dl
- movzx edx, byte ptr [eax + 15] // copy alpha.
- mov byte ptr [edi + 15], dl
- lea eax, [eax + 16]
- lea edi, [edi + 16]
- sub ecx, 4
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
- #endif // defined(_M_X64)
- #ifdef __cplusplus
- } // extern "C"
- } // namespace libyuv
- #endif
- #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
|