- /*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
- #include "libyuv/row.h"
- // This module is for Visual C 32/64 bit and clangcl 32 bit
- #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
- #if defined(_M_X64)
- #include <emmintrin.h>
- #include <tmmintrin.h> // For _mm_maddubs_epi16
- #endif
- #ifdef __cplusplus
- namespace libyuv {
- extern "C" {
- #endif
- // 64 bit
- #if defined(_M_X64)
- // Read 4 UV from 422, upsample to 8 UV.
- #define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8;
- // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
- #define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
- a_buf += 8;
- // Convert 8 pixels: 8 UV and 8 Y.
- #define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
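- // In scalar form the macro above computes, per channel (a sketch for
- // illustration; ub, vb, bias_b and yg stand in for values held in the
- // YuvConstants tables and are not names used elsewhere in this file):
- static __inline uint8 ScalarYuvToB(uint8 y, uint8 u, uint8 v,
-                                    int ub, int vb, int bias_b, int yg) {
-   int y1 = (y * 0x0101 * yg) >> 16;  // kYToRgb scaling, as pmulhuw does.
-   int b = (bias_b - (u * ub + v * vb) + y1) >> 6;  // drop 6 fraction bits.
-   return (uint8)(b < 0 ? 0 : (b > 255 ? 255 : b));  // saturate, as packuswb.
- }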
- // Store 8 ARGB values.
- #define STOREARGB \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
- _mm_storeu_si128((__m128i *)dst_argb, xmm0); \
- _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
- dst_argb += 32;
- #if defined(HAS_I422TOARGBROW_SSSE3)
- void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
- while (width > 0) {
- READYUV422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
- }
- #endif
- #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
- void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
- while (width > 0) {
- READYUVA422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
- }
- #endif
- // 32 bit
- #else // defined(_M_X64)
- #ifdef HAS_ARGBTOYROW_SSSE3
- // Constants for ARGB.
- static const vec8 kARGBToY = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
- };
- // JPEG full range.
- static const vec8 kARGBToYJ = {
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
- };
- static const vec8 kARGBToU = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
- };
- static const vec8 kARGBToUJ = {
- 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
- };
- static const vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
- };
- static const vec8 kARGBToVJ = {
- -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
- };
- // vpshufb mask for use with vphaddw + vpackuswb on packed shorts.
- static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
- };
- // Constants for BGRA.
- static const vec8 kBGRAToY = {
- 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
- };
- static const vec8 kBGRAToU = {
- 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
- };
- static const vec8 kBGRAToV = {
- 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
- };
- // Constants for ABGR.
- static const vec8 kABGRToY = {
- 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
- };
- static const vec8 kABGRToU = {
- -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
- };
- static const vec8 kABGRToV = {
- 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
- };
- // Constants for RGBA.
- static const vec8 kRGBAToY = {
- 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
- };
- static const vec8 kRGBAToU = {
- 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
- };
- static const vec8 kRGBAToV = {
- 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
- };
- static const uvec8 kAddY16 = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
- };
- // 7 bit fixed point 0.5.
- static const vec16 kAddYJ64 = {
- 64, 64, 64, 64, 64, 64, 64, 64
- };
- static const uvec8 kAddUV128 = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
- };
- static const uvec16 kAddUVJ128 = {
- 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
- };
- // Shuffle table for converting RGB24 to ARGB.
- static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
- };
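- // Each mask byte is a source index for pshufb: the first ARGB pixel takes
- // RGB24 bytes {0,1,2}, and the byte-12 alpha slot is then forced opaque by
- // the por with 0xff000000 in the row function. A scalar sketch of one
- // pixel, for illustration (the helper name is not part of libyuv):
- static __inline void ScalarRGB24ToARGBPixel(const uint8* src, uint8* dst) {
-   dst[0] = src[0];  // B
-   dst[1] = src[1];  // G
-   dst[2] = src[2];  // R
-   dst[3] = 255u;    // A, made opaque.
- }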
- // Shuffle table for converting RAW to ARGB.
- static const uvec8 kShuffleMaskRAWToARGB = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
- };
- // Shuffle table for converting RAW to RGB24. First 8.
- static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
- };
- // Shuffle table for converting RAW to RGB24. Middle 8.
- static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
- };
- // Shuffle table for converting RAW to RGB24. Last 8.
- static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
- };
- // Shuffle table for converting ARGB to RGB24.
- static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
- };
- // Shuffle table for converting ARGB to RAW.
- static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
- };
- // Shuffle table for converting ARGB to RGB24 in I422ToRGB24. First 8 + next 4.
- static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
- };
- // YUY2 shuf 16 Y to 32 Y.
- static const lvec8 kShuffleYUY2Y = {
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
- };
- // YUY2 shuf 8 UV to 16 UV.
- static const lvec8 kShuffleYUY2UV = {
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
- };
- // UYVY shuf 16 Y to 32 Y.
- static const lvec8 kShuffleUYVYY = {
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
- };
- // UYVY shuf 8 UV to 16 UV.
- static const lvec8 kShuffleUYVYUV = {
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
- };
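- // For reference, the packed 4:2:2 byte orders these tables assume: YUY2 is
- // {Y0, U, Y1, V} and UYVY is {U, Y0, V, Y1}, so each UV pair is shared by
- // two Y samples. A scalar sketch of pulling UV out of one YUY2 pair, for
- // illustration (helper name not part of libyuv):
- static __inline void ScalarYUY2ToUV(const uint8* src_yuy2,
-                                     uint8* u, uint8* v) {
-   *u = src_yuy2[1];
-   *v = src_yuy2[3];
- }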
- // NV21 shuf 8 VU to 16 UV.
- static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- };
- // Duplicates the gray value three times and fills in an opaque alpha.
- __declspec(naked)
- void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
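- // A scalar sketch of the row above, for illustration: each gray byte is
- // copied to B, G and R, and alpha comes from the 0xff000000 mask.
- static __inline void ScalarJ400ToARGBPixel(uint8 y, uint8* dst_argb) {
-   dst_argb[0] = y;     // B
-   dst_argb[1] = y;     // G
-   dst_argb[2] = y;     // R
-   dst_argb[3] = 255u;  // A
- }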
- #ifdef HAS_J400TOARGBROW_AVX2
- // Duplicates the gray value three times and fills in an opaque alpha.
- __declspec(naked)
- void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
- convertloop:
- vmovdqu xmm0, [eax]
- lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8
- vpunpcklbw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- vpunpckhwd ymm1, ymm0, ymm0
- vpunpcklwd ymm0, ymm0, ymm0
- vpor ymm0, ymm0, ymm5
- vpor ymm1, ymm1, ymm5
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_J400TOARGBROW_AVX2
- __declspec(naked)
- void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_rgb24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqu [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqu [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqu [edx + 16], xmm1
- por xmm3, xmm5
- movdqu [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqu [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqu [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqu [edx + 16], xmm1
- por xmm3, xmm5
- movdqu [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_rgb24
- mov ecx, [esp + 12] // width
- movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
- movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 4]
- movdqu xmm2, [eax + 8]
- lea eax, [eax + 24]
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- // pmul method to replicate bits.
- // Math to replicate bits:
- // (v << 8) | (v << 3)
- // v * 256 + v * 8
- // v * (256 + 8)
- // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
- // 20 instructions.
- __declspec(naked)
- void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
- psllw xmm4, 10
- psrlw xmm4, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgr565
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- pand xmm1, xmm3 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pmulhuw xmm1, xmm5 // * (256 + 8)
- pmulhuw xmm2, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- pand xmm0, xmm4 // G in middle 6 bits
- pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
- por xmm0, xmm7 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
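- // A scalar sketch of the bit replication above, for illustration: a 5 bit
- // channel v widens to 8 bits as (v << 3) | (v >> 2), which is what the
- // pmulhuw by (256 + 8) computes once v sits in the top 5 bits of a word.
- static __inline uint8 Replicate5To8(uint8 v5) {
-   return (uint8)((v5 << 3) | (v5 >> 2));  // e.g. 31 -> 255, 0 -> 0.
- }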
- #ifdef HAS_RGB565TOARGBROW_AVX2
- // pmul method to replicate bits.
- // Math to replicate bits:
- // (v << 8) | (v << 3)
- // v * 256 + v * 8
- // v * (256 + 8)
- // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
- __declspec(naked)
- void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- vmovd xmm5, eax
- vbroadcastss ymm5, xmm5
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
- vpsllw ymm3, ymm3, 11
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
- vpsllw ymm4, ymm4, 10
- vpsrlw ymm4, ymm4, 5
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
- vpsllw ymm7, ymm7, 8
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
- vpand ymm1, ymm0, ymm3 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpand ymm0, ymm0, ymm4 // G in middle 6 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
- vpor ymm0, ymm0, ymm7 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm1, ymm1, 0xd8
- vpunpckhbw ymm2, ymm1, ymm0
- vpunpcklbw ymm1, ymm1, ymm0
- vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_RGB565TOARGBROW_AVX2
- #ifdef HAS_ARGB1555TOARGBROW_AVX2
- __declspec(naked)
- void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- vmovd xmm5, eax
- vbroadcastss ymm5, xmm5
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
- vpsllw ymm3, ymm3, 11
- vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
- vpsllw ymm7, ymm7, 8
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
- vpsllw ymm1, ymm0, 1 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpand ymm1, ymm1, ymm3
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpsraw ymm2, ymm0, 8 // A
- vpand ymm0, ymm0, ymm4 // G in middle 5 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
- vpand ymm2, ymm2, ymm7
- vpor ymm0, ymm0, ymm2 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm1, ymm1, 0xd8
- vpunpckhbw ymm2, ymm1, ymm0
- vpunpcklbw ymm1, ymm1, ymm0
- vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGB1555TOARGBROW_AVX2
- #ifdef HAS_ARGB4444TOARGBROW_AVX2
- __declspec(naked)
- void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
- vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
- vpand ymm2, ymm0, ymm5 // mask high nibbles
- vpand ymm0, ymm0, ymm4 // mask low nibbles
- vpsrlw ymm3, ymm2, 4
- vpsllw ymm1, ymm0, 4
- vpor ymm2, ymm2, ymm3
- vpor ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm2, ymm2, 0xd8
- vpunpckhbw ymm1, ymm0, ymm2
- vpunpcklbw ymm0, ymm0, ymm2
- vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGB4444TOARGBROW_AVX2
- // 24 instructions
- __declspec(naked)
- void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
- psrlw xmm4, 6
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of 1555
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- psllw xmm1, 1 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pand xmm1, xmm3
- pmulhuw xmm2, xmm5 // * (256 + 8)
- pmulhuw xmm1, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // G in middle 5 bits
- psraw xmm2, 8 // A
- pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
- pand xmm2, xmm7
- por xmm0, xmm2 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
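- // The alpha path above arithmetic-shifts bit 15 across the byte (psraw 8
- // plus the 0xff00 mask), so a 1 bit alpha expands to 0x00 or 0xff. Scalar
- // sketch, for illustration:
- static __inline uint8 Replicate1To8(uint16 pixel1555) {
-   return (uint8)((pixel1555 & 0x8000) ? 255 : 0);
- }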
- // 18 instructions.
- __declspec(naked)
- void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- movd xmm4, eax
- pshufd xmm4, xmm4, 0
- movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
- pslld xmm5, 4
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // mask low nibbles
- pand xmm2, xmm5 // mask high nibbles
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- psllw xmm1, 4
- psrlw xmm3, 4
- por xmm0, xmm1
- por xmm2, xmm3
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
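- // A scalar sketch of the nibble widening above, for illustration: a 4 bit
- // channel n becomes (n << 4) | n, computed here by the shift and por pairs.
- static __inline uint8 Replicate4To8(uint8 n4) {
-   return (uint8)((n4 << 4) | n4);  // e.g. 0xf -> 0xff, 0x8 -> 0x88.
- }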
- __declspec(naked)
- void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
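- // A scalar sketch of the conversion above, for illustration: each 4 byte
- // ARGB pixel simply drops its alpha byte to become 3 byte RGB24.
- static __inline void ScalarARGBToRGB24Pixel(const uint8* src_argb,
-                                             uint8* dst_rgb24) {
-   dst_rgb24[0] = src_argb[0];  // B
-   dst_rgb24[1] = src_argb[1];  // G
-   dst_rgb24[2] = src_argb[2];  // R
- }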
- __declspec(naked)
- void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
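- // A scalar sketch of the 565 pack above, for illustration: keep the top
- // 5/6/5 bits of B/G/R and pack them into one 16 bit pixel.
- static __inline uint16 ScalarARGBToRGB565(uint8 b, uint8 g, uint8 r) {
-   return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
- }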
- __declspec(naked)
- void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- movd xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- punpcklbw xmm6, xmm6 // make dither 16 bytes
- movdqa xmm7, xmm6
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- paddusb xmm0, xmm6 // add dither
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
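- // Scalar sketch of the dithered pack, for illustration: one dither byte per
- // pixel column is added with unsigned saturation (paddusb) before the low
- // bits are truncated, trading banding for noise.
- static __inline uint16 ScalarDither565(uint8 b, uint8 g, uint8 r, uint8 d) {
-   int b1 = b + d > 255 ? 255 : b + d;
-   int g1 = g + d > 255 ? 255 : g + d;
-   int r1 = r + d > 255 ? 255 : r + d;
-   return (uint16)((b1 >> 3) | ((g1 >> 2) << 5) | ((r1 >> 3) << 11));
- }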
- #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
- __declspec(naked)
- void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- vbroadcastss xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
- vpermq ymm6, ymm6, 0xd8
- vpunpcklwd ymm6, ymm6, ymm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
- vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
- vpsrld ymm4, ymm4, 26
- vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpaddusb ymm0, ymm0, ymm6 // add dither
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
- vpackusdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTORGB565DITHERROW_AVX2
- // TODO(fbarchard): Improve sign extension/packing.
- __declspec(naked)
- void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
- psrld xmm4, 27
- movdqa xmm5, xmm4 // generate mask 0x000003e0
- pslld xmm5, 5
- movdqa xmm6, xmm4 // generate mask 0x00007c00
- pslld xmm6, 10
- pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
- pslld xmm7, 15
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- movdqa xmm3, xmm0 // R
- psrad xmm0, 16 // A
- psrld xmm1, 3 // B
- psrld xmm2, 6 // G
- psrld xmm3, 9 // R
- pand xmm0, xmm7 // A
- pand xmm1, xmm4 // B
- pand xmm2, xmm5 // G
- pand xmm3, xmm6 // R
- por xmm0, xmm1 // BA
- por xmm2, xmm3 // GR
- por xmm0, xmm2 // BGRA
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
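- // A scalar sketch of the 1555 pack above, for illustration: 5 bits each of
- // B/G/R plus the top bit of alpha.
- static __inline uint16 ScalarARGBToARGB1555(uint8 b, uint8 g, uint8 r,
-                                             uint8 a) {
-   return (uint16)((b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
-                   ((a >> 7) << 15));
- }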
- __declspec(naked)
- void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
- psllw xmm4, 12
- movdqa xmm3, xmm4 // generate mask 0x00f000f0
- psrlw xmm3, 8
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0
- pand xmm0, xmm3 // low nibble
- pand xmm1, xmm4 // high nibble
- psrld xmm0, 4
- psrld xmm1, 8
- por xmm0, xmm1
- packuswb xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
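- // A scalar sketch of the 4444 pack above, for illustration: the high
- // nibble of each channel survives.
- static __inline uint16 ScalarARGBToARGB4444(uint8 b, uint8 g, uint8 r,
-                                             uint8 a) {
-   return (uint16)((b >> 4) | (g & 0xf0) | ((r >> 4) << 8) |
-                   ((a & 0xf0) << 8));
- }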
- #ifdef HAS_ARGBTORGB565ROW_AVX2
- __declspec(naked)
- void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
- vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
- vpsrld ymm4, ymm4, 26
- vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
- vpackusdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTORGB565ROW_AVX2
- #ifdef HAS_ARGBTOARGB1555ROW_AVX2
- __declspec(naked)
- void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4
- vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
- vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
- vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
- vpslld ymm7, ymm7, 15
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm3, ymm0, 9 // R
- vpsrld ymm2, ymm0, 6 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrad ymm0, ymm0, 16 // A
- vpand ymm3, ymm3, ymm6 // R
- vpand ymm2, ymm2, ymm5 // G
- vpand ymm1, ymm1, ymm4 // B
- vpand ymm0, ymm0, ymm7 // A
- vpor ymm0, ymm0, ymm1 // BA
- vpor ymm2, ymm2, ymm3 // GR
- vpor ymm0, ymm0, ymm2 // BGRA
- vpackssdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOARGB1555ROW_AVX2
- #ifdef HAS_ARGBTOARGB4444ROW_AVX2
- __declspec(naked)
- void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
- vpsllw ymm4, ymm4, 12
- vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpand ymm1, ymm0, ymm4 // high nibble
- vpand ymm0, ymm0, ymm3 // low nibble
- vpsrld ymm1, ymm1, 8
- vpsrld ymm0, ymm0, 4
- vpor ymm0, ymm0, ymm1
- vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOARGB4444ROW_AVX2
- // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
- __declspec(naked)
- void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
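- // A scalar sketch of the row above, for illustration, using the kARGBToY
- // coefficients (7 bit fraction) and the +16 video range offset:
- static __inline uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
-   return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
- }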
- // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
- // Same as ARGBToYRow but with different coefficients: no +16 offset, and rounding (+64) is applied before the shift.
- __declspec(naked)
- void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5 // Add .5 for rounding.
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
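- // Editor's note: scalar sketch of one output byte of the two kernels
- // above, assuming the usual libyuv coefficients (kARGBToY = {13, 65, 33}
- // and kARGBToYJ = {15, 75, 38} for B, G, R; verify against row_common.cc).
- // The *_Sketch names are hypothetical.
- static uint8 RGBToY_Sketch(uint8 r, uint8 g, uint8 b) {
-   return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);  // truncate; +16 bias
- }
- static uint8 RGBToYJ_Sketch(uint8 r, uint8 g, uint8 b) {
-   return (uint8)((38 * r + 75 * g + 15 * b + 64) >> 7);  // +64 rounds; full range
- }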
- #ifdef HAS_ARGBTOYROW_AVX2
- // vpermd permutation to undo the lane interleaving caused by vphaddw +
- // vpackuswb.
- static const lvec32 kPermdARGBToY_AVX = {
- 0, 4, 1, 5, 2, 6, 3, 7
- };
- // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
- __declspec(naked)
- void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToY
- vbroadcastf128 ymm5, xmmword ptr kAddY16
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vpaddb ymm0, ymm0, ymm5 // add 16 for Y
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOYROW_AVX2
- #ifdef HAS_ARGBTOYJROW_AVX2
- // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
- __declspec(naked)
- void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
- vbroadcastf128 ymm5, xmmword ptr kAddYJ64
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
- vpaddw ymm2, ymm2, ymm5
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOYJROW_AVX2
- __declspec(naked)
- void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kBGRAToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kABGRToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kRGBAToY
- movdqa xmm5, xmmword ptr kAddY16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
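- // Editor's note: scalar sketch of one U/V output of the kernel above. The
- // asm first pavgb-averages a 2x2 block, multiplies with the coefficient
- // table, truncates with psraw 8 and adds the 0x80 bias afterward (the J
- // variants instead add a rounding constant before the shift). Coefficients
- // here assume kARGBToU ~ {112, -74, -38} and kARGBToV ~ {-18, -94, 112}
- // for B, G, R; the *_Sketch names are hypothetical.
- static uint8 RGBToU_Sketch(uint8 r, uint8 g, uint8 b) {
-   return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // arithmetic shift
- }
- static uint8 RGBToV_Sketch(uint8 r, uint8 g, uint8 b) {
-   return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
- }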
- __declspec(naked)
- void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
- movdqa xmm6, xmmword ptr kARGBToVJ
- movdqa xmm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #ifdef HAS_ARGBTOUVROW_AVX2
- __declspec(naked)
- void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, it's 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- vpaddb ymm0, ymm0, ymm5 // -> unsigned
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOUVROW_AVX2
- #ifdef HAS_ARGBTOUVJROW_AVX2
- __declspec(naked)
- void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
- vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
- vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, it's 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
- vpaddw ymm0, ymm0, ymm5
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBTOUVJROW_AVX2
- __declspec(naked)
- void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
- convertloop:
- /* convert to U and V */
- movdqu xmm0, [eax] // U
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- movdqu xmm0, [eax] // V
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqu [edx + edi], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- __declspec(naked)
- void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kBGRAToV
- movdqa xmm7, xmmword ptr kBGRAToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked)
- void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kABGRToV
- movdqa xmm7, xmmword ptr kABGRToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked)
- void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kRGBAToV
- movdqa xmm7, xmmword ptr kRGBAToU
- sub edi, edx // stride from u to v
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBTOYROW_SSSE3
- // Read 16 UV from 444
- #define READYUV444_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
- // Read 8 UV from 422, upsample to 16 UV.
- #define READYUV422_AVX2 __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
- // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
- #define READYUVA422_AVX2 __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
- __asm vpermq ymm5, ymm5, 0xd8 \
- __asm lea ebp, [ebp + 16] \
- }
- // Read 4 UV from 411, upsample to 16 UV.
- #define READYUV411_AVX2 __asm { \
- __asm vmovd xmm0, dword ptr [esi] /* U */ \
- __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
- // Read 8 UV from NV12, upsample to 16 UV.
- #define READNV12_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
- // Read 8 UV from NV21, upsample to 16 UV.
- #define READNV21_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
- // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
- #define READYUY2_AVX2 __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 32] \
- }
- // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
- #define READUYVY_AVX2 __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 32] \
- }
- // Convert 16 pixels: 16 UV and 16 Y.
- #define YUVTORGB_AVX2(YuvConstants) __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 \
- /* Step 2: Find Y contribution to 16 R,G,B values */ \
- __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
- __asm vpsraw ymm0, ymm0, 6 \
- __asm vpsraw ymm1, ymm1, 6 \
- __asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
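- // Editor's note: scalar sketch of the fixed-point math in YUVTORGB_AVX2
- // for one pixel, mirroring libyuv's BT.601 reference (YuvPixel in
- // row_common.cc). The asm loads equivalent constants from YuvConstants;
- // the values below are the I601 set and should be treated as assumptions.
- static uint8 Clamp255_Sketch(int v) {
-   return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
- }
- static void YuvPixel_Sketch(uint8 y, uint8 u, uint8 v,
-                             uint8* b, uint8* g, uint8* r) {
-   const int YG = 18997;   // round(1.164 * 64 * 256 * 256 / 257)
-   const int YGB = -1160;  // 1.164 * 64 * -16 + 64 / 2
-   const int UB = -128, UG = 25, VG = 52, VR = -102;  // round(coef * 64)
-   const int BB = UB * 128 + YGB;
-   const int BG = UG * 128 + VG * 128 + YGB;
-   const int BR = VR * 128 + YGB;
-   int y1 = (int)(((uint32)(y * 0x0101) * YG) >> 16);  // the pmulhuw step
-   *b = Clamp255_Sketch((-(u * UB) + y1 + BB) >> 6);   // bias - UV, + Y, >> 6
-   *g = Clamp255_Sketch((-(u * UG + v * VG) + y1 + BG) >> 6);
-   *r = Clamp255_Sketch((-(v * VR) + y1 + BR) >> 6);
- }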
- // Store 16 ARGB values.
- #define STOREARGB_AVX2 __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
- __asm vmovdqu 0[edx], ymm1 \
- __asm vmovdqu 32[edx], ymm0 \
- __asm lea edx, [edx + 64] \
- }
- // Store 16 RGBA values.
- #define STORERGBA_AVX2 __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
- __asm vmovdqu [edx], ymm0 \
- __asm vmovdqu [edx + 32], ymm1 \
- __asm lea edx, [edx + 64] \
- }
- #ifdef HAS_I422TOARGBROW_AVX2
- // 16 pixels
- // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked)
- void I422ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I422TOARGBROW_AVX2
- #ifdef HAS_I422ALPHATOARGBROW_AVX2
- // 16 pixels
- // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
- __declspec(naked)
- void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
- convertloop:
- READYUVA422_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebp
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I422ALPHATOARGBROW_AVX2
- #ifdef HAS_I444TOARGBROW_AVX2
- // 16 pixels
- // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked)
- void I444ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV444_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I444TOARGBROW_AVX2
- #ifdef HAS_I411TOARGBROW_AVX2
- // 16 pixels
- // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked)
- void I411ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV411_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I411TOARGBROW_AVX2
- #ifdef HAS_NV12TOARGBROW_AVX2
- // 16 pixels.
- // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked)
- void NV12ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READNV12_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_NV12TOARGBROW_AVX2
- #ifdef HAS_NV21TOARGBROW_AVX2
- // 16 pixels.
- // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
- __declspec(naked)
- void NV21ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READNV21_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_NV21TOARGBROW_AVX2
- #ifdef HAS_YUY2TOARGBROW_AVX2
- // 16 pixels.
- // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
- __declspec(naked)
- void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUY2_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- vzeroupper
- ret
- }
- }
- #endif // HAS_YUY2TOARGBROW_AVX2
- #ifdef HAS_UYVYTOARGBROW_AVX2
- // 16 pixels.
- // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
- __declspec(naked)
- void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READUYVY_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- vzeroupper
- ret
- }
- }
- #endif // HAS_UYVYTOARGBROW_AVX2
- #ifdef HAS_I422TORGBAROW_AVX2
- // 16 pixels
- // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
- __declspec(naked)
- void I422ToRGBARow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // rgba
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STORERGBA_AVX2
- sub ecx, 16
- jg convertloop
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_I422TORGBAROW_AVX2
- #if defined(HAS_I422TOARGBROW_SSSE3)
- // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
- // Allows a conversion with half size scaling.
- // Read 8 UV from 444.
- #define READYUV444 __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
- // Read 4 UV from 422, upsample to 8 UV.
- #define READYUV422 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
- // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
- #define READYUVA422 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
- __asm lea ebp, [ebp + 8] \
- }
- // Read 2 UV from 411, upsample to 8 UV.
- // drmemory fails with memory fault if pinsrw used. libyuv bug: 525
- // __asm pinsrw xmm0, [esi], 0 /* U */
- // __asm pinsrw xmm1, [esi + edi], 0 /* V */
- #define READYUV411_EBX __asm { \
- __asm movzx ebx, word ptr [esi] /* U */ \
- __asm movd xmm0, ebx \
- __asm movzx ebx, word ptr [esi + edi] /* V */ \
- __asm movd xmm1, ebx \
- __asm lea esi, [esi + 2] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
- // Read 4 UV from NV12, upsample to 8 UV.
- #define READNV12 __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
- // Read 4 VU from NV21, upsample to 8 UV.
- #define READNV21 __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
- // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
- #define READYUY2 __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
- __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 16] \
- }
- // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
- #define READUYVY __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
- __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 16] \
- }
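- // Editor's note: the packed-4:2:2 layouts the READYUY2/READUYVY macros
- // unshuffle: YUY2 stores Y0,U0,Y1,V0 per pixel pair and UYVY stores
- // U0,Y0,V0,Y1. A scalar sketch for pixel pair i (hypothetical helper):
- static void ReadYuy2Pair_Sketch(const uint8* src_yuy2, int i, uint8* y0,
-                                 uint8* u, uint8* y1, uint8* v) {
-   const uint8* p = src_yuy2 + i * 4;
-   *y0 = p[0];  // kShuffleYUY2Y gathers the even (Y) bytes
-   *u = p[1];   // kShuffleYUY2UV gathers and duplicates the U/V bytes
-   *y1 = p[2];
-   *v = p[3];
- }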
- // Convert 8 pixels: 8 UV and 8 Y.
- #define YUVTORGB(YuvConstants) __asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
- __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
- // Store 8 ARGB values.
- #define STOREARGB __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
- __asm movdqu 0[edx], xmm0 \
- __asm movdqu 16[edx], xmm1 \
- __asm lea edx, [edx + 32] \
- }
- // Store 8 BGRA values.
- #define STOREBGRA __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
- __asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
- __asm movdqu 0[edx], xmm5 \
- __asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32] \
- }
- // Store 8 RGBA values.
- #define STORERGBA __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
- __asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
- __asm movdqu 0[edx], xmm5 \
- __asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32] \
- }
- // Store 8 RGB24 values.
- #define STORERGB24 __asm { \
- /* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
- /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
- __asm lea edx, [edx + 24] \
- }
- // Store 8 RGB565 values.
- #define STORERGB565 __asm { \
- /* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
- /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
- __asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
- __asm lea edx, [edx + 16] \
- }
- // 8 pixels.
- // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked)
- void I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUV444
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
- __declspec(naked)
- void I422ToRGB24Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // rgb24
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGB24
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
- __declspec(naked)
- void I422ToRGB565Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb565_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // rgb565
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
- psrld xmm5, 27
- pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
- psrld xmm6, 26
- pslld xmm6, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
- pslld xmm7, 11
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGB565
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked)
- void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
- __declspec(naked)
- void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
- convertloop:
- READYUVA422
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- // Similar to I420 but duplicate UV once more.
- __declspec(naked)
- void I411ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov edx, [esp + 16 + 16] // argb
- mov ebp, [esp + 16 + 20] // yuvconstants
- mov ecx, [esp + 16 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUV411_EBX
- YUVTORGB(ebp)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked)
- void NV12ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READNV12
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
- __declspec(naked)
- void NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READNV21
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- pop esi
- ret
- }
- }
- // 8 pixels.
- // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
- __declspec(naked)
- void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READYUY2
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- ret
- }
- }
- // 8 pixels.
- // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
- __declspec(naked)
- void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- convertloop:
- READUYVY
- YUVTORGB(ebx)
- STOREARGB
- sub ecx, 8
- jg convertloop
- pop ebx
- ret
- }
- }
- __declspec(naked)
- void I422ToRGBARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // rgba
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGBA
- sub ecx, 8
- jg convertloop
- pop ebx
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_I422TOARGBROW_SSSE3
- #ifdef HAS_I400TOARGBROW_SSE2
- // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
- __declspec(naked)
- void I400ToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
- movd xmm2, eax
- pshufd xmm2, xmm2,0
- mov eax, 0x04880488 // 0488 = 1160 = 1.164 * 64 * 16 - 64 / 2 (bias incl. rounding)
- movd xmm3, eax
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
- convertloop:
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0 // Y.Y
- pmulhuw xmm0, xmm2
- psubusw xmm0, xmm3
- psrlw xmm0, 6
- packuswb xmm0, xmm0 // G
- // Step 2: Weave into ARGB
- punpcklbw xmm0, xmm0 // GG
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0 // BGRA first 4 pixels
- punpckhwd xmm1, xmm1 // BGRA next 4 pixels
- por xmm0, xmm4
- por xmm1, xmm4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_I400TOARGBROW_SSE2
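- // Editor's note: scalar sketch of the luma expansion above for one pixel;
- // it matches the pmulhuw/psubusw/psrlw sequence. I400Pixel_Sketch is a
- // hypothetical name, and the output is one packed ARGB dword.
- static uint32 I400Pixel_Sketch(uint8 y) {
-   uint32 y16 = ((uint32)(y * 0x0101) * 18997) >> 16;  // pmulhuw by 0x4a35
-   uint32 g = (y16 < 1160 ? 0 : y16 - 1160) >> 6;      // psubusw + psrlw 6
-   if (g > 255) g = 255;                               // packuswb saturates
-   return 0xff000000u | (g << 16) | (g << 8) | g;      // weave G into B,G,R + alpha
- }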
- #ifdef HAS_I400TOARGBROW_AVX2
- // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
- // note: vpunpcklbw mutates the lane order and vpackuswb restores it.
- __declspec(naked)
- void I400ToARGBRow_AVX2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
- vmovd xmm2, eax
- vbroadcastss ymm2, xmm2
- mov eax, 0x04880488 // 0488 = 1160 = 1.164 * 64 * 16 - 64 / 2 (bias incl. rounding)
- vmovd xmm3, eax
- vbroadcastss ymm3, xmm3
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
- vpslld ymm4, ymm4, 24
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
- convertloop:
- // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
- vmovdqu xmm0, [eax]
- lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
- vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
- vpmulhuw ymm0, ymm0, ymm2
- vpsubusw ymm0, ymm0, ymm3
- vpsrlw ymm0, ymm0, 6
- vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
- // TODO(fbarchard): Weave alpha with unpack.
- // Step 2: Weave into ARGB
- vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
- vpermq ymm1, ymm1, 0xd8
- vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
- vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
- vpor ymm0, ymm0, ymm4
- vpor ymm1, ymm1, ymm4
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_I400TOARGBROW_AVX2
- #ifdef HAS_MIRRORROW_SSSE3
- // Shuffle table for reversing the bytes.
- static const uvec8 kShuffleMirror = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
- };
- // TODO(fbarchard): Replace lea with -16 offset.
- __declspec(naked)
- void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- movdqa xmm5, xmmword ptr kShuffleMirror
- convertloop:
- movdqu xmm0, [eax - 16 + ecx]
- pshufb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- #endif // HAS_MIRRORROW_SSSE3
- #ifdef HAS_MIRRORROW_AVX2
- __declspec(naked)
- void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
- convertloop:
- vmovdqu ymm0, [eax - 32 + ecx]
- vpshufb ymm0, ymm0, ymm5
- vpermq ymm0, ymm0, 0x4e // swap high and low halves
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_MIRRORROW_AVX2
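- // Editor's note: both mirror kernels compute, per byte, what this scalar
- // sketch does; the SIMD paths use a reversing pshufb table instead (plus,
- // for AVX2, a vpermq to swap the 128-bit lanes afterward). Hypothetical name.
- static void MirrorRow_Sketch(const uint8* src, uint8* dst, int width) {
-   int i;
-   for (i = 0; i < width; ++i) {
-     dst[i] = src[width - 1 - i];  // mirrors the [eax - 16 + ecx] addressing
-   }
- }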
- #ifdef HAS_MIRRORUVROW_SSSE3
- // Shuffle table for reversing the bytes of UV channels.
- static const uvec8 kShuffleMirrorUV = {
- 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
- };
- __declspec(naked)
- void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm1, xmmword ptr kShuffleMirrorUV
- lea eax, [eax + ecx * 2 - 16]
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm1
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [edx + edi], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_MIRRORUVROW_SSSE3
- #ifdef HAS_ARGBMIRRORROW_SSE2
- __declspec(naked)
- void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
- convertloop:
- movdqu xmm0, [eax]
- lea eax, [eax - 16]
- pshufd xmm0, xmm0, 0x1b
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBMIRRORROW_SSE2
- #ifdef HAS_ARGBMIRRORROW_AVX2
- // Shuffle table for reversing the bytes.
- static const ulvec32 kARGBShuffleMirror_AVX2 = {
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
- };
- __declspec(naked)
- void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
- convertloop:
- vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBMIRRORROW_AVX2
- #ifdef HAS_SPLITUVROW_SSE2
- __declspec(naked)
- void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [edx], xmm0
- movdqu [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_SPLITUVROW_SSE2
- #ifdef HAS_SPLITUVROW_AVX2
- __declspec(naked)
- void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm2, ymm0, 8 // odd bytes
- vpsrlw ymm3, ymm1, 8
- vpand ymm0, ymm0, ymm5 // even bytes
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpackuswb ymm2, ymm2, ymm3
- vpermq ymm0, ymm0, 0xd8
- vpermq ymm2, ymm2, 0xd8
- vmovdqu [edx], ymm0
- vmovdqu [edx + edi], ymm2
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- #endif // HAS_SPLITUVROW_AVX2
- #ifdef HAS_MERGEUVROW_SSE2
- __declspec(naked)
- void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
- convertloop:
- movdqu xmm0, [eax] // read 16 U's
- movdqu xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_MERGEUVROW_SSE2
- #ifdef HAS_MERGEUVROW_AVX2
- __declspec(naked)
- void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
- convertloop:
- vmovdqu ymm0, [eax] // read 32 U's
- vmovdqu ymm1, [eax + edx] // and 32 V's
- lea eax, [eax + 32]
- vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
- vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vextractf128 [edi], ymm2, 0 // bytes 0..15
- vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
- vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
- vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
- lea edi, [edi + 64]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- #endif // HAS_MERGEUVROW_AVX2
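- // Editor's note: scalar sketch of the SplitUV/MergeUV pair above. SplitUV
- // deinterleaves UVUV... into separate U and V planes (even/odd bytes);
- // MergeUV is the exact inverse. The vector kernels do the same with
- // pand/psrlw/packuswb and punpcklbw/punpckhbw. Names are hypothetical.
- static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u,
-                               uint8* dst_v, int width) {
-   int i;
-   for (i = 0; i < width; ++i) {
-     dst_u[i] = src_uv[2 * i];      // even bytes
-     dst_v[i] = src_uv[2 * i + 1];  // odd bytes
-   }
- }
- static void MergeUVRow_Sketch(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width) {
-   int i;
-   for (i = 0; i < width; ++i) {
-     dst_uv[2 * i] = src_u[i];
-     dst_uv[2 * i + 1] = src_v[i];
-   }
- }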
- #ifdef HAS_COPYROW_SSE2
- // CopyRow copies 'count' bytes using 16-byte loads/stores, 32 bytes at a time.
- __declspec(naked)
- void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- test eax, 15
- jne convertloopu
- test edx, 15
- jne convertloopu
- convertloopa:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopa
- ret
- convertloopu:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopu
- ret
- }
- }
- #endif // HAS_COPYROW_SSE2
- #ifdef HAS_COPYROW_AVX
- // CopyRow copies 'count' bytes using 32-byte loads/stores, 64 bytes at a time.
- __declspec(naked)
- void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 64
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_COPYROW_AVX
- // Handles any byte count (multiple of 1) via rep movsb.
- __declspec(naked)
- void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
- __asm {
- mov eax, esi // save esi in a caller-saved register
- mov edx, edi // save edi likewise
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- rep movsb
- mov edi, edx
- mov esi, eax
- ret
- }
- }
- #ifdef HAS_ARGBCOPYALPHAROW_SSE2
- // width in pixels
- __declspec(naked)
- void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
- convertloop:
- movdqu xmm2, [eax]
- movdqu xmm3, [eax + 16]
- lea eax, [eax + 32]
- movdqu xmm4, [edx]
- movdqu xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBCOPYALPHAROW_SSE2
- #ifdef HAS_ARGBCOPYALPHAROW_AVX2
- // width in pixels
- __declspec(naked)
- void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- convertloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + 32]
- lea eax, [eax + 64]
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBCOPYALPHAROW_AVX2
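- // Scalar sketch of the alpha copy above (illustrative only, name
- // hypothetical): replace dst's alpha byte with src's, leaving dst's BGR
- // bytes untouched.
- static void ARGBCopyAlphaRow_Sketch(const uint8* src, uint8* dst, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- dst[x * 4 + 3] = src[x * 4 + 3]; // byte 3 of each ARGB pixel is alpha
- }
- }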
- #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
- // width in pixels
- __declspec(naked)
- void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
- mov ecx, [esp + 12] // width
- extractloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm0, 24
- psrld xmm1, 24
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg extractloop
- ret
- }
- }
- #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
- #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
- // width in pixels
- __declspec(naked)
- void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
- convertloop:
- movq xmm2, qword ptr [eax] // 8 Y's
- lea eax, [eax + 8]
- punpcklbw xmm2, xmm2
- punpckhwd xmm3, xmm2 // low words of xmm3 are undefined; masked off below
- punpcklwd xmm2, xmm2
- movdqu xmm4, [edx]
- movdqu xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
- #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
- // width in pixels
- __declspec(naked)
- void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- convertloop:
- vpmovzxbd ymm1, qword ptr [eax]
- vpmovzxbd ymm2, qword ptr [eax + 8]
- lea eax, [eax + 16]
- vpslld ymm1, ymm1, 24
- vpslld ymm2, ymm2, 24
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
- #ifdef HAS_SETROW_X86
- // Write 'count' bytes using an 8 bit value repeated.
- // Count should be multiple of 4.
- __declspec(naked)
- void SetRow_X86(uint8* dst, uint8 v8, int count) {
- __asm {
- movzx eax, byte ptr [esp + 8] // v8
- mov edx, 0x01010101 // Duplicate byte to all bytes.
- mul edx // overwrites edx with upper part of result.
- mov edx, edi // stash edi (restored below)
- mov edi, [esp + 4] // dst
- mov ecx, [esp + 12] // count
- shr ecx, 2
- rep stosd
- mov edi, edx
- ret
- }
- }
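- // The movzx/mul pair above splats v8 across all 4 bytes of eax; an
- // illustrative C equivalent of that splat (helper name hypothetical):
- static uint32 SplatByteToDword_Sketch(uint8 v8) {
- return (uint32)v8 * 0x01010101u; // e.g. 0x5A -> 0x5A5A5A5A
- }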
- // Write 'count' bytes using an 8 bit value repeated.
- __declspec(naked)
- void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v8
- mov ecx, [esp + 12] // count
- rep stosb
- mov edi, edx
- ret
- }
- }
- // Write 'count' 32 bit values.
- __declspec(naked)
- void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
- mov ecx, [esp + 12] // count
- rep stosd
- mov edi, edx
- ret
- }
- }
- #endif // HAS_SETROW_X86
- #ifdef HAS_YUY2TOYROW_AVX2
- __declspec(naked)
- void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // even bytes are Y
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- __declspec(naked)
- void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- __declspec(naked)
- void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- __declspec(naked)
- void UYVYToYRow_AVX2(const uint8* src_uyvy,
- uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // odd bytes are Y
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
- }
- __declspec(naked)
- void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_uyvy
- mov esi, [esp + 8 + 8] // stride_uyvy
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- __declspec(naked)
- void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uyvy
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
- pop edi
- vzeroupper
- ret
- }
- }
- #endif // HAS_YUY2TOYROW_AVX2
- #ifdef HAS_YUY2TOYROW_SSE2
- __declspec(naked)
- void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked)
- void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- __declspec(naked)
- void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
- }
- __declspec(naked)
- void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_uyvy
- mov esi, [esp + 8 + 8] // stride_uyvy
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked)
- void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uyvy
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
- pop edi
- ret
- }
- }
- #endif // HAS_YUY2TOYROW_SSE2
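- // Packed 4:2:2 byte order handled above: YUY2 is Y0 U Y1 V, UYVY is U Y0 V Y1.
- // Illustrative scalar sketch of YUY2ToUVRow (name hypothetical): U and V are
- // averaged with the row below; pavgb rounds, hence the +1.
- static void YUY2ToUVRow_Sketch(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- int x;
- for (x = 0; x < width; x += 2) {
- *dst_u++ = (uint8)((src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
- *dst_v++ = (uint8)((src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
- src_yuy2 += 4; // 2 pixels per 4 bytes of YUY2
- }
- }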
- #ifdef HAS_BLENDPLANEROW_SSSE3
- // Blend 8 pixels at a time.
- // unsigned version of math
- // =((A2*C2)+(B2*(255-C2))+255)/256
- // signed version of math
- // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
- __declspec(naked)
- void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- __asm {
- push esi
- push edi
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm6, eax
- pshufd xmm6, xmm6, 0x00
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- movd xmm7, eax
- pshufd xmm7, xmm7, 0x00
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
- // 8 pixel loop.
- convertloop8:
- movq xmm0, qword ptr [esi] // alpha
- punpcklbw xmm0, xmm0
- pxor xmm0, xmm5 // a, 255-a
- movq xmm1, qword ptr [eax + esi] // src0
- movq xmm2, qword ptr [edx + esi] // src1
- punpcklbw xmm1, xmm2
- psubb xmm1, xmm6 // bias src0/1 - 128
- pmaddubsw xmm0, xmm1
- paddw xmm0, xmm7 // unbias result - 32768 and round.
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- movq qword ptr [edi + esi], xmm0
- lea esi, [esi + 8]
- sub ecx, 8
- jg convertloop8
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_BLENDPLANEROW_SSSE3
- #ifdef HAS_BLENDPLANEROW_AVX2
- // Blend 32 pixels at a time.
- // unsigned version of math
- // =((A2*C2)+(B2*(255-C2))+255)/256
- // signed version of math
- // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
- __declspec(naked)
- void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- __asm {
- push esi
- push edi
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
- vpsllw ymm5, ymm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- vmovd xmm7, eax
- vbroadcastss ymm7, xmm7
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
- // 32 pixel loop.
- convertloop32:
- vmovdqu ymm0, [esi] // alpha
- vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
- vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
- vpxor ymm3, ymm3, ymm5 // a, 255-a
- vpxor ymm0, ymm0, ymm5 // a, 255-a
- vmovdqu ymm1, [eax + esi] // src0
- vmovdqu ymm2, [edx + esi] // src1
- vpunpckhbw ymm4, ymm1, ymm2
- vpunpcklbw ymm1, ymm1, ymm2
- vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
- vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
- vpmaddubsw ymm3, ymm3, ymm4
- vpmaddubsw ymm0, ymm0, ymm1
- vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
- vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
- vpsrlw ymm3, ymm3, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm3
- vmovdqu [edi + esi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg convertloop32
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_BLENDPLANEROW_AVX2
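- // Scalar form of the unsigned blend math quoted in the comments above both
- // BlendPlaneRow versions; illustrative sketch only (name hypothetical).
- static void BlendPlaneRow_Sketch(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int a = alpha[x];
- dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
- }
- }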
- #ifdef HAS_ARGBBLENDROW_SSSE3
- // Shuffle table for isolating alpha.
- static const uvec8 kShuffleAlpha = {
- 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
- };
- // Blend 8 pixels at a time.
- __declspec(naked)
- void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 0x0001
- psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
- psrlw xmm6, 8
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- sub ecx, 4
- jl convertloop4b // less than 4 pixels?
- // 4 pixel loop.
- convertloop4:
- movdqu xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jge convertloop4
- convertloop4b:
- add ecx, 4 - 1
- jl convertloop1b
- // 1 pixel loop.
- convertloop1:
- movd xmm3, [eax] // src argb
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge convertloop1
- convertloop1b:
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBBLENDROW_SSSE3
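- // Illustrative scalar sketch of the "over" math in ARGBBlendRow_SSSE3:
- // dst = src0 + src1 * (256 - src0_alpha) / 256 per color channel, with the
- // result alpha forced to 255 (the por with 0xff000000 above).
- static void ARGBBlendRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- int x, b;
- for (x = 0; x < width; ++x) {
- int a = src_argb0[x * 4 + 3];
- for (b = 0; b < 3; ++b) {
- int v = src_argb0[x * 4 + b] +
- ((src_argb1[x * 4 + b] * (256 - a)) >> 8);
- dst_argb[x * 4 + b] = (uint8)(v > 255 ? 255 : v); // paddusb saturates
- }
- dst_argb[x * 4 + 3] = 255;
- }
- }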
- #ifdef HAS_ARGBATTENUATEROW_SSSE3
- // Shuffle table duplicating alpha.
- static const uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
- };
- static const uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
- };
- __declspec(naked)
- void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0xff000000
- pslld xmm3, 24
- movdqa xmm4, xmmword ptr kShuffleAlpha0
- movdqa xmm5, xmmword ptr kShuffleAlpha1
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqu xmm1, [eax] // read 4 pixels
- punpcklbw xmm1, xmm1 // first 2 pixel rgbs
- pmulhuw xmm0, xmm1 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqu xmm2, [eax] // read 4 pixels
- punpckhbw xmm2, xmm2 // next 2 pixel rgbs
- pmulhuw xmm1, xmm2 // rgb * a
- movdqu xmm2, [eax] // mask original alpha
- lea eax, [eax + 16]
- pand xmm2, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- por xmm0, xmm2 // copy original alpha
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBATTENUATEROW_SSSE3
- #ifdef HAS_ARGBATTENUATEROW_AVX2
- // Shuffle table duplicating alpha.
- static const uvec8 kShuffleAlpha_AVX2 = {
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
- };
- __declspec(naked)
- void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpshufb ymm2, ymm0, ymm4 // low 4 alphas
- vpshufb ymm3, ymm1, ymm4 // high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * a
- vpmulhuw ymm1, ymm1, ymm3 // rgb * a
- vpand ymm6, ymm6, ymm5 // isolate alpha
- vpsrlw ymm0, ymm0, 8
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vpor ymm0, ymm0, ymm6 // copy original alpha
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBATTENUATEROW_AVX2
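- // Both attenuate paths above approximate c = c * a / 255 per color channel
- // (pmulhuw on byte-doubled values); an illustrative scalar sketch, name
- // hypothetical:
- static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
- int width) {
- int x, b;
- for (x = 0; x < width; ++x) {
- int a = src_argb[x * 4 + 3];
- for (b = 0; b < 3; ++b) {
- dst_argb[x * 4 + b] = (uint8)(src_argb[x * 4 + b] * a / 255);
- }
- dst_argb[x * 4 + 3] = (uint8)a; // original alpha is copied through
- }
- }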
- #ifdef HAS_ARGBUNATTENUATEROW_SSE2
- // Unattenuate 4 pixels at a time.
- __declspec(naked)
- void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- lea ebx, fixed_invtbl8
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 3] // first alpha
- movzx edi, byte ptr [eax + 7] // second alpha
- punpcklbw xmm0, xmm0 // first 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm0, xmm2 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 11] // third alpha
- movzx edi, byte ptr [eax + 15] // fourth alpha
- punpckhbw xmm1, xmm1 // next 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm1, xmm2 // rgb * a
- lea eax, [eax + 16]
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- pop edi
- pop esi
- pop ebx
- ret
- }
- }
- #endif // HAS_ARGBUNATTENUATEROW_SSE2
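- // Unattenuate undoes the premultiply: roughly c = c * 255 / a, saturated.
- // The SSE2/AVX2 paths multiply by a reciprocal from the fixed_invtbl8 lookup
- // instead of dividing; this illustrative sketch (name hypothetical) divides.
- static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
- int width) {
- int x, b;
- for (x = 0; x < width; ++x) {
- int a = src_argb[x * 4 + 3];
- for (b = 0; b < 3; ++b) {
- int v = src_argb[x * 4 + b];
- if (a) {
- v = v * 255 / a;
- if (v > 255) v = 255; // vector paths saturate via packuswb
- }
- dst_argb[x * 4 + b] = (uint8)v;
- }
- dst_argb[x * 4 + 3] = (uint8)a;
- }
- }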
- #ifdef HAS_ARGBUNATTENUATEROW_AVX2
- // Shuffle table duplicating alpha.
- static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
- };
- // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
- // USE_GATHER is not on by default, due to being a slow instruction.
- #ifdef USE_GATHER
- __declspec(naked)
- void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
- vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
- }
- #else // USE_GATHER
- __declspec(naked)
- void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- sub edx, eax
- lea ebx, fixed_invtbl8
- vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
- convertloop:
- // replace VPGATHER
- movzx esi, byte ptr [eax + 3] // alpha0
- movzx edi, byte ptr [eax + 7] // alpha1
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
- movzx esi, byte ptr [eax + 11] // alpha2
- movzx edi, byte ptr [eax + 15] // alpha3
- vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
- movzx esi, byte ptr [eax + 19] // alpha4
- movzx edi, byte ptr [eax + 23] // alpha5
- vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
- movzx esi, byte ptr [eax + 27] // alpha6
- movzx edi, byte ptr [eax + 31] // alpha7
- vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
- vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
- vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
- vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
- vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
- // end of VPGATHER
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- pop edi
- pop esi
- pop ebx
- vzeroupper
- ret
- }
- }
- #endif // USE_GATHER
- #endif // HAS_ARGBUNATTENUATEROW_AVX2
- #ifdef HAS_ARGBGRAYROW_SSSE3
- // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
- __declspec(naked)
- void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
- convertloop:
- movdqu xmm0, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm0, xmm1
- paddw xmm0, xmm5 // Add .5 for rounding.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 G bytes
- movdqu xmm2, [eax] // A
- movdqu xmm3, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm2, 24
- psrld xmm3, 24
- packuswb xmm2, xmm3
- packuswb xmm2, xmm2 // 8 A bytes
- movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
- punpcklbw xmm0, xmm0 // 8 GG words
- punpcklbw xmm3, xmm2 // 8 GA words
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm3 // GGGA first 4
- punpckhwd xmm1, xmm3 // GGGA next 4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBGRAYROW_SSSE3
- #ifdef HAS_ARGBSEPIAROW_SSSE3
- // b = (r * 35 + g * 68 + b * 17) >> 7
- // g = (r * 45 + g * 88 + b * 22) >> 7
- // r = (r * 50 + g * 98 + b * 24) >> 7
- // Constant for ARGB color to sepia tone.
- static const vec8 kARGBToSepiaB = {
- 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
- };
- static const vec8 kARGBToSepiaG = {
- 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
- };
- static const vec8 kARGBToSepiaR = {
- 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
- };
- // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
- __declspec(naked)
- void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- mov ecx, [esp + 8] /* width */
- movdqa xmm2, xmmword ptr kARGBToSepiaB
- movdqa xmm3, xmmword ptr kARGBToSepiaG
- movdqa xmm4, xmmword ptr kARGBToSepiaR
- convertloop:
- movdqu xmm0, [eax] // B
- movdqu xmm6, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm6, xmm2
- phaddw xmm0, xmm6
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 B values
- movdqu xmm5, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm5, xmm3
- pmaddubsw xmm1, xmm3
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
- movdqu xmm5, [eax] // R
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm5, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
- movdqu xmm6, [eax] // A
- movdqu xmm1, [eax + 16]
- psrld xmm6, 24
- psrld xmm1, 24
- packuswb xmm6, xmm1
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm5, xmm6 // 8 RA values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
- movdqu [eax], xmm0
- movdqu [eax + 16], xmm1
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBSEPIAROW_SSSE3
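- // Scalar form of the sepia math in the comments above: each channel is a
- // weighted sum of the original B/G/R, >> 7, saturated; alpha is preserved.
- // Illustrative sketch only (name hypothetical).
- static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int b = dst_argb[x * 4 + 0];
- int g = dst_argb[x * 4 + 1];
- int r = dst_argb[x * 4 + 2];
- int sb = (r * 35 + g * 68 + b * 17) >> 7;
- int sg = (r * 45 + g * 88 + b * 22) >> 7;
- int sr = (r * 50 + g * 98 + b * 24) >> 7;
- dst_argb[x * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
- dst_argb[x * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
- dst_argb[x * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);
- }
- }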
- #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
- // Transform 8 ARGB pixels (32 bytes) with color matrix.
- // Same as Sepia except matrix is provided.
- // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
- // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
- __declspec(naked)
- void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* matrix_argb */
- movdqu xmm5, [ecx]
- pshufd xmm2, xmm5, 0x00
- pshufd xmm3, xmm5, 0x55
- pshufd xmm4, xmm5, 0xaa
- pshufd xmm5, xmm5, 0xff
- mov ecx, [esp + 16] /* width */
- convertloop:
- movdqu xmm0, [eax] // B
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm7, xmm2
- movdqu xmm6, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm6, xmm3
- pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm7 // B
- phaddsw xmm6, xmm1 // G
- psraw xmm0, 6 // B
- psraw xmm6, 6 // G
- packuswb xmm0, xmm0 // 8 B values
- packuswb xmm6, xmm6 // 8 G values
- punpcklbw xmm0, xmm6 // 8 BG values
- movdqu xmm1, [eax] // R
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm7, xmm4
- phaddsw xmm1, xmm7 // R
- movdqu xmm6, [eax] // A
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm6, xmm5
- pmaddubsw xmm7, xmm5
- phaddsw xmm6, xmm7 // A
- psraw xmm1, 6 // R
- psraw xmm6, 6 // A
- packuswb xmm1, xmm1 // 8 R values
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm1, xmm6 // 8 RA values
- movdqa xmm6, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm1 // BGRA first 4
- punpckhwd xmm6, xmm1 // BGRA next 4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm6
- lea eax, [eax + 32]
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
- #ifdef HAS_ARGBQUANTIZEROW_SSE2
- // Quantize 4 ARGB pixels (16 bytes).
- __declspec(naked)
- void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- movd xmm2, [esp + 8] /* scale */
- movd xmm3, [esp + 12] /* interval_size */
- movd xmm4, [esp + 16] /* interval_offset */
- mov ecx, [esp + 20] /* width */
- pshuflw xmm2, xmm2, 040h
- pshufd xmm2, xmm2, 044h
- pshuflw xmm3, xmm3, 040h
- pshufd xmm3, xmm3, 044h
- pshuflw xmm4, xmm4, 040h
- pshufd xmm4, xmm4, 044h
- pxor xmm5, xmm5 // constant 0
- pcmpeqb xmm6, xmm6 // generate mask 0xff000000
- pslld xmm6, 24
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm5 // first 2 pixels
- pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqu xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm5 // next 2 pixels
- pmulhuw xmm1, xmm2
- pmullw xmm0, xmm3 // * interval_size
- movdqu xmm7, [eax] // read 4 pixels
- pmullw xmm1, xmm3
- pand xmm7, xmm6 // mask alpha
- paddw xmm0, xmm4 // + interval_size / 2
- paddw xmm1, xmm4
- packuswb xmm0, xmm1
- por xmm0, xmm7
- movdqu [eax], xmm0
- lea eax, [eax + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBQUANTIZEROW_SSE2
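- // Scalar form of the quantize math above: each color channel becomes
- // (v * scale >> 16) * interval_size + interval_offset; alpha is masked back
- // in unchanged. Illustrative sketch only (name hypothetical).
- static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
- int interval_size, int interval_offset,
- int width) {
- int x, b;
- for (x = 0; x < width; ++x) {
- for (b = 0; b < 3; ++b) {
- int v = dst_argb[x * 4 + b];
- dst_argb[x * 4 + b] =
- (uint8)((v * scale >> 16) * interval_size + interval_offset);
- }
- }
- }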
- #ifdef HAS_ARGBSHADEROW_SSE2
- // Shade 4 pixels at a time by specified value.
- __declspec(naked)
- void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- movd xmm2, [esp + 16] // value
- punpcklbw xmm2, xmm2
- punpcklqdq xmm2, xmm2
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
- }
- #endif // HAS_ARGBSHADEROW_SSE2
- #ifdef HAS_ARGBMULTIPLYROW_SSE2
- // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
- __declspec(naked)
- void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pxor xmm5, xmm5 // constant 0
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm2, [esi] // read 4 pixels from src_argb1
- movdqu xmm1, xmm0
- movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
- lea eax, [eax + 16]
- lea esi, [esi + 16]
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBMULTIPLYROW_SSE2
- #ifdef HAS_ARGBADDROW_SSE2
- // Add 2 rows of ARGB pixels together, 4 pixels at a time.
- // TODO(fbarchard): Port this to posix, neon and other math functions.
- __declspec(naked)
- void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub ecx, 4
- jl convertloop49
- convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jge convertloop4
- convertloop49:
- add ecx, 4 - 1
- jl convertloop19
- convertloop1:
- movd xmm0, [eax] // read 1 pixel from src_argb0
- lea eax, [eax + 4]
- movd xmm1, [esi] // read 1 pixel from src_argb1
- lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge convertloop1
- convertloop19:
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBADDROW_SSE2
- #ifdef HAS_ARGBSUBTRACTROW_SSE2
- // Subtract one row of ARGB pixels from another, 4 pixels at a time.
- __declspec(naked)
- void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBSUBTRACTROW_SSE2
- #ifdef HAS_ARGBMULTIPLYROW_AVX2
- // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
- __declspec(naked)
- void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
- convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
- lea esi, [esi + 32]
- vpunpcklbw ymm0, ymm1, ymm1 // low 4
- vpunpckhbw ymm1, ymm1, ymm1 // high 4
- vpunpcklbw ymm2, ymm3, ymm5 // low 4
- vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
- vpackuswb ymm0, ymm0, ymm1
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBMULTIPLYROW_AVX2
- #ifdef HAS_ARGBADDROW_AVX2
- // Add 2 rows of ARGB pixels together, 8 pixels at a time.
- __declspec(naked)
- void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBADDROW_AVX2
- #ifdef HAS_ARGBSUBTRACTROW_AVX2
- // Subtract one row of ARGB pixels from another, 8 pixels at a time.
- __declspec(naked)
- void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBSUBTRACTROW_AVX2
- #ifdef HAS_SOBELXROW_SSE2
- // SobelX as a matrix is
- // -1 0 1
- // -2 0 2
- // -1 0 1
- __declspec(naked)
- void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y0
- mov esi, [esp + 8 + 8] // src_y1
- mov edi, [esp + 8 + 12] // src_y2
- mov edx, [esp + 8 + 16] // dst_sobelx
- mov ecx, [esp + 8 + 20] // width
- sub esi, eax
- sub edi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
- movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1 // middle row counted twice (kernel weight 2)
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- sub ecx, 8
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELXROW_SSE2
- #ifdef HAS_SOBELYROW_SSE2
- // SobelY as a matrix is
- // -1 -2 -1
- // 0 0 0
- // 1 2 1
- __declspec(naked)
- void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_y0
- mov esi, [esp + 4 + 8] // src_y1
- mov edx, [esp + 4 + 12] // dst_sobely
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
- movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1 // center column counted twice (kernel weight 2)
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- sub ecx, 8
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELYROW_SSE2
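- // Scalar sketch of SobelXRow above (name hypothetical): the 3x3 kernel
- // applied down three source rows, middle row weighted twice, then absolute
- // value, saturated to 8 bits. Like the asm, it reads 2 bytes past width.
- static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx,
- int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int s = (src_y0[x] - src_y0[x + 2]) +
- 2 * (src_y1[x] - src_y1[x + 2]) +
- (src_y2[x] - src_y2[x + 2]);
- if (s < 0) s = -s; // abs, as max(x, -x) in the asm
- dst_sobelx[x] = (uint8)(s > 255 ? 255 : s);
- }
- }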
- #ifdef HAS_SOBELROW_SSE2
- // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
- // A = 255
- // R = Sobel
- // G = Sobel
- // B = Sobel
- __declspec(naked)
- void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- pslld xmm5, 24 // 0xff000000
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqa xmm2, xmm0 // GG
- punpcklbw xmm2, xmm0 // First 8
- punpckhbw xmm0, xmm0 // Next 8
- movdqa xmm1, xmm2 // GGGG
- punpcklwd xmm1, xmm2 // First 4
- punpckhwd xmm2, xmm2 // Next 4
- por xmm1, xmm5 // GGGA
- por xmm2, xmm5
- movdqa xmm3, xmm0 // GGGG
- punpcklwd xmm3, xmm0 // Next 4
- punpckhwd xmm0, xmm0 // Last 4
- por xmm3, xmm5 // GGGA
- por xmm0, xmm5
- movdqu [edx], xmm1
- movdqu [edx + 16], xmm2
- movdqu [edx + 32], xmm3
- movdqu [edx + 48], xmm0
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELROW_SSE2
- #ifdef HAS_SOBELTOPLANEROW_SSE2
- // Adds Sobel X and Sobel Y and stores Sobel into a plane.
- __declspec(naked)
- void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELTOPLANEROW_SSE2
- #ifdef HAS_SOBELXYROW_SSE2
- // Mixes Sobel X, Sobel Y and Sobel into ARGB.
- // A = 255
- // R = Sobel X
- // G = Sobel
- // B = Sobel Y
- __declspec(naked)
- void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- paddusb xmm2, xmm1 // sobel = sobelx + sobely
- movdqa xmm3, xmm0 // XA
- punpcklbw xmm3, xmm5
- punpckhbw xmm0, xmm5
- movdqa xmm4, xmm1 // YS
- punpcklbw xmm4, xmm2
- punpckhbw xmm1, xmm2
- movdqa xmm6, xmm4 // YSXA
- punpcklwd xmm6, xmm3 // First 4
- punpckhwd xmm4, xmm3 // Next 4
- movdqa xmm7, xmm1 // YSXA
- punpcklwd xmm7, xmm0 // Next 4
- punpckhwd xmm1, xmm0 // Last 4
- movdqu [edx], xmm6
- movdqu [edx + 16], xmm4
- movdqu [edx + 32], xmm7
- movdqu [edx + 48], xmm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_SOBELXYROW_SSE2
- #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
- // Consider float CumulativeSum.
- // Consider calling CumulativeSum one row at time as needed.
- // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
- // Convert cumulative sum for an area to an average for 1 pixel.
- // topleft is pointer to top left of CumulativeSum buffer for area.
- // botleft is pointer to bottom left of CumulativeSum buffer.
- // width is offset from left to right of area in CumulativeSum buffer measured
- // in number of ints.
- // area is the number of pixels in the area being averaged.
- // dst points to pixel to store result to.
- // count is number of averaged pixels to produce.
- // Does 4 pixels at a time.
- // This function requires alignment on accumulation buffer pointers.
- void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst,
- int count) {
- __asm {
- mov eax, topleft // eax topleft
- mov esi, botleft // esi botleft
- mov edx, width
- movd xmm5, area
- mov edi, dst
- mov ecx, count
- cvtdq2ps xmm5, xmm5
- rcpss xmm4, xmm5 // 1.0f / area
- pshufd xmm4, xmm4, 0
- sub ecx, 4
- jl l4b
- cmp area, 128 // 128 pixels will not overflow 15 bits.
- ja l4
- pshufd xmm5, xmm5, 0 // area
- pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
- psrld xmm6, 16
- cvtdq2ps xmm6, xmm6
- addps xmm5, xmm6 // (65536.0 + area - 1)
- mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
- cvtps2dq xmm5, xmm5 // 0.16 fixed point
- packssdw xmm5, xmm5 // 16 bit shorts
- // 4 pixel loop small blocks.
- s4:
- // top left
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
- packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
- packssdw xmm2, xmm3
- pmulhuw xmm0, xmm5
- pmulhuw xmm2, xmm5
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge s4
- jmp l4b
- // 4 pixel loop
- l4:
- // top left
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
- cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
- cvtdq2ps xmm1, xmm1
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- cvtdq2ps xmm2, xmm2
- cvtdq2ps xmm3, xmm3
- mulps xmm2, xmm4
- mulps xmm3, xmm4
- cvtps2dq xmm0, xmm0
- cvtps2dq xmm1, xmm1
- cvtps2dq xmm2, xmm2
- cvtps2dq xmm3, xmm3
- packssdw xmm0, xmm1
- packssdw xmm2, xmm3
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge l4
- l4b:
- add ecx, 4 - 1
- jl l1b
- // 1 pixel loop
- l1:
- movdqu xmm0, [eax]
- psubd xmm0, [eax + edx * 4]
- lea eax, [eax + 16]
- psubd xmm0, [esi]
- paddd xmm0, [esi + edx * 4]
- lea esi, [esi + 16]
- cvtdq2ps xmm0, xmm0
- mulps xmm0, xmm4
- cvtps2dq xmm0, xmm0
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
- sub ecx, 1
- jge l1
- l1b:
- }
- }
- #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
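- // Scalar sketch of the integral-image box average above, per the pointer
- // description in the comment block: sum = topleft - topright - botleft +
- // botright, scaled by 1 / area. The asm uses a reciprocal approximation and,
- // for small areas, 16-bit fixed point; this sketch (name hypothetical) divides.
- static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
- const int32* botleft, int width,
- int area, uint8* dst, int count) {
- int x, b;
- for (x = 0; x < count; ++x) {
- for (b = 0; b < 4; ++b) {
- int32 sum = topleft[x * 4 + b] - topleft[x * 4 + width + b] -
- botleft[x * 4 + b] + botleft[x * 4 + width + b];
- dst[x * 4 + b] = (uint8)(sum / area);
- }
- }
- }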
- #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
- // Creates a table of cumulative sums where each value is a sum of all values
- // above and to the left of the value.
- void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
- __asm {
- mov eax, row
- mov edx, cumsum
- mov esi, previous_cumsum
- mov ecx, width
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- sub ecx, 4
- jl l4b
- test edx, 15
- jne l4b
- // 4 pixel loop
- l4:
- movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
- lea eax, [eax + 16]
- movdqa xmm4, xmm2
- punpcklbw xmm2, xmm1
- movdqa xmm3, xmm2
- punpcklwd xmm2, xmm1
- punpckhwd xmm3, xmm1
- punpckhbw xmm4, xmm1
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm1
- punpckhwd xmm5, xmm1
- paddd xmm0, xmm2
- movdqu xmm2, [esi] // previous row above.
- paddd xmm2, xmm0
- paddd xmm0, xmm3
- movdqu xmm3, [esi + 16]
- paddd xmm3, xmm0
- paddd xmm0, xmm4
- movdqu xmm4, [esi + 32]
- paddd xmm4, xmm0
- paddd xmm0, xmm5
- movdqu xmm5, [esi + 48]
- lea esi, [esi + 64]
- paddd xmm5, xmm0
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- movdqu [edx + 32], xmm4
- movdqu [edx + 48], xmm5
- lea edx, [edx + 64]
- sub ecx, 4
- jge l4
- l4b:
- add ecx, 4 - 1
- jl l1b
- // 1 pixel loop
- l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
- lea eax, [eax + 4]
- punpcklbw xmm2, xmm1
- punpcklwd xmm2, xmm1
- paddd xmm0, xmm2
- movdqu xmm2, [esi]
- lea esi, [esi + 16]
- paddd xmm2, xmm0
- movdqu [edx], xmm2
- lea edx, [edx + 16]
- sub ecx, 1
- jge l1
- l1b:
- }
- }
- #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
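- // Scalar sketch of the cumulative sum above (name hypothetical): a running
- // per-channel total across this row, each value added to the already
- // cumulative row above.
- static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
- const int32* previous_cumsum,
- int width) {
- int32 sum[4] = {0, 0, 0, 0};
- int x, b;
- for (x = 0; x < width; ++x) {
- for (b = 0; b < 4; ++b) {
- sum[b] += row[x * 4 + b];
- cumsum[x * 4 + b] = sum[b] + previous_cumsum[x * 4 + b];
- }
- }
- }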
- #ifdef HAS_ARGBAFFINEROW_SSE2
- // Copy ARGB pixels from source image with slope to a row of destination.
- __declspec(naked)
- LIBYUV_API
- void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 12] // src_argb
- mov esi, [esp + 16] // stride
- mov edx, [esp + 20] // dst_argb
- mov ecx, [esp + 24] // pointer to uv_dudv
- movq xmm2, qword ptr [ecx] // uv
- movq xmm7, qword ptr [ecx + 8] // dudv
- mov ecx, [esp + 28] // width
- shl esi, 16 // esi = (stride << 16) | 4: short pair for pmaddwd below
- add esi, 4
- movd xmm5, esi
- sub ecx, 4
- jl l4b
- // setup for 4 pixel loop
- pshufd xmm7, xmm7, 0x44 // dup dudv
- pshufd xmm5, xmm5, 0 // dup 4, stride
- movdqa xmm0, xmm2 // x0, y0, x1, y1
- addps xmm0, xmm7
- movlhps xmm2, xmm0
- movdqa xmm4, xmm7
- addps xmm4, xmm4 // dudv *= 2
- movdqa xmm3, xmm2 // x2, y2, x3, y3
- addps xmm3, xmm4
- addps xmm4, xmm4 // dudv *= 4
- // 4 pixel loop
- l4:
- cvttps2dq xmm0, xmm2 // x, y float to int first 2
- cvttps2dq xmm1, xmm3 // x, y float to int next 2
- packssdw xmm0, xmm1 // x, y as 8 shorts
- pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd xmm1, [eax + esi] // read pixel 0
- movd xmm6, [eax + edi] // read pixel 1
- punpckldq xmm1, xmm6 // combine pixel 0 and 1
- addps xmm2, xmm4 // x, y += dx, dy first 2
- movq qword ptr [edx], xmm1
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- movd xmm6, [eax + esi] // read pixel 2
- movd xmm0, [eax + edi] // read pixel 3
- punpckldq xmm6, xmm0 // combine pixel 2 and 3
- addps xmm3, xmm4 // x, y += dx, dy next 2
- movq qword ptr [edx + 8], xmm6
- lea edx, [edx + 16]
- sub ecx, 4
- jge l4
- l4b:
- add ecx, 4 - 1
- jl l1b
- // 1 pixel loop
- l1:
- cvttps2dq xmm0, xmm2 // x, y float to int
- packssdw xmm0, xmm0 // x, y as shorts
- pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
- addps xmm2, xmm7 // x, y += dx, dy
- movd esi, xmm0
- movd xmm0, [eax + esi] // copy a pixel
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge l1
- l1b:
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBAFFINEROW_SSE2
- #ifdef HAS_INTERPOLATEROW_AVX2
- // Bilinear filter 32x2 -> 32x1
- __declspec(naked)
- void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- sub edi, esi
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- vmovd xmm0, eax // high fraction 0..255
- neg eax
- add eax, 256
- vmovd xmm5, eax // low fraction 256..1
- vpunpcklbw xmm5, xmm5, xmm0
- vpunpcklwd xmm5, xmm5, xmm5
- vbroadcastss ymm5, xmm5
- mov eax, 0x80808080 // 128b for bias and rounding.
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
- xloop:
- vmovdqu ymm0, [esi]
- vmovdqu ymm2, [esi + edx]
- vpunpckhbw ymm1, ymm0, ymm2 // mutates
- vpunpcklbw ymm0, ymm0, ymm2
- vpsubb ymm1, ymm1, ymm4 // bias to signed image
- vpsubb ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm5, ymm1
- vpmaddubsw ymm0, ymm5, ymm0
- vpaddw ymm1, ymm1, ymm4 // unbias and round
- vpaddw ymm0, ymm0, ymm4
- vpsrlw ymm1, ymm1, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutates
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg xloop
- jmp xloop99
- // Blend 50 / 50.
- xloop50:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg xloop50
- jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
- xloop100:
- rep movsb
- xloop99:
- pop edi
- pop esi
- vzeroupper
- ret
- }
- }
- #endif // HAS_INTERPOLATEROW_AVX2
- // Bilinear filter 16x2 -> 16x1
- // TODO(fbarchard): Consider allowing 256 using memcpy.
- __declspec(naked)
- void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- movd xmm0, eax // high fraction 0..255
- neg eax
- add eax, 256
- movd xmm5, eax // low fraction 256..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm4, eax
- pshufd xmm4, xmm4, 0x00
- xloop:
- movdqu xmm0, [esi]
- movdqu xmm2, [esi + edx]
- movdqu xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- psubb xmm0, xmm4 // bias image by -128
- psubb xmm1, xmm4
- movdqa xmm2, xmm5
- movdqa xmm3, xmm5
- pmaddubsw xmm2, xmm0
- pmaddubsw xmm3, xmm1
- paddw xmm2, xmm4
- paddw xmm3, xmm4
- psrlw xmm2, 8
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [esi + edi], xmm2
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop
- jmp xloop99
- // Blend 50 / 50.
- xloop50:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop50
- jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
- xloop100:
- movdqu xmm0, [esi]
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop100
- xloop99:
- pop edi
- pop esi
- ret
- }
- }
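- // Both interpolate paths compute a weighted average of two rows; the
- // bias/unbias trick above is equivalent to this scalar form (illustrative
- // sketch only, name hypothetical):
- static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction; // weight of the lower row, 0..255
- int y0_fraction = 256 - y1_fraction;
- int x;
- for (x = 0; x < dst_width; ++x) {
- dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
- src_ptr[x + src_stride] * y1_fraction + 128) >> 8);
- }
- }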
- // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
- __declspec(naked)
- void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqu xmm5, [ecx]
- mov ecx, [esp + 16] // width
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg wloop
- ret
- }
- }
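- // The shuffler argument is a pshufb control mask: within each 16-byte group,
- // output byte i is taken from input byte shuffler[i]. A scalar sketch
- // (illustrative only; assumes mask bytes < 0x80, as all masks used here are):
- static void ARGBShuffleRow_Reference(const uint8* src_argb, uint8* dst_argb,
-                                      const uint8* shuffler, int width) {
-   for (int x = 0; x < width * 4; x += 16) {  // width is in pixels, 4 bytes each.
-     for (int i = 0; i < 16; ++i) {
-       dst_argb[x + i] = src_argb[x + (shuffler[i] & 15)];
-     }
-   }
- }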
- #ifdef HAS_ARGBSHUFFLEROW_AVX2
- __declspec(naked)
- void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
- mov ecx, [esp + 16] // width
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpshufb ymm0, ymm0, ymm5
- vpshufb ymm1, ymm1, ymm5
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg wloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBSHUFFLEROW_AVX2
- __declspec(naked)
- void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
- __asm {
- push ebx
- push esi
- mov eax, [esp + 8 + 4] // src_argb
- mov edx, [esp + 8 + 8] // dst_argb
- mov esi, [esp + 8 + 12] // shuffler
- mov ecx, [esp + 8 + 16] // width
- pxor xmm5, xmm5
- mov ebx, [esi] // shuffler
- cmp ebx, 0x03000102
- je shuf_3012
- cmp ebx, 0x00010203
- je shuf_0123
- cmp ebx, 0x00030201
- je shuf_0321
- cmp ebx, 0x02010003
- je shuf_2103
- // TODO(fbarchard): Use one source pointer and 3 offsets.
- shuf_any1:
- movzx ebx, byte ptr [esi]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx], bl
- movzx ebx, byte ptr [esi + 1]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 1], bl
- movzx ebx, byte ptr [esi + 2]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 2], bl
- movzx ebx, byte ptr [esi + 3]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 3], bl
- lea eax, [eax + 4]
- lea edx, [edx + 4]
- sub ecx, 1
- jg shuf_any1
- jmp shuf99
- shuf_0123:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
- pshuflw xmm0, xmm0, 01Bh
- pshufhw xmm1, xmm1, 01Bh
- pshuflw xmm1, xmm1, 01Bh
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_0123
- jmp shuf99
- shuf_0321:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
- pshuflw xmm0, xmm0, 039h
- pshufhw xmm1, xmm1, 039h
- pshuflw xmm1, xmm1, 039h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_0321
- jmp shuf99
- shuf_2103:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
- pshuflw xmm0, xmm0, 093h
- pshufhw xmm1, xmm1, 093h
- pshuflw xmm1, xmm1, 093h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_2103
- jmp shuf99
- shuf_3012:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
- pshuflw xmm0, xmm0, 0C6h
- pshufhw xmm1, xmm1, 0C6h
- pshuflw xmm1, xmm1, 0C6h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_3012
- shuf99:
- pop esi
- pop ebx
- ret
- }
- }
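- // Usage sketch for the dispatch above (the mask constant is hypothetical,
- // defined here only for illustration). Its first four bytes load as the
- // little-endian dword 0x00010203, so the call takes the shuf_0123
- // (BGRAToARGB) fast path:
- //   static const uint8 kShuffleBGRAToARGB[16] =
- //       {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
- //   ARGBShuffleRow_SSE2(src_bgra, dst_argb, kShuffleBGRAToARGB, width);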
- // YUY2 - Macro-pixel = 2 image pixels
- // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
- // UYVY - Macro-pixel = 2 image pixels
- // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5....
- __declspec(naked)
- void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2 // YUYV
- punpckhbw xmm1, xmm2
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm1
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- __declspec(naked)
- void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- movdqa xmm1, xmm2
- lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
- punpckhbw xmm2, xmm0
- movdqu [edi], xmm1
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
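- // Scalar sketch of the 4:2:2 packing done by the two kernels above
- // (illustrative only; each U/V sample is shared by a pair of Y samples):
- static void I422ToYUY2Row_Reference(const uint8* src_y, const uint8* src_u,
-                                     const uint8* src_v, uint8* dst_frame,
-                                     int width) {
-   for (int x = 0; x < width; x += 2) {
-     dst_frame[0] = src_y[0];  // Y0
-     dst_frame[1] = src_u[0];  // U0
-     dst_frame[2] = src_y[1];  // Y1
-     dst_frame[3] = src_v[0];  // V0
-     src_y += 2;
-     src_u += 1;
-     src_v += 1;
-     dst_frame += 4;
-   }
- }
- // For UYVY the byte order per macro-pixel is U0 Y0 V0 Y1 instead.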
- #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
- __declspec(naked)
- void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* src_argb */
- mov edx, [esp + 4 + 8] /* dst_argb */
- mov esi, [esp + 4 + 12] /* poly */
- mov ecx, [esp + 4 + 16] /* width */
- pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
- // 2 pixel loop.
- convertloop:
- // SSE4.1 alternative, kept for reference:
- // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
- // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
- movq xmm0, qword ptr [eax] // BGRABGRA
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm3
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm3 // pixel 0
- punpckhwd xmm4, xmm3 // pixel 1
- cvtdq2ps xmm0, xmm0 // 4 floats
- cvtdq2ps xmm4, xmm4
- movdqa xmm1, xmm0 // X
- movdqa xmm5, xmm4
- mulps xmm0, [esi + 16] // C1 * X
- mulps xmm4, [esi + 16]
- addps xmm0, [esi] // result = C0 + C1 * X
- addps xmm4, [esi]
- movdqa xmm2, xmm1
- movdqa xmm6, xmm5
- mulps xmm2, xmm1 // X * X
- mulps xmm6, xmm5
- mulps xmm1, xmm2 // X * X * X
- mulps xmm5, xmm6
- mulps xmm2, [esi + 32] // C2 * X * X
- mulps xmm6, [esi + 32]
- mulps xmm1, [esi + 48] // C3 * X * X * X
- mulps xmm5, [esi + 48]
- addps xmm0, xmm2 // result += C2 * X * X
- addps xmm4, xmm6
- addps xmm0, xmm1 // result += C3 * X * X * X
- addps xmm4, xmm5
- cvttps2dq xmm0, xmm0
- cvttps2dq xmm4, xmm4
- packuswb xmm0, xmm4
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 2
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBPOLYNOMIALROW_SSE2
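- // Scalar sketch of the cubic applied by the SSE2 kernel above and the AVX2
- // kernel below (illustrative only): poly holds four coefficients per channel,
- // laid out as 4 floats of C0, then C1, C2 and C3, so channel c of each pixel
- // maps through C0[c] + C1[c]*x + C2[c]*x^2 + C3[c]*x^3, truncated and clamped
- // to a byte.
- static void ARGBPolynomialRow_Reference(const uint8* src_argb, uint8* dst_argb,
-                                         const float* poly, int width) {
-   for (int i = 0; i < width * 4; ++i) {
-     const float* c = poly + (i & 3);  // Coefficients are strided by 4 floats.
-     float x = (float)src_argb[i];
-     float v = c[0] + c[4] * x + c[8] * x * x + c[12] * x * x * x;
-     dst_argb[i] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
-   }
- }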
- #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
- __declspec(naked)
- void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* poly */
- vbroadcastf128 ymm4, [ecx] // C0
- vbroadcastf128 ymm5, [ecx + 16] // C1
- vbroadcastf128 ymm6, [ecx + 32] // C2
- vbroadcastf128 ymm7, [ecx + 48] // C3
- mov ecx, [esp + 16] /* width */
- // 2 pixel loop.
- convertloop:
- vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
- lea eax, [eax + 8]
- vcvtdq2ps ymm0, ymm0 // X 8 floats
- vmulps ymm2, ymm0, ymm0 // X * X
- vmulps ymm3, ymm0, ymm7 // C3 * X
- vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
- vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
- vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
- vcvttps2dq ymm0, ymm0
- vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
- vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
- vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
- vmovq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 2
- jg convertloop
- vzeroupper
- ret
- }
- }
- #endif // HAS_ARGBPOLYNOMIALROW_AVX2
- #ifdef HAS_ARGBCOLORTABLEROW_X86
- // Transform ARGB pixels with color table.
- __declspec(naked)
- void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
- // 1 pixel loop.
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- movzx edx, byte ptr [eax - 4 + 3]
- movzx edx, byte ptr [esi + edx * 4 + 3]
- mov byte ptr [eax - 4 + 3], dl
- dec ecx
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBCOLORTABLEROW_X86
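- // Scalar sketch of the lookup above (illustrative only): the table holds four
- // interleaved 256-entry sub-tables, so channel c of value v maps through
- // table_argb[v * 4 + c]. RGBColorTableRow_X86 below is identical except that
- // it leaves alpha (byte 3) untouched.
- static void ARGBColorTableRow_Reference(uint8* dst_argb,
-                                         const uint8* table_argb, int width) {
-   for (int x = 0; x < width; ++x) {
-     uint8* p = dst_argb + x * 4;
-     p[0] = table_argb[p[0] * 4 + 0];  // B
-     p[1] = table_argb[p[1] * 4 + 1];  // G
-     p[2] = table_argb[p[2] * 4 + 2];  // R
-     p[3] = table_argb[p[3] * 4 + 3];  // A
-   }
- }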
- #ifdef HAS_RGBCOLORTABLEROW_X86
- // Transform RGB pixels with color table.
- __declspec(naked)
- void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
- // 1 pixel loop.
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- dec ecx
- jg convertloop
- pop esi
- ret
- }
- }
- #endif // HAS_RGBCOLORTABLEROW_X86
- #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
- // Transform RGB pixels with luma table.
- __declspec(naked)
- void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- int width,
- const uint8* luma, uint32 lumacoeff) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] /* src_argb */
- mov edi, [esp + 8 + 8] /* dst_argb */
- mov ecx, [esp + 8 + 12] /* width */
- movd xmm2, dword ptr [esp + 8 + 16] // luma table
- movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
- pshufd xmm2, xmm2, 0
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
- psllw xmm4, 8
- pxor xmm5, xmm5
- // 4 pixel loop.
- convertloop:
- movdqu xmm0, xmmword ptr [eax] // 4 pixels, used to generate luma row ptrs
- pmaddubsw xmm0, xmm3
- phaddw xmm0, xmm0
- pand xmm0, xmm4 // mask out low bits
- punpcklwd xmm0, xmm5
- paddd xmm0, xmm2 // add table base
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
- movzx edx, byte ptr [eax]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi], dl
- movzx edx, byte ptr [eax + 1]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 1], dl
- movzx edx, byte ptr [eax + 2]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 2], dl
- movzx edx, byte ptr [eax + 3] // copy alpha.
- mov byte ptr [edi + 3], dl
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
- movzx edx, byte ptr [eax + 4]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 4], dl
- movzx edx, byte ptr [eax + 5]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 5], dl
- movzx edx, byte ptr [eax + 6]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 6], dl
- movzx edx, byte ptr [eax + 7] // copy alpha.
- mov byte ptr [edi + 7], dl
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
- movzx edx, byte ptr [eax + 8]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 8], dl
- movzx edx, byte ptr [eax + 9]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 9], dl
- movzx edx, byte ptr [eax + 10]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 10], dl
- movzx edx, byte ptr [eax + 11] // copy alpha.
- mov byte ptr [edi + 11], dl
- movd esi, xmm0
- movzx edx, byte ptr [eax + 12]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 12], dl
- movzx edx, byte ptr [eax + 13]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 13], dl
- movzx edx, byte ptr [eax + 14]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 14], dl
- movzx edx, byte ptr [eax + 15] // copy alpha.
- mov byte ptr [edi + 15], dl
- lea eax, [eax + 16]
- lea edi, [edi + 16]
- sub ecx, 4
- jg convertloop
- pop edi
- pop esi
- ret
- }
- }
- #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
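- // Scalar sketch of the luma lookup above (illustrative only; assumes the four
- // lumacoeff byte weights are small and positive, since the SIMD path treats
- // them as signed bytes). The weighted sum of B, G, R (and A) selects one of
- // 256 rows of 256 entries in the luma table; B, G and R are remapped through
- // that row and alpha is copied through:
- static void ARGBLumaColorTableRow_Reference(const uint8* src_argb,
-                                             uint8* dst_argb, int width,
-                                             const uint8* luma,
-                                             uint32 lumacoeff) {
-   const uint8* w = (const uint8*)&lumacoeff;  // Byte 0 weights B (x86 is LE).
-   for (int x = 0; x < width; ++x) {
-     int sum = src_argb[0] * w[0] + src_argb[1] * w[1] +
-               src_argb[2] * w[2] + src_argb[3] * w[3];
-     const uint8* row = luma + (sum & 0xFF00);  // 256-byte aligned table row.
-     dst_argb[0] = row[src_argb[0]];
-     dst_argb[1] = row[src_argb[1]];
-     dst_argb[2] = row[src_argb[2]];
-     dst_argb[3] = src_argb[3];  // Copy alpha unchanged.
-     src_argb += 4;
-     dst_argb += 4;
-   }
- }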
- #endif // defined(_M_X64)
- #ifdef __cplusplus
- } // extern "C"
- } // namespace libyuv
- #endif
- #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))