row_gcc.cc

/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};
static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};
static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};
static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};
static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};
static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"
      "punpckhwd %%xmm1,%%xmm1 \n"
      "por %%xmm5,%%xmm0 \n"
      "por %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_y), // %0
        "+r"(dst_argb), // %1
        "+r"(width) // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_rgb24), // %0
        "+r"(dst_argb), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskRGB24ToARGB) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw), // %0
        "+r"(dst_argb), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskRAWToARGB) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm3 \n"
      "movdqa %4,%%xmm4 \n"
      "movdqa %5,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x4(%0),%%xmm1 \n"
      "movdqu 0x8(%0),%%xmm2 \n"
      "lea 0x18(%0),%0 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw), // %0
        "+r"(dst_rgb24), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskRAWToRGB24_0), // %3
        "m"(kShuffleMaskRAWToRGB24_1), // %4
        "m"(kShuffleMaskRAWToRGB24_2) // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x20802080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xa,%%xmm4 \n"
      "psrlw $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "por %%xmm7,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x42004200,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "movdqa %%xmm3,%%xmm4 \n"
      "psrlw $0x6,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "psllw $0x1,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "pand %%xmm7,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0xf0f0f0f,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x4,%%xmm5 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pand %%xmm5,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "psllw $0x4,%%xmm1 \n"
      "psrlw $0x4,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,0x00(%1,%0,2) \n"
      "movdqu %%xmm1,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskARGBToRGB24) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskARGBToRAW) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskARGBToRGB24), // %3
        "m"(kPermdRGB24_AVX) // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24
static const ulvec8 kPermARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa %3,%%ymm5 \n"
      "vmovdqa %4,%%ymm6 \n"
      "vmovdqa %5,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
      "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(kPermARGBToRGB24_0), // %3
        "m"(kPermARGBToRGB24_1), // %4
        "m"(kPermARGBToRGB24_2) // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(kShuffleMaskARGBToRAW), // %3
        "m"(kPermdRGB24_AVX) // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(dither4) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"
      "vpslld $0xb,%%ymm3,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(dither4) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1b,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x5,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pslld $0xa,%%xmm6 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "pslld $0xf,%%xmm7 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "psrad $0x10,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x6,%%xmm2 \n"
      "psrld $0x9,%%xmm3 \n"
      "pand %%xmm7,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "pand %%xmm5,%%xmm2 \n"
      "pand %%xmm6,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xc,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm3 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm3,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "psrlq $0x4,%%xmm0 \n"
      "psrlq $0x8,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*
ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply
by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
could be used for Green - (1024+4) - putting the 10 bit green in the lsb. Alpha
would be a simple multiplier to shift it into position. It wants a gap of 10
above the green. Green is 10 bits, so there are 6 bits in the low short. 4
more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
result left 10 to position the A and G channels.
*/
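
// Illustrative note, not part of libyuv: a scalar sketch of the multiplier
// trick described above, assuming an 8 bit channel value v sits in the upper
// byte of a 16 bit lane. pmulhuw keeps the high 16 bits of the product, so
// ((v << 8) * 1028) >> 16 equals (v << 2) | (v >> 6), the usual 8-to-10 bit
// replication. kMulRB10 below packs 1028 * 16 in the high word (red,
// pre-shifted 4 left) and 1028 in the low word (blue).
#if 0  // Reference sketch only; this hypothetical helper is never compiled.
static uint32_t ScalarExpand8To10(uint32_t v) {
  uint32_t via_mul = ((v << 8) * 1028u) >> 16;  // what pmulhuw computes
  uint32_t via_shift = (v << 2) | (v >> 6);     // classic bit replication
  return via_mul == via_shift ? via_mul : ~0u;  // identical for v in [0, 255]
}
#endif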

// Shuffle tables for AR30: place the Blue and Red bytes (Red and Blue for
// ABGR input) in the upper byte of successive 16 bit lanes.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"  // multiplier for RB
      "movd %5,%%xmm4 \n"  // mask for R10 B10
      "movd %6,%%xmm5 \n"  // mask for AG
      "movd %7,%%xmm6 \n"  // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ARGB pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
      "pand %%xmm5,%%xmm0 \n"  // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"  // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src), // %0
        "+r"(dst), // %1
        "+r"(width) // %2
      : "m"(kShuffleRB30), // %3
        "m"(kMulRB10), // %4
        "m"(kMaskRB10), // %5
        "m"(kMaskAG10), // %6
        "m"(kMulAG10) // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
  837. void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  838. asm volatile(
  839. "movdqa %3,%%xmm2 \n" // shuffler for RB
  840. "movd %4,%%xmm3 \n" // multipler for RB
  841. "movd %5,%%xmm4 \n" // mask for R10 B10
  842. "movd %6,%%xmm5 \n" // mask for AG
  843. "movd %7,%%xmm6 \n" // multipler for AG
  844. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  845. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  846. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  847. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  848. "sub %0,%1 \n"
  849. "1: \n"
  850. "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
  851. "movdqa %%xmm0,%%xmm1 \n"
  852. "pshufb %%xmm2,%%xmm1 \n" // R0B0
  853. "pand %%xmm5,%%xmm0 \n" // A0G0
  854. "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
  855. "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
  856. "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
  857. "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
  858. "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
  859. "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
  860. "add $0x10,%0 \n"
  861. "sub $0x4,%2 \n"
  862. "jg 1b \n"
  863. : "+r"(src), // %0
  864. "+r"(dst), // %1
  865. "+r"(width) // %2
  866. : "m"(kShuffleBR30), // %3 reversed shuffler
  867. "m"(kMulRB10), // %4
  868. "m"(kMaskRB10), // %5
  869. "m"(kMaskAG10), // %6
  870. "m"(kMulAG10) // %7
  871. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  872. }
  873. #ifdef HAS_ARGBTOAR30ROW_AVX2
  874. void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  875. asm volatile(
  876. "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
  877. "vbroadcastss %4,%%ymm3 \n" // multipler for RB
  878. "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
  879. "vbroadcastss %6,%%ymm5 \n" // mask for AG
  880. "vbroadcastss %7,%%ymm6 \n" // multipler for AG
  881. "sub %0,%1 \n"
  882. "1: \n"
  883. "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
  884. "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
  885. "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
  886. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
  887. "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
  888. "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
  889. "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
  890. "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
  891. "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
  892. "add $0x20,%0 \n"
  893. "sub $0x8,%2 \n"
  894. "jg 1b \n"
  895. "vzeroupper \n"
  896. : "+r"(src), // %0
  897. "+r"(dst), // %1
  898. "+r"(width) // %2
  899. : "m"(kShuffleRB30), // %3
  900. "m"(kMulRB10), // %4
  901. "m"(kMaskRB10), // %5
  902. "m"(kMaskAG10), // %6
  903. "m"(kMulAG10) // %7
  904. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  905. }
  906. #endif
  907. #ifdef HAS_ABGRTOAR30ROW_AVX2
  908. void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  909. asm volatile(
  910. "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
  911. "vbroadcastss %4,%%ymm3 \n" // multipler for RB
  912. "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
  913. "vbroadcastss %6,%%ymm5 \n" // mask for AG
  914. "vbroadcastss %7,%%ymm6 \n" // multipler for AG
  915. "sub %0,%1 \n"
  916. "1: \n"
  917. "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
  918. "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
  919. "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
  920. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
  921. "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
  922. "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
  923. "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
  924. "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
  925. "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
  926. "add $0x20,%0 \n"
  927. "sub $0x8,%2 \n"
  928. "jg 1b \n"
  929. "vzeroupper \n"
  930. : "+r"(src), // %0
  931. "+r"(dst), // %1
  932. "+r"(width) // %2
  933. : "m"(kShuffleBR30), // %3 reversed shuffler
  934. "m"(kMulRB10), // %4
  935. "m"(kMaskRB10), // %5
  936. "m"(kMaskAG10), // %6
  937. "m"(kMulAG10) // %7
  938. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  939. }
  940. #endif
  941. #ifdef HAS_ARGBTOYROW_SSSE3
  942. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  943. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  944. asm volatile(
  945. "movdqa %3,%%xmm4 \n"
  946. "movdqa %4,%%xmm5 \n"
  947. LABELALIGN
  948. "1: \n"
  949. "movdqu (%0),%%xmm0 \n"
  950. "movdqu 0x10(%0),%%xmm1 \n"
  951. "movdqu 0x20(%0),%%xmm2 \n"
  952. "movdqu 0x30(%0),%%xmm3 \n"
  953. "pmaddubsw %%xmm4,%%xmm0 \n"
  954. "pmaddubsw %%xmm4,%%xmm1 \n"
  955. "pmaddubsw %%xmm4,%%xmm2 \n"
  956. "pmaddubsw %%xmm4,%%xmm3 \n"
  957. "lea 0x40(%0),%0 \n"
  958. "phaddw %%xmm1,%%xmm0 \n"
  959. "phaddw %%xmm3,%%xmm2 \n"
  960. "psrlw $0x7,%%xmm0 \n"
  961. "psrlw $0x7,%%xmm2 \n"
  962. "packuswb %%xmm2,%%xmm0 \n"
  963. "paddb %%xmm5,%%xmm0 \n"
  964. "movdqu %%xmm0,(%1) \n"
  965. "lea 0x10(%1),%1 \n"
  966. "sub $0x10,%2 \n"
  967. "jg 1b \n"
  968. : "+r"(src_argb), // %0
  969. "+r"(dst_y), // %1
  970. "+r"(width) // %2
  971. : "m"(kARGBToY), // %3
  972. "m"(kAddY16) // %4
  973. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  974. }
  975. #endif // HAS_ARGBTOYROW_SSSE3
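// For reference only: a scalar sketch of the per-pixel math the SSSE3/AVX2
// kernels above perform with pmaddubsw, phaddw, a shift by 7 and an add of 16.
// The weights shown assume the usual BT.601 studio-swing coefficients; the
// SIMD path uses the kARGBToY constants at 7 bit precision, so results may
// differ by a small rounding amount.  The function name is illustrative and
// the block is compiled out.
#if 0
static uint8_t ScalarARGBToY_Sketch(uint32_t argb) {
  uint32_t b = (argb >> 0) & 0xff;
  uint32_t g = (argb >> 8) & 0xff;
  uint32_t r = (argb >> 16) & 0xff;
  // 0x1080 = (16 << 8) for the luma offset plus 128 for rounding.
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}
#endif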
  976. #ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different coefficients, no +16 offset, and
// rounding (a bias of 64) applied before the shift.
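// e.g. a pixel whose weighted sum is 0x1FC0 (63.5 with 7 fractional bits)
// becomes 64 after the +64 bias and >>7, where truncation alone would give 63.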
  979. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  980. asm volatile(
  981. "movdqa %3,%%xmm4 \n"
  982. "movdqa %4,%%xmm5 \n"
  983. LABELALIGN
  984. "1: \n"
  985. "movdqu (%0),%%xmm0 \n"
  986. "movdqu 0x10(%0),%%xmm1 \n"
  987. "movdqu 0x20(%0),%%xmm2 \n"
  988. "movdqu 0x30(%0),%%xmm3 \n"
  989. "pmaddubsw %%xmm4,%%xmm0 \n"
  990. "pmaddubsw %%xmm4,%%xmm1 \n"
  991. "pmaddubsw %%xmm4,%%xmm2 \n"
  992. "pmaddubsw %%xmm4,%%xmm3 \n"
  993. "lea 0x40(%0),%0 \n"
  994. "phaddw %%xmm1,%%xmm0 \n"
  995. "phaddw %%xmm3,%%xmm2 \n"
  996. "paddw %%xmm5,%%xmm0 \n"
  997. "paddw %%xmm5,%%xmm2 \n"
  998. "psrlw $0x7,%%xmm0 \n"
  999. "psrlw $0x7,%%xmm2 \n"
  1000. "packuswb %%xmm2,%%xmm0 \n"
  1001. "movdqu %%xmm0,(%1) \n"
  1002. "lea 0x10(%1),%1 \n"
  1003. "sub $0x10,%2 \n"
  1004. "jg 1b \n"
  1005. : "+r"(src_argb), // %0
  1006. "+r"(dst_y), // %1
  1007. "+r"(width) // %2
  1008. : "m"(kARGBToYJ), // %3
  1009. "m"(kAddYJ64) // %4
  1010. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1011. }
  1012. #endif // HAS_ARGBTOYJROW_SSSE3
  1013. #ifdef HAS_ARGBTOYROW_AVX2
// vpermd permutation to restore dword order after vphaddw + vpackuswb, which
// mutate within 128 bit lanes.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
  1016. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1017. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1018. asm volatile(
  1019. "vbroadcastf128 %3,%%ymm4 \n"
  1020. "vbroadcastf128 %4,%%ymm5 \n"
  1021. "vmovdqu %5,%%ymm6 \n"
  1022. LABELALIGN
  1023. "1: \n"
  1024. "vmovdqu (%0),%%ymm0 \n"
  1025. "vmovdqu 0x20(%0),%%ymm1 \n"
  1026. "vmovdqu 0x40(%0),%%ymm2 \n"
  1027. "vmovdqu 0x60(%0),%%ymm3 \n"
  1028. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  1029. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  1030. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  1031. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  1032. "lea 0x80(%0),%0 \n"
  1033. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  1034. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  1035. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  1036. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  1037. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  1038. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  1039. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
  1040. "vmovdqu %%ymm0,(%1) \n"
  1041. "lea 0x20(%1),%1 \n"
  1042. "sub $0x20,%2 \n"
  1043. "jg 1b \n"
  1044. "vzeroupper \n"
  1045. : "+r"(src_argb), // %0
  1046. "+r"(dst_y), // %1
  1047. "+r"(width) // %2
  1048. : "m"(kARGBToY), // %3
  1049. "m"(kAddY16), // %4
  1050. "m"(kPermdARGBToY_AVX) // %5
  1051. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1052. }
  1053. #endif // HAS_ARGBTOYROW_AVX2
  1054. #ifdef HAS_ABGRTOYROW_AVX2
  1055. // Convert 32 ABGR pixels (128 bytes) to 32 Y values.
  1056. void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  1057. asm volatile(
  1058. "vbroadcastf128 %3,%%ymm4 \n"
  1059. "vbroadcastf128 %4,%%ymm5 \n"
  1060. "vmovdqu %5,%%ymm6 \n"
  1061. LABELALIGN
  1062. "1: \n"
  1063. "vmovdqu (%0),%%ymm0 \n"
  1064. "vmovdqu 0x20(%0),%%ymm1 \n"
  1065. "vmovdqu 0x40(%0),%%ymm2 \n"
  1066. "vmovdqu 0x60(%0),%%ymm3 \n"
  1067. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  1068. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  1069. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  1070. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  1071. "lea 0x80(%0),%0 \n"
  1072. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  1073. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  1074. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  1075. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  1076. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  1077. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  1078. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
  1079. "vmovdqu %%ymm0,(%1) \n"
  1080. "lea 0x20(%1),%1 \n"
  1081. "sub $0x20,%2 \n"
  1082. "jg 1b \n"
  1083. "vzeroupper \n"
  1084. : "+r"(src_abgr), // %0
  1085. "+r"(dst_y), // %1
  1086. "+r"(width) // %2
  1087. : "m"(kABGRToY), // %3
  1088. "m"(kAddY16), // %4
  1089. "m"(kPermdARGBToY_AVX) // %5
  1090. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1091. }
  1092. #endif // HAS_ABGRTOYROW_AVX2
  1093. #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
  1095. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1096. asm volatile(
  1097. "vbroadcastf128 %3,%%ymm4 \n"
  1098. "vbroadcastf128 %4,%%ymm5 \n"
  1099. "vmovdqu %5,%%ymm6 \n"
  1100. LABELALIGN
  1101. "1: \n"
  1102. "vmovdqu (%0),%%ymm0 \n"
  1103. "vmovdqu 0x20(%0),%%ymm1 \n"
  1104. "vmovdqu 0x40(%0),%%ymm2 \n"
  1105. "vmovdqu 0x60(%0),%%ymm3 \n"
  1106. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  1107. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  1108. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  1109. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  1110. "lea 0x80(%0),%0 \n"
  1111. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  1112. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  1113. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
  1114. "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
  1115. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  1116. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  1117. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  1118. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  1119. "vmovdqu %%ymm0,(%1) \n"
  1120. "lea 0x20(%1),%1 \n"
  1121. "sub $0x20,%2 \n"
  1122. "jg 1b \n"
  1123. "vzeroupper \n"
  1124. : "+r"(src_argb), // %0
  1125. "+r"(dst_y), // %1
  1126. "+r"(width) // %2
  1127. : "m"(kARGBToYJ), // %3
  1128. "m"(kAddYJ64), // %4
  1129. "m"(kPermdARGBToY_AVX) // %5
  1130. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1131. }
  1132. #endif // HAS_ARGBTOYJROW_AVX2
  1133. #ifdef HAS_ARGBTOUVROW_SSSE3
  1134. void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
  1135. int src_stride_argb,
  1136. uint8_t* dst_u,
  1137. uint8_t* dst_v,
  1138. int width) {
  1139. asm volatile(
  1140. "movdqa %5,%%xmm3 \n"
  1141. "movdqa %6,%%xmm4 \n"
  1142. "movdqa %7,%%xmm5 \n"
  1143. "sub %1,%2 \n"
  1144. LABELALIGN
  1145. "1: \n"
  1146. "movdqu (%0),%%xmm0 \n"
  1147. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1148. "pavgb %%xmm7,%%xmm0 \n"
  1149. "movdqu 0x10(%0),%%xmm1 \n"
  1150. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1151. "pavgb %%xmm7,%%xmm1 \n"
  1152. "movdqu 0x20(%0),%%xmm2 \n"
  1153. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1154. "pavgb %%xmm7,%%xmm2 \n"
  1155. "movdqu 0x30(%0),%%xmm6 \n"
  1156. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1157. "pavgb %%xmm7,%%xmm6 \n"
  1158. "lea 0x40(%0),%0 \n"
  1159. "movdqa %%xmm0,%%xmm7 \n"
  1160. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1161. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1162. "pavgb %%xmm7,%%xmm0 \n"
  1163. "movdqa %%xmm2,%%xmm7 \n"
  1164. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1165. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1166. "pavgb %%xmm7,%%xmm2 \n"
  1167. "movdqa %%xmm0,%%xmm1 \n"
  1168. "movdqa %%xmm2,%%xmm6 \n"
  1169. "pmaddubsw %%xmm4,%%xmm0 \n"
  1170. "pmaddubsw %%xmm4,%%xmm2 \n"
  1171. "pmaddubsw %%xmm3,%%xmm1 \n"
  1172. "pmaddubsw %%xmm3,%%xmm6 \n"
  1173. "phaddw %%xmm2,%%xmm0 \n"
  1174. "phaddw %%xmm6,%%xmm1 \n"
  1175. "psraw $0x8,%%xmm0 \n"
  1176. "psraw $0x8,%%xmm1 \n"
  1177. "packsswb %%xmm1,%%xmm0 \n"
  1178. "paddb %%xmm5,%%xmm0 \n"
  1179. "movlps %%xmm0,(%1) \n"
  1180. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1181. "lea 0x8(%1),%1 \n"
  1182. "sub $0x10,%3 \n"
  1183. "jg 1b \n"
  1184. : "+r"(src_argb0), // %0
  1185. "+r"(dst_u), // %1
  1186. "+r"(dst_v), // %2
  1187. "+rm"(width) // %3
  1188. : "r"((intptr_t)(src_stride_argb)), // %4
  1189. "m"(kARGBToV), // %5
  1190. "m"(kARGBToU), // %6
  1191. "m"(kAddUV128) // %7
  1192. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1193. }
  1194. #endif // HAS_ARGBTOUVROW_SSSE3
  1195. #ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb table to restore 16 bit UV pair order after vphaddw + vpacksswb,
// which mutate within 128 bit lanes.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
  1200. void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
  1201. int src_stride_argb,
  1202. uint8_t* dst_u,
  1203. uint8_t* dst_v,
  1204. int width) {
  1205. asm volatile(
  1206. "vbroadcastf128 %5,%%ymm5 \n"
  1207. "vbroadcastf128 %6,%%ymm6 \n"
  1208. "vbroadcastf128 %7,%%ymm7 \n"
  1209. "sub %1,%2 \n"
  1210. LABELALIGN
  1211. "1: \n"
  1212. "vmovdqu (%0),%%ymm0 \n"
  1213. "vmovdqu 0x20(%0),%%ymm1 \n"
  1214. "vmovdqu 0x40(%0),%%ymm2 \n"
  1215. "vmovdqu 0x60(%0),%%ymm3 \n"
  1216. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1217. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1218. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1219. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1220. "lea 0x80(%0),%0 \n"
  1221. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1222. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1223. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1224. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1225. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1226. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1227. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1228. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1229. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1230. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1231. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1232. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1233. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1234. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1235. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1236. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1237. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1238. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  1239. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1240. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1241. "lea 0x10(%1),%1 \n"
  1242. "sub $0x20,%3 \n"
  1243. "jg 1b \n"
  1244. "vzeroupper \n"
  1245. : "+r"(src_argb0), // %0
  1246. "+r"(dst_u), // %1
  1247. "+r"(dst_v), // %2
  1248. "+rm"(width) // %3
  1249. : "r"((intptr_t)(src_stride_argb)), // %4
  1250. "m"(kAddUV128), // %5
  1251. "m"(kARGBToV), // %6
  1252. "m"(kARGBToU), // %7
  1253. "m"(kShufARGBToUV_AVX) // %8
  1254. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1255. "xmm7");
  1256. }
  1257. #endif // HAS_ARGBTOUVROW_AVX2
  1258. #ifdef HAS_ABGRTOUVROW_AVX2
  1259. void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
  1260. int src_stride_abgr,
  1261. uint8_t* dst_u,
  1262. uint8_t* dst_v,
  1263. int width) {
  1264. asm volatile(
  1265. "vbroadcastf128 %5,%%ymm5 \n"
  1266. "vbroadcastf128 %6,%%ymm6 \n"
  1267. "vbroadcastf128 %7,%%ymm7 \n"
  1268. "sub %1,%2 \n"
  1269. LABELALIGN
  1270. "1: \n"
  1271. "vmovdqu (%0),%%ymm0 \n"
  1272. "vmovdqu 0x20(%0),%%ymm1 \n"
  1273. "vmovdqu 0x40(%0),%%ymm2 \n"
  1274. "vmovdqu 0x60(%0),%%ymm3 \n"
  1275. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1276. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1277. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1278. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1279. "lea 0x80(%0),%0 \n"
  1280. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1281. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1282. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1283. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1284. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1285. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1286. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1287. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1288. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1289. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1290. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1291. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1292. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1293. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1294. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1295. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1296. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1297. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  1298. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1299. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1300. "lea 0x10(%1),%1 \n"
  1301. "sub $0x20,%3 \n"
  1302. "jg 1b \n"
  1303. "vzeroupper \n"
  1304. : "+r"(src_abgr0), // %0
  1305. "+r"(dst_u), // %1
  1306. "+r"(dst_v), // %2
  1307. "+rm"(width) // %3
  1308. : "r"((intptr_t)(src_stride_abgr)), // %4
  1309. "m"(kAddUV128), // %5
  1310. "m"(kABGRToV), // %6
  1311. "m"(kABGRToU), // %7
  1312. "m"(kShufARGBToUV_AVX) // %8
  1313. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1314. "xmm7");
  1315. }
  1316. #endif // HAS_ABGRTOUVROW_AVX2
  1317. #ifdef HAS_ARGBTOUVJROW_AVX2
  1318. void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
  1319. int src_stride_argb,
  1320. uint8_t* dst_u,
  1321. uint8_t* dst_v,
  1322. int width) {
  1323. asm volatile(
  1324. "vbroadcastf128 %5,%%ymm5 \n"
  1325. "vbroadcastf128 %6,%%ymm6 \n"
  1326. "vbroadcastf128 %7,%%ymm7 \n"
  1327. "sub %1,%2 \n"
  1328. LABELALIGN
  1329. "1: \n"
  1330. "vmovdqu (%0),%%ymm0 \n"
  1331. "vmovdqu 0x20(%0),%%ymm1 \n"
  1332. "vmovdqu 0x40(%0),%%ymm2 \n"
  1333. "vmovdqu 0x60(%0),%%ymm3 \n"
  1334. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1335. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1336. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1337. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1338. "lea 0x80(%0),%0 \n"
  1339. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1340. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1341. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1342. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1343. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1344. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1345. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1346. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1347. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1348. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1349. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1350. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1351. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
  1352. "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
  1353. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1354. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1355. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1356. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1357. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1358. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1359. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1360. "lea 0x10(%1),%1 \n"
  1361. "sub $0x20,%3 \n"
  1362. "jg 1b \n"
  1363. "vzeroupper \n"
  1364. : "+r"(src_argb0), // %0
  1365. "+r"(dst_u), // %1
  1366. "+r"(dst_v), // %2
  1367. "+rm"(width) // %3
  1368. : "r"((intptr_t)(src_stride_argb)), // %4
  1369. "m"(kAddUVJ128), // %5
  1370. "m"(kARGBToVJ), // %6
  1371. "m"(kARGBToUJ), // %7
  1372. "m"(kShufARGBToUV_AVX) // %8
  1373. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1374. "xmm7");
  1375. }
  1376. #endif // HAS_ARGBTOUVJROW_AVX2
  1377. #ifdef HAS_ARGBTOUVJROW_SSSE3
  1378. void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
  1379. int src_stride_argb,
  1380. uint8_t* dst_u,
  1381. uint8_t* dst_v,
  1382. int width) {
  1383. asm volatile(
  1384. "movdqa %5,%%xmm3 \n"
  1385. "movdqa %6,%%xmm4 \n"
  1386. "movdqa %7,%%xmm5 \n"
  1387. "sub %1,%2 \n"
  1388. LABELALIGN
  1389. "1: \n"
  1390. "movdqu (%0),%%xmm0 \n"
  1391. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1392. "pavgb %%xmm7,%%xmm0 \n"
  1393. "movdqu 0x10(%0),%%xmm1 \n"
  1394. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1395. "pavgb %%xmm7,%%xmm1 \n"
  1396. "movdqu 0x20(%0),%%xmm2 \n"
  1397. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1398. "pavgb %%xmm7,%%xmm2 \n"
  1399. "movdqu 0x30(%0),%%xmm6 \n"
  1400. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1401. "pavgb %%xmm7,%%xmm6 \n"
  1402. "lea 0x40(%0),%0 \n"
  1403. "movdqa %%xmm0,%%xmm7 \n"
  1404. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1405. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1406. "pavgb %%xmm7,%%xmm0 \n"
  1407. "movdqa %%xmm2,%%xmm7 \n"
  1408. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1409. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1410. "pavgb %%xmm7,%%xmm2 \n"
  1411. "movdqa %%xmm0,%%xmm1 \n"
  1412. "movdqa %%xmm2,%%xmm6 \n"
  1413. "pmaddubsw %%xmm4,%%xmm0 \n"
  1414. "pmaddubsw %%xmm4,%%xmm2 \n"
  1415. "pmaddubsw %%xmm3,%%xmm1 \n"
  1416. "pmaddubsw %%xmm3,%%xmm6 \n"
  1417. "phaddw %%xmm2,%%xmm0 \n"
  1418. "phaddw %%xmm6,%%xmm1 \n"
  1419. "paddw %%xmm5,%%xmm0 \n"
  1420. "paddw %%xmm5,%%xmm1 \n"
  1421. "psraw $0x8,%%xmm0 \n"
  1422. "psraw $0x8,%%xmm1 \n"
  1423. "packsswb %%xmm1,%%xmm0 \n"
  1424. "movlps %%xmm0,(%1) \n"
  1425. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1426. "lea 0x8(%1),%1 \n"
  1427. "sub $0x10,%3 \n"
  1428. "jg 1b \n"
  1429. : "+r"(src_argb0), // %0
  1430. "+r"(dst_u), // %1
  1431. "+r"(dst_v), // %2
  1432. "+rm"(width) // %3
  1433. : "r"((intptr_t)(src_stride_argb)), // %4
  1434. "m"(kARGBToVJ), // %5
  1435. "m"(kARGBToUJ), // %6
  1436. "m"(kAddUVJ128) // %7
  1437. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1438. }
  1439. #endif // HAS_ARGBTOUVJROW_SSSE3
  1440. #ifdef HAS_ARGBTOUV444ROW_SSSE3
  1441. void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
  1442. uint8_t* dst_u,
  1443. uint8_t* dst_v,
  1444. int width) {
  1445. asm volatile(
  1446. "movdqa %4,%%xmm3 \n"
  1447. "movdqa %5,%%xmm4 \n"
  1448. "movdqa %6,%%xmm5 \n"
  1449. "sub %1,%2 \n"
  1450. LABELALIGN
  1451. "1: \n"
  1452. "movdqu (%0),%%xmm0 \n"
  1453. "movdqu 0x10(%0),%%xmm1 \n"
  1454. "movdqu 0x20(%0),%%xmm2 \n"
  1455. "movdqu 0x30(%0),%%xmm6 \n"
  1456. "pmaddubsw %%xmm4,%%xmm0 \n"
  1457. "pmaddubsw %%xmm4,%%xmm1 \n"
  1458. "pmaddubsw %%xmm4,%%xmm2 \n"
  1459. "pmaddubsw %%xmm4,%%xmm6 \n"
  1460. "phaddw %%xmm1,%%xmm0 \n"
  1461. "phaddw %%xmm6,%%xmm2 \n"
  1462. "psraw $0x8,%%xmm0 \n"
  1463. "psraw $0x8,%%xmm2 \n"
  1464. "packsswb %%xmm2,%%xmm0 \n"
  1465. "paddb %%xmm5,%%xmm0 \n"
  1466. "movdqu %%xmm0,(%1) \n"
  1467. "movdqu (%0),%%xmm0 \n"
  1468. "movdqu 0x10(%0),%%xmm1 \n"
  1469. "movdqu 0x20(%0),%%xmm2 \n"
  1470. "movdqu 0x30(%0),%%xmm6 \n"
  1471. "pmaddubsw %%xmm3,%%xmm0 \n"
  1472. "pmaddubsw %%xmm3,%%xmm1 \n"
  1473. "pmaddubsw %%xmm3,%%xmm2 \n"
  1474. "pmaddubsw %%xmm3,%%xmm6 \n"
  1475. "phaddw %%xmm1,%%xmm0 \n"
  1476. "phaddw %%xmm6,%%xmm2 \n"
  1477. "psraw $0x8,%%xmm0 \n"
  1478. "psraw $0x8,%%xmm2 \n"
  1479. "packsswb %%xmm2,%%xmm0 \n"
  1480. "paddb %%xmm5,%%xmm0 \n"
  1481. "lea 0x40(%0),%0 \n"
  1482. "movdqu %%xmm0,0x00(%1,%2,1) \n"
  1483. "lea 0x10(%1),%1 \n"
  1484. "sub $0x10,%3 \n"
  1485. "jg 1b \n"
  1486. : "+r"(src_argb), // %0
  1487. "+r"(dst_u), // %1
  1488. "+r"(dst_v), // %2
  1489. "+rm"(width) // %3
  1490. : "m"(kARGBToV), // %4
  1491. "m"(kARGBToU), // %5
  1492. "m"(kAddUV128) // %6
  1493. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
  1494. }
  1495. #endif // HAS_ARGBTOUV444ROW_SSSE3
  1496. void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  1497. asm volatile(
  1498. "movdqa %4,%%xmm5 \n"
  1499. "movdqa %3,%%xmm4 \n"
  1500. LABELALIGN
  1501. "1: \n"
  1502. "movdqu (%0),%%xmm0 \n"
  1503. "movdqu 0x10(%0),%%xmm1 \n"
  1504. "movdqu 0x20(%0),%%xmm2 \n"
  1505. "movdqu 0x30(%0),%%xmm3 \n"
  1506. "pmaddubsw %%xmm4,%%xmm0 \n"
  1507. "pmaddubsw %%xmm4,%%xmm1 \n"
  1508. "pmaddubsw %%xmm4,%%xmm2 \n"
  1509. "pmaddubsw %%xmm4,%%xmm3 \n"
  1510. "lea 0x40(%0),%0 \n"
  1511. "phaddw %%xmm1,%%xmm0 \n"
  1512. "phaddw %%xmm3,%%xmm2 \n"
  1513. "psrlw $0x7,%%xmm0 \n"
  1514. "psrlw $0x7,%%xmm2 \n"
  1515. "packuswb %%xmm2,%%xmm0 \n"
  1516. "paddb %%xmm5,%%xmm0 \n"
  1517. "movdqu %%xmm0,(%1) \n"
  1518. "lea 0x10(%1),%1 \n"
  1519. "sub $0x10,%2 \n"
  1520. "jg 1b \n"
  1521. : "+r"(src_bgra), // %0
  1522. "+r"(dst_y), // %1
  1523. "+r"(width) // %2
  1524. : "m"(kBGRAToY), // %3
  1525. "m"(kAddY16) // %4
  1526. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1527. }
  1528. void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
  1529. int src_stride_bgra,
  1530. uint8_t* dst_u,
  1531. uint8_t* dst_v,
  1532. int width) {
  1533. asm volatile(
  1534. "movdqa %5,%%xmm3 \n"
  1535. "movdqa %6,%%xmm4 \n"
  1536. "movdqa %7,%%xmm5 \n"
  1537. "sub %1,%2 \n"
  1538. LABELALIGN
  1539. "1: \n"
  1540. "movdqu (%0),%%xmm0 \n"
  1541. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1542. "pavgb %%xmm7,%%xmm0 \n"
  1543. "movdqu 0x10(%0),%%xmm1 \n"
  1544. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1545. "pavgb %%xmm7,%%xmm1 \n"
  1546. "movdqu 0x20(%0),%%xmm2 \n"
  1547. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1548. "pavgb %%xmm7,%%xmm2 \n"
  1549. "movdqu 0x30(%0),%%xmm6 \n"
  1550. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1551. "pavgb %%xmm7,%%xmm6 \n"
  1552. "lea 0x40(%0),%0 \n"
  1553. "movdqa %%xmm0,%%xmm7 \n"
  1554. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1555. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1556. "pavgb %%xmm7,%%xmm0 \n"
  1557. "movdqa %%xmm2,%%xmm7 \n"
  1558. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1559. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1560. "pavgb %%xmm7,%%xmm2 \n"
  1561. "movdqa %%xmm0,%%xmm1 \n"
  1562. "movdqa %%xmm2,%%xmm6 \n"
  1563. "pmaddubsw %%xmm4,%%xmm0 \n"
  1564. "pmaddubsw %%xmm4,%%xmm2 \n"
  1565. "pmaddubsw %%xmm3,%%xmm1 \n"
  1566. "pmaddubsw %%xmm3,%%xmm6 \n"
  1567. "phaddw %%xmm2,%%xmm0 \n"
  1568. "phaddw %%xmm6,%%xmm1 \n"
  1569. "psraw $0x8,%%xmm0 \n"
  1570. "psraw $0x8,%%xmm1 \n"
  1571. "packsswb %%xmm1,%%xmm0 \n"
  1572. "paddb %%xmm5,%%xmm0 \n"
  1573. "movlps %%xmm0,(%1) \n"
  1574. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1575. "lea 0x8(%1),%1 \n"
  1576. "sub $0x10,%3 \n"
  1577. "jg 1b \n"
  1578. : "+r"(src_bgra0), // %0
  1579. "+r"(dst_u), // %1
  1580. "+r"(dst_v), // %2
  1581. "+rm"(width) // %3
  1582. : "r"((intptr_t)(src_stride_bgra)), // %4
  1583. "m"(kBGRAToV), // %5
  1584. "m"(kBGRAToU), // %6
  1585. "m"(kAddUV128) // %7
  1586. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1587. }
  1588. void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  1589. asm volatile(
  1590. "movdqa %4,%%xmm5 \n"
  1591. "movdqa %3,%%xmm4 \n"
  1592. LABELALIGN
  1593. "1: \n"
  1594. "movdqu (%0),%%xmm0 \n"
  1595. "movdqu 0x10(%0),%%xmm1 \n"
  1596. "movdqu 0x20(%0),%%xmm2 \n"
  1597. "movdqu 0x30(%0),%%xmm3 \n"
  1598. "pmaddubsw %%xmm4,%%xmm0 \n"
  1599. "pmaddubsw %%xmm4,%%xmm1 \n"
  1600. "pmaddubsw %%xmm4,%%xmm2 \n"
  1601. "pmaddubsw %%xmm4,%%xmm3 \n"
  1602. "lea 0x40(%0),%0 \n"
  1603. "phaddw %%xmm1,%%xmm0 \n"
  1604. "phaddw %%xmm3,%%xmm2 \n"
  1605. "psrlw $0x7,%%xmm0 \n"
  1606. "psrlw $0x7,%%xmm2 \n"
  1607. "packuswb %%xmm2,%%xmm0 \n"
  1608. "paddb %%xmm5,%%xmm0 \n"
  1609. "movdqu %%xmm0,(%1) \n"
  1610. "lea 0x10(%1),%1 \n"
  1611. "sub $0x10,%2 \n"
  1612. "jg 1b \n"
  1613. : "+r"(src_abgr), // %0
  1614. "+r"(dst_y), // %1
  1615. "+r"(width) // %2
  1616. : "m"(kABGRToY), // %3
  1617. "m"(kAddY16) // %4
  1618. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1619. }
  1620. void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  1621. asm volatile(
  1622. "movdqa %4,%%xmm5 \n"
  1623. "movdqa %3,%%xmm4 \n"
  1624. LABELALIGN
  1625. "1: \n"
  1626. "movdqu (%0),%%xmm0 \n"
  1627. "movdqu 0x10(%0),%%xmm1 \n"
  1628. "movdqu 0x20(%0),%%xmm2 \n"
  1629. "movdqu 0x30(%0),%%xmm3 \n"
  1630. "pmaddubsw %%xmm4,%%xmm0 \n"
  1631. "pmaddubsw %%xmm4,%%xmm1 \n"
  1632. "pmaddubsw %%xmm4,%%xmm2 \n"
  1633. "pmaddubsw %%xmm4,%%xmm3 \n"
  1634. "lea 0x40(%0),%0 \n"
  1635. "phaddw %%xmm1,%%xmm0 \n"
  1636. "phaddw %%xmm3,%%xmm2 \n"
  1637. "psrlw $0x7,%%xmm0 \n"
  1638. "psrlw $0x7,%%xmm2 \n"
  1639. "packuswb %%xmm2,%%xmm0 \n"
  1640. "paddb %%xmm5,%%xmm0 \n"
  1641. "movdqu %%xmm0,(%1) \n"
  1642. "lea 0x10(%1),%1 \n"
  1643. "sub $0x10,%2 \n"
  1644. "jg 1b \n"
  1645. : "+r"(src_rgba), // %0
  1646. "+r"(dst_y), // %1
  1647. "+r"(width) // %2
  1648. : "m"(kRGBAToY), // %3
  1649. "m"(kAddY16) // %4
  1650. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1651. }
  1652. void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
  1653. int src_stride_abgr,
  1654. uint8_t* dst_u,
  1655. uint8_t* dst_v,
  1656. int width) {
  1657. asm volatile(
  1658. "movdqa %5,%%xmm3 \n"
  1659. "movdqa %6,%%xmm4 \n"
  1660. "movdqa %7,%%xmm5 \n"
  1661. "sub %1,%2 \n"
  1662. LABELALIGN
  1663. "1: \n"
  1664. "movdqu (%0),%%xmm0 \n"
  1665. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1666. "pavgb %%xmm7,%%xmm0 \n"
  1667. "movdqu 0x10(%0),%%xmm1 \n"
  1668. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1669. "pavgb %%xmm7,%%xmm1 \n"
  1670. "movdqu 0x20(%0),%%xmm2 \n"
  1671. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1672. "pavgb %%xmm7,%%xmm2 \n"
  1673. "movdqu 0x30(%0),%%xmm6 \n"
  1674. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1675. "pavgb %%xmm7,%%xmm6 \n"
  1676. "lea 0x40(%0),%0 \n"
  1677. "movdqa %%xmm0,%%xmm7 \n"
  1678. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1679. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1680. "pavgb %%xmm7,%%xmm0 \n"
  1681. "movdqa %%xmm2,%%xmm7 \n"
  1682. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1683. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1684. "pavgb %%xmm7,%%xmm2 \n"
  1685. "movdqa %%xmm0,%%xmm1 \n"
  1686. "movdqa %%xmm2,%%xmm6 \n"
  1687. "pmaddubsw %%xmm4,%%xmm0 \n"
  1688. "pmaddubsw %%xmm4,%%xmm2 \n"
  1689. "pmaddubsw %%xmm3,%%xmm1 \n"
  1690. "pmaddubsw %%xmm3,%%xmm6 \n"
  1691. "phaddw %%xmm2,%%xmm0 \n"
  1692. "phaddw %%xmm6,%%xmm1 \n"
  1693. "psraw $0x8,%%xmm0 \n"
  1694. "psraw $0x8,%%xmm1 \n"
  1695. "packsswb %%xmm1,%%xmm0 \n"
  1696. "paddb %%xmm5,%%xmm0 \n"
  1697. "movlps %%xmm0,(%1) \n"
  1698. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1699. "lea 0x8(%1),%1 \n"
  1700. "sub $0x10,%3 \n"
  1701. "jg 1b \n"
  1702. : "+r"(src_abgr0), // %0
  1703. "+r"(dst_u), // %1
  1704. "+r"(dst_v), // %2
  1705. "+rm"(width) // %3
  1706. : "r"((intptr_t)(src_stride_abgr)), // %4
  1707. "m"(kABGRToV), // %5
  1708. "m"(kABGRToU), // %6
  1709. "m"(kAddUV128) // %7
  1710. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1711. }
  1712. void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
  1713. int src_stride_rgba,
  1714. uint8_t* dst_u,
  1715. uint8_t* dst_v,
  1716. int width) {
  1717. asm volatile(
  1718. "movdqa %5,%%xmm3 \n"
  1719. "movdqa %6,%%xmm4 \n"
  1720. "movdqa %7,%%xmm5 \n"
  1721. "sub %1,%2 \n"
  1722. LABELALIGN
  1723. "1: \n"
  1724. "movdqu (%0),%%xmm0 \n"
  1725. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1726. "pavgb %%xmm7,%%xmm0 \n"
  1727. "movdqu 0x10(%0),%%xmm1 \n"
  1728. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1729. "pavgb %%xmm7,%%xmm1 \n"
  1730. "movdqu 0x20(%0),%%xmm2 \n"
  1731. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1732. "pavgb %%xmm7,%%xmm2 \n"
  1733. "movdqu 0x30(%0),%%xmm6 \n"
  1734. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1735. "pavgb %%xmm7,%%xmm6 \n"
  1736. "lea 0x40(%0),%0 \n"
  1737. "movdqa %%xmm0,%%xmm7 \n"
  1738. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1739. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1740. "pavgb %%xmm7,%%xmm0 \n"
  1741. "movdqa %%xmm2,%%xmm7 \n"
  1742. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1743. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1744. "pavgb %%xmm7,%%xmm2 \n"
  1745. "movdqa %%xmm0,%%xmm1 \n"
  1746. "movdqa %%xmm2,%%xmm6 \n"
  1747. "pmaddubsw %%xmm4,%%xmm0 \n"
  1748. "pmaddubsw %%xmm4,%%xmm2 \n"
  1749. "pmaddubsw %%xmm3,%%xmm1 \n"
  1750. "pmaddubsw %%xmm3,%%xmm6 \n"
  1751. "phaddw %%xmm2,%%xmm0 \n"
  1752. "phaddw %%xmm6,%%xmm1 \n"
  1753. "psraw $0x8,%%xmm0 \n"
  1754. "psraw $0x8,%%xmm1 \n"
  1755. "packsswb %%xmm1,%%xmm0 \n"
  1756. "paddb %%xmm5,%%xmm0 \n"
  1757. "movlps %%xmm0,(%1) \n"
  1758. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1759. "lea 0x8(%1),%1 \n"
  1760. "sub $0x10,%3 \n"
  1761. "jg 1b \n"
  1762. : "+r"(src_rgba0), // %0
  1763. "+r"(dst_u), // %1
  1764. "+r"(dst_v), // %2
  1765. "+rm"(width) // %3
  1766. : "r"((intptr_t)(src_stride_rgba)), // %4
  1767. "m"(kRGBAToV), // %5
  1768. "m"(kRGBAToU), // %6
  1769. "m"(kAddUV128) // %7
  1770. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1771. }
  1772. #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
  1773. // Read 8 UV from 444
  1774. #define READYUV444 \
  1775. "movq (%[u_buf]),%%xmm0 \n" \
  1776. "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1777. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  1778. "punpcklbw %%xmm1,%%xmm0 \n" \
  1779. "movq (%[y_buf]),%%xmm4 \n" \
  1780. "punpcklbw %%xmm4,%%xmm4 \n" \
  1781. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1782. // Read 4 UV from 422, upsample to 8 UV
  1783. #define READYUV422 \
  1784. "movd (%[u_buf]),%%xmm0 \n" \
  1785. "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1786. "lea 0x4(%[u_buf]),%[u_buf] \n" \
  1787. "punpcklbw %%xmm1,%%xmm0 \n" \
  1788. "punpcklwd %%xmm0,%%xmm0 \n" \
  1789. "movq (%[y_buf]),%%xmm4 \n" \
  1790. "punpcklbw %%xmm4,%%xmm4 \n" \
  1791. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1792. // Read 4 UV from 422 10 bit, upsample to 8 UV
  1793. // TODO(fbarchard): Consider shufb to replace pack/unpack
  1794. // TODO(fbarchard): Consider pmulhuw to replace psraw
  1795. // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
  1796. #define READYUV210 \
  1797. "movq (%[u_buf]),%%xmm0 \n" \
  1798. "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1799. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  1800. "punpcklwd %%xmm1,%%xmm0 \n" \
  1801. "psraw $0x2,%%xmm0 \n" \
  1802. "packuswb %%xmm0,%%xmm0 \n" \
  1803. "punpcklwd %%xmm0,%%xmm0 \n" \
  1804. "movdqu (%[y_buf]),%%xmm4 \n" \
  1805. "psllw $0x6,%%xmm4 \n" \
  1806. "lea 0x10(%[y_buf]),%[y_buf] \n"
  1807. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  1808. #define READYUVA422 \
  1809. "movd (%[u_buf]),%%xmm0 \n" \
  1810. "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1811. "lea 0x4(%[u_buf]),%[u_buf] \n" \
  1812. "punpcklbw %%xmm1,%%xmm0 \n" \
  1813. "punpcklwd %%xmm0,%%xmm0 \n" \
  1814. "movq (%[y_buf]),%%xmm4 \n" \
  1815. "punpcklbw %%xmm4,%%xmm4 \n" \
  1816. "lea 0x8(%[y_buf]),%[y_buf] \n" \
  1817. "movq (%[a_buf]),%%xmm5 \n" \
  1818. "lea 0x8(%[a_buf]),%[a_buf] \n"
  1819. // Read 4 UV from NV12, upsample to 8 UV
  1820. #define READNV12 \
  1821. "movq (%[uv_buf]),%%xmm0 \n" \
  1822. "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
  1823. "punpcklwd %%xmm0,%%xmm0 \n" \
  1824. "movq (%[y_buf]),%%xmm4 \n" \
  1825. "punpcklbw %%xmm4,%%xmm4 \n" \
  1826. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1827. // Read 4 VU from NV21, upsample to 8 UV
  1828. #define READNV21 \
  1829. "movq (%[vu_buf]),%%xmm0 \n" \
  1830. "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
  1831. "pshufb %[kShuffleNV21], %%xmm0 \n" \
  1832. "movq (%[y_buf]),%%xmm4 \n" \
  1833. "punpcklbw %%xmm4,%%xmm4 \n" \
  1834. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1835. // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
  1836. #define READYUY2 \
  1837. "movdqu (%[yuy2_buf]),%%xmm4 \n" \
  1838. "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  1839. "movdqu (%[yuy2_buf]),%%xmm0 \n" \
  1840. "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  1841. "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
  1842. // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
  1843. #define READUYVY \
  1844. "movdqu (%[uyvy_buf]),%%xmm4 \n" \
  1845. "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  1846. "movdqu (%[uyvy_buf]),%%xmm0 \n" \
  1847. "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  1848. "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
  1849. #if defined(__x86_64__)
  1850. #define YUVTORGB_SETUP(yuvconstants) \
  1851. "movdqa (%[yuvconstants]),%%xmm8 \n" \
  1852. "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
  1853. "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
  1854. "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
  1855. "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
  1856. "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
  1857. "movdqa 192(%[yuvconstants]),%%xmm14 \n"
  1858. // Convert 8 pixels: 8 UV and 8 Y
  1859. #define YUVTORGB16(yuvconstants) \
  1860. "movdqa %%xmm0,%%xmm1 \n" \
  1861. "movdqa %%xmm0,%%xmm2 \n" \
  1862. "movdqa %%xmm0,%%xmm3 \n" \
  1863. "movdqa %%xmm11,%%xmm0 \n" \
  1864. "pmaddubsw %%xmm8,%%xmm1 \n" \
  1865. "psubw %%xmm1,%%xmm0 \n" \
  1866. "movdqa %%xmm12,%%xmm1 \n" \
  1867. "pmaddubsw %%xmm9,%%xmm2 \n" \
  1868. "psubw %%xmm2,%%xmm1 \n" \
  1869. "movdqa %%xmm13,%%xmm2 \n" \
  1870. "pmaddubsw %%xmm10,%%xmm3 \n" \
  1871. "psubw %%xmm3,%%xmm2 \n" \
  1872. "pmulhuw %%xmm14,%%xmm4 \n" \
  1873. "paddsw %%xmm4,%%xmm0 \n" \
  1874. "paddsw %%xmm4,%%xmm1 \n" \
  1875. "paddsw %%xmm4,%%xmm2 \n"
  1876. #define YUVTORGB_REGS \
  1877. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1878. #else
  1879. #define YUVTORGB_SETUP(yuvconstants)
  1880. // Convert 8 pixels: 8 UV and 8 Y
  1881. #define YUVTORGB16(yuvconstants) \
  1882. "movdqa %%xmm0,%%xmm1 \n" \
  1883. "movdqa %%xmm0,%%xmm2 \n" \
  1884. "movdqa %%xmm0,%%xmm3 \n" \
  1885. "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
  1886. "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
  1887. "psubw %%xmm1,%%xmm0 \n" \
  1888. "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
  1889. "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
  1890. "psubw %%xmm2,%%xmm1 \n" \
  1891. "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
  1892. "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
  1893. "psubw %%xmm3,%%xmm2 \n" \
  1894. "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
  1895. "paddsw %%xmm4,%%xmm0 \n" \
  1896. "paddsw %%xmm4,%%xmm1 \n" \
  1897. "paddsw %%xmm4,%%xmm2 \n"
  1898. #define YUVTORGB_REGS
  1899. #endif
  1900. #define YUVTORGB(yuvconstants) \
  1901. YUVTORGB16(yuvconstants) \
  1902. "psraw $0x6,%%xmm0 \n" \
  1903. "psraw $0x6,%%xmm1 \n" \
  1904. "psraw $0x6,%%xmm2 \n" \
  1905. "packuswb %%xmm0,%%xmm0 \n" \
  1906. "packuswb %%xmm1,%%xmm1 \n" \
  1907. "packuswb %%xmm2,%%xmm2 \n"
  1908. // Store 8 ARGB values.
  1909. #define STOREARGB \
  1910. "punpcklbw %%xmm1,%%xmm0 \n" \
  1911. "punpcklbw %%xmm5,%%xmm2 \n" \
  1912. "movdqa %%xmm0,%%xmm1 \n" \
  1913. "punpcklwd %%xmm2,%%xmm0 \n" \
  1914. "punpckhwd %%xmm2,%%xmm1 \n" \
  1915. "movdqu %%xmm0,(%[dst_argb]) \n" \
  1916. "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
  1917. "lea 0x20(%[dst_argb]), %[dst_argb] \n"
  1918. // Store 8 RGBA values.
  1919. #define STORERGBA \
  1920. "pcmpeqb %%xmm5,%%xmm5 \n" \
  1921. "punpcklbw %%xmm2,%%xmm1 \n" \
  1922. "punpcklbw %%xmm0,%%xmm5 \n" \
  1923. "movdqa %%xmm5,%%xmm0 \n" \
  1924. "punpcklwd %%xmm1,%%xmm5 \n" \
  1925. "punpckhwd %%xmm1,%%xmm0 \n" \
  1926. "movdqu %%xmm5,(%[dst_rgba]) \n" \
  1927. "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
  1928. "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
  1929. // Store 8 AR30 values.
  1930. #define STOREAR30 \
  1931. "psraw $0x4,%%xmm0 \n" \
  1932. "psraw $0x4,%%xmm1 \n" \
  1933. "psraw $0x4,%%xmm2 \n" \
  1934. "pminsw %%xmm7,%%xmm0 \n" \
  1935. "pminsw %%xmm7,%%xmm1 \n" \
  1936. "pminsw %%xmm7,%%xmm2 \n" \
  1937. "pmaxsw %%xmm6,%%xmm0 \n" \
  1938. "pmaxsw %%xmm6,%%xmm1 \n" \
  1939. "pmaxsw %%xmm6,%%xmm2 \n" \
  1940. "psllw $0x4,%%xmm2 \n" \
  1941. "movdqa %%xmm0,%%xmm3 \n" \
  1942. "punpcklwd %%xmm2,%%xmm0 \n" \
  1943. "punpckhwd %%xmm2,%%xmm3 \n" \
  1944. "movdqa %%xmm1,%%xmm2 \n" \
  1945. "punpcklwd %%xmm5,%%xmm1 \n" \
  1946. "punpckhwd %%xmm5,%%xmm2 \n" \
  1947. "pslld $0xa,%%xmm1 \n" \
  1948. "pslld $0xa,%%xmm2 \n" \
  1949. "por %%xmm1,%%xmm0 \n" \
  1950. "por %%xmm2,%%xmm3 \n" \
  1951. "movdqu %%xmm0,(%[dst_ar30]) \n" \
  1952. "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
  1953. "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
  1954. void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
  1955. const uint8_t* u_buf,
  1956. const uint8_t* v_buf,
  1957. uint8_t* dst_argb,
  1958. const struct YuvConstants* yuvconstants,
  1959. int width) {
  1960. asm volatile (
  1961. YUVTORGB_SETUP(yuvconstants)
  1962. "sub %[u_buf],%[v_buf] \n"
  1963. "pcmpeqb %%xmm5,%%xmm5 \n"
  1964. LABELALIGN
  1965. "1: \n"
  1966. READYUV444
  1967. YUVTORGB(yuvconstants)
  1968. STOREARGB
  1969. "sub $0x8,%[width] \n"
  1970. "jg 1b \n"
  1971. : [y_buf]"+r"(y_buf), // %[y_buf]
  1972. [u_buf]"+r"(u_buf), // %[u_buf]
  1973. [v_buf]"+r"(v_buf), // %[v_buf]
  1974. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1975. [width]"+rm"(width) // %[width]
  1976. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1977. : "memory", "cc", YUVTORGB_REGS
  1978. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1979. );
  1980. }
  1981. void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
  1982. const uint8_t* u_buf,
  1983. const uint8_t* v_buf,
  1984. uint8_t* dst_rgb24,
  1985. const struct YuvConstants* yuvconstants,
  1986. int width) {
  1987. asm volatile (
  1988. YUVTORGB_SETUP(yuvconstants)
  1989. "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  1990. "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  1991. "sub %[u_buf],%[v_buf] \n"
  1992. LABELALIGN
  1993. "1: \n"
  1994. READYUV422
  1995. YUVTORGB(yuvconstants)
  1996. "punpcklbw %%xmm1,%%xmm0 \n"
  1997. "punpcklbw %%xmm2,%%xmm2 \n"
  1998. "movdqa %%xmm0,%%xmm1 \n"
  1999. "punpcklwd %%xmm2,%%xmm0 \n"
  2000. "punpckhwd %%xmm2,%%xmm1 \n"
  2001. "pshufb %%xmm5,%%xmm0 \n"
  2002. "pshufb %%xmm6,%%xmm1 \n"
  2003. "palignr $0xc,%%xmm0,%%xmm1 \n"
  2004. "movq %%xmm0,(%[dst_rgb24]) \n"
  2005. "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
  2006. "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
  2007. "subl $0x8,%[width] \n"
  2008. "jg 1b \n"
  2009. : [y_buf]"+r"(y_buf), // %[y_buf]
  2010. [u_buf]"+r"(u_buf), // %[u_buf]
  2011. [v_buf]"+r"(v_buf), // %[v_buf]
  2012. [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
  2013. #if defined(__i386__)
  2014. [width]"+m"(width) // %[width]
  2015. #else
  2016. [width]"+rm"(width) // %[width]
  2017. #endif
  2018. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2019. [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  2020. [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  2021. : "memory", "cc", YUVTORGB_REGS
  2022. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  2023. );
  2024. }
  2025. void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
  2026. const uint8_t* u_buf,
  2027. const uint8_t* v_buf,
  2028. uint8_t* dst_argb,
  2029. const struct YuvConstants* yuvconstants,
  2030. int width) {
  2031. asm volatile (
  2032. YUVTORGB_SETUP(yuvconstants)
  2033. "sub %[u_buf],%[v_buf] \n"
  2034. "pcmpeqb %%xmm5,%%xmm5 \n"
  2035. LABELALIGN
  2036. "1: \n"
  2037. READYUV422
  2038. YUVTORGB(yuvconstants)
  2039. STOREARGB
  2040. "sub $0x8,%[width] \n"
  2041. "jg 1b \n"
  2042. : [y_buf]"+r"(y_buf), // %[y_buf]
  2043. [u_buf]"+r"(u_buf), // %[u_buf]
  2044. [v_buf]"+r"(v_buf), // %[v_buf]
  2045. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2046. [width]"+rm"(width) // %[width]
  2047. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2048. : "memory", "cc", YUVTORGB_REGS
  2049. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2050. );
  2051. }
  2052. void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
  2053. const uint8_t* u_buf,
  2054. const uint8_t* v_buf,
  2055. uint8_t* dst_ar30,
  2056. const struct YuvConstants* yuvconstants,
  2057. int width) {
  2058. asm volatile (
  2059. YUVTORGB_SETUP(yuvconstants)
  2060. "sub %[u_buf],%[v_buf] \n"
  2061. "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
  2062. "psrlw $14,%%xmm5 \n"
  2063. "psllw $4,%%xmm5 \n" // 2 alpha bits
  2064. "pxor %%xmm6,%%xmm6 \n"
  2065. "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
  2066. "psrlw $6,%%xmm7 \n" // 1023 for max
  2067. LABELALIGN
  2068. "1: \n"
  2069. READYUV422
  2070. YUVTORGB16(yuvconstants)
  2071. STOREAR30
  2072. "sub $0x8,%[width] \n"
  2073. "jg 1b \n"
  2074. : [y_buf]"+r"(y_buf), // %[y_buf]
  2075. [u_buf]"+r"(u_buf), // %[u_buf]
  2076. [v_buf]"+r"(v_buf), // %[v_buf]
  2077. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2078. [width]"+rm"(width) // %[width]
  2079. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2080. : "memory", "cc", YUVTORGB_REGS
  2081. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2082. );
  2083. }
  2084. // 10 bit YUV to ARGB
  2085. void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
  2086. const uint16_t* u_buf,
  2087. const uint16_t* v_buf,
  2088. uint8_t* dst_argb,
  2089. const struct YuvConstants* yuvconstants,
  2090. int width) {
  2091. asm volatile (
  2092. YUVTORGB_SETUP(yuvconstants)
  2093. "sub %[u_buf],%[v_buf] \n"
  2094. "pcmpeqb %%xmm5,%%xmm5 \n"
  2095. LABELALIGN
  2096. "1: \n"
  2097. READYUV210
  2098. YUVTORGB(yuvconstants)
  2099. STOREARGB
  2100. "sub $0x8,%[width] \n"
  2101. "jg 1b \n"
  2102. : [y_buf]"+r"(y_buf), // %[y_buf]
  2103. [u_buf]"+r"(u_buf), // %[u_buf]
  2104. [v_buf]"+r"(v_buf), // %[v_buf]
  2105. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2106. [width]"+rm"(width) // %[width]
  2107. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2108. : "memory", "cc", YUVTORGB_REGS
  2109. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2110. );
  2111. }
  2112. // 10 bit YUV to AR30
  2113. void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
  2114. const uint16_t* u_buf,
  2115. const uint16_t* v_buf,
  2116. uint8_t* dst_ar30,
  2117. const struct YuvConstants* yuvconstants,
  2118. int width) {
  2119. asm volatile (
  2120. YUVTORGB_SETUP(yuvconstants)
  2121. "sub %[u_buf],%[v_buf] \n"
  2122. "pcmpeqb %%xmm5,%%xmm5 \n"
  2123. "psrlw $14,%%xmm5 \n"
  2124. "psllw $4,%%xmm5 \n" // 2 alpha bits
  2125. "pxor %%xmm6,%%xmm6 \n"
  2126. "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
  2127. "psrlw $6,%%xmm7 \n" // 1023 for max
  2128. LABELALIGN
  2129. "1: \n"
  2130. READYUV210
  2131. YUVTORGB16(yuvconstants)
  2132. STOREAR30
  2133. "sub $0x8,%[width] \n"
  2134. "jg 1b \n"
  2135. : [y_buf]"+r"(y_buf), // %[y_buf]
  2136. [u_buf]"+r"(u_buf), // %[u_buf]
  2137. [v_buf]"+r"(v_buf), // %[v_buf]
  2138. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2139. [width]"+rm"(width) // %[width]
  2140. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2141. : "memory", "cc", YUVTORGB_REGS
  2142. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2143. );
  2144. }
  2145. #ifdef HAS_I422ALPHATOARGBROW_SSSE3
  2146. void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
  2147. const uint8_t* u_buf,
  2148. const uint8_t* v_buf,
  2149. const uint8_t* a_buf,
  2150. uint8_t* dst_argb,
  2151. const struct YuvConstants* yuvconstants,
  2152. int width) {
  2153. // clang-format off
  2154. asm volatile (
  2155. YUVTORGB_SETUP(yuvconstants)
  2156. "sub %[u_buf],%[v_buf] \n"
  2157. LABELALIGN
  2158. "1: \n"
  2159. READYUVA422
  2160. YUVTORGB(yuvconstants)
  2161. STOREARGB
  2162. "subl $0x8,%[width] \n"
  2163. "jg 1b \n"
  2164. : [y_buf]"+r"(y_buf), // %[y_buf]
  2165. [u_buf]"+r"(u_buf), // %[u_buf]
  2166. [v_buf]"+r"(v_buf), // %[v_buf]
  2167. [a_buf]"+r"(a_buf), // %[a_buf]
  2168. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2169. #if defined(__i386__)
  2170. [width]"+m"(width) // %[width]
  2171. #else
  2172. [width]"+rm"(width) // %[width]
  2173. #endif
  2174. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2175. : "memory", "cc", YUVTORGB_REGS
  2176. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2177. );
  2178. // clang-format on
  2179. }
  2180. #endif // HAS_I422ALPHATOARGBROW_SSSE3
  2181. void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
  2182. const uint8_t* uv_buf,
  2183. uint8_t* dst_argb,
  2184. const struct YuvConstants* yuvconstants,
  2185. int width) {
  2186. // clang-format off
  2187. asm volatile (
  2188. YUVTORGB_SETUP(yuvconstants)
  2189. "pcmpeqb %%xmm5,%%xmm5 \n"
  2190. LABELALIGN
  2191. "1: \n"
  2192. READNV12
  2193. YUVTORGB(yuvconstants)
  2194. STOREARGB
  2195. "sub $0x8,%[width] \n"
  2196. "jg 1b \n"
  2197. : [y_buf]"+r"(y_buf), // %[y_buf]
  2198. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2199. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2200. [width]"+rm"(width) // %[width]
  2201. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2202. : "memory", "cc", YUVTORGB_REGS
  2203. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2204. );
  2205. // clang-format on
  2206. }
  2207. void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
  2208. const uint8_t* vu_buf,
  2209. uint8_t* dst_argb,
  2210. const struct YuvConstants* yuvconstants,
  2211. int width) {
  2212. // clang-format off
  2213. asm volatile (
  2214. YUVTORGB_SETUP(yuvconstants)
  2215. "pcmpeqb %%xmm5,%%xmm5 \n"
  2216. LABELALIGN
  2217. "1: \n"
  2218. READNV21
  2219. YUVTORGB(yuvconstants)
  2220. STOREARGB
  2221. "sub $0x8,%[width] \n"
  2222. "jg 1b \n"
  2223. : [y_buf]"+r"(y_buf), // %[y_buf]
  2224. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2225. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2226. [width]"+rm"(width) // %[width]
  2227. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2228. [kShuffleNV21]"m"(kShuffleNV21)
  2229. : "memory", "cc", YUVTORGB_REGS
  2230. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2231. );
  2232. // clang-format on
  2233. }
  2234. void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
  2235. uint8_t* dst_argb,
  2236. const struct YuvConstants* yuvconstants,
  2237. int width) {
  2238. // clang-format off
  2239. asm volatile (
  2240. YUVTORGB_SETUP(yuvconstants)
  2241. "pcmpeqb %%xmm5,%%xmm5 \n"
  2242. LABELALIGN
  2243. "1: \n"
  2244. READYUY2
  2245. YUVTORGB(yuvconstants)
  2246. STOREARGB
  2247. "sub $0x8,%[width] \n"
  2248. "jg 1b \n"
  2249. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2250. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2251. [width]"+rm"(width) // %[width]
  2252. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2253. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2254. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2255. : "memory", "cc", YUVTORGB_REGS
  2256. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2257. );
  2258. // clang-format on
  2259. }
  2260. void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
  2261. uint8_t* dst_argb,
  2262. const struct YuvConstants* yuvconstants,
  2263. int width) {
  2264. // clang-format off
  2265. asm volatile (
  2266. YUVTORGB_SETUP(yuvconstants)
  2267. "pcmpeqb %%xmm5,%%xmm5 \n"
  2268. LABELALIGN
  2269. "1: \n"
  2270. READUYVY
  2271. YUVTORGB(yuvconstants)
  2272. STOREARGB
  2273. "sub $0x8,%[width] \n"
  2274. "jg 1b \n"
  2275. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2276. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2277. [width]"+rm"(width) // %[width]
  2278. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2279. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2280. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2281. : "memory", "cc", YUVTORGB_REGS
  2282. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2283. );
  2284. // clang-format on
  2285. }
  2286. void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
  2287. const uint8_t* u_buf,
  2288. const uint8_t* v_buf,
  2289. uint8_t* dst_rgba,
  2290. const struct YuvConstants* yuvconstants,
  2291. int width) {
  2292. asm volatile (
  2293. YUVTORGB_SETUP(yuvconstants)
  2294. "sub %[u_buf],%[v_buf] \n"
  2295. "pcmpeqb %%xmm5,%%xmm5 \n"
  2296. LABELALIGN
  2297. "1: \n"
  2298. READYUV422
  2299. YUVTORGB(yuvconstants)
  2300. STORERGBA
  2301. "sub $0x8,%[width] \n"
  2302. "jg 1b \n"
  2303. : [y_buf]"+r"(y_buf), // %[y_buf]
  2304. [u_buf]"+r"(u_buf), // %[u_buf]
  2305. [v_buf]"+r"(v_buf), // %[v_buf]
  2306. [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
  2307. [width]"+rm"(width) // %[width]
  2308. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2309. : "memory", "cc", YUVTORGB_REGS
  2310. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2311. );
  2312. }
  2313. #endif // HAS_I422TOARGBROW_SSSE3
  2314. // Read 16 UV from 444
  2315. #define READYUV444_AVX2 \
  2316. "vmovdqu (%[u_buf]),%%xmm0 \n" \
  2317. "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2318. "lea 0x10(%[u_buf]),%[u_buf] \n" \
  2319. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2320. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2321. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2322. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2323. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2324. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2325. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2326. // Read 8 UV from 422, upsample to 16 UV.
  2327. #define READYUV422_AVX2 \
  2328. "vmovq (%[u_buf]),%%xmm0 \n" \
  2329. "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2330. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  2331. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2332. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2333. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2334. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2335. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2336. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2337. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2338. // Read 8 UV from 210 10 bit, upsample to 16 UV
  2339. // TODO(fbarchard): Consider vshufb to replace pack/unpack
  2340. // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
  2341. #define READYUV210_AVX2 \
  2342. "vmovdqu (%[u_buf]),%%xmm0 \n" \
  2343. "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2344. "lea 0x10(%[u_buf]),%[u_buf] \n" \
  2345. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2346. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2347. "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
  2348. "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
  2349. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2350. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2351. "vmovdqu (%[y_buf]),%%ymm4 \n" \
  2352. "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
  2353. "lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
"vmovq (%[u_buf]),%%xmm0 \n" \
"vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x8(%[u_buf]),%[u_buf] \n" \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n" \
"vmovdqu (%[a_buf]),%%xmm5 \n" \
"vpermq $0xd8,%%ymm5,%%ymm5 \n" \
"lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
"vmovdqu (%[uv_buf]),%%xmm0 \n" \
"lea 0x10(%[uv_buf]),%[uv_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 VU from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
"vmovdqu (%[vu_buf]),%%xmm0 \n" \
"lea 0x10(%[vu_buf]),%[vu_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
"vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
"vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
"vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
"vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
"lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
"vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
"vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
"vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
"vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
"lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"

#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
"vmovdqa (%[yuvconstants]),%%ymm8 \n" \
"vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
"vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
"vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
"vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
"vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
"vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
"vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
"vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
"vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
"vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
"vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else
#define YUVTORGB_SETUP_AVX2(yuvconstants)
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
"vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
"vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
"vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
"vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
"vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
"vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
"vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
"vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2
#endif

#define YUVTORGB_AVX2(yuvconstants) \
YUVTORGB16_AVX2(yuvconstants) \
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
// Store 16 ARGB values.
#define STOREARGB_AVX2 \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
"vpermq $0xd8,%%ymm2,%%ymm2 \n" \
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
"vmovdqu %%ymm1,(%[dst_argb]) \n" \
"vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
"lea 0x40(%[dst_argb]), %[dst_argb] \n"

// Store 16 AR30 values.
#define STOREAR30_AVX2 \
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \
"vpsraw $0x4,%%ymm1,%%ymm1 \n" \
"vpsraw $0x4,%%ymm2,%%ymm2 \n" \
"vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
"vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
"vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
"vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
"vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
"vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
"vpsllw $0x4,%%ymm2,%%ymm2 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
"vpermq $0xd8,%%ymm2,%%ymm2 \n" \
"vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
"vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
"vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
"vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
"vpslld $0xa,%%ymm1,%%ymm1 \n" \
"vpslld $0xa,%%ymm2,%%ymm2 \n" \
"vpor %%ymm1,%%ymm0,%%ymm0 \n" \
"vpor %%ymm2,%%ymm3,%%ymm3 \n" \
"vmovdqu %%ymm0,(%[dst_ar30]) \n" \
"vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
"lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I444TOARGBROW_AVX2

#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I422TOARGBROW_AVX2

#if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I422TOAR30ROW_AVX2

#if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I210TOARGBROW_AVX2

#if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
READYUV210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I210TOAR30ROW_AVX2

#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
const uint8_t* a_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
LABELALIGN
"1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"subl $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[a_buf]"+r"(a_buf), // %[a_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
#if defined(__i386__)
[width]"+m"(width) // %[width]
#else
[width]"+rm"(width) // %[width]
#endif
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
#endif // HAS_I422ALPHATOARGBROW_AVX2

#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
// Step 3: Weave into RGBA
"vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
"vmovdqu %%ymm0,(%[dst_argb]) \n"
"vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
"lea 0x40(%[dst_argb]),%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I422TORGBAROW_AVX2
#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
#endif // HAS_NV12TOARGBROW_AVX2
#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
const uint8_t* vu_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleNV21]"m"(kShuffleNV21)
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
#endif // HAS_NV21TOARGBROW_AVX2
#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleYUY2Y]"m"(kShuffleYUY2Y),
[kShuffleYUY2UV]"m"(kShuffleYUY2UV)
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
#endif // HAS_YUY2TOARGBROW_AVX2
#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleUYVYY]"m"(kShuffleUYVYY),
[kShuffleUYVYUV]"m"(kShuffleUYVYUV)
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
#endif // HAS_UYVYTOARGBROW_AVX2

#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
asm volatile(
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
"movd %%eax,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"psubusw %%xmm3,%%xmm0 \n"
"psrlw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(dst_argb), // %1
"+rm"(width) // %2
:
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
asm volatile(
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"vmovd %%eax,%%xmm2 \n"
"vbroadcastss %%xmm2,%%ymm2 \n"
"mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
"vmovd %%eax,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
// Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
"vmovdqu (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
"vpsrlw $0x6,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(y_buf), // %0
"+r"(dst_argb), // %1
"+rm"(width) // %2
:
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_AVX2
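// Illustrative scalar sketch of the I400 expansion above. The SIMD paths work
// in fixed point (pmulhuw / psubusw / psrlw $6); this only models the stated
// intent, G = (y - 16) * 1.164 replicated to B, G and R with opaque alpha,
// and is not bit-exact.
static inline uint32_t I400PixelScalar(uint8_t y) {
  int value = ((y - 16) * 1164) / 1000;  // (y - 16) * 1.164
  if (value < 0) value = 0;
  if (value > 255) value = 255;
  uint8_t g = (uint8_t)value;
  return 0xff000000u | ((uint32_t)g << 16) | ((uint32_t)g << 8) | g;
}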
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu -0x10(%0,%2,1),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirror) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpermq $0x4e,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirror) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};

void MirrorUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"movdqa %4,%%xmm1 \n"
"lea -0x10(%0,%3,2),%0 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $8,%3 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSE2
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"pshufd $0x1b,%%xmm0,%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
:
: "memory", "cc", "xmm0");
}
#endif // HAS_ARGBMIRRORROW_SSE2

#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kARGBShuffleMirror_AVX2) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_AVX2
void SplitUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm2 \n"
"vpsrlw $0x8,%%ymm1,%%ymm3 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm2,0x00(%1,%2,1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SPLITUVROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
void SplitUVRow_SSE2(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm2,0x00(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SPLITUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile(
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
"lea 0x20(%0),%0 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
"lea 0x40(%2),%2 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
void MergeUVRow_SSE2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile(
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%1,1),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm2 \n"
"movdqu %%xmm0,(%2) \n"
"movdqu %%xmm2,0x10(%2) \n"
"lea 0x20(%2),%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEUVROW_SSE2
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu (%0,%1,1),%%ymm1 \n"
"add $0x20,%0 \n"
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
"add $0x40,%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(scale) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
// clang-format on
}
#endif // HAS_MERGEUVROW_16_AVX2
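// Illustrative note on the scale argument above: for samples that carry N
// significant bits in the low end of a uint16_t, scale = 1 << (16 - N) moves
// them to the most-significant end (vpmullw is a 16-bit multiply, so this is
// just a left shift). A minimal scalar sketch; the helper name is
// illustration only:
static inline uint16_t MsbAlignScalar(uint16_t lsb_sample, int scale) {
  return (uint16_t)(lsb_sample * scale);  // e.g. 10-bit data uses scale = 64
}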
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MULTIPLYROW_16_AVX2
void MultiplyRow_16_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%0,%1) \n"
"vmovdqu %%ymm1,0x20(%0,%1) \n"
"add $0x40,%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm3");
// clang-format on
}
#endif // HAS_MULTIPLYROW_16_AVX2

// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
void Convert16To8Row_SSSE3(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"movd %3,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"add $0x20,%0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"add $0x10,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}

#ifdef HAS_CONVERT16TO8ROW_AVX2
void Convert16To8Row_AVX2(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %3,%%xmm2 \n"
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
"vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"add $0x40,%0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"add $0x20,%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}
#endif // HAS_CONVERT16TO8ROW_AVX2
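// Illustrative scalar equivalent of Convert16To8Row (pmulhuw + packuswb):
// the result is ((v * scale) >> 16) saturated to 8 bits, so for N-bit input
// scale = 256 << (16 - N), e.g. 16384 for 10-bit data. Helper name is
// illustration only.
static inline uint8_t Convert16To8Scalar(uint16_t v, int scale) {
  uint32_t value = ((uint32_t)v * (uint32_t)scale) >> 16;
  return (uint8_t)(value > 255 ? 255 : value);
}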
// Use scale to convert to lsb formats depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// TODO(fbarchard): reduce to SSE2
void Convert8To16Row_SSE2(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"movd %3,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"add $0x10,%0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"add $0x20,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}

#ifdef HAS_CONVERT8TO16ROW_AVX2
void Convert8To16Row_AVX2(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %3,%%xmm2 \n"
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
"vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"add $0x40,%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}
#endif // HAS_CONVERT8TO16ROW_AVX2
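// Illustrative scalar equivalent of Convert8To16Row: the byte is first
// replicated into both halves of a 16-bit word (v * 0x0101, the punpcklbw
// step), then pmulhuw keeps the high 16 bits of the product with scale, so
// 255 maps to the target format's maximum (e.g. 1023 with scale = 1024).
// Helper name is illustration only.
static inline uint16_t Convert8To16Scalar(uint8_t v, int scale) {
  return (uint16_t)(((uint32_t)v * 0x0101u * (uint32_t)scale) >> 16);
}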
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u};
static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3u, 6u, 9u, 12u, 15u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 2u,
5u, 8u, 11u, 14u};
static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 0u, 3u,
6u, 9u, 12u, 15u};

void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%2) \n"
"lea 0x10(%2),%2 \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%3) \n"
"lea 0x10(%3),%3 \n"
"lea 0x30(%0),%0 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRGBToR0), // %5
"m"(kShuffleMaskRGBToR1), // %6
"m"(kShuffleMaskRGBToR2), // %7
"m"(kShuffleMaskRGBToG0), // %8
"m"(kShuffleMaskRGBToG1), // %9
"m"(kShuffleMaskRGBToG2), // %10
"m"(kShuffleMaskRGBToB0), // %11
"m"(kShuffleMaskRGBToB1), // %12
"m"(kShuffleMaskRGBToB2) // %13
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_SPLITRGBROW_SSSE3
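// Illustrative scalar equivalent of SplitRGBRow (the shuffle masks above pick
// every third byte of the packed RGB triplets); illustration only:
static inline void SplitRGBScalar(const uint8_t* src_rgb,
                                  uint8_t* dst_r,
                                  uint8_t* dst_g,
                                  uint8_t* dst_b,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    dst_r[i] = src_rgb[i * 3 + 0];
    dst_g[i] = src_rgb[i * 3 + 1];
    dst_b[i] = src_rgb[i * 3 + 2];
  }
}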
#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
2u, 128u, 128u, 3u, 128u, 128u,
4u, 128u, 128u, 5u};
static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
128u, 2u, 128u, 128u, 3u, 128u,
128u, 4u, 128u, 128u};
static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
128u, 128u, 2u, 128u, 128u, 3u,
128u, 128u, 4u, 128u};
static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
7u, 128u, 128u, 8u, 128u, 128u,
9u, 128u, 128u, 10u};
static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
128u, 7u, 128u, 128u, 8u, 128u,
128u, 9u, 128u, 128u};
static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
128u, 128u, 8u, 128u, 128u, 9u,
128u, 128u, 10u, 128u};
static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
12u, 128u, 128u, 13u, 128u, 128u,
14u, 128u, 128u, 15u};
static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
128u, 13u, 128u, 128u, 14u, 128u,
128u, 15u, 128u, 128u};
static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
128u, 128u, 13u, 128u, 128u, 14u,
128u, 128u, 15u, 128u};
void MergeRGBRow_SSSE3(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%2),%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%2),%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,16(%3) \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%2),%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,32(%3) \n"
"lea 0x10(%0),%0 \n"
"lea 0x10(%1),%1 \n"
"lea 0x10(%2),%2 \n"
"lea 0x30(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRToRGB0), // %5
"m"(kShuffleMaskGToRGB0), // %6
"m"(kShuffleMaskBToRGB0), // %7
"m"(kShuffleMaskRToRGB1), // %8
"m"(kShuffleMaskGToRGB1), // %9
"m"(kShuffleMaskBToRGB1), // %10
"m"(kShuffleMaskRToRGB2), // %11
"m"(kShuffleMaskGToRGB2), // %12
"m"(kShuffleMaskBToRGB2) // %13
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGERGBROW_SSSE3
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"test $0xf,%0 \n"
"jne 2f \n"
"test $0xf,%1 \n"
"jne 2f \n"
LABELALIGN
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"jmp 9f \n"
LABELALIGN
"2: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 2b \n"
LABELALIGN "9: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_COPYROW_AVX

#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
"rep movsb \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
:
: "memory", "cc");
}
#endif // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"pcmpeqb %%xmm0,%%xmm0 \n"
"pslld $0x18,%%xmm0 \n"
"pcmpeqb %%xmm1,%%xmm1 \n"
"psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm2 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"movdqu (%1),%%xmm4 \n"
"movdqu 0x10(%1),%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
"movdqu %%xmm2,(%1) \n"
"movdqu %%xmm3,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
"vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm1 \n"
"vmovdqu 0x20(%0),%%ymm2 \n"
"lea 0x40(%0),%0 \n"
"vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
"vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm1,(%1) \n"
"vmovdqu %%ymm2,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0), %%xmm0 \n"
"movdqu 0x10(%0), %%xmm1 \n"
"lea 0x20(%0), %0 \n"
"psrld $0x18, %%xmm0 \n"
"psrld $0x18, %%xmm1 \n"
"packssdw %%xmm1, %%xmm0 \n"
"packuswb %%xmm0, %%xmm0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1), %1 \n"
"sub $0x8, %2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
static const uvec8 kShuffleAlphaShort_AVX2 = {
3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
"vmovdqa %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0), %%ymm0 \n"
"vmovdqu 0x20(%0), %%ymm1 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"vmovdqu 0x40(%0), %%ymm2 \n"
"vmovdqu 0x60(%0), %%ymm3 \n"
"lea 0x80(%0), %0 \n"
"vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
"vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
"vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20, %2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
: "m"(kPermdARGBToY_AVX), // %3
"m"(kShuffleAlphaShort_AVX2) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"pcmpeqb %%xmm0,%%xmm0 \n"
"pslld $0x18,%%xmm0 \n"
"pcmpeqb %%xmm1,%%xmm1 \n"
"psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm2 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"punpckhwd %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
"movdqu (%1),%%xmm4 \n"
"movdqu 0x10(%1),%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
"movdqu %%xmm2,(%1) \n"
"movdqu %%xmm3,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
"vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
"vpmovzxbd (%0),%%ymm1 \n"
"vpmovzxbd 0x8(%0),%%ymm2 \n"
"lea 0x10(%0),%0 \n"
"vpslld $0x18,%%ymm1,%%ymm1 \n"
"vpslld $0x18,%%ymm2,%%ymm2 \n"
"vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
"vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm1,(%1) \n"
"vmovdqu %%ymm2,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width >> 2);
const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
asm volatile(
"rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}

void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
"rep stosb \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
: "memory", "cc");
}

void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
"rep stosl \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}

void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
int stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm0,%%ymm1 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm0,%%ymm1 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
  4171. "lea 0x20(%1),%1 \n"
  4172. "sub $0x20,%2 \n"
  4173. "jg 1b \n"
  4174. "vzeroupper \n"
  4175. : "+r"(src_uyvy), // %0
  4176. "+r"(dst_y), // %1
  4177. "+r"(width) // %2
  4178. :
  4179. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4180. }
  4181. void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
  4182. int stride_uyvy,
  4183. uint8_t* dst_u,
  4184. uint8_t* dst_v,
  4185. int width) {
  4186. asm volatile(
  4187. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4188. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4189. "sub %1,%2 \n"
  4190. LABELALIGN
  4191. "1: \n"
  4192. "vmovdqu (%0),%%ymm0 \n"
  4193. "vmovdqu 0x20(%0),%%ymm1 \n"
  4194. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  4195. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  4196. "lea 0x40(%0),%0 \n"
  4197. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4198. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4199. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4200. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4201. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4202. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4203. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4204. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4205. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4206. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4207. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4208. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4209. "lea 0x10(%1),%1 \n"
  4210. "sub $0x20,%3 \n"
  4211. "jg 1b \n"
  4212. "vzeroupper \n"
  4213. : "+r"(src_uyvy), // %0
  4214. "+r"(dst_u), // %1
  4215. "+r"(dst_v), // %2
  4216. "+r"(width) // %3
  4217. : "r"((intptr_t)(stride_uyvy)) // %4
  4218. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4219. }
  4220. void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
  4221. uint8_t* dst_u,
  4222. uint8_t* dst_v,
  4223. int width) {
  4224. asm volatile(
  4225. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4226. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4227. "sub %1,%2 \n"
  4228. LABELALIGN
  4229. "1: \n"
  4230. "vmovdqu (%0),%%ymm0 \n"
  4231. "vmovdqu 0x20(%0),%%ymm1 \n"
  4232. "lea 0x40(%0),%0 \n"
  4233. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4234. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4235. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4236. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4237. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4238. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4239. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4240. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4241. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4242. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4243. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4244. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4245. "lea 0x10(%1),%1 \n"
  4246. "sub $0x20,%3 \n"
  4247. "jg 1b \n"
  4248. "vzeroupper \n"
  4249. : "+r"(src_uyvy), // %0
  4250. "+r"(dst_u), // %1
  4251. "+r"(dst_v), // %2
  4252. "+r"(width) // %3
  4253. :
  4254. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4255. }
  4256. #endif // HAS_YUY2TOYROW_AVX2
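// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of what the packed-YUV rows above extract.  YUY2 stores two pixels as
// Y0 U Y1 V and UYVY stores them as U Y0 V Y1; the Y rows keep every luma
// byte and the UV rows keep the shared chroma bytes (the *ToUVRow variants
// additionally average two vertically adjacent rows).  Assumes an even width
// and the <stdint.h> types already used in this file.
static inline void YUY2ToI422Row_ScalarSketch(const uint8_t* src_yuy2,
                                              uint8_t* dst_y,
                                              uint8_t* dst_u,
                                              uint8_t* dst_v,
                                              int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_y[0] = src_yuy2[0];  // Y0
    dst_u[0] = src_yuy2[1];  // U shared by both pixels
    dst_y[1] = src_yuy2[2];  // Y1
    dst_v[0] = src_yuy2[3];  // V shared by both pixels
    src_yuy2 += 4;
    dst_y += 2;
    dst_u += 1;
    dst_v += 1;
  }
}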
  4257. #ifdef HAS_ARGBBLENDROW_SSSE3
  4258. // Shuffle table for isolating alpha.
  4259. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  4260. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4261. // Blend 4 pixels at a time, with a 1 pixel tail loop.
  4262. void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
  4263. const uint8_t* src_argb1,
  4264. uint8_t* dst_argb,
  4265. int width) {
  4266. asm volatile(
  4267. "pcmpeqb %%xmm7,%%xmm7 \n"
  4268. "psrlw $0xf,%%xmm7 \n"
  4269. "pcmpeqb %%xmm6,%%xmm6 \n"
  4270. "psrlw $0x8,%%xmm6 \n"
  4271. "pcmpeqb %%xmm5,%%xmm5 \n"
  4272. "psllw $0x8,%%xmm5 \n"
  4273. "pcmpeqb %%xmm4,%%xmm4 \n"
  4274. "pslld $0x18,%%xmm4 \n"
  4275. "sub $0x4,%3 \n"
  4276. "jl 49f \n"
  4277. // 4 pixel loop.
  4278. LABELALIGN
  4279. "40: \n"
  4280. "movdqu (%0),%%xmm3 \n"
  4281. "lea 0x10(%0),%0 \n"
  4282. "movdqa %%xmm3,%%xmm0 \n"
  4283. "pxor %%xmm4,%%xmm3 \n"
  4284. "movdqu (%1),%%xmm2 \n"
  4285. "pshufb %4,%%xmm3 \n"
  4286. "pand %%xmm6,%%xmm2 \n"
  4287. "paddw %%xmm7,%%xmm3 \n"
  4288. "pmullw %%xmm3,%%xmm2 \n"
  4289. "movdqu (%1),%%xmm1 \n"
  4290. "lea 0x10(%1),%1 \n"
  4291. "psrlw $0x8,%%xmm1 \n"
  4292. "por %%xmm4,%%xmm0 \n"
  4293. "pmullw %%xmm3,%%xmm1 \n"
  4294. "psrlw $0x8,%%xmm2 \n"
  4295. "paddusb %%xmm2,%%xmm0 \n"
  4296. "pand %%xmm5,%%xmm1 \n"
  4297. "paddusb %%xmm1,%%xmm0 \n"
  4298. "movdqu %%xmm0,(%2) \n"
  4299. "lea 0x10(%2),%2 \n"
  4300. "sub $0x4,%3 \n"
  4301. "jge 40b \n"
  4302. "49: \n"
  4303. "add $0x3,%3 \n"
  4304. "jl 99f \n"
  4305. // 1 pixel loop.
  4306. "91: \n"
  4307. "movd (%0),%%xmm3 \n"
  4308. "lea 0x4(%0),%0 \n"
  4309. "movdqa %%xmm3,%%xmm0 \n"
  4310. "pxor %%xmm4,%%xmm3 \n"
  4311. "movd (%1),%%xmm2 \n"
  4312. "pshufb %4,%%xmm3 \n"
  4313. "pand %%xmm6,%%xmm2 \n"
  4314. "paddw %%xmm7,%%xmm3 \n"
  4315. "pmullw %%xmm3,%%xmm2 \n"
  4316. "movd (%1),%%xmm1 \n"
  4317. "lea 0x4(%1),%1 \n"
  4318. "psrlw $0x8,%%xmm1 \n"
  4319. "por %%xmm4,%%xmm0 \n"
  4320. "pmullw %%xmm3,%%xmm1 \n"
  4321. "psrlw $0x8,%%xmm2 \n"
  4322. "paddusb %%xmm2,%%xmm0 \n"
  4323. "pand %%xmm5,%%xmm1 \n"
  4324. "paddusb %%xmm1,%%xmm0 \n"
  4325. "movd %%xmm0,(%2) \n"
  4326. "lea 0x4(%2),%2 \n"
  4327. "sub $0x1,%3 \n"
  4328. "jge 91b \n"
  4329. "99: \n"
  4330. : "+r"(src_argb0), // %0
  4331. "+r"(src_argb1), // %1
  4332. "+r"(dst_argb), // %2
  4333. "+r"(width) // %3
  4334. : "m"(kShuffleAlpha) // %4
  4335. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4336. "xmm7");
  4337. }
  4338. #endif // HAS_ARGBBLENDROW_SSSE3
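// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the per-channel math in ARGBBlendRow_SSSE3 above: the foreground color
// is added to the background color scaled by (256 - foreground alpha), with
// a saturating add.  The destination alpha is forced to 255 by the row code.
static inline uint8_t BlendChannel_ScalarSketch(uint8_t fg, uint8_t bg,
                                                uint8_t fg_alpha) {
  int blended = fg + ((bg * (256 - fg_alpha)) >> 8);
  return (uint8_t)(blended > 255 ? 255 : blended);  // paddusb saturation
}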
  4339. #ifdef HAS_BLENDPLANEROW_SSSE3
  4340. // Blend 8 pixels at a time.
  4341. // unsigned version of math
  4342. // =((A2*C2)+(B2*(255-C2))+255)/256
  4343. // signed version of math
  4344. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  4345. void BlendPlaneRow_SSSE3(const uint8_t* src0,
  4346. const uint8_t* src1,
  4347. const uint8_t* alpha,
  4348. uint8_t* dst,
  4349. int width) {
  4350. asm volatile(
  4351. "pcmpeqb %%xmm5,%%xmm5 \n"
  4352. "psllw $0x8,%%xmm5 \n"
  4353. "mov $0x80808080,%%eax \n"
  4354. "movd %%eax,%%xmm6 \n"
  4355. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  4356. "mov $0x807f807f,%%eax \n"
  4357. "movd %%eax,%%xmm7 \n"
  4358. "pshufd $0x0,%%xmm7,%%xmm7 \n"
  4359. "sub %2,%0 \n"
  4360. "sub %2,%1 \n"
  4361. "sub %2,%3 \n"
  4362. // 8 pixel loop.
  4363. LABELALIGN
  4364. "1: \n"
  4365. "movq (%2),%%xmm0 \n"
  4366. "punpcklbw %%xmm0,%%xmm0 \n"
  4367. "pxor %%xmm5,%%xmm0 \n"
  4368. "movq (%0,%2,1),%%xmm1 \n"
  4369. "movq (%1,%2,1),%%xmm2 \n"
  4370. "punpcklbw %%xmm2,%%xmm1 \n"
  4371. "psubb %%xmm6,%%xmm1 \n"
  4372. "pmaddubsw %%xmm1,%%xmm0 \n"
  4373. "paddw %%xmm7,%%xmm0 \n"
  4374. "psrlw $0x8,%%xmm0 \n"
  4375. "packuswb %%xmm0,%%xmm0 \n"
  4376. "movq %%xmm0,(%3,%2,1) \n"
  4377. "lea 0x8(%2),%2 \n"
  4378. "sub $0x8,%4 \n"
  4379. "jg 1b \n"
  4380. : "+r"(src0), // %0
  4381. "+r"(src1), // %1
  4382. "+r"(alpha), // %2
  4383. "+r"(dst), // %3
  4384. "+rm"(width) // %4
  4385. ::"memory",
  4386. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
  4387. }
  4388. #endif // HAS_BLENDPLANEROW_SSSE3
  4389. #ifdef HAS_BLENDPLANEROW_AVX2
  4390. // Blend 32 pixels at a time.
  4391. // unsigned version of math
  4392. // =((A2*C2)+(B2*(255-C2))+255)/256
  4393. // signed version of math
  4394. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  4395. void BlendPlaneRow_AVX2(const uint8_t* src0,
  4396. const uint8_t* src1,
  4397. const uint8_t* alpha,
  4398. uint8_t* dst,
  4399. int width) {
  4400. asm volatile(
  4401. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4402. "vpsllw $0x8,%%ymm5,%%ymm5 \n"
  4403. "mov $0x80808080,%%eax \n"
  4404. "vmovd %%eax,%%xmm6 \n"
  4405. "vbroadcastss %%xmm6,%%ymm6 \n"
  4406. "mov $0x807f807f,%%eax \n"
  4407. "vmovd %%eax,%%xmm7 \n"
  4408. "vbroadcastss %%xmm7,%%ymm7 \n"
  4409. "sub %2,%0 \n"
  4410. "sub %2,%1 \n"
  4411. "sub %2,%3 \n"
  4412. // 32 pixel loop.
  4413. LABELALIGN
  4414. "1: \n"
  4415. "vmovdqu (%2),%%ymm0 \n"
  4416. "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
  4417. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  4418. "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
  4419. "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
  4420. "vmovdqu (%0,%2,1),%%ymm1 \n"
  4421. "vmovdqu (%1,%2,1),%%ymm2 \n"
  4422. "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
  4423. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  4424. "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
  4425. "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
  4426. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  4427. "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
  4428. "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
  4429. "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
  4430. "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
  4431. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4432. "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
  4433. "vmovdqu %%ymm0,(%3,%2,1) \n"
  4434. "lea 0x20(%2),%2 \n"
  4435. "sub $0x20,%4 \n"
  4436. "jg 1b \n"
  4437. "vzeroupper \n"
  4438. : "+r"(src0), // %0
  4439. "+r"(src1), // %1
  4440. "+r"(alpha), // %2
  4441. "+r"(dst), // %3
  4442. "+rm"(width) // %4
  4443. ::"memory",
  4444. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4445. "xmm7");
  4446. }
  4447. #endif // HAS_BLENDPLANEROW_AVX2
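// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the unsigned formula documented above both BlendPlaneRow versions:
//   dst = (src0 * alpha + src1 * (255 - alpha) + 255) / 256.
// The SIMD rows compute the equivalent signed form with pmaddubsw.
static inline uint8_t BlendPlanePixel_ScalarSketch(uint8_t s0, uint8_t s1,
                                                   uint8_t a) {
  return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
}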
  4448. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  4449. // Shuffle table duplicating alpha
  4450. static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
  4451. 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
  4452. static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  4453. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
  4454. // Attenuate 4 pixels at a time.
  4455. void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
  4456. uint8_t* dst_argb,
  4457. int width) {
  4458. asm volatile(
  4459. "pcmpeqb %%xmm3,%%xmm3 \n"
  4460. "pslld $0x18,%%xmm3 \n"
  4461. "movdqa %3,%%xmm4 \n"
  4462. "movdqa %4,%%xmm5 \n"
  4463. // 4 pixel loop.
  4464. LABELALIGN
  4465. "1: \n"
  4466. "movdqu (%0),%%xmm0 \n"
  4467. "pshufb %%xmm4,%%xmm0 \n"
  4468. "movdqu (%0),%%xmm1 \n"
  4469. "punpcklbw %%xmm1,%%xmm1 \n"
  4470. "pmulhuw %%xmm1,%%xmm0 \n"
  4471. "movdqu (%0),%%xmm1 \n"
  4472. "pshufb %%xmm5,%%xmm1 \n"
  4473. "movdqu (%0),%%xmm2 \n"
  4474. "punpckhbw %%xmm2,%%xmm2 \n"
  4475. "pmulhuw %%xmm2,%%xmm1 \n"
  4476. "movdqu (%0),%%xmm2 \n"
  4477. "lea 0x10(%0),%0 \n"
  4478. "pand %%xmm3,%%xmm2 \n"
  4479. "psrlw $0x8,%%xmm0 \n"
  4480. "psrlw $0x8,%%xmm1 \n"
  4481. "packuswb %%xmm1,%%xmm0 \n"
  4482. "por %%xmm2,%%xmm0 \n"
  4483. "movdqu %%xmm0,(%1) \n"
  4484. "lea 0x10(%1),%1 \n"
  4485. "sub $0x4,%2 \n"
  4486. "jg 1b \n"
  4487. : "+r"(src_argb), // %0
  4488. "+r"(dst_argb), // %1
  4489. "+r"(width) // %2
  4490. : "m"(kShuffleAlpha0), // %3
  4491. "m"(kShuffleAlpha1) // %4
  4492. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4493. }
  4494. #endif // HAS_ARGBATTENUATEROW_SSSE3
  4495. #ifdef HAS_ARGBATTENUATEROW_AVX2
  4496. // Shuffle table duplicating alpha.
  4497. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  4498. 128u, 128u, 14u, 15u, 14u, 15u,
  4499. 14u, 15u, 128u, 128u};
  4500. // Attenuate 8 pixels at a time.
  4501. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
  4502. uint8_t* dst_argb,
  4503. int width) {
  4504. asm volatile(
  4505. "vbroadcastf128 %3,%%ymm4 \n"
  4506. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4507. "vpslld $0x18,%%ymm5,%%ymm5 \n"
  4508. "sub %0,%1 \n"
  4509. // 8 pixel loop.
  4510. LABELALIGN
  4511. "1: \n"
  4512. "vmovdqu (%0),%%ymm6 \n"
  4513. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  4514. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  4515. "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
  4516. "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
  4517. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4518. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4519. "vpand %%ymm5,%%ymm6,%%ymm6 \n"
  4520. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4521. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4522. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4523. "vpor %%ymm6,%%ymm0,%%ymm0 \n"
  4524. "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
  4525. "lea 0x20(%0),%0 \n"
  4526. "sub $0x8,%2 \n"
  4527. "jg 1b \n"
  4528. "vzeroupper \n"
  4529. : "+r"(src_argb), // %0
  4530. "+r"(dst_argb), // %1
  4531. "+r"(width) // %2
  4532. : "m"(kShuffleAlpha_AVX2) // %3
  4533. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4534. }
  4535. #endif // HAS_ARGBATTENUATEROW_AVX2
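// Illustrative scalar sketch (not part of libyuv; hypothetical helper name):
// attenuation premultiplies each color channel by the pixel's alpha, roughly
// dst = c * a / 255, while the alpha channel itself is preserved.  The SIMD
// rows above approximate this with a 16-bit high multiply of byte-duplicated
// values, so the exact rounding may differ slightly from this sketch.
static inline uint8_t AttenuateChannel_ScalarSketch(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 127) / 255);  // rounding here is an assumption
}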
  4536. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  4537. // Unattenuate 4 pixels at a time.
  4538. void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
  4539. uint8_t* dst_argb,
  4540. int width) {
  4541. uintptr_t alpha;
  4542. asm volatile(
  4543. // 4 pixel loop.
  4544. LABELALIGN
  4545. "1: \n"
  4546. "movdqu (%0),%%xmm0 \n"
  4547. "movzb 0x03(%0),%3 \n"
  4548. "punpcklbw %%xmm0,%%xmm0 \n"
  4549. "movd 0x00(%4,%3,4),%%xmm2 \n"
  4550. "movzb 0x07(%0),%3 \n"
  4551. "movd 0x00(%4,%3,4),%%xmm3 \n"
  4552. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4553. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4554. "movlhps %%xmm3,%%xmm2 \n"
  4555. "pmulhuw %%xmm2,%%xmm0 \n"
  4556. "movdqu (%0),%%xmm1 \n"
  4557. "movzb 0x0b(%0),%3 \n"
  4558. "punpckhbw %%xmm1,%%xmm1 \n"
  4559. "movd 0x00(%4,%3,4),%%xmm2 \n"
  4560. "movzb 0x0f(%0),%3 \n"
  4561. "movd 0x00(%4,%3,4),%%xmm3 \n"
  4562. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4563. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4564. "movlhps %%xmm3,%%xmm2 \n"
  4565. "pmulhuw %%xmm2,%%xmm1 \n"
  4566. "lea 0x10(%0),%0 \n"
  4567. "packuswb %%xmm1,%%xmm0 \n"
  4568. "movdqu %%xmm0,(%1) \n"
  4569. "lea 0x10(%1),%1 \n"
  4570. "sub $0x4,%2 \n"
  4571. "jg 1b \n"
  4572. : "+r"(src_argb), // %0
  4573. "+r"(dst_argb), // %1
  4574. "+r"(width), // %2
  4575. "=&r"(alpha) // %3
  4576. : "r"(fixed_invtbl8) // %4
  4577. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4578. }
  4579. #endif // HAS_ARGBUNATTENUATEROW_SSE2
  4580. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4581. // Shuffle table duplicating alpha.
  4582. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4583. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  4584. // Unattenuate 8 pixels at a time.
  4585. void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4586. uint8_t* dst_argb,
  4587. int width) {
  4588. uintptr_t alpha;
  4589. asm volatile(
  4590. "sub %0,%1 \n"
  4591. "vbroadcastf128 %5,%%ymm5 \n"
  4592. // 8 pixel loop.
  4593. LABELALIGN
  4594. "1: \n"
  4595. // replace VPGATHER
  4596. "movzb 0x03(%0),%3 \n"
  4597. "vmovd 0x00(%4,%3,4),%%xmm0 \n"
  4598. "movzb 0x07(%0),%3 \n"
  4599. "vmovd 0x00(%4,%3,4),%%xmm1 \n"
  4600. "movzb 0x0b(%0),%3 \n"
  4601. "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
  4602. "vmovd 0x00(%4,%3,4),%%xmm2 \n"
  4603. "movzb 0x0f(%0),%3 \n"
  4604. "vmovd 0x00(%4,%3,4),%%xmm3 \n"
  4605. "movzb 0x13(%0),%3 \n"
  4606. "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
  4607. "vmovd 0x00(%4,%3,4),%%xmm0 \n"
  4608. "movzb 0x17(%0),%3 \n"
  4609. "vmovd 0x00(%4,%3,4),%%xmm1 \n"
  4610. "movzb 0x1b(%0),%3 \n"
  4611. "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
  4612. "vmovd 0x00(%4,%3,4),%%xmm2 \n"
  4613. "movzb 0x1f(%0),%3 \n"
  4614. "vmovd 0x00(%4,%3,4),%%xmm3 \n"
  4615. "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
  4616. "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
  4617. "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
  4618. "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
  4619. // end of VPGATHER
  4620. "vmovdqu (%0),%%ymm6 \n"
  4621. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  4622. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  4623. "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
  4624. "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
  4625. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  4626. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  4627. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4628. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4629. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4630. "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
  4631. "lea 0x20(%0),%0 \n"
  4632. "sub $0x8,%2 \n"
  4633. "jg 1b \n"
  4634. "vzeroupper \n"
  4635. : "+r"(src_argb), // %0
  4636. "+r"(dst_argb), // %1
  4637. "+r"(width), // %2
  4638. "=&r"(alpha) // %3
  4639. : "r"(fixed_invtbl8), // %4
  4640. "m"(kUnattenShuffleAlpha_AVX2) // %5
  4641. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4642. "xmm7");
  4643. }
  4644. #endif // HAS_ARGBUNATTENUATEROW_AVX2
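// Illustrative scalar sketch (not part of libyuv; hypothetical helper name):
// unattenuation is the inverse of attenuation, i.e. dividing each color
// channel by alpha and clamping.  The SIMD rows above use the fixed_invtbl8
// reciprocal table instead of a divide, so rounding may differ, and the
// alpha == 0 case below is this sketch's own choice, not the table's.
static inline uint8_t UnattenuateChannel_ScalarSketch(uint8_t c, uint8_t a) {
  int v;
  if (a == 0) {
    return 0;  // assumption for fully transparent pixels
  }
  v = (c * 255 + a / 2) / a;
  return (uint8_t)(v > 255 ? 255 : v);
}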
  4645. #ifdef HAS_ARGBGRAYROW_SSSE3
4646. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
  4647. void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  4648. asm volatile(
  4649. "movdqa %3,%%xmm4 \n"
  4650. "movdqa %4,%%xmm5 \n"
  4651. // 8 pixel loop.
  4652. LABELALIGN
  4653. "1: \n"
  4654. "movdqu (%0),%%xmm0 \n"
  4655. "movdqu 0x10(%0),%%xmm1 \n"
  4656. "pmaddubsw %%xmm4,%%xmm0 \n"
  4657. "pmaddubsw %%xmm4,%%xmm1 \n"
  4658. "phaddw %%xmm1,%%xmm0 \n"
  4659. "paddw %%xmm5,%%xmm0 \n"
  4660. "psrlw $0x7,%%xmm0 \n"
  4661. "packuswb %%xmm0,%%xmm0 \n"
  4662. "movdqu (%0),%%xmm2 \n"
  4663. "movdqu 0x10(%0),%%xmm3 \n"
  4664. "lea 0x20(%0),%0 \n"
  4665. "psrld $0x18,%%xmm2 \n"
  4666. "psrld $0x18,%%xmm3 \n"
  4667. "packuswb %%xmm3,%%xmm2 \n"
  4668. "packuswb %%xmm2,%%xmm2 \n"
  4669. "movdqa %%xmm0,%%xmm3 \n"
  4670. "punpcklbw %%xmm0,%%xmm0 \n"
  4671. "punpcklbw %%xmm2,%%xmm3 \n"
  4672. "movdqa %%xmm0,%%xmm1 \n"
  4673. "punpcklwd %%xmm3,%%xmm0 \n"
  4674. "punpckhwd %%xmm3,%%xmm1 \n"
  4675. "movdqu %%xmm0,(%1) \n"
  4676. "movdqu %%xmm1,0x10(%1) \n"
  4677. "lea 0x20(%1),%1 \n"
  4678. "sub $0x8,%2 \n"
  4679. "jg 1b \n"
  4680. : "+r"(src_argb), // %0
  4681. "+r"(dst_argb), // %1
  4682. "+r"(width) // %2
  4683. : "m"(kARGBToYJ), // %3
  4684. "m"(kAddYJ64) // %4
  4685. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4686. }
  4687. #endif // HAS_ARGBGRAYROW_SSSE3
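// Illustrative scalar sketch (not part of libyuv; hypothetical helper name):
// the gray row above computes a full-range BT.601 luma per pixel (via the
// kARGBToYJ coefficients and the kAddYJ64 rounding constant) and writes it to
// B, G and R, keeping the original alpha.  The integer weights below are the
// usual 0.299/0.587/0.114 approximation; the fixed-point constants used by
// the SIMD path may round slightly differently.
static inline void GrayPixel_ScalarSketch(uint8_t* argb /* B,G,R,A */) {
  int b = argb[0];
  int g = argb[1];
  int r = argb[2];
  int y = (r * 77 + g * 150 + b * 29 + 128) >> 8;  // weights sum to 256
  argb[0] = argb[1] = argb[2] = (uint8_t)y;        // alpha (argb[3]) unchanged
}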
  4688. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4689. // b = (r * 35 + g * 68 + b * 17) >> 7
  4690. // g = (r * 45 + g * 88 + b * 22) >> 7
  4691. // r = (r * 50 + g * 98 + b * 24) >> 7
  4692. // Constant for ARGB color to sepia tone
  4693. static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  4694. 17, 68, 35, 0, 17, 68, 35, 0};
  4695. static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  4696. 22, 88, 45, 0, 22, 88, 45, 0};
  4697. static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  4698. 24, 98, 50, 0, 24, 98, 50, 0};
  4699. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  4700. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  4701. asm volatile(
  4702. "movdqa %2,%%xmm2 \n"
  4703. "movdqa %3,%%xmm3 \n"
  4704. "movdqa %4,%%xmm4 \n"
  4705. // 8 pixel loop.
  4706. LABELALIGN
  4707. "1: \n"
  4708. "movdqu (%0),%%xmm0 \n"
  4709. "movdqu 0x10(%0),%%xmm6 \n"
  4710. "pmaddubsw %%xmm2,%%xmm0 \n"
  4711. "pmaddubsw %%xmm2,%%xmm6 \n"
  4712. "phaddw %%xmm6,%%xmm0 \n"
  4713. "psrlw $0x7,%%xmm0 \n"
  4714. "packuswb %%xmm0,%%xmm0 \n"
  4715. "movdqu (%0),%%xmm5 \n"
  4716. "movdqu 0x10(%0),%%xmm1 \n"
  4717. "pmaddubsw %%xmm3,%%xmm5 \n"
  4718. "pmaddubsw %%xmm3,%%xmm1 \n"
  4719. "phaddw %%xmm1,%%xmm5 \n"
  4720. "psrlw $0x7,%%xmm5 \n"
  4721. "packuswb %%xmm5,%%xmm5 \n"
  4722. "punpcklbw %%xmm5,%%xmm0 \n"
  4723. "movdqu (%0),%%xmm5 \n"
  4724. "movdqu 0x10(%0),%%xmm1 \n"
  4725. "pmaddubsw %%xmm4,%%xmm5 \n"
  4726. "pmaddubsw %%xmm4,%%xmm1 \n"
  4727. "phaddw %%xmm1,%%xmm5 \n"
  4728. "psrlw $0x7,%%xmm5 \n"
  4729. "packuswb %%xmm5,%%xmm5 \n"
  4730. "movdqu (%0),%%xmm6 \n"
  4731. "movdqu 0x10(%0),%%xmm1 \n"
  4732. "psrld $0x18,%%xmm6 \n"
  4733. "psrld $0x18,%%xmm1 \n"
  4734. "packuswb %%xmm1,%%xmm6 \n"
  4735. "packuswb %%xmm6,%%xmm6 \n"
  4736. "punpcklbw %%xmm6,%%xmm5 \n"
  4737. "movdqa %%xmm0,%%xmm1 \n"
  4738. "punpcklwd %%xmm5,%%xmm0 \n"
  4739. "punpckhwd %%xmm5,%%xmm1 \n"
  4740. "movdqu %%xmm0,(%0) \n"
  4741. "movdqu %%xmm1,0x10(%0) \n"
  4742. "lea 0x20(%0),%0 \n"
  4743. "sub $0x8,%1 \n"
  4744. "jg 1b \n"
  4745. : "+r"(dst_argb), // %0
  4746. "+r"(width) // %1
  4747. : "m"(kARGBToSepiaB), // %2
  4748. "m"(kARGBToSepiaG), // %3
  4749. "m"(kARGBToSepiaR) // %4
  4750. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4751. }
  4752. #endif // HAS_ARGBSEPIAROW_SSSE3
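// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the sepia formulas documented above, applied to one ARGB pixel stored as
// B,G,R,A in memory.  Results are saturated to 255 (packuswb in the row code)
// and alpha is left untouched.
static inline void SepiaPixel_ScalarSketch(uint8_t* argb /* B,G,R,A */) {
  int b = argb[0];
  int g = argb[1];
  int r = argb[2];
  int sb = (r * 35 + g * 68 + b * 17) >> 7;
  int sg = (r * 45 + g * 88 + b * 22) >> 7;
  int sr = (r * 50 + g * 98 + b * 24) >> 7;
  argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
  argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
  argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
}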
  4753. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4754. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  4755. // Same as Sepia except matrix is provided.
  4756. void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
  4757. uint8_t* dst_argb,
  4758. const int8_t* matrix_argb,
  4759. int width) {
  4760. asm volatile(
  4761. "movdqu (%3),%%xmm5 \n"
  4762. "pshufd $0x00,%%xmm5,%%xmm2 \n"
  4763. "pshufd $0x55,%%xmm5,%%xmm3 \n"
  4764. "pshufd $0xaa,%%xmm5,%%xmm4 \n"
  4765. "pshufd $0xff,%%xmm5,%%xmm5 \n"
  4766. // 8 pixel loop.
  4767. LABELALIGN
  4768. "1: \n"
  4769. "movdqu (%0),%%xmm0 \n"
  4770. "movdqu 0x10(%0),%%xmm7 \n"
  4771. "pmaddubsw %%xmm2,%%xmm0 \n"
  4772. "pmaddubsw %%xmm2,%%xmm7 \n"
  4773. "movdqu (%0),%%xmm6 \n"
  4774. "movdqu 0x10(%0),%%xmm1 \n"
  4775. "pmaddubsw %%xmm3,%%xmm6 \n"
  4776. "pmaddubsw %%xmm3,%%xmm1 \n"
  4777. "phaddsw %%xmm7,%%xmm0 \n"
  4778. "phaddsw %%xmm1,%%xmm6 \n"
  4779. "psraw $0x6,%%xmm0 \n"
  4780. "psraw $0x6,%%xmm6 \n"
  4781. "packuswb %%xmm0,%%xmm0 \n"
  4782. "packuswb %%xmm6,%%xmm6 \n"
  4783. "punpcklbw %%xmm6,%%xmm0 \n"
  4784. "movdqu (%0),%%xmm1 \n"
  4785. "movdqu 0x10(%0),%%xmm7 \n"
  4786. "pmaddubsw %%xmm4,%%xmm1 \n"
  4787. "pmaddubsw %%xmm4,%%xmm7 \n"
  4788. "phaddsw %%xmm7,%%xmm1 \n"
  4789. "movdqu (%0),%%xmm6 \n"
  4790. "movdqu 0x10(%0),%%xmm7 \n"
  4791. "pmaddubsw %%xmm5,%%xmm6 \n"
  4792. "pmaddubsw %%xmm5,%%xmm7 \n"
  4793. "phaddsw %%xmm7,%%xmm6 \n"
  4794. "psraw $0x6,%%xmm1 \n"
  4795. "psraw $0x6,%%xmm6 \n"
  4796. "packuswb %%xmm1,%%xmm1 \n"
  4797. "packuswb %%xmm6,%%xmm6 \n"
  4798. "punpcklbw %%xmm6,%%xmm1 \n"
  4799. "movdqa %%xmm0,%%xmm6 \n"
  4800. "punpcklwd %%xmm1,%%xmm0 \n"
  4801. "punpckhwd %%xmm1,%%xmm6 \n"
  4802. "movdqu %%xmm0,(%1) \n"
  4803. "movdqu %%xmm6,0x10(%1) \n"
  4804. "lea 0x20(%0),%0 \n"
  4805. "lea 0x20(%1),%1 \n"
  4806. "sub $0x8,%2 \n"
  4807. "jg 1b \n"
  4808. : "+r"(src_argb), // %0
  4809. "+r"(dst_argb), // %1
  4810. "+r"(width) // %2
  4811. : "r"(matrix_argb) // %3
  4812. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4813. "xmm7");
  4814. }
  4815. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
  4816. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4817. // Quantize 4 ARGB pixels (16 bytes).
  4818. void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
  4819. int scale,
  4820. int interval_size,
  4821. int interval_offset,
  4822. int width) {
  4823. asm volatile(
  4824. "movd %2,%%xmm2 \n"
  4825. "movd %3,%%xmm3 \n"
  4826. "movd %4,%%xmm4 \n"
  4827. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4828. "pshufd $0x44,%%xmm2,%%xmm2 \n"
  4829. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4830. "pshufd $0x44,%%xmm3,%%xmm3 \n"
  4831. "pshuflw $0x40,%%xmm4,%%xmm4 \n"
  4832. "pshufd $0x44,%%xmm4,%%xmm4 \n"
  4833. "pxor %%xmm5,%%xmm5 \n"
  4834. "pcmpeqb %%xmm6,%%xmm6 \n"
  4835. "pslld $0x18,%%xmm6 \n"
  4836. // 4 pixel loop.
  4837. LABELALIGN
  4838. "1: \n"
  4839. "movdqu (%0),%%xmm0 \n"
  4840. "punpcklbw %%xmm5,%%xmm0 \n"
  4841. "pmulhuw %%xmm2,%%xmm0 \n"
  4842. "movdqu (%0),%%xmm1 \n"
  4843. "punpckhbw %%xmm5,%%xmm1 \n"
  4844. "pmulhuw %%xmm2,%%xmm1 \n"
  4845. "pmullw %%xmm3,%%xmm0 \n"
  4846. "movdqu (%0),%%xmm7 \n"
  4847. "pmullw %%xmm3,%%xmm1 \n"
  4848. "pand %%xmm6,%%xmm7 \n"
  4849. "paddw %%xmm4,%%xmm0 \n"
  4850. "paddw %%xmm4,%%xmm1 \n"
  4851. "packuswb %%xmm1,%%xmm0 \n"
  4852. "por %%xmm7,%%xmm0 \n"
  4853. "movdqu %%xmm0,(%0) \n"
  4854. "lea 0x10(%0),%0 \n"
  4855. "sub $0x4,%1 \n"
  4856. "jg 1b \n"
  4857. : "+r"(dst_argb), // %0
  4858. "+r"(width) // %1
  4859. : "r"(scale), // %2
  4860. "r"(interval_size), // %3
  4861. "r"(interval_offset) // %4
  4862. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4863. "xmm7");
  4864. }
  4865. #endif // HAS_ARGBQUANTIZEROW_SSE2
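// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the per-channel math in ARGBQuantizeRow_SSE2 above: scale the channel by
// a 16-bit fixed-point 'scale' (keeping the high 16 bits of the product, as
// pmulhuw does), multiply by the interval size and add the interval offset.
// Alpha is preserved by the row code.
static inline uint8_t QuantizeChannel_ScalarSketch(uint8_t c, int scale,
                                                   int interval_size,
                                                   int interval_offset) {
  int v = ((c * scale) >> 16) * interval_size + interval_offset;
  return (uint8_t)(v > 255 ? 255 : v);  // packuswb saturation
}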
  4866. #ifdef HAS_ARGBSHADEROW_SSE2
  4867. // Shade 4 pixels at a time by specified value.
  4868. void ARGBShadeRow_SSE2(const uint8_t* src_argb,
  4869. uint8_t* dst_argb,
  4870. int width,
  4871. uint32_t value) {
  4872. asm volatile(
  4873. "movd %3,%%xmm2 \n"
  4874. "punpcklbw %%xmm2,%%xmm2 \n"
  4875. "punpcklqdq %%xmm2,%%xmm2 \n"
  4876. // 4 pixel loop.
  4877. LABELALIGN
  4878. "1: \n"
  4879. "movdqu (%0),%%xmm0 \n"
  4880. "lea 0x10(%0),%0 \n"
  4881. "movdqa %%xmm0,%%xmm1 \n"
  4882. "punpcklbw %%xmm0,%%xmm0 \n"
  4883. "punpckhbw %%xmm1,%%xmm1 \n"
  4884. "pmulhuw %%xmm2,%%xmm0 \n"
  4885. "pmulhuw %%xmm2,%%xmm1 \n"
  4886. "psrlw $0x8,%%xmm0 \n"
  4887. "psrlw $0x8,%%xmm1 \n"
  4888. "packuswb %%xmm1,%%xmm0 \n"
  4889. "movdqu %%xmm0,(%1) \n"
  4890. "lea 0x10(%1),%1 \n"
  4891. "sub $0x4,%2 \n"
  4892. "jg 1b \n"
  4893. : "+r"(src_argb), // %0
  4894. "+r"(dst_argb), // %1
  4895. "+r"(width) // %2
  4896. : "r"(value) // %3
  4897. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  4898. }
  4899. #endif // HAS_ARGBSHADEROW_SSE2
  4900. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  4901. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  4902. void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
  4903. const uint8_t* src_argb1,
  4904. uint8_t* dst_argb,
  4905. int width) {
  4906. asm volatile(
  4907. "pxor %%xmm5,%%xmm5 \n"
  4908. // 4 pixel loop.
  4909. LABELALIGN
  4910. "1: \n"
  4911. "movdqu (%0),%%xmm0 \n"
  4912. "lea 0x10(%0),%0 \n"
  4913. "movdqu (%1),%%xmm2 \n"
  4914. "lea 0x10(%1),%1 \n"
  4915. "movdqu %%xmm0,%%xmm1 \n"
  4916. "movdqu %%xmm2,%%xmm3 \n"
  4917. "punpcklbw %%xmm0,%%xmm0 \n"
  4918. "punpckhbw %%xmm1,%%xmm1 \n"
  4919. "punpcklbw %%xmm5,%%xmm2 \n"
  4920. "punpckhbw %%xmm5,%%xmm3 \n"
  4921. "pmulhuw %%xmm2,%%xmm0 \n"
  4922. "pmulhuw %%xmm3,%%xmm1 \n"
  4923. "packuswb %%xmm1,%%xmm0 \n"
  4924. "movdqu %%xmm0,(%2) \n"
  4925. "lea 0x10(%2),%2 \n"
  4926. "sub $0x4,%3 \n"
  4927. "jg 1b \n"
  4928. : "+r"(src_argb0), // %0
  4929. "+r"(src_argb1), // %1
  4930. "+r"(dst_argb), // %2
  4931. "+r"(width) // %3
  4932. :
  4933. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  4934. }
  4935. #endif // HAS_ARGBMULTIPLYROW_SSE2
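// Illustrative scalar sketch (not part of libyuv; hypothetical helper name):
// the multiply rows compute roughly dst = (a * b) / 255 per channel.  The
// SSE2/AVX2 paths above use pmulhuw on a byte-duplicated operand, so their
// rounding can be off by one compared with this sketch.
static inline uint8_t MultiplyChannel_ScalarSketch(uint8_t a, uint8_t b) {
  return (uint8_t)((a * b + 127) / 255);
}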
  4936. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  4937. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  4938. void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
  4939. const uint8_t* src_argb1,
  4940. uint8_t* dst_argb,
  4941. int width) {
  4942. asm volatile(
  4943. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
4944. // 8 pixel loop.
  4945. LABELALIGN
  4946. "1: \n"
  4947. "vmovdqu (%0),%%ymm1 \n"
  4948. "lea 0x20(%0),%0 \n"
  4949. "vmovdqu (%1),%%ymm3 \n"
  4950. "lea 0x20(%1),%1 \n"
  4951. "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
  4952. "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
  4953. "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
  4954. "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
  4955. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4956. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4957. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4958. "vmovdqu %%ymm0,(%2) \n"
  4959. "lea 0x20(%2),%2 \n"
  4960. "sub $0x8,%3 \n"
  4961. "jg 1b \n"
  4962. "vzeroupper \n"
  4963. : "+r"(src_argb0), // %0
  4964. "+r"(src_argb1), // %1
  4965. "+r"(dst_argb), // %2
  4966. "+r"(width) // %3
  4967. :
  4968. : "memory", "cc"
  4969. #if defined(__AVX2__)
  4970. ,
  4971. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4972. #endif
  4973. );
  4974. }
  4975. #endif // HAS_ARGBMULTIPLYROW_AVX2
  4976. #ifdef HAS_ARGBADDROW_SSE2
  4977. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  4978. void ARGBAddRow_SSE2(const uint8_t* src_argb0,
  4979. const uint8_t* src_argb1,
  4980. uint8_t* dst_argb,
  4981. int width) {
  4982. asm volatile(
  4983. // 4 pixel loop.
  4984. LABELALIGN
  4985. "1: \n"
  4986. "movdqu (%0),%%xmm0 \n"
  4987. "lea 0x10(%0),%0 \n"
  4988. "movdqu (%1),%%xmm1 \n"
  4989. "lea 0x10(%1),%1 \n"
  4990. "paddusb %%xmm1,%%xmm0 \n"
  4991. "movdqu %%xmm0,(%2) \n"
  4992. "lea 0x10(%2),%2 \n"
  4993. "sub $0x4,%3 \n"
  4994. "jg 1b \n"
  4995. : "+r"(src_argb0), // %0
  4996. "+r"(src_argb1), // %1
  4997. "+r"(dst_argb), // %2
  4998. "+r"(width) // %3
  4999. :
  5000. : "memory", "cc", "xmm0", "xmm1");
  5001. }
  5002. #endif // HAS_ARGBADDROW_SSE2
  5003. #ifdef HAS_ARGBADDROW_AVX2
5004. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  5005. void ARGBAddRow_AVX2(const uint8_t* src_argb0,
  5006. const uint8_t* src_argb1,
  5007. uint8_t* dst_argb,
  5008. int width) {
  5009. asm volatile(
5010. // 8 pixel loop.
  5011. LABELALIGN
  5012. "1: \n"
  5013. "vmovdqu (%0),%%ymm0 \n"
  5014. "lea 0x20(%0),%0 \n"
  5015. "vpaddusb (%1),%%ymm0,%%ymm0 \n"
  5016. "lea 0x20(%1),%1 \n"
  5017. "vmovdqu %%ymm0,(%2) \n"
  5018. "lea 0x20(%2),%2 \n"
  5019. "sub $0x8,%3 \n"
  5020. "jg 1b \n"
  5021. "vzeroupper \n"
  5022. : "+r"(src_argb0), // %0
  5023. "+r"(src_argb1), // %1
  5024. "+r"(dst_argb), // %2
  5025. "+r"(width) // %3
  5026. :
  5027. : "memory", "cc", "xmm0");
  5028. }
  5029. #endif // HAS_ARGBADDROW_AVX2
  5030. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  5031. // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
  5032. void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
  5033. const uint8_t* src_argb1,
  5034. uint8_t* dst_argb,
  5035. int width) {
  5036. asm volatile(
  5037. // 4 pixel loop.
  5038. LABELALIGN
  5039. "1: \n"
  5040. "movdqu (%0),%%xmm0 \n"
  5041. "lea 0x10(%0),%0 \n"
  5042. "movdqu (%1),%%xmm1 \n"
  5043. "lea 0x10(%1),%1 \n"
  5044. "psubusb %%xmm1,%%xmm0 \n"
  5045. "movdqu %%xmm0,(%2) \n"
  5046. "lea 0x10(%2),%2 \n"
  5047. "sub $0x4,%3 \n"
  5048. "jg 1b \n"
  5049. : "+r"(src_argb0), // %0
  5050. "+r"(src_argb1), // %1
  5051. "+r"(dst_argb), // %2
  5052. "+r"(width) // %3
  5053. :
  5054. : "memory", "cc", "xmm0", "xmm1");
  5055. }
  5056. #endif // HAS_ARGBSUBTRACTROW_SSE2
  5057. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  5058. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  5059. void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
  5060. const uint8_t* src_argb1,
  5061. uint8_t* dst_argb,
  5062. int width) {
  5063. asm volatile(
5064. // 8 pixel loop.
  5065. LABELALIGN
  5066. "1: \n"
  5067. "vmovdqu (%0),%%ymm0 \n"
  5068. "lea 0x20(%0),%0 \n"
  5069. "vpsubusb (%1),%%ymm0,%%ymm0 \n"
  5070. "lea 0x20(%1),%1 \n"
  5071. "vmovdqu %%ymm0,(%2) \n"
  5072. "lea 0x20(%2),%2 \n"
  5073. "sub $0x8,%3 \n"
  5074. "jg 1b \n"
  5075. "vzeroupper \n"
  5076. : "+r"(src_argb0), // %0
  5077. "+r"(src_argb1), // %1
  5078. "+r"(dst_argb), // %2
  5079. "+r"(width) // %3
  5080. :
  5081. : "memory", "cc", "xmm0");
  5082. }
  5083. #endif // HAS_ARGBSUBTRACTROW_AVX2
  5084. #ifdef HAS_SOBELXROW_SSE2
  5085. // SobelX as a matrix is
  5086. // -1 0 1
  5087. // -2 0 2
  5088. // -1 0 1
  5089. void SobelXRow_SSE2(const uint8_t* src_y0,
  5090. const uint8_t* src_y1,
  5091. const uint8_t* src_y2,
  5092. uint8_t* dst_sobelx,
  5093. int width) {
  5094. asm volatile(
  5095. "sub %0,%1 \n"
  5096. "sub %0,%2 \n"
  5097. "sub %0,%3 \n"
  5098. "pxor %%xmm5,%%xmm5 \n"
  5099. // 8 pixel loop.
  5100. LABELALIGN
  5101. "1: \n"
  5102. "movq (%0),%%xmm0 \n"
  5103. "movq 0x2(%0),%%xmm1 \n"
  5104. "punpcklbw %%xmm5,%%xmm0 \n"
  5105. "punpcklbw %%xmm5,%%xmm1 \n"
  5106. "psubw %%xmm1,%%xmm0 \n"
  5107. "movq 0x00(%0,%1,1),%%xmm1 \n"
  5108. "movq 0x02(%0,%1,1),%%xmm2 \n"
  5109. "punpcklbw %%xmm5,%%xmm1 \n"
  5110. "punpcklbw %%xmm5,%%xmm2 \n"
  5111. "psubw %%xmm2,%%xmm1 \n"
  5112. "movq 0x00(%0,%2,1),%%xmm2 \n"
  5113. "movq 0x02(%0,%2,1),%%xmm3 \n"
  5114. "punpcklbw %%xmm5,%%xmm2 \n"
  5115. "punpcklbw %%xmm5,%%xmm3 \n"
  5116. "psubw %%xmm3,%%xmm2 \n"
  5117. "paddw %%xmm2,%%xmm0 \n"
  5118. "paddw %%xmm1,%%xmm0 \n"
  5119. "paddw %%xmm1,%%xmm0 \n"
  5120. "pxor %%xmm1,%%xmm1 \n"
  5121. "psubw %%xmm0,%%xmm1 \n"
  5122. "pmaxsw %%xmm1,%%xmm0 \n"
  5123. "packuswb %%xmm0,%%xmm0 \n"
  5124. "movq %%xmm0,0x00(%0,%3,1) \n"
  5125. "lea 0x8(%0),%0 \n"
  5126. "sub $0x8,%4 \n"
  5127. "jg 1b \n"
  5128. : "+r"(src_y0), // %0
  5129. "+r"(src_y1), // %1
  5130. "+r"(src_y2), // %2
  5131. "+r"(dst_sobelx), // %3
  5132. "+r"(width) // %4
  5133. :
  5134. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5135. }
  5136. #endif // HAS_SOBELXROW_SSE2
  5137. #ifdef HAS_SOBELYROW_SSE2
  5138. // SobelY as a matrix is
  5139. // -1 -2 -1
  5140. // 0 0 0
  5141. // 1 2 1
  5142. void SobelYRow_SSE2(const uint8_t* src_y0,
  5143. const uint8_t* src_y1,
  5144. uint8_t* dst_sobely,
  5145. int width) {
  5146. asm volatile(
  5147. "sub %0,%1 \n"
  5148. "sub %0,%2 \n"
  5149. "pxor %%xmm5,%%xmm5 \n"
  5150. // 8 pixel loop.
  5151. LABELALIGN
  5152. "1: \n"
  5153. "movq (%0),%%xmm0 \n"
  5154. "movq 0x00(%0,%1,1),%%xmm1 \n"
  5155. "punpcklbw %%xmm5,%%xmm0 \n"
  5156. "punpcklbw %%xmm5,%%xmm1 \n"
  5157. "psubw %%xmm1,%%xmm0 \n"
  5158. "movq 0x1(%0),%%xmm1 \n"
  5159. "movq 0x01(%0,%1,1),%%xmm2 \n"
  5160. "punpcklbw %%xmm5,%%xmm1 \n"
  5161. "punpcklbw %%xmm5,%%xmm2 \n"
  5162. "psubw %%xmm2,%%xmm1 \n"
  5163. "movq 0x2(%0),%%xmm2 \n"
  5164. "movq 0x02(%0,%1,1),%%xmm3 \n"
  5165. "punpcklbw %%xmm5,%%xmm2 \n"
  5166. "punpcklbw %%xmm5,%%xmm3 \n"
  5167. "psubw %%xmm3,%%xmm2 \n"
  5168. "paddw %%xmm2,%%xmm0 \n"
  5169. "paddw %%xmm1,%%xmm0 \n"
  5170. "paddw %%xmm1,%%xmm0 \n"
  5171. "pxor %%xmm1,%%xmm1 \n"
  5172. "psubw %%xmm0,%%xmm1 \n"
  5173. "pmaxsw %%xmm1,%%xmm0 \n"
  5174. "packuswb %%xmm0,%%xmm0 \n"
  5175. "movq %%xmm0,0x00(%0,%2,1) \n"
  5176. "lea 0x8(%0),%0 \n"
  5177. "sub $0x8,%3 \n"
  5178. "jg 1b \n"
  5179. : "+r"(src_y0), // %0
  5180. "+r"(src_y1), // %1
  5181. "+r"(dst_sobely), // %2
  5182. "+r"(width) // %3
  5183. :
  5184. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5185. }
  5186. #endif // HAS_SOBELYROW_SSE2
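// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of one SobelX output byte, using the 3x3 kernel documented above on rows
// y0/y1/y2 and columns x and x+2, then taking the absolute value (the sign
// cancels against the kernel's orientation) and saturating to 8 bits.
// SobelY applies the transposed kernel in the same way.
static inline uint8_t SobelX_ScalarSketch(const uint8_t* y0,
                                          const uint8_t* y1,
                                          const uint8_t* y2,
                                          int x) {
  int sobel =
      (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) + (y2[x] - y2[x + 2]);
  if (sobel < 0) {
    sobel = -sobel;
  }
  return (uint8_t)(sobel > 255 ? 255 : sobel);
}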
  5187. #ifdef HAS_SOBELROW_SSE2
  5188. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  5189. // A = 255
  5190. // R = Sobel
  5191. // G = Sobel
  5192. // B = Sobel
  5193. void SobelRow_SSE2(const uint8_t* src_sobelx,
  5194. const uint8_t* src_sobely,
  5195. uint8_t* dst_argb,
  5196. int width) {
  5197. asm volatile(
  5198. "sub %0,%1 \n"
  5199. "pcmpeqb %%xmm5,%%xmm5 \n"
  5200. "pslld $0x18,%%xmm5 \n"
5201. // 16 pixel loop.
  5202. LABELALIGN
  5203. "1: \n"
  5204. "movdqu (%0),%%xmm0 \n"
  5205. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5206. "lea 0x10(%0),%0 \n"
  5207. "paddusb %%xmm1,%%xmm0 \n"
  5208. "movdqa %%xmm0,%%xmm2 \n"
  5209. "punpcklbw %%xmm0,%%xmm2 \n"
  5210. "punpckhbw %%xmm0,%%xmm0 \n"
  5211. "movdqa %%xmm2,%%xmm1 \n"
  5212. "punpcklwd %%xmm2,%%xmm1 \n"
  5213. "punpckhwd %%xmm2,%%xmm2 \n"
  5214. "por %%xmm5,%%xmm1 \n"
  5215. "por %%xmm5,%%xmm2 \n"
  5216. "movdqa %%xmm0,%%xmm3 \n"
  5217. "punpcklwd %%xmm0,%%xmm3 \n"
  5218. "punpckhwd %%xmm0,%%xmm0 \n"
  5219. "por %%xmm5,%%xmm3 \n"
  5220. "por %%xmm5,%%xmm0 \n"
  5221. "movdqu %%xmm1,(%2) \n"
  5222. "movdqu %%xmm2,0x10(%2) \n"
  5223. "movdqu %%xmm3,0x20(%2) \n"
  5224. "movdqu %%xmm0,0x30(%2) \n"
  5225. "lea 0x40(%2),%2 \n"
  5226. "sub $0x10,%3 \n"
  5227. "jg 1b \n"
  5228. : "+r"(src_sobelx), // %0
  5229. "+r"(src_sobely), // %1
  5230. "+r"(dst_argb), // %2
  5231. "+r"(width) // %3
  5232. :
  5233. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5234. }
  5235. #endif // HAS_SOBELROW_SSE2
  5236. #ifdef HAS_SOBELTOPLANEROW_SSE2
  5237. // Adds Sobel X and Sobel Y and stores Sobel into a plane.
  5238. void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
  5239. const uint8_t* src_sobely,
  5240. uint8_t* dst_y,
  5241. int width) {
  5242. asm volatile(
  5243. "sub %0,%1 \n"
  5244. "pcmpeqb %%xmm5,%%xmm5 \n"
  5245. "pslld $0x18,%%xmm5 \n"
5246. // 16 pixel loop.
  5247. LABELALIGN
  5248. "1: \n"
  5249. "movdqu (%0),%%xmm0 \n"
  5250. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5251. "lea 0x10(%0),%0 \n"
  5252. "paddusb %%xmm1,%%xmm0 \n"
  5253. "movdqu %%xmm0,(%2) \n"
  5254. "lea 0x10(%2),%2 \n"
  5255. "sub $0x10,%3 \n"
  5256. "jg 1b \n"
  5257. : "+r"(src_sobelx), // %0
  5258. "+r"(src_sobely), // %1
  5259. "+r"(dst_y), // %2
  5260. "+r"(width) // %3
  5261. :
  5262. : "memory", "cc", "xmm0", "xmm1");
  5263. }
  5264. #endif // HAS_SOBELTOPLANEROW_SSE2
  5265. #ifdef HAS_SOBELXYROW_SSE2
  5266. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  5267. // A = 255
  5268. // R = Sobel X
  5269. // G = Sobel
  5270. // B = Sobel Y
  5271. void SobelXYRow_SSE2(const uint8_t* src_sobelx,
  5272. const uint8_t* src_sobely,
  5273. uint8_t* dst_argb,
  5274. int width) {
  5275. asm volatile(
  5276. "sub %0,%1 \n"
  5277. "pcmpeqb %%xmm5,%%xmm5 \n"
5278. // 16 pixel loop.
  5279. LABELALIGN
  5280. "1: \n"
  5281. "movdqu (%0),%%xmm0 \n"
  5282. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5283. "lea 0x10(%0),%0 \n"
  5284. "movdqa %%xmm0,%%xmm2 \n"
  5285. "paddusb %%xmm1,%%xmm2 \n"
  5286. "movdqa %%xmm0,%%xmm3 \n"
  5287. "punpcklbw %%xmm5,%%xmm3 \n"
  5288. "punpckhbw %%xmm5,%%xmm0 \n"
  5289. "movdqa %%xmm1,%%xmm4 \n"
  5290. "punpcklbw %%xmm2,%%xmm4 \n"
  5291. "punpckhbw %%xmm2,%%xmm1 \n"
  5292. "movdqa %%xmm4,%%xmm6 \n"
  5293. "punpcklwd %%xmm3,%%xmm6 \n"
  5294. "punpckhwd %%xmm3,%%xmm4 \n"
  5295. "movdqa %%xmm1,%%xmm7 \n"
  5296. "punpcklwd %%xmm0,%%xmm7 \n"
  5297. "punpckhwd %%xmm0,%%xmm1 \n"
  5298. "movdqu %%xmm6,(%2) \n"
  5299. "movdqu %%xmm4,0x10(%2) \n"
  5300. "movdqu %%xmm7,0x20(%2) \n"
  5301. "movdqu %%xmm1,0x30(%2) \n"
  5302. "lea 0x40(%2),%2 \n"
  5303. "sub $0x10,%3 \n"
  5304. "jg 1b \n"
  5305. : "+r"(src_sobelx), // %0
  5306. "+r"(src_sobely), // %1
  5307. "+r"(dst_argb), // %2
  5308. "+r"(width) // %3
  5309. :
  5310. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  5311. "xmm7");
  5312. }
  5313. #endif // HAS_SOBELXYROW_SSE2
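// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the channel packing documented above SobelXYRow_SSE2, writing one ARGB
// pixel stored as B,G,R,A in memory.  SobelRow_SSE2 is the simpler case where
// the saturating sum is written to B, G and R alike.
static inline void SobelXYPixel_ScalarSketch(uint8_t sobelx, uint8_t sobely,
                                             uint8_t* dst_argb) {
  int sum = sobelx + sobely;
  uint8_t sobel = (uint8_t)(sum > 255 ? 255 : sum);
  dst_argb[0] = sobely;  // B = Sobel Y
  dst_argb[1] = sobel;   // G = combined Sobel
  dst_argb[2] = sobelx;  // R = Sobel X
  dst_argb[3] = 255;     // A
}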
  5314. #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  5315. // Creates a table of cumulative sums where each value is a sum of all values
  5316. // above and to the left of the value, inclusive of the value.
  5317. void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
  5318. int32_t* cumsum,
  5319. const int32_t* previous_cumsum,
  5320. int width) {
  5321. asm volatile(
  5322. "pxor %%xmm0,%%xmm0 \n"
  5323. "pxor %%xmm1,%%xmm1 \n"
  5324. "sub $0x4,%3 \n"
  5325. "jl 49f \n"
  5326. "test $0xf,%1 \n"
  5327. "jne 49f \n"
  5328. // 4 pixel loop.
  5329. LABELALIGN
  5330. "40: \n"
  5331. "movdqu (%0),%%xmm2 \n"
  5332. "lea 0x10(%0),%0 \n"
  5333. "movdqa %%xmm2,%%xmm4 \n"
  5334. "punpcklbw %%xmm1,%%xmm2 \n"
  5335. "movdqa %%xmm2,%%xmm3 \n"
  5336. "punpcklwd %%xmm1,%%xmm2 \n"
  5337. "punpckhwd %%xmm1,%%xmm3 \n"
  5338. "punpckhbw %%xmm1,%%xmm4 \n"
  5339. "movdqa %%xmm4,%%xmm5 \n"
  5340. "punpcklwd %%xmm1,%%xmm4 \n"
  5341. "punpckhwd %%xmm1,%%xmm5 \n"
  5342. "paddd %%xmm2,%%xmm0 \n"
  5343. "movdqu (%2),%%xmm2 \n"
  5344. "paddd %%xmm0,%%xmm2 \n"
  5345. "paddd %%xmm3,%%xmm0 \n"
  5346. "movdqu 0x10(%2),%%xmm3 \n"
  5347. "paddd %%xmm0,%%xmm3 \n"
  5348. "paddd %%xmm4,%%xmm0 \n"
  5349. "movdqu 0x20(%2),%%xmm4 \n"
  5350. "paddd %%xmm0,%%xmm4 \n"
  5351. "paddd %%xmm5,%%xmm0 \n"
  5352. "movdqu 0x30(%2),%%xmm5 \n"
  5353. "lea 0x40(%2),%2 \n"
  5354. "paddd %%xmm0,%%xmm5 \n"
  5355. "movdqu %%xmm2,(%1) \n"
  5356. "movdqu %%xmm3,0x10(%1) \n"
  5357. "movdqu %%xmm4,0x20(%1) \n"
  5358. "movdqu %%xmm5,0x30(%1) \n"
  5359. "lea 0x40(%1),%1 \n"
  5360. "sub $0x4,%3 \n"
  5361. "jge 40b \n"
  5362. "49: \n"
  5363. "add $0x3,%3 \n"
  5364. "jl 19f \n"
  5365. // 1 pixel loop.
  5366. LABELALIGN
  5367. "10: \n"
  5368. "movd (%0),%%xmm2 \n"
  5369. "lea 0x4(%0),%0 \n"
  5370. "punpcklbw %%xmm1,%%xmm2 \n"
  5371. "punpcklwd %%xmm1,%%xmm2 \n"
  5372. "paddd %%xmm2,%%xmm0 \n"
  5373. "movdqu (%2),%%xmm2 \n"
  5374. "lea 0x10(%2),%2 \n"
  5375. "paddd %%xmm0,%%xmm2 \n"
  5376. "movdqu %%xmm2,(%1) \n"
  5377. "lea 0x10(%1),%1 \n"
  5378. "sub $0x1,%3 \n"
  5379. "jge 10b \n"
  5380. "19: \n"
  5381. : "+r"(row), // %0
  5382. "+r"(cumsum), // %1
  5383. "+r"(previous_cumsum), // %2
  5384. "+r"(width) // %3
  5385. :
  5386. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  5387. }
  5388. #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
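// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the integral-image row above: each 32-bit output is the running sum of
// this row's bytes (per interleaved channel) plus the value directly above it
// in the previous row's cumulative sums.  Assumes 4 interleaved channels per
// pixel, matching the SIMD row.
static inline void ComputeCumulativeSumRow_ScalarSketch(
    const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum,
    int width) {
  int32_t running[4] = {0, 0, 0, 0};
  int x;
  int c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      running[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = running[c] + previous_cumsum[x * 4 + c];
    }
  }
}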
  5389. #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5390. void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
  5391. const int32_t* botleft,
  5392. int width,
  5393. int area,
  5394. uint8_t* dst,
  5395. int count) {
  5396. asm volatile(
  5397. "movd %5,%%xmm5 \n"
  5398. "cvtdq2ps %%xmm5,%%xmm5 \n"
  5399. "rcpss %%xmm5,%%xmm4 \n"
  5400. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  5401. "sub $0x4,%3 \n"
  5402. "jl 49f \n"
  5403. "cmpl $0x80,%5 \n"
  5404. "ja 40f \n"
  5405. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5406. "pcmpeqb %%xmm6,%%xmm6 \n"
  5407. "psrld $0x10,%%xmm6 \n"
  5408. "cvtdq2ps %%xmm6,%%xmm6 \n"
  5409. "addps %%xmm6,%%xmm5 \n"
  5410. "mulps %%xmm4,%%xmm5 \n"
  5411. "cvtps2dq %%xmm5,%%xmm5 \n"
  5412. "packssdw %%xmm5,%%xmm5 \n"
  5413. // 4 pixel small loop.
  5414. LABELALIGN
  5415. "4: \n"
  5416. "movdqu (%0),%%xmm0 \n"
  5417. "movdqu 0x10(%0),%%xmm1 \n"
  5418. "movdqu 0x20(%0),%%xmm2 \n"
  5419. "movdqu 0x30(%0),%%xmm3 \n"
  5420. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5421. "psubd 0x10(%0,%4,4),%%xmm1 \n"
  5422. "psubd 0x20(%0,%4,4),%%xmm2 \n"
  5423. "psubd 0x30(%0,%4,4),%%xmm3 \n"
  5424. "lea 0x40(%0),%0 \n"
  5425. "psubd (%1),%%xmm0 \n"
  5426. "psubd 0x10(%1),%%xmm1 \n"
  5427. "psubd 0x20(%1),%%xmm2 \n"
  5428. "psubd 0x30(%1),%%xmm3 \n"
  5429. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5430. "paddd 0x10(%1,%4,4),%%xmm1 \n"
  5431. "paddd 0x20(%1,%4,4),%%xmm2 \n"
  5432. "paddd 0x30(%1,%4,4),%%xmm3 \n"
  5433. "lea 0x40(%1),%1 \n"
  5434. "packssdw %%xmm1,%%xmm0 \n"
  5435. "packssdw %%xmm3,%%xmm2 \n"
  5436. "pmulhuw %%xmm5,%%xmm0 \n"
  5437. "pmulhuw %%xmm5,%%xmm2 \n"
  5438. "packuswb %%xmm2,%%xmm0 \n"
  5439. "movdqu %%xmm0,(%2) \n"
  5440. "lea 0x10(%2),%2 \n"
  5441. "sub $0x4,%3 \n"
  5442. "jge 4b \n"
  5443. "jmp 49f \n"
  5444. // 4 pixel loop
  5445. LABELALIGN
  5446. "40: \n"
  5447. "movdqu (%0),%%xmm0 \n"
  5448. "movdqu 0x10(%0),%%xmm1 \n"
  5449. "movdqu 0x20(%0),%%xmm2 \n"
  5450. "movdqu 0x30(%0),%%xmm3 \n"
  5451. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5452. "psubd 0x10(%0,%4,4),%%xmm1 \n"
  5453. "psubd 0x20(%0,%4,4),%%xmm2 \n"
  5454. "psubd 0x30(%0,%4,4),%%xmm3 \n"
  5455. "lea 0x40(%0),%0 \n"
  5456. "psubd (%1),%%xmm0 \n"
  5457. "psubd 0x10(%1),%%xmm1 \n"
  5458. "psubd 0x20(%1),%%xmm2 \n"
  5459. "psubd 0x30(%1),%%xmm3 \n"
  5460. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5461. "paddd 0x10(%1,%4,4),%%xmm1 \n"
  5462. "paddd 0x20(%1,%4,4),%%xmm2 \n"
  5463. "paddd 0x30(%1,%4,4),%%xmm3 \n"
  5464. "lea 0x40(%1),%1 \n"
  5465. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5466. "cvtdq2ps %%xmm1,%%xmm1 \n"
  5467. "mulps %%xmm4,%%xmm0 \n"
  5468. "mulps %%xmm4,%%xmm1 \n"
  5469. "cvtdq2ps %%xmm2,%%xmm2 \n"
  5470. "cvtdq2ps %%xmm3,%%xmm3 \n"
  5471. "mulps %%xmm4,%%xmm2 \n"
  5472. "mulps %%xmm4,%%xmm3 \n"
  5473. "cvtps2dq %%xmm0,%%xmm0 \n"
  5474. "cvtps2dq %%xmm1,%%xmm1 \n"
  5475. "cvtps2dq %%xmm2,%%xmm2 \n"
  5476. "cvtps2dq %%xmm3,%%xmm3 \n"
  5477. "packssdw %%xmm1,%%xmm0 \n"
  5478. "packssdw %%xmm3,%%xmm2 \n"
  5479. "packuswb %%xmm2,%%xmm0 \n"
  5480. "movdqu %%xmm0,(%2) \n"
  5481. "lea 0x10(%2),%2 \n"
  5482. "sub $0x4,%3 \n"
  5483. "jge 40b \n"
  5484. "49: \n"
  5485. "add $0x3,%3 \n"
  5486. "jl 19f \n"
  5487. // 1 pixel loop
  5488. LABELALIGN
  5489. "10: \n"
  5490. "movdqu (%0),%%xmm0 \n"
  5491. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5492. "lea 0x10(%0),%0 \n"
  5493. "psubd (%1),%%xmm0 \n"
  5494. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5495. "lea 0x10(%1),%1 \n"
  5496. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5497. "mulps %%xmm4,%%xmm0 \n"
  5498. "cvtps2dq %%xmm0,%%xmm0 \n"
  5499. "packssdw %%xmm0,%%xmm0 \n"
  5500. "packuswb %%xmm0,%%xmm0 \n"
  5501. "movd %%xmm0,(%2) \n"
  5502. "lea 0x4(%2),%2 \n"
  5503. "sub $0x1,%3 \n"
  5504. "jge 10b \n"
  5505. "19: \n"
  5506. : "+r"(topleft), // %0
  5507. "+r"(botleft), // %1
  5508. "+r"(dst), // %2
  5509. "+rm"(count) // %3
  5510. : "r"((intptr_t)(width)), // %4
  5511. "rm"(area) // %5
  5512. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  5513. }
  5514. #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5515. #ifdef HAS_ARGBAFFINEROW_SSE2
5516. // Copy ARGB pixels from the source image, sampling along a (u,v) line with slope (du,dv), to a row of the destination.
  5517. LIBYUV_API
  5518. void ARGBAffineRow_SSE2(const uint8_t* src_argb,
  5519. int src_argb_stride,
  5520. uint8_t* dst_argb,
  5521. const float* src_dudv,
  5522. int width) {
  5523. intptr_t src_argb_stride_temp = src_argb_stride;
  5524. intptr_t temp;
  5525. asm volatile(
  5526. "movq (%3),%%xmm2 \n"
  5527. "movq 0x08(%3),%%xmm7 \n"
  5528. "shl $0x10,%1 \n"
  5529. "add $0x4,%1 \n"
  5530. "movd %1,%%xmm5 \n"
  5531. "sub $0x4,%4 \n"
  5532. "jl 49f \n"
  5533. "pshufd $0x44,%%xmm7,%%xmm7 \n"
  5534. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5535. "movdqa %%xmm2,%%xmm0 \n"
  5536. "addps %%xmm7,%%xmm0 \n"
  5537. "movlhps %%xmm0,%%xmm2 \n"
  5538. "movdqa %%xmm7,%%xmm4 \n"
  5539. "addps %%xmm4,%%xmm4 \n"
  5540. "movdqa %%xmm2,%%xmm3 \n"
  5541. "addps %%xmm4,%%xmm3 \n"
  5542. "addps %%xmm4,%%xmm4 \n"
  5543. // 4 pixel loop
  5544. LABELALIGN
  5545. "40: \n"
  5546. "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
  5547. "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
  5548. "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
  5549. "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
  5550. "movd %%xmm0,%k1 \n"
  5551. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5552. "movd %%xmm0,%k5 \n"
  5553. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5554. "movd 0x00(%0,%1,1),%%xmm1 \n"
  5555. "movd 0x00(%0,%5,1),%%xmm6 \n"
  5556. "punpckldq %%xmm6,%%xmm1 \n"
  5557. "addps %%xmm4,%%xmm2 \n"
  5558. "movq %%xmm1,(%2) \n"
  5559. "movd %%xmm0,%k1 \n"
  5560. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5561. "movd %%xmm0,%k5 \n"
  5562. "movd 0x00(%0,%1,1),%%xmm0 \n"
  5563. "movd 0x00(%0,%5,1),%%xmm6 \n"
  5564. "punpckldq %%xmm6,%%xmm0 \n"
  5565. "addps %%xmm4,%%xmm3 \n"
  5566. "movq %%xmm0,0x08(%2) \n"
  5567. "lea 0x10(%2),%2 \n"
  5568. "sub $0x4,%4 \n"
  5569. "jge 40b \n"
  5570. "49: \n"
  5571. "add $0x3,%4 \n"
  5572. "jl 19f \n"
  5573. // 1 pixel loop
  5574. LABELALIGN
  5575. "10: \n"
  5576. "cvttps2dq %%xmm2,%%xmm0 \n"
  5577. "packssdw %%xmm0,%%xmm0 \n"
  5578. "pmaddwd %%xmm5,%%xmm0 \n"
  5579. "addps %%xmm7,%%xmm2 \n"
  5580. "movd %%xmm0,%k1 \n"
  5581. "movd 0x00(%0,%1,1),%%xmm0 \n"
  5582. "movd %%xmm0,(%2) \n"
  5583. "lea 0x04(%2),%2 \n"
  5584. "sub $0x1,%4 \n"
  5585. "jge 10b \n"
  5586. "19: \n"
  5587. : "+r"(src_argb), // %0
  5588. "+r"(src_argb_stride_temp), // %1
  5589. "+r"(dst_argb), // %2
  5590. "+r"(src_dudv), // %3
  5591. "+rm"(width), // %4
  5592. "=&r"(temp) // %5
  5593. :
  5594. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  5595. "xmm7");
  5596. }
  5597. #endif // HAS_ARGBAFFINEROW_SSE2
  5598. #ifdef HAS_INTERPOLATEROW_SSSE3
  5599. // Bilinear filter 16x2 -> 16x1
  5600. void InterpolateRow_SSSE3(uint8_t* dst_ptr,
  5601. const uint8_t* src_ptr,
  5602. ptrdiff_t src_stride,
  5603. int dst_width,
  5604. int source_y_fraction) {
  5605. asm volatile(
  5606. "sub %1,%0 \n"
  5607. "cmp $0x0,%3 \n"
  5608. "je 100f \n"
  5609. "cmp $0x80,%3 \n"
  5610. "je 50f \n"
  5611. "movd %3,%%xmm0 \n"
  5612. "neg %3 \n"
  5613. "add $0x100,%3 \n"
  5614. "movd %3,%%xmm5 \n"
  5615. "punpcklbw %%xmm0,%%xmm5 \n"
  5616. "punpcklwd %%xmm5,%%xmm5 \n"
  5617. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5618. "mov $0x80808080,%%eax \n"
  5619. "movd %%eax,%%xmm4 \n"
  5620. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  5621. // General purpose row blend.
  5622. LABELALIGN
  5623. "1: \n"
  5624. "movdqu (%1),%%xmm0 \n"
  5625. "movdqu 0x00(%1,%4,1),%%xmm2 \n"
  5626. "movdqa %%xmm0,%%xmm1 \n"
  5627. "punpcklbw %%xmm2,%%xmm0 \n"
  5628. "punpckhbw %%xmm2,%%xmm1 \n"
  5629. "psubb %%xmm4,%%xmm0 \n"
  5630. "psubb %%xmm4,%%xmm1 \n"
  5631. "movdqa %%xmm5,%%xmm2 \n"
  5632. "movdqa %%xmm5,%%xmm3 \n"
  5633. "pmaddubsw %%xmm0,%%xmm2 \n"
  5634. "pmaddubsw %%xmm1,%%xmm3 \n"
  5635. "paddw %%xmm4,%%xmm2 \n"
  5636. "paddw %%xmm4,%%xmm3 \n"
  5637. "psrlw $0x8,%%xmm2 \n"
  5638. "psrlw $0x8,%%xmm3 \n"
  5639. "packuswb %%xmm3,%%xmm2 \n"
  5640. "movdqu %%xmm2,0x00(%1,%0,1) \n"
  5641. "lea 0x10(%1),%1 \n"
  5642. "sub $0x10,%2 \n"
  5643. "jg 1b \n"
  5644. "jmp 99f \n"
  5645. // Blend 50 / 50.
  5646. LABELALIGN
  5647. "50: \n"
  5648. "movdqu (%1),%%xmm0 \n"
  5649. "movdqu 0x00(%1,%4,1),%%xmm1 \n"
  5650. "pavgb %%xmm1,%%xmm0 \n"
  5651. "movdqu %%xmm0,0x00(%1,%0,1) \n"
  5652. "lea 0x10(%1),%1 \n"
  5653. "sub $0x10,%2 \n"
  5654. "jg 50b \n"
  5655. "jmp 99f \n"
  5656. // Blend 100 / 0 - Copy row unchanged.
  5657. LABELALIGN
  5658. "100: \n"
  5659. "movdqu (%1),%%xmm0 \n"
  5660. "movdqu %%xmm0,0x00(%1,%0,1) \n"
  5661. "lea 0x10(%1),%1 \n"
  5662. "sub $0x10,%2 \n"
  5663. "jg 100b \n"
  5664. "99: \n"
  5665. : "+r"(dst_ptr), // %0
  5666. "+r"(src_ptr), // %1
  5667. "+rm"(dst_width), // %2
  5668. "+r"(source_y_fraction) // %3
  5669. : "r"((intptr_t)(src_stride)) // %4
  5670. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  5671. }
  5672. #endif // HAS_INTERPOLATEROW_SSSE3
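// Illustrative scalar sketch (not part of libyuv; hypothetical helper name)
// of the bilinear row filter above: a weighted average of vertically adjacent
// samples with weight source_y_fraction/256 on the second row, plus the 50/50
// and copy-row special cases.  The SSSE3/AVX2 paths bias the inputs and use
// pmaddubsw, so their rounding may differ marginally from this sketch.
static inline uint8_t InterpolatePixel_ScalarSketch(uint8_t s0, uint8_t s1,
                                                    int source_y_fraction) {
  if (source_y_fraction == 0) {
    return s0;  // copy row unchanged
  }
  if (source_y_fraction == 128) {
    return (uint8_t)((s0 + s1 + 1) >> 1);  // pavgb-style 50/50 blend
  }
  return (uint8_t)(
      (s0 * (256 - source_y_fraction) + s1 * source_y_fraction + 128) >> 8);
}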
  5673. #ifdef HAS_INTERPOLATEROW_AVX2
  5674. // Bilinear filter 32x2 -> 32x1
  5675. void InterpolateRow_AVX2(uint8_t* dst_ptr,
  5676. const uint8_t* src_ptr,
  5677. ptrdiff_t src_stride,
  5678. int dst_width,
  5679. int source_y_fraction) {
  5680. asm volatile(
  5681. "cmp $0x0,%3 \n"
  5682. "je 100f \n"
  5683. "sub %1,%0 \n"
  5684. "cmp $0x80,%3 \n"
  5685. "je 50f \n"
  5686. "vmovd %3,%%xmm0 \n"
  5687. "neg %3 \n"
  5688. "add $0x100,%3 \n"
  5689. "vmovd %3,%%xmm5 \n"
  5690. "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
  5691. "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
  5692. "vbroadcastss %%xmm5,%%ymm5 \n"
  5693. "mov $0x80808080,%%eax \n"
  5694. "vmovd %%eax,%%xmm4 \n"
  5695. "vbroadcastss %%xmm4,%%ymm4 \n"
  5696. // General purpose row blend.
  5697. LABELALIGN
  5698. "1: \n"
  5699. "vmovdqu (%1),%%ymm0 \n"
  5700. "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
  5701. "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
  5702. "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
  5703. "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
  5704. "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
  5705. "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
  5706. "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
  5707. "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
  5708. "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
  5709. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  5710. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  5711. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  5712. "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
  5713. "lea 0x20(%1),%1 \n"
  5714. "sub $0x20,%2 \n"
  5715. "jg 1b \n"
  5716. "jmp 99f \n"
  5717. // Blend 50 / 50.
  5718. LABELALIGN
  5719. "50: \n"
  5720. "vmovdqu (%1),%%ymm0 \n"
  5721. "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
  5722. "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
  5723. "lea 0x20(%1),%1 \n"
  5724. "sub $0x20,%2 \n"
  5725. "jg 50b \n"
  5726. "jmp 99f \n"
  5727. // Blend 100 / 0 - Copy row unchanged.
  5728. LABELALIGN
  5729. "100: \n"
  5730. "rep movsb \n"
  5731. "jmp 999f \n"
  5732. "99: \n"
  5733. "vzeroupper \n"
  5734. "999: \n"
  5735. : "+D"(dst_ptr), // %0
  5736. "+S"(src_ptr), // %1
  5737. "+cm"(dst_width), // %2
  5738. "+r"(source_y_fraction) // %3
  5739. : "r"((intptr_t)(src_stride)) // %4
  5740. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
  5741. }
  5742. #endif // HAS_INTERPOLATEROW_AVX2
  5743. #ifdef HAS_ARGBSHUFFLEROW_SSSE3
  5744. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5745. void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
  5746. uint8_t* dst_argb,
  5747. const uint8_t* shuffler,
  5748. int width) {
  5749. asm volatile(
  5750. "movdqu (%3),%%xmm5 \n"
  5751. LABELALIGN
  5752. "1: \n"
  5753. "movdqu (%0),%%xmm0 \n"
  5754. "movdqu 0x10(%0),%%xmm1 \n"
  5755. "lea 0x20(%0),%0 \n"
  5756. "pshufb %%xmm5,%%xmm0 \n"
  5757. "pshufb %%xmm5,%%xmm1 \n"
  5758. "movdqu %%xmm0,(%1) \n"
  5759. "movdqu %%xmm1,0x10(%1) \n"
  5760. "lea 0x20(%1),%1 \n"
  5761. "sub $0x8,%2 \n"
  5762. "jg 1b \n"
  5763. : "+r"(src_argb), // %0
  5764. "+r"(dst_argb), // %1
  5765. "+r"(width) // %2
  5766. : "r"(shuffler) // %3
  5767. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  5768. }
  5769. #endif // HAS_ARGBSHUFFLEROW_SSSE3
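// Illustrative scalar sketch (not part of libyuv; hypothetical helper name):
// the shuffle rows reorder the 4 bytes of every pixel; the 16-byte pshufb
// table encodes the same per-pixel reorder for each of the 4 pixels in a
// register, so only the first four entries are shown here.  For example a
// per-pixel pattern of {3,2,1,0} reverses the channel order of each pixel.
static inline void ShufflePixel_ScalarSketch(const uint8_t* src_pixel,
                                             const uint8_t* shuffler,
                                             uint8_t* dst_pixel) {
  dst_pixel[0] = src_pixel[shuffler[0]];
  dst_pixel[1] = src_pixel[shuffler[1]];
  dst_pixel[2] = src_pixel[shuffler[2]];
  dst_pixel[3] = src_pixel[shuffler[3]];
}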
  5770. #ifdef HAS_ARGBSHUFFLEROW_AVX2
  5771. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5772. void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
  5773. uint8_t* dst_argb,
  5774. const uint8_t* shuffler,
  5775. int width) {
  5776. asm volatile(
  5777. "vbroadcastf128 (%3),%%ymm5 \n"
  5778. LABELALIGN
  5779. "1: \n"
  5780. "vmovdqu (%0),%%ymm0 \n"
  5781. "vmovdqu 0x20(%0),%%ymm1 \n"
  5782. "lea 0x40(%0),%0 \n"
  5783. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  5784. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  5785. "vmovdqu %%ymm0,(%1) \n"
  5786. "vmovdqu %%ymm1,0x20(%1) \n"
  5787. "lea 0x40(%1),%1 \n"
  5788. "sub $0x10,%2 \n"
  5789. "jg 1b \n"
  5790. "vzeroupper \n"
  5791. : "+r"(src_argb), // %0
  5792. "+r"(dst_argb), // %1
  5793. "+r"(width) // %2
  5794. : "r"(shuffler) // %3
  5795. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  5796. }
  5797. #endif // HAS_ARGBSHUFFLEROW_AVX2
#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "sub %1,%2 \n"
      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqu (%0),%%xmm0 \n"
      "add $0x10,%0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,(%3) \n"
      "movdqu %%xmm1,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_I422TOYUY2ROW_SSE2
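
// Illustrative scalar sketch (hypothetical helper) of the packing above: YUY2
// stores a pair of pixels as Y0, U, Y1, V, with one U/V sample shared by both
// pixels. Odd trailing widths are omitted for brevity.
static void I422ToYUY2Row_ScalarSketch(const uint8_t* src_y,
                                       const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_yuy2,
                                       int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}
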
#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "sub %1,%2 \n"
      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "add $0x10,%0 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,(%3) \n"
      "movdqu %%xmm2,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_I422TOUYVYROW_SSE2
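
// Illustrative scalar sketch (hypothetical helper) of the packing above: UYVY
// stores a pair of pixels as U, Y0, V, Y1, i.e. the same samples as YUY2 but
// with chroma leading. Odd trailing widths are omitted for brevity.
static void I422ToUYVYRow_ScalarSketch(const uint8_t* src_y,
                                       const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_uyvy,
                                       int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uyvy[0] = src_u[0];
    dst_uyvy[1] = src_y[0];
    dst_uyvy[2] = src_v[0];
    dst_uyvy[3] = src_y[1];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_uyvy += 4;
  }
}
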
#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "sub %1,%2 \n"
      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_I422TOYUY2ROW_AVX2
#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "sub %1,%2 \n"
      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_I422TOUYVYROW_AVX2
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "pxor %%xmm3,%%xmm3 \n"
      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm3,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "punpcklwd %%xmm3,%%xmm0 \n"
      "punpckhwd %%xmm3,%%xmm4 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm4,%%xmm4 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "mulps 0x10(%3),%%xmm0 \n"
      "mulps 0x10(%3),%%xmm4 \n"
      "addps (%3),%%xmm0 \n"
      "addps (%3),%%xmm4 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "mulps %%xmm1,%%xmm2 \n"
      "mulps %%xmm5,%%xmm6 \n"
      "mulps %%xmm2,%%xmm1 \n"
      "mulps %%xmm6,%%xmm5 \n"
      "mulps 0x20(%3),%%xmm2 \n"
      "mulps 0x20(%3),%%xmm6 \n"
      "mulps 0x30(%3),%%xmm1 \n"
      "mulps 0x30(%3),%%xmm5 \n"
      "addps %%xmm2,%%xmm0 \n"
      "addps %%xmm6,%%xmm4 \n"
      "addps %%xmm1,%%xmm0 \n"
      "addps %%xmm5,%%xmm4 \n"
      "cvttps2dq %%xmm0,%%xmm0 \n"
      "cvttps2dq %%xmm4,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
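
// Illustrative scalar sketch (hypothetical helper) of the cubic polynomial
// applied above. |poly| holds four coefficient vectors of four floats each
// (C0..C3, one float per channel in B,G,R,A order):
// dst = C0 + C1*v + C2*v^2 + C3*v^3, clamped to a byte. The SIMD paths clamp
// via saturating packs, so extreme out-of-range values may round differently.
static void ARGBPolynomialRow_ScalarSketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           const float* poly,
                                           int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[c];
      float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      if (r < 0.f) r = 0.f;
      if (r > 255.f) r = 255.f;
      dst_argb[c] = (uint8_t)r;  // truncates, like cvttps2dq
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
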
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4 \n"
      "vbroadcastf128 0x10(%3),%%ymm5 \n"
      "vbroadcastf128 0x20(%3),%%ymm6 \n"
      "vbroadcastf128 0x30(%3),%%ymm7 \n"
      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm0 \n"  // 2 ARGB pixels
      "lea 0x8(%0),%0 \n"
      "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
      "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
      "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
      "vcvttps2dq %%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
      "vmovq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_HALFFLOATROW_SSE2
// 2^-112: rebiases a float exponent into the half-float exponent range.
static const float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd %3,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      "sub %0,%1 \n"
      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // 8 shorts
      "add $0x10,%0 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
      "punpckhwd %%xmm5,%%xmm3 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "psrld $0xd,%%xmm2 \n"
      "psrld $0xd,%%xmm3 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,-0x10(%0,%1,1) \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(scale)    // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_HALFFLOATROW_SSE2
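
// Illustrative scalar sketch (hypothetical helper) of the conversion above.
// Multiplying by scale * 2^-112 drops the float exponent bias from 127 to 15,
// so bits [28:13] of the scaled float are exactly the IEEE half-float
// representation for inputs that land in the half-float normal range; the
// extra mantissa bits are truncated, matching the psrld used by the SSE2 and
// AVX2 paths. The _F16C variants further below use the hardware vcvtps2ph
// instruction instead. Assumes memcpy (<string.h>) is available.
static void HalfFloatRow_ScalarSketch(const uint16_t* src,
                                      uint16_t* dst,
                                      float scale,
                                      int width) {
  float mult = scale * 1.9259299444e-34f;  // scale * 2^-112
  int x;
  for (x = 0; x < width; ++x) {
    float f = (float)src[x] * mult;
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));  // reinterpret the float's bit pattern
    dst[x] = (uint16_t)(bits >> 13);  // exponent + mantissa of the half float
  }
}
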
#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // 16 shorts
      "add $0x20,%0 \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vpsrld $0xd,%%ymm3,%%ymm3 \n"
      "vpsrld $0xd,%%ymm2,%%ymm2 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
      "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
#if defined(__x86_64__)
      : "x"(scale)    // %3
#else
      : "m"(scale)    // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
#if defined(__x86_64__)
      : "x"(scale)    // %3
#else
      : "m"(scale)    // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
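
// Illustrative scalar sketch (hypothetical helper) of the in-place lookup
// above: |table_argb| holds 4 bytes per possible value, one byte per channel,
// so channel c of value v maps to table_argb[v * 4 + c]. RGBColorTableRow
// below is identical except that alpha is left untouched.
static void ARGBColorTableRow_ScalarSketch(uint8_t* dst_argb,
                                           const uint8_t* table_argb,
                                           int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
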
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm0 \n"
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"
      "mov %b0,0x3(%3) \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"
      "mov %b0,0x7(%3) \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"
      "mov %b0,0xb(%3) \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
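
// Illustrative scalar sketch (hypothetical helper, not libyuv's own C fallback)
// of the lookup above: a weighted luma sum selects a 256-byte row of |luma|,
// and B, G and R are each remapped through that row while alpha is copied.
// The coefficient in the top byte of |lumacoeff| is assumed to be zero, and
// the 0xFF00 mask mirrors the word mask built with pcmpeqb/psllw above.
static void ARGBLumaColorTableRow_ScalarSketch(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width,
                                               const uint8_t* luma,
                                               uint32_t lumacoeff) {
  const int bc = lumacoeff & 0xff;          // weight of B
  const int gc = (lumacoeff >> 8) & 0xff;   // weight of G
  const int rc = (lumacoeff >> 16) & 0xff;  // weight of R
  int x;
  for (x = 0; x < width; ++x) {
    // Keep only the high byte of the weighted sum, i.e. a multiple of 256.
    int offset = (src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0xFF00;
    const uint8_t* table = luma + offset;
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha is copied unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}
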
#ifdef HAS_NV21TOYUV24ROW_AVX2
// NV21ToYUV24Row_AVX2 constants.
static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};

// NV21ToYUV24Row_AVX2
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  uint8_t* src_y_ptr;
  uint64_t src_offset = 0;
  uint64_t width64;

  width64 = width;
  src_y_ptr = (uint8_t*)src_y;

  asm volatile(
      "vmovdqu %5, %%ymm0 \n"  // init blend value
      "vmovdqu %6, %%ymm1 \n"  // init blend value
      "vmovdqu %7, %%ymm2 \n"  // init blend value
      // "sub $0x20, %3 \n"  // sub 32 from width for final loop
      LABELALIGN
      "1: \n"  // label 1
      "vmovdqu (%0,%4), %%ymm3 \n"  // src_y
      "vmovdqu 1(%1,%4), %%ymm4 \n"  // src_uv+1
      "vmovdqu (%1), %%ymm5 \n"  // src_uv
      "vpshufb %8, %%ymm3, %%ymm13 \n"  // y, kSHUF0 for shuf
      "vpshufb %9, %%ymm4, %%ymm14 \n"  // uv+1, kSHUF1 for shuf
      "vpshufb %10, %%ymm5, %%ymm15 \n"  // uv, kSHUF2 for shuf
      "vpshufb %11, %%ymm3, %%ymm3 \n"  // y, kSHUF3 for shuf
      "vpshufb %12, %%ymm4, %%ymm4 \n"  // uv+1, kSHUF4 for shuf
      "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n"  // blend 0
      "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n"  // blend 0
      "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n"  // blend 2
      "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n"  // blend 1
      "vpshufb %13, %%ymm5, %%ymm15 \n"  // shuffle const
      "vpor %%ymm4, %%ymm3, %%ymm5 \n"  // get results
      "vmovdqu %%ymm12, 0x20(%2) \n"  // store dst_yuv+20h
      "vpor %%ymm15, %%ymm5, %%ymm3 \n"  // get results
      "add $0x20, %4 \n"  // add to src buffer ptr
      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n"  // insert
      "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n"  // insert
      "vmovdqu %%ymm4, (%2) \n"  // store dst_yuv
      "vmovdqu %%ymm5, 0x40(%2) \n"  // store dst_yuv+40h
      "add $0x60,%2 \n"  // add to dst buffer ptr
      // "cmp %3, %4 \n"  // (width64 - 32 bytes) and src_offset
      "sub $0x20,%3 \n"  // 32 pixels per loop
      "jg 1b \n"
      "vzeroupper \n"  // sse-avx2 transitions
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_yuv24),  // %2
        "+r"(width64),    // %3
        "+r"(src_offset)  // %4
      : "m"(kBLEND0),     // %5
        "m"(kBLEND1),     // %6
        "m"(kBLEND2),     // %7
        "m"(kSHUF0),      // %8
        "m"(kSHUF1),      // %9
        "m"(kSHUF2),      // %10
        "m"(kSHUF3),      // %11
        "m"(kSHUF4),      // %12
        "m"(kSHUF5)       // %13
      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
        "xmm13", "xmm14", "xmm15");
}
#endif // HAS_NV21TOYUV24ROW_AVX2
#ifdef HAS_SWAPUVROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(
      "movdqu %3,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_vu),          // %1
        "+r"(width)            // %2
      : "m"(kShuffleUVToVU)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_SSSE3
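
// Illustrative scalar sketch (hypothetical helper) of the byte swap performed
// by SwapUVRow_SSSE3 above and the AVX2 variant below: each U,V pair of the
// NV12 chroma plane is written back as V,U for NV21. |width| counts UV pairs.
static void SwapUVRow_ScalarSketch(const uint8_t* src_uv,
                                   uint8_t* dst_vu,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_vu[0] = src_uv[1];
    dst_vu[1] = src_uv[0];
    src_uv += 2;
    dst_vu += 2;
  }
}
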
#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_vu),          // %1
        "+r"(width)            // %2
      : "m"(kShuffleUVToVU)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_AVX2

#endif // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif