row_gcc.cc

/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};
// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};
static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};
static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};
static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};
static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};
static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
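
// Illustrative scalar model of how the constants above are meant to combine
// (a sketch for documentation only; it is not part of libyuv, and the real C
// fallback in row_common.cc may round differently). ARGB is stored in memory
// as B, G, R, A, matching the {13, 65, 33, 0} coefficient order of kARGBToY;
// the products are summed in 7-bit fixed point and biased by kAddY16 to give
// a limited-range Y. The U/V constants play the same role for chroma, with
// kAddUV128 providing the +128 bias.
static inline uint8_t ARGBPixelToY_Sketch(const uint8_t* argb /* B,G,R,A */) {
  const int b = argb[0];
  const int g = argb[1];
  const int r = argb[2];
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}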

#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
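
// Illustrative scalar model of how the shuffle tables above are consumed by
// pshufb (a sketch for documentation only, not part of libyuv): each output
// byte i is taken from src[mask[i] & 15], or forced to zero when the high bit
// of mask[i] is set (the 128u entries). kShuffleMaskRGB24ToARGB, for example,
// spreads twelve packed RGB24 bytes into four ARGB pixels whose alpha bytes
// are then OR'ed to 0xff by the row functions below.
static inline void PshufbModel_Sketch(const uint8_t* src,
                                      const uint8_t* mask,
                                      uint8_t* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}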

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"
      "punpckhwd %%xmm1,%%xmm1 \n"
      "por %%xmm5,%%xmm0 \n"
      "por %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm3 \n"
      "movdqa %4,%%xmm4 \n"
      "movdqa %5,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x4(%0),%%xmm1 \n"
      "movdqu 0x8(%0),%%xmm2 \n"
      "lea 0x18(%0),%0 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x20802080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xa,%%xmm4 \n"
      "psrlw $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "por %%xmm7,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x42004200,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "movdqa %%xmm3,%%xmm4 \n"
      "psrlw $0x6,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "psllw $0x1,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "pand %%xmm7,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0xf0f0f0f,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x4,%%xmm5 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pand %%xmm5,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "psllw $0x4,%%xmm1 \n"
      "psrlw $0x4,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,0x00(%1,%0,2) \n"
      "movdqu %%xmm1,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24
static const ulvec8 kPermARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa %3,%%ymm5 \n"
      "vmovdqa %4,%%ymm6 \n"
      "vmovdqa %5,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
      "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"
      "vpslld $0xb,%%ymm3,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1b,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x5,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pslld $0xa,%%xmm6 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "pslld $0xf,%%xmm7 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "psrad $0x10,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x6,%%xmm2 \n"
      "psrld $0x9,%%xmm3 \n"
      "pand %%xmm7,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "pand %%xmm5,%%xmm2 \n"
      "pand %%xmm6,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xc,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm3 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm3,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "psrlq $0x4,%%xmm0 \n"
      "psrlq $0x8,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*
ARGBToAR30Row:

Red / Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what
is wanted for the blue channel. The red needs to be shifted 4 left, so
multiply by (1024+4)*16 for red.

Alpha / Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha
would be a simple multiplier to shift it into position. It wants a gap of 10
above the green. Green is 10 bits, so there are 6 bits in the low short. 4
more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
result left 10 to position the A and G channels.
*/
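
// Illustrative scalar model of the packing described above (a sketch for
// documentation only, not part of libyuv): each 8-bit channel is widened to
// 10 bits with the same *1028 scaling that pmulhuw applies to the
// byte-in-high-half lanes, the top 2 bits of alpha are kept, and the result
// is packed as A2 R10 G10 B10.
static inline uint32_t ARGBPixelToAR30_Sketch(
    const uint8_t* argb /* B,G,R,A */) {
  // ((v << 8) * 1028) >> 16 == (v * 1028) >> 8, mapping 0..255 onto 0..1023.
  const uint32_t b10 = ((uint32_t)argb[0] * 1028) >> 8;
  const uint32_t g10 = ((uint32_t)argb[1] * 1028) >> 8;
  const uint32_t r10 = ((uint32_t)argb[2] * 1028) >> 8;
  const uint32_t a2 = argb[3] >> 6;  // keep the 2 most significant alpha bits
  return b10 | (g10 << 10) | (r10 << 20) | (a2 << 30);
}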

// Shuffle tables for AR30: place the B and R bytes of each pixel in the high
// byte of a 16-bit lane, with the low byte zeroed, for the pmulhuw expansion
// below. kShuffleBR30 is the swapped variant used for ABGR input.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"    // multiplier for RB
      "movd %5,%%xmm4 \n"    // mask for R10 B10
      "movd %6,%%xmm5 \n"    // mask for AG
      "movd %7,%%xmm6 \n"    // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"
      "1: \n"
      "movdqu (%0),%%xmm0 \n"     // fetch 4 ARGB pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"   // R0B0
      "pand %%xmm5,%%xmm0 \n"     // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"     // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"       // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"      // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"    // multiplier for RB
      "movd %5,%%xmm4 \n"    // mask for R10 B10
      "movd %6,%%xmm5 \n"    // mask for AG
      "movd %7,%%xmm6 \n"    // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"
      "1: \n"
      "movdqu (%0),%%xmm0 \n"     // fetch 4 ABGR pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"   // R0B0
      "pand %%xmm5,%%xmm0 \n"     // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"     // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"       // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"      // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
  873. #ifdef HAS_ARGBTOAR30ROW_AVX2
  874. void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  875. asm volatile(
  876. "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
  877. "vbroadcastss %4,%%ymm3 \n" // multipler for RB
  878. "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
  879. "vbroadcastss %6,%%ymm5 \n" // mask for AG
  880. "vbroadcastss %7,%%ymm6 \n" // multipler for AG
  881. "sub %0,%1 \n"
  882. "1: \n"
  883. "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
  884. "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
  885. "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
  886. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
  887. "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
  888. "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
  889. "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
  890. "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
  891. "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
  892. "add $0x20,%0 \n"
  893. "sub $0x8,%2 \n"
  894. "jg 1b \n"
  895. "vzeroupper \n"
  896. : "+r"(src), // %0
  897. "+r"(dst), // %1
  898. "+r"(width) // %2
  899. : "m"(kShuffleRB30), // %3
  900. "m"(kMulRB10), // %4
  901. "m"(kMaskRB10), // %5
  902. "m"(kMaskAG10), // %6
  903. "m"(kMulAG10) // %7
  904. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  905. }
  906. #endif
  907. #ifdef HAS_ABGRTOAR30ROW_AVX2
  908. void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  909. asm volatile(
  910. "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
  911. "vbroadcastss %4,%%ymm3 \n" // multipler for RB
  912. "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
  913. "vbroadcastss %6,%%ymm5 \n" // mask for AG
  914. "vbroadcastss %7,%%ymm6 \n" // multipler for AG
  915. "sub %0,%1 \n"
  916. "1: \n"
  917. "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
  918. "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
  919. "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
  920. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
  921. "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
  922. "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
  923. "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
  924. "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
  925. "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
  926. "add $0x20,%0 \n"
  927. "sub $0x8,%2 \n"
  928. "jg 1b \n"
  929. "vzeroupper \n"
  930. : "+r"(src), // %0
  931. "+r"(dst), // %1
  932. "+r"(width) // %2
  933. : "m"(kShuffleBR30), // %3 reversed shuffler
  934. "m"(kMulRB10), // %4
  935. "m"(kMaskRB10), // %5
  936. "m"(kMaskAG10), // %6
  937. "m"(kMulAG10) // %7
  938. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  939. }
  940. #endif
  941. #ifdef HAS_ARGBTOYROW_SSSE3
  942. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  943. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  944. asm volatile(
  945. "movdqa %3,%%xmm4 \n"
  946. "movdqa %4,%%xmm5 \n"
  947. LABELALIGN
  948. "1: \n"
  949. "movdqu (%0),%%xmm0 \n"
  950. "movdqu 0x10(%0),%%xmm1 \n"
  951. "movdqu 0x20(%0),%%xmm2 \n"
  952. "movdqu 0x30(%0),%%xmm3 \n"
  953. "pmaddubsw %%xmm4,%%xmm0 \n"
  954. "pmaddubsw %%xmm4,%%xmm1 \n"
  955. "pmaddubsw %%xmm4,%%xmm2 \n"
  956. "pmaddubsw %%xmm4,%%xmm3 \n"
  957. "lea 0x40(%0),%0 \n"
  958. "phaddw %%xmm1,%%xmm0 \n"
  959. "phaddw %%xmm3,%%xmm2 \n"
  960. "psrlw $0x7,%%xmm0 \n"
  961. "psrlw $0x7,%%xmm2 \n"
  962. "packuswb %%xmm2,%%xmm0 \n"
  963. "paddb %%xmm5,%%xmm0 \n"
  964. "movdqu %%xmm0,(%1) \n"
  965. "lea 0x10(%1),%1 \n"
  966. "sub $0x10,%2 \n"
  967. "jg 1b \n"
  968. : "+r"(src_argb), // %0
  969. "+r"(dst_y), // %1
  970. "+r"(width) // %2
  971. : "m"(kARGBToY), // %3
  972. "m"(kAddY16) // %4
  973. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  974. }
  975. #endif // HAS_ARGBTOYROW_SSSE3
  976. #ifdef HAS_ARGBTOYJROW_SSSE3
  977. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
  978. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
  979. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  980. asm volatile(
  981. "movdqa %3,%%xmm4 \n"
  982. "movdqa %4,%%xmm5 \n"
  983. LABELALIGN
  984. "1: \n"
  985. "movdqu (%0),%%xmm0 \n"
  986. "movdqu 0x10(%0),%%xmm1 \n"
  987. "movdqu 0x20(%0),%%xmm2 \n"
  988. "movdqu 0x30(%0),%%xmm3 \n"
  989. "pmaddubsw %%xmm4,%%xmm0 \n"
  990. "pmaddubsw %%xmm4,%%xmm1 \n"
  991. "pmaddubsw %%xmm4,%%xmm2 \n"
  992. "pmaddubsw %%xmm4,%%xmm3 \n"
  993. "lea 0x40(%0),%0 \n"
  994. "phaddw %%xmm1,%%xmm0 \n"
  995. "phaddw %%xmm3,%%xmm2 \n"
  996. "paddw %%xmm5,%%xmm0 \n"
  997. "paddw %%xmm5,%%xmm2 \n"
  998. "psrlw $0x7,%%xmm0 \n"
  999. "psrlw $0x7,%%xmm2 \n"
  1000. "packuswb %%xmm2,%%xmm0 \n"
  1001. "movdqu %%xmm0,(%1) \n"
  1002. "lea 0x10(%1),%1 \n"
  1003. "sub $0x10,%2 \n"
  1004. "jg 1b \n"
  1005. : "+r"(src_argb), // %0
  1006. "+r"(dst_y), // %1
  1007. "+r"(width) // %2
  1008. : "m"(kARGBToYJ), // %3
  1009. "m"(kAddYJ64) // %4
  1010. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1011. }
  1012. #endif // HAS_ARGBTOYJROW_SSSE3
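// For clarity, a scalar sketch of the rounding described above (a hypothetical
// helper, not part of this file): the coefficients are 7 bit fixed point, so
// the paddw of kAddYJ64 before psrlw $7 rounds to nearest instead of
// truncating, and no +16 offset is applied because YJ is full range. The
// coefficient values shown are assumptions for illustration.
static inline uint8_t RGBToYJ_C(uint8_t r, uint8_t g, uint8_t b) {
  // BT.601 luma weights scaled to sum to 128 (assumed values).
  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
}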
  1013. #ifdef HAS_ARGBTOYROW_AVX2
  1014. // vpermd permutation to restore lane order after vphaddw + vpackuswb.
  1015. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
  1016. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1017. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1018. asm volatile(
  1019. "vbroadcastf128 %3,%%ymm4 \n"
  1020. "vbroadcastf128 %4,%%ymm5 \n"
  1021. "vmovdqu %5,%%ymm6 \n"
  1022. LABELALIGN
  1023. "1: \n"
  1024. "vmovdqu (%0),%%ymm0 \n"
  1025. "vmovdqu 0x20(%0),%%ymm1 \n"
  1026. "vmovdqu 0x40(%0),%%ymm2 \n"
  1027. "vmovdqu 0x60(%0),%%ymm3 \n"
  1028. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  1029. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  1030. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  1031. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  1032. "lea 0x80(%0),%0 \n"
  1033. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  1034. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  1035. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  1036. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  1037. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  1038. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  1039. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
  1040. "vmovdqu %%ymm0,(%1) \n"
  1041. "lea 0x20(%1),%1 \n"
  1042. "sub $0x20,%2 \n"
  1043. "jg 1b \n"
  1044. "vzeroupper \n"
  1045. : "+r"(src_argb), // %0
  1046. "+r"(dst_y), // %1
  1047. "+r"(width) // %2
  1048. : "m"(kARGBToY), // %3
  1049. "m"(kAddY16), // %4
  1050. "m"(kPermdARGBToY_AVX) // %5
  1051. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1052. }
  1053. #endif // HAS_ARGBTOYROW_AVX2
  1054. #ifdef HAS_ARGBTOYJROW_AVX2
  1055. // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
  1056. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1057. asm volatile(
  1058. "vbroadcastf128 %3,%%ymm4 \n"
  1059. "vbroadcastf128 %4,%%ymm5 \n"
  1060. "vmovdqu %5,%%ymm6 \n"
  1061. LABELALIGN
  1062. "1: \n"
  1063. "vmovdqu (%0),%%ymm0 \n"
  1064. "vmovdqu 0x20(%0),%%ymm1 \n"
  1065. "vmovdqu 0x40(%0),%%ymm2 \n"
  1066. "vmovdqu 0x60(%0),%%ymm3 \n"
  1067. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  1068. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  1069. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  1070. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  1071. "lea 0x80(%0),%0 \n"
  1072. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  1073. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  1074. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
  1075. "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
  1076. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  1077. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  1078. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  1079. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  1080. "vmovdqu %%ymm0,(%1) \n"
  1081. "lea 0x20(%1),%1 \n"
  1082. "sub $0x20,%2 \n"
  1083. "jg 1b \n"
  1084. "vzeroupper \n"
  1085. : "+r"(src_argb), // %0
  1086. "+r"(dst_y), // %1
  1087. "+r"(width) // %2
  1088. : "m"(kARGBToYJ), // %3
  1089. "m"(kAddYJ64), // %4
  1090. "m"(kPermdARGBToY_AVX) // %5
  1091. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1092. }
  1093. #endif // HAS_ARGBTOYJROW_AVX2
  1094. #ifdef HAS_ARGBTOUVROW_SSSE3
  1095. void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
  1096. int src_stride_argb,
  1097. uint8_t* dst_u,
  1098. uint8_t* dst_v,
  1099. int width) {
  1100. asm volatile(
  1101. "movdqa %5,%%xmm3 \n"
  1102. "movdqa %6,%%xmm4 \n"
  1103. "movdqa %7,%%xmm5 \n"
  1104. "sub %1,%2 \n"
  1105. LABELALIGN
  1106. "1: \n"
  1107. "movdqu (%0),%%xmm0 \n"
  1108. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1109. "pavgb %%xmm7,%%xmm0 \n"
  1110. "movdqu 0x10(%0),%%xmm1 \n"
  1111. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1112. "pavgb %%xmm7,%%xmm1 \n"
  1113. "movdqu 0x20(%0),%%xmm2 \n"
  1114. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1115. "pavgb %%xmm7,%%xmm2 \n"
  1116. "movdqu 0x30(%0),%%xmm6 \n"
  1117. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1118. "pavgb %%xmm7,%%xmm6 \n"
  1119. "lea 0x40(%0),%0 \n"
  1120. "movdqa %%xmm0,%%xmm7 \n"
  1121. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1122. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1123. "pavgb %%xmm7,%%xmm0 \n"
  1124. "movdqa %%xmm2,%%xmm7 \n"
  1125. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1126. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1127. "pavgb %%xmm7,%%xmm2 \n"
  1128. "movdqa %%xmm0,%%xmm1 \n"
  1129. "movdqa %%xmm2,%%xmm6 \n"
  1130. "pmaddubsw %%xmm4,%%xmm0 \n"
  1131. "pmaddubsw %%xmm4,%%xmm2 \n"
  1132. "pmaddubsw %%xmm3,%%xmm1 \n"
  1133. "pmaddubsw %%xmm3,%%xmm6 \n"
  1134. "phaddw %%xmm2,%%xmm0 \n"
  1135. "phaddw %%xmm6,%%xmm1 \n"
  1136. "psraw $0x8,%%xmm0 \n"
  1137. "psraw $0x8,%%xmm1 \n"
  1138. "packsswb %%xmm1,%%xmm0 \n"
  1139. "paddb %%xmm5,%%xmm0 \n"
  1140. "movlps %%xmm0,(%1) \n"
  1141. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1142. "lea 0x8(%1),%1 \n"
  1143. "sub $0x10,%3 \n"
  1144. "jg 1b \n"
  1145. : "+r"(src_argb0), // %0
  1146. "+r"(dst_u), // %1
  1147. "+r"(dst_v), // %2
  1148. "+rm"(width) // %3
  1149. : "r"((intptr_t)(src_stride_argb)), // %4
  1150. "m"(kARGBToV), // %5
  1151. "m"(kARGBToU), // %6
  1152. "m"(kAddUV128) // %7
  1153. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1154. }
  1155. #endif // HAS_ARGBTOUVROW_SSSE3
  1156. #ifdef HAS_ARGBTOUVROW_AVX2
  1157. // vpshufb for vphaddw + vpackuswb packed to shorts.
  1158. static const lvec8 kShufARGBToUV_AVX = {
  1159. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  1160. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
  1161. void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
  1162. int src_stride_argb,
  1163. uint8_t* dst_u,
  1164. uint8_t* dst_v,
  1165. int width) {
  1166. asm volatile(
  1167. "vbroadcastf128 %5,%%ymm5 \n"
  1168. "vbroadcastf128 %6,%%ymm6 \n"
  1169. "vbroadcastf128 %7,%%ymm7 \n"
  1170. "sub %1,%2 \n"
  1171. LABELALIGN
  1172. "1: \n"
  1173. "vmovdqu (%0),%%ymm0 \n"
  1174. "vmovdqu 0x20(%0),%%ymm1 \n"
  1175. "vmovdqu 0x40(%0),%%ymm2 \n"
  1176. "vmovdqu 0x60(%0),%%ymm3 \n"
  1177. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1178. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1179. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1180. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1181. "lea 0x80(%0),%0 \n"
  1182. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1183. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1184. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1185. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1186. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1187. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1188. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1189. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1190. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1191. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1192. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1193. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1194. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1195. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1196. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1197. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1198. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1199. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  1200. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1201. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1202. "lea 0x10(%1),%1 \n"
  1203. "sub $0x20,%3 \n"
  1204. "jg 1b \n"
  1205. "vzeroupper \n"
  1206. : "+r"(src_argb0), // %0
  1207. "+r"(dst_u), // %1
  1208. "+r"(dst_v), // %2
  1209. "+rm"(width) // %3
  1210. : "r"((intptr_t)(src_stride_argb)), // %4
  1211. "m"(kAddUV128), // %5
  1212. "m"(kARGBToV), // %6
  1213. "m"(kARGBToU), // %7
  1214. "m"(kShufARGBToUV_AVX) // %8
  1215. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1216. "xmm7");
  1217. }
  1218. #endif // HAS_ARGBTOUVROW_AVX2
  1219. #ifdef HAS_ARGBTOUVJROW_AVX2
  1220. void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
  1221. int src_stride_argb,
  1222. uint8_t* dst_u,
  1223. uint8_t* dst_v,
  1224. int width) {
  1225. asm volatile(
  1226. "vbroadcastf128 %5,%%ymm5 \n"
  1227. "vbroadcastf128 %6,%%ymm6 \n"
  1228. "vbroadcastf128 %7,%%ymm7 \n"
  1229. "sub %1,%2 \n"
  1230. LABELALIGN
  1231. "1: \n"
  1232. "vmovdqu (%0),%%ymm0 \n"
  1233. "vmovdqu 0x20(%0),%%ymm1 \n"
  1234. "vmovdqu 0x40(%0),%%ymm2 \n"
  1235. "vmovdqu 0x60(%0),%%ymm3 \n"
  1236. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1237. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1238. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1239. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1240. "lea 0x80(%0),%0 \n"
  1241. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1242. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1243. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1244. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1245. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1246. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1247. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1248. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1249. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1250. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1251. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1252. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1253. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
  1254. "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
  1255. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1256. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1257. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1258. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1259. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1260. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1261. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1262. "lea 0x10(%1),%1 \n"
  1263. "sub $0x20,%3 \n"
  1264. "jg 1b \n"
  1265. "vzeroupper \n"
  1266. : "+r"(src_argb0), // %0
  1267. "+r"(dst_u), // %1
  1268. "+r"(dst_v), // %2
  1269. "+rm"(width) // %3
  1270. : "r"((intptr_t)(src_stride_argb)), // %4
  1271. "m"(kAddUVJ128), // %5
  1272. "m"(kARGBToVJ), // %6
  1273. "m"(kARGBToUJ), // %7
  1274. "m"(kShufARGBToUV_AVX) // %8
  1275. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1276. "xmm7");
  1277. }
  1278. #endif // HAS_ARGBTOUVJROW_AVX2
  1279. #ifdef HAS_ARGBTOUVJROW_SSSE3
  1280. void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
  1281. int src_stride_argb,
  1282. uint8_t* dst_u,
  1283. uint8_t* dst_v,
  1284. int width) {
  1285. asm volatile(
  1286. "movdqa %5,%%xmm3 \n"
  1287. "movdqa %6,%%xmm4 \n"
  1288. "movdqa %7,%%xmm5 \n"
  1289. "sub %1,%2 \n"
  1290. LABELALIGN
  1291. "1: \n"
  1292. "movdqu (%0),%%xmm0 \n"
  1293. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1294. "pavgb %%xmm7,%%xmm0 \n"
  1295. "movdqu 0x10(%0),%%xmm1 \n"
  1296. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1297. "pavgb %%xmm7,%%xmm1 \n"
  1298. "movdqu 0x20(%0),%%xmm2 \n"
  1299. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1300. "pavgb %%xmm7,%%xmm2 \n"
  1301. "movdqu 0x30(%0),%%xmm6 \n"
  1302. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1303. "pavgb %%xmm7,%%xmm6 \n"
  1304. "lea 0x40(%0),%0 \n"
  1305. "movdqa %%xmm0,%%xmm7 \n"
  1306. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1307. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1308. "pavgb %%xmm7,%%xmm0 \n"
  1309. "movdqa %%xmm2,%%xmm7 \n"
  1310. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1311. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1312. "pavgb %%xmm7,%%xmm2 \n"
  1313. "movdqa %%xmm0,%%xmm1 \n"
  1314. "movdqa %%xmm2,%%xmm6 \n"
  1315. "pmaddubsw %%xmm4,%%xmm0 \n"
  1316. "pmaddubsw %%xmm4,%%xmm2 \n"
  1317. "pmaddubsw %%xmm3,%%xmm1 \n"
  1318. "pmaddubsw %%xmm3,%%xmm6 \n"
  1319. "phaddw %%xmm2,%%xmm0 \n"
  1320. "phaddw %%xmm6,%%xmm1 \n"
  1321. "paddw %%xmm5,%%xmm0 \n"
  1322. "paddw %%xmm5,%%xmm1 \n"
  1323. "psraw $0x8,%%xmm0 \n"
  1324. "psraw $0x8,%%xmm1 \n"
  1325. "packsswb %%xmm1,%%xmm0 \n"
  1326. "movlps %%xmm0,(%1) \n"
  1327. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1328. "lea 0x8(%1),%1 \n"
  1329. "sub $0x10,%3 \n"
  1330. "jg 1b \n"
  1331. : "+r"(src_argb0), // %0
  1332. "+r"(dst_u), // %1
  1333. "+r"(dst_v), // %2
  1334. "+rm"(width) // %3
  1335. : "r"((intptr_t)(src_stride_argb)), // %4
  1336. "m"(kARGBToVJ), // %5
  1337. "m"(kARGBToUJ), // %6
  1338. "m"(kAddUVJ128) // %7
  1339. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1340. }
  1341. #endif // HAS_ARGBTOUVJROW_SSSE3
  1342. #ifdef HAS_ARGBTOUV444ROW_SSSE3
  1343. void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
  1344. uint8_t* dst_u,
  1345. uint8_t* dst_v,
  1346. int width) {
  1347. asm volatile(
  1348. "movdqa %4,%%xmm3 \n"
  1349. "movdqa %5,%%xmm4 \n"
  1350. "movdqa %6,%%xmm5 \n"
  1351. "sub %1,%2 \n"
  1352. LABELALIGN
  1353. "1: \n"
  1354. "movdqu (%0),%%xmm0 \n"
  1355. "movdqu 0x10(%0),%%xmm1 \n"
  1356. "movdqu 0x20(%0),%%xmm2 \n"
  1357. "movdqu 0x30(%0),%%xmm6 \n"
  1358. "pmaddubsw %%xmm4,%%xmm0 \n"
  1359. "pmaddubsw %%xmm4,%%xmm1 \n"
  1360. "pmaddubsw %%xmm4,%%xmm2 \n"
  1361. "pmaddubsw %%xmm4,%%xmm6 \n"
  1362. "phaddw %%xmm1,%%xmm0 \n"
  1363. "phaddw %%xmm6,%%xmm2 \n"
  1364. "psraw $0x8,%%xmm0 \n"
  1365. "psraw $0x8,%%xmm2 \n"
  1366. "packsswb %%xmm2,%%xmm0 \n"
  1367. "paddb %%xmm5,%%xmm0 \n"
  1368. "movdqu %%xmm0,(%1) \n"
  1369. "movdqu (%0),%%xmm0 \n"
  1370. "movdqu 0x10(%0),%%xmm1 \n"
  1371. "movdqu 0x20(%0),%%xmm2 \n"
  1372. "movdqu 0x30(%0),%%xmm6 \n"
  1373. "pmaddubsw %%xmm3,%%xmm0 \n"
  1374. "pmaddubsw %%xmm3,%%xmm1 \n"
  1375. "pmaddubsw %%xmm3,%%xmm2 \n"
  1376. "pmaddubsw %%xmm3,%%xmm6 \n"
  1377. "phaddw %%xmm1,%%xmm0 \n"
  1378. "phaddw %%xmm6,%%xmm2 \n"
  1379. "psraw $0x8,%%xmm0 \n"
  1380. "psraw $0x8,%%xmm2 \n"
  1381. "packsswb %%xmm2,%%xmm0 \n"
  1382. "paddb %%xmm5,%%xmm0 \n"
  1383. "lea 0x40(%0),%0 \n"
  1384. "movdqu %%xmm0,0x00(%1,%2,1) \n"
  1385. "lea 0x10(%1),%1 \n"
  1386. "sub $0x10,%3 \n"
  1387. "jg 1b \n"
  1388. : "+r"(src_argb), // %0
  1389. "+r"(dst_u), // %1
  1390. "+r"(dst_v), // %2
  1391. "+rm"(width) // %3
  1392. : "m"(kARGBToV), // %4
  1393. "m"(kARGBToU), // %5
  1394. "m"(kAddUV128) // %6
  1395. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
  1396. }
  1397. #endif // HAS_ARGBTOUV444ROW_SSSE3
  1398. void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  1399. asm volatile(
  1400. "movdqa %4,%%xmm5 \n"
  1401. "movdqa %3,%%xmm4 \n"
  1402. LABELALIGN
  1403. "1: \n"
  1404. "movdqu (%0),%%xmm0 \n"
  1405. "movdqu 0x10(%0),%%xmm1 \n"
  1406. "movdqu 0x20(%0),%%xmm2 \n"
  1407. "movdqu 0x30(%0),%%xmm3 \n"
  1408. "pmaddubsw %%xmm4,%%xmm0 \n"
  1409. "pmaddubsw %%xmm4,%%xmm1 \n"
  1410. "pmaddubsw %%xmm4,%%xmm2 \n"
  1411. "pmaddubsw %%xmm4,%%xmm3 \n"
  1412. "lea 0x40(%0),%0 \n"
  1413. "phaddw %%xmm1,%%xmm0 \n"
  1414. "phaddw %%xmm3,%%xmm2 \n"
  1415. "psrlw $0x7,%%xmm0 \n"
  1416. "psrlw $0x7,%%xmm2 \n"
  1417. "packuswb %%xmm2,%%xmm0 \n"
  1418. "paddb %%xmm5,%%xmm0 \n"
  1419. "movdqu %%xmm0,(%1) \n"
  1420. "lea 0x10(%1),%1 \n"
  1421. "sub $0x10,%2 \n"
  1422. "jg 1b \n"
  1423. : "+r"(src_bgra), // %0
  1424. "+r"(dst_y), // %1
  1425. "+r"(width) // %2
  1426. : "m"(kBGRAToY), // %3
  1427. "m"(kAddY16) // %4
  1428. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1429. }
  1430. void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
  1431. int src_stride_bgra,
  1432. uint8_t* dst_u,
  1433. uint8_t* dst_v,
  1434. int width) {
  1435. asm volatile(
  1436. "movdqa %5,%%xmm3 \n"
  1437. "movdqa %6,%%xmm4 \n"
  1438. "movdqa %7,%%xmm5 \n"
  1439. "sub %1,%2 \n"
  1440. LABELALIGN
  1441. "1: \n"
  1442. "movdqu (%0),%%xmm0 \n"
  1443. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1444. "pavgb %%xmm7,%%xmm0 \n"
  1445. "movdqu 0x10(%0),%%xmm1 \n"
  1446. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1447. "pavgb %%xmm7,%%xmm1 \n"
  1448. "movdqu 0x20(%0),%%xmm2 \n"
  1449. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1450. "pavgb %%xmm7,%%xmm2 \n"
  1451. "movdqu 0x30(%0),%%xmm6 \n"
  1452. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1453. "pavgb %%xmm7,%%xmm6 \n"
  1454. "lea 0x40(%0),%0 \n"
  1455. "movdqa %%xmm0,%%xmm7 \n"
  1456. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1457. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1458. "pavgb %%xmm7,%%xmm0 \n"
  1459. "movdqa %%xmm2,%%xmm7 \n"
  1460. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1461. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1462. "pavgb %%xmm7,%%xmm2 \n"
  1463. "movdqa %%xmm0,%%xmm1 \n"
  1464. "movdqa %%xmm2,%%xmm6 \n"
  1465. "pmaddubsw %%xmm4,%%xmm0 \n"
  1466. "pmaddubsw %%xmm4,%%xmm2 \n"
  1467. "pmaddubsw %%xmm3,%%xmm1 \n"
  1468. "pmaddubsw %%xmm3,%%xmm6 \n"
  1469. "phaddw %%xmm2,%%xmm0 \n"
  1470. "phaddw %%xmm6,%%xmm1 \n"
  1471. "psraw $0x8,%%xmm0 \n"
  1472. "psraw $0x8,%%xmm1 \n"
  1473. "packsswb %%xmm1,%%xmm0 \n"
  1474. "paddb %%xmm5,%%xmm0 \n"
  1475. "movlps %%xmm0,(%1) \n"
  1476. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1477. "lea 0x8(%1),%1 \n"
  1478. "sub $0x10,%3 \n"
  1479. "jg 1b \n"
  1480. : "+r"(src_bgra0), // %0
  1481. "+r"(dst_u), // %1
  1482. "+r"(dst_v), // %2
  1483. "+rm"(width) // %3
  1484. : "r"((intptr_t)(src_stride_bgra)), // %4
  1485. "m"(kBGRAToV), // %5
  1486. "m"(kBGRAToU), // %6
  1487. "m"(kAddUV128) // %7
  1488. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1489. }
  1490. void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  1491. asm volatile(
  1492. "movdqa %4,%%xmm5 \n"
  1493. "movdqa %3,%%xmm4 \n"
  1494. LABELALIGN
  1495. "1: \n"
  1496. "movdqu (%0),%%xmm0 \n"
  1497. "movdqu 0x10(%0),%%xmm1 \n"
  1498. "movdqu 0x20(%0),%%xmm2 \n"
  1499. "movdqu 0x30(%0),%%xmm3 \n"
  1500. "pmaddubsw %%xmm4,%%xmm0 \n"
  1501. "pmaddubsw %%xmm4,%%xmm1 \n"
  1502. "pmaddubsw %%xmm4,%%xmm2 \n"
  1503. "pmaddubsw %%xmm4,%%xmm3 \n"
  1504. "lea 0x40(%0),%0 \n"
  1505. "phaddw %%xmm1,%%xmm0 \n"
  1506. "phaddw %%xmm3,%%xmm2 \n"
  1507. "psrlw $0x7,%%xmm0 \n"
  1508. "psrlw $0x7,%%xmm2 \n"
  1509. "packuswb %%xmm2,%%xmm0 \n"
  1510. "paddb %%xmm5,%%xmm0 \n"
  1511. "movdqu %%xmm0,(%1) \n"
  1512. "lea 0x10(%1),%1 \n"
  1513. "sub $0x10,%2 \n"
  1514. "jg 1b \n"
  1515. : "+r"(src_abgr), // %0
  1516. "+r"(dst_y), // %1
  1517. "+r"(width) // %2
  1518. : "m"(kABGRToY), // %3
  1519. "m"(kAddY16) // %4
  1520. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1521. }
  1522. void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  1523. asm volatile(
  1524. "movdqa %4,%%xmm5 \n"
  1525. "movdqa %3,%%xmm4 \n"
  1526. LABELALIGN
  1527. "1: \n"
  1528. "movdqu (%0),%%xmm0 \n"
  1529. "movdqu 0x10(%0),%%xmm1 \n"
  1530. "movdqu 0x20(%0),%%xmm2 \n"
  1531. "movdqu 0x30(%0),%%xmm3 \n"
  1532. "pmaddubsw %%xmm4,%%xmm0 \n"
  1533. "pmaddubsw %%xmm4,%%xmm1 \n"
  1534. "pmaddubsw %%xmm4,%%xmm2 \n"
  1535. "pmaddubsw %%xmm4,%%xmm3 \n"
  1536. "lea 0x40(%0),%0 \n"
  1537. "phaddw %%xmm1,%%xmm0 \n"
  1538. "phaddw %%xmm3,%%xmm2 \n"
  1539. "psrlw $0x7,%%xmm0 \n"
  1540. "psrlw $0x7,%%xmm2 \n"
  1541. "packuswb %%xmm2,%%xmm0 \n"
  1542. "paddb %%xmm5,%%xmm0 \n"
  1543. "movdqu %%xmm0,(%1) \n"
  1544. "lea 0x10(%1),%1 \n"
  1545. "sub $0x10,%2 \n"
  1546. "jg 1b \n"
  1547. : "+r"(src_rgba), // %0
  1548. "+r"(dst_y), // %1
  1549. "+r"(width) // %2
  1550. : "m"(kRGBAToY), // %3
  1551. "m"(kAddY16) // %4
  1552. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  1553. }
  1554. void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
  1555. int src_stride_abgr,
  1556. uint8_t* dst_u,
  1557. uint8_t* dst_v,
  1558. int width) {
  1559. asm volatile(
  1560. "movdqa %5,%%xmm3 \n"
  1561. "movdqa %6,%%xmm4 \n"
  1562. "movdqa %7,%%xmm5 \n"
  1563. "sub %1,%2 \n"
  1564. LABELALIGN
  1565. "1: \n"
  1566. "movdqu (%0),%%xmm0 \n"
  1567. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1568. "pavgb %%xmm7,%%xmm0 \n"
  1569. "movdqu 0x10(%0),%%xmm1 \n"
  1570. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1571. "pavgb %%xmm7,%%xmm1 \n"
  1572. "movdqu 0x20(%0),%%xmm2 \n"
  1573. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1574. "pavgb %%xmm7,%%xmm2 \n"
  1575. "movdqu 0x30(%0),%%xmm6 \n"
  1576. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1577. "pavgb %%xmm7,%%xmm6 \n"
  1578. "lea 0x40(%0),%0 \n"
  1579. "movdqa %%xmm0,%%xmm7 \n"
  1580. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1581. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1582. "pavgb %%xmm7,%%xmm0 \n"
  1583. "movdqa %%xmm2,%%xmm7 \n"
  1584. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1585. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1586. "pavgb %%xmm7,%%xmm2 \n"
  1587. "movdqa %%xmm0,%%xmm1 \n"
  1588. "movdqa %%xmm2,%%xmm6 \n"
  1589. "pmaddubsw %%xmm4,%%xmm0 \n"
  1590. "pmaddubsw %%xmm4,%%xmm2 \n"
  1591. "pmaddubsw %%xmm3,%%xmm1 \n"
  1592. "pmaddubsw %%xmm3,%%xmm6 \n"
  1593. "phaddw %%xmm2,%%xmm0 \n"
  1594. "phaddw %%xmm6,%%xmm1 \n"
  1595. "psraw $0x8,%%xmm0 \n"
  1596. "psraw $0x8,%%xmm1 \n"
  1597. "packsswb %%xmm1,%%xmm0 \n"
  1598. "paddb %%xmm5,%%xmm0 \n"
  1599. "movlps %%xmm0,(%1) \n"
  1600. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1601. "lea 0x8(%1),%1 \n"
  1602. "sub $0x10,%3 \n"
  1603. "jg 1b \n"
  1604. : "+r"(src_abgr0), // %0
  1605. "+r"(dst_u), // %1
  1606. "+r"(dst_v), // %2
  1607. "+rm"(width) // %3
  1608. : "r"((intptr_t)(src_stride_abgr)), // %4
  1609. "m"(kABGRToV), // %5
  1610. "m"(kABGRToU), // %6
  1611. "m"(kAddUV128) // %7
  1612. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1613. }
  1614. void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
  1615. int src_stride_rgba,
  1616. uint8_t* dst_u,
  1617. uint8_t* dst_v,
  1618. int width) {
  1619. asm volatile(
  1620. "movdqa %5,%%xmm3 \n"
  1621. "movdqa %6,%%xmm4 \n"
  1622. "movdqa %7,%%xmm5 \n"
  1623. "sub %1,%2 \n"
  1624. LABELALIGN
  1625. "1: \n"
  1626. "movdqu (%0),%%xmm0 \n"
  1627. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1628. "pavgb %%xmm7,%%xmm0 \n"
  1629. "movdqu 0x10(%0),%%xmm1 \n"
  1630. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1631. "pavgb %%xmm7,%%xmm1 \n"
  1632. "movdqu 0x20(%0),%%xmm2 \n"
  1633. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1634. "pavgb %%xmm7,%%xmm2 \n"
  1635. "movdqu 0x30(%0),%%xmm6 \n"
  1636. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1637. "pavgb %%xmm7,%%xmm6 \n"
  1638. "lea 0x40(%0),%0 \n"
  1639. "movdqa %%xmm0,%%xmm7 \n"
  1640. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1641. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1642. "pavgb %%xmm7,%%xmm0 \n"
  1643. "movdqa %%xmm2,%%xmm7 \n"
  1644. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1645. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1646. "pavgb %%xmm7,%%xmm2 \n"
  1647. "movdqa %%xmm0,%%xmm1 \n"
  1648. "movdqa %%xmm2,%%xmm6 \n"
  1649. "pmaddubsw %%xmm4,%%xmm0 \n"
  1650. "pmaddubsw %%xmm4,%%xmm2 \n"
  1651. "pmaddubsw %%xmm3,%%xmm1 \n"
  1652. "pmaddubsw %%xmm3,%%xmm6 \n"
  1653. "phaddw %%xmm2,%%xmm0 \n"
  1654. "phaddw %%xmm6,%%xmm1 \n"
  1655. "psraw $0x8,%%xmm0 \n"
  1656. "psraw $0x8,%%xmm1 \n"
  1657. "packsswb %%xmm1,%%xmm0 \n"
  1658. "paddb %%xmm5,%%xmm0 \n"
  1659. "movlps %%xmm0,(%1) \n"
  1660. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1661. "lea 0x8(%1),%1 \n"
  1662. "sub $0x10,%3 \n"
  1663. "jg 1b \n"
  1664. : "+r"(src_rgba0), // %0
  1665. "+r"(dst_u), // %1
  1666. "+r"(dst_v), // %2
  1667. "+rm"(width) // %3
  1668. : "r"((intptr_t)(src_stride_rgba)), // %4
  1669. "m"(kRGBAToV), // %5
  1670. "m"(kRGBAToU), // %6
  1671. "m"(kAddUV128) // %7
  1672. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1673. }
  1674. #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
  1675. // Read 8 UV from 444
  1676. #define READYUV444 \
  1677. "movq (%[u_buf]),%%xmm0 \n" \
  1678. "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1679. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  1680. "punpcklbw %%xmm1,%%xmm0 \n" \
  1681. "movq (%[y_buf]),%%xmm4 \n" \
  1682. "punpcklbw %%xmm4,%%xmm4 \n" \
  1683. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1684. // Read 4 UV from 422, upsample to 8 UV
  1685. #define READYUV422 \
  1686. "movd (%[u_buf]),%%xmm0 \n" \
  1687. "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1688. "lea 0x4(%[u_buf]),%[u_buf] \n" \
  1689. "punpcklbw %%xmm1,%%xmm0 \n" \
  1690. "punpcklwd %%xmm0,%%xmm0 \n" \
  1691. "movq (%[y_buf]),%%xmm4 \n" \
  1692. "punpcklbw %%xmm4,%%xmm4 \n" \
  1693. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1694. // Read 4 UV from 422 10 bit, upsample to 8 UV
  1695. // TODO(fbarchard): Consider shufb to replace pack/unpack
  1696. // TODO(fbarchard): Consider pmulhuw to replace psraw
  1697. // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
  1698. #define READYUV210 \
  1699. "movq (%[u_buf]),%%xmm0 \n" \
  1700. "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1701. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  1702. "punpcklwd %%xmm1,%%xmm0 \n" \
  1703. "psraw $0x2,%%xmm0 \n" \
  1704. "packuswb %%xmm0,%%xmm0 \n" \
  1705. "punpcklwd %%xmm0,%%xmm0 \n" \
  1706. "movdqu (%[y_buf]),%%xmm4 \n" \
  1707. "psllw $0x6,%%xmm4 \n" \
  1708. "lea 0x10(%[y_buf]),%[y_buf] \n"
  1709. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  1710. #define READYUVA422 \
  1711. "movd (%[u_buf]),%%xmm0 \n" \
  1712. "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1713. "lea 0x4(%[u_buf]),%[u_buf] \n" \
  1714. "punpcklbw %%xmm1,%%xmm0 \n" \
  1715. "punpcklwd %%xmm0,%%xmm0 \n" \
  1716. "movq (%[y_buf]),%%xmm4 \n" \
  1717. "punpcklbw %%xmm4,%%xmm4 \n" \
  1718. "lea 0x8(%[y_buf]),%[y_buf] \n" \
  1719. "movq (%[a_buf]),%%xmm5 \n" \
  1720. "lea 0x8(%[a_buf]),%[a_buf] \n"
  1721. // Read 4 UV from NV12, upsample to 8 UV
  1722. #define READNV12 \
  1723. "movq (%[uv_buf]),%%xmm0 \n" \
  1724. "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
  1725. "punpcklwd %%xmm0,%%xmm0 \n" \
  1726. "movq (%[y_buf]),%%xmm4 \n" \
  1727. "punpcklbw %%xmm4,%%xmm4 \n" \
  1728. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1729. // Read 4 VU from NV21, upsample to 8 UV
  1730. #define READNV21 \
  1731. "movq (%[vu_buf]),%%xmm0 \n" \
  1732. "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
  1733. "pshufb %[kShuffleNV21], %%xmm0 \n" \
  1734. "movq (%[y_buf]),%%xmm4 \n" \
  1735. "punpcklbw %%xmm4,%%xmm4 \n" \
  1736. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1737. // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
  1738. #define READYUY2 \
  1739. "movdqu (%[yuy2_buf]),%%xmm4 \n" \
  1740. "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  1741. "movdqu (%[yuy2_buf]),%%xmm0 \n" \
  1742. "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  1743. "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
  1744. // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
  1745. #define READUYVY \
  1746. "movdqu (%[uyvy_buf]),%%xmm4 \n" \
  1747. "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  1748. "movdqu (%[uyvy_buf]),%%xmm0 \n" \
  1749. "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  1750. "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
  1751. #if defined(__x86_64__)
  1752. #define YUVTORGB_SETUP(yuvconstants) \
  1753. "movdqa (%[yuvconstants]),%%xmm8 \n" \
  1754. "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
  1755. "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
  1756. "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
  1757. "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
  1758. "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
  1759. "movdqa 192(%[yuvconstants]),%%xmm14 \n"
  1760. // Convert 8 pixels: 8 UV and 8 Y
  1761. #define YUVTORGB16(yuvconstants) \
  1762. "movdqa %%xmm0,%%xmm1 \n" \
  1763. "movdqa %%xmm0,%%xmm2 \n" \
  1764. "movdqa %%xmm0,%%xmm3 \n" \
  1765. "movdqa %%xmm11,%%xmm0 \n" \
  1766. "pmaddubsw %%xmm8,%%xmm1 \n" \
  1767. "psubw %%xmm1,%%xmm0 \n" \
  1768. "movdqa %%xmm12,%%xmm1 \n" \
  1769. "pmaddubsw %%xmm9,%%xmm2 \n" \
  1770. "psubw %%xmm2,%%xmm1 \n" \
  1771. "movdqa %%xmm13,%%xmm2 \n" \
  1772. "pmaddubsw %%xmm10,%%xmm3 \n" \
  1773. "psubw %%xmm3,%%xmm2 \n" \
  1774. "pmulhuw %%xmm14,%%xmm4 \n" \
  1775. "paddsw %%xmm4,%%xmm0 \n" \
  1776. "paddsw %%xmm4,%%xmm1 \n" \
  1777. "paddsw %%xmm4,%%xmm2 \n"
  1778. #define YUVTORGB_REGS \
  1779. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1780. #else
  1781. #define YUVTORGB_SETUP(yuvconstants)
  1782. // Convert 8 pixels: 8 UV and 8 Y
  1783. #define YUVTORGB16(yuvconstants) \
  1784. "movdqa %%xmm0,%%xmm1 \n" \
  1785. "movdqa %%xmm0,%%xmm2 \n" \
  1786. "movdqa %%xmm0,%%xmm3 \n" \
  1787. "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
  1788. "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
  1789. "psubw %%xmm1,%%xmm0 \n" \
  1790. "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
  1791. "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
  1792. "psubw %%xmm2,%%xmm1 \n" \
  1793. "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
  1794. "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
  1795. "psubw %%xmm3,%%xmm2 \n" \
  1796. "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
  1797. "paddsw %%xmm4,%%xmm0 \n" \
  1798. "paddsw %%xmm4,%%xmm1 \n" \
  1799. "paddsw %%xmm4,%%xmm2 \n"
  1800. #define YUVTORGB_REGS
  1801. #endif
  1802. #define YUVTORGB(yuvconstants) \
  1803. YUVTORGB16(yuvconstants) \
  1804. "psraw $0x6,%%xmm0 \n" \
  1805. "psraw $0x6,%%xmm1 \n" \
  1806. "psraw $0x6,%%xmm2 \n" \
  1807. "packuswb %%xmm0,%%xmm0 \n" \
  1808. "packuswb %%xmm1,%%xmm1 \n" \
  1809. "packuswb %%xmm2,%%xmm2 \n"
  1810. // Store 8 ARGB values.
  1811. #define STOREARGB \
  1812. "punpcklbw %%xmm1,%%xmm0 \n" \
  1813. "punpcklbw %%xmm5,%%xmm2 \n" \
  1814. "movdqa %%xmm0,%%xmm1 \n" \
  1815. "punpcklwd %%xmm2,%%xmm0 \n" \
  1816. "punpckhwd %%xmm2,%%xmm1 \n" \
  1817. "movdqu %%xmm0,(%[dst_argb]) \n" \
  1818. "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
  1819. "lea 0x20(%[dst_argb]), %[dst_argb] \n"
  1820. // Store 8 RGBA values.
  1821. #define STORERGBA \
  1822. "pcmpeqb %%xmm5,%%xmm5 \n" \
  1823. "punpcklbw %%xmm2,%%xmm1 \n" \
  1824. "punpcklbw %%xmm0,%%xmm5 \n" \
  1825. "movdqa %%xmm5,%%xmm0 \n" \
  1826. "punpcklwd %%xmm1,%%xmm5 \n" \
  1827. "punpckhwd %%xmm1,%%xmm0 \n" \
  1828. "movdqu %%xmm5,(%[dst_rgba]) \n" \
  1829. "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
  1830. "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
  1831. // Store 8 AR30 values.
  1832. #define STOREAR30 \
  1833. "psraw $0x4,%%xmm0 \n" \
  1834. "psraw $0x4,%%xmm1 \n" \
  1835. "psraw $0x4,%%xmm2 \n" \
  1836. "pminsw %%xmm7,%%xmm0 \n" \
  1837. "pminsw %%xmm7,%%xmm1 \n" \
  1838. "pminsw %%xmm7,%%xmm2 \n" \
  1839. "pmaxsw %%xmm6,%%xmm0 \n" \
  1840. "pmaxsw %%xmm6,%%xmm1 \n" \
  1841. "pmaxsw %%xmm6,%%xmm2 \n" \
  1842. "psllw $0x4,%%xmm2 \n" \
  1843. "movdqa %%xmm0,%%xmm3 \n" \
  1844. "punpcklwd %%xmm2,%%xmm0 \n" \
  1845. "punpckhwd %%xmm2,%%xmm3 \n" \
  1846. "movdqa %%xmm1,%%xmm2 \n" \
  1847. "punpcklwd %%xmm5,%%xmm1 \n" \
  1848. "punpckhwd %%xmm5,%%xmm2 \n" \
  1849. "pslld $0xa,%%xmm1 \n" \
  1850. "pslld $0xa,%%xmm2 \n" \
  1851. "por %%xmm1,%%xmm0 \n" \
  1852. "por %%xmm2,%%xmm3 \n" \
  1853. "movdqu %%xmm0,(%[dst_ar30]) \n" \
  1854. "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
  1855. "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
  1856. void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
  1857. const uint8_t* u_buf,
  1858. const uint8_t* v_buf,
  1859. uint8_t* dst_argb,
  1860. const struct YuvConstants* yuvconstants,
  1861. int width) {
  1862. asm volatile (
  1863. YUVTORGB_SETUP(yuvconstants)
  1864. "sub %[u_buf],%[v_buf] \n"
  1865. "pcmpeqb %%xmm5,%%xmm5 \n"
  1866. LABELALIGN
  1867. "1: \n"
  1868. READYUV444
  1869. YUVTORGB(yuvconstants)
  1870. STOREARGB
  1871. "sub $0x8,%[width] \n"
  1872. "jg 1b \n"
  1873. : [y_buf]"+r"(y_buf), // %[y_buf]
  1874. [u_buf]"+r"(u_buf), // %[u_buf]
  1875. [v_buf]"+r"(v_buf), // %[v_buf]
  1876. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1877. [width]"+rm"(width) // %[width]
  1878. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1879. : "memory", "cc", YUVTORGB_REGS
  1880. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1881. );
  1882. }
  1883. void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
  1884. const uint8_t* u_buf,
  1885. const uint8_t* v_buf,
  1886. uint8_t* dst_rgb24,
  1887. const struct YuvConstants* yuvconstants,
  1888. int width) {
  1889. asm volatile (
  1890. YUVTORGB_SETUP(yuvconstants)
  1891. "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  1892. "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  1893. "sub %[u_buf],%[v_buf] \n"
  1894. LABELALIGN
  1895. "1: \n"
  1896. READYUV422
  1897. YUVTORGB(yuvconstants)
  1898. "punpcklbw %%xmm1,%%xmm0 \n"
  1899. "punpcklbw %%xmm2,%%xmm2 \n"
  1900. "movdqa %%xmm0,%%xmm1 \n"
  1901. "punpcklwd %%xmm2,%%xmm0 \n"
  1902. "punpckhwd %%xmm2,%%xmm1 \n"
  1903. "pshufb %%xmm5,%%xmm0 \n"
  1904. "pshufb %%xmm6,%%xmm1 \n"
  1905. "palignr $0xc,%%xmm0,%%xmm1 \n"
  1906. "movq %%xmm0,(%[dst_rgb24]) \n"
  1907. "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
  1908. "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
  1909. "subl $0x8,%[width] \n"
  1910. "jg 1b \n"
  1911. : [y_buf]"+r"(y_buf), // %[y_buf]
  1912. [u_buf]"+r"(u_buf), // %[u_buf]
  1913. [v_buf]"+r"(v_buf), // %[v_buf]
  1914. [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
  1915. #if defined(__i386__)
  1916. [width]"+m"(width) // %[width]
  1917. #else
  1918. [width]"+rm"(width) // %[width]
  1919. #endif
  1920. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1921. [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  1922. [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  1923. : "memory", "cc", YUVTORGB_REGS
  1924. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1925. );
  1926. }
  1927. void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
  1928. const uint8_t* u_buf,
  1929. const uint8_t* v_buf,
  1930. uint8_t* dst_argb,
  1931. const struct YuvConstants* yuvconstants,
  1932. int width) {
  1933. asm volatile (
  1934. YUVTORGB_SETUP(yuvconstants)
  1935. "sub %[u_buf],%[v_buf] \n"
  1936. "pcmpeqb %%xmm5,%%xmm5 \n"
  1937. LABELALIGN
  1938. "1: \n"
  1939. READYUV422
  1940. YUVTORGB(yuvconstants)
  1941. STOREARGB
  1942. "sub $0x8,%[width] \n"
  1943. "jg 1b \n"
  1944. : [y_buf]"+r"(y_buf), // %[y_buf]
  1945. [u_buf]"+r"(u_buf), // %[u_buf]
  1946. [v_buf]"+r"(v_buf), // %[v_buf]
  1947. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1948. [width]"+rm"(width) // %[width]
  1949. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1950. : "memory", "cc", YUVTORGB_REGS
  1951. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1952. );
  1953. }
  1954. void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
  1955. const uint8_t* u_buf,
  1956. const uint8_t* v_buf,
  1957. uint8_t* dst_ar30,
  1958. const struct YuvConstants* yuvconstants,
  1959. int width) {
  1960. asm volatile (
  1961. YUVTORGB_SETUP(yuvconstants)
  1962. "sub %[u_buf],%[v_buf] \n"
  1963. "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
  1964. "psrlw $14,%%xmm5 \n"
  1965. "psllw $4,%%xmm5 \n" // 2 alpha bits
  1966. "pxor %%xmm6,%%xmm6 \n"
  1967. "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
  1968. "psrlw $6,%%xmm7 \n" // 1023 for max
  1969. LABELALIGN
  1970. "1: \n"
  1971. READYUV422
  1972. YUVTORGB16(yuvconstants)
  1973. STOREAR30
  1974. "sub $0x8,%[width] \n"
  1975. "jg 1b \n"
  1976. : [y_buf]"+r"(y_buf), // %[y_buf]
  1977. [u_buf]"+r"(u_buf), // %[u_buf]
  1978. [v_buf]"+r"(v_buf), // %[v_buf]
  1979. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  1980. [width]"+rm"(width) // %[width]
  1981. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1982. : "memory", "cc", YUVTORGB_REGS
  1983. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  1984. );
  1985. }
  1986. // 10 bit YUV to ARGB
  1987. void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
  1988. const uint16_t* u_buf,
  1989. const uint16_t* v_buf,
  1990. uint8_t* dst_argb,
  1991. const struct YuvConstants* yuvconstants,
  1992. int width) {
  1993. asm volatile (
  1994. YUVTORGB_SETUP(yuvconstants)
  1995. "sub %[u_buf],%[v_buf] \n"
  1996. "pcmpeqb %%xmm5,%%xmm5 \n"
  1997. LABELALIGN
  1998. "1: \n"
  1999. READYUV210
  2000. YUVTORGB(yuvconstants)
  2001. STOREARGB
  2002. "sub $0x8,%[width] \n"
  2003. "jg 1b \n"
  2004. : [y_buf]"+r"(y_buf), // %[y_buf]
  2005. [u_buf]"+r"(u_buf), // %[u_buf]
  2006. [v_buf]"+r"(v_buf), // %[v_buf]
  2007. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2008. [width]"+rm"(width) // %[width]
  2009. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2010. : "memory", "cc", YUVTORGB_REGS
  2011. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2012. );
  2013. }
  2014. // 10 bit YUV to AR30
  2015. void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
  2016. const uint16_t* u_buf,
  2017. const uint16_t* v_buf,
  2018. uint8_t* dst_ar30,
  2019. const struct YuvConstants* yuvconstants,
  2020. int width) {
  2021. asm volatile (
  2022. YUVTORGB_SETUP(yuvconstants)
  2023. "sub %[u_buf],%[v_buf] \n"
  2024. "pcmpeqb %%xmm5,%%xmm5 \n"
  2025. "psrlw $14,%%xmm5 \n"
  2026. "psllw $4,%%xmm5 \n" // 2 alpha bits
  2027. "pxor %%xmm6,%%xmm6 \n"
  2028. "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
  2029. "psrlw $6,%%xmm7 \n" // 1023 for max
  2030. LABELALIGN
  2031. "1: \n"
  2032. READYUV210
  2033. YUVTORGB16(yuvconstants)
  2034. STOREAR30
  2035. "sub $0x8,%[width] \n"
  2036. "jg 1b \n"
  2037. : [y_buf]"+r"(y_buf), // %[y_buf]
  2038. [u_buf]"+r"(u_buf), // %[u_buf]
  2039. [v_buf]"+r"(v_buf), // %[v_buf]
  2040. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2041. [width]"+rm"(width) // %[width]
  2042. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2043. : "memory", "cc", YUVTORGB_REGS
  2044. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2045. );
  2046. }
  2047. #ifdef HAS_I422ALPHATOARGBROW_SSSE3
  2048. void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
  2049. const uint8_t* u_buf,
  2050. const uint8_t* v_buf,
  2051. const uint8_t* a_buf,
  2052. uint8_t* dst_argb,
  2053. const struct YuvConstants* yuvconstants,
  2054. int width) {
  2055. // clang-format off
  2056. asm volatile (
  2057. YUVTORGB_SETUP(yuvconstants)
  2058. "sub %[u_buf],%[v_buf] \n"
  2059. LABELALIGN
  2060. "1: \n"
  2061. READYUVA422
  2062. YUVTORGB(yuvconstants)
  2063. STOREARGB
  2064. "subl $0x8,%[width] \n"
  2065. "jg 1b \n"
  2066. : [y_buf]"+r"(y_buf), // %[y_buf]
  2067. [u_buf]"+r"(u_buf), // %[u_buf]
  2068. [v_buf]"+r"(v_buf), // %[v_buf]
  2069. [a_buf]"+r"(a_buf), // %[a_buf]
  2070. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2071. #if defined(__i386__)
  2072. [width]"+m"(width) // %[width]
  2073. #else
  2074. [width]"+rm"(width) // %[width]
  2075. #endif
  2076. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2077. : "memory", "cc", YUVTORGB_REGS
  2078. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2079. );
  2080. // clang-format on
  2081. }
  2082. #endif // HAS_I422ALPHATOARGBROW_SSSE3
  2083. void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
  2084. const uint8_t* uv_buf,
  2085. uint8_t* dst_argb,
  2086. const struct YuvConstants* yuvconstants,
  2087. int width) {
  2088. // clang-format off
  2089. asm volatile (
  2090. YUVTORGB_SETUP(yuvconstants)
  2091. "pcmpeqb %%xmm5,%%xmm5 \n"
  2092. LABELALIGN
  2093. "1: \n"
  2094. READNV12
  2095. YUVTORGB(yuvconstants)
  2096. STOREARGB
  2097. "sub $0x8,%[width] \n"
  2098. "jg 1b \n"
  2099. : [y_buf]"+r"(y_buf), // %[y_buf]
  2100. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2101. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2102. [width]"+rm"(width) // %[width]
  2103. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2104. : "memory", "cc", YUVTORGB_REGS
  2105. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2106. );
  2107. // clang-format on
  2108. }
  2109. void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
  2110. const uint8_t* vu_buf,
  2111. uint8_t* dst_argb,
  2112. const struct YuvConstants* yuvconstants,
  2113. int width) {
  2114. // clang-format off
  2115. asm volatile (
  2116. YUVTORGB_SETUP(yuvconstants)
  2117. "pcmpeqb %%xmm5,%%xmm5 \n"
  2118. LABELALIGN
  2119. "1: \n"
  2120. READNV21
  2121. YUVTORGB(yuvconstants)
  2122. STOREARGB
  2123. "sub $0x8,%[width] \n"
  2124. "jg 1b \n"
  2125. : [y_buf]"+r"(y_buf), // %[y_buf]
  2126. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2127. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2128. [width]"+rm"(width) // %[width]
  2129. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2130. [kShuffleNV21]"m"(kShuffleNV21)
  2131. : "memory", "cc", YUVTORGB_REGS
  2132. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2133. );
  2134. // clang-format on
  2135. }
  2136. void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
  2137. uint8_t* dst_argb,
  2138. const struct YuvConstants* yuvconstants,
  2139. int width) {
  2140. // clang-format off
  2141. asm volatile (
  2142. YUVTORGB_SETUP(yuvconstants)
  2143. "pcmpeqb %%xmm5,%%xmm5 \n"
  2144. LABELALIGN
  2145. "1: \n"
  2146. READYUY2
  2147. YUVTORGB(yuvconstants)
  2148. STOREARGB
  2149. "sub $0x8,%[width] \n"
  2150. "jg 1b \n"
  2151. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2152. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2153. [width]"+rm"(width) // %[width]
  2154. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2155. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2156. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2157. : "memory", "cc", YUVTORGB_REGS
  2158. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2159. );
  2160. // clang-format on
  2161. }
  2162. void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
  2163. uint8_t* dst_argb,
  2164. const struct YuvConstants* yuvconstants,
  2165. int width) {
  2166. // clang-format off
  2167. asm volatile (
  2168. YUVTORGB_SETUP(yuvconstants)
  2169. "pcmpeqb %%xmm5,%%xmm5 \n"
  2170. LABELALIGN
  2171. "1: \n"
  2172. READUYVY
  2173. YUVTORGB(yuvconstants)
  2174. STOREARGB
  2175. "sub $0x8,%[width] \n"
  2176. "jg 1b \n"
  2177. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2178. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2179. [width]"+rm"(width) // %[width]
  2180. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2181. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2182. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2183. : "memory", "cc", YUVTORGB_REGS
  2184. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2185. );
  2186. // clang-format on
  2187. }
  2188. void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
  2189. const uint8_t* u_buf,
  2190. const uint8_t* v_buf,
  2191. uint8_t* dst_rgba,
  2192. const struct YuvConstants* yuvconstants,
  2193. int width) {
  2194. asm volatile (
  2195. YUVTORGB_SETUP(yuvconstants)
  2196. "sub %[u_buf],%[v_buf] \n"
  2197. "pcmpeqb %%xmm5,%%xmm5 \n"
  2198. LABELALIGN
  2199. "1: \n"
  2200. READYUV422
  2201. YUVTORGB(yuvconstants)
  2202. STORERGBA
  2203. "sub $0x8,%[width] \n"
  2204. "jg 1b \n"
  2205. : [y_buf]"+r"(y_buf), // %[y_buf]
  2206. [u_buf]"+r"(u_buf), // %[u_buf]
  2207. [v_buf]"+r"(v_buf), // %[v_buf]
  2208. [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
  2209. [width]"+rm"(width) // %[width]
  2210. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2211. : "memory", "cc", YUVTORGB_REGS
  2212. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2213. );
  2214. }
  2215. #endif // HAS_I422TOARGBROW_SSSE3
  2216. // Read 16 UV from 444
  2217. #define READYUV444_AVX2 \
  2218. "vmovdqu (%[u_buf]),%%xmm0 \n" \
  2219. "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2220. "lea 0x10(%[u_buf]),%[u_buf] \n" \
  2221. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2222. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2223. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2224. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2225. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2226. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2227. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2228. // Read 8 UV from 422, upsample to 16 UV.
  2229. #define READYUV422_AVX2 \
  2230. "vmovq (%[u_buf]),%%xmm0 \n" \
  2231. "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2232. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  2233. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2234. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2235. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2236. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2237. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2238. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2239. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2240. // Read 8 UV from 210 10 bit, upsample to 16 UV
  2241. // TODO(fbarchard): Consider vshufb to replace pack/unpack
  2242. // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
  2243. #define READYUV210_AVX2 \
  2244. "vmovdqu (%[u_buf]),%%xmm0 \n" \
  2245. "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2246. "lea 0x10(%[u_buf]),%[u_buf] \n" \
  2247. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2248. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2249. "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
  2250. "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
  2251. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2252. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2253. "vmovdqu (%[y_buf]),%%ymm4 \n" \
  2254. "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
  2255. "lea 0x20(%[y_buf]),%[y_buf] \n"
  2256. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  2257. #define READYUVA422_AVX2 \
  2258. "vmovq (%[u_buf]),%%xmm0 \n" \
  2259. "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2260. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  2261. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2262. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2263. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2264. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2265. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2266. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2267. "lea 0x10(%[y_buf]),%[y_buf] \n" \
  2268. "vmovdqu (%[a_buf]),%%xmm5 \n" \
  2269. "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  2270. "lea 0x10(%[a_buf]),%[a_buf] \n"
  2271. // Read 8 UV from NV12, upsample to 16 UV.
  2272. #define READNV12_AVX2 \
  2273. "vmovdqu (%[uv_buf]),%%xmm0 \n" \
  2274. "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
  2275. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2276. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2277. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2278. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2279. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2280. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2281. // Read 8 VU from NV21, upsample to 16 UV.
  2282. #define READNV21_AVX2 \
  2283. "vmovdqu (%[vu_buf]),%%xmm0 \n" \
  2284. "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
  2285. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2286. "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  2287. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2288. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2289. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2290. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2291. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  2292. #define READYUY2_AVX2 \
  2293. "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
  2294. "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  2295. "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
  2296. "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  2297. "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
  2298. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  2299. #define READUYVY_AVX2 \
  2300. "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
  2301. "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  2302. "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
  2303. "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  2304. "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
  2305. #if defined(__x86_64__)
  2306. #define YUVTORGB_SETUP_AVX2(yuvconstants) \
  2307. "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
  2308. "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
  2309. "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
  2310. "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
  2311. "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
  2312. "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
  2313. "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
  2314. #define YUVTORGB16_AVX2(yuvconstants) \
  2315. "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  2316. "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  2317. "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  2318. "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  2319. "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  2320. "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  2321. "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  2322. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  2323. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  2324. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
  2325. #define YUVTORGB_REGS_AVX2 \
  2326. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  2327. #else // Convert 16 pixels: 16 UV and 16 Y.
  2328. #define YUVTORGB_SETUP_AVX2(yuvconstants)
  2329. #define YUVTORGB16_AVX2(yuvconstants) \
  2330. "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
  2331. "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
  2332. "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
  2333. "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
  2334. "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
  2335. "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
  2336. "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
  2337. "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
  2338. "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
  2339. "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
  2340. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  2341. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  2342. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
  2343. #define YUVTORGB_REGS_AVX2
  2344. #endif
  2345. #define YUVTORGB_AVX2(yuvconstants) \
  2346. YUVTORGB16_AVX2(yuvconstants) \
  2347. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  2348. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  2349. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  2350. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2351. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  2352. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  2353. // Store 16 ARGB values.
  2354. #define STOREARGB_AVX2 \
  2355. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2356. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2357. "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  2358. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  2359. "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  2360. "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  2361. "vmovdqu %%ymm1,(%[dst_argb]) \n" \
  2362. "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
  2363. "lea 0x40(%[dst_argb]), %[dst_argb] \n"
  2364. // Store 16 AR30 values.
  2365. #define STOREAR30_AVX2 \
  2366. "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
  2367. "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
  2368. "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
  2369. "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
  2370. "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
  2371. "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
  2372. "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
  2373. "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
  2374. "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
  2375. "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
  2376. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2377. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2378. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  2379. "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
  2380. "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
  2381. "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
  2382. "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
  2383. "vpslld $0xa,%%ymm1,%%ymm1 \n" \
  2384. "vpslld $0xa,%%ymm2,%%ymm2 \n" \
  2385. "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
  2386. "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
  2387. "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
  2388. "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
  2389. "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
  2390. #ifdef HAS_I444TOARGBROW_AVX2
  2391. // 16 pixels
  2392. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  2393. void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
  2394. const uint8_t* u_buf,
  2395. const uint8_t* v_buf,
  2396. uint8_t* dst_argb,
  2397. const struct YuvConstants* yuvconstants,
  2398. int width) {
  2399. asm volatile (
  2400. YUVTORGB_SETUP_AVX2(yuvconstants)
  2401. "sub %[u_buf],%[v_buf] \n"
  2402. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2403. LABELALIGN
  2404. "1: \n"
  2405. READYUV444_AVX2
  2406. YUVTORGB_AVX2(yuvconstants)
  2407. STOREARGB_AVX2
  2408. "sub $0x10,%[width] \n"
  2409. "jg 1b \n"
  2410. "vzeroupper \n"
  2411. : [y_buf]"+r"(y_buf), // %[y_buf]
  2412. [u_buf]"+r"(u_buf), // %[u_buf]
  2413. [v_buf]"+r"(v_buf), // %[v_buf]
  2414. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2415. [width]"+rm"(width) // %[width]
  2416. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2417. : "memory", "cc", YUVTORGB_REGS_AVX2
  2418. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2419. );
  2420. }
  2421. #endif // HAS_I444TOARGBROW_AVX2
  2422. #if defined(HAS_I422TOARGBROW_AVX2)
  2423. // 16 pixels
  2424. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2425. void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
  2426. const uint8_t* u_buf,
  2427. const uint8_t* v_buf,
  2428. uint8_t* dst_argb,
  2429. const struct YuvConstants* yuvconstants,
  2430. int width) {
  2431. asm volatile (
  2432. YUVTORGB_SETUP_AVX2(yuvconstants)
  2433. "sub %[u_buf],%[v_buf] \n"
  2434. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2435. LABELALIGN
  2436. "1: \n"
  2437. READYUV422_AVX2
  2438. YUVTORGB_AVX2(yuvconstants)
  2439. STOREARGB_AVX2
  2440. "sub $0x10,%[width] \n"
  2441. "jg 1b \n"
  2442. "vzeroupper \n"
  2443. : [y_buf]"+r"(y_buf), // %[y_buf]
  2444. [u_buf]"+r"(u_buf), // %[u_buf]
  2445. [v_buf]"+r"(v_buf), // %[v_buf]
  2446. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2447. [width]"+rm"(width) // %[width]
  2448. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2449. : "memory", "cc", YUVTORGB_REGS_AVX2
  2450. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2451. );
  2452. }
  2453. #endif // HAS_I422TOARGBROW_AVX2
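// "8 UV values upsampled to 16 UV" means the READYUV422 macros (defined
// earlier in this file) replicate each chroma sample across a pair of
// horizontally adjacent pixels; no interpolation happens at the row level.
// Hedged scalar sketch of that addressing (hypothetical helper):
static inline void ScalarSampleUV422(const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     int x,  // pixel index within the row
                                     uint8_t* u, uint8_t* v) {
  *u = u_buf[x >> 1];  // two pixels share one U sample
  *v = v_buf[x >> 1];  // ...and one V sample
}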
  2454. #if defined(HAS_I422TOAR30ROW_AVX2)
  2455. // 16 pixels
  2456. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
  2457. void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
  2458. const uint8_t* u_buf,
  2459. const uint8_t* v_buf,
  2460. uint8_t* dst_ar30,
  2461. const struct YuvConstants* yuvconstants,
  2462. int width) {
  2463. asm volatile (
  2464. YUVTORGB_SETUP_AVX2(yuvconstants)
  2465. "sub %[u_buf],%[v_buf] \n"
  2466. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
  2467. "vpsrlw $14,%%ymm5,%%ymm5 \n"
  2468. "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
  2469. "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
  2470. "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
  2471. "vpsrlw $6,%%ymm7,%%ymm7 \n"
  2472. LABELALIGN
  2473. "1: \n"
  2474. READYUV422_AVX2
  2475. YUVTORGB16_AVX2(yuvconstants)
  2476. STOREAR30_AVX2
  2477. "sub $0x10,%[width] \n"
  2478. "jg 1b \n"
  2479. "vzeroupper \n"
  2480. : [y_buf]"+r"(y_buf), // %[y_buf]
  2481. [u_buf]"+r"(u_buf), // %[u_buf]
  2482. [v_buf]"+r"(v_buf), // %[v_buf]
  2483. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2484. [width]"+rm"(width) // %[width]
  2485. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2486. : "memory", "cc", YUVTORGB_REGS_AVX2
  2487. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2488. );
  2489. }
  2490. #endif // HAS_I422TOAR30ROW_AVX2
  2491. #if defined(HAS_I210TOARGBROW_AVX2)
  2492. // 16 pixels
  2493. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2494. void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
  2495. const uint16_t* u_buf,
  2496. const uint16_t* v_buf,
  2497. uint8_t* dst_argb,
  2498. const struct YuvConstants* yuvconstants,
  2499. int width) {
  2500. asm volatile (
  2501. YUVTORGB_SETUP_AVX2(yuvconstants)
  2502. "sub %[u_buf],%[v_buf] \n"
  2503. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2504. LABELALIGN
  2505. "1: \n"
  2506. READYUV210_AVX2
  2507. YUVTORGB_AVX2(yuvconstants)
  2508. STOREARGB_AVX2
  2509. "sub $0x10,%[width] \n"
  2510. "jg 1b \n"
  2511. "vzeroupper \n"
  2512. : [y_buf]"+r"(y_buf), // %[y_buf]
  2513. [u_buf]"+r"(u_buf), // %[u_buf]
  2514. [v_buf]"+r"(v_buf), // %[v_buf]
  2515. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2516. [width]"+rm"(width) // %[width]
  2517. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2518. : "memory", "cc", YUVTORGB_REGS_AVX2
  2519. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2520. );
  2521. }
  2522. #endif // HAS_I210TOARGBROW_AVX2
  2523. #if defined(HAS_I210TOAR30ROW_AVX2)
  2524. // 16 pixels
  2525. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
  2526. void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
  2527. const uint16_t* u_buf,
  2528. const uint16_t* v_buf,
  2529. uint8_t* dst_ar30,
  2530. const struct YuvConstants* yuvconstants,
  2531. int width) {
  2532. asm volatile (
  2533. YUVTORGB_SETUP_AVX2(yuvconstants)
  2534. "sub %[u_buf],%[v_buf] \n"
  2535. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
  2536. "vpsrlw $14,%%ymm5,%%ymm5 \n"
  2537. "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
  2538. "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
  2539. "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
  2540. "vpsrlw $6,%%ymm7,%%ymm7 \n"
  2541. LABELALIGN
  2542. "1: \n"
  2543. READYUV210_AVX2
  2544. YUVTORGB16_AVX2(yuvconstants)
  2545. STOREAR30_AVX2
  2546. "sub $0x10,%[width] \n"
  2547. "jg 1b \n"
  2548. "vzeroupper \n"
  2549. : [y_buf]"+r"(y_buf), // %[y_buf]
  2550. [u_buf]"+r"(u_buf), // %[u_buf]
  2551. [v_buf]"+r"(v_buf), // %[v_buf]
  2552. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2553. [width]"+rm"(width) // %[width]
  2554. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2555. : "memory", "cc", YUVTORGB_REGS_AVX2
  2556. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2557. );
  2558. }
  2559. #endif // HAS_I210TOAR30ROW_AVX2
  2560. #if defined(HAS_I422ALPHATOARGBROW_AVX2)
  2561. // 16 pixels
  2562. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  2563. void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
  2564. const uint8_t* u_buf,
  2565. const uint8_t* v_buf,
  2566. const uint8_t* a_buf,
  2567. uint8_t* dst_argb,
  2568. const struct YuvConstants* yuvconstants,
  2569. int width) {
  2570. // clang-format off
  2571. asm volatile (
  2572. YUVTORGB_SETUP_AVX2(yuvconstants)
  2573. "sub %[u_buf],%[v_buf] \n"
  2574. LABELALIGN
  2575. "1: \n"
  2576. READYUVA422_AVX2
  2577. YUVTORGB_AVX2(yuvconstants)
  2578. STOREARGB_AVX2
  2579. "subl $0x10,%[width] \n"
  2580. "jg 1b \n"
  2581. "vzeroupper \n"
  2582. : [y_buf]"+r"(y_buf), // %[y_buf]
  2583. [u_buf]"+r"(u_buf), // %[u_buf]
  2584. [v_buf]"+r"(v_buf), // %[v_buf]
  2585. [a_buf]"+r"(a_buf), // %[a_buf]
  2586. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2587. #if defined(__i386__)
  2588. [width]"+m"(width) // %[width]
  2589. #else
  2590. [width]"+rm"(width) // %[width]
  2591. #endif
  2592. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2593. : "memory", "cc", YUVTORGB_REGS_AVX2
  2594. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2595. );
  2596. // clang-format on
  2597. }
  2598. #endif // HAS_I422ALPHATOARGBROW_AVX2
  2599. #if defined(HAS_I422TORGBAROW_AVX2)
  2600. // 16 pixels
  2601. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2602. void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
  2603. const uint8_t* u_buf,
  2604. const uint8_t* v_buf,
  2605. uint8_t* dst_argb,
  2606. const struct YuvConstants* yuvconstants,
  2607. int width) {
  2608. asm volatile (
  2609. YUVTORGB_SETUP_AVX2(yuvconstants)
  2610. "sub %[u_buf],%[v_buf] \n"
  2611. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2612. LABELALIGN
  2613. "1: \n"
  2614. READYUV422_AVX2
  2615. YUVTORGB_AVX2(yuvconstants)
  2616. // Step 3: Weave into RGBA
  2617. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  2618. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2619. "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
  2620. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2621. "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
  2622. "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
  2623. "vmovdqu %%ymm0,(%[dst_argb]) \n"
  2624. "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
  2625. "lea 0x40(%[dst_argb]),%[dst_argb] \n"
  2626. "sub $0x10,%[width] \n"
  2627. "jg 1b \n"
  2628. "vzeroupper \n"
  2629. : [y_buf]"+r"(y_buf), // %[y_buf]
  2630. [u_buf]"+r"(u_buf), // %[u_buf]
  2631. [v_buf]"+r"(v_buf), // %[v_buf]
  2632. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2633. [width]"+rm"(width) // %[width]
  2634. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2635. : "memory", "cc", YUVTORGB_REGS_AVX2
  2636. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2637. );
  2638. }
  2639. #endif // HAS_I422TORGBAROW_AVX2
  2640. #if defined(HAS_NV12TOARGBROW_AVX2)
  2641. // 16 pixels.
  2642. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2643. void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
  2644. const uint8_t* uv_buf,
  2645. uint8_t* dst_argb,
  2646. const struct YuvConstants* yuvconstants,
  2647. int width) {
  2648. // clang-format off
  2649. asm volatile (
  2650. YUVTORGB_SETUP_AVX2(yuvconstants)
  2651. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2652. LABELALIGN
  2653. "1: \n"
  2654. READNV12_AVX2
  2655. YUVTORGB_AVX2(yuvconstants)
  2656. STOREARGB_AVX2
  2657. "sub $0x10,%[width] \n"
  2658. "jg 1b \n"
  2659. "vzeroupper \n"
  2660. : [y_buf]"+r"(y_buf), // %[y_buf]
  2661. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2662. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2663. [width]"+rm"(width) // %[width]
  2664. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2665. : "memory", "cc", YUVTORGB_REGS_AVX2
  2666. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2667. );
  2668. // clang-format on
  2669. }
  2670. #endif // HAS_NV12TOARGBROW_AVX2
  2671. #if defined(HAS_NV21TOARGBROW_AVX2)
  2672. // 16 pixels.
  2673. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2674. void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
  2675. const uint8_t* vu_buf,
  2676. uint8_t* dst_argb,
  2677. const struct YuvConstants* yuvconstants,
  2678. int width) {
  2679. // clang-format off
  2680. asm volatile (
  2681. YUVTORGB_SETUP_AVX2(yuvconstants)
  2682. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2683. LABELALIGN
  2684. "1: \n"
  2685. READNV21_AVX2
  2686. YUVTORGB_AVX2(yuvconstants)
  2687. STOREARGB_AVX2
  2688. "sub $0x10,%[width] \n"
  2689. "jg 1b \n"
  2690. "vzeroupper \n"
  2691. : [y_buf]"+r"(y_buf), // %[y_buf]
  2692. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2693. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2694. [width]"+rm"(width) // %[width]
  2695. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2696. [kShuffleNV21]"m"(kShuffleNV21)
  2697. : "memory", "cc", YUVTORGB_REGS_AVX2
  2698. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2699. );
  2700. // clang-format on
  2701. }
  2702. #endif // HAS_NV21TOARGBROW_AVX2
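// NV12 and NV21 differ only in the byte order of the interleaved chroma
// plane: NV12 stores U then V, NV21 stores V then U (READNV21 swaps them via
// kShuffleNV21).  Hedged scalar sketch of the addressing used by these row
// functions (hypothetical helpers):
static inline void ScalarSampleNV12(const uint8_t* uv_buf, int x,
                                    uint8_t* u, uint8_t* v) {
  *u = uv_buf[(x & ~1) + 0];  // U first in NV12
  *v = uv_buf[(x & ~1) + 1];
}
static inline void ScalarSampleNV21(const uint8_t* vu_buf, int x,
                                    uint8_t* u, uint8_t* v) {
  *v = vu_buf[(x & ~1) + 0];  // V first in NV21
  *u = vu_buf[(x & ~1) + 1];
}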
  2703. #if defined(HAS_YUY2TOARGBROW_AVX2)
  2704. // 16 pixels.
  2705. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2706. void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
  2707. uint8_t* dst_argb,
  2708. const struct YuvConstants* yuvconstants,
  2709. int width) {
  2710. // clang-format off
  2711. asm volatile (
  2712. YUVTORGB_SETUP_AVX2(yuvconstants)
  2713. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2714. LABELALIGN
  2715. "1: \n"
  2716. READYUY2_AVX2
  2717. YUVTORGB_AVX2(yuvconstants)
  2718. STOREARGB_AVX2
  2719. "sub $0x10,%[width] \n"
  2720. "jg 1b \n"
  2721. "vzeroupper \n"
  2722. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2723. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2724. [width]"+rm"(width) // %[width]
  2725. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2726. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2727. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2728. : "memory", "cc", YUVTORGB_REGS_AVX2
  2729. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2730. );
  2731. // clang-format on
  2732. }
  2733. #endif // HAS_YUY2TOARGBROW_AVX2
  2734. #if defined(HAS_UYVYTOARGBROW_AVX2)
  2735. // 16 pixels.
  2736. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2737. void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
  2738. uint8_t* dst_argb,
  2739. const struct YuvConstants* yuvconstants,
  2740. int width) {
  2741. // clang-format off
  2742. asm volatile (
  2743. YUVTORGB_SETUP_AVX2(yuvconstants)
  2744. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2745. LABELALIGN
  2746. "1: \n"
  2747. READUYVY_AVX2
  2748. YUVTORGB_AVX2(yuvconstants)
  2749. STOREARGB_AVX2
  2750. "sub $0x10,%[width] \n"
  2751. "jg 1b \n"
  2752. "vzeroupper \n"
  2753. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2754. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2755. [width]"+rm"(width) // %[width]
  2756. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2757. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2758. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2759. : "memory", "cc", YUVTORGB_REGS_AVX2
  2760. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2761. );
  2762. // clang-format on
  2763. }
  2764. #endif // HAS_UYVYTOARGBROW_AVX2
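// YUY2 and UYVY both pack two 4:2:2 pixels into four bytes and differ only in
// byte order (YUY2 = Y0,U,Y1,V; UYVY = U,Y0,V,Y1), which is why the two
// converters above are identical except for their READYUY2/READUYVY shuffle
// tables.  Hedged scalar sketch of the layouts (hypothetical helpers):
static inline void ScalarReadYUY2Pair(const uint8_t* yuy2, uint8_t y[2],
                                      uint8_t* u, uint8_t* v) {
  y[0] = yuy2[0];
  *u = yuy2[1];
  y[1] = yuy2[2];
  *v = yuy2[3];
}
static inline void ScalarReadUYVYPair(const uint8_t* uyvy, uint8_t y[2],
                                      uint8_t* u, uint8_t* v) {
  *u = uyvy[0];
  y[0] = uyvy[1];
  *v = uyvy[2];
  y[1] = uyvy[3];
}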
  2765. #ifdef HAS_I400TOARGBROW_SSE2
  2766. void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
  2767. asm volatile(
  2768. "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
  2769. "movd %%eax,%%xmm2 \n"
  2770. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  2771. "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
  2772. // 16
  2773. "movd %%eax,%%xmm3 \n"
  2774. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  2775. "pcmpeqb %%xmm4,%%xmm4 \n"
  2776. "pslld $0x18,%%xmm4 \n"
  2777. LABELALIGN
  2778. "1: \n"
  2779. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2780. "movq (%0),%%xmm0 \n"
  2781. "lea 0x8(%0),%0 \n"
  2782. "punpcklbw %%xmm0,%%xmm0 \n"
  2783. "pmulhuw %%xmm2,%%xmm0 \n"
  2784. "psubusw %%xmm3,%%xmm0 \n"
  2785. "psrlw $6, %%xmm0 \n"
  2786. "packuswb %%xmm0,%%xmm0 \n"
  2787. // Step 2: Weave into ARGB
  2788. "punpcklbw %%xmm0,%%xmm0 \n"
  2789. "movdqa %%xmm0,%%xmm1 \n"
  2790. "punpcklwd %%xmm0,%%xmm0 \n"
  2791. "punpckhwd %%xmm1,%%xmm1 \n"
  2792. "por %%xmm4,%%xmm0 \n"
  2793. "por %%xmm4,%%xmm1 \n"
  2794. "movdqu %%xmm0,(%1) \n"
  2795. "movdqu %%xmm1,0x10(%1) \n"
  2796. "lea 0x20(%1),%1 \n"
  2797. "sub $0x8,%2 \n"
  2798. "jg 1b \n"
  2799. : "+r"(y_buf), // %0
  2800. "+r"(dst_argb), // %1
  2801. "+rm"(width) // %2
  2802. :
  2803. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  2804. }
  2805. #endif // HAS_I400TOARGBROW_SSE2
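// I400ToARGBRow_SSE2 above (and the AVX2 version below) compute
// G = (Y - 16) * 1.164 entirely in unsigned fixed point: each byte is
// duplicated into a 16-bit word (Y * 0x101), pmulhuw keeps the high 16 bits
// of the product with 0x4a35, 0x0488 is subtracted with unsigned saturation,
// and the result is shifted right by 6.  Worked scalar sketch (hypothetical
// helper, not part of libyuv):
static inline uint8_t ScalarI400ToGray(uint8_t y) {
  uint32_t w = (uint32_t)y * 0x0101u;                    // punpcklbw y,y
  uint32_t hi = (w * 0x4a35u) >> 16;                     // pmulhuw with 18997
  uint32_t g = (hi > 0x0488u ? hi - 0x0488u : 0) >> 6;   // psubusw, psrlw $6
  return (uint8_t)(g > 255 ? 255 : g);                   // packuswb saturation
}
// For example, Y = 128 gives (128 * 257 * 18997) >> 16 = 9535, minus 1160 is
// 8375, and 8375 >> 6 = 130, matching (128 - 16) * 1.164 = 130.4 truncated.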
  2806. #ifdef HAS_I400TOARGBROW_AVX2
  2807. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2808. // note: vpunpcklbw mutates and vpackuswb unmutates.
  2809. void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
  2810. asm volatile(
  2811. "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
  2812. // 16
  2813. "vmovd %%eax,%%xmm2 \n"
  2814. "vbroadcastss %%xmm2,%%ymm2 \n"
  2815. "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
  2816. "vmovd %%eax,%%xmm3 \n"
  2817. "vbroadcastss %%xmm3,%%ymm3 \n"
  2818. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
  2819. "vpslld $0x18,%%ymm4,%%ymm4 \n"
  2820. LABELALIGN
  2821. "1: \n"
  2822. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2823. "vmovdqu (%0),%%xmm0 \n"
  2824. "lea 0x10(%0),%0 \n"
  2825. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2826. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  2827. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  2828. "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
  2829. "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
  2830. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  2831. "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
  2832. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2833. "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
  2834. "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
  2835. "vpor %%ymm4,%%ymm0,%%ymm0 \n"
  2836. "vpor %%ymm4,%%ymm1,%%ymm1 \n"
  2837. "vmovdqu %%ymm0,(%1) \n"
  2838. "vmovdqu %%ymm1,0x20(%1) \n"
  2839. "lea 0x40(%1),%1 \n"
  2840. "sub $0x10,%2 \n"
  2841. "jg 1b \n"
  2842. "vzeroupper \n"
  2843. : "+r"(y_buf), // %0
  2844. "+r"(dst_argb), // %1
  2845. "+rm"(width) // %2
  2846. :
  2847. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  2848. }
  2849. #endif // HAS_I400TOARGBROW_AVX2
  2850. #ifdef HAS_MIRRORROW_SSSE3
  2851. // Shuffle table for reversing the bytes.
  2852. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
  2853. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2854. void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  2855. intptr_t temp_width = (intptr_t)(width);
  2856. asm volatile(
  2857. "movdqa %3,%%xmm5 \n"
  2858. LABELALIGN
  2859. "1: \n"
  2860. "movdqu -0x10(%0,%2,1),%%xmm0 \n"
  2861. "pshufb %%xmm5,%%xmm0 \n"
  2862. "movdqu %%xmm0,(%1) \n"
  2863. "lea 0x10(%1),%1 \n"
  2864. "sub $0x10,%2 \n"
  2865. "jg 1b \n"
  2866. : "+r"(src), // %0
  2867. "+r"(dst), // %1
  2868. "+r"(temp_width) // %2
  2869. : "m"(kShuffleMirror) // %3
  2870. : "memory", "cc", "xmm0", "xmm5");
  2871. }
  2872. #endif // HAS_MIRRORROW_SSSE3
  2873. #ifdef HAS_MIRRORROW_AVX2
  2874. void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  2875. intptr_t temp_width = (intptr_t)(width);
  2876. asm volatile(
  2877. "vbroadcastf128 %3,%%ymm5 \n"
  2878. LABELALIGN
  2879. "1: \n"
  2880. "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
  2881. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  2882. "vpermq $0x4e,%%ymm0,%%ymm0 \n"
  2883. "vmovdqu %%ymm0,(%1) \n"
  2884. "lea 0x20(%1),%1 \n"
  2885. "sub $0x20,%2 \n"
  2886. "jg 1b \n"
  2887. "vzeroupper \n"
  2888. : "+r"(src), // %0
  2889. "+r"(dst), // %1
  2890. "+r"(temp_width) // %2
  2891. : "m"(kShuffleMirror) // %3
  2892. : "memory", "cc", "xmm0", "xmm5");
  2893. }
  2894. #endif // HAS_MIRRORROW_AVX2
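// MirrorRow reverses a row of bytes; the SSSE3 and AVX2 versions above do it
// 16 or 32 bytes at a time by indexing from the end of the source (the
// -0x10(%0,%2,1) / -0x20(%0,%2,1) addressing) and reversing each block with a
// pshufb/vpshufb table.  Scalar equivalent for reference (hypothetical helper):
static inline void ScalarMirrorRow(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}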
  2895. #ifdef HAS_MIRRORUVROW_SSSE3
  2896. // Shuffle table for reversing the bytes of UV channels.
  2897. static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
  2898. 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
  2899. void MirrorUVRow_SSSE3(const uint8_t* src,
  2900. uint8_t* dst_u,
  2901. uint8_t* dst_v,
  2902. int width) {
  2903. intptr_t temp_width = (intptr_t)(width);
  2904. asm volatile(
  2905. "movdqa %4,%%xmm1 \n"
  2906. "lea -0x10(%0,%3,2),%0 \n"
  2907. "sub %1,%2 \n"
  2908. LABELALIGN
  2909. "1: \n"
  2910. "movdqu (%0),%%xmm0 \n"
  2911. "lea -0x10(%0),%0 \n"
  2912. "pshufb %%xmm1,%%xmm0 \n"
  2913. "movlpd %%xmm0,(%1) \n"
  2914. "movhpd %%xmm0,0x00(%1,%2,1) \n"
  2915. "lea 0x8(%1),%1 \n"
  2916. "sub $8,%3 \n"
  2917. "jg 1b \n"
  2918. : "+r"(src), // %0
  2919. "+r"(dst_u), // %1
  2920. "+r"(dst_v), // %2
  2921. "+r"(temp_width) // %3
  2922. : "m"(kShuffleMirrorUV) // %4
  2923. : "memory", "cc", "xmm0", "xmm1");
  2924. }
  2925. #endif // HAS_MIRRORUVROW_SSSE3
  2926. #ifdef HAS_ARGBMIRRORROW_SSE2
  2927. void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  2928. intptr_t temp_width = (intptr_t)(width);
  2929. asm volatile(
  2930. "lea -0x10(%0,%2,4),%0 \n"
  2931. LABELALIGN
  2932. "1: \n"
  2933. "movdqu (%0),%%xmm0 \n"
  2934. "pshufd $0x1b,%%xmm0,%%xmm0 \n"
  2935. "lea -0x10(%0),%0 \n"
  2936. "movdqu %%xmm0,(%1) \n"
  2937. "lea 0x10(%1),%1 \n"
  2938. "sub $0x4,%2 \n"
  2939. "jg 1b \n"
  2940. : "+r"(src), // %0
  2941. "+r"(dst), // %1
  2942. "+r"(temp_width) // %2
  2943. :
  2944. : "memory", "cc", "xmm0");
  2945. }
  2946. #endif // HAS_ARGBMIRRORROW_SSE2
  2947. #ifdef HAS_ARGBMIRRORROW_AVX2
  2948. // Shuffle table for reversing the bytes.
  2949. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2950. void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  2951. intptr_t temp_width = (intptr_t)(width);
  2952. asm volatile(
  2953. "vmovdqu %3,%%ymm5 \n"
  2954. LABELALIGN
  2955. "1: \n"
  2956. "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
  2957. "vmovdqu %%ymm0,(%1) \n"
  2958. "lea 0x20(%1),%1 \n"
  2959. "sub $0x8,%2 \n"
  2960. "jg 1b \n"
  2961. "vzeroupper \n"
  2962. : "+r"(src), // %0
  2963. "+r"(dst), // %1
  2964. "+r"(temp_width) // %2
  2965. : "m"(kARGBShuffleMirror_AVX2) // %3
  2966. : "memory", "cc", "xmm0", "xmm5");
  2967. }
  2968. #endif // HAS_ARGBMIRRORROW_AVX2
  2969. #ifdef HAS_SPLITUVROW_AVX2
  2970. void SplitUVRow_AVX2(const uint8_t* src_uv,
  2971. uint8_t* dst_u,
  2972. uint8_t* dst_v,
  2973. int width) {
  2974. asm volatile(
  2975. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2976. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  2977. "sub %1,%2 \n"
  2978. LABELALIGN
  2979. "1: \n"
  2980. "vmovdqu (%0),%%ymm0 \n"
  2981. "vmovdqu 0x20(%0),%%ymm1 \n"
  2982. "lea 0x40(%0),%0 \n"
  2983. "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
  2984. "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
  2985. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  2986. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  2987. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  2988. "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
  2989. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2990. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2991. "vmovdqu %%ymm0,(%1) \n"
  2992. "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
  2993. "lea 0x20(%1),%1 \n"
  2994. "sub $0x20,%3 \n"
  2995. "jg 1b \n"
  2996. "vzeroupper \n"
  2997. : "+r"(src_uv), // %0
  2998. "+r"(dst_u), // %1
  2999. "+r"(dst_v), // %2
  3000. "+r"(width) // %3
  3001. :
  3002. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  3003. }
  3004. #endif // HAS_SPLITUVROW_AVX2
  3005. #ifdef HAS_SPLITUVROW_SSE2
  3006. void SplitUVRow_SSE2(const uint8_t* src_uv,
  3007. uint8_t* dst_u,
  3008. uint8_t* dst_v,
  3009. int width) {
  3010. asm volatile(
  3011. "pcmpeqb %%xmm5,%%xmm5 \n"
  3012. "psrlw $0x8,%%xmm5 \n"
  3013. "sub %1,%2 \n"
  3014. LABELALIGN
  3015. "1: \n"
  3016. "movdqu (%0),%%xmm0 \n"
  3017. "movdqu 0x10(%0),%%xmm1 \n"
  3018. "lea 0x20(%0),%0 \n"
  3019. "movdqa %%xmm0,%%xmm2 \n"
  3020. "movdqa %%xmm1,%%xmm3 \n"
  3021. "pand %%xmm5,%%xmm0 \n"
  3022. "pand %%xmm5,%%xmm1 \n"
  3023. "packuswb %%xmm1,%%xmm0 \n"
  3024. "psrlw $0x8,%%xmm2 \n"
  3025. "psrlw $0x8,%%xmm3 \n"
  3026. "packuswb %%xmm3,%%xmm2 \n"
  3027. "movdqu %%xmm0,(%1) \n"
  3028. "movdqu %%xmm2,0x00(%1,%2,1) \n"
  3029. "lea 0x10(%1),%1 \n"
  3030. "sub $0x10,%3 \n"
  3031. "jg 1b \n"
  3032. : "+r"(src_uv), // %0
  3033. "+r"(dst_u), // %1
  3034. "+r"(dst_v), // %2
  3035. "+r"(width) // %3
  3036. :
  3037. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  3038. }
  3039. #endif // HAS_SPLITUVROW_SSE2
  3040. #ifdef HAS_MERGEUVROW_AVX2
  3041. void MergeUVRow_AVX2(const uint8_t* src_u,
  3042. const uint8_t* src_v,
  3043. uint8_t* dst_uv,
  3044. int width) {
  3045. asm volatile(
  3046. "sub %0,%1 \n"
  3047. LABELALIGN
  3048. "1: \n"
  3049. "vmovdqu (%0),%%ymm0 \n"
  3050. "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
  3051. "lea 0x20(%0),%0 \n"
  3052. "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
  3053. "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
  3054. "vextractf128 $0x0,%%ymm2,(%2) \n"
  3055. "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
  3056. "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
  3057. "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
  3058. "lea 0x40(%2),%2 \n"
  3059. "sub $0x20,%3 \n"
  3060. "jg 1b \n"
  3061. "vzeroupper \n"
  3062. : "+r"(src_u), // %0
  3063. "+r"(src_v), // %1
  3064. "+r"(dst_uv), // %2
  3065. "+r"(width) // %3
  3066. :
  3067. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3068. }
  3069. #endif // HAS_MERGEUVROW_AVX2
  3070. #ifdef HAS_MERGEUVROW_SSE2
  3071. void MergeUVRow_SSE2(const uint8_t* src_u,
  3072. const uint8_t* src_v,
  3073. uint8_t* dst_uv,
  3074. int width) {
  3075. asm volatile(
  3076. "sub %0,%1 \n"
  3077. LABELALIGN
  3078. "1: \n"
  3079. "movdqu (%0),%%xmm0 \n"
  3080. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  3081. "lea 0x10(%0),%0 \n"
  3082. "movdqa %%xmm0,%%xmm2 \n"
  3083. "punpcklbw %%xmm1,%%xmm0 \n"
  3084. "punpckhbw %%xmm1,%%xmm2 \n"
  3085. "movdqu %%xmm0,(%2) \n"
  3086. "movdqu %%xmm2,0x10(%2) \n"
  3087. "lea 0x20(%2),%2 \n"
  3088. "sub $0x10,%3 \n"
  3089. "jg 1b \n"
  3090. : "+r"(src_u), // %0
  3091. "+r"(src_v), // %1
  3092. "+r"(dst_uv), // %2
  3093. "+r"(width) // %3
  3094. :
  3095. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3096. }
  3097. #endif // HAS_MERGEUVROW_SSE2
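// SplitUVRow and MergeUVRow above are exact inverses: one de-interleaves a
// packed UV plane (as used by NV12) into separate U and V planes, the other
// interleaves them back.  Scalar sketch of the per-element data movement
// (hypothetical helpers, not part of libyuv):
static inline void ScalarSplitUV(const uint8_t* src_uv, uint8_t* dst_u,
                                 uint8_t* dst_v, int i) {
  dst_u[i] = src_uv[2 * i + 0];
  dst_v[i] = src_uv[2 * i + 1];
}
static inline void ScalarMergeUV(const uint8_t* src_u, const uint8_t* src_v,
                                 uint8_t* dst_uv, int i) {
  dst_uv[2 * i + 0] = src_u[i];
  dst_uv[2 * i + 1] = src_v[i];
}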
3098. // Use scale to convert lsb formats to msb, depending on how many bits there are:
  3099. // 128 = 9 bits
  3100. // 64 = 10 bits
  3101. // 16 = 12 bits
  3102. // 1 = 16 bits
  3103. #ifdef HAS_MERGEUVROW_16_AVX2
  3104. void MergeUVRow_16_AVX2(const uint16_t* src_u,
  3105. const uint16_t* src_v,
  3106. uint16_t* dst_uv,
  3107. int scale,
  3108. int width) {
  3109. // clang-format off
  3110. asm volatile (
  3111. "vmovd %4,%%xmm3 \n"
  3112. "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
  3113. "vbroadcastss %%xmm3,%%ymm3 \n"
  3114. "sub %0,%1 \n"
  3115. // 16 pixels per loop.
  3116. LABELALIGN
  3117. "1: \n"
  3118. "vmovdqu (%0),%%ymm0 \n"
  3119. "vmovdqu (%0,%1,1),%%ymm1 \n"
  3120. "add $0x20,%0 \n"
  3121. "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
  3122. "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
  3123. "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
  3124. "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
  3125. "vextractf128 $0x0,%%ymm2,(%2) \n"
  3126. "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
  3127. "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
  3128. "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
  3129. "add $0x40,%2 \n"
  3130. "sub $0x10,%3 \n"
  3131. "jg 1b \n"
  3132. "vzeroupper \n"
  3133. : "+r"(src_u), // %0
  3134. "+r"(src_v), // %1
  3135. "+r"(dst_uv), // %2
  3136. "+r"(width) // %3
  3137. : "r"(scale) // %4
  3138. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  3139. // clang-format on
  3140. }
3141. #endif // HAS_MERGEUVROW_16_AVX2
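// The scale factors listed above are simply 1 << (16 - bits): vpmullw
// multiplies each sample by the scale, shifting an lsb-aligned 9/10/12/16-bit
// value into the most significant bits of its 16-bit lane (a 10-bit sample
// times 64 is the same as << 6, so 1023 becomes 0xFFC0).  Minimal scalar
// sketch of the per-sample operation (hypothetical helper):
static inline uint16_t ScalarLsbToMsb16(uint16_t sample, int scale) {
  return (uint16_t)(sample * scale);
}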
3142. // Use scale to convert lsb formats to msb, depending on how many bits there are:
  3143. // 128 = 9 bits
  3144. // 64 = 10 bits
  3145. // 16 = 12 bits
  3146. // 1 = 16 bits
  3147. #ifdef HAS_MULTIPLYROW_16_AVX2
  3148. void MultiplyRow_16_AVX2(const uint16_t* src_y,
  3149. uint16_t* dst_y,
  3150. int scale,
  3151. int width) {
  3152. // clang-format off
  3153. asm volatile (
  3154. "vmovd %3,%%xmm3 \n"
  3155. "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
  3156. "vbroadcastss %%xmm3,%%ymm3 \n"
  3157. "sub %0,%1 \n"
  3158. // 16 pixels per loop.
  3159. LABELALIGN
  3160. "1: \n"
  3161. "vmovdqu (%0),%%ymm0 \n"
  3162. "vmovdqu 0x20(%0),%%ymm1 \n"
  3163. "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
  3164. "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
  3165. "vmovdqu %%ymm0,(%0,%1) \n"
  3166. "vmovdqu %%ymm1,0x20(%0,%1) \n"
  3167. "add $0x40,%0 \n"
  3168. "sub $0x20,%2 \n"
  3169. "jg 1b \n"
  3170. "vzeroupper \n"
  3171. : "+r"(src_y), // %0
  3172. "+r"(dst_y), // %1
  3173. "+r"(width) // %2
  3174. : "r"(scale) // %3
  3175. : "memory", "cc", "xmm0", "xmm1", "xmm3");
  3176. // clang-format on
  3177. }
  3178. #endif // HAS_MULTIPLYROW_16_AVX2
3179. // Use scale to convert lsb formats down to 8 bits, depending on how many bits there are:
  3180. // 32768 = 9 bits
  3181. // 16384 = 10 bits
  3182. // 4096 = 12 bits
  3183. // 256 = 16 bits
  3184. void Convert16To8Row_SSSE3(const uint16_t* src_y,
  3185. uint8_t* dst_y,
  3186. int scale,
  3187. int width) {
  3188. // clang-format off
  3189. asm volatile (
  3190. "movd %3,%%xmm2 \n"
  3191. "punpcklwd %%xmm2,%%xmm2 \n"
  3192. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  3193. // 32 pixels per loop.
  3194. LABELALIGN
  3195. "1: \n"
  3196. "movdqu (%0),%%xmm0 \n"
  3197. "movdqu 0x10(%0),%%xmm1 \n"
  3198. "add $0x20,%0 \n"
  3199. "pmulhuw %%xmm2,%%xmm0 \n"
  3200. "pmulhuw %%xmm2,%%xmm1 \n"
  3201. "packuswb %%xmm1,%%xmm0 \n"
  3202. "movdqu %%xmm0,(%1) \n"
  3203. "add $0x10,%1 \n"
  3204. "sub $0x10,%2 \n"
  3205. "jg 1b \n"
  3206. : "+r"(src_y), // %0
  3207. "+r"(dst_y), // %1
  3208. "+r"(width) // %2
  3209. : "r"(scale) // %3
  3210. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3211. // clang-format on
  3212. }
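// Convert16To8 relies on pmulhuw returning the high 16 bits of the product,
// i.e. dst = (src * scale) >> 16.  With the scales listed above this is just
// a right shift: 10-bit input with scale 16384 becomes src >> 2 and 16-bit
// input with scale 256 becomes src >> 8, always landing in 0..255.
// Illustrative scalar sketch (hypothetical helper):
static inline uint8_t ScalarConvert16To8(uint16_t src, int scale) {
  uint32_t v = ((uint32_t)src * (uint32_t)scale) >> 16;  // pmulhuw
  return (uint8_t)(v > 255 ? 255 : v);                   // packuswb clamp
}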
  3213. #ifdef HAS_CONVERT16TO8ROW_AVX2
  3214. void Convert16To8Row_AVX2(const uint16_t* src_y,
  3215. uint8_t* dst_y,
  3216. int scale,
  3217. int width) {
  3218. // clang-format off
  3219. asm volatile (
  3220. "vmovd %3,%%xmm2 \n"
  3221. "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
  3222. "vbroadcastss %%xmm2,%%ymm2 \n"
  3223. // 32 pixels per loop.
  3224. LABELALIGN
  3225. "1: \n"
  3226. "vmovdqu (%0),%%ymm0 \n"
  3227. "vmovdqu 0x20(%0),%%ymm1 \n"
  3228. "add $0x40,%0 \n"
  3229. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3230. "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
  3231. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
  3232. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3233. "vmovdqu %%ymm0,(%1) \n"
  3234. "add $0x20,%1 \n"
  3235. "sub $0x20,%2 \n"
  3236. "jg 1b \n"
  3237. "vzeroupper \n"
  3238. : "+r"(src_y), // %0
  3239. "+r"(dst_y), // %1
  3240. "+r"(width) // %2
  3241. : "r"(scale) // %3
  3242. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3243. // clang-format on
  3244. }
  3245. #endif // HAS_CONVERT16TO8ROW_AVX2
3246. // Use scale to convert to lsb formats, depending on how many bits there are:
  3247. // 512 = 9 bits
  3248. // 1024 = 10 bits
  3249. // 4096 = 12 bits
  3250. // TODO(fbarchard): reduce to SSE2
  3251. void Convert8To16Row_SSE2(const uint8_t* src_y,
  3252. uint16_t* dst_y,
  3253. int scale,
  3254. int width) {
  3255. // clang-format off
  3256. asm volatile (
  3257. "movd %3,%%xmm2 \n"
  3258. "punpcklwd %%xmm2,%%xmm2 \n"
  3259. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  3260. // 32 pixels per loop.
  3261. LABELALIGN
  3262. "1: \n"
  3263. "movdqu (%0),%%xmm0 \n"
  3264. "movdqa %%xmm0,%%xmm1 \n"
  3265. "punpcklbw %%xmm0,%%xmm0 \n"
  3266. "punpckhbw %%xmm1,%%xmm1 \n"
  3267. "add $0x10,%0 \n"
  3268. "pmulhuw %%xmm2,%%xmm0 \n"
  3269. "pmulhuw %%xmm2,%%xmm1 \n"
  3270. "movdqu %%xmm0,(%1) \n"
  3271. "movdqu %%xmm1,0x10(%1) \n"
  3272. "add $0x20,%1 \n"
  3273. "sub $0x10,%2 \n"
  3274. "jg 1b \n"
  3275. : "+r"(src_y), // %0
  3276. "+r"(dst_y), // %1
  3277. "+r"(width) // %2
  3278. : "r"(scale) // %3
  3279. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3280. // clang-format on
  3281. }
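// Convert8To16 first duplicates each byte into both halves of a 16-bit word
// (src * 0x0101, so 255 becomes 0xFFFF) and then keeps the high 16 bits of
// the product with the scale.  That maps the full 8-bit range onto the full
// target range: with scale 1024 (10 bits), 255 * 0x0101 * 1024 >> 16 = 1023.
// Illustrative scalar sketch (hypothetical helper):
static inline uint16_t ScalarConvert8To16(uint8_t src, int scale) {
  uint32_t w = (uint32_t)src * 0x0101u;            // punpcklbw src,src
  return (uint16_t)((w * (uint32_t)scale) >> 16);  // pmulhuw
}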
  3282. #ifdef HAS_CONVERT8TO16ROW_AVX2
  3283. void Convert8To16Row_AVX2(const uint8_t* src_y,
  3284. uint16_t* dst_y,
  3285. int scale,
  3286. int width) {
  3287. // clang-format off
  3288. asm volatile (
  3289. "vmovd %3,%%xmm2 \n"
  3290. "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
  3291. "vbroadcastss %%xmm2,%%ymm2 \n"
  3292. // 32 pixels per loop.
  3293. LABELALIGN
  3294. "1: \n"
  3295. "vmovdqu (%0),%%ymm0 \n"
  3296. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3297. "add $0x20,%0 \n"
  3298. "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
  3299. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  3300. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3301. "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
  3302. "vmovdqu %%ymm0,(%1) \n"
  3303. "vmovdqu %%ymm1,0x20(%1) \n"
  3304. "add $0x40,%1 \n"
  3305. "sub $0x20,%2 \n"
  3306. "jg 1b \n"
  3307. "vzeroupper \n"
  3308. : "+r"(src_y), // %0
  3309. "+r"(dst_y), // %1
  3310. "+r"(width) // %2
  3311. : "r"(scale) // %3
  3312. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3313. // clang-format on
  3314. }
  3315. #endif // HAS_CONVERT8TO16ROW_AVX2
  3316. #ifdef HAS_SPLITRGBROW_SSSE3
  3317. // Shuffle table for converting RGB to Planar.
  3318. static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
  3319. 128u, 128u, 128u, 128u, 128u, 128u,
  3320. 128u, 128u, 128u, 128u};
  3321. static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
  3322. 2u, 5u, 8u, 11u, 14u, 128u,
  3323. 128u, 128u, 128u, 128u};
  3324. static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
  3325. 128u, 128u, 128u, 128u, 128u, 1u,
  3326. 4u, 7u, 10u, 13u};
  3327. static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
  3328. 128u, 128u, 128u, 128u, 128u, 128u,
  3329. 128u, 128u, 128u, 128u};
  3330. static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
  3331. 3u, 6u, 9u, 12u, 15u, 128u,
  3332. 128u, 128u, 128u, 128u};
  3333. static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
  3334. 128u, 128u, 128u, 128u, 128u, 2u,
  3335. 5u, 8u, 11u, 14u};
  3336. static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
  3337. 128u, 128u, 128u, 128u, 128u, 128u,
  3338. 128u, 128u, 128u, 128u};
  3339. static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
  3340. 4u, 7u, 10u, 13u, 128u, 128u,
  3341. 128u, 128u, 128u, 128u};
  3342. static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
  3343. 128u, 128u, 128u, 128u, 0u, 3u,
  3344. 6u, 9u, 12u, 15u};
  3345. void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
  3346. uint8_t* dst_r,
  3347. uint8_t* dst_g,
  3348. uint8_t* dst_b,
  3349. int width) {
  3350. asm volatile(
  3351. LABELALIGN
  3352. "1: \n"
  3353. "movdqu (%0),%%xmm0 \n"
  3354. "movdqu 0x10(%0),%%xmm1 \n"
  3355. "movdqu 0x20(%0),%%xmm2 \n"
  3356. "pshufb %5, %%xmm0 \n"
  3357. "pshufb %6, %%xmm1 \n"
  3358. "pshufb %7, %%xmm2 \n"
  3359. "por %%xmm1,%%xmm0 \n"
  3360. "por %%xmm2,%%xmm0 \n"
  3361. "movdqu %%xmm0,(%1) \n"
  3362. "lea 0x10(%1),%1 \n"
  3363. "movdqu (%0),%%xmm0 \n"
  3364. "movdqu 0x10(%0),%%xmm1 \n"
  3365. "movdqu 0x20(%0),%%xmm2 \n"
  3366. "pshufb %8, %%xmm0 \n"
  3367. "pshufb %9, %%xmm1 \n"
  3368. "pshufb %10, %%xmm2 \n"
  3369. "por %%xmm1,%%xmm0 \n"
  3370. "por %%xmm2,%%xmm0 \n"
  3371. "movdqu %%xmm0,(%2) \n"
  3372. "lea 0x10(%2),%2 \n"
  3373. "movdqu (%0),%%xmm0 \n"
  3374. "movdqu 0x10(%0),%%xmm1 \n"
  3375. "movdqu 0x20(%0),%%xmm2 \n"
  3376. "pshufb %11, %%xmm0 \n"
  3377. "pshufb %12, %%xmm1 \n"
  3378. "pshufb %13, %%xmm2 \n"
  3379. "por %%xmm1,%%xmm0 \n"
  3380. "por %%xmm2,%%xmm0 \n"
  3381. "movdqu %%xmm0,(%3) \n"
  3382. "lea 0x10(%3),%3 \n"
  3383. "lea 0x30(%0),%0 \n"
  3384. "sub $0x10,%4 \n"
  3385. "jg 1b \n"
  3386. : "+r"(src_rgb), // %0
  3387. "+r"(dst_r), // %1
  3388. "+r"(dst_g), // %2
  3389. "+r"(dst_b), // %3
  3390. "+r"(width) // %4
  3391. : "m"(kShuffleMaskRGBToR0), // %5
  3392. "m"(kShuffleMaskRGBToR1), // %6
  3393. "m"(kShuffleMaskRGBToR2), // %7
  3394. "m"(kShuffleMaskRGBToG0), // %8
  3395. "m"(kShuffleMaskRGBToG1), // %9
  3396. "m"(kShuffleMaskRGBToG2), // %10
  3397. "m"(kShuffleMaskRGBToB0), // %11
  3398. "m"(kShuffleMaskRGBToB1), // %12
  3399. "m"(kShuffleMaskRGBToB2) // %13
  3400. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3401. }
  3402. #endif // HAS_SPLITRGBROW_SSSE3
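// SplitRGBRow_SSSE3 splits packed 24-bit RGB into three planes by running
// three pshufb passes over the same 48 source bytes, each mask keeping every
// third byte for one channel.  Scalar sketch of the per-pixel data movement
// (hypothetical helper):
static inline void ScalarSplitRGB(const uint8_t* src_rgb, uint8_t* dst_r,
                                  uint8_t* dst_g, uint8_t* dst_b, int i) {
  dst_r[i] = src_rgb[3 * i + 0];
  dst_g[i] = src_rgb[3 * i + 1];
  dst_b[i] = src_rgb[3 * i + 2];
}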
  3403. #ifdef HAS_MERGERGBROW_SSSE3
3404. // Shuffle table for converting Planar to RGB.
  3405. static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
  3406. 2u, 128u, 128u, 3u, 128u, 128u,
  3407. 4u, 128u, 128u, 5u};
  3408. static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
  3409. 128u, 2u, 128u, 128u, 3u, 128u,
  3410. 128u, 4u, 128u, 128u};
  3411. static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
  3412. 128u, 128u, 2u, 128u, 128u, 3u,
  3413. 128u, 128u, 4u, 128u};
  3414. static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
  3415. 7u, 128u, 128u, 8u, 128u, 128u,
  3416. 9u, 128u, 128u, 10u};
  3417. static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
  3418. 128u, 7u, 128u, 128u, 8u, 128u,
  3419. 128u, 9u, 128u, 128u};
  3420. static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
  3421. 128u, 128u, 8u, 128u, 128u, 9u,
  3422. 128u, 128u, 10u, 128u};
  3423. static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
  3424. 12u, 128u, 128u, 13u, 128u, 128u,
  3425. 14u, 128u, 128u, 15u};
  3426. static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
  3427. 128u, 13u, 128u, 128u, 14u, 128u,
  3428. 128u, 15u, 128u, 128u};
  3429. static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
  3430. 128u, 128u, 13u, 128u, 128u, 14u,
  3431. 128u, 128u, 15u, 128u};
  3432. void MergeRGBRow_SSSE3(const uint8_t* src_r,
  3433. const uint8_t* src_g,
  3434. const uint8_t* src_b,
  3435. uint8_t* dst_rgb,
  3436. int width) {
  3437. asm volatile(
  3438. LABELALIGN
  3439. "1: \n"
  3440. "movdqu (%0),%%xmm0 \n"
  3441. "movdqu (%1),%%xmm1 \n"
  3442. "movdqu (%2),%%xmm2 \n"
  3443. "pshufb %5, %%xmm0 \n"
  3444. "pshufb %6, %%xmm1 \n"
  3445. "pshufb %7, %%xmm2 \n"
  3446. "por %%xmm1,%%xmm0 \n"
  3447. "por %%xmm2,%%xmm0 \n"
  3448. "movdqu %%xmm0,(%3) \n"
  3449. "movdqu (%0),%%xmm0 \n"
  3450. "movdqu (%1),%%xmm1 \n"
  3451. "movdqu (%2),%%xmm2 \n"
  3452. "pshufb %8, %%xmm0 \n"
  3453. "pshufb %9, %%xmm1 \n"
  3454. "pshufb %10, %%xmm2 \n"
  3455. "por %%xmm1,%%xmm0 \n"
  3456. "por %%xmm2,%%xmm0 \n"
  3457. "movdqu %%xmm0,16(%3) \n"
  3458. "movdqu (%0),%%xmm0 \n"
  3459. "movdqu (%1),%%xmm1 \n"
  3460. "movdqu (%2),%%xmm2 \n"
  3461. "pshufb %11, %%xmm0 \n"
  3462. "pshufb %12, %%xmm1 \n"
  3463. "pshufb %13, %%xmm2 \n"
  3464. "por %%xmm1,%%xmm0 \n"
  3465. "por %%xmm2,%%xmm0 \n"
  3466. "movdqu %%xmm0,32(%3) \n"
  3467. "lea 0x10(%0),%0 \n"
  3468. "lea 0x10(%1),%1 \n"
  3469. "lea 0x10(%2),%2 \n"
  3470. "lea 0x30(%3),%3 \n"
  3471. "sub $0x10,%4 \n"
  3472. "jg 1b \n"
  3473. : "+r"(src_r), // %0
  3474. "+r"(src_g), // %1
  3475. "+r"(src_b), // %2
  3476. "+r"(dst_rgb), // %3
  3477. "+r"(width) // %4
  3478. : "m"(kShuffleMaskRToRGB0), // %5
  3479. "m"(kShuffleMaskGToRGB0), // %6
  3480. "m"(kShuffleMaskBToRGB0), // %7
  3481. "m"(kShuffleMaskRToRGB1), // %8
  3482. "m"(kShuffleMaskGToRGB1), // %9
  3483. "m"(kShuffleMaskBToRGB1), // %10
  3484. "m"(kShuffleMaskRToRGB2), // %11
  3485. "m"(kShuffleMaskGToRGB2), // %12
  3486. "m"(kShuffleMaskBToRGB2) // %13
  3487. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3488. }
  3489. #endif // HAS_MERGERGBROW_SSSE3
  3490. #ifdef HAS_COPYROW_SSE2
  3491. void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3492. asm volatile(
  3493. "test $0xf,%0 \n"
  3494. "jne 2f \n"
  3495. "test $0xf,%1 \n"
  3496. "jne 2f \n"
  3497. LABELALIGN
  3498. "1: \n"
  3499. "movdqa (%0),%%xmm0 \n"
  3500. "movdqa 0x10(%0),%%xmm1 \n"
  3501. "lea 0x20(%0),%0 \n"
  3502. "movdqa %%xmm0,(%1) \n"
  3503. "movdqa %%xmm1,0x10(%1) \n"
  3504. "lea 0x20(%1),%1 \n"
  3505. "sub $0x20,%2 \n"
  3506. "jg 1b \n"
  3507. "jmp 9f \n"
  3508. LABELALIGN
  3509. "2: \n"
  3510. "movdqu (%0),%%xmm0 \n"
  3511. "movdqu 0x10(%0),%%xmm1 \n"
  3512. "lea 0x20(%0),%0 \n"
  3513. "movdqu %%xmm0,(%1) \n"
  3514. "movdqu %%xmm1,0x10(%1) \n"
  3515. "lea 0x20(%1),%1 \n"
  3516. "sub $0x20,%2 \n"
  3517. "jg 2b \n"
  3518. LABELALIGN "9: \n"
  3519. : "+r"(src), // %0
  3520. "+r"(dst), // %1
  3521. "+r"(width) // %2
  3522. :
  3523. : "memory", "cc", "xmm0", "xmm1");
  3524. }
  3525. #endif // HAS_COPYROW_SSE2
  3526. #ifdef HAS_COPYROW_AVX
  3527. void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
  3528. asm volatile(
  3529. LABELALIGN
  3530. "1: \n"
  3531. "vmovdqu (%0),%%ymm0 \n"
  3532. "vmovdqu 0x20(%0),%%ymm1 \n"
  3533. "lea 0x40(%0),%0 \n"
  3534. "vmovdqu %%ymm0,(%1) \n"
  3535. "vmovdqu %%ymm1,0x20(%1) \n"
  3536. "lea 0x40(%1),%1 \n"
  3537. "sub $0x40,%2 \n"
  3538. "jg 1b \n"
  3539. : "+r"(src), // %0
  3540. "+r"(dst), // %1
  3541. "+r"(width) // %2
  3542. :
  3543. : "memory", "cc", "xmm0", "xmm1");
  3544. }
  3545. #endif // HAS_COPYROW_AVX
  3546. #ifdef HAS_COPYROW_ERMS
  3547. // Multiple of 1.
  3548. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  3549. size_t width_tmp = (size_t)(width);
  3550. asm volatile(
  3551. "rep movsb \n"
  3552. : "+S"(src), // %0
  3553. "+D"(dst), // %1
  3554. "+c"(width_tmp) // %2
  3555. :
  3556. : "memory", "cc");
  3557. }
  3558. #endif // HAS_COPYROW_ERMS
  3559. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3560. // width in pixels
  3561. void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3562. asm volatile(
  3563. "pcmpeqb %%xmm0,%%xmm0 \n"
  3564. "pslld $0x18,%%xmm0 \n"
  3565. "pcmpeqb %%xmm1,%%xmm1 \n"
  3566. "psrld $0x8,%%xmm1 \n"
  3567. LABELALIGN
  3568. "1: \n"
  3569. "movdqu (%0),%%xmm2 \n"
  3570. "movdqu 0x10(%0),%%xmm3 \n"
  3571. "lea 0x20(%0),%0 \n"
  3572. "movdqu (%1),%%xmm4 \n"
  3573. "movdqu 0x10(%1),%%xmm5 \n"
  3574. "pand %%xmm0,%%xmm2 \n"
  3575. "pand %%xmm0,%%xmm3 \n"
  3576. "pand %%xmm1,%%xmm4 \n"
  3577. "pand %%xmm1,%%xmm5 \n"
  3578. "por %%xmm4,%%xmm2 \n"
  3579. "por %%xmm5,%%xmm3 \n"
  3580. "movdqu %%xmm2,(%1) \n"
  3581. "movdqu %%xmm3,0x10(%1) \n"
  3582. "lea 0x20(%1),%1 \n"
  3583. "sub $0x8,%2 \n"
  3584. "jg 1b \n"
  3585. : "+r"(src), // %0
  3586. "+r"(dst), // %1
  3587. "+r"(width) // %2
  3588. :
  3589. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3590. }
  3591. #endif // HAS_ARGBCOPYALPHAROW_SSE2
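// ARGBCopyAlphaRow keeps the destination's B, G and R bytes and replaces only
// the alpha byte with the source's: the SSE2 version masks the source with
// 0xFF000000 (pslld $0x18 on all ones) and the destination with 0x00FFFFFF
// (psrld $0x8) before OR-ing.  Scalar sketch per pixel (hypothetical helper):
static inline uint32_t ScalarCopyAlpha(uint32_t src_argb, uint32_t dst_argb) {
  return (src_argb & 0xFF000000u) | (dst_argb & 0x00FFFFFFu);
}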
  3592. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3593. // width in pixels
  3594. void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  3595. asm volatile(
  3596. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  3597. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  3598. LABELALIGN
  3599. "1: \n"
  3600. "vmovdqu (%0),%%ymm1 \n"
  3601. "vmovdqu 0x20(%0),%%ymm2 \n"
  3602. "lea 0x40(%0),%0 \n"
  3603. "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
  3604. "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
  3605. "vmovdqu %%ymm1,(%1) \n"
  3606. "vmovdqu %%ymm2,0x20(%1) \n"
  3607. "lea 0x40(%1),%1 \n"
  3608. "sub $0x10,%2 \n"
  3609. "jg 1b \n"
  3610. "vzeroupper \n"
  3611. : "+r"(src), // %0
  3612. "+r"(dst), // %1
  3613. "+r"(width) // %2
  3614. :
  3615. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3616. }
  3617. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  3618. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  3619. // width in pixels
  3620. void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
  3621. uint8_t* dst_a,
  3622. int width) {
  3623. asm volatile(
  3624. LABELALIGN
  3625. "1: \n"
  3626. "movdqu (%0), %%xmm0 \n"
  3627. "movdqu 0x10(%0), %%xmm1 \n"
  3628. "lea 0x20(%0), %0 \n"
  3629. "psrld $0x18, %%xmm0 \n"
  3630. "psrld $0x18, %%xmm1 \n"
  3631. "packssdw %%xmm1, %%xmm0 \n"
  3632. "packuswb %%xmm0, %%xmm0 \n"
  3633. "movq %%xmm0,(%1) \n"
  3634. "lea 0x8(%1), %1 \n"
  3635. "sub $0x8, %2 \n"
  3636. "jg 1b \n"
  3637. : "+r"(src_argb), // %0
  3638. "+r"(dst_a), // %1
  3639. "+rm"(width) // %2
  3640. :
  3641. : "memory", "cc", "xmm0", "xmm1");
  3642. }
  3643. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  3644. #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
  3645. static const uvec8 kShuffleAlphaShort_AVX2 = {
  3646. 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
  3647. 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
  3648. void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
  3649. uint8_t* dst_a,
  3650. int width) {
  3651. asm volatile(
  3652. "vmovdqa %3,%%ymm4 \n"
  3653. "vbroadcastf128 %4,%%ymm5 \n"
  3654. LABELALIGN
  3655. "1: \n"
  3656. "vmovdqu (%0), %%ymm0 \n"
  3657. "vmovdqu 0x20(%0), %%ymm1 \n"
  3658. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
  3659. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  3660. "vmovdqu 0x40(%0), %%ymm2 \n"
  3661. "vmovdqu 0x60(%0), %%ymm3 \n"
  3662. "lea 0x80(%0), %0 \n"
  3663. "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
  3664. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  3665. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  3666. "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
  3667. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  3668. "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
  3669. "vmovdqu %%ymm0,(%1) \n"
  3670. "lea 0x20(%1),%1 \n"
  3671. "sub $0x20, %2 \n"
  3672. "jg 1b \n"
  3673. "vzeroupper \n"
  3674. : "+r"(src_argb), // %0
  3675. "+r"(dst_a), // %1
  3676. "+rm"(width) // %2
  3677. : "m"(kPermdARGBToY_AVX), // %3
  3678. "m"(kShuffleAlphaShort_AVX2) // %4
  3679. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3680. }
  3681. #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
  3682. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3683. // width in pixels
  3684. void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3685. asm volatile(
  3686. "pcmpeqb %%xmm0,%%xmm0 \n"
  3687. "pslld $0x18,%%xmm0 \n"
  3688. "pcmpeqb %%xmm1,%%xmm1 \n"
  3689. "psrld $0x8,%%xmm1 \n"
  3690. LABELALIGN
  3691. "1: \n"
  3692. "movq (%0),%%xmm2 \n"
  3693. "lea 0x8(%0),%0 \n"
  3694. "punpcklbw %%xmm2,%%xmm2 \n"
  3695. "punpckhwd %%xmm2,%%xmm3 \n"
  3696. "punpcklwd %%xmm2,%%xmm2 \n"
  3697. "movdqu (%1),%%xmm4 \n"
  3698. "movdqu 0x10(%1),%%xmm5 \n"
  3699. "pand %%xmm0,%%xmm2 \n"
  3700. "pand %%xmm0,%%xmm3 \n"
  3701. "pand %%xmm1,%%xmm4 \n"
  3702. "pand %%xmm1,%%xmm5 \n"
  3703. "por %%xmm4,%%xmm2 \n"
  3704. "por %%xmm5,%%xmm3 \n"
  3705. "movdqu %%xmm2,(%1) \n"
  3706. "movdqu %%xmm3,0x10(%1) \n"
  3707. "lea 0x20(%1),%1 \n"
  3708. "sub $0x8,%2 \n"
  3709. "jg 1b \n"
  3710. : "+r"(src), // %0
  3711. "+r"(dst), // %1
  3712. "+r"(width) // %2
  3713. :
  3714. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3715. }
  3716. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3717. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3718. // width in pixels
  3719. void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  3720. asm volatile(
  3721. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  3722. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  3723. LABELALIGN
  3724. "1: \n"
  3725. "vpmovzxbd (%0),%%ymm1 \n"
  3726. "vpmovzxbd 0x8(%0),%%ymm2 \n"
  3727. "lea 0x10(%0),%0 \n"
  3728. "vpslld $0x18,%%ymm1,%%ymm1 \n"
  3729. "vpslld $0x18,%%ymm2,%%ymm2 \n"
  3730. "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
  3731. "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
  3732. "vmovdqu %%ymm1,(%1) \n"
  3733. "vmovdqu %%ymm2,0x20(%1) \n"
  3734. "lea 0x40(%1),%1 \n"
  3735. "sub $0x10,%2 \n"
  3736. "jg 1b \n"
  3737. "vzeroupper \n"
  3738. : "+r"(src), // %0
  3739. "+r"(dst), // %1
  3740. "+r"(width) // %2
  3741. :
  3742. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3743. }
  3744. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3745. #ifdef HAS_SETROW_X86
  3746. void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  3747. size_t width_tmp = (size_t)(width >> 2);
  3748. const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
  3749. asm volatile(
  3750. "rep stosl \n"
  3751. : "+D"(dst), // %0
  3752. "+c"(width_tmp) // %1
  3753. : "a"(v32) // %2
  3754. : "memory", "cc");
  3755. }
  3756. void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  3757. size_t width_tmp = (size_t)(width);
  3758. asm volatile(
  3759. "rep stosb \n"
  3760. : "+D"(dst), // %0
  3761. "+c"(width_tmp) // %1
  3762. : "a"(v8) // %2
  3763. : "memory", "cc");
  3764. }
  3765. void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  3766. size_t width_tmp = (size_t)(width);
  3767. asm volatile(
  3768. "rep stosl \n"
  3769. : "+D"(dst_argb), // %0
  3770. "+c"(width_tmp) // %1
  3771. : "a"(v32) // %2
  3772. : "memory", "cc");
  3773. }
  3774. #endif // HAS_SETROW_X86
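// SetRow_X86 broadcasts the byte with a multiply: v8 * 0x01010101 replicates
// it into all four bytes of a 32-bit word, and "rep stosl" then stores that
// word width >> 2 times, which is why width_tmp is width >> 2.  A minimal
// sketch of the broadcast step only (hypothetical helper):
static inline uint32_t ScalarBroadcastByte(uint8_t v8) {
  return v8 * 0x01010101u;  // e.g. 0x7f -> 0x7f7f7f7f
}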
  3775. #ifdef HAS_YUY2TOYROW_SSE2
  3776. void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  3777. asm volatile(
  3778. "pcmpeqb %%xmm5,%%xmm5 \n"
  3779. "psrlw $0x8,%%xmm5 \n"
  3780. LABELALIGN
  3781. "1: \n"
  3782. "movdqu (%0),%%xmm0 \n"
  3783. "movdqu 0x10(%0),%%xmm1 \n"
  3784. "lea 0x20(%0),%0 \n"
  3785. "pand %%xmm5,%%xmm0 \n"
  3786. "pand %%xmm5,%%xmm1 \n"
  3787. "packuswb %%xmm1,%%xmm0 \n"
  3788. "movdqu %%xmm0,(%1) \n"
  3789. "lea 0x10(%1),%1 \n"
  3790. "sub $0x10,%2 \n"
  3791. "jg 1b \n"
  3792. : "+r"(src_yuy2), // %0
  3793. "+r"(dst_y), // %1
  3794. "+r"(width) // %2
  3795. :
  3796. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  3797. }
  3798. void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
  3799. int stride_yuy2,
  3800. uint8_t* dst_u,
  3801. uint8_t* dst_v,
  3802. int width) {
  3803. asm volatile(
  3804. "pcmpeqb %%xmm5,%%xmm5 \n"
  3805. "psrlw $0x8,%%xmm5 \n"
  3806. "sub %1,%2 \n"
  3807. LABELALIGN
  3808. "1: \n"
  3809. "movdqu (%0),%%xmm0 \n"
  3810. "movdqu 0x10(%0),%%xmm1 \n"
  3811. "movdqu 0x00(%0,%4,1),%%xmm2 \n"
  3812. "movdqu 0x10(%0,%4,1),%%xmm3 \n"
  3813. "lea 0x20(%0),%0 \n"
  3814. "pavgb %%xmm2,%%xmm0 \n"
  3815. "pavgb %%xmm3,%%xmm1 \n"
  3816. "psrlw $0x8,%%xmm0 \n"
  3817. "psrlw $0x8,%%xmm1 \n"
  3818. "packuswb %%xmm1,%%xmm0 \n"
  3819. "movdqa %%xmm0,%%xmm1 \n"
  3820. "pand %%xmm5,%%xmm0 \n"
  3821. "packuswb %%xmm0,%%xmm0 \n"
  3822. "psrlw $0x8,%%xmm1 \n"
  3823. "packuswb %%xmm1,%%xmm1 \n"
  3824. "movq %%xmm0,(%1) \n"
  3825. "movq %%xmm1,0x00(%1,%2,1) \n"
  3826. "lea 0x8(%1),%1 \n"
  3827. "sub $0x10,%3 \n"
  3828. "jg 1b \n"
  3829. : "+r"(src_yuy2), // %0
  3830. "+r"(dst_u), // %1
  3831. "+r"(dst_v), // %2
  3832. "+r"(width) // %3
  3833. : "r"((intptr_t)(stride_yuy2)) // %4
  3834. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  3835. }
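// YUY2ToUVRow averages the chroma of two adjacent rows (the pavgb against the
// row at stride_yuy2) and then splits the even/odd bytes of the result into U
// and V.  Hedged scalar sketch of one output pair (hypothetical helper; pavgb
// rounds up):
static inline void ScalarYUY2ToUV(const uint8_t* row0, const uint8_t* row1,
                                  int i, uint8_t* u, uint8_t* v) {
  // Each group of four bytes Y0,U,Y1,V covers two pixels of one row.
  *u = (uint8_t)((row0[4 * i + 1] + row1[4 * i + 1] + 1) >> 1);
  *v = (uint8_t)((row0[4 * i + 3] + row1[4 * i + 3] + 1) >> 1);
}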
  3836. void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
  3837. uint8_t* dst_u,
  3838. uint8_t* dst_v,
  3839. int width) {
  3840. asm volatile(
  3841. "pcmpeqb %%xmm5,%%xmm5 \n"
  3842. "psrlw $0x8,%%xmm5 \n"
  3843. "sub %1,%2 \n"
  3844. LABELALIGN
  3845. "1: \n"
  3846. "movdqu (%0),%%xmm0 \n"
  3847. "movdqu 0x10(%0),%%xmm1 \n"
  3848. "lea 0x20(%0),%0 \n"
  3849. "psrlw $0x8,%%xmm0 \n"
  3850. "psrlw $0x8,%%xmm1 \n"
  3851. "packuswb %%xmm1,%%xmm0 \n"
  3852. "movdqa %%xmm0,%%xmm1 \n"
  3853. "pand %%xmm5,%%xmm0 \n"
  3854. "packuswb %%xmm0,%%xmm0 \n"
  3855. "psrlw $0x8,%%xmm1 \n"
  3856. "packuswb %%xmm1,%%xmm1 \n"
  3857. "movq %%xmm0,(%1) \n"
  3858. "movq %%xmm1,0x00(%1,%2,1) \n"
  3859. "lea 0x8(%1),%1 \n"
  3860. "sub $0x10,%3 \n"
  3861. "jg 1b \n"
  3862. : "+r"(src_yuy2), // %0
  3863. "+r"(dst_u), // %1
  3864. "+r"(dst_v), // %2
  3865. "+r"(width) // %3
  3866. :
  3867. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  3868. }
  3869. void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  3870. asm volatile(
  3871. LABELALIGN
  3872. "1: \n"
  3873. "movdqu (%0),%%xmm0 \n"
  3874. "movdqu 0x10(%0),%%xmm1 \n"
  3875. "lea 0x20(%0),%0 \n"
  3876. "psrlw $0x8,%%xmm0 \n"
  3877. "psrlw $0x8,%%xmm1 \n"
  3878. "packuswb %%xmm1,%%xmm0 \n"
  3879. "movdqu %%xmm0,(%1) \n"
  3880. "lea 0x10(%1),%1 \n"
  3881. "sub $0x10,%2 \n"
  3882. "jg 1b \n"
  3883. : "+r"(src_uyvy), // %0
  3884. "+r"(dst_y), // %1
  3885. "+r"(width) // %2
  3886. :
  3887. : "memory", "cc", "xmm0", "xmm1");
  3888. }
  3889. void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
  3890. int stride_uyvy,
  3891. uint8_t* dst_u,
  3892. uint8_t* dst_v,
  3893. int width) {
  3894. asm volatile(
  3895. "pcmpeqb %%xmm5,%%xmm5 \n"
  3896. "psrlw $0x8,%%xmm5 \n"
  3897. "sub %1,%2 \n"
  3898. LABELALIGN
  3899. "1: \n"
  3900. "movdqu (%0),%%xmm0 \n"
  3901. "movdqu 0x10(%0),%%xmm1 \n"
  3902. "movdqu 0x00(%0,%4,1),%%xmm2 \n"
  3903. "movdqu 0x10(%0,%4,1),%%xmm3 \n"
  3904. "lea 0x20(%0),%0 \n"
  3905. "pavgb %%xmm2,%%xmm0 \n"
  3906. "pavgb %%xmm3,%%xmm1 \n"
  3907. "pand %%xmm5,%%xmm0 \n"
  3908. "pand %%xmm5,%%xmm1 \n"
  3909. "packuswb %%xmm1,%%xmm0 \n"
  3910. "movdqa %%xmm0,%%xmm1 \n"
  3911. "pand %%xmm5,%%xmm0 \n"
  3912. "packuswb %%xmm0,%%xmm0 \n"
  3913. "psrlw $0x8,%%xmm1 \n"
  3914. "packuswb %%xmm1,%%xmm1 \n"
  3915. "movq %%xmm0,(%1) \n"
  3916. "movq %%xmm1,0x00(%1,%2,1) \n"
  3917. "lea 0x8(%1),%1 \n"
  3918. "sub $0x10,%3 \n"
  3919. "jg 1b \n"
  3920. : "+r"(src_uyvy), // %0
  3921. "+r"(dst_u), // %1
  3922. "+r"(dst_v), // %2
  3923. "+r"(width) // %3
  3924. : "r"((intptr_t)(stride_uyvy)) // %4
  3925. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  3926. }
  3927. void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
  3928. uint8_t* dst_u,
  3929. uint8_t* dst_v,
  3930. int width) {
  3931. asm volatile(
  3932. "pcmpeqb %%xmm5,%%xmm5 \n"
  3933. "psrlw $0x8,%%xmm5 \n"
  3934. "sub %1,%2 \n"
  3935. LABELALIGN
  3936. "1: \n"
  3937. "movdqu (%0),%%xmm0 \n"
  3938. "movdqu 0x10(%0),%%xmm1 \n"
  3939. "lea 0x20(%0),%0 \n"
  3940. "pand %%xmm5,%%xmm0 \n"
  3941. "pand %%xmm5,%%xmm1 \n"
  3942. "packuswb %%xmm1,%%xmm0 \n"
  3943. "movdqa %%xmm0,%%xmm1 \n"
  3944. "pand %%xmm5,%%xmm0 \n"
  3945. "packuswb %%xmm0,%%xmm0 \n"
  3946. "psrlw $0x8,%%xmm1 \n"
  3947. "packuswb %%xmm1,%%xmm1 \n"
  3948. "movq %%xmm0,(%1) \n"
  3949. "movq %%xmm1,0x00(%1,%2,1) \n"
  3950. "lea 0x8(%1),%1 \n"
  3951. "sub $0x10,%3 \n"
  3952. "jg 1b \n"
  3953. : "+r"(src_uyvy), // %0
  3954. "+r"(dst_u), // %1
  3955. "+r"(dst_v), // %2
  3956. "+r"(width) // %3
  3957. :
  3958. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  3959. }
  3960. #endif // HAS_YUY2TOYROW_SSE2
  3961. #ifdef HAS_YUY2TOYROW_AVX2
  3962. void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  3963. asm volatile(
  3964. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3965. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3966. LABELALIGN
  3967. "1: \n"
  3968. "vmovdqu (%0),%%ymm0 \n"
  3969. "vmovdqu 0x20(%0),%%ymm1 \n"
  3970. "lea 0x40(%0),%0 \n"
  3971. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3972. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3973. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3974. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3975. "vmovdqu %%ymm0,(%1) \n"
  3976. "lea 0x20(%1),%1 \n"
  3977. "sub $0x20,%2 \n"
  3978. "jg 1b \n"
  3979. "vzeroupper \n"
  3980. : "+r"(src_yuy2), // %0
  3981. "+r"(dst_y), // %1
  3982. "+r"(width) // %2
  3983. :
  3984. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  3985. }
  3986. void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
  3987. int stride_yuy2,
  3988. uint8_t* dst_u,
  3989. uint8_t* dst_v,
  3990. int width) {
  3991. asm volatile(
  3992. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3993. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3994. "sub %1,%2 \n"
  3995. LABELALIGN
  3996. "1: \n"
  3997. "vmovdqu (%0),%%ymm0 \n"
  3998. "vmovdqu 0x20(%0),%%ymm1 \n"
  3999. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  4000. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  4001. "lea 0x40(%0),%0 \n"
  4002. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4003. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4004. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4005. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4006. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4007. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4008. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4009. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4010. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4011. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4012. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4013. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4014. "lea 0x10(%1),%1 \n"
  4015. "sub $0x20,%3 \n"
  4016. "jg 1b \n"
  4017. "vzeroupper \n"
  4018. : "+r"(src_yuy2), // %0
  4019. "+r"(dst_u), // %1
  4020. "+r"(dst_v), // %2
  4021. "+r"(width) // %3
  4022. : "r"((intptr_t)(stride_yuy2)) // %4
  4023. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4024. }
  4025. void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
  4026. uint8_t* dst_u,
  4027. uint8_t* dst_v,
  4028. int width) {
  4029. asm volatile(
  4030. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4031. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4032. "sub %1,%2 \n"
  4033. LABELALIGN
  4034. "1: \n"
  4035. "vmovdqu (%0),%%ymm0 \n"
  4036. "vmovdqu 0x20(%0),%%ymm1 \n"
  4037. "lea 0x40(%0),%0 \n"
  4038. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4039. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4040. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4041. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4042. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4043. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4044. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4045. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4046. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4047. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4048. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4049. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4050. "lea 0x10(%1),%1 \n"
  4051. "sub $0x20,%3 \n"
  4052. "jg 1b \n"
  4053. "vzeroupper \n"
  4054. : "+r"(src_yuy2), // %0
  4055. "+r"(dst_u), // %1
  4056. "+r"(dst_v), // %2
  4057. "+r"(width) // %3
  4058. :
  4059. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4060. }
  4061. void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  4062. asm volatile(
  4063. LABELALIGN
  4064. "1: \n"
  4065. "vmovdqu (%0),%%ymm0 \n"
  4066. "vmovdqu 0x20(%0),%%ymm1 \n"
  4067. "lea 0x40(%0),%0 \n"
  4068. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4069. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4070. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4071. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4072. "vmovdqu %%ymm0,(%1) \n"
  4073. "lea 0x20(%1),%1 \n"
  4074. "sub $0x20,%2 \n"
  4075. "jg 1b \n"
  4076. "vzeroupper \n"
  4077. : "+r"(src_uyvy), // %0
  4078. "+r"(dst_y), // %1
  4079. "+r"(width) // %2
  4080. :
  4081. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4082. }
  4083. void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
  4084. int stride_uyvy,
  4085. uint8_t* dst_u,
  4086. uint8_t* dst_v,
  4087. int width) {
  4088. asm volatile(
  4089. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4090. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4091. "sub %1,%2 \n"
  4092. LABELALIGN
  4093. "1: \n"
  4094. "vmovdqu (%0),%%ymm0 \n"
  4095. "vmovdqu 0x20(%0),%%ymm1 \n"
  4096. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  4097. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  4098. "lea 0x40(%0),%0 \n"
  4099. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4100. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4101. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4102. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4103. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4104. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4105. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4106. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4107. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4108. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4109. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4110. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4111. "lea 0x10(%1),%1 \n"
  4112. "sub $0x20,%3 \n"
  4113. "jg 1b \n"
  4114. "vzeroupper \n"
  4115. : "+r"(src_uyvy), // %0
  4116. "+r"(dst_u), // %1
  4117. "+r"(dst_v), // %2
  4118. "+r"(width) // %3
  4119. : "r"((intptr_t)(stride_uyvy)) // %4
  4120. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4121. }
  4122. void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
  4123. uint8_t* dst_u,
  4124. uint8_t* dst_v,
  4125. int width) {
  4126. asm volatile(
  4127. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4128. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4129. "sub %1,%2 \n"
  4130. LABELALIGN
  4131. "1: \n"
  4132. "vmovdqu (%0),%%ymm0 \n"
  4133. "vmovdqu 0x20(%0),%%ymm1 \n"
  4134. "lea 0x40(%0),%0 \n"
  4135. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4136. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4137. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4138. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4139. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4140. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4141. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4142. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4143. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4144. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4145. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4146. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4147. "lea 0x10(%1),%1 \n"
  4148. "sub $0x20,%3 \n"
  4149. "jg 1b \n"
  4150. "vzeroupper \n"
  4151. : "+r"(src_uyvy), // %0
  4152. "+r"(dst_u), // %1
  4153. "+r"(dst_v), // %2
  4154. "+r"(width) // %3
  4155. :
  4156. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4157. }
  4158. #endif // HAS_YUY2TOYROW_AVX2
  4159. #ifdef HAS_ARGBBLENDROW_SSSE3
  4160. // Shuffle table for isolating alpha.
  4161. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  4162. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 4 pixels at a time, with a 1 pixel tail loop for the remainder.
  4164. void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
  4165. const uint8_t* src_argb1,
  4166. uint8_t* dst_argb,
  4167. int width) {
  4168. asm volatile(
  4169. "pcmpeqb %%xmm7,%%xmm7 \n"
  4170. "psrlw $0xf,%%xmm7 \n"
  4171. "pcmpeqb %%xmm6,%%xmm6 \n"
  4172. "psrlw $0x8,%%xmm6 \n"
  4173. "pcmpeqb %%xmm5,%%xmm5 \n"
  4174. "psllw $0x8,%%xmm5 \n"
  4175. "pcmpeqb %%xmm4,%%xmm4 \n"
  4176. "pslld $0x18,%%xmm4 \n"
  4177. "sub $0x4,%3 \n"
  4178. "jl 49f \n"
  4179. // 4 pixel loop.
  4180. LABELALIGN
  4181. "40: \n"
  4182. "movdqu (%0),%%xmm3 \n"
  4183. "lea 0x10(%0),%0 \n"
  4184. "movdqa %%xmm3,%%xmm0 \n"
  4185. "pxor %%xmm4,%%xmm3 \n"
  4186. "movdqu (%1),%%xmm2 \n"
  4187. "pshufb %4,%%xmm3 \n"
  4188. "pand %%xmm6,%%xmm2 \n"
  4189. "paddw %%xmm7,%%xmm3 \n"
  4190. "pmullw %%xmm3,%%xmm2 \n"
  4191. "movdqu (%1),%%xmm1 \n"
  4192. "lea 0x10(%1),%1 \n"
  4193. "psrlw $0x8,%%xmm1 \n"
  4194. "por %%xmm4,%%xmm0 \n"
  4195. "pmullw %%xmm3,%%xmm1 \n"
  4196. "psrlw $0x8,%%xmm2 \n"
  4197. "paddusb %%xmm2,%%xmm0 \n"
  4198. "pand %%xmm5,%%xmm1 \n"
  4199. "paddusb %%xmm1,%%xmm0 \n"
  4200. "movdqu %%xmm0,(%2) \n"
  4201. "lea 0x10(%2),%2 \n"
  4202. "sub $0x4,%3 \n"
  4203. "jge 40b \n"
  4204. "49: \n"
  4205. "add $0x3,%3 \n"
  4206. "jl 99f \n"
  4207. // 1 pixel loop.
  4208. "91: \n"
  4209. "movd (%0),%%xmm3 \n"
  4210. "lea 0x4(%0),%0 \n"
  4211. "movdqa %%xmm3,%%xmm0 \n"
  4212. "pxor %%xmm4,%%xmm3 \n"
  4213. "movd (%1),%%xmm2 \n"
  4214. "pshufb %4,%%xmm3 \n"
  4215. "pand %%xmm6,%%xmm2 \n"
  4216. "paddw %%xmm7,%%xmm3 \n"
  4217. "pmullw %%xmm3,%%xmm2 \n"
  4218. "movd (%1),%%xmm1 \n"
  4219. "lea 0x4(%1),%1 \n"
  4220. "psrlw $0x8,%%xmm1 \n"
  4221. "por %%xmm4,%%xmm0 \n"
  4222. "pmullw %%xmm3,%%xmm1 \n"
  4223. "psrlw $0x8,%%xmm2 \n"
  4224. "paddusb %%xmm2,%%xmm0 \n"
  4225. "pand %%xmm5,%%xmm1 \n"
  4226. "paddusb %%xmm1,%%xmm0 \n"
  4227. "movd %%xmm0,(%2) \n"
  4228. "lea 0x4(%2),%2 \n"
  4229. "sub $0x1,%3 \n"
  4230. "jge 91b \n"
  4231. "99: \n"
  4232. : "+r"(src_argb0), // %0
  4233. "+r"(src_argb1), // %1
  4234. "+r"(dst_argb), // %2
  4235. "+r"(width) // %3
  4236. : "m"(kShuffleAlpha) // %4
  4237. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4238. "xmm7");
  4239. }
  4240. #endif // HAS_ARGBBLENDROW_SSSE3
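// Illustrative scalar sketch (not part of libyuv; names below are
// hypothetical) of the blend ARGBBlendRow_SSSE3 above performs, as read from
// the assembly: each color channel is fg + bg * (256 - fg_alpha) / 256,
// saturated, with the output alpha forced to 255 -- essentially the 'over'
// composite for a premultiplied foreground.
static inline uint8_t BlendChannelSketch(uint32_t fg, uint32_t bg, uint32_t a) {
  uint32_t v = fg + (((256 - a) * bg) >> 8);
  return (uint8_t)(v > 255 ? 255 : v);
}
static void ARGBBlendRowSketch_C(const uint8_t* src_argb0,
                                 const uint8_t* src_argb1,
                                 uint8_t* dst_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb0[3];  // foreground alpha
    dst_argb[0] = BlendChannelSketch(src_argb0[0], src_argb1[0], a);  // B
    dst_argb[1] = BlendChannelSketch(src_argb0[1], src_argb1[1], a);  // G
    dst_argb[2] = BlendChannelSketch(src_argb0[2], src_argb1[2], a);  // R
    dst_argb[3] = 255;                                                // A
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}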
  4241. #ifdef HAS_BLENDPLANEROW_SSSE3
  4242. // Blend 8 pixels at a time.
  4243. // unsigned version of math
  4244. // =((A2*C2)+(B2*(255-C2))+255)/256
  4245. // signed version of math
  4246. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  4247. void BlendPlaneRow_SSSE3(const uint8_t* src0,
  4248. const uint8_t* src1,
  4249. const uint8_t* alpha,
  4250. uint8_t* dst,
  4251. int width) {
  4252. asm volatile(
  4253. "pcmpeqb %%xmm5,%%xmm5 \n"
  4254. "psllw $0x8,%%xmm5 \n"
  4255. "mov $0x80808080,%%eax \n"
  4256. "movd %%eax,%%xmm6 \n"
  4257. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  4258. "mov $0x807f807f,%%eax \n"
  4259. "movd %%eax,%%xmm7 \n"
  4260. "pshufd $0x0,%%xmm7,%%xmm7 \n"
  4261. "sub %2,%0 \n"
  4262. "sub %2,%1 \n"
  4263. "sub %2,%3 \n"
  4264. // 8 pixel loop.
  4265. LABELALIGN
  4266. "1: \n"
  4267. "movq (%2),%%xmm0 \n"
  4268. "punpcklbw %%xmm0,%%xmm0 \n"
  4269. "pxor %%xmm5,%%xmm0 \n"
  4270. "movq (%0,%2,1),%%xmm1 \n"
  4271. "movq (%1,%2,1),%%xmm2 \n"
  4272. "punpcklbw %%xmm2,%%xmm1 \n"
  4273. "psubb %%xmm6,%%xmm1 \n"
  4274. "pmaddubsw %%xmm1,%%xmm0 \n"
  4275. "paddw %%xmm7,%%xmm0 \n"
  4276. "psrlw $0x8,%%xmm0 \n"
  4277. "packuswb %%xmm0,%%xmm0 \n"
  4278. "movq %%xmm0,(%3,%2,1) \n"
  4279. "lea 0x8(%2),%2 \n"
  4280. "sub $0x8,%4 \n"
  4281. "jg 1b \n"
  4282. : "+r"(src0), // %0
  4283. "+r"(src1), // %1
  4284. "+r"(alpha), // %2
  4285. "+r"(dst), // %3
  4286. "+rm"(width) // %4
  4287. ::"memory",
  4288. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
  4289. }
  4290. #endif // HAS_BLENDPLANEROW_SSSE3
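// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the plane blend math documented above: the signed pmaddubsw form with the
// 0x807f rounding constant reduces to the unsigned
// dst = (src0 * alpha + src1 * (255 - alpha) + 255) >> 8.
static void BlendPlaneRowSketch_C(const uint8_t* src0,
                                  const uint8_t* src1,
                                  const uint8_t* alpha,
                                  uint8_t* dst,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = alpha[i];
    dst[i] = (uint8_t)((src0[i] * a + src1[i] * (255 - a) + 255) >> 8);
  }
}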
  4291. #ifdef HAS_BLENDPLANEROW_AVX2
  4292. // Blend 32 pixels at a time.
  4293. // unsigned version of math
  4294. // =((A2*C2)+(B2*(255-C2))+255)/256
  4295. // signed version of math
  4296. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  4297. void BlendPlaneRow_AVX2(const uint8_t* src0,
  4298. const uint8_t* src1,
  4299. const uint8_t* alpha,
  4300. uint8_t* dst,
  4301. int width) {
  4302. asm volatile(
  4303. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4304. "vpsllw $0x8,%%ymm5,%%ymm5 \n"
  4305. "mov $0x80808080,%%eax \n"
  4306. "vmovd %%eax,%%xmm6 \n"
  4307. "vbroadcastss %%xmm6,%%ymm6 \n"
  4308. "mov $0x807f807f,%%eax \n"
  4309. "vmovd %%eax,%%xmm7 \n"
  4310. "vbroadcastss %%xmm7,%%ymm7 \n"
  4311. "sub %2,%0 \n"
  4312. "sub %2,%1 \n"
  4313. "sub %2,%3 \n"
  4314. // 32 pixel loop.
  4315. LABELALIGN
  4316. "1: \n"
  4317. "vmovdqu (%2),%%ymm0 \n"
  4318. "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
  4319. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  4320. "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
  4321. "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
  4322. "vmovdqu (%0,%2,1),%%ymm1 \n"
  4323. "vmovdqu (%1,%2,1),%%ymm2 \n"
  4324. "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
  4325. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  4326. "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
  4327. "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
  4328. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  4329. "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
  4330. "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
  4331. "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
  4332. "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
  4333. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4334. "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
  4335. "vmovdqu %%ymm0,(%3,%2,1) \n"
  4336. "lea 0x20(%2),%2 \n"
  4337. "sub $0x20,%4 \n"
  4338. "jg 1b \n"
  4339. "vzeroupper \n"
  4340. : "+r"(src0), // %0
  4341. "+r"(src1), // %1
  4342. "+r"(alpha), // %2
  4343. "+r"(dst), // %3
  4344. "+rm"(width) // %4
  4345. ::"memory",
  4346. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4347. "xmm7");
  4348. }
  4349. #endif // HAS_BLENDPLANEROW_AVX2
  4350. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  4351. // Shuffle table duplicating alpha
  4352. static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
  4353. 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
  4354. static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  4355. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
  4356. // Attenuate 4 pixels at a time.
  4357. void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
  4358. uint8_t* dst_argb,
  4359. int width) {
  4360. asm volatile(
  4361. "pcmpeqb %%xmm3,%%xmm3 \n"
  4362. "pslld $0x18,%%xmm3 \n"
  4363. "movdqa %3,%%xmm4 \n"
  4364. "movdqa %4,%%xmm5 \n"
  4365. // 4 pixel loop.
  4366. LABELALIGN
  4367. "1: \n"
  4368. "movdqu (%0),%%xmm0 \n"
  4369. "pshufb %%xmm4,%%xmm0 \n"
  4370. "movdqu (%0),%%xmm1 \n"
  4371. "punpcklbw %%xmm1,%%xmm1 \n"
  4372. "pmulhuw %%xmm1,%%xmm0 \n"
  4373. "movdqu (%0),%%xmm1 \n"
  4374. "pshufb %%xmm5,%%xmm1 \n"
  4375. "movdqu (%0),%%xmm2 \n"
  4376. "punpckhbw %%xmm2,%%xmm2 \n"
  4377. "pmulhuw %%xmm2,%%xmm1 \n"
  4378. "movdqu (%0),%%xmm2 \n"
  4379. "lea 0x10(%0),%0 \n"
  4380. "pand %%xmm3,%%xmm2 \n"
  4381. "psrlw $0x8,%%xmm0 \n"
  4382. "psrlw $0x8,%%xmm1 \n"
  4383. "packuswb %%xmm1,%%xmm0 \n"
  4384. "por %%xmm2,%%xmm0 \n"
  4385. "movdqu %%xmm0,(%1) \n"
  4386. "lea 0x10(%1),%1 \n"
  4387. "sub $0x4,%2 \n"
  4388. "jg 1b \n"
  4389. : "+r"(src_argb), // %0
  4390. "+r"(dst_argb), // %1
  4391. "+r"(width) // %2
  4392. : "m"(kShuffleAlpha0), // %3
  4393. "m"(kShuffleAlpha1) // %4
  4394. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4395. }
  4396. #endif // HAS_ARGBATTENUATEROW_SSSE3
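// Illustrative scalar sketch (not part of libyuv; name is hypothetical):
// attenuation multiplies each color channel by alpha, roughly c * a / 255,
// and leaves alpha unchanged. The SIMD code above uses a 16-bit fixed-point
// approximation (pmulhuw of c*257 by a*257, then >> 8), so results can differ
// from this sketch by a small rounding amount.
static void ARGBAttenuateRowSketch_C(const uint8_t* src_argb,
                                     uint8_t* dst_argb,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)((src_argb[0] * a) / 255);  // B
    dst_argb[1] = (uint8_t)((src_argb[1] * a) / 255);  // G
    dst_argb[2] = (uint8_t)((src_argb[2] * a) / 255);  // R
    dst_argb[3] = (uint8_t)a;                          // A unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}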
  4397. #ifdef HAS_ARGBATTENUATEROW_AVX2
  4398. // Shuffle table duplicating alpha.
  4399. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  4400. 128u, 128u, 14u, 15u, 14u, 15u,
  4401. 14u, 15u, 128u, 128u};
  4402. // Attenuate 8 pixels at a time.
  4403. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
  4404. uint8_t* dst_argb,
  4405. int width) {
  4406. asm volatile(
  4407. "vbroadcastf128 %3,%%ymm4 \n"
  4408. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4409. "vpslld $0x18,%%ymm5,%%ymm5 \n"
  4410. "sub %0,%1 \n"
  4411. // 8 pixel loop.
  4412. LABELALIGN
  4413. "1: \n"
  4414. "vmovdqu (%0),%%ymm6 \n"
  4415. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  4416. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  4417. "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
  4418. "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
  4419. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4420. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4421. "vpand %%ymm5,%%ymm6,%%ymm6 \n"
  4422. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4423. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4424. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4425. "vpor %%ymm6,%%ymm0,%%ymm0 \n"
  4426. "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
  4427. "lea 0x20(%0),%0 \n"
  4428. "sub $0x8,%2 \n"
  4429. "jg 1b \n"
  4430. "vzeroupper \n"
  4431. : "+r"(src_argb), // %0
  4432. "+r"(dst_argb), // %1
  4433. "+r"(width) // %2
  4434. : "m"(kShuffleAlpha_AVX2) // %3
  4435. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4436. }
  4437. #endif // HAS_ARGBATTENUATEROW_AVX2
  4438. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  4439. // Unattenuate 4 pixels at a time.
  4440. void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
  4441. uint8_t* dst_argb,
  4442. int width) {
  4443. uintptr_t alpha;
  4444. asm volatile(
  4445. // 4 pixel loop.
  4446. LABELALIGN
  4447. "1: \n"
  4448. "movdqu (%0),%%xmm0 \n"
  4449. "movzb 0x03(%0),%3 \n"
  4450. "punpcklbw %%xmm0,%%xmm0 \n"
  4451. "movd 0x00(%4,%3,4),%%xmm2 \n"
  4452. "movzb 0x07(%0),%3 \n"
  4453. "movd 0x00(%4,%3,4),%%xmm3 \n"
  4454. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4455. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4456. "movlhps %%xmm3,%%xmm2 \n"
  4457. "pmulhuw %%xmm2,%%xmm0 \n"
  4458. "movdqu (%0),%%xmm1 \n"
  4459. "movzb 0x0b(%0),%3 \n"
  4460. "punpckhbw %%xmm1,%%xmm1 \n"
  4461. "movd 0x00(%4,%3,4),%%xmm2 \n"
  4462. "movzb 0x0f(%0),%3 \n"
  4463. "movd 0x00(%4,%3,4),%%xmm3 \n"
  4464. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4465. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4466. "movlhps %%xmm3,%%xmm2 \n"
  4467. "pmulhuw %%xmm2,%%xmm1 \n"
  4468. "lea 0x10(%0),%0 \n"
  4469. "packuswb %%xmm1,%%xmm0 \n"
  4470. "movdqu %%xmm0,(%1) \n"
  4471. "lea 0x10(%1),%1 \n"
  4472. "sub $0x4,%2 \n"
  4473. "jg 1b \n"
  4474. : "+r"(src_argb), // %0
  4475. "+r"(dst_argb), // %1
  4476. "+r"(width), // %2
  4477. "=&r"(alpha) // %3
  4478. : "r"(fixed_invtbl8) // %4
  4479. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4480. }
  4481. #endif // HAS_ARGBUNATTENUATEROW_SSE2
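// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// unattenuation: each color channel is scaled back up by 255 / alpha and
// clamped. The SIMD code above implements the division with the precomputed
// reciprocal table fixed_invtbl8, so rounding differs slightly; the
// pass-through behaviour for a == 0 here is an assumption of this sketch.
static void ARGBUnattenuateRowSketch_C(const uint8_t* src_argb,
                                       uint8_t* dst_argb,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? (src_argb[c] * 255u) / a : src_argb[c];
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (uint8_t)a;
    src_argb += 4;
    dst_argb += 4;
  }
}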
  4482. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4483. // Shuffle table duplicating alpha.
  4484. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4485. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  4486. // Unattenuate 8 pixels at a time.
  4487. void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4488. uint8_t* dst_argb,
  4489. int width) {
  4490. uintptr_t alpha;
  4491. asm volatile(
  4492. "sub %0,%1 \n"
  4493. "vbroadcastf128 %5,%%ymm5 \n"
  4494. // 8 pixel loop.
  4495. LABELALIGN
  4496. "1: \n"
  4497. // replace VPGATHER
  4498. "movzb 0x03(%0),%3 \n"
  4499. "vmovd 0x00(%4,%3,4),%%xmm0 \n"
  4500. "movzb 0x07(%0),%3 \n"
  4501. "vmovd 0x00(%4,%3,4),%%xmm1 \n"
  4502. "movzb 0x0b(%0),%3 \n"
  4503. "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
  4504. "vmovd 0x00(%4,%3,4),%%xmm2 \n"
  4505. "movzb 0x0f(%0),%3 \n"
  4506. "vmovd 0x00(%4,%3,4),%%xmm3 \n"
  4507. "movzb 0x13(%0),%3 \n"
  4508. "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
  4509. "vmovd 0x00(%4,%3,4),%%xmm0 \n"
  4510. "movzb 0x17(%0),%3 \n"
  4511. "vmovd 0x00(%4,%3,4),%%xmm1 \n"
  4512. "movzb 0x1b(%0),%3 \n"
  4513. "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
  4514. "vmovd 0x00(%4,%3,4),%%xmm2 \n"
  4515. "movzb 0x1f(%0),%3 \n"
  4516. "vmovd 0x00(%4,%3,4),%%xmm3 \n"
  4517. "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
  4518. "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
  4519. "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
  4520. "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
  4521. // end of VPGATHER
  4522. "vmovdqu (%0),%%ymm6 \n"
  4523. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  4524. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  4525. "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
  4526. "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
  4527. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  4528. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  4529. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4530. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4531. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4532. "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
  4533. "lea 0x20(%0),%0 \n"
  4534. "sub $0x8,%2 \n"
  4535. "jg 1b \n"
  4536. "vzeroupper \n"
  4537. : "+r"(src_argb), // %0
  4538. "+r"(dst_argb), // %1
  4539. "+r"(width), // %2
  4540. "=&r"(alpha) // %3
  4541. : "r"(fixed_invtbl8), // %4
  4542. "m"(kUnattenShuffleAlpha_AVX2) // %5
  4543. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4544. "xmm7");
  4545. }
  4546. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  4547. #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
  4549. void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  4550. asm volatile(
  4551. "movdqa %3,%%xmm4 \n"
  4552. "movdqa %4,%%xmm5 \n"
  4553. // 8 pixel loop.
  4554. LABELALIGN
  4555. "1: \n"
  4556. "movdqu (%0),%%xmm0 \n"
  4557. "movdqu 0x10(%0),%%xmm1 \n"
  4558. "pmaddubsw %%xmm4,%%xmm0 \n"
  4559. "pmaddubsw %%xmm4,%%xmm1 \n"
  4560. "phaddw %%xmm1,%%xmm0 \n"
  4561. "paddw %%xmm5,%%xmm0 \n"
  4562. "psrlw $0x7,%%xmm0 \n"
  4563. "packuswb %%xmm0,%%xmm0 \n"
  4564. "movdqu (%0),%%xmm2 \n"
  4565. "movdqu 0x10(%0),%%xmm3 \n"
  4566. "lea 0x20(%0),%0 \n"
  4567. "psrld $0x18,%%xmm2 \n"
  4568. "psrld $0x18,%%xmm3 \n"
  4569. "packuswb %%xmm3,%%xmm2 \n"
  4570. "packuswb %%xmm2,%%xmm2 \n"
  4571. "movdqa %%xmm0,%%xmm3 \n"
  4572. "punpcklbw %%xmm0,%%xmm0 \n"
  4573. "punpcklbw %%xmm2,%%xmm3 \n"
  4574. "movdqa %%xmm0,%%xmm1 \n"
  4575. "punpcklwd %%xmm3,%%xmm0 \n"
  4576. "punpckhwd %%xmm3,%%xmm1 \n"
  4577. "movdqu %%xmm0,(%1) \n"
  4578. "movdqu %%xmm1,0x10(%1) \n"
  4579. "lea 0x20(%1),%1 \n"
  4580. "sub $0x8,%2 \n"
  4581. "jg 1b \n"
  4582. : "+r"(src_argb), // %0
  4583. "+r"(dst_argb), // %1
  4584. "+r"(width) // %2
  4585. : "m"(kARGBToYJ), // %3
  4586. "m"(kAddYJ64) // %4
  4587. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4588. }
  4589. #endif // HAS_ARGBGRAYROW_SSSE3
  4590. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4591. // b = (r * 35 + g * 68 + b * 17) >> 7
  4592. // g = (r * 45 + g * 88 + b * 22) >> 7
  4593. // r = (r * 50 + g * 98 + b * 24) >> 7
  4594. // Constant for ARGB color to sepia tone
  4595. static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  4596. 17, 68, 35, 0, 17, 68, 35, 0};
  4597. static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  4598. 22, 88, 45, 0, 22, 88, 45, 0};
  4599. static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  4600. 24, 98, 50, 0, 24, 98, 50, 0};
  4601. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  4602. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  4603. asm volatile(
  4604. "movdqa %2,%%xmm2 \n"
  4605. "movdqa %3,%%xmm3 \n"
  4606. "movdqa %4,%%xmm4 \n"
  4607. // 8 pixel loop.
  4608. LABELALIGN
  4609. "1: \n"
  4610. "movdqu (%0),%%xmm0 \n"
  4611. "movdqu 0x10(%0),%%xmm6 \n"
  4612. "pmaddubsw %%xmm2,%%xmm0 \n"
  4613. "pmaddubsw %%xmm2,%%xmm6 \n"
  4614. "phaddw %%xmm6,%%xmm0 \n"
  4615. "psrlw $0x7,%%xmm0 \n"
  4616. "packuswb %%xmm0,%%xmm0 \n"
  4617. "movdqu (%0),%%xmm5 \n"
  4618. "movdqu 0x10(%0),%%xmm1 \n"
  4619. "pmaddubsw %%xmm3,%%xmm5 \n"
  4620. "pmaddubsw %%xmm3,%%xmm1 \n"
  4621. "phaddw %%xmm1,%%xmm5 \n"
  4622. "psrlw $0x7,%%xmm5 \n"
  4623. "packuswb %%xmm5,%%xmm5 \n"
  4624. "punpcklbw %%xmm5,%%xmm0 \n"
  4625. "movdqu (%0),%%xmm5 \n"
  4626. "movdqu 0x10(%0),%%xmm1 \n"
  4627. "pmaddubsw %%xmm4,%%xmm5 \n"
  4628. "pmaddubsw %%xmm4,%%xmm1 \n"
  4629. "phaddw %%xmm1,%%xmm5 \n"
  4630. "psrlw $0x7,%%xmm5 \n"
  4631. "packuswb %%xmm5,%%xmm5 \n"
  4632. "movdqu (%0),%%xmm6 \n"
  4633. "movdqu 0x10(%0),%%xmm1 \n"
  4634. "psrld $0x18,%%xmm6 \n"
  4635. "psrld $0x18,%%xmm1 \n"
  4636. "packuswb %%xmm1,%%xmm6 \n"
  4637. "packuswb %%xmm6,%%xmm6 \n"
  4638. "punpcklbw %%xmm6,%%xmm5 \n"
  4639. "movdqa %%xmm0,%%xmm1 \n"
  4640. "punpcklwd %%xmm5,%%xmm0 \n"
  4641. "punpckhwd %%xmm5,%%xmm1 \n"
  4642. "movdqu %%xmm0,(%0) \n"
  4643. "movdqu %%xmm1,0x10(%0) \n"
  4644. "lea 0x20(%0),%0 \n"
  4645. "sub $0x8,%1 \n"
  4646. "jg 1b \n"
  4647. : "+r"(dst_argb), // %0
  4648. "+r"(width) // %1
  4649. : "m"(kARGBToSepiaB), // %2
  4650. "m"(kARGBToSepiaG), // %3
  4651. "m"(kARGBToSepiaR) // %4
  4652. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4653. }
  4654. #endif // HAS_ARGBSEPIAROW_SSSE3
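// Illustrative scalar sketch (not part of libyuv; names are hypothetical) of
// the sepia math in the comments above. The >> 7 results for G and R can
// exceed 255 and are clamped, matching the packuswb saturation in the SIMD
// path; alpha is preserved.
static inline uint8_t SepiaClamp255Sketch(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}
static void ARGBSepiaRowSketch_C(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    dst_argb[0] = SepiaClamp255Sketch((r * 35 + g * 68 + b * 17) >> 7);
    dst_argb[1] = SepiaClamp255Sketch((r * 45 + g * 88 + b * 22) >> 7);
    dst_argb[2] = SepiaClamp255Sketch((r * 50 + g * 98 + b * 24) >> 7);
    // dst_argb[3] (alpha) is left as-is.
    dst_argb += 4;
  }
}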
  4655. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
  4657. // Same as Sepia except matrix is provided.
  4658. void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
  4659. uint8_t* dst_argb,
  4660. const int8_t* matrix_argb,
  4661. int width) {
  4662. asm volatile(
  4663. "movdqu (%3),%%xmm5 \n"
  4664. "pshufd $0x00,%%xmm5,%%xmm2 \n"
  4665. "pshufd $0x55,%%xmm5,%%xmm3 \n"
  4666. "pshufd $0xaa,%%xmm5,%%xmm4 \n"
  4667. "pshufd $0xff,%%xmm5,%%xmm5 \n"
  4668. // 8 pixel loop.
  4669. LABELALIGN
  4670. "1: \n"
  4671. "movdqu (%0),%%xmm0 \n"
  4672. "movdqu 0x10(%0),%%xmm7 \n"
  4673. "pmaddubsw %%xmm2,%%xmm0 \n"
  4674. "pmaddubsw %%xmm2,%%xmm7 \n"
  4675. "movdqu (%0),%%xmm6 \n"
  4676. "movdqu 0x10(%0),%%xmm1 \n"
  4677. "pmaddubsw %%xmm3,%%xmm6 \n"
  4678. "pmaddubsw %%xmm3,%%xmm1 \n"
  4679. "phaddsw %%xmm7,%%xmm0 \n"
  4680. "phaddsw %%xmm1,%%xmm6 \n"
  4681. "psraw $0x6,%%xmm0 \n"
  4682. "psraw $0x6,%%xmm6 \n"
  4683. "packuswb %%xmm0,%%xmm0 \n"
  4684. "packuswb %%xmm6,%%xmm6 \n"
  4685. "punpcklbw %%xmm6,%%xmm0 \n"
  4686. "movdqu (%0),%%xmm1 \n"
  4687. "movdqu 0x10(%0),%%xmm7 \n"
  4688. "pmaddubsw %%xmm4,%%xmm1 \n"
  4689. "pmaddubsw %%xmm4,%%xmm7 \n"
  4690. "phaddsw %%xmm7,%%xmm1 \n"
  4691. "movdqu (%0),%%xmm6 \n"
  4692. "movdqu 0x10(%0),%%xmm7 \n"
  4693. "pmaddubsw %%xmm5,%%xmm6 \n"
  4694. "pmaddubsw %%xmm5,%%xmm7 \n"
  4695. "phaddsw %%xmm7,%%xmm6 \n"
  4696. "psraw $0x6,%%xmm1 \n"
  4697. "psraw $0x6,%%xmm6 \n"
  4698. "packuswb %%xmm1,%%xmm1 \n"
  4699. "packuswb %%xmm6,%%xmm6 \n"
  4700. "punpcklbw %%xmm6,%%xmm1 \n"
  4701. "movdqa %%xmm0,%%xmm6 \n"
  4702. "punpcklwd %%xmm1,%%xmm0 \n"
  4703. "punpckhwd %%xmm1,%%xmm6 \n"
  4704. "movdqu %%xmm0,(%1) \n"
  4705. "movdqu %%xmm6,0x10(%1) \n"
  4706. "lea 0x20(%0),%0 \n"
  4707. "lea 0x20(%1),%1 \n"
  4708. "sub $0x8,%2 \n"
  4709. "jg 1b \n"
  4710. : "+r"(src_argb), // %0
  4711. "+r"(dst_argb), // %1
  4712. "+r"(width) // %2
  4713. : "r"(matrix_argb) // %3
  4714. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4715. "xmm7");
  4716. }
  4717. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
  4718. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4719. // Quantize 4 ARGB pixels (16 bytes).
  4720. void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
  4721. int scale,
  4722. int interval_size,
  4723. int interval_offset,
  4724. int width) {
  4725. asm volatile(
  4726. "movd %2,%%xmm2 \n"
  4727. "movd %3,%%xmm3 \n"
  4728. "movd %4,%%xmm4 \n"
  4729. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4730. "pshufd $0x44,%%xmm2,%%xmm2 \n"
  4731. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4732. "pshufd $0x44,%%xmm3,%%xmm3 \n"
  4733. "pshuflw $0x40,%%xmm4,%%xmm4 \n"
  4734. "pshufd $0x44,%%xmm4,%%xmm4 \n"
  4735. "pxor %%xmm5,%%xmm5 \n"
  4736. "pcmpeqb %%xmm6,%%xmm6 \n"
  4737. "pslld $0x18,%%xmm6 \n"
  4738. // 4 pixel loop.
  4739. LABELALIGN
  4740. "1: \n"
  4741. "movdqu (%0),%%xmm0 \n"
  4742. "punpcklbw %%xmm5,%%xmm0 \n"
  4743. "pmulhuw %%xmm2,%%xmm0 \n"
  4744. "movdqu (%0),%%xmm1 \n"
  4745. "punpckhbw %%xmm5,%%xmm1 \n"
  4746. "pmulhuw %%xmm2,%%xmm1 \n"
  4747. "pmullw %%xmm3,%%xmm0 \n"
  4748. "movdqu (%0),%%xmm7 \n"
  4749. "pmullw %%xmm3,%%xmm1 \n"
  4750. "pand %%xmm6,%%xmm7 \n"
  4751. "paddw %%xmm4,%%xmm0 \n"
  4752. "paddw %%xmm4,%%xmm1 \n"
  4753. "packuswb %%xmm1,%%xmm0 \n"
  4754. "por %%xmm7,%%xmm0 \n"
  4755. "movdqu %%xmm0,(%0) \n"
  4756. "lea 0x10(%0),%0 \n"
  4757. "sub $0x4,%1 \n"
  4758. "jg 1b \n"
  4759. : "+r"(dst_argb), // %0
  4760. "+r"(width) // %1
  4761. : "r"(scale), // %2
  4762. "r"(interval_size), // %3
  4763. "r"(interval_offset) // %4
  4764. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4765. "xmm7");
  4766. }
  4767. #endif // HAS_ARGBQUANTIZEROW_SSE2
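// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the quantization the assembly above performs on each color channel, as read
// from the code: v -> ((v * scale) >> 16) * interval_size + interval_offset,
// with alpha passed through unchanged. This sketch assumes parameters chosen
// so the result fits in a byte (the SIMD path saturates via packuswb).
static void ARGBQuantizeRowSketch_C(uint8_t* dst_argb,
                                    int scale,
                                    int interval_size,
                                    int interval_offset,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = dst_argb[c];
      dst_argb[c] =
          (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
    }
    // dst_argb[3] (alpha) is left as-is.
    dst_argb += 4;
  }
}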
  4768. #ifdef HAS_ARGBSHADEROW_SSE2
  4769. // Shade 4 pixels at a time by specified value.
  4770. void ARGBShadeRow_SSE2(const uint8_t* src_argb,
  4771. uint8_t* dst_argb,
  4772. int width,
  4773. uint32_t value) {
  4774. asm volatile(
  4775. "movd %3,%%xmm2 \n"
  4776. "punpcklbw %%xmm2,%%xmm2 \n"
  4777. "punpcklqdq %%xmm2,%%xmm2 \n"
  4778. // 4 pixel loop.
  4779. LABELALIGN
  4780. "1: \n"
  4781. "movdqu (%0),%%xmm0 \n"
  4782. "lea 0x10(%0),%0 \n"
  4783. "movdqa %%xmm0,%%xmm1 \n"
  4784. "punpcklbw %%xmm0,%%xmm0 \n"
  4785. "punpckhbw %%xmm1,%%xmm1 \n"
  4786. "pmulhuw %%xmm2,%%xmm0 \n"
  4787. "pmulhuw %%xmm2,%%xmm1 \n"
  4788. "psrlw $0x8,%%xmm0 \n"
  4789. "psrlw $0x8,%%xmm1 \n"
  4790. "packuswb %%xmm1,%%xmm0 \n"
  4791. "movdqu %%xmm0,(%1) \n"
  4792. "lea 0x10(%1),%1 \n"
  4793. "sub $0x4,%2 \n"
  4794. "jg 1b \n"
  4795. : "+r"(src_argb), // %0
  4796. "+r"(dst_argb), // %1
  4797. "+r"(width) // %2
  4798. : "r"(value) // %3
  4799. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  4800. }
  4801. #endif // HAS_ARGBSHADEROW_SSE2
  4802. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  4803. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  4804. void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
  4805. const uint8_t* src_argb1,
  4806. uint8_t* dst_argb,
  4807. int width) {
  4808. asm volatile(
  4809. "pxor %%xmm5,%%xmm5 \n"
  4810. // 4 pixel loop.
  4811. LABELALIGN
  4812. "1: \n"
  4813. "movdqu (%0),%%xmm0 \n"
  4814. "lea 0x10(%0),%0 \n"
  4815. "movdqu (%1),%%xmm2 \n"
  4816. "lea 0x10(%1),%1 \n"
  4817. "movdqu %%xmm0,%%xmm1 \n"
  4818. "movdqu %%xmm2,%%xmm3 \n"
  4819. "punpcklbw %%xmm0,%%xmm0 \n"
  4820. "punpckhbw %%xmm1,%%xmm1 \n"
  4821. "punpcklbw %%xmm5,%%xmm2 \n"
  4822. "punpckhbw %%xmm5,%%xmm3 \n"
  4823. "pmulhuw %%xmm2,%%xmm0 \n"
  4824. "pmulhuw %%xmm3,%%xmm1 \n"
  4825. "packuswb %%xmm1,%%xmm0 \n"
  4826. "movdqu %%xmm0,(%2) \n"
  4827. "lea 0x10(%2),%2 \n"
  4828. "sub $0x4,%3 \n"
  4829. "jg 1b \n"
  4830. : "+r"(src_argb0), // %0
  4831. "+r"(src_argb1), // %1
  4832. "+r"(dst_argb), // %2
  4833. "+r"(width) // %3
  4834. :
  4835. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  4836. }
  4837. #endif // HAS_ARGBMULTIPLYROW_SSE2
  4838. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  4839. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  4840. void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
  4841. const uint8_t* src_argb1,
  4842. uint8_t* dst_argb,
  4843. int width) {
  4844. asm volatile(
  4845. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
// 8 pixel loop.
  4847. LABELALIGN
  4848. "1: \n"
  4849. "vmovdqu (%0),%%ymm1 \n"
  4850. "lea 0x20(%0),%0 \n"
  4851. "vmovdqu (%1),%%ymm3 \n"
  4852. "lea 0x20(%1),%1 \n"
  4853. "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
  4854. "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
  4855. "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
  4856. "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
  4857. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4858. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4859. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4860. "vmovdqu %%ymm0,(%2) \n"
  4861. "lea 0x20(%2),%2 \n"
  4862. "sub $0x8,%3 \n"
  4863. "jg 1b \n"
  4864. "vzeroupper \n"
  4865. : "+r"(src_argb0), // %0
  4866. "+r"(src_argb1), // %1
  4867. "+r"(dst_argb), // %2
  4868. "+r"(width) // %3
  4869. :
  4870. : "memory", "cc"
  4871. #if defined(__AVX2__)
  4872. ,
  4873. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4874. #endif
  4875. );
  4876. }
  4877. #endif // HAS_ARGBMULTIPLYROW_AVX2
  4878. #ifdef HAS_ARGBADDROW_SSE2
  4879. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  4880. void ARGBAddRow_SSE2(const uint8_t* src_argb0,
  4881. const uint8_t* src_argb1,
  4882. uint8_t* dst_argb,
  4883. int width) {
  4884. asm volatile(
  4885. // 4 pixel loop.
  4886. LABELALIGN
  4887. "1: \n"
  4888. "movdqu (%0),%%xmm0 \n"
  4889. "lea 0x10(%0),%0 \n"
  4890. "movdqu (%1),%%xmm1 \n"
  4891. "lea 0x10(%1),%1 \n"
  4892. "paddusb %%xmm1,%%xmm0 \n"
  4893. "movdqu %%xmm0,(%2) \n"
  4894. "lea 0x10(%2),%2 \n"
  4895. "sub $0x4,%3 \n"
  4896. "jg 1b \n"
  4897. : "+r"(src_argb0), // %0
  4898. "+r"(src_argb1), // %1
  4899. "+r"(dst_argb), // %2
  4900. "+r"(width) // %3
  4901. :
  4902. : "memory", "cc", "xmm0", "xmm1");
  4903. }
  4904. #endif // HAS_ARGBADDROW_SSE2
  4905. #ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
  4907. void ARGBAddRow_AVX2(const uint8_t* src_argb0,
  4908. const uint8_t* src_argb1,
  4909. uint8_t* dst_argb,
  4910. int width) {
  4911. asm volatile(
// 8 pixel loop.
  4913. LABELALIGN
  4914. "1: \n"
  4915. "vmovdqu (%0),%%ymm0 \n"
  4916. "lea 0x20(%0),%0 \n"
  4917. "vpaddusb (%1),%%ymm0,%%ymm0 \n"
  4918. "lea 0x20(%1),%1 \n"
  4919. "vmovdqu %%ymm0,(%2) \n"
  4920. "lea 0x20(%2),%2 \n"
  4921. "sub $0x8,%3 \n"
  4922. "jg 1b \n"
  4923. "vzeroupper \n"
  4924. : "+r"(src_argb0), // %0
  4925. "+r"(src_argb1), // %1
  4926. "+r"(dst_argb), // %2
  4927. "+r"(width) // %3
  4928. :
  4929. : "memory", "cc", "xmm0");
  4930. }
  4931. #endif // HAS_ARGBADDROW_AVX2
  4932. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  4933. // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
  4934. void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
  4935. const uint8_t* src_argb1,
  4936. uint8_t* dst_argb,
  4937. int width) {
  4938. asm volatile(
  4939. // 4 pixel loop.
  4940. LABELALIGN
  4941. "1: \n"
  4942. "movdqu (%0),%%xmm0 \n"
  4943. "lea 0x10(%0),%0 \n"
  4944. "movdqu (%1),%%xmm1 \n"
  4945. "lea 0x10(%1),%1 \n"
  4946. "psubusb %%xmm1,%%xmm0 \n"
  4947. "movdqu %%xmm0,(%2) \n"
  4948. "lea 0x10(%2),%2 \n"
  4949. "sub $0x4,%3 \n"
  4950. "jg 1b \n"
  4951. : "+r"(src_argb0), // %0
  4952. "+r"(src_argb1), // %1
  4953. "+r"(dst_argb), // %2
  4954. "+r"(width) // %3
  4955. :
  4956. : "memory", "cc", "xmm0", "xmm1");
  4957. }
  4958. #endif // HAS_ARGBSUBTRACTROW_SSE2
  4959. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  4960. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  4961. void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
  4962. const uint8_t* src_argb1,
  4963. uint8_t* dst_argb,
  4964. int width) {
  4965. asm volatile(
// 8 pixel loop.
  4967. LABELALIGN
  4968. "1: \n"
  4969. "vmovdqu (%0),%%ymm0 \n"
  4970. "lea 0x20(%0),%0 \n"
  4971. "vpsubusb (%1),%%ymm0,%%ymm0 \n"
  4972. "lea 0x20(%1),%1 \n"
  4973. "vmovdqu %%ymm0,(%2) \n"
  4974. "lea 0x20(%2),%2 \n"
  4975. "sub $0x8,%3 \n"
  4976. "jg 1b \n"
  4977. "vzeroupper \n"
  4978. : "+r"(src_argb0), // %0
  4979. "+r"(src_argb1), // %1
  4980. "+r"(dst_argb), // %2
  4981. "+r"(width) // %3
  4982. :
  4983. : "memory", "cc", "xmm0");
  4984. }
  4985. #endif // HAS_ARGBSUBTRACTROW_AVX2
  4986. #ifdef HAS_SOBELXROW_SSE2
  4987. // SobelX as a matrix is
  4988. // -1 0 1
  4989. // -2 0 2
  4990. // -1 0 1
  4991. void SobelXRow_SSE2(const uint8_t* src_y0,
  4992. const uint8_t* src_y1,
  4993. const uint8_t* src_y2,
  4994. uint8_t* dst_sobelx,
  4995. int width) {
  4996. asm volatile(
  4997. "sub %0,%1 \n"
  4998. "sub %0,%2 \n"
  4999. "sub %0,%3 \n"
  5000. "pxor %%xmm5,%%xmm5 \n"
  5001. // 8 pixel loop.
  5002. LABELALIGN
  5003. "1: \n"
  5004. "movq (%0),%%xmm0 \n"
  5005. "movq 0x2(%0),%%xmm1 \n"
  5006. "punpcklbw %%xmm5,%%xmm0 \n"
  5007. "punpcklbw %%xmm5,%%xmm1 \n"
  5008. "psubw %%xmm1,%%xmm0 \n"
  5009. "movq 0x00(%0,%1,1),%%xmm1 \n"
  5010. "movq 0x02(%0,%1,1),%%xmm2 \n"
  5011. "punpcklbw %%xmm5,%%xmm1 \n"
  5012. "punpcklbw %%xmm5,%%xmm2 \n"
  5013. "psubw %%xmm2,%%xmm1 \n"
  5014. "movq 0x00(%0,%2,1),%%xmm2 \n"
  5015. "movq 0x02(%0,%2,1),%%xmm3 \n"
  5016. "punpcklbw %%xmm5,%%xmm2 \n"
  5017. "punpcklbw %%xmm5,%%xmm3 \n"
  5018. "psubw %%xmm3,%%xmm2 \n"
  5019. "paddw %%xmm2,%%xmm0 \n"
  5020. "paddw %%xmm1,%%xmm0 \n"
  5021. "paddw %%xmm1,%%xmm0 \n"
  5022. "pxor %%xmm1,%%xmm1 \n"
  5023. "psubw %%xmm0,%%xmm1 \n"
  5024. "pmaxsw %%xmm1,%%xmm0 \n"
  5025. "packuswb %%xmm0,%%xmm0 \n"
  5026. "movq %%xmm0,0x00(%0,%3,1) \n"
  5027. "lea 0x8(%0),%0 \n"
  5028. "sub $0x8,%4 \n"
  5029. "jg 1b \n"
  5030. : "+r"(src_y0), // %0
  5031. "+r"(src_y1), // %1
  5032. "+r"(src_y2), // %2
  5033. "+r"(dst_sobelx), // %3
  5034. "+r"(width) // %4
  5035. :
  5036. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5037. }
  5038. #endif // HAS_SOBELXROW_SSE2
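// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the SobelX kernel documented above: horizontal differences over three rows,
// the middle row weighted by 2, then absolute value clamped to 255.
static void SobelXRowSketch_C(const uint8_t* src_y0,
                              const uint8_t* src_y1,
                              const uint8_t* src_y2,
                              uint8_t* dst_sobelx,
                              int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + 2 * b + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}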
  5039. #ifdef HAS_SOBELYROW_SSE2
  5040. // SobelY as a matrix is
  5041. // -1 -2 -1
  5042. // 0 0 0
  5043. // 1 2 1
  5044. void SobelYRow_SSE2(const uint8_t* src_y0,
  5045. const uint8_t* src_y1,
  5046. uint8_t* dst_sobely,
  5047. int width) {
  5048. asm volatile(
  5049. "sub %0,%1 \n"
  5050. "sub %0,%2 \n"
  5051. "pxor %%xmm5,%%xmm5 \n"
  5052. // 8 pixel loop.
  5053. LABELALIGN
  5054. "1: \n"
  5055. "movq (%0),%%xmm0 \n"
  5056. "movq 0x00(%0,%1,1),%%xmm1 \n"
  5057. "punpcklbw %%xmm5,%%xmm0 \n"
  5058. "punpcklbw %%xmm5,%%xmm1 \n"
  5059. "psubw %%xmm1,%%xmm0 \n"
  5060. "movq 0x1(%0),%%xmm1 \n"
  5061. "movq 0x01(%0,%1,1),%%xmm2 \n"
  5062. "punpcklbw %%xmm5,%%xmm1 \n"
  5063. "punpcklbw %%xmm5,%%xmm2 \n"
  5064. "psubw %%xmm2,%%xmm1 \n"
  5065. "movq 0x2(%0),%%xmm2 \n"
  5066. "movq 0x02(%0,%1,1),%%xmm3 \n"
  5067. "punpcklbw %%xmm5,%%xmm2 \n"
  5068. "punpcklbw %%xmm5,%%xmm3 \n"
  5069. "psubw %%xmm3,%%xmm2 \n"
  5070. "paddw %%xmm2,%%xmm0 \n"
  5071. "paddw %%xmm1,%%xmm0 \n"
  5072. "paddw %%xmm1,%%xmm0 \n"
  5073. "pxor %%xmm1,%%xmm1 \n"
  5074. "psubw %%xmm0,%%xmm1 \n"
  5075. "pmaxsw %%xmm1,%%xmm0 \n"
  5076. "packuswb %%xmm0,%%xmm0 \n"
  5077. "movq %%xmm0,0x00(%0,%2,1) \n"
  5078. "lea 0x8(%0),%0 \n"
  5079. "sub $0x8,%3 \n"
  5080. "jg 1b \n"
  5081. : "+r"(src_y0), // %0
  5082. "+r"(src_y1), // %1
  5083. "+r"(dst_sobely), // %2
  5084. "+r"(width) // %3
  5085. :
  5086. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5087. }
  5088. #endif // HAS_SOBELYROW_SSE2
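// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the SobelY kernel documented above: vertical differences at three
// horizontal offsets, the middle column weighted by 2, then absolute value
// clamped to 255.
static void SobelYRowSketch_C(const uint8_t* src_y0,
                              const uint8_t* src_y1,
                              uint8_t* dst_sobely,
                              int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + 2 * b + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}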
  5089. #ifdef HAS_SOBELROW_SSE2
  5090. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  5091. // A = 255
  5092. // R = Sobel
  5093. // G = Sobel
  5094. // B = Sobel
  5095. void SobelRow_SSE2(const uint8_t* src_sobelx,
  5096. const uint8_t* src_sobely,
  5097. uint8_t* dst_argb,
  5098. int width) {
  5099. asm volatile(
  5100. "sub %0,%1 \n"
  5101. "pcmpeqb %%xmm5,%%xmm5 \n"
  5102. "pslld $0x18,%%xmm5 \n"
// 16 pixel loop.
  5104. LABELALIGN
  5105. "1: \n"
  5106. "movdqu (%0),%%xmm0 \n"
  5107. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5108. "lea 0x10(%0),%0 \n"
  5109. "paddusb %%xmm1,%%xmm0 \n"
  5110. "movdqa %%xmm0,%%xmm2 \n"
  5111. "punpcklbw %%xmm0,%%xmm2 \n"
  5112. "punpckhbw %%xmm0,%%xmm0 \n"
  5113. "movdqa %%xmm2,%%xmm1 \n"
  5114. "punpcklwd %%xmm2,%%xmm1 \n"
  5115. "punpckhwd %%xmm2,%%xmm2 \n"
  5116. "por %%xmm5,%%xmm1 \n"
  5117. "por %%xmm5,%%xmm2 \n"
  5118. "movdqa %%xmm0,%%xmm3 \n"
  5119. "punpcklwd %%xmm0,%%xmm3 \n"
  5120. "punpckhwd %%xmm0,%%xmm0 \n"
  5121. "por %%xmm5,%%xmm3 \n"
  5122. "por %%xmm5,%%xmm0 \n"
  5123. "movdqu %%xmm1,(%2) \n"
  5124. "movdqu %%xmm2,0x10(%2) \n"
  5125. "movdqu %%xmm3,0x20(%2) \n"
  5126. "movdqu %%xmm0,0x30(%2) \n"
  5127. "lea 0x40(%2),%2 \n"
  5128. "sub $0x10,%3 \n"
  5129. "jg 1b \n"
  5130. : "+r"(src_sobelx), // %0
  5131. "+r"(src_sobely), // %1
  5132. "+r"(dst_argb), // %2
  5133. "+r"(width) // %3
  5134. :
  5135. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5136. }
  5137. #endif // HAS_SOBELROW_SSE2
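// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the packing above: the two Sobel planes are added with saturation and the
// result replicated into B, G and R with A = 255.
static void SobelRowSketch_C(const uint8_t* src_sobelx,
                             const uint8_t* src_sobely,
                             uint8_t* dst_argb,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[0] = v;    // B
    dst_argb[1] = v;    // G
    dst_argb[2] = v;    // R
    dst_argb[3] = 255;  // A
    dst_argb += 4;
  }
}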
  5138. #ifdef HAS_SOBELTOPLANEROW_SSE2
  5139. // Adds Sobel X and Sobel Y and stores Sobel into a plane.
  5140. void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
  5141. const uint8_t* src_sobely,
  5142. uint8_t* dst_y,
  5143. int width) {
  5144. asm volatile(
  5145. "sub %0,%1 \n"
  5146. "pcmpeqb %%xmm5,%%xmm5 \n"
  5147. "pslld $0x18,%%xmm5 \n"
// 16 pixel loop.
  5149. LABELALIGN
  5150. "1: \n"
  5151. "movdqu (%0),%%xmm0 \n"
  5152. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5153. "lea 0x10(%0),%0 \n"
  5154. "paddusb %%xmm1,%%xmm0 \n"
  5155. "movdqu %%xmm0,(%2) \n"
  5156. "lea 0x10(%2),%2 \n"
  5157. "sub $0x10,%3 \n"
  5158. "jg 1b \n"
  5159. : "+r"(src_sobelx), // %0
  5160. "+r"(src_sobely), // %1
  5161. "+r"(dst_y), // %2
  5162. "+r"(width) // %3
  5163. :
  5164. : "memory", "cc", "xmm0", "xmm1");
  5165. }
  5166. #endif // HAS_SOBELTOPLANEROW_SSE2
  5167. #ifdef HAS_SOBELXYROW_SSE2
  5168. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  5169. // A = 255
  5170. // R = Sobel X
  5171. // G = Sobel
  5172. // B = Sobel Y
  5173. void SobelXYRow_SSE2(const uint8_t* src_sobelx,
  5174. const uint8_t* src_sobely,
  5175. uint8_t* dst_argb,
  5176. int width) {
  5177. asm volatile(
  5178. "sub %0,%1 \n"
  5179. "pcmpeqb %%xmm5,%%xmm5 \n"
// 16 pixel loop.
  5181. LABELALIGN
  5182. "1: \n"
  5183. "movdqu (%0),%%xmm0 \n"
  5184. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5185. "lea 0x10(%0),%0 \n"
  5186. "movdqa %%xmm0,%%xmm2 \n"
  5187. "paddusb %%xmm1,%%xmm2 \n"
  5188. "movdqa %%xmm0,%%xmm3 \n"
  5189. "punpcklbw %%xmm5,%%xmm3 \n"
  5190. "punpckhbw %%xmm5,%%xmm0 \n"
  5191. "movdqa %%xmm1,%%xmm4 \n"
  5192. "punpcklbw %%xmm2,%%xmm4 \n"
  5193. "punpckhbw %%xmm2,%%xmm1 \n"
  5194. "movdqa %%xmm4,%%xmm6 \n"
  5195. "punpcklwd %%xmm3,%%xmm6 \n"
  5196. "punpckhwd %%xmm3,%%xmm4 \n"
  5197. "movdqa %%xmm1,%%xmm7 \n"
  5198. "punpcklwd %%xmm0,%%xmm7 \n"
  5199. "punpckhwd %%xmm0,%%xmm1 \n"
  5200. "movdqu %%xmm6,(%2) \n"
  5201. "movdqu %%xmm4,0x10(%2) \n"
  5202. "movdqu %%xmm7,0x20(%2) \n"
  5203. "movdqu %%xmm1,0x30(%2) \n"
  5204. "lea 0x40(%2),%2 \n"
  5205. "sub $0x10,%3 \n"
  5206. "jg 1b \n"
  5207. : "+r"(src_sobelx), // %0
  5208. "+r"(src_sobely), // %1
  5209. "+r"(dst_argb), // %2
  5210. "+r"(width) // %3
  5211. :
  5212. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  5213. "xmm7");
  5214. }
  5215. #endif // HAS_SOBELXYROW_SSE2
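// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the channel mapping described above: B = Sobel Y, G = saturated sum,
// R = Sobel X, A = 255.
static void SobelXYRowSketch_C(const uint8_t* src_sobelx,
                               const uint8_t* src_sobely,
                               uint8_t* dst_argb,
                               int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];                 // B
    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G
    dst_argb[2] = src_sobelx[i];                 // R
    dst_argb[3] = 255;                           // A
    dst_argb += 4;
  }
}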
  5216. #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  5217. // Creates a table of cumulative sums where each value is a sum of all values
  5218. // above and to the left of the value, inclusive of the value.
  5219. void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
  5220. int32_t* cumsum,
  5221. const int32_t* previous_cumsum,
  5222. int width) {
  5223. asm volatile(
  5224. "pxor %%xmm0,%%xmm0 \n"
  5225. "pxor %%xmm1,%%xmm1 \n"
  5226. "sub $0x4,%3 \n"
  5227. "jl 49f \n"
  5228. "test $0xf,%1 \n"
  5229. "jne 49f \n"
  5230. // 4 pixel loop.
  5231. LABELALIGN
  5232. "40: \n"
  5233. "movdqu (%0),%%xmm2 \n"
  5234. "lea 0x10(%0),%0 \n"
  5235. "movdqa %%xmm2,%%xmm4 \n"
  5236. "punpcklbw %%xmm1,%%xmm2 \n"
  5237. "movdqa %%xmm2,%%xmm3 \n"
  5238. "punpcklwd %%xmm1,%%xmm2 \n"
  5239. "punpckhwd %%xmm1,%%xmm3 \n"
  5240. "punpckhbw %%xmm1,%%xmm4 \n"
  5241. "movdqa %%xmm4,%%xmm5 \n"
  5242. "punpcklwd %%xmm1,%%xmm4 \n"
  5243. "punpckhwd %%xmm1,%%xmm5 \n"
  5244. "paddd %%xmm2,%%xmm0 \n"
  5245. "movdqu (%2),%%xmm2 \n"
  5246. "paddd %%xmm0,%%xmm2 \n"
  5247. "paddd %%xmm3,%%xmm0 \n"
  5248. "movdqu 0x10(%2),%%xmm3 \n"
  5249. "paddd %%xmm0,%%xmm3 \n"
  5250. "paddd %%xmm4,%%xmm0 \n"
  5251. "movdqu 0x20(%2),%%xmm4 \n"
  5252. "paddd %%xmm0,%%xmm4 \n"
  5253. "paddd %%xmm5,%%xmm0 \n"
  5254. "movdqu 0x30(%2),%%xmm5 \n"
  5255. "lea 0x40(%2),%2 \n"
  5256. "paddd %%xmm0,%%xmm5 \n"
  5257. "movdqu %%xmm2,(%1) \n"
  5258. "movdqu %%xmm3,0x10(%1) \n"
  5259. "movdqu %%xmm4,0x20(%1) \n"
  5260. "movdqu %%xmm5,0x30(%1) \n"
  5261. "lea 0x40(%1),%1 \n"
  5262. "sub $0x4,%3 \n"
  5263. "jge 40b \n"
  5264. "49: \n"
  5265. "add $0x3,%3 \n"
  5266. "jl 19f \n"
  5267. // 1 pixel loop.
  5268. LABELALIGN
  5269. "10: \n"
  5270. "movd (%0),%%xmm2 \n"
  5271. "lea 0x4(%0),%0 \n"
  5272. "punpcklbw %%xmm1,%%xmm2 \n"
  5273. "punpcklwd %%xmm1,%%xmm2 \n"
  5274. "paddd %%xmm2,%%xmm0 \n"
  5275. "movdqu (%2),%%xmm2 \n"
  5276. "lea 0x10(%2),%2 \n"
  5277. "paddd %%xmm0,%%xmm2 \n"
  5278. "movdqu %%xmm2,(%1) \n"
  5279. "lea 0x10(%1),%1 \n"
  5280. "sub $0x1,%3 \n"
  5281. "jge 10b \n"
  5282. "19: \n"
  5283. : "+r"(row), // %0
  5284. "+r"(cumsum), // %1
  5285. "+r"(previous_cumsum), // %2
  5286. "+r"(width) // %3
  5287. :
  5288. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  5289. }
  5290. #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
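// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the cumulative sum above: a running per-channel sum across this row is
// added to the previous row's cumulative sums, 4 int32 values per pixel.
static void ComputeCumulativeSumRowSketch_C(const uint8_t* row,
                                            int32_t* cumsum,
                                            const int32_t* previous_cumsum,
                                            int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[i * 4 + c];
      cumsum[i * 4 + c] = sum[c] + previous_cumsum[i * 4 + c];
    }
  }
}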
  5291. #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5292. void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
  5293. const int32_t* botleft,
  5294. int width,
  5295. int area,
  5296. uint8_t* dst,
  5297. int count) {
  5298. asm volatile(
  5299. "movd %5,%%xmm5 \n"
  5300. "cvtdq2ps %%xmm5,%%xmm5 \n"
  5301. "rcpss %%xmm5,%%xmm4 \n"
  5302. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  5303. "sub $0x4,%3 \n"
  5304. "jl 49f \n"
  5305. "cmpl $0x80,%5 \n"
  5306. "ja 40f \n"
  5307. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5308. "pcmpeqb %%xmm6,%%xmm6 \n"
  5309. "psrld $0x10,%%xmm6 \n"
  5310. "cvtdq2ps %%xmm6,%%xmm6 \n"
  5311. "addps %%xmm6,%%xmm5 \n"
  5312. "mulps %%xmm4,%%xmm5 \n"
  5313. "cvtps2dq %%xmm5,%%xmm5 \n"
  5314. "packssdw %%xmm5,%%xmm5 \n"
  5315. // 4 pixel small loop.
  5316. LABELALIGN
  5317. "4: \n"
  5318. "movdqu (%0),%%xmm0 \n"
  5319. "movdqu 0x10(%0),%%xmm1 \n"
  5320. "movdqu 0x20(%0),%%xmm2 \n"
  5321. "movdqu 0x30(%0),%%xmm3 \n"
  5322. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5323. "psubd 0x10(%0,%4,4),%%xmm1 \n"
  5324. "psubd 0x20(%0,%4,4),%%xmm2 \n"
  5325. "psubd 0x30(%0,%4,4),%%xmm3 \n"
  5326. "lea 0x40(%0),%0 \n"
  5327. "psubd (%1),%%xmm0 \n"
  5328. "psubd 0x10(%1),%%xmm1 \n"
  5329. "psubd 0x20(%1),%%xmm2 \n"
  5330. "psubd 0x30(%1),%%xmm3 \n"
  5331. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5332. "paddd 0x10(%1,%4,4),%%xmm1 \n"
  5333. "paddd 0x20(%1,%4,4),%%xmm2 \n"
  5334. "paddd 0x30(%1,%4,4),%%xmm3 \n"
  5335. "lea 0x40(%1),%1 \n"
  5336. "packssdw %%xmm1,%%xmm0 \n"
  5337. "packssdw %%xmm3,%%xmm2 \n"
  5338. "pmulhuw %%xmm5,%%xmm0 \n"
  5339. "pmulhuw %%xmm5,%%xmm2 \n"
  5340. "packuswb %%xmm2,%%xmm0 \n"
  5341. "movdqu %%xmm0,(%2) \n"
  5342. "lea 0x10(%2),%2 \n"
  5343. "sub $0x4,%3 \n"
  5344. "jge 4b \n"
  5345. "jmp 49f \n"
  5346. // 4 pixel loop
  5347. LABELALIGN
  5348. "40: \n"
  5349. "movdqu (%0),%%xmm0 \n"
  5350. "movdqu 0x10(%0),%%xmm1 \n"
  5351. "movdqu 0x20(%0),%%xmm2 \n"
  5352. "movdqu 0x30(%0),%%xmm3 \n"
  5353. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5354. "psubd 0x10(%0,%4,4),%%xmm1 \n"
  5355. "psubd 0x20(%0,%4,4),%%xmm2 \n"
  5356. "psubd 0x30(%0,%4,4),%%xmm3 \n"
  5357. "lea 0x40(%0),%0 \n"
  5358. "psubd (%1),%%xmm0 \n"
  5359. "psubd 0x10(%1),%%xmm1 \n"
  5360. "psubd 0x20(%1),%%xmm2 \n"
  5361. "psubd 0x30(%1),%%xmm3 \n"
  5362. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5363. "paddd 0x10(%1,%4,4),%%xmm1 \n"
  5364. "paddd 0x20(%1,%4,4),%%xmm2 \n"
  5365. "paddd 0x30(%1,%4,4),%%xmm3 \n"
  5366. "lea 0x40(%1),%1 \n"
  5367. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5368. "cvtdq2ps %%xmm1,%%xmm1 \n"
  5369. "mulps %%xmm4,%%xmm0 \n"
  5370. "mulps %%xmm4,%%xmm1 \n"
  5371. "cvtdq2ps %%xmm2,%%xmm2 \n"
  5372. "cvtdq2ps %%xmm3,%%xmm3 \n"
  5373. "mulps %%xmm4,%%xmm2 \n"
  5374. "mulps %%xmm4,%%xmm3 \n"
  5375. "cvtps2dq %%xmm0,%%xmm0 \n"
  5376. "cvtps2dq %%xmm1,%%xmm1 \n"
  5377. "cvtps2dq %%xmm2,%%xmm2 \n"
  5378. "cvtps2dq %%xmm3,%%xmm3 \n"
  5379. "packssdw %%xmm1,%%xmm0 \n"
  5380. "packssdw %%xmm3,%%xmm2 \n"
  5381. "packuswb %%xmm2,%%xmm0 \n"
  5382. "movdqu %%xmm0,(%2) \n"
  5383. "lea 0x10(%2),%2 \n"
  5384. "sub $0x4,%3 \n"
  5385. "jge 40b \n"
  5386. "49: \n"
  5387. "add $0x3,%3 \n"
  5388. "jl 19f \n"
  5389. // 1 pixel loop
  5390. LABELALIGN
  5391. "10: \n"
  5392. "movdqu (%0),%%xmm0 \n"
  5393. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5394. "lea 0x10(%0),%0 \n"
  5395. "psubd (%1),%%xmm0 \n"
  5396. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5397. "lea 0x10(%1),%1 \n"
  5398. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5399. "mulps %%xmm4,%%xmm0 \n"
  5400. "cvtps2dq %%xmm0,%%xmm0 \n"
  5401. "packssdw %%xmm0,%%xmm0 \n"
  5402. "packuswb %%xmm0,%%xmm0 \n"
  5403. "movd %%xmm0,(%2) \n"
  5404. "lea 0x4(%2),%2 \n"
  5405. "sub $0x1,%3 \n"
  5406. "jge 10b \n"
  5407. "19: \n"
  5408. : "+r"(topleft), // %0
  5409. "+r"(botleft), // %1
  5410. "+r"(dst), // %2
  5411. "+rm"(count) // %3
  5412. : "r"((intptr_t)(width)), // %4
  5413. "rm"(area) // %5
  5414. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  5415. }
  5416. #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5417. #ifdef HAS_ARGBAFFINEROW_SSE2
  5418. // Copy ARGB pixels from source image with slope to a row of destination.
  5419. LIBYUV_API
  5420. void ARGBAffineRow_SSE2(const uint8_t* src_argb,
  5421. int src_argb_stride,
  5422. uint8_t* dst_argb,
  5423. const float* src_dudv,
  5424. int width) {
  5425. intptr_t src_argb_stride_temp = src_argb_stride;
  5426. intptr_t temp;
  5427. asm volatile(
  5428. "movq (%3),%%xmm2 \n"
  5429. "movq 0x08(%3),%%xmm7 \n"
  5430. "shl $0x10,%1 \n"
  5431. "add $0x4,%1 \n"
  5432. "movd %1,%%xmm5 \n"
  5433. "sub $0x4,%4 \n"
  5434. "jl 49f \n"
  5435. "pshufd $0x44,%%xmm7,%%xmm7 \n"
  5436. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5437. "movdqa %%xmm2,%%xmm0 \n"
  5438. "addps %%xmm7,%%xmm0 \n"
  5439. "movlhps %%xmm0,%%xmm2 \n"
  5440. "movdqa %%xmm7,%%xmm4 \n"
  5441. "addps %%xmm4,%%xmm4 \n"
  5442. "movdqa %%xmm2,%%xmm3 \n"
  5443. "addps %%xmm4,%%xmm3 \n"
  5444. "addps %%xmm4,%%xmm4 \n"
  5445. // 4 pixel loop
  5446. LABELALIGN
  5447. "40: \n"
  5448. "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
  5449. "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
  5450. "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
  5451. "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
  5452. "movd %%xmm0,%k1 \n"
  5453. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5454. "movd %%xmm0,%k5 \n"
  5455. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5456. "movd 0x00(%0,%1,1),%%xmm1 \n"
  5457. "movd 0x00(%0,%5,1),%%xmm6 \n"
  5458. "punpckldq %%xmm6,%%xmm1 \n"
  5459. "addps %%xmm4,%%xmm2 \n"
  5460. "movq %%xmm1,(%2) \n"
  5461. "movd %%xmm0,%k1 \n"
  5462. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5463. "movd %%xmm0,%k5 \n"
  5464. "movd 0x00(%0,%1,1),%%xmm0 \n"
  5465. "movd 0x00(%0,%5,1),%%xmm6 \n"
  5466. "punpckldq %%xmm6,%%xmm0 \n"
  5467. "addps %%xmm4,%%xmm3 \n"
  5468. "movq %%xmm0,0x08(%2) \n"
  5469. "lea 0x10(%2),%2 \n"
  5470. "sub $0x4,%4 \n"
  5471. "jge 40b \n"
  5472. "49: \n"
  5473. "add $0x3,%4 \n"
  5474. "jl 19f \n"
  5475. // 1 pixel loop
  5476. LABELALIGN
  5477. "10: \n"
  5478. "cvttps2dq %%xmm2,%%xmm0 \n"
  5479. "packssdw %%xmm0,%%xmm0 \n"
  5480. "pmaddwd %%xmm5,%%xmm0 \n"
  5481. "addps %%xmm7,%%xmm2 \n"
  5482. "movd %%xmm0,%k1 \n"
  5483. "movd 0x00(%0,%1,1),%%xmm0 \n"
  5484. "movd %%xmm0,(%2) \n"
  5485. "lea 0x04(%2),%2 \n"
  5486. "sub $0x1,%4 \n"
  5487. "jge 10b \n"
  5488. "19: \n"
  5489. : "+r"(src_argb), // %0
  5490. "+r"(src_argb_stride_temp), // %1
  5491. "+r"(dst_argb), // %2
  5492. "+r"(src_dudv), // %3
  5493. "+rm"(width), // %4
  5494. "=&r"(temp) // %5
  5495. :
  5496. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  5497. "xmm7");
  5498. }
  5499. #endif // HAS_ARGBAFFINEROW_SSE2
  5500. #ifdef HAS_INTERPOLATEROW_SSSE3
  5501. // Bilinear filter 16x2 -> 16x1
  5502. void InterpolateRow_SSSE3(uint8_t* dst_ptr,
  5503. const uint8_t* src_ptr,
  5504. ptrdiff_t src_stride,
  5505. int dst_width,
  5506. int source_y_fraction) {
  5507. asm volatile(
  5508. "sub %1,%0 \n"
  5509. "cmp $0x0,%3 \n"
  5510. "je 100f \n"
  5511. "cmp $0x80,%3 \n"
  5512. "je 50f \n"
  5513. "movd %3,%%xmm0 \n"
  5514. "neg %3 \n"
  5515. "add $0x100,%3 \n"
  5516. "movd %3,%%xmm5 \n"
  5517. "punpcklbw %%xmm0,%%xmm5 \n"
  5518. "punpcklwd %%xmm5,%%xmm5 \n"
  5519. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5520. "mov $0x80808080,%%eax \n"
  5521. "movd %%eax,%%xmm4 \n"
  5522. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  5523. // General purpose row blend.
  5524. LABELALIGN
  5525. "1: \n"
  5526. "movdqu (%1),%%xmm0 \n"
  5527. "movdqu 0x00(%1,%4,1),%%xmm2 \n"
  5528. "movdqa %%xmm0,%%xmm1 \n"
  5529. "punpcklbw %%xmm2,%%xmm0 \n"
  5530. "punpckhbw %%xmm2,%%xmm1 \n"
  5531. "psubb %%xmm4,%%xmm0 \n"
  5532. "psubb %%xmm4,%%xmm1 \n"
  5533. "movdqa %%xmm5,%%xmm2 \n"
  5534. "movdqa %%xmm5,%%xmm3 \n"
  5535. "pmaddubsw %%xmm0,%%xmm2 \n"
  5536. "pmaddubsw %%xmm1,%%xmm3 \n"
  5537. "paddw %%xmm4,%%xmm2 \n"
  5538. "paddw %%xmm4,%%xmm3 \n"
  5539. "psrlw $0x8,%%xmm2 \n"
  5540. "psrlw $0x8,%%xmm3 \n"
  5541. "packuswb %%xmm3,%%xmm2 \n"
  5542. "movdqu %%xmm2,0x00(%1,%0,1) \n"
  5543. "lea 0x10(%1),%1 \n"
  5544. "sub $0x10,%2 \n"
  5545. "jg 1b \n"
  5546. "jmp 99f \n"
  5547. // Blend 50 / 50.
  5548. LABELALIGN
  5549. "50: \n"
  5550. "movdqu (%1),%%xmm0 \n"
  5551. "movdqu 0x00(%1,%4,1),%%xmm1 \n"
  5552. "pavgb %%xmm1,%%xmm0 \n"
  5553. "movdqu %%xmm0,0x00(%1,%0,1) \n"
  5554. "lea 0x10(%1),%1 \n"
  5555. "sub $0x10,%2 \n"
  5556. "jg 50b \n"
  5557. "jmp 99f \n"
  5558. // Blend 100 / 0 - Copy row unchanged.
  5559. LABELALIGN
  5560. "100: \n"
  5561. "movdqu (%1),%%xmm0 \n"
  5562. "movdqu %%xmm0,0x00(%1,%0,1) \n"
  5563. "lea 0x10(%1),%1 \n"
  5564. "sub $0x10,%2 \n"
  5565. "jg 100b \n"
  5566. "99: \n"
  5567. : "+r"(dst_ptr), // %0
  5568. "+r"(src_ptr), // %1
  5569. "+rm"(dst_width), // %2
  5570. "+r"(source_y_fraction) // %3
  5571. : "r"((intptr_t)(src_stride)) // %4
  5572. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  5573. }
  5574. #endif // HAS_INTERPOLATEROW_SSSE3
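// Illustrative scalar sketch (not part of libyuv; name is hypothetical) of
// the row blend above: each output byte is a weighted average of the two
// source rows, dst = (src * (256 - f) + src_below * f + 128) >> 8. The
// dedicated f == 0 (copy) and f == 128 (pavgb) branches in the SIMD code are
// just special cases of this formula.
static void InterpolateRowSketch_C(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   int dst_width,
                                   int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int f = source_y_fraction;
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] =
        (uint8_t)((src_ptr[i] * (256 - f) + src_ptr1[i] * f + 128) >> 8);
  }
}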
  5575. #ifdef HAS_INTERPOLATEROW_AVX2
  5576. // Bilinear filter 32x2 -> 32x1
  5577. void InterpolateRow_AVX2(uint8_t* dst_ptr,
  5578. const uint8_t* src_ptr,
  5579. ptrdiff_t src_stride,
  5580. int dst_width,
  5581. int source_y_fraction) {
  5582. asm volatile(
  5583. "cmp $0x0,%3 \n"
  5584. "je 100f \n"
  5585. "sub %1,%0 \n"
  5586. "cmp $0x80,%3 \n"
  5587. "je 50f \n"
  5588. "vmovd %3,%%xmm0 \n"
  5589. "neg %3 \n"
  5590. "add $0x100,%3 \n"
  5591. "vmovd %3,%%xmm5 \n"
  5592. "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
  5593. "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
  5594. "vbroadcastss %%xmm5,%%ymm5 \n"
  5595. "mov $0x80808080,%%eax \n"
  5596. "vmovd %%eax,%%xmm4 \n"
  5597. "vbroadcastss %%xmm4,%%ymm4 \n"
  5598. // General purpose row blend.
  5599. LABELALIGN
  5600. "1: \n"
  5601. "vmovdqu (%1),%%ymm0 \n"
  5602. "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
  5603. "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
  5604. "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
  5605. "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
  5606. "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
  5607. "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
  5608. "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
  5609. "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
  5610. "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
  5611. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  5612. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  5613. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  5614. "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
  5615. "lea 0x20(%1),%1 \n"
  5616. "sub $0x20,%2 \n"
  5617. "jg 1b \n"
  5618. "jmp 99f \n"
  5619. // Blend 50 / 50.
  5620. LABELALIGN
  5621. "50: \n"
  5622. "vmovdqu (%1),%%ymm0 \n"
  5623. "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
  5624. "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
  5625. "lea 0x20(%1),%1 \n"
  5626. "sub $0x20,%2 \n"
  5627. "jg 50b \n"
  5628. "jmp 99f \n"
  5629. // Blend 100 / 0 - Copy row unchanged.
  5630. LABELALIGN
  5631. "100: \n"
  5632. "rep movsb \n"
  5633. "jmp 999f \n"
  5634. "99: \n"
  5635. "vzeroupper \n"
  5636. "999: \n"
  5637. : "+D"(dst_ptr), // %0
  5638. "+S"(src_ptr), // %1
  5639. "+cm"(dst_width), // %2
  5640. "+r"(source_y_fraction) // %3
  5641. : "r"((intptr_t)(src_stride)) // %4
  5642. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
  5643. }
  5644. #endif // HAS_INTERPOLATEROW_AVX2
  5645. #ifdef HAS_ARGBSHUFFLEROW_SSSE3
  5646. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5647. void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
  5648. uint8_t* dst_argb,
  5649. const uint8_t* shuffler,
  5650. int width) {
  5651. asm volatile(
  5652. "movdqu (%3),%%xmm5 \n"
  5653. LABELALIGN
  5654. "1: \n"
  5655. "movdqu (%0),%%xmm0 \n"
  5656. "movdqu 0x10(%0),%%xmm1 \n"
  5657. "lea 0x20(%0),%0 \n"
  5658. "pshufb %%xmm5,%%xmm0 \n"
  5659. "pshufb %%xmm5,%%xmm1 \n"
  5660. "movdqu %%xmm0,(%1) \n"
  5661. "movdqu %%xmm1,0x10(%1) \n"
  5662. "lea 0x20(%1),%1 \n"
  5663. "sub $0x8,%2 \n"
  5664. "jg 1b \n"
  5665. : "+r"(src_argb), // %0
  5666. "+r"(dst_argb), // %1
  5667. "+r"(width) // %2
  5668. : "r"(shuffler) // %3
  5669. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  5670. }
  5671. #endif // HAS_ARGBSHUFFLEROW_SSSE3
  5672. #ifdef HAS_ARGBSHUFFLEROW_AVX2
  5673. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5674. void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
  5675. uint8_t* dst_argb,
  5676. const uint8_t* shuffler,
  5677. int width) {
  5678. asm volatile(
  5679. "vbroadcastf128 (%3),%%ymm5 \n"
  5680. LABELALIGN
  5681. "1: \n"
  5682. "vmovdqu (%0),%%ymm0 \n"
  5683. "vmovdqu 0x20(%0),%%ymm1 \n"
  5684. "lea 0x40(%0),%0 \n"
  5685. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  5686. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  5687. "vmovdqu %%ymm0,(%1) \n"
  5688. "vmovdqu %%ymm1,0x20(%1) \n"
  5689. "lea 0x40(%1),%1 \n"
  5690. "sub $0x10,%2 \n"
  5691. "jg 1b \n"
  5692. "vzeroupper \n"
  5693. : "+r"(src_argb), // %0
  5694. "+r"(dst_argb), // %1
  5695. "+r"(width) // %2
  5696. : "r"(shuffler) // %3
  5697. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  5698. }
  5699. #endif // HAS_ARGBSHUFFLEROW_AVX2
  5700. #ifdef HAS_I422TOYUY2ROW_SSE2
  5701. void I422ToYUY2Row_SSE2(const uint8_t* src_y,
  5702. const uint8_t* src_u,
  5703. const uint8_t* src_v,
  5704. uint8_t* dst_yuy2,
  5705. int width) {
  5706. asm volatile(
  5707. "sub %1,%2 \n"
  5708. LABELALIGN
  5709. "1: \n"
  5710. "movq (%1),%%xmm2 \n"
  5711. "movq 0x00(%1,%2,1),%%xmm1 \n"
  5712. "add $0x8,%1 \n"
  5713. "punpcklbw %%xmm1,%%xmm2 \n"
  5714. "movdqu (%0),%%xmm0 \n"
  5715. "add $0x10,%0 \n"
  5716. "movdqa %%xmm0,%%xmm1 \n"
  5717. "punpcklbw %%xmm2,%%xmm0 \n"
  5718. "punpckhbw %%xmm2,%%xmm1 \n"
  5719. "movdqu %%xmm0,(%3) \n"
  5720. "movdqu %%xmm1,0x10(%3) \n"
  5721. "lea 0x20(%3),%3 \n"
  5722. "sub $0x10,%4 \n"
  5723. "jg 1b \n"
  5724. : "+r"(src_y), // %0
  5725. "+r"(src_u), // %1
  5726. "+r"(src_v), // %2
  5727. "+r"(dst_yuy2), // %3
  5728. "+rm"(width) // %4
  5729. :
  5730. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5731. }
  5732. #endif // HAS_I422TOYUY2ROW_SSE2
  5733. #ifdef HAS_I422TOUYVYROW_SSE2
  5734. void I422ToUYVYRow_SSE2(const uint8_t* src_y,
  5735. const uint8_t* src_u,
  5736. const uint8_t* src_v,
  5737. uint8_t* dst_uyvy,
  5738. int width) {
  5739. asm volatile(
  5740. "sub %1,%2 \n"
  5741. LABELALIGN
  5742. "1: \n"
  5743. "movq (%1),%%xmm2 \n"
  5744. "movq 0x00(%1,%2,1),%%xmm1 \n"
  5745. "add $0x8,%1 \n"
  5746. "punpcklbw %%xmm1,%%xmm2 \n"
  5747. "movdqu (%0),%%xmm0 \n"
  5748. "movdqa %%xmm2,%%xmm1 \n"
  5749. "add $0x10,%0 \n"
  5750. "punpcklbw %%xmm0,%%xmm1 \n"
  5751. "punpckhbw %%xmm0,%%xmm2 \n"
  5752. "movdqu %%xmm1,(%3) \n"
  5753. "movdqu %%xmm2,0x10(%3) \n"
  5754. "lea 0x20(%3),%3 \n"
  5755. "sub $0x10,%4 \n"
  5756. "jg 1b \n"
  5757. : "+r"(src_y), // %0
  5758. "+r"(src_u), // %1
  5759. "+r"(src_v), // %2
  5760. "+r"(dst_uyvy), // %3
  5761. "+rm"(width) // %4
  5762. :
  5763. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5764. }
  5765. #endif // HAS_I422TOUYVYROW_SSE2
  5766. #ifdef HAS_I422TOYUY2ROW_AVX2
  5767. void I422ToYUY2Row_AVX2(const uint8_t* src_y,
  5768. const uint8_t* src_u,
  5769. const uint8_t* src_v,
  5770. uint8_t* dst_yuy2,
  5771. int width) {
  5772. asm volatile(
  5773. "sub %1,%2 \n"
  5774. LABELALIGN
  5775. "1: \n"
  5776. "vpmovzxbw (%1),%%ymm1 \n"
  5777. "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
  5778. "add $0x10,%1 \n"
  5779. "vpsllw $0x8,%%ymm2,%%ymm2 \n"
  5780. "vpor %%ymm1,%%ymm2,%%ymm2 \n"
  5781. "vmovdqu (%0),%%ymm0 \n"
  5782. "add $0x20,%0 \n"
  5783. "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
  5784. "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
  5785. "vextractf128 $0x0,%%ymm1,(%3) \n"
  5786. "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
  5787. "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
  5788. "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
  5789. "lea 0x40(%3),%3 \n"
  5790. "sub $0x20,%4 \n"
  5791. "jg 1b \n"
  5792. "vzeroupper \n"
  5793. : "+r"(src_y), // %0
  5794. "+r"(src_u), // %1
  5795. "+r"(src_v), // %2
  5796. "+r"(dst_yuy2), // %3
  5797. "+rm"(width) // %4
  5798. :
  5799. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5800. }
  5801. #endif // HAS_I422TOYUY2ROW_AVX2
  5802. #ifdef HAS_I422TOUYVYROW_AVX2
  5803. void I422ToUYVYRow_AVX2(const uint8_t* src_y,
  5804. const uint8_t* src_u,
  5805. const uint8_t* src_v,
  5806. uint8_t* dst_uyvy,
  5807. int width) {
  5808. asm volatile(
  5809. "sub %1,%2 \n"
  5810. LABELALIGN
  5811. "1: \n"
  5812. "vpmovzxbw (%1),%%ymm1 \n"
  5813. "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
  5814. "add $0x10,%1 \n"
  5815. "vpsllw $0x8,%%ymm2,%%ymm2 \n"
  5816. "vpor %%ymm1,%%ymm2,%%ymm2 \n"
  5817. "vmovdqu (%0),%%ymm0 \n"
  5818. "add $0x20,%0 \n"
  5819. "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
  5820. "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
  5821. "vextractf128 $0x0,%%ymm1,(%3) \n"
  5822. "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
  5823. "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
  5824. "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
  5825. "lea 0x40(%3),%3 \n"
  5826. "sub $0x20,%4 \n"
  5827. "jg 1b \n"
  5828. "vzeroupper \n"
  5829. : "+r"(src_y), // %0
  5830. "+r"(src_u), // %1
  5831. "+r"(src_v), // %2
  5832. "+r"(dst_uyvy), // %3
  5833. "+rm"(width) // %4
  5834. :
  5835. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5836. }
  5837. #endif // HAS_I422TOUYVYROW_AVX2
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "pxor %%xmm3,%%xmm3 \n"
      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm3,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "punpcklwd %%xmm3,%%xmm0 \n"
      "punpckhwd %%xmm3,%%xmm4 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm4,%%xmm4 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "mulps 0x10(%3),%%xmm0 \n"
      "mulps 0x10(%3),%%xmm4 \n"
      "addps (%3),%%xmm0 \n"
      "addps (%3),%%xmm4 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "mulps %%xmm1,%%xmm2 \n"
      "mulps %%xmm5,%%xmm6 \n"
      "mulps %%xmm2,%%xmm1 \n"
      "mulps %%xmm6,%%xmm5 \n"
      "mulps 0x20(%3),%%xmm2 \n"
      "mulps 0x20(%3),%%xmm6 \n"
      "mulps 0x30(%3),%%xmm1 \n"
      "mulps 0x30(%3),%%xmm5 \n"
      "addps %%xmm2,%%xmm0 \n"
      "addps %%xmm6,%%xmm4 \n"
      "addps %%xmm1,%%xmm0 \n"
      "addps %%xmm5,%%xmm4 \n"
      "cvttps2dq %%xmm0,%%xmm0 \n"
      "cvttps2dq %%xmm4,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4 \n"
      "vbroadcastf128 0x10(%3),%%ymm5 \n"
      "vbroadcastf128 0x20(%3),%%ymm6 \n"
      "vbroadcastf128 0x30(%3),%%ymm7 \n"
      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm0 \n"  // 2 ARGB pixels
      "lea 0x8(%0),%0 \n"
      "vcvtdq2ps %%ymm0,%%ymm0 \n"           // X 8 floats
      "vmulps %%ymm0,%%ymm0,%%ymm2 \n"       // X * X
      "vmulps %%ymm7,%%ymm0,%%ymm3 \n"       // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
      "vcvttps2dq %%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
      "vmovq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
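
// Reference sketch (illustrative only, not part of libyuv's API): the cubic
// that ARGBPolynomialRow_SSE2/AVX2 above evaluate per byte,
// result = C0 + C1*X + C2*X*X + C3*X*X*X, with poly holding C0..C3 as four
// groups of 4 floats (one coefficient per channel, in memory order). The
// function name and the ROW_GCC_REFERENCE_SKETCHES guard are hypothetical;
// the clamp approximates the cvttps2dq + packuswb saturation in the asm.
#ifdef ROW_GCC_REFERENCE_SKETCHES
static void ARGBPolynomialRow_ScalarSketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           const float* poly,
                                           int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
                poly[12 + c] * x * x * x;
      if (v < 0.f) v = 0.f;      // saturate low
      if (v > 255.f) v = 255.f;  // saturate high
      dst_argb[i * 4 + c] = (uint8_t)v;
    }
  }
}
#endif  // ROW_GCC_REFERENCE_SKETCHES
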
#ifdef HAS_HALFFLOATROW_SSE2
// kScaleBias is 2^-112. Pre-multiplying by scale * 2^-112 rebiases the float
// exponent so that, for the non-negative values produced here, the raw float
// bits shifted right by 13 are already the IEEE half-float encoding.
static float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd %3,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      "sub %0,%1 \n"
      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // 8 shorts
      "add $0x10,%0 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/3
      "cvtdq2ps %%xmm2,%%xmm2 \n"   // 8 floats
      "punpckhwd %%xmm5,%%xmm3 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "psrld $0xd,%%xmm2 \n"
      "psrld $0xd,%%xmm3 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,-0x10(%0,%1,1) \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(scale)    // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2
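
// Reference sketch (illustrative only, not part of libyuv's API): the scalar
// form of the half-float trick used above. Multiplying by scale * kScaleBias
// (scale * 2^-112) rebiases the float exponent so that the raw float bits,
// shifted right by 13, already form the IEEE half-float value; this assumes
// the scaled result stays within finite half range, as in the kernels. The
// function name and the ROW_GCC_REFERENCE_SKETCHES guard are hypothetical.
#ifdef ROW_GCC_REFERENCE_SKETCHES
#include <string.h>  // memcpy, for the float -> bits cast
static void HalfFloatRow_ScalarSketch(const uint16_t* src,
                                      uint16_t* dst,
                                      float scale,
                                      int width) {
  const float bias = scale * 1.9259299444e-34f;  // scale * 2^-112
  for (int i = 0; i < width; ++i) {
    float f = (float)src[i] * bias;
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));  // reinterpret float as its bit pattern
    dst[i] = (uint16_t)(bits >> 13);  // matches psrld $0xd + pack above
  }
}
#endif  // ROW_GCC_REFERENCE_SKETCHES
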
#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // 16 shorts
      "add $0x20,%0 \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vpsrld $0xd,%%ymm3,%%ymm3 \n"
      "vpsrld $0xd,%%ymm2,%%ymm2 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
      "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
#if defined(__x86_64__)
      : "x"(scale)    // %3
#else
      : "m"(scale)    // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
#if defined(__x86_64__)
      : "x"(scale)    // %3
#else
      : "m"(scale)    // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C
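
// Reference sketch (illustrative only, not part of libyuv's API): the same
// 16-pixel step as HalfFloatRow_F16C above, written with intrinsics instead
// of inline asm (vpmovzxwd, vcvtdq2ps, vmulps, vcvtps2ph). Requires AVX2 and
// F16C (-mavx2 -mf16c); the function name and the ROW_GCC_REFERENCE_SKETCHES
// guard are hypothetical, and width is assumed to be a multiple of 16.
#ifdef ROW_GCC_REFERENCE_SKETCHES
#include <immintrin.h>
static void HalfFloatRow_F16C_IntrinsicsSketch(const uint16_t* src,
                                               uint16_t* dst,
                                               float scale,
                                               int width) {
  const __m256 vscale = _mm256_set1_ps(scale);
  for (int i = 0; i < width; i += 16) {
    __m128i s0 = _mm_loadu_si128((const __m128i*)(src + i));
    __m128i s1 = _mm_loadu_si128((const __m128i*)(src + i + 8));
    __m256 f0 = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s0));
    __m256 f1 = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s1));
    f0 = _mm256_mul_ps(f0, vscale);
    f1 = _mm256_mul_ps(f1, vscale);
    // Immediate 3 = round toward zero, matching the "$3" used in the asm.
    _mm_storeu_si128((__m128i*)(dst + i), _mm256_cvtps_ph(f0, 3));
    _mm_storeu_si128((__m128i*)(dst + i + 8), _mm256_cvtps_ph(f1, 3));
  }
}
#endif  // ROW_GCC_REFERENCE_SKETCHES
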
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
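
// Reference sketch (illustrative only, not part of libyuv's API): the
// in-place per-channel lookup performed by ARGBColorTableRow_X86 and
// RGBColorTableRow_X86 above. Each byte indexes its own column of the
// interleaved 256-entry table; the RGB variant leaves the fourth (alpha)
// byte untouched but still steps 4 bytes per pixel. The function name and
// the ROW_GCC_REFERENCE_SKETCHES guard are hypothetical.
#ifdef ROW_GCC_REFERENCE_SKETCHES
static void ColorTableRow_ScalarSketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb,
                                       int width,
                                       int channels) {  // 3 for RGB, 4 for ARGB
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < channels; ++c) {
      dst_argb[i * 4 + c] = table_argb[dst_argb[i * 4 + c] * 4 + c];
    }
  }
}
#endif  // ROW_GCC_REFERENCE_SKETCHES
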
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table; alpha is copied through unchanged.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm0 \n"
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"
      "mov %b0,0x3(%3) \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"
      "mov %b0,0x7(%3) \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"
      "mov %b0,0xb(%3) \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
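
// Reference sketch (illustrative only, not part of libyuv's API): an
// approximation of ARGBLumaColorTableRow_SSSE3 above. Per pixel, a weighted
// sum of the four bytes (weights packed little-endian in lumacoeff, computed
// with pmaddubsw + phaddw in the asm) is masked to a multiple of 256 and
// selects one 256-byte row of 'luma'; B, G and R are looked up in that row
// and alpha is copied through. The function name and the
// ROW_GCC_REFERENCE_SKETCHES guard are hypothetical, and the 16-bit wrap of
// phaddw is not reproduced exactly here.
#ifdef ROW_GCC_REFERENCE_SKETCHES
static void ARGBLumaColorTableRow_ScalarSketch(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width,
                                               const uint8_t* luma,
                                               uint32_t lumacoeff) {
  const uint8_t* w = (const uint8_t*)&lumacoeff;  // byte weights, x86 is LE
  for (int i = 0; i < width; ++i) {
    const uint8_t* p = src_argb + i * 4;
    uint32_t sum = p[0] * w[0] + p[1] * w[1] + p[2] * w[2] + p[3] * w[3];
    const uint8_t* row = luma + (sum & 0xff00u);  // 256-byte table row
    dst_argb[i * 4 + 0] = row[p[0]];
    dst_argb[i * 4 + 1] = row[p[1]];
    dst_argb[i * 4 + 2] = row[p[2]];
    dst_argb[i * 4 + 3] = p[3];  // alpha copied, not transformed
  }
}
#endif  // ROW_GCC_REFERENCE_SKETCHES
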
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif