row_win.cc

/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  u_buf += 4; \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  u_buf += 4; \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  y_buf += 8; \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \
  xmm1 = _mm_loadu_si128(&xmm0); \
  xmm2 = _mm_loadu_si128(&xmm0); \
  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
  xmm0 = _mm_adds_epi16(xmm0, xmm4); \
  xmm1 = _mm_adds_epi16(xmm1, xmm4); \
  xmm2 = _mm_adds_epi16(xmm2, xmm4); \
  xmm0 = _mm_srai_epi16(xmm0, 6); \
  xmm1 = _mm_srai_epi16(xmm1, 6); \
  xmm2 = _mm_srai_epi16(xmm2, 6); \
  xmm0 = _mm_packus_epi16(xmm0, xmm0); \
  xmm1 = _mm_packus_epi16(xmm1, xmm1); \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);

// Store 8 ARGB values.
#define STOREARGB \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
  xmm1 = _mm_loadu_si128(&xmm0); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
  _mm_storeu_si128((__m128i *)dst_argb, xmm0); \
  _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
  dst_argb += 32;
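
// A scalar sketch of what READYUV422 + YUVTORGB + STOREARGB compute per
// pixel, for readers following the macros above: each channel is a bias
// minus a UV dot product, plus a gained Y term, shifted down by 6 bits and
// saturated to 8 bits.  The SketchYuvPixel/SketchClamp helpers and the
// coefficient parameter names are illustrative additions, not libyuv API;
// the real coefficients and biases are carried in struct YuvConstants.
static __inline uint8 SketchClamp(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void SketchYuvPixel(uint8 y, uint8 u, uint8 v,
                                    int ub, int ug, int vg, int vr,
                                    int bb, int bg, int br, int yg,
                                    uint8* b, uint8* g, uint8* r) {
  // _mm_mulhi_epu16 step: y is replicated into a 16 bit lane (y * 0x0101)
  // and scaled by the 16 bit Y gain, keeping the high half.
  int y1 = (int)(((uint32)y * 0x0101 * yg) >> 16);
  // _mm_sub_epi16 / _mm_adds_epi16 / _mm_srai_epi16 steps, per channel.
  *b = SketchClamp((bb - (u * ub) + y1) >> 6);
  *g = SketchClamp((bg - (u * ug + v * vg) + y1) >> 6);
  *r = SketchClamp((br - (v * vr) + y1) >> 6);
}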
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
// JPeg full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};
static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};
static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};
static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
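
// For reference, a scalar reading of the ARGB tables above.  The ARGB pixel
// is stored B, G, R, A in memory; the Y path shifts the 7 bit weighted sum
// down by 7 and adds kAddY16, and the U/V paths shift down by 8 (arithmetic,
// mirroring psraw) and add kAddUV128 (the real UV rows also average 2x2
// pixels first).  These Sketch* helpers are illustrative additions, not
// libyuv functions, and assume the BT.601 (non JPeg) path.
static __inline uint8 SketchARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
static __inline uint8 SketchARGBToU(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static __inline uint8 SketchARGBToV(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}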
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};
// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};
// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};
// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
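
// The tables above feed pshufb/vpshufb: output byte i becomes
// src[shuffle[i] & 15], or 0 when the index has its high bit set (128).
// A scalar sketch of that behaviour (illustrative helper, not libyuv API):
static __inline void SketchShuffle16(const uint8* src, const uint8* shuffle,
                                     uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (uint8)((shuffle[i] & 0x80) ? 0 : src[shuffle[i] & 15]);
  }
}
// e.g. kShuffleMaskRGB24ToARGB places bytes 0, 1, 2 (B, G, R of the first
// RGB24 pixel) at bytes 0..2 of the first ARGB pixel; the alpha bytes are
// then forced to 0xff by the "por xmm, xmm5" in RGB24ToARGBRow_SSSE3 below.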

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24

  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
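
// Scalar equivalent of the row above (reference only, not used): each gray
// sample expands to one ARGB pixel with B = G = R = Y and opaque alpha.
static __inline uint32 SketchJ400ToARGB(uint8 y) {
  return 0xff000000u | ((uint32)y * 0x00010101u);  // replicate y into B, G, R
}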

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8
    vpunpcklbw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0
    vpunpcklwd ymm0, ymm0, ymm0
    vpor ymm0, ymm0, ymm5
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_rgb24
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int width) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_rgb24
    mov ecx, [esp + 12]  // width
    movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 4]
    movdqu xmm2, [eax + 8]
    lea eax, [eax + 24]
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8
    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pmulhuw xmm1, xmm5  // * (256 + 8)
    pmulhuw xmm2, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    pand xmm0, xmm4  // G in middle 6 bits
    pmulhuw xmm0, xmm6  // << 5 * (256 + 4)
    por xmm0, xmm7  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
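
// Scalar view of the pmulhuw trick used above and in the AVX2 version below
// (illustrative helpers, not libyuv API): with the 5 bit field already
// shifted to the top of a 16 bit lane, multiplying by 0x0108 and keeping the
// high 16 bits is the usual (v << 3) | (v >> 2) replication; 0x2080 does the
// same for the 6 bit green field.
static __inline uint8 SketchExpand5(uint8 v5) {  // 0..31 -> 0..255
  return (uint8)((v5 << 3) | (v5 >> 2));  // == ((v5 << 11) * 0x0108) >> 16
}
static __inline uint8 SketchExpand6(uint8 v6) {  // 0..63 -> 0..255
  return (uint8)((v6 << 2) | (v6 >> 4));  // == ((v6 << 5) * 0x2080) >> 16
}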

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8
    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2  // RB
    vpand ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7  // AG
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8
    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw ymm2, ymm0, 11  // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2  // RB
    vpsraw ymm2, ymm0, 8  // A
    vpand ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2  // AG
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5  // mask high nibbles
    vpand ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3
    vpor ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8
    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5  // * (256 + 8)
    pmulhuw xmm1, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // G in middle 5 bits
    psraw xmm2, 8  // A
    pmulhuw xmm0, xmm6  // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // mask low nibbles
    pand xmm2, xmm5  // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
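
// Scalar view of the 4444 expansion used by both ARGB4444ToARGBRow versions
// above (illustrative helper, not libyuv API): a 4 bit field is widened to
// 8 bits by repeating the nibble, e.g. 0xA -> 0xAA, which is what the
// mask/shift/or sequence builds.
static __inline uint8 SketchExpand4(uint8 v4) {  // 0..15 -> 0..255
  return (uint8)((v4 << 4) | v4);  // == v4 * 0x11
}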

__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}
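
// Scalar equivalent of the packing above (reference only, not used): every
// 4 byte ARGB pixel contributes its B, G, R bytes and drops alpha, so each
// 16 source bytes shrink to 12; the pslldq/psrldq/por sequence then stitches
// the 12 byte pieces back into three full 16 byte stores.
static __inline void SketchARGBToRGB24Pixel(const uint8* src_argb,
                                            uint8* dst_rgb24) {
  dst_rgb24[0] = src_argb[0];  // B
  dst_rgb24[1] = src_argb[1];  // G
  dst_rgb24[2] = src_argb[2];  // R
}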

__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
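
// Scalar view of the 565 packing above (illustrative helper, not libyuv
// API): keep the top 5/6/5 bits of B, G and R and concatenate them, which
// is what the shift/mask/or sequence builds in each 32 bit lane before
// packssdw narrows it to 16 bits.
static __inline uint16 SketchARGBToRGB565(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}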

__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    movd xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    paddusb xmm0, xmm6  // add dither
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
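
// The dither variant above (and the AVX2 one below) adds a per-column bias
// before truncating to 565: dither4 holds one byte per output column modulo
// 4, and the unpacks replicate that byte across a pixel's channels before
// the saturating paddusb.  A scalar sketch, reference only:
static __inline uint16 SketchARGBToRGB565Dither(uint8 b, uint8 g, uint8 r,
                                                uint8 dither) {
  int b2 = b + dither; if (b2 > 255) b2 = 255;  // paddusb saturates
  int g2 = g + dither; if (g2 > 255) g2 = 255;
  int r2 = r + dither; if (r2 > 255) r2 = 255;
  return (uint16)((b2 >> 3) | ((g2 >> 2) << 5) | ((r2 >> 3) << 11));
}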

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6  // add dither
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4  // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4  // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7  // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    movdqa xmm3, xmm0  // R
    psrad xmm0, 16  // A
    psrld xmm1, 3  // B
    psrld xmm2, 6  // G
    psrld xmm3, 9  // R
    pand xmm0, xmm7  // A
    pand xmm1, xmm4  // B
    pand xmm2, xmm5  // G
    pand xmm3, xmm6  // R
    por xmm0, xmm1  // BA
    por xmm2, xmm3  // GR
    por xmm0, xmm2  // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
  908. __declspec(naked)
  909. void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  910. __asm {
  911. mov eax, [esp + 4] // src_argb
  912. mov edx, [esp + 8] // dst_rgb
  913. mov ecx, [esp + 12] // width
  914. pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
  915. psllw xmm4, 12
  916. movdqa xmm3, xmm4 // generate mask 0x00f000f0
  917. psrlw xmm3, 8
  918. convertloop:
  919. movdqu xmm0, [eax] // fetch 4 pixels of argb
  920. movdqa xmm1, xmm0
  921. pand xmm0, xmm3 // low nibble
  922. pand xmm1, xmm4 // high nibble
  923. psrld xmm0, 4
  924. psrld xmm1, 8
  925. por xmm0, xmm1
  926. packuswb xmm0, xmm0
  927. lea eax, [eax + 16]
  928. movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
  929. lea edx, [edx + 8]
  930. sub ecx, 4
  931. jg convertloop
  932. ret
  933. }
  934. }
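// Illustrative scalar sketch (not part of libyuv): ARGB8888 to ARGB4444 packing,
// keeping the high nibble of each channel, which is what the 0xf000f000 and
// 0x00f000f0 masks above select. The helper name is hypothetical.
static uint16 ARGBPixelToARGB4444_Sketch(const uint8* argb_pixel) {
  uint8 b = argb_pixel[0];
  uint8 g = argb_pixel[1];
  uint8 r = argb_pixel[2];
  uint8 a = argb_pixel[3];
  return (uint16)((b >> 4) | (g & 0xf0) | ((r >> 4) << 8) | ((a & 0xf0) << 8));
}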
  935. #ifdef HAS_ARGBTORGB565ROW_AVX2
  936. __declspec(naked)
  937. void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  938. __asm {
  939. mov eax, [esp + 4] // src_argb
  940. mov edx, [esp + 8] // dst_rgb
  941. mov ecx, [esp + 12] // width
  942. vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
  943. vpsrld ymm3, ymm3, 27
  944. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
  945. vpsrld ymm4, ymm4, 26
  946. vpslld ymm4, ymm4, 5
  947. vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
  948. convertloop:
  949. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  950. vpsrld ymm2, ymm0, 5 // G
  951. vpsrld ymm1, ymm0, 3 // B
  952. vpsrld ymm0, ymm0, 8 // R
  953. vpand ymm2, ymm2, ymm4 // G
  954. vpand ymm1, ymm1, ymm3 // B
  955. vpand ymm0, ymm0, ymm5 // R
  956. vpor ymm1, ymm1, ymm2 // BG
  957. vpor ymm0, ymm0, ymm1 // BGR
  958. vpackusdw ymm0, ymm0, ymm0
  959. vpermq ymm0, ymm0, 0xd8
  960. lea eax, [eax + 32]
  961. vmovdqu [edx], xmm0 // store 8 pixels of RGB565
  962. lea edx, [edx + 16]
  963. sub ecx, 8
  964. jg convertloop
  965. vzeroupper
  966. ret
  967. }
  968. }
  969. #endif // HAS_ARGBTORGB565ROW_AVX2
  970. #ifdef HAS_ARGBTOARGB1555ROW_AVX2
  971. __declspec(naked)
  972. void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  973. __asm {
  974. mov eax, [esp + 4] // src_argb
  975. mov edx, [esp + 8] // dst_rgb
  976. mov ecx, [esp + 12] // width
  977. vpcmpeqb ymm4, ymm4, ymm4
  978. vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
  979. vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
  980. vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
  981. vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
  982. vpslld ymm7, ymm7, 15
  983. convertloop:
  984. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  985. vpsrld ymm3, ymm0, 9 // R
  986. vpsrld ymm2, ymm0, 6 // G
  987. vpsrld ymm1, ymm0, 3 // B
  988. vpsrad ymm0, ymm0, 16 // A
  989. vpand ymm3, ymm3, ymm6 // R
  990. vpand ymm2, ymm2, ymm5 // G
  991. vpand ymm1, ymm1, ymm4 // B
  992. vpand ymm0, ymm0, ymm7 // A
  993. vpor ymm0, ymm0, ymm1 // BA
  994. vpor ymm2, ymm2, ymm3 // GR
  995. vpor ymm0, ymm0, ymm2 // BGRA
  996. vpackssdw ymm0, ymm0, ymm0
  997. vpermq ymm0, ymm0, 0xd8
  998. lea eax, [eax + 32]
  999. vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
  1000. lea edx, [edx + 16]
  1001. sub ecx, 8
  1002. jg convertloop
  1003. vzeroupper
  1004. ret
  1005. }
  1006. }
  1007. #endif // HAS_ARGBTOARGB1555ROW_AVX2
  1008. #ifdef HAS_ARGBTOARGB4444ROW_AVX2
  1009. __declspec(naked)
  1010. void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  1011. __asm {
  1012. mov eax, [esp + 4] // src_argb
  1013. mov edx, [esp + 8] // dst_rgb
  1014. mov ecx, [esp + 12] // width
  1015. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
  1016. vpsllw ymm4, ymm4, 12
  1017. vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
  1018. convertloop:
  1019. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  1020. vpand ymm1, ymm0, ymm4 // high nibble
  1021. vpand ymm0, ymm0, ymm3 // low nibble
  1022. vpsrld ymm1, ymm1, 8
  1023. vpsrld ymm0, ymm0, 4
  1024. vpor ymm0, ymm0, ymm1
  1025. vpackuswb ymm0, ymm0, ymm0
  1026. vpermq ymm0, ymm0, 0xd8
  1027. lea eax, [eax + 32]
  1028. vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
  1029. lea edx, [edx + 16]
  1030. sub ecx, 8
  1031. jg convertloop
  1032. vzeroupper
  1033. ret
  1034. }
  1035. }
  1036. #endif // HAS_ARGBTOARGB4444ROW_AVX2
  1037. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  1038. __declspec(naked)
  1039. void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  1040. __asm {
  1041. mov eax, [esp + 4] /* src_argb */
  1042. mov edx, [esp + 8] /* dst_y */
  1043. mov ecx, [esp + 12] /* width */
  1044. movdqa xmm4, xmmword ptr kARGBToY
  1045. movdqa xmm5, xmmword ptr kAddY16
  1046. convertloop:
  1047. movdqu xmm0, [eax]
  1048. movdqu xmm1, [eax + 16]
  1049. movdqu xmm2, [eax + 32]
  1050. movdqu xmm3, [eax + 48]
  1051. pmaddubsw xmm0, xmm4
  1052. pmaddubsw xmm1, xmm4
  1053. pmaddubsw xmm2, xmm4
  1054. pmaddubsw xmm3, xmm4
  1055. lea eax, [eax + 64]
  1056. phaddw xmm0, xmm1
  1057. phaddw xmm2, xmm3
  1058. psrlw xmm0, 7
  1059. psrlw xmm2, 7
  1060. packuswb xmm0, xmm2
  1061. paddb xmm0, xmm5
  1062. movdqu [edx], xmm0
  1063. lea edx, [edx + 16]
  1064. sub ecx, 16
  1065. jg convertloop
  1066. ret
  1067. }
  1068. }
  1069. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different (JPEG) coefficients, no +16 offset, and rounding before the shift.
  1071. __declspec(naked)
  1072. void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  1073. __asm {
  1074. mov eax, [esp + 4] /* src_argb */
  1075. mov edx, [esp + 8] /* dst_y */
  1076. mov ecx, [esp + 12] /* width */
  1077. movdqa xmm4, xmmword ptr kARGBToYJ
  1078. movdqa xmm5, xmmword ptr kAddYJ64
  1079. convertloop:
  1080. movdqu xmm0, [eax]
  1081. movdqu xmm1, [eax + 16]
  1082. movdqu xmm2, [eax + 32]
  1083. movdqu xmm3, [eax + 48]
  1084. pmaddubsw xmm0, xmm4
  1085. pmaddubsw xmm1, xmm4
  1086. pmaddubsw xmm2, xmm4
  1087. pmaddubsw xmm3, xmm4
  1088. lea eax, [eax + 64]
  1089. phaddw xmm0, xmm1
  1090. phaddw xmm2, xmm3
  1091. paddw xmm0, xmm5 // Add .5 for rounding.
  1092. paddw xmm2, xmm5
  1093. psrlw xmm0, 7
  1094. psrlw xmm2, 7
  1095. packuswb xmm0, xmm2
  1096. movdqu [edx], xmm0
  1097. lea edx, [edx + 16]
  1098. sub ecx, 16
  1099. jg convertloop
  1100. ret
  1101. }
  1102. }
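// Illustrative scalar sketch (not part of libyuv): the per-pixel math of the two
// Y conversions above. Both multiply B, G, R by a weight table (pmaddubsw +
// phaddw) and shift right by 7. The numeric weights written here are the nominal
// BT.601 values scaled by 128 and are an assumption; the kARGBToY / kARGBToYJ
// tables defined earlier in this file are authoritative. ARGBToYRow truncates and
// adds the +16 studio-range offset; ARGBToYJRow adds 64 (i.e. +0.5) before the
// shift and has no offset. Helper names are hypothetical.
static uint8 ARGBPixelToY_Sketch(const uint8* argb_pixel) {
  int b = argb_pixel[0], g = argb_pixel[1], r = argb_pixel[2];
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
static uint8 ARGBPixelToYJ_Sketch(const uint8* argb_pixel) {
  int b = argb_pixel[0], g = argb_pixel[1], r = argb_pixel[2];
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
}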
  1103. #ifdef HAS_ARGBTOYROW_AVX2
// vpermd permutation to undo the lane interleaving of vphaddw + vpackuswb.
  1105. static const lvec32 kPermdARGBToY_AVX = {
  1106. 0, 4, 1, 5, 2, 6, 3, 7
  1107. };
  1108. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1109. __declspec(naked)
  1110. void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  1111. __asm {
  1112. mov eax, [esp + 4] /* src_argb */
  1113. mov edx, [esp + 8] /* dst_y */
  1114. mov ecx, [esp + 12] /* width */
  1115. vbroadcastf128 ymm4, xmmword ptr kARGBToY
  1116. vbroadcastf128 ymm5, xmmword ptr kAddY16
  1117. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1118. convertloop:
  1119. vmovdqu ymm0, [eax]
  1120. vmovdqu ymm1, [eax + 32]
  1121. vmovdqu ymm2, [eax + 64]
  1122. vmovdqu ymm3, [eax + 96]
  1123. vpmaddubsw ymm0, ymm0, ymm4
  1124. vpmaddubsw ymm1, ymm1, ymm4
  1125. vpmaddubsw ymm2, ymm2, ymm4
  1126. vpmaddubsw ymm3, ymm3, ymm4
  1127. lea eax, [eax + 128]
  1128. vphaddw ymm0, ymm0, ymm1 // mutates.
  1129. vphaddw ymm2, ymm2, ymm3
  1130. vpsrlw ymm0, ymm0, 7
  1131. vpsrlw ymm2, ymm2, 7
  1132. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1133. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1134. vpaddb ymm0, ymm0, ymm5 // add 16 for Y
  1135. vmovdqu [edx], ymm0
  1136. lea edx, [edx + 32]
  1137. sub ecx, 32
  1138. jg convertloop
  1139. vzeroupper
  1140. ret
  1141. }
  1142. }
  1143. #endif // HAS_ARGBTOYROW_AVX2
  1144. #ifdef HAS_ARGBTOYJROW_AVX2
  1145. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1146. __declspec(naked)
  1147. void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  1148. __asm {
  1149. mov eax, [esp + 4] /* src_argb */
  1150. mov edx, [esp + 8] /* dst_y */
  1151. mov ecx, [esp + 12] /* width */
  1152. vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
  1153. vbroadcastf128 ymm5, xmmword ptr kAddYJ64
  1154. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1155. convertloop:
  1156. vmovdqu ymm0, [eax]
  1157. vmovdqu ymm1, [eax + 32]
  1158. vmovdqu ymm2, [eax + 64]
  1159. vmovdqu ymm3, [eax + 96]
  1160. vpmaddubsw ymm0, ymm0, ymm4
  1161. vpmaddubsw ymm1, ymm1, ymm4
  1162. vpmaddubsw ymm2, ymm2, ymm4
  1163. vpmaddubsw ymm3, ymm3, ymm4
  1164. lea eax, [eax + 128]
  1165. vphaddw ymm0, ymm0, ymm1 // mutates.
  1166. vphaddw ymm2, ymm2, ymm3
  1167. vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
  1168. vpaddw ymm2, ymm2, ymm5
  1169. vpsrlw ymm0, ymm0, 7
  1170. vpsrlw ymm2, ymm2, 7
  1171. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1172. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1173. vmovdqu [edx], ymm0
  1174. lea edx, [edx + 32]
  1175. sub ecx, 32
  1176. jg convertloop
  1177. vzeroupper
  1178. ret
  1179. }
  1180. }
  1181. #endif // HAS_ARGBTOYJROW_AVX2
  1182. __declspec(naked)
  1183. void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  1184. __asm {
  1185. mov eax, [esp + 4] /* src_argb */
  1186. mov edx, [esp + 8] /* dst_y */
  1187. mov ecx, [esp + 12] /* width */
  1188. movdqa xmm4, xmmword ptr kBGRAToY
  1189. movdqa xmm5, xmmword ptr kAddY16
  1190. convertloop:
  1191. movdqu xmm0, [eax]
  1192. movdqu xmm1, [eax + 16]
  1193. movdqu xmm2, [eax + 32]
  1194. movdqu xmm3, [eax + 48]
  1195. pmaddubsw xmm0, xmm4
  1196. pmaddubsw xmm1, xmm4
  1197. pmaddubsw xmm2, xmm4
  1198. pmaddubsw xmm3, xmm4
  1199. lea eax, [eax + 64]
  1200. phaddw xmm0, xmm1
  1201. phaddw xmm2, xmm3
  1202. psrlw xmm0, 7
  1203. psrlw xmm2, 7
  1204. packuswb xmm0, xmm2
  1205. paddb xmm0, xmm5
  1206. movdqu [edx], xmm0
  1207. lea edx, [edx + 16]
  1208. sub ecx, 16
  1209. jg convertloop
  1210. ret
  1211. }
  1212. }
  1213. __declspec(naked)
  1214. void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  1215. __asm {
  1216. mov eax, [esp + 4] /* src_argb */
  1217. mov edx, [esp + 8] /* dst_y */
  1218. mov ecx, [esp + 12] /* width */
  1219. movdqa xmm4, xmmword ptr kABGRToY
  1220. movdqa xmm5, xmmword ptr kAddY16
  1221. convertloop:
  1222. movdqu xmm0, [eax]
  1223. movdqu xmm1, [eax + 16]
  1224. movdqu xmm2, [eax + 32]
  1225. movdqu xmm3, [eax + 48]
  1226. pmaddubsw xmm0, xmm4
  1227. pmaddubsw xmm1, xmm4
  1228. pmaddubsw xmm2, xmm4
  1229. pmaddubsw xmm3, xmm4
  1230. lea eax, [eax + 64]
  1231. phaddw xmm0, xmm1
  1232. phaddw xmm2, xmm3
  1233. psrlw xmm0, 7
  1234. psrlw xmm2, 7
  1235. packuswb xmm0, xmm2
  1236. paddb xmm0, xmm5
  1237. movdqu [edx], xmm0
  1238. lea edx, [edx + 16]
  1239. sub ecx, 16
  1240. jg convertloop
  1241. ret
  1242. }
  1243. }
  1244. __declspec(naked)
  1245. void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  1246. __asm {
  1247. mov eax, [esp + 4] /* src_argb */
  1248. mov edx, [esp + 8] /* dst_y */
  1249. mov ecx, [esp + 12] /* width */
  1250. movdqa xmm4, xmmword ptr kRGBAToY
  1251. movdqa xmm5, xmmword ptr kAddY16
  1252. convertloop:
  1253. movdqu xmm0, [eax]
  1254. movdqu xmm1, [eax + 16]
  1255. movdqu xmm2, [eax + 32]
  1256. movdqu xmm3, [eax + 48]
  1257. pmaddubsw xmm0, xmm4
  1258. pmaddubsw xmm1, xmm4
  1259. pmaddubsw xmm2, xmm4
  1260. pmaddubsw xmm3, xmm4
  1261. lea eax, [eax + 64]
  1262. phaddw xmm0, xmm1
  1263. phaddw xmm2, xmm3
  1264. psrlw xmm0, 7
  1265. psrlw xmm2, 7
  1266. packuswb xmm0, xmm2
  1267. paddb xmm0, xmm5
  1268. movdqu [edx], xmm0
  1269. lea edx, [edx + 16]
  1270. sub ecx, 16
  1271. jg convertloop
  1272. ret
  1273. }
  1274. }
  1275. __declspec(naked)
  1276. void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1277. uint8* dst_u, uint8* dst_v, int width) {
  1278. __asm {
  1279. push esi
  1280. push edi
  1281. mov eax, [esp + 8 + 4] // src_argb
  1282. mov esi, [esp + 8 + 8] // src_stride_argb
  1283. mov edx, [esp + 8 + 12] // dst_u
  1284. mov edi, [esp + 8 + 16] // dst_v
  1285. mov ecx, [esp + 8 + 20] // width
  1286. movdqa xmm5, xmmword ptr kAddUV128
  1287. movdqa xmm6, xmmword ptr kARGBToV
  1288. movdqa xmm7, xmmword ptr kARGBToU
  1289. sub edi, edx // stride from u to v
  1290. convertloop:
  1291. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1292. movdqu xmm0, [eax]
  1293. movdqu xmm4, [eax + esi]
  1294. pavgb xmm0, xmm4
  1295. movdqu xmm1, [eax + 16]
  1296. movdqu xmm4, [eax + esi + 16]
  1297. pavgb xmm1, xmm4
  1298. movdqu xmm2, [eax + 32]
  1299. movdqu xmm4, [eax + esi + 32]
  1300. pavgb xmm2, xmm4
  1301. movdqu xmm3, [eax + 48]
  1302. movdqu xmm4, [eax + esi + 48]
  1303. pavgb xmm3, xmm4
  1304. lea eax, [eax + 64]
  1305. movdqa xmm4, xmm0
  1306. shufps xmm0, xmm1, 0x88
  1307. shufps xmm4, xmm1, 0xdd
  1308. pavgb xmm0, xmm4
  1309. movdqa xmm4, xmm2
  1310. shufps xmm2, xmm3, 0x88
  1311. shufps xmm4, xmm3, 0xdd
  1312. pavgb xmm2, xmm4
  1313. // step 2 - convert to U and V
  1314. // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1316. movdqa xmm1, xmm0
  1317. movdqa xmm3, xmm2
  1318. pmaddubsw xmm0, xmm7 // U
  1319. pmaddubsw xmm2, xmm7
  1320. pmaddubsw xmm1, xmm6 // V
  1321. pmaddubsw xmm3, xmm6
  1322. phaddw xmm0, xmm2
  1323. phaddw xmm1, xmm3
  1324. psraw xmm0, 8
  1325. psraw xmm1, 8
  1326. packsswb xmm0, xmm1
  1327. paddb xmm0, xmm5 // -> unsigned
  1328. // step 3 - store 8 U and 8 V values
  1329. movlps qword ptr [edx], xmm0 // U
  1330. movhps qword ptr [edx + edi], xmm0 // V
  1331. lea edx, [edx + 8]
  1332. sub ecx, 16
  1333. jg convertloop
  1334. pop edi
  1335. pop esi
  1336. ret
  1337. }
  1338. }
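// Illustrative scalar sketch (not part of libyuv): what one 2x2 block of ARGB
// pixels contributes in ARGBToUVRow_SSSE3 above. The block is averaged (the asm
// uses two rounding pavgb passes, which can differ from a true box average by 1),
// then U and V are formed against the kARGBToU / kARGBToV tables and biased to
// unsigned. The coefficients below are the nominal BT.601 weights scaled by 256
// and are an assumption; the tables defined earlier in this file are
// authoritative. The helper name is hypothetical.
static void ARGB2x2ToUV_Sketch(const uint8* row0, const uint8* row1,
                               uint8* u, uint8* v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) / 4;  // ARGB is B,G,R,A
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) / 4;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) / 4;
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}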
  1339. __declspec(naked)
  1340. void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1341. uint8* dst_u, uint8* dst_v, int width) {
  1342. __asm {
  1343. push esi
  1344. push edi
  1345. mov eax, [esp + 8 + 4] // src_argb
  1346. mov esi, [esp + 8 + 8] // src_stride_argb
  1347. mov edx, [esp + 8 + 12] // dst_u
  1348. mov edi, [esp + 8 + 16] // dst_v
  1349. mov ecx, [esp + 8 + 20] // width
  1350. movdqa xmm5, xmmword ptr kAddUVJ128
  1351. movdqa xmm6, xmmword ptr kARGBToVJ
  1352. movdqa xmm7, xmmword ptr kARGBToUJ
  1353. sub edi, edx // stride from u to v
  1354. convertloop:
  1355. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1356. movdqu xmm0, [eax]
  1357. movdqu xmm4, [eax + esi]
  1358. pavgb xmm0, xmm4
  1359. movdqu xmm1, [eax + 16]
  1360. movdqu xmm4, [eax + esi + 16]
  1361. pavgb xmm1, xmm4
  1362. movdqu xmm2, [eax + 32]
  1363. movdqu xmm4, [eax + esi + 32]
  1364. pavgb xmm2, xmm4
  1365. movdqu xmm3, [eax + 48]
  1366. movdqu xmm4, [eax + esi + 48]
  1367. pavgb xmm3, xmm4
  1368. lea eax, [eax + 64]
  1369. movdqa xmm4, xmm0
  1370. shufps xmm0, xmm1, 0x88
  1371. shufps xmm4, xmm1, 0xdd
  1372. pavgb xmm0, xmm4
  1373. movdqa xmm4, xmm2
  1374. shufps xmm2, xmm3, 0x88
  1375. shufps xmm4, xmm3, 0xdd
  1376. pavgb xmm2, xmm4
  1377. // step 2 - convert to U and V
  1378. // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1380. movdqa xmm1, xmm0
  1381. movdqa xmm3, xmm2
  1382. pmaddubsw xmm0, xmm7 // U
  1383. pmaddubsw xmm2, xmm7
  1384. pmaddubsw xmm1, xmm6 // V
  1385. pmaddubsw xmm3, xmm6
  1386. phaddw xmm0, xmm2
  1387. phaddw xmm1, xmm3
  1388. paddw xmm0, xmm5 // +.5 rounding -> unsigned
  1389. paddw xmm1, xmm5
  1390. psraw xmm0, 8
  1391. psraw xmm1, 8
  1392. packsswb xmm0, xmm1
  1393. // step 3 - store 8 U and 8 V values
  1394. movlps qword ptr [edx], xmm0 // U
  1395. movhps qword ptr [edx + edi], xmm0 // V
  1396. lea edx, [edx + 8]
  1397. sub ecx, 16
  1398. jg convertloop
  1399. pop edi
  1400. pop esi
  1401. ret
  1402. }
  1403. }
  1404. #ifdef HAS_ARGBTOUVROW_AVX2
  1405. __declspec(naked)
  1406. void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  1407. uint8* dst_u, uint8* dst_v, int width) {
  1408. __asm {
  1409. push esi
  1410. push edi
  1411. mov eax, [esp + 8 + 4] // src_argb
  1412. mov esi, [esp + 8 + 8] // src_stride_argb
  1413. mov edx, [esp + 8 + 12] // dst_u
  1414. mov edi, [esp + 8 + 16] // dst_v
  1415. mov ecx, [esp + 8 + 20] // width
  1416. vbroadcastf128 ymm5, xmmword ptr kAddUV128
  1417. vbroadcastf128 ymm6, xmmword ptr kARGBToV
  1418. vbroadcastf128 ymm7, xmmword ptr kARGBToU
  1419. sub edi, edx // stride from u to v
  1420. convertloop:
  1421. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1422. vmovdqu ymm0, [eax]
  1423. vmovdqu ymm1, [eax + 32]
  1424. vmovdqu ymm2, [eax + 64]
  1425. vmovdqu ymm3, [eax + 96]
  1426. vpavgb ymm0, ymm0, [eax + esi]
  1427. vpavgb ymm1, ymm1, [eax + esi + 32]
  1428. vpavgb ymm2, ymm2, [eax + esi + 64]
  1429. vpavgb ymm3, ymm3, [eax + esi + 96]
  1430. lea eax, [eax + 128]
  1431. vshufps ymm4, ymm0, ymm1, 0x88
  1432. vshufps ymm0, ymm0, ymm1, 0xdd
  1433. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1434. vshufps ymm4, ymm2, ymm3, 0x88
  1435. vshufps ymm2, ymm2, ymm3, 0xdd
  1436. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1437. // step 2 - convert to U and V
  1438. // from here down is very similar to Y code except
// instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1440. vpmaddubsw ymm1, ymm0, ymm7 // U
  1441. vpmaddubsw ymm3, ymm2, ymm7
  1442. vpmaddubsw ymm0, ymm0, ymm6 // V
  1443. vpmaddubsw ymm2, ymm2, ymm6
  1444. vphaddw ymm1, ymm1, ymm3 // mutates
  1445. vphaddw ymm0, ymm0, ymm2
  1446. vpsraw ymm1, ymm1, 8
  1447. vpsraw ymm0, ymm0, 8
  1448. vpacksswb ymm0, ymm1, ymm0 // mutates
  1449. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1450. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1451. vpaddb ymm0, ymm0, ymm5 // -> unsigned
  1452. // step 3 - store 16 U and 16 V values
  1453. vextractf128 [edx], ymm0, 0 // U
  1454. vextractf128 [edx + edi], ymm0, 1 // V
  1455. lea edx, [edx + 16]
  1456. sub ecx, 32
  1457. jg convertloop
  1458. pop edi
  1459. pop esi
  1460. vzeroupper
  1461. ret
  1462. }
  1463. }
  1464. #endif // HAS_ARGBTOUVROW_AVX2
  1465. #ifdef HAS_ARGBTOUVJROW_AVX2
  1466. __declspec(naked)
  1467. void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  1468. uint8* dst_u, uint8* dst_v, int width) {
  1469. __asm {
  1470. push esi
  1471. push edi
  1472. mov eax, [esp + 8 + 4] // src_argb
  1473. mov esi, [esp + 8 + 8] // src_stride_argb
  1474. mov edx, [esp + 8 + 12] // dst_u
  1475. mov edi, [esp + 8 + 16] // dst_v
  1476. mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
  1480. sub edi, edx // stride from u to v
  1481. convertloop:
  1482. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1483. vmovdqu ymm0, [eax]
  1484. vmovdqu ymm1, [eax + 32]
  1485. vmovdqu ymm2, [eax + 64]
  1486. vmovdqu ymm3, [eax + 96]
  1487. vpavgb ymm0, ymm0, [eax + esi]
  1488. vpavgb ymm1, ymm1, [eax + esi + 32]
  1489. vpavgb ymm2, ymm2, [eax + esi + 64]
  1490. vpavgb ymm3, ymm3, [eax + esi + 96]
  1491. lea eax, [eax + 128]
  1492. vshufps ymm4, ymm0, ymm1, 0x88
  1493. vshufps ymm0, ymm0, ymm1, 0xdd
  1494. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1495. vshufps ymm4, ymm2, ymm3, 0x88
  1496. vshufps ymm2, ymm2, ymm3, 0xdd
  1497. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1498. // step 2 - convert to U and V
  1499. // from here down is very similar to Y code except
// instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1501. vpmaddubsw ymm1, ymm0, ymm7 // U
  1502. vpmaddubsw ymm3, ymm2, ymm7
  1503. vpmaddubsw ymm0, ymm0, ymm6 // V
  1504. vpmaddubsw ymm2, ymm2, ymm6
  1505. vphaddw ymm1, ymm1, ymm3 // mutates
  1506. vphaddw ymm0, ymm0, ymm2
  1507. vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
  1508. vpaddw ymm0, ymm0, ymm5
  1509. vpsraw ymm1, ymm1, 8
  1510. vpsraw ymm0, ymm0, 8
  1511. vpacksswb ymm0, ymm1, ymm0 // mutates
  1512. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1513. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1514. // step 3 - store 16 U and 16 V values
  1515. vextractf128 [edx], ymm0, 0 // U
  1516. vextractf128 [edx + edi], ymm0, 1 // V
  1517. lea edx, [edx + 16]
  1518. sub ecx, 32
  1519. jg convertloop
  1520. pop edi
  1521. pop esi
  1522. vzeroupper
  1523. ret
  1524. }
  1525. }
  1526. #endif // HAS_ARGBTOUVJROW_AVX2
  1527. __declspec(naked)
  1528. void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
  1529. uint8* dst_u, uint8* dst_v, int width) {
  1530. __asm {
  1531. push edi
  1532. mov eax, [esp + 4 + 4] // src_argb
  1533. mov edx, [esp + 4 + 8] // dst_u
  1534. mov edi, [esp + 4 + 12] // dst_v
  1535. mov ecx, [esp + 4 + 16] // width
  1536. movdqa xmm5, xmmword ptr kAddUV128
  1537. movdqa xmm6, xmmword ptr kARGBToV
  1538. movdqa xmm7, xmmword ptr kARGBToU
  1539. sub edi, edx // stride from u to v
  1540. convertloop:
  1541. /* convert to U and V */
  1542. movdqu xmm0, [eax] // U
  1543. movdqu xmm1, [eax + 16]
  1544. movdqu xmm2, [eax + 32]
  1545. movdqu xmm3, [eax + 48]
  1546. pmaddubsw xmm0, xmm7
  1547. pmaddubsw xmm1, xmm7
  1548. pmaddubsw xmm2, xmm7
  1549. pmaddubsw xmm3, xmm7
  1550. phaddw xmm0, xmm1
  1551. phaddw xmm2, xmm3
  1552. psraw xmm0, 8
  1553. psraw xmm2, 8
  1554. packsswb xmm0, xmm2
  1555. paddb xmm0, xmm5
  1556. movdqu [edx], xmm0
  1557. movdqu xmm0, [eax] // V
  1558. movdqu xmm1, [eax + 16]
  1559. movdqu xmm2, [eax + 32]
  1560. movdqu xmm3, [eax + 48]
  1561. pmaddubsw xmm0, xmm6
  1562. pmaddubsw xmm1, xmm6
  1563. pmaddubsw xmm2, xmm6
  1564. pmaddubsw xmm3, xmm6
  1565. phaddw xmm0, xmm1
  1566. phaddw xmm2, xmm3
  1567. psraw xmm0, 8
  1568. psraw xmm2, 8
  1569. packsswb xmm0, xmm2
  1570. paddb xmm0, xmm5
  1571. lea eax, [eax + 64]
  1572. movdqu [edx + edi], xmm0
  1573. lea edx, [edx + 16]
  1574. sub ecx, 16
  1575. jg convertloop
  1576. pop edi
  1577. ret
  1578. }
  1579. }
  1580. __declspec(naked)
  1581. void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1582. uint8* dst_u, uint8* dst_v, int width) {
  1583. __asm {
  1584. push esi
  1585. push edi
  1586. mov eax, [esp + 8 + 4] // src_argb
  1587. mov esi, [esp + 8 + 8] // src_stride_argb
  1588. mov edx, [esp + 8 + 12] // dst_u
  1589. mov edi, [esp + 8 + 16] // dst_v
  1590. mov ecx, [esp + 8 + 20] // width
  1591. movdqa xmm5, xmmword ptr kAddUV128
  1592. movdqa xmm6, xmmword ptr kBGRAToV
  1593. movdqa xmm7, xmmword ptr kBGRAToU
  1594. sub edi, edx // stride from u to v
  1595. convertloop:
  1596. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1597. movdqu xmm0, [eax]
  1598. movdqu xmm4, [eax + esi]
  1599. pavgb xmm0, xmm4
  1600. movdqu xmm1, [eax + 16]
  1601. movdqu xmm4, [eax + esi + 16]
  1602. pavgb xmm1, xmm4
  1603. movdqu xmm2, [eax + 32]
  1604. movdqu xmm4, [eax + esi + 32]
  1605. pavgb xmm2, xmm4
  1606. movdqu xmm3, [eax + 48]
  1607. movdqu xmm4, [eax + esi + 48]
  1608. pavgb xmm3, xmm4
  1609. lea eax, [eax + 64]
  1610. movdqa xmm4, xmm0
  1611. shufps xmm0, xmm1, 0x88
  1612. shufps xmm4, xmm1, 0xdd
  1613. pavgb xmm0, xmm4
  1614. movdqa xmm4, xmm2
  1615. shufps xmm2, xmm3, 0x88
  1616. shufps xmm4, xmm3, 0xdd
  1617. pavgb xmm2, xmm4
  1618. // step 2 - convert to U and V
  1619. // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1621. movdqa xmm1, xmm0
  1622. movdqa xmm3, xmm2
  1623. pmaddubsw xmm0, xmm7 // U
  1624. pmaddubsw xmm2, xmm7
  1625. pmaddubsw xmm1, xmm6 // V
  1626. pmaddubsw xmm3, xmm6
  1627. phaddw xmm0, xmm2
  1628. phaddw xmm1, xmm3
  1629. psraw xmm0, 8
  1630. psraw xmm1, 8
  1631. packsswb xmm0, xmm1
  1632. paddb xmm0, xmm5 // -> unsigned
  1633. // step 3 - store 8 U and 8 V values
  1634. movlps qword ptr [edx], xmm0 // U
  1635. movhps qword ptr [edx + edi], xmm0 // V
  1636. lea edx, [edx + 8]
  1637. sub ecx, 16
  1638. jg convertloop
  1639. pop edi
  1640. pop esi
  1641. ret
  1642. }
  1643. }
  1644. __declspec(naked)
  1645. void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1646. uint8* dst_u, uint8* dst_v, int width) {
  1647. __asm {
  1648. push esi
  1649. push edi
  1650. mov eax, [esp + 8 + 4] // src_argb
  1651. mov esi, [esp + 8 + 8] // src_stride_argb
  1652. mov edx, [esp + 8 + 12] // dst_u
  1653. mov edi, [esp + 8 + 16] // dst_v
  1654. mov ecx, [esp + 8 + 20] // width
  1655. movdqa xmm5, xmmword ptr kAddUV128
  1656. movdqa xmm6, xmmword ptr kABGRToV
  1657. movdqa xmm7, xmmword ptr kABGRToU
  1658. sub edi, edx // stride from u to v
  1659. convertloop:
  1660. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1661. movdqu xmm0, [eax]
  1662. movdqu xmm4, [eax + esi]
  1663. pavgb xmm0, xmm4
  1664. movdqu xmm1, [eax + 16]
  1665. movdqu xmm4, [eax + esi + 16]
  1666. pavgb xmm1, xmm4
  1667. movdqu xmm2, [eax + 32]
  1668. movdqu xmm4, [eax + esi + 32]
  1669. pavgb xmm2, xmm4
  1670. movdqu xmm3, [eax + 48]
  1671. movdqu xmm4, [eax + esi + 48]
  1672. pavgb xmm3, xmm4
  1673. lea eax, [eax + 64]
  1674. movdqa xmm4, xmm0
  1675. shufps xmm0, xmm1, 0x88
  1676. shufps xmm4, xmm1, 0xdd
  1677. pavgb xmm0, xmm4
  1678. movdqa xmm4, xmm2
  1679. shufps xmm2, xmm3, 0x88
  1680. shufps xmm4, xmm3, 0xdd
  1681. pavgb xmm2, xmm4
  1682. // step 2 - convert to U and V
  1683. // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1685. movdqa xmm1, xmm0
  1686. movdqa xmm3, xmm2
  1687. pmaddubsw xmm0, xmm7 // U
  1688. pmaddubsw xmm2, xmm7
  1689. pmaddubsw xmm1, xmm6 // V
  1690. pmaddubsw xmm3, xmm6
  1691. phaddw xmm0, xmm2
  1692. phaddw xmm1, xmm3
  1693. psraw xmm0, 8
  1694. psraw xmm1, 8
  1695. packsswb xmm0, xmm1
  1696. paddb xmm0, xmm5 // -> unsigned
  1697. // step 3 - store 8 U and 8 V values
  1698. movlps qword ptr [edx], xmm0 // U
  1699. movhps qword ptr [edx + edi], xmm0 // V
  1700. lea edx, [edx + 8]
  1701. sub ecx, 16
  1702. jg convertloop
  1703. pop edi
  1704. pop esi
  1705. ret
  1706. }
  1707. }
  1708. __declspec(naked)
  1709. void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1710. uint8* dst_u, uint8* dst_v, int width) {
  1711. __asm {
  1712. push esi
  1713. push edi
  1714. mov eax, [esp + 8 + 4] // src_argb
  1715. mov esi, [esp + 8 + 8] // src_stride_argb
  1716. mov edx, [esp + 8 + 12] // dst_u
  1717. mov edi, [esp + 8 + 16] // dst_v
  1718. mov ecx, [esp + 8 + 20] // width
  1719. movdqa xmm5, xmmword ptr kAddUV128
  1720. movdqa xmm6, xmmword ptr kRGBAToV
  1721. movdqa xmm7, xmmword ptr kRGBAToU
  1722. sub edi, edx // stride from u to v
  1723. convertloop:
  1724. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1725. movdqu xmm0, [eax]
  1726. movdqu xmm4, [eax + esi]
  1727. pavgb xmm0, xmm4
  1728. movdqu xmm1, [eax + 16]
  1729. movdqu xmm4, [eax + esi + 16]
  1730. pavgb xmm1, xmm4
  1731. movdqu xmm2, [eax + 32]
  1732. movdqu xmm4, [eax + esi + 32]
  1733. pavgb xmm2, xmm4
  1734. movdqu xmm3, [eax + 48]
  1735. movdqu xmm4, [eax + esi + 48]
  1736. pavgb xmm3, xmm4
  1737. lea eax, [eax + 64]
  1738. movdqa xmm4, xmm0
  1739. shufps xmm0, xmm1, 0x88
  1740. shufps xmm4, xmm1, 0xdd
  1741. pavgb xmm0, xmm4
  1742. movdqa xmm4, xmm2
  1743. shufps xmm2, xmm3, 0x88
  1744. shufps xmm4, xmm3, 0xdd
  1745. pavgb xmm2, xmm4
  1746. // step 2 - convert to U and V
  1747. // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1749. movdqa xmm1, xmm0
  1750. movdqa xmm3, xmm2
  1751. pmaddubsw xmm0, xmm7 // U
  1752. pmaddubsw xmm2, xmm7
  1753. pmaddubsw xmm1, xmm6 // V
  1754. pmaddubsw xmm3, xmm6
  1755. phaddw xmm0, xmm2
  1756. phaddw xmm1, xmm3
  1757. psraw xmm0, 8
  1758. psraw xmm1, 8
  1759. packsswb xmm0, xmm1
  1760. paddb xmm0, xmm5 // -> unsigned
  1761. // step 3 - store 8 U and 8 V values
  1762. movlps qword ptr [edx], xmm0 // U
  1763. movhps qword ptr [edx + edi], xmm0 // V
  1764. lea edx, [edx + 8]
  1765. sub ecx, 16
  1766. jg convertloop
  1767. pop edi
  1768. pop esi
  1769. ret
  1770. }
  1771. }
  1772. #endif // HAS_ARGBTOYROW_SSSE3
  1773. // Read 16 UV from 444
  1774. #define READYUV444_AVX2 __asm { \
  1775. __asm vmovdqu xmm0, [esi] /* U */ \
  1776. __asm vmovdqu xmm1, [esi + edi] /* V */ \
  1777. __asm lea esi, [esi + 16] \
  1778. __asm vpermq ymm0, ymm0, 0xd8 \
  1779. __asm vpermq ymm1, ymm1, 0xd8 \
  1780. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1781. __asm vmovdqu xmm4, [eax] /* Y */ \
  1782. __asm vpermq ymm4, ymm4, 0xd8 \
  1783. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1784. __asm lea eax, [eax + 16] \
  1785. }
  1786. // Read 8 UV from 422, upsample to 16 UV.
  1787. #define READYUV422_AVX2 __asm { \
  1788. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1789. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1790. __asm lea esi, [esi + 8] \
  1791. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1792. __asm vpermq ymm0, ymm0, 0xd8 \
  1793. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1794. __asm vmovdqu xmm4, [eax] /* Y */ \
  1795. __asm vpermq ymm4, ymm4, 0xd8 \
  1796. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1797. __asm lea eax, [eax + 16] \
  1798. }
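// Illustrative scalar sketch (not part of libyuv): the horizontal chroma
// upsampling the READYUV422 macros perform. Each U,V sample covers two output
// pixels, so the pair is simply duplicated (nearest neighbour); the SIMD code
// gets the same effect from punpcklwd/vpunpcklwd on the interleaved UV bytes.
// The helper name is hypothetical.
static void UpsampleUV422To444_Sketch(const uint8* u, const uint8* v,
                                      uint8* uv444, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    uv444[i * 4 + 0] = u[i];  // U for the even pixel
    uv444[i * 4 + 1] = v[i];  // V for the even pixel
    uv444[i * 4 + 2] = u[i];  // U duplicated for the odd pixel
    uv444[i * 4 + 3] = v[i];  // V duplicated for the odd pixel
  }
}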
  1799. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  1800. #define READYUVA422_AVX2 __asm { \
  1801. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1802. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1803. __asm lea esi, [esi + 8] \
  1804. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1805. __asm vpermq ymm0, ymm0, 0xd8 \
  1806. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1807. __asm vmovdqu xmm4, [eax] /* Y */ \
  1808. __asm vpermq ymm4, ymm4, 0xd8 \
  1809. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1810. __asm lea eax, [eax + 16] \
  1811. __asm vmovdqu xmm5, [ebp] /* A */ \
  1812. __asm vpermq ymm5, ymm5, 0xd8 \
  1813. __asm lea ebp, [ebp + 16] \
  1814. }
  1815. // Read 4 UV from 411, upsample to 16 UV.
  1816. #define READYUV411_AVX2 __asm { \
  1817. __asm vmovd xmm0, dword ptr [esi] /* U */ \
  1818. __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
  1819. __asm lea esi, [esi + 4] \
  1820. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1821. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1822. __asm vpermq ymm0, ymm0, 0xd8 \
  1823. __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
  1824. __asm vmovdqu xmm4, [eax] /* Y */ \
  1825. __asm vpermq ymm4, ymm4, 0xd8 \
  1826. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1827. __asm lea eax, [eax + 16] \
  1828. }
  1829. // Read 8 UV from NV12, upsample to 16 UV.
  1830. #define READNV12_AVX2 __asm { \
  1831. __asm vmovdqu xmm0, [esi] /* UV */ \
  1832. __asm lea esi, [esi + 16] \
  1833. __asm vpermq ymm0, ymm0, 0xd8 \
  1834. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1835. __asm vmovdqu xmm4, [eax] /* Y */ \
  1836. __asm vpermq ymm4, ymm4, 0xd8 \
  1837. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1838. __asm lea eax, [eax + 16] \
  1839. }
  1840. // Read 8 UV from NV21, upsample to 16 UV.
  1841. #define READNV21_AVX2 __asm { \
  1842. __asm vmovdqu xmm0, [esi] /* UV */ \
  1843. __asm lea esi, [esi + 16] \
  1844. __asm vpermq ymm0, ymm0, 0xd8 \
  1845. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
  1846. __asm vmovdqu xmm4, [eax] /* Y */ \
  1847. __asm vpermq ymm4, ymm4, 0xd8 \
  1848. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1849. __asm lea eax, [eax + 16] \
  1850. }
  1851. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  1852. #define READYUY2_AVX2 __asm { \
  1853. __asm vmovdqu ymm4, [eax] /* YUY2 */ \
  1854. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
  1855. __asm vmovdqu ymm0, [eax] /* UV */ \
  1856. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
  1857. __asm lea eax, [eax + 32] \
  1858. }
  1859. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  1860. #define READUYVY_AVX2 __asm { \
  1861. __asm vmovdqu ymm4, [eax] /* UYVY */ \
  1862. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
  1863. __asm vmovdqu ymm0, [eax] /* UV */ \
  1864. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
  1865. __asm lea eax, [eax + 32] \
  1866. }
  1867. // Convert 16 pixels: 16 UV and 16 Y.
  1868. #define YUVTORGB_AVX2(YuvConstants) __asm { \
  1869. __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
  1870. __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
  1871. __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
  1872. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
  1873. __asm vpsubw ymm2, ymm3, ymm2 \
  1874. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
  1875. __asm vpsubw ymm1, ymm3, ymm1 \
  1876. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
  1877. __asm vpsubw ymm0, ymm3, ymm0 \
  1878. /* Step 2: Find Y contribution to 16 R,G,B values */ \
  1879. __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
  1880. __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
  1881. __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
  1882. __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
  1883. __asm vpsraw ymm0, ymm0, 6 \
  1884. __asm vpsraw ymm1, ymm1, 6 \
  1885. __asm vpsraw ymm2, ymm2, 6 \
  1886. __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
  1887. __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
  1888. __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
  1889. }
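// Illustrative scalar sketch (not part of libyuv): the conversion YUVTORGB_AVX2
// performs in biased fixed point. The macro scales the replicated Y by a luma
// gain (vpmulhuw with KYTORGB), subtracts the UV products from per-channel bias
// constants, adds the Y term and shifts right by 6. The widely used BT.601
// integer form below is an equivalent-in-spirit reference, not the exact
// fixed-point scheme stored in YuvConstants. Helper names are hypothetical.
static uint8 Clamp255_Sketch(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixelToRGB_Sketch(uint8 y, uint8 u, uint8 v,
                                 uint8* b, uint8* g, uint8* r) {
  int c = ((int)y - 16) * 298;
  int d = (int)u - 128;
  int e = (int)v - 128;
  *b = Clamp255_Sketch((c + 516 * d + 128) >> 8);
  *g = Clamp255_Sketch((c - 100 * d - 208 * e + 128) >> 8);
  *r = Clamp255_Sketch((c + 409 * e + 128) >> 8);
}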
  1890. // Store 16 ARGB values.
  1891. #define STOREARGB_AVX2 __asm { \
  1892. __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
  1893. __asm vpermq ymm0, ymm0, 0xd8 \
  1894. __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
  1895. __asm vpermq ymm2, ymm2, 0xd8 \
  1896. __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
  1897. __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
  1898. __asm vmovdqu 0[edx], ymm1 \
  1899. __asm vmovdqu 32[edx], ymm0 \
  1900. __asm lea edx, [edx + 64] \
  1901. }
  1902. // Store 16 RGBA values.
  1903. #define STORERGBA_AVX2 __asm { \
  1904. __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
  1905. __asm vpermq ymm1, ymm1, 0xd8 \
  1906. __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
  1907. __asm vpermq ymm2, ymm2, 0xd8 \
  1908. __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
  1909. __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
  1910. __asm vmovdqu [edx], ymm0 \
  1911. __asm vmovdqu [edx + 32], ymm1 \
  1912. __asm lea edx, [edx + 64] \
  1913. }
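// Note on byte order: in libyuv "ARGB" means the bytes land in memory as
// B, G, R, A (the little-endian dword 0xAARRGGBB), which is why STOREARGB_AVX2
// interleaves B with G and R with A, while STORERGBA_AVX2 writes A, B, G, R.
// A minimal scalar sketch of the two layouts (hypothetical helper, not part of
// libyuv):
static void StorePixel_Sketch(uint8 b, uint8 g, uint8 r, uint8 a,
                              uint8* dst_argb, uint8* dst_rgba) {
  dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; dst_argb[3] = a;
  dst_rgba[0] = a; dst_rgba[1] = b; dst_rgba[2] = g; dst_rgba[3] = r;
}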
  1914. #ifdef HAS_I422TOARGBROW_AVX2
  1915. // 16 pixels
  1916. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  1917. __declspec(naked)
  1918. void I422ToARGBRow_AVX2(const uint8* y_buf,
  1919. const uint8* u_buf,
  1920. const uint8* v_buf,
  1921. uint8* dst_argb,
  1922. const struct YuvConstants* yuvconstants,
  1923. int width) {
  1924. __asm {
  1925. push esi
  1926. push edi
  1927. push ebx
  1928. mov eax, [esp + 12 + 4] // Y
  1929. mov esi, [esp + 12 + 8] // U
  1930. mov edi, [esp + 12 + 12] // V
  1931. mov edx, [esp + 12 + 16] // argb
  1932. mov ebx, [esp + 12 + 20] // yuvconstants
  1933. mov ecx, [esp + 12 + 24] // width
  1934. sub edi, esi
  1935. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1936. convertloop:
  1937. READYUV422_AVX2
  1938. YUVTORGB_AVX2(ebx)
  1939. STOREARGB_AVX2
  1940. sub ecx, 16
  1941. jg convertloop
  1942. pop ebx
  1943. pop edi
  1944. pop esi
  1945. vzeroupper
  1946. ret
  1947. }
  1948. }
  1949. #endif // HAS_I422TOARGBROW_AVX2
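// Illustrative scalar sketch (not part of libyuv's API): the whole-row shape of
// an I422 to ARGB conversion like the row above, where two horizontally adjacent
// pixels share one U and one V sample. It reuses the hypothetical
// YuvPixelToRGB_Sketch helper sketched earlier; odd-width handling is omitted.
static void I422ToARGBRow_Sketch(const uint8* y_buf, const uint8* u_buf,
                                 const uint8* v_buf, uint8* dst_argb,
                                 int width) {
  for (int x = 0; x + 1 < width; x += 2) {
    uint8 b, g, r;
    YuvPixelToRGB_Sketch(y_buf[x], u_buf[x / 2], v_buf[x / 2], &b, &g, &r);
    dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; dst_argb[3] = 255;
    YuvPixelToRGB_Sketch(y_buf[x + 1], u_buf[x / 2], v_buf[x / 2], &b, &g, &r);
    dst_argb[4] = b; dst_argb[5] = g; dst_argb[6] = r; dst_argb[7] = 255;
    dst_argb += 8;
  }
}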
  1950. #ifdef HAS_I422ALPHATOARGBROW_AVX2
  1951. // 16 pixels
  1952. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  1953. __declspec(naked)
  1954. void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
  1955. const uint8* u_buf,
  1956. const uint8* v_buf,
  1957. const uint8* a_buf,
  1958. uint8* dst_argb,
  1959. const struct YuvConstants* yuvconstants,
  1960. int width) {
  1961. __asm {
  1962. push esi
  1963. push edi
  1964. push ebx
  1965. push ebp
  1966. mov eax, [esp + 16 + 4] // Y
  1967. mov esi, [esp + 16 + 8] // U
  1968. mov edi, [esp + 16 + 12] // V
  1969. mov ebp, [esp + 16 + 16] // A
  1970. mov edx, [esp + 16 + 20] // argb
  1971. mov ebx, [esp + 16 + 24] // yuvconstants
  1972. mov ecx, [esp + 16 + 28] // width
  1973. sub edi, esi
  1974. convertloop:
  1975. READYUVA422_AVX2
  1976. YUVTORGB_AVX2(ebx)
  1977. STOREARGB_AVX2
  1978. sub ecx, 16
  1979. jg convertloop
  1980. pop ebp
  1981. pop ebx
  1982. pop edi
  1983. pop esi
  1984. vzeroupper
  1985. ret
  1986. }
  1987. }
  1988. #endif // HAS_I422ALPHATOARGBROW_AVX2
  1989. #ifdef HAS_I444TOARGBROW_AVX2
  1990. // 16 pixels
  1991. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  1992. __declspec(naked)
  1993. void I444ToARGBRow_AVX2(const uint8* y_buf,
  1994. const uint8* u_buf,
  1995. const uint8* v_buf,
  1996. uint8* dst_argb,
  1997. const struct YuvConstants* yuvconstants,
  1998. int width) {
  1999. __asm {
  2000. push esi
  2001. push edi
  2002. push ebx
  2003. mov eax, [esp + 12 + 4] // Y
  2004. mov esi, [esp + 12 + 8] // U
  2005. mov edi, [esp + 12 + 12] // V
  2006. mov edx, [esp + 12 + 16] // argb
  2007. mov ebx, [esp + 12 + 20] // yuvconstants
  2008. mov ecx, [esp + 12 + 24] // width
  2009. sub edi, esi
  2010. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2011. convertloop:
  2012. READYUV444_AVX2
  2013. YUVTORGB_AVX2(ebx)
  2014. STOREARGB_AVX2
  2015. sub ecx, 16
  2016. jg convertloop
  2017. pop ebx
  2018. pop edi
  2019. pop esi
  2020. vzeroupper
  2021. ret
  2022. }
  2023. }
  2024. #endif // HAS_I444TOARGBROW_AVX2
  2025. #ifdef HAS_I411TOARGBROW_AVX2
  2026. // 16 pixels
  2027. // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2028. __declspec(naked)
  2029. void I411ToARGBRow_AVX2(const uint8* y_buf,
  2030. const uint8* u_buf,
  2031. const uint8* v_buf,
  2032. uint8* dst_argb,
  2033. const struct YuvConstants* yuvconstants,
  2034. int width) {
  2035. __asm {
  2036. push esi
  2037. push edi
  2038. push ebx
  2039. mov eax, [esp + 12 + 4] // Y
  2040. mov esi, [esp + 12 + 8] // U
  2041. mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
  2043. mov ebx, [esp + 12 + 20] // yuvconstants
  2044. mov ecx, [esp + 12 + 24] // width
  2045. sub edi, esi
  2046. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2047. convertloop:
  2048. READYUV411_AVX2
  2049. YUVTORGB_AVX2(ebx)
  2050. STOREARGB_AVX2
  2051. sub ecx, 16
  2052. jg convertloop
  2053. pop ebx
  2054. pop edi
  2055. pop esi
  2056. vzeroupper
  2057. ret
  2058. }
  2059. }
  2060. #endif // HAS_I411TOARGBROW_AVX2
  2061. #ifdef HAS_NV12TOARGBROW_AVX2
  2062. // 16 pixels.
  2063. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2064. __declspec(naked)
  2065. void NV12ToARGBRow_AVX2(const uint8* y_buf,
  2066. const uint8* uv_buf,
  2067. uint8* dst_argb,
  2068. const struct YuvConstants* yuvconstants,
  2069. int width) {
  2070. __asm {
  2071. push esi
  2072. push ebx
  2073. mov eax, [esp + 8 + 4] // Y
  2074. mov esi, [esp + 8 + 8] // UV
  2075. mov edx, [esp + 8 + 12] // argb
  2076. mov ebx, [esp + 8 + 16] // yuvconstants
  2077. mov ecx, [esp + 8 + 20] // width
  2078. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2079. convertloop:
  2080. READNV12_AVX2
  2081. YUVTORGB_AVX2(ebx)
  2082. STOREARGB_AVX2
  2083. sub ecx, 16
  2084. jg convertloop
  2085. pop ebx
  2086. pop esi
  2087. vzeroupper
  2088. ret
  2089. }
  2090. }
  2091. #endif // HAS_NV12TOARGBROW_AVX2
  2092. #ifdef HAS_NV21TOARGBROW_AVX2
  2093. // 16 pixels.
  2094. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2095. __declspec(naked)
  2096. void NV21ToARGBRow_AVX2(const uint8* y_buf,
  2097. const uint8* vu_buf,
  2098. uint8* dst_argb,
  2099. const struct YuvConstants* yuvconstants,
  2100. int width) {
  2101. __asm {
  2102. push esi
  2103. push ebx
  2104. mov eax, [esp + 8 + 4] // Y
  2105. mov esi, [esp + 8 + 8] // VU
  2106. mov edx, [esp + 8 + 12] // argb
  2107. mov ebx, [esp + 8 + 16] // yuvconstants
  2108. mov ecx, [esp + 8 + 20] // width
  2109. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2110. convertloop:
  2111. READNV21_AVX2
  2112. YUVTORGB_AVX2(ebx)
  2113. STOREARGB_AVX2
  2114. sub ecx, 16
  2115. jg convertloop
  2116. pop ebx
  2117. pop esi
  2118. vzeroupper
  2119. ret
  2120. }
  2121. }
  2122. #endif // HAS_NV21TOARGBROW_AVX2
  2123. #ifdef HAS_YUY2TOARGBROW_AVX2
  2124. // 16 pixels.
  2125. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2126. __declspec(naked)
  2127. void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
  2128. uint8* dst_argb,
  2129. const struct YuvConstants* yuvconstants,
  2130. int width) {
  2131. __asm {
  2132. push ebx
  2133. mov eax, [esp + 4 + 4] // yuy2
  2134. mov edx, [esp + 4 + 8] // argb
  2135. mov ebx, [esp + 4 + 12] // yuvconstants
  2136. mov ecx, [esp + 4 + 16] // width
  2137. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2138. convertloop:
  2139. READYUY2_AVX2
  2140. YUVTORGB_AVX2(ebx)
  2141. STOREARGB_AVX2
  2142. sub ecx, 16
  2143. jg convertloop
  2144. pop ebx
  2145. vzeroupper
  2146. ret
  2147. }
  2148. }
  2149. #endif // HAS_YUY2TOARGBROW_AVX2
  2150. #ifdef HAS_UYVYTOARGBROW_AVX2
  2151. // 16 pixels.
  2152. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2153. __declspec(naked)
  2154. void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
  2155. uint8* dst_argb,
  2156. const struct YuvConstants* yuvconstants,
  2157. int width) {
  2158. __asm {
  2159. push ebx
  2160. mov eax, [esp + 4 + 4] // uyvy
  2161. mov edx, [esp + 4 + 8] // argb
  2162. mov ebx, [esp + 4 + 12] // yuvconstants
  2163. mov ecx, [esp + 4 + 16] // width
  2164. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2165. convertloop:
  2166. READUYVY_AVX2
  2167. YUVTORGB_AVX2(ebx)
  2168. STOREARGB_AVX2
  2169. sub ecx, 16
  2170. jg convertloop
  2171. pop ebx
  2172. vzeroupper
  2173. ret
  2174. }
  2175. }
  2176. #endif // HAS_UYVYTOARGBROW_AVX2
  2177. #ifdef HAS_I422TORGBAROW_AVX2
  2178. // 16 pixels
  2179. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2180. __declspec(naked)
  2181. void I422ToRGBARow_AVX2(const uint8* y_buf,
  2182. const uint8* u_buf,
  2183. const uint8* v_buf,
  2184. uint8* dst_argb,
  2185. const struct YuvConstants* yuvconstants,
  2186. int width) {
  2187. __asm {
  2188. push esi
  2189. push edi
  2190. push ebx
  2191. mov eax, [esp + 12 + 4] // Y
  2192. mov esi, [esp + 12 + 8] // U
  2193. mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
  2195. mov ebx, [esp + 12 + 20] // yuvconstants
  2196. mov ecx, [esp + 12 + 24] // width
  2197. sub edi, esi
  2198. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2199. convertloop:
  2200. READYUV422_AVX2
  2201. YUVTORGB_AVX2(ebx)
  2202. STORERGBA_AVX2
  2203. sub ecx, 16
  2204. jg convertloop
  2205. pop ebx
  2206. pop edi
  2207. pop esi
  2208. vzeroupper
  2209. ret
  2210. }
  2211. }
  2212. #endif // HAS_I422TORGBAROW_AVX2
  2213. #if defined(HAS_I422TOARGBROW_SSSE3)
  2214. // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
  2215. // Allows a conversion with half size scaling.
  2216. // Read 8 UV from 444.
  2217. #define READYUV444 __asm { \
  2218. __asm movq xmm0, qword ptr [esi] /* U */ \
  2219. __asm movq xmm1, qword ptr [esi + edi] /* V */ \
  2220. __asm lea esi, [esi + 8] \
  2221. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2222. __asm movq xmm4, qword ptr [eax] \
  2223. __asm punpcklbw xmm4, xmm4 \
  2224. __asm lea eax, [eax + 8] \
  2225. }
  2226. // Read 4 UV from 422, upsample to 8 UV.
  2227. #define READYUV422 __asm { \
  2228. __asm movd xmm0, [esi] /* U */ \
  2229. __asm movd xmm1, [esi + edi] /* V */ \
  2230. __asm lea esi, [esi + 4] \
  2231. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2232. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2233. __asm movq xmm4, qword ptr [eax] \
  2234. __asm punpcklbw xmm4, xmm4 \
  2235. __asm lea eax, [eax + 8] \
  2236. }
  2237. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  2238. #define READYUVA422 __asm { \
  2239. __asm movd xmm0, [esi] /* U */ \
  2240. __asm movd xmm1, [esi + edi] /* V */ \
  2241. __asm lea esi, [esi + 4] \
  2242. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2243. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2244. __asm movq xmm4, qword ptr [eax] /* Y */ \
  2245. __asm punpcklbw xmm4, xmm4 \
  2246. __asm lea eax, [eax + 8] \
  2247. __asm movq xmm5, qword ptr [ebp] /* A */ \
  2248. __asm lea ebp, [ebp + 8] \
  2249. }
  2250. // Read 2 UV from 411, upsample to 8 UV.
  2251. // drmemory fails with memory fault if pinsrw used. libyuv bug: 525
  2252. // __asm pinsrw xmm0, [esi], 0 /* U */
  2253. // __asm pinsrw xmm1, [esi + edi], 0 /* V */
  2254. #define READYUV411_EBX __asm { \
  2255. __asm movzx ebx, word ptr [esi] /* U */ \
  2256. __asm movd xmm0, ebx \
  2257. __asm movzx ebx, word ptr [esi + edi] /* V */ \
  2258. __asm movd xmm1, ebx \
  2259. __asm lea esi, [esi + 2] \
  2260. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2261. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2262. __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
  2263. __asm movq xmm4, qword ptr [eax] \
  2264. __asm punpcklbw xmm4, xmm4 \
  2265. __asm lea eax, [eax + 8] \
  2266. }
  2267. // Read 4 UV from NV12, upsample to 8 UV.
  2268. #define READNV12 __asm { \
  2269. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2270. __asm lea esi, [esi + 8] \
  2271. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2272. __asm movq xmm4, qword ptr [eax] \
  2273. __asm punpcklbw xmm4, xmm4 \
  2274. __asm lea eax, [eax + 8] \
  2275. }
  2276. // Read 4 VU from NV21, upsample to 8 UV.
  2277. #define READNV21 __asm { \
  2278. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2279. __asm lea esi, [esi + 8] \
  2280. __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
  2281. __asm movq xmm4, qword ptr [eax] \
  2282. __asm punpcklbw xmm4, xmm4 \
  2283. __asm lea eax, [eax + 8] \
  2284. }
  2285. // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
  2286. #define READYUY2 __asm { \
  2287. __asm movdqu xmm4, [eax] /* YUY2 */ \
  2288. __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
  2289. __asm movdqu xmm0, [eax] /* UV */ \
  2290. __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
  2291. __asm lea eax, [eax + 16] \
  2292. }
  2293. // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
  2294. #define READUYVY __asm { \
  2295. __asm movdqu xmm4, [eax] /* UYVY */ \
  2296. __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
  2297. __asm movdqu xmm0, [eax] /* UV */ \
  2298. __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
  2299. __asm lea eax, [eax + 16] \
  2300. }
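// Illustrative scalar sketch (not part of libyuv): the packed layouts the
// READYUY2 / READUYVY macros unpack with pshufb. For each pair of pixels, YUY2
// stores Y0, U, Y1, V and UYVY stores U, Y0, V, Y1. Helper names are
// hypothetical.
static void UnpackYUY2Pair_Sketch(const uint8* yuy2, uint8* y0, uint8* y1,
                                  uint8* u, uint8* v) {
  *y0 = yuy2[0]; *u = yuy2[1]; *y1 = yuy2[2]; *v = yuy2[3];
}
static void UnpackUYVYPair_Sketch(const uint8* uyvy, uint8* y0, uint8* y1,
                                  uint8* u, uint8* v) {
  *u = uyvy[0]; *y0 = uyvy[1]; *v = uyvy[2]; *y1 = uyvy[3];
}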
  2301. // Convert 8 pixels: 8 UV and 8 Y.
  2302. #define YUVTORGB(YuvConstants) __asm { \
  2303. __asm movdqa xmm1, xmm0 \
  2304. __asm movdqa xmm2, xmm0 \
  2305. __asm movdqa xmm3, xmm0 \
  2306. __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
  2307. __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
  2308. __asm psubw xmm0, xmm1 \
  2309. __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
  2310. __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
  2311. __asm psubw xmm1, xmm2 \
  2312. __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
  2313. __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
  2314. __asm psubw xmm2, xmm3 \
  2315. __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
  2316. __asm paddsw xmm0, xmm4 /* B += Y */ \
  2317. __asm paddsw xmm1, xmm4 /* G += Y */ \
  2318. __asm paddsw xmm2, xmm4 /* R += Y */ \
  2319. __asm psraw xmm0, 6 \
  2320. __asm psraw xmm1, 6 \
  2321. __asm psraw xmm2, 6 \
  2322. __asm packuswb xmm0, xmm0 /* B */ \
  2323. __asm packuswb xmm1, xmm1 /* G */ \
  2324. __asm packuswb xmm2, xmm2 /* R */ \
  2325. }
  2326. // Store 8 ARGB values.
  2327. #define STOREARGB __asm { \
  2328. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2329. __asm punpcklbw xmm2, xmm5 /* RA */ \
  2330. __asm movdqa xmm1, xmm0 \
  2331. __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
  2332. __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
  2333. __asm movdqu 0[edx], xmm0 \
  2334. __asm movdqu 16[edx], xmm1 \
  2335. __asm lea edx, [edx + 32] \
  2336. }
  2337. // Store 8 BGRA values.
  2338. #define STOREBGRA __asm { \
  2339. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2340. __asm punpcklbw xmm1, xmm0 /* GB */ \
  2341. __asm punpcklbw xmm5, xmm2 /* AR */ \
  2342. __asm movdqa xmm0, xmm5 \
  2343. __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
  2344. __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
  2345. __asm movdqu 0[edx], xmm5 \
  2346. __asm movdqu 16[edx], xmm0 \
  2347. __asm lea edx, [edx + 32] \
  2348. }
  2349. // Store 8 RGBA values.
  2350. #define STORERGBA __asm { \
  2351. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2352. __asm punpcklbw xmm1, xmm2 /* GR */ \
  2353. __asm punpcklbw xmm5, xmm0 /* AB */ \
  2354. __asm movdqa xmm0, xmm5 \
  2355. __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
  2356. __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
  2357. __asm movdqu 0[edx], xmm5 \
  2358. __asm movdqu 16[edx], xmm0 \
  2359. __asm lea edx, [edx + 32] \
  2360. }
  2361. // Store 8 RGB24 values.
  2362. #define STORERGB24 __asm { \
  2363. /* Weave into RRGB */ \
  2364. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2365. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2366. __asm movdqa xmm1, xmm0 \
  2367. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2368. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
  2369. /* RRGB -> RGB24 */ \
  2370. __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
  2371. __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
  2372. __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
  2373. __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
  2374. __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
  2375. __asm lea edx, [edx + 24] \
  2376. }
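// Illustrative scalar sketch (not part of libyuv): the net effect of STORERGB24
// above is to drop the fourth byte of each 4-byte pixel; the pshufb/palignr pair
// just performs that 3-of-4 compaction in registers. The helper name is
// hypothetical.
static void PackARGBToRGB24_Sketch(const uint8* src_argb, uint8* dst_rgb24,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[0] = src_argb[0];  // B
    dst_rgb24[1] = src_argb[1];  // G
    dst_rgb24[2] = src_argb[2];  // R
    src_argb += 4;               // skip A
    dst_rgb24 += 3;
  }
}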
  2377. // Store 8 RGB565 values.
  2378. #define STORERGB565 __asm { \
  2379. /* Weave into RRGB */ \
  2380. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2381. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2382. __asm movdqa xmm1, xmm0 \
  2383. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2384. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
  2385. /* RRGB -> RGB565 */ \
  2386. __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
  2387. __asm movdqa xmm2, xmm0 /* G */ \
  2388. __asm pslld xmm0, 8 /* R */ \
  2389. __asm psrld xmm3, 3 /* B */ \
  2390. __asm psrld xmm2, 5 /* G */ \
  2391. __asm psrad xmm0, 16 /* R */ \
  2392. __asm pand xmm3, xmm5 /* B */ \
  2393. __asm pand xmm2, xmm6 /* G */ \
  2394. __asm pand xmm0, xmm7 /* R */ \
  2395. __asm por xmm3, xmm2 /* BG */ \
  2396. __asm por xmm0, xmm3 /* BGR */ \
  2397. __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
  2398. __asm movdqa xmm2, xmm1 /* G */ \
  2399. __asm pslld xmm1, 8 /* R */ \
  2400. __asm psrld xmm3, 3 /* B */ \
  2401. __asm psrld xmm2, 5 /* G */ \
  2402. __asm psrad xmm1, 16 /* R */ \
  2403. __asm pand xmm3, xmm5 /* B */ \
  2404. __asm pand xmm2, xmm6 /* G */ \
  2405. __asm pand xmm1, xmm7 /* R */ \
  2406. __asm por xmm3, xmm2 /* BG */ \
  2407. __asm por xmm1, xmm3 /* BGR */ \
  2408. __asm packssdw xmm0, xmm1 \
  2409. __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
  2410. __asm lea edx, [edx + 16] \
  2411. }
  2412. // 8 pixels.
  2413. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  2414. __declspec(naked)
  2415. void I444ToARGBRow_SSSE3(const uint8* y_buf,
  2416. const uint8* u_buf,
  2417. const uint8* v_buf,
  2418. uint8* dst_argb,
  2419. const struct YuvConstants* yuvconstants,
  2420. int width) {
  2421. __asm {
  2422. push esi
  2423. push edi
  2424. push ebx
  2425. mov eax, [esp + 12 + 4] // Y
  2426. mov esi, [esp + 12 + 8] // U
  2427. mov edi, [esp + 12 + 12] // V
  2428. mov edx, [esp + 12 + 16] // argb
  2429. mov ebx, [esp + 12 + 20] // yuvconstants
  2430. mov ecx, [esp + 12 + 24] // width
  2431. sub edi, esi
  2432. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2433. convertloop:
  2434. READYUV444
  2435. YUVTORGB(ebx)
  2436. STOREARGB
  2437. sub ecx, 8
  2438. jg convertloop
  2439. pop ebx
  2440. pop edi
  2441. pop esi
  2442. ret
  2443. }
  2444. }
  2445. // 8 pixels.
  2446. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
  2447. __declspec(naked)
  2448. void I422ToRGB24Row_SSSE3(const uint8* y_buf,
  2449. const uint8* u_buf,
  2450. const uint8* v_buf,
  2451. uint8* dst_rgb24,
  2452. const struct YuvConstants* yuvconstants,
  2453. int width) {
  2454. __asm {
  2455. push esi
  2456. push edi
  2457. push ebx
  2458. mov eax, [esp + 12 + 4] // Y
  2459. mov esi, [esp + 12 + 8] // U
  2460. mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // rgb24
  2462. mov ebx, [esp + 12 + 20] // yuvconstants
  2463. mov ecx, [esp + 12 + 24] // width
  2464. sub edi, esi
  2465. movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
  2466. movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
  2467. convertloop:
  2468. READYUV422
  2469. YUVTORGB(ebx)
  2470. STORERGB24
  2471. sub ecx, 8
  2472. jg convertloop
  2473. pop ebx
  2474. pop edi
  2475. pop esi
  2476. ret
  2477. }
  2478. }
  2479. // 8 pixels
  2480. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
  2481. __declspec(naked)
  2482. void I422ToRGB565Row_SSSE3(const uint8* y_buf,
  2483. const uint8* u_buf,
  2484. const uint8* v_buf,
  2485. uint8* rgb565_buf,
  2486. const struct YuvConstants* yuvconstants,
  2487. int width) {
  2488. __asm {
  2489. push esi
  2490. push edi
  2491. push ebx
  2492. mov eax, [esp + 12 + 4] // Y
  2493. mov esi, [esp + 12 + 8] // U
  2494. mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // rgb565
  2496. mov ebx, [esp + 12 + 20] // yuvconstants
  2497. mov ecx, [esp + 12 + 24] // width
  2498. sub edi, esi
  2499. pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
  2500. psrld xmm5, 27
  2501. pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
  2502. psrld xmm6, 26
  2503. pslld xmm6, 5
  2504. pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
  2505. pslld xmm7, 11
  2506. convertloop:
  2507. READYUV422
  2508. YUVTORGB(ebx)
  2509. STORERGB565
  2510. sub ecx, 8
  2511. jg convertloop
  2512. pop ebx
  2513. pop edi
  2514. pop esi
  2515. ret
  2516. }
  2517. }
  2518. // 8 pixels.
  2519. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2520. __declspec(naked)
  2521. void I422ToARGBRow_SSSE3(const uint8* y_buf,
  2522. const uint8* u_buf,
  2523. const uint8* v_buf,
  2524. uint8* dst_argb,
  2525. const struct YuvConstants* yuvconstants,
  2526. int width) {
  2527. __asm {
  2528. push esi
  2529. push edi
  2530. push ebx
  2531. mov eax, [esp + 12 + 4] // Y
  2532. mov esi, [esp + 12 + 8] // U
  2533. mov edi, [esp + 12 + 12] // V
  2534. mov edx, [esp + 12 + 16] // argb
  2535. mov ebx, [esp + 12 + 20] // yuvconstants
  2536. mov ecx, [esp + 12 + 24] // width
  2537. sub edi, esi
  2538. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2539. convertloop:
  2540. READYUV422
  2541. YUVTORGB(ebx)
  2542. STOREARGB
  2543. sub ecx, 8
  2544. jg convertloop
  2545. pop ebx
  2546. pop edi
  2547. pop esi
  2548. ret
  2549. }
  2550. }
  2551. // 8 pixels.
  2552. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
  2553. __declspec(naked)
  2554. void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
  2555. const uint8* u_buf,
  2556. const uint8* v_buf,
  2557. const uint8* a_buf,
  2558. uint8* dst_argb,
  2559. const struct YuvConstants* yuvconstants,
  2560. int width) {
  2561. __asm {
  2562. push esi
  2563. push edi
  2564. push ebx
  2565. push ebp
  2566. mov eax, [esp + 16 + 4] // Y
  2567. mov esi, [esp + 16 + 8] // U
  2568. mov edi, [esp + 16 + 12] // V
  2569. mov ebp, [esp + 16 + 16] // A
  2570. mov edx, [esp + 16 + 20] // argb
  2571. mov ebx, [esp + 16 + 24] // yuvconstants
  2572. mov ecx, [esp + 16 + 28] // width
  2573. sub edi, esi
  2574. convertloop:
  2575. READYUVA422
  2576. YUVTORGB(ebx)
  2577. STOREARGB
  2578. sub ecx, 8
  2579. jg convertloop
  2580. pop ebp
  2581. pop ebx
  2582. pop edi
  2583. pop esi
  2584. ret
  2585. }
  2586. }
  2587. // 8 pixels.
  2588. // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2589. // Similar to I420 but duplicate UV once more.
  2590. __declspec(naked)
  2591. void I411ToARGBRow_SSSE3(const uint8* y_buf,
  2592. const uint8* u_buf,
  2593. const uint8* v_buf,
  2594. uint8* dst_argb,
  2595. const struct YuvConstants* yuvconstants,
  2596. int width) {
  2597. __asm {
  2598. push esi
  2599. push edi
  2600. push ebx
  2601. push ebp
  2602. mov eax, [esp + 16 + 4] // Y
  2603. mov esi, [esp + 16 + 8] // U
  2604. mov edi, [esp + 16 + 12] // V
2605. mov edx, [esp + 16 + 16] // argb
  2606. mov ebp, [esp + 16 + 20] // yuvconstants
  2607. mov ecx, [esp + 16 + 24] // width
  2608. sub edi, esi
  2609. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2610. convertloop:
  2611. READYUV411_EBX
  2612. YUVTORGB(ebp)
  2613. STOREARGB
  2614. sub ecx, 8
  2615. jg convertloop
  2616. pop ebp
  2617. pop ebx
  2618. pop edi
  2619. pop esi
  2620. ret
  2621. }
  2622. }
  2623. // 8 pixels.
  2624. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2625. __declspec(naked)
  2626. void NV12ToARGBRow_SSSE3(const uint8* y_buf,
  2627. const uint8* uv_buf,
  2628. uint8* dst_argb,
  2629. const struct YuvConstants* yuvconstants,
  2630. int width) {
  2631. __asm {
  2632. push esi
  2633. push ebx
  2634. mov eax, [esp + 8 + 4] // Y
  2635. mov esi, [esp + 8 + 8] // UV
  2636. mov edx, [esp + 8 + 12] // argb
  2637. mov ebx, [esp + 8 + 16] // yuvconstants
  2638. mov ecx, [esp + 8 + 20] // width
  2639. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2640. convertloop:
  2641. READNV12
  2642. YUVTORGB(ebx)
  2643. STOREARGB
  2644. sub ecx, 8
  2645. jg convertloop
  2646. pop ebx
  2647. pop esi
  2648. ret
  2649. }
  2650. }
  2651. // 8 pixels.
2652. // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
  2653. __declspec(naked)
  2654. void NV21ToARGBRow_SSSE3(const uint8* y_buf,
  2655. const uint8* vu_buf,
  2656. uint8* dst_argb,
  2657. const struct YuvConstants* yuvconstants,
  2658. int width) {
  2659. __asm {
  2660. push esi
  2661. push ebx
  2662. mov eax, [esp + 8 + 4] // Y
  2663. mov esi, [esp + 8 + 8] // VU
  2664. mov edx, [esp + 8 + 12] // argb
  2665. mov ebx, [esp + 8 + 16] // yuvconstants
  2666. mov ecx, [esp + 8 + 20] // width
  2667. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2668. convertloop:
  2669. READNV21
  2670. YUVTORGB(ebx)
  2671. STOREARGB
  2672. sub ecx, 8
  2673. jg convertloop
  2674. pop ebx
  2675. pop esi
  2676. ret
  2677. }
  2678. }
  2679. // 8 pixels.
  2680. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  2681. __declspec(naked)
  2682. void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
  2683. uint8* dst_argb,
  2684. const struct YuvConstants* yuvconstants,
  2685. int width) {
  2686. __asm {
  2687. push ebx
  2688. mov eax, [esp + 4 + 4] // yuy2
  2689. mov edx, [esp + 4 + 8] // argb
  2690. mov ebx, [esp + 4 + 12] // yuvconstants
  2691. mov ecx, [esp + 4 + 16] // width
  2692. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2693. convertloop:
  2694. READYUY2
  2695. YUVTORGB(ebx)
  2696. STOREARGB
  2697. sub ecx, 8
  2698. jg convertloop
  2699. pop ebx
  2700. ret
  2701. }
  2702. }
  2703. // 8 pixels.
  2704. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  2705. __declspec(naked)
  2706. void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
  2707. uint8* dst_argb,
  2708. const struct YuvConstants* yuvconstants,
  2709. int width) {
  2710. __asm {
  2711. push ebx
  2712. mov eax, [esp + 4 + 4] // uyvy
  2713. mov edx, [esp + 4 + 8] // argb
  2714. mov ebx, [esp + 4 + 12] // yuvconstants
  2715. mov ecx, [esp + 4 + 16] // width
  2716. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2717. convertloop:
  2718. READUYVY
  2719. YUVTORGB(ebx)
  2720. STOREARGB
  2721. sub ecx, 8
  2722. jg convertloop
  2723. pop ebx
  2724. ret
  2725. }
  2726. }
  2727. __declspec(naked)
  2728. void I422ToRGBARow_SSSE3(const uint8* y_buf,
  2729. const uint8* u_buf,
  2730. const uint8* v_buf,
  2731. uint8* dst_rgba,
  2732. const struct YuvConstants* yuvconstants,
  2733. int width) {
  2734. __asm {
  2735. push esi
  2736. push edi
  2737. push ebx
  2738. mov eax, [esp + 12 + 4] // Y
  2739. mov esi, [esp + 12 + 8] // U
  2740. mov edi, [esp + 12 + 12] // V
2741. mov edx, [esp + 12 + 16] // rgba
  2742. mov ebx, [esp + 12 + 20] // yuvconstants
  2743. mov ecx, [esp + 12 + 24] // width
  2744. sub edi, esi
  2745. convertloop:
  2746. READYUV422
  2747. YUVTORGB(ebx)
  2748. STORERGBA
  2749. sub ecx, 8
  2750. jg convertloop
  2751. pop ebx
  2752. pop edi
  2753. pop esi
  2754. ret
  2755. }
  2756. }
  2757. #endif // HAS_I422TOARGBROW_SSSE3
  2758. #ifdef HAS_I400TOARGBROW_SSE2
  2759. // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
  2760. __declspec(naked)
  2761. void I400ToARGBRow_SSE2(const uint8* y_buf,
  2762. uint8* rgb_buf,
  2763. int width) {
  2764. __asm {
  2765. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2766. movd xmm2, eax
  2767. pshufd xmm2, xmm2,0
  2768. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2769. movd xmm3, eax
  2770. pshufd xmm3, xmm3, 0
  2771. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  2772. pslld xmm4, 24
  2773. mov eax, [esp + 4] // Y
  2774. mov edx, [esp + 8] // rgb
  2775. mov ecx, [esp + 12] // width
  2776. convertloop:
  2777. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2778. movq xmm0, qword ptr [eax]
  2779. lea eax, [eax + 8]
  2780. punpcklbw xmm0, xmm0 // Y.Y
  2781. pmulhuw xmm0, xmm2
  2782. psubusw xmm0, xmm3
  2783. psrlw xmm0, 6
  2784. packuswb xmm0, xmm0 // G
  2785. // Step 2: Weave into ARGB
  2786. punpcklbw xmm0, xmm0 // GG
  2787. movdqa xmm1, xmm0
  2788. punpcklwd xmm0, xmm0 // BGRA first 4 pixels
  2789. punpckhwd xmm1, xmm1 // BGRA next 4 pixels
  2790. por xmm0, xmm4
  2791. por xmm1, xmm4
  2792. movdqu [edx], xmm0
  2793. movdqu [edx + 16], xmm1
  2794. lea edx, [edx + 32]
  2795. sub ecx, 8
  2796. jg convertloop
  2797. ret
  2798. }
  2799. }
  2800. #endif // HAS_I400TOARGBROW_SSE2
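// Illustrative scalar form of the fixed point math above (not part of the
// original source; the helper name is made up).  The duplicated Y byte acts
// as y * 257, so pmulhuw by 0x4a35 followed by the subtract and shift
// computes roughly (y - 16) * 1.164, and packuswb saturates to 0..255.
static __inline uint8 I400ToGray_Ref(uint8 y) {
  int g = ((int)y * 257 * 18997) >> 16;  // pmulhuw 0x4a35
  g = g > 1160 ? g - 1160 : 0;           // psubusw 0x0488
  g >>= 6;                               // psrlw 6
  return (uint8)(g > 255 ? 255 : g);     // packuswb saturation
}
// e.g. y = 16 -> 0, y = 235 -> 255, expanding video range Y to full range.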
  2801. #ifdef HAS_I400TOARGBROW_AVX2
  2802. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2803. // note: vpunpcklbw mutates and vpackuswb unmutates.
  2804. __declspec(naked)
  2805. void I400ToARGBRow_AVX2(const uint8* y_buf,
  2806. uint8* rgb_buf,
  2807. int width) {
  2808. __asm {
  2809. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2810. vmovd xmm2, eax
  2811. vbroadcastss ymm2, xmm2
  2812. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2813. vmovd xmm3, eax
  2814. vbroadcastss ymm3, xmm3
  2815. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
  2816. vpslld ymm4, ymm4, 24
  2817. mov eax, [esp + 4] // Y
  2818. mov edx, [esp + 8] // rgb
  2819. mov ecx, [esp + 12] // width
  2820. convertloop:
2821. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2822. vmovdqu xmm0, [eax]
  2823. lea eax, [eax + 16]
  2824. vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
  2825. vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
  2826. vpmulhuw ymm0, ymm0, ymm2
  2827. vpsubusw ymm0, ymm0, ymm3
  2828. vpsrlw ymm0, ymm0, 6
  2829. vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
  2830. // TODO(fbarchard): Weave alpha with unpack.
  2831. // Step 2: Weave into ARGB
  2832. vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
  2833. vpermq ymm1, ymm1, 0xd8
  2834. vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
  2835. vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
  2836. vpor ymm0, ymm0, ymm4
  2837. vpor ymm1, ymm1, ymm4
  2838. vmovdqu [edx], ymm0
  2839. vmovdqu [edx + 32], ymm1
  2840. lea edx, [edx + 64]
  2841. sub ecx, 16
  2842. jg convertloop
  2843. vzeroupper
  2844. ret
  2845. }
  2846. }
  2847. #endif // HAS_I400TOARGBROW_AVX2
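// Descriptive note (not in the original source): the 256 bit vpunpck* and
// vpackuswb instructions operate within each 128 bit lane, so a full width
// result comes out with its 64 bit quarters in the order 0,2,1,3.  vpermq
// with immediate 0xd8 (selectors 0,2,1,3: out[i] = in[(0xd8 >> (2*i)) & 3])
// applied before the unpack or after the pack restores linear order, which
// is what the "mutates"/"unmutates" comments in this file refer to.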
  2848. #ifdef HAS_MIRRORROW_SSSE3
  2849. // Shuffle table for reversing the bytes.
  2850. static const uvec8 kShuffleMirror = {
  2851. 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2852. };
  2853. // TODO(fbarchard): Replace lea with -16 offset.
  2854. __declspec(naked)
  2855. void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  2856. __asm {
  2857. mov eax, [esp + 4] // src
  2858. mov edx, [esp + 8] // dst
  2859. mov ecx, [esp + 12] // width
  2860. movdqa xmm5, xmmword ptr kShuffleMirror
  2861. convertloop:
  2862. movdqu xmm0, [eax - 16 + ecx]
  2863. pshufb xmm0, xmm5
  2864. movdqu [edx], xmm0
  2865. lea edx, [edx + 16]
  2866. sub ecx, 16
  2867. jg convertloop
  2868. ret
  2869. }
  2870. }
  2871. #endif // HAS_MIRRORROW_SSSE3
  2872. #ifdef HAS_MIRRORROW_AVX2
  2873. __declspec(naked)
  2874. void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2875. __asm {
  2876. mov eax, [esp + 4] // src
  2877. mov edx, [esp + 8] // dst
  2878. mov ecx, [esp + 12] // width
  2879. vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
  2880. convertloop:
  2881. vmovdqu ymm0, [eax - 32 + ecx]
  2882. vpshufb ymm0, ymm0, ymm5
2883. vpermq ymm0, ymm0, 0x4e // swap high and low halves
  2884. vmovdqu [edx], ymm0
  2885. lea edx, [edx + 32]
  2886. sub ecx, 32
  2887. jg convertloop
  2888. vzeroupper
  2889. ret
  2890. }
  2891. }
  2892. #endif // HAS_MIRRORROW_AVX2
  2893. #ifdef HAS_MIRRORUVROW_SSSE3
  2894. // Shuffle table for reversing the bytes of UV channels.
  2895. static const uvec8 kShuffleMirrorUV = {
  2896. 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
  2897. };
  2898. __declspec(naked)
  2899. void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
  2900. int width) {
  2901. __asm {
  2902. push edi
  2903. mov eax, [esp + 4 + 4] // src
  2904. mov edx, [esp + 4 + 8] // dst_u
  2905. mov edi, [esp + 4 + 12] // dst_v
  2906. mov ecx, [esp + 4 + 16] // width
  2907. movdqa xmm1, xmmword ptr kShuffleMirrorUV
  2908. lea eax, [eax + ecx * 2 - 16]
  2909. sub edi, edx
  2910. convertloop:
  2911. movdqu xmm0, [eax]
  2912. lea eax, [eax - 16]
  2913. pshufb xmm0, xmm1
  2914. movlpd qword ptr [edx], xmm0
  2915. movhpd qword ptr [edx + edi], xmm0
  2916. lea edx, [edx + 8]
  2917. sub ecx, 8
  2918. jg convertloop
  2919. pop edi
  2920. ret
  2921. }
  2922. }
  2923. #endif // HAS_MIRRORUVROW_SSSE3
  2924. #ifdef HAS_ARGBMIRRORROW_SSE2
  2925. __declspec(naked)
  2926. void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  2927. __asm {
  2928. mov eax, [esp + 4] // src
  2929. mov edx, [esp + 8] // dst
  2930. mov ecx, [esp + 12] // width
  2931. lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
  2932. convertloop:
  2933. movdqu xmm0, [eax]
  2934. lea eax, [eax - 16]
  2935. pshufd xmm0, xmm0, 0x1b
  2936. movdqu [edx], xmm0
  2937. lea edx, [edx + 16]
  2938. sub ecx, 4
  2939. jg convertloop
  2940. ret
  2941. }
  2942. }
  2943. #endif // HAS_ARGBMIRRORROW_SSE2
  2944. #ifdef HAS_ARGBMIRRORROW_AVX2
  2945. // Shuffle table for reversing the bytes.
  2946. static const ulvec32 kARGBShuffleMirror_AVX2 = {
  2947. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2948. };
  2949. __declspec(naked)
  2950. void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2951. __asm {
  2952. mov eax, [esp + 4] // src
  2953. mov edx, [esp + 8] // dst
  2954. mov ecx, [esp + 12] // width
  2955. vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
  2956. convertloop:
  2957. vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
  2958. vmovdqu [edx], ymm0
  2959. lea edx, [edx + 32]
  2960. sub ecx, 8
  2961. jg convertloop
  2962. vzeroupper
  2963. ret
  2964. }
  2965. }
  2966. #endif // HAS_ARGBMIRRORROW_AVX2
  2967. #ifdef HAS_SPLITUVROW_SSE2
  2968. __declspec(naked)
  2969. void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  2970. int width) {
  2971. __asm {
  2972. push edi
  2973. mov eax, [esp + 4 + 4] // src_uv
  2974. mov edx, [esp + 4 + 8] // dst_u
  2975. mov edi, [esp + 4 + 12] // dst_v
  2976. mov ecx, [esp + 4 + 16] // width
  2977. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  2978. psrlw xmm5, 8
  2979. sub edi, edx
  2980. convertloop:
  2981. movdqu xmm0, [eax]
  2982. movdqu xmm1, [eax + 16]
  2983. lea eax, [eax + 32]
  2984. movdqa xmm2, xmm0
  2985. movdqa xmm3, xmm1
  2986. pand xmm0, xmm5 // even bytes
  2987. pand xmm1, xmm5
  2988. packuswb xmm0, xmm1
  2989. psrlw xmm2, 8 // odd bytes
  2990. psrlw xmm3, 8
  2991. packuswb xmm2, xmm3
  2992. movdqu [edx], xmm0
  2993. movdqu [edx + edi], xmm2
  2994. lea edx, [edx + 16]
  2995. sub ecx, 16
  2996. jg convertloop
  2997. pop edi
  2998. ret
  2999. }
  3000. }
  3001. #endif // HAS_SPLITUVROW_SSE2
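// Scalar sketch of SplitUVRow (not part of the original source; the helper
// name is illustrative): de-interleave packed UV into separate U and V
// planes, matching the even-byte mask / odd-byte shift used above.
static __inline void SplitUVRow_Ref(const uint8* src_uv, uint8* dst_u,
                                    uint8* dst_v, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  // even bytes (pand 0x00ff00ff)
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes (psrlw 8)
  }
}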
  3002. #ifdef HAS_SPLITUVROW_AVX2
  3003. __declspec(naked)
  3004. void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  3005. int width) {
  3006. __asm {
  3007. push edi
  3008. mov eax, [esp + 4 + 4] // src_uv
  3009. mov edx, [esp + 4 + 8] // dst_u
  3010. mov edi, [esp + 4 + 12] // dst_v
  3011. mov ecx, [esp + 4 + 16] // width
  3012. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3013. vpsrlw ymm5, ymm5, 8
  3014. sub edi, edx
  3015. convertloop:
  3016. vmovdqu ymm0, [eax]
  3017. vmovdqu ymm1, [eax + 32]
  3018. lea eax, [eax + 64]
  3019. vpsrlw ymm2, ymm0, 8 // odd bytes
  3020. vpsrlw ymm3, ymm1, 8
  3021. vpand ymm0, ymm0, ymm5 // even bytes
  3022. vpand ymm1, ymm1, ymm5
  3023. vpackuswb ymm0, ymm0, ymm1
  3024. vpackuswb ymm2, ymm2, ymm3
  3025. vpermq ymm0, ymm0, 0xd8
  3026. vpermq ymm2, ymm2, 0xd8
  3027. vmovdqu [edx], ymm0
  3028. vmovdqu [edx + edi], ymm2
  3029. lea edx, [edx + 32]
  3030. sub ecx, 32
  3031. jg convertloop
  3032. pop edi
  3033. vzeroupper
  3034. ret
  3035. }
  3036. }
  3037. #endif // HAS_SPLITUVROW_AVX2
  3038. #ifdef HAS_MERGEUVROW_SSE2
  3039. __declspec(naked)
  3040. void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  3041. int width) {
  3042. __asm {
  3043. push edi
  3044. mov eax, [esp + 4 + 4] // src_u
  3045. mov edx, [esp + 4 + 8] // src_v
  3046. mov edi, [esp + 4 + 12] // dst_uv
  3047. mov ecx, [esp + 4 + 16] // width
  3048. sub edx, eax
  3049. convertloop:
  3050. movdqu xmm0, [eax] // read 16 U's
  3051. movdqu xmm1, [eax + edx] // and 16 V's
  3052. lea eax, [eax + 16]
  3053. movdqa xmm2, xmm0
  3054. punpcklbw xmm0, xmm1 // first 8 UV pairs
  3055. punpckhbw xmm2, xmm1 // next 8 UV pairs
  3056. movdqu [edi], xmm0
  3057. movdqu [edi + 16], xmm2
  3058. lea edi, [edi + 32]
  3059. sub ecx, 16
  3060. jg convertloop
  3061. pop edi
  3062. ret
  3063. }
  3064. }
  3065. #endif // HAS_MERGEUVROW_SSE2
  3066. #ifdef HAS_MERGEUVROW_AVX2
  3067. __declspec(naked)
  3068. void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  3069. int width) {
  3070. __asm {
  3071. push edi
  3072. mov eax, [esp + 4 + 4] // src_u
  3073. mov edx, [esp + 4 + 8] // src_v
  3074. mov edi, [esp + 4 + 12] // dst_uv
  3075. mov ecx, [esp + 4 + 16] // width
  3076. sub edx, eax
  3077. convertloop:
  3078. vmovdqu ymm0, [eax] // read 32 U's
  3079. vmovdqu ymm1, [eax + edx] // and 32 V's
  3080. lea eax, [eax + 32]
  3081. vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
  3082. vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
  3083. vextractf128 [edi], ymm2, 0 // bytes 0..15
  3084. vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
  3085. vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
3086. vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
  3087. lea edi, [edi + 64]
  3088. sub ecx, 32
  3089. jg convertloop
  3090. pop edi
  3091. vzeroupper
  3092. ret
  3093. }
  3094. }
  3095. #endif // HAS_MERGEUVROW_AVX2
  3096. #ifdef HAS_COPYROW_SSE2
3097. // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
  3098. __declspec(naked)
  3099. void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  3100. __asm {
  3101. mov eax, [esp + 4] // src
  3102. mov edx, [esp + 8] // dst
  3103. mov ecx, [esp + 12] // count
  3104. test eax, 15
  3105. jne convertloopu
  3106. test edx, 15
  3107. jne convertloopu
  3108. convertloopa:
  3109. movdqa xmm0, [eax]
  3110. movdqa xmm1, [eax + 16]
  3111. lea eax, [eax + 32]
  3112. movdqa [edx], xmm0
  3113. movdqa [edx + 16], xmm1
  3114. lea edx, [edx + 32]
  3115. sub ecx, 32
  3116. jg convertloopa
  3117. ret
  3118. convertloopu:
  3119. movdqu xmm0, [eax]
  3120. movdqu xmm1, [eax + 16]
  3121. lea eax, [eax + 32]
  3122. movdqu [edx], xmm0
  3123. movdqu [edx + 16], xmm1
  3124. lea edx, [edx + 32]
  3125. sub ecx, 32
  3126. jg convertloopu
  3127. ret
  3128. }
  3129. }
  3130. #endif // HAS_COPYROW_SSE2
  3131. #ifdef HAS_COPYROW_AVX
3132. // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
  3133. __declspec(naked)
  3134. void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  3135. __asm {
  3136. mov eax, [esp + 4] // src
  3137. mov edx, [esp + 8] // dst
  3138. mov ecx, [esp + 12] // count
  3139. convertloop:
  3140. vmovdqu ymm0, [eax]
  3141. vmovdqu ymm1, [eax + 32]
  3142. lea eax, [eax + 64]
  3143. vmovdqu [edx], ymm0
  3144. vmovdqu [edx + 32], ymm1
  3145. lea edx, [edx + 64]
  3146. sub ecx, 64
  3147. jg convertloop
  3148. vzeroupper
  3149. ret
  3150. }
  3151. }
  3152. #endif // HAS_COPYROW_AVX
3153. // Multiple of 1: no width alignment requirement (rep movsb copies byte by byte).
  3154. __declspec(naked)
  3155. void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  3156. __asm {
  3157. mov eax, esi
  3158. mov edx, edi
  3159. mov esi, [esp + 4] // src
  3160. mov edi, [esp + 8] // dst
  3161. mov ecx, [esp + 12] // count
  3162. rep movsb
  3163. mov edi, edx
  3164. mov esi, eax
  3165. ret
  3166. }
  3167. }
  3168. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3169. // width in pixels
  3170. __declspec(naked)
  3171. void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3172. __asm {
  3173. mov eax, [esp + 4] // src
  3174. mov edx, [esp + 8] // dst
  3175. mov ecx, [esp + 12] // count
  3176. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3177. pslld xmm0, 24
  3178. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3179. psrld xmm1, 8
  3180. convertloop:
  3181. movdqu xmm2, [eax]
  3182. movdqu xmm3, [eax + 16]
  3183. lea eax, [eax + 32]
  3184. movdqu xmm4, [edx]
  3185. movdqu xmm5, [edx + 16]
  3186. pand xmm2, xmm0
  3187. pand xmm3, xmm0
  3188. pand xmm4, xmm1
  3189. pand xmm5, xmm1
  3190. por xmm2, xmm4
  3191. por xmm3, xmm5
  3192. movdqu [edx], xmm2
  3193. movdqu [edx + 16], xmm3
  3194. lea edx, [edx + 32]
  3195. sub ecx, 8
  3196. jg convertloop
  3197. ret
  3198. }
  3199. }
  3200. #endif // HAS_ARGBCOPYALPHAROW_SSE2
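// Per-pixel sketch of ARGBCopyAlphaRow (not part of the original source):
// keep the destination color bytes and take the alpha byte from the source,
// exactly what the 0xff000000 / 0x00ffffff masks above implement.
static __inline uint32 CopyAlpha_Ref(uint32 src_argb, uint32 dst_argb) {
  return (src_argb & 0xff000000u) | (dst_argb & 0x00ffffffu);
}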
  3201. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3202. // width in pixels
  3203. __declspec(naked)
  3204. void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3205. __asm {
  3206. mov eax, [esp + 4] // src
  3207. mov edx, [esp + 8] // dst
  3208. mov ecx, [esp + 12] // count
  3209. vpcmpeqb ymm0, ymm0, ymm0
  3210. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3211. convertloop:
  3212. vmovdqu ymm1, [eax]
  3213. vmovdqu ymm2, [eax + 32]
  3214. lea eax, [eax + 64]
  3215. vpblendvb ymm1, ymm1, [edx], ymm0
  3216. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3217. vmovdqu [edx], ymm1
  3218. vmovdqu [edx + 32], ymm2
  3219. lea edx, [edx + 64]
  3220. sub ecx, 16
  3221. jg convertloop
  3222. vzeroupper
  3223. ret
  3224. }
  3225. }
  3226. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  3227. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  3228. // width in pixels
  3229. __declspec(naked)
  3230. void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  3231. __asm {
  3232. mov eax, [esp + 4] // src_argb
  3233. mov edx, [esp + 8] // dst_a
  3234. mov ecx, [esp + 12] // width
  3235. extractloop:
  3236. movdqu xmm0, [eax]
  3237. movdqu xmm1, [eax + 16]
  3238. lea eax, [eax + 32]
  3239. psrld xmm0, 24
  3240. psrld xmm1, 24
  3241. packssdw xmm0, xmm1
  3242. packuswb xmm0, xmm0
  3243. movq qword ptr [edx], xmm0
  3244. lea edx, [edx + 8]
  3245. sub ecx, 8
  3246. jg extractloop
  3247. ret
  3248. }
  3249. }
  3250. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  3251. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3252. // width in pixels
  3253. __declspec(naked)
  3254. void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3255. __asm {
  3256. mov eax, [esp + 4] // src
  3257. mov edx, [esp + 8] // dst
  3258. mov ecx, [esp + 12] // count
  3259. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3260. pslld xmm0, 24
  3261. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3262. psrld xmm1, 8
  3263. convertloop:
  3264. movq xmm2, qword ptr [eax] // 8 Y's
  3265. lea eax, [eax + 8]
  3266. punpcklbw xmm2, xmm2
  3267. punpckhwd xmm3, xmm2
  3268. punpcklwd xmm2, xmm2
  3269. movdqu xmm4, [edx]
  3270. movdqu xmm5, [edx + 16]
  3271. pand xmm2, xmm0
  3272. pand xmm3, xmm0
  3273. pand xmm4, xmm1
  3274. pand xmm5, xmm1
  3275. por xmm2, xmm4
  3276. por xmm3, xmm5
  3277. movdqu [edx], xmm2
  3278. movdqu [edx + 16], xmm3
  3279. lea edx, [edx + 32]
  3280. sub ecx, 8
  3281. jg convertloop
  3282. ret
  3283. }
  3284. }
  3285. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3286. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3287. // width in pixels
  3288. __declspec(naked)
  3289. void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3290. __asm {
  3291. mov eax, [esp + 4] // src
  3292. mov edx, [esp + 8] // dst
  3293. mov ecx, [esp + 12] // count
  3294. vpcmpeqb ymm0, ymm0, ymm0
  3295. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3296. convertloop:
  3297. vpmovzxbd ymm1, qword ptr [eax]
  3298. vpmovzxbd ymm2, qword ptr [eax + 8]
  3299. lea eax, [eax + 16]
  3300. vpslld ymm1, ymm1, 24
  3301. vpslld ymm2, ymm2, 24
  3302. vpblendvb ymm1, ymm1, [edx], ymm0
  3303. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3304. vmovdqu [edx], ymm1
  3305. vmovdqu [edx + 32], ymm2
  3306. lea edx, [edx + 64]
  3307. sub ecx, 16
  3308. jg convertloop
  3309. vzeroupper
  3310. ret
  3311. }
  3312. }
  3313. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3314. #ifdef HAS_SETROW_X86
  3315. // Write 'count' bytes using an 8 bit value repeated.
3316. // Count should be a multiple of 4.
  3317. __declspec(naked)
  3318. void SetRow_X86(uint8* dst, uint8 v8, int count) {
  3319. __asm {
  3320. movzx eax, byte ptr [esp + 8] // v8
  3321. mov edx, 0x01010101 // Duplicate byte to all bytes.
  3322. mul edx // overwrites edx with upper part of result.
  3323. mov edx, edi
  3324. mov edi, [esp + 4] // dst
  3325. mov ecx, [esp + 12] // count
  3326. shr ecx, 2
  3327. rep stosd
  3328. mov edi, edx
  3329. ret
  3330. }
  3331. }
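// Illustrative note (not part of the original source): the mul by 0x01010101
// above replicates the 8 bit value into all four bytes of eax so rep stosd
// can store it a dword at a time, e.g. 0x5A * 0x01010101 = 0x5A5A5A5A.
static __inline uint32 ReplicateByte_Ref(uint8 v8) {
  return (uint32)v8 * 0x01010101u;
}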
  3332. // Write 'count' bytes using an 8 bit value repeated.
  3333. __declspec(naked)
  3334. void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
  3335. __asm {
  3336. mov edx, edi
  3337. mov edi, [esp + 4] // dst
  3338. mov eax, [esp + 8] // v8
  3339. mov ecx, [esp + 12] // count
  3340. rep stosb
  3341. mov edi, edx
  3342. ret
  3343. }
  3344. }
  3345. // Write 'count' 32 bit values.
  3346. __declspec(naked)
  3347. void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
  3348. __asm {
  3349. mov edx, edi
  3350. mov edi, [esp + 4] // dst
  3351. mov eax, [esp + 8] // v32
  3352. mov ecx, [esp + 12] // count
  3353. rep stosd
  3354. mov edi, edx
  3355. ret
  3356. }
  3357. }
  3358. #endif // HAS_SETROW_X86
  3359. #ifdef HAS_YUY2TOYROW_AVX2
  3360. __declspec(naked)
  3361. void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  3362. __asm {
  3363. mov eax, [esp + 4] // src_yuy2
  3364. mov edx, [esp + 8] // dst_y
  3365. mov ecx, [esp + 12] // width
  3366. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3367. vpsrlw ymm5, ymm5, 8
  3368. convertloop:
  3369. vmovdqu ymm0, [eax]
  3370. vmovdqu ymm1, [eax + 32]
  3371. lea eax, [eax + 64]
  3372. vpand ymm0, ymm0, ymm5 // even bytes are Y
  3373. vpand ymm1, ymm1, ymm5
  3374. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3375. vpermq ymm0, ymm0, 0xd8
  3376. vmovdqu [edx], ymm0
  3377. lea edx, [edx + 32]
  3378. sub ecx, 32
  3379. jg convertloop
  3380. vzeroupper
  3381. ret
  3382. }
  3383. }
  3384. __declspec(naked)
  3385. void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
  3386. uint8* dst_u, uint8* dst_v, int width) {
  3387. __asm {
  3388. push esi
  3389. push edi
  3390. mov eax, [esp + 8 + 4] // src_yuy2
  3391. mov esi, [esp + 8 + 8] // stride_yuy2
  3392. mov edx, [esp + 8 + 12] // dst_u
  3393. mov edi, [esp + 8 + 16] // dst_v
  3394. mov ecx, [esp + 8 + 20] // width
  3395. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3396. vpsrlw ymm5, ymm5, 8
  3397. sub edi, edx
  3398. convertloop:
  3399. vmovdqu ymm0, [eax]
  3400. vmovdqu ymm1, [eax + 32]
  3401. vpavgb ymm0, ymm0, [eax + esi]
  3402. vpavgb ymm1, ymm1, [eax + esi + 32]
  3403. lea eax, [eax + 64]
  3404. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3405. vpsrlw ymm1, ymm1, 8
  3406. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3407. vpermq ymm0, ymm0, 0xd8
  3408. vpand ymm1, ymm0, ymm5 // U
  3409. vpsrlw ymm0, ymm0, 8 // V
  3410. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3411. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3412. vpermq ymm1, ymm1, 0xd8
  3413. vpermq ymm0, ymm0, 0xd8
  3414. vextractf128 [edx], ymm1, 0 // U
  3415. vextractf128 [edx + edi], ymm0, 0 // V
  3416. lea edx, [edx + 16]
  3417. sub ecx, 32
  3418. jg convertloop
  3419. pop edi
  3420. pop esi
  3421. vzeroupper
  3422. ret
  3423. }
  3424. }
  3425. __declspec(naked)
  3426. void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  3427. uint8* dst_u, uint8* dst_v, int width) {
  3428. __asm {
  3429. push edi
  3430. mov eax, [esp + 4 + 4] // src_yuy2
  3431. mov edx, [esp + 4 + 8] // dst_u
  3432. mov edi, [esp + 4 + 12] // dst_v
  3433. mov ecx, [esp + 4 + 16] // width
  3434. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3435. vpsrlw ymm5, ymm5, 8
  3436. sub edi, edx
  3437. convertloop:
  3438. vmovdqu ymm0, [eax]
  3439. vmovdqu ymm1, [eax + 32]
  3440. lea eax, [eax + 64]
  3441. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3442. vpsrlw ymm1, ymm1, 8
  3443. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3444. vpermq ymm0, ymm0, 0xd8
  3445. vpand ymm1, ymm0, ymm5 // U
  3446. vpsrlw ymm0, ymm0, 8 // V
  3447. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3448. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3449. vpermq ymm1, ymm1, 0xd8
  3450. vpermq ymm0, ymm0, 0xd8
  3451. vextractf128 [edx], ymm1, 0 // U
  3452. vextractf128 [edx + edi], ymm0, 0 // V
  3453. lea edx, [edx + 16]
  3454. sub ecx, 32
  3455. jg convertloop
  3456. pop edi
  3457. vzeroupper
  3458. ret
  3459. }
  3460. }
  3461. __declspec(naked)
  3462. void UYVYToYRow_AVX2(const uint8* src_uyvy,
  3463. uint8* dst_y, int width) {
  3464. __asm {
  3465. mov eax, [esp + 4] // src_uyvy
  3466. mov edx, [esp + 8] // dst_y
  3467. mov ecx, [esp + 12] // width
  3468. convertloop:
  3469. vmovdqu ymm0, [eax]
  3470. vmovdqu ymm1, [eax + 32]
  3471. lea eax, [eax + 64]
  3472. vpsrlw ymm0, ymm0, 8 // odd bytes are Y
  3473. vpsrlw ymm1, ymm1, 8
  3474. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3475. vpermq ymm0, ymm0, 0xd8
  3476. vmovdqu [edx], ymm0
  3477. lea edx, [edx + 32]
  3478. sub ecx, 32
  3479. jg convertloop
  3480. vzeroupper
  3481. ret
  3482. }
  3483. }
  3484. __declspec(naked)
  3485. void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
  3486. uint8* dst_u, uint8* dst_v, int width) {
  3487. __asm {
  3488. push esi
  3489. push edi
3490. mov eax, [esp + 8 + 4] // src_uyvy
3491. mov esi, [esp + 8 + 8] // stride_uyvy
  3492. mov edx, [esp + 8 + 12] // dst_u
  3493. mov edi, [esp + 8 + 16] // dst_v
  3494. mov ecx, [esp + 8 + 20] // width
  3495. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3496. vpsrlw ymm5, ymm5, 8
  3497. sub edi, edx
  3498. convertloop:
  3499. vmovdqu ymm0, [eax]
  3500. vmovdqu ymm1, [eax + 32]
  3501. vpavgb ymm0, ymm0, [eax + esi]
  3502. vpavgb ymm1, ymm1, [eax + esi + 32]
  3503. lea eax, [eax + 64]
  3504. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3505. vpand ymm1, ymm1, ymm5
  3506. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3507. vpermq ymm0, ymm0, 0xd8
  3508. vpand ymm1, ymm0, ymm5 // U
  3509. vpsrlw ymm0, ymm0, 8 // V
  3510. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3511. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3512. vpermq ymm1, ymm1, 0xd8
  3513. vpermq ymm0, ymm0, 0xd8
  3514. vextractf128 [edx], ymm1, 0 // U
  3515. vextractf128 [edx + edi], ymm0, 0 // V
  3516. lea edx, [edx + 16]
  3517. sub ecx, 32
  3518. jg convertloop
  3519. pop edi
  3520. pop esi
  3521. vzeroupper
  3522. ret
  3523. }
  3524. }
  3525. __declspec(naked)
  3526. void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  3527. uint8* dst_u, uint8* dst_v, int width) {
  3528. __asm {
  3529. push edi
3530. mov eax, [esp + 4 + 4] // src_uyvy
  3531. mov edx, [esp + 4 + 8] // dst_u
  3532. mov edi, [esp + 4 + 12] // dst_v
  3533. mov ecx, [esp + 4 + 16] // width
  3534. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3535. vpsrlw ymm5, ymm5, 8
  3536. sub edi, edx
  3537. convertloop:
  3538. vmovdqu ymm0, [eax]
  3539. vmovdqu ymm1, [eax + 32]
  3540. lea eax, [eax + 64]
  3541. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3542. vpand ymm1, ymm1, ymm5
  3543. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3544. vpermq ymm0, ymm0, 0xd8
  3545. vpand ymm1, ymm0, ymm5 // U
  3546. vpsrlw ymm0, ymm0, 8 // V
  3547. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3548. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3549. vpermq ymm1, ymm1, 0xd8
  3550. vpermq ymm0, ymm0, 0xd8
  3551. vextractf128 [edx], ymm1, 0 // U
  3552. vextractf128 [edx + edi], ymm0, 0 // V
  3553. lea edx, [edx + 16]
  3554. sub ecx, 32
  3555. jg convertloop
  3556. pop edi
  3557. vzeroupper
  3558. ret
  3559. }
  3560. }
  3561. #endif // HAS_YUY2TOYROW_AVX2
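// Layout reference for the packed 4:2:2 formats handled above and below (not
// part of the original source; helper names are illustrative).  YUY2 stores
// Y0 U Y1 V per 2 pixels (Y in the even bytes); UYVY stores U Y0 V Y1 (Y in
// the odd bytes).  The full *ToUVRow variants additionally average two rows
// of chroma with pavgb/vpavgb; the *ToUV422Row variants take one row as is.
static __inline void YUY2ToY_Ref(const uint8* src_yuy2, uint8* dst_y,
                                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_y[i] = src_yuy2[2 * i];  // even bytes are Y
  }
}
static __inline void YUY2ToUV422_Ref(const uint8* src_yuy2, uint8* dst_u,
                                     uint8* dst_v, int width) {
  int i;
  for (i = 0; i < width; i += 2) {  // width assumed even, as in the SIMD code
    dst_u[i / 2] = src_yuy2[2 * i + 1];
    dst_v[i / 2] = src_yuy2[2 * i + 3];
  }
}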
  3562. #ifdef HAS_YUY2TOYROW_SSE2
  3563. __declspec(naked)
  3564. void YUY2ToYRow_SSE2(const uint8* src_yuy2,
  3565. uint8* dst_y, int width) {
  3566. __asm {
  3567. mov eax, [esp + 4] // src_yuy2
  3568. mov edx, [esp + 8] // dst_y
  3569. mov ecx, [esp + 12] // width
  3570. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3571. psrlw xmm5, 8
  3572. convertloop:
  3573. movdqu xmm0, [eax]
  3574. movdqu xmm1, [eax + 16]
  3575. lea eax, [eax + 32]
  3576. pand xmm0, xmm5 // even bytes are Y
  3577. pand xmm1, xmm5
  3578. packuswb xmm0, xmm1
  3579. movdqu [edx], xmm0
  3580. lea edx, [edx + 16]
  3581. sub ecx, 16
  3582. jg convertloop
  3583. ret
  3584. }
  3585. }
  3586. __declspec(naked)
  3587. void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  3588. uint8* dst_u, uint8* dst_v, int width) {
  3589. __asm {
  3590. push esi
  3591. push edi
  3592. mov eax, [esp + 8 + 4] // src_yuy2
  3593. mov esi, [esp + 8 + 8] // stride_yuy2
  3594. mov edx, [esp + 8 + 12] // dst_u
  3595. mov edi, [esp + 8 + 16] // dst_v
  3596. mov ecx, [esp + 8 + 20] // width
  3597. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3598. psrlw xmm5, 8
  3599. sub edi, edx
  3600. convertloop:
  3601. movdqu xmm0, [eax]
  3602. movdqu xmm1, [eax + 16]
  3603. movdqu xmm2, [eax + esi]
  3604. movdqu xmm3, [eax + esi + 16]
  3605. lea eax, [eax + 32]
  3606. pavgb xmm0, xmm2
  3607. pavgb xmm1, xmm3
  3608. psrlw xmm0, 8 // YUYV -> UVUV
  3609. psrlw xmm1, 8
  3610. packuswb xmm0, xmm1
  3611. movdqa xmm1, xmm0
  3612. pand xmm0, xmm5 // U
  3613. packuswb xmm0, xmm0
  3614. psrlw xmm1, 8 // V
  3615. packuswb xmm1, xmm1
  3616. movq qword ptr [edx], xmm0
  3617. movq qword ptr [edx + edi], xmm1
  3618. lea edx, [edx + 8]
  3619. sub ecx, 16
  3620. jg convertloop
  3621. pop edi
  3622. pop esi
  3623. ret
  3624. }
  3625. }
  3626. __declspec(naked)
  3627. void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  3628. uint8* dst_u, uint8* dst_v, int width) {
  3629. __asm {
  3630. push edi
  3631. mov eax, [esp + 4 + 4] // src_yuy2
  3632. mov edx, [esp + 4 + 8] // dst_u
  3633. mov edi, [esp + 4 + 12] // dst_v
  3634. mov ecx, [esp + 4 + 16] // width
  3635. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3636. psrlw xmm5, 8
  3637. sub edi, edx
  3638. convertloop:
  3639. movdqu xmm0, [eax]
  3640. movdqu xmm1, [eax + 16]
  3641. lea eax, [eax + 32]
  3642. psrlw xmm0, 8 // YUYV -> UVUV
  3643. psrlw xmm1, 8
  3644. packuswb xmm0, xmm1
  3645. movdqa xmm1, xmm0
  3646. pand xmm0, xmm5 // U
  3647. packuswb xmm0, xmm0
  3648. psrlw xmm1, 8 // V
  3649. packuswb xmm1, xmm1
  3650. movq qword ptr [edx], xmm0
  3651. movq qword ptr [edx + edi], xmm1
  3652. lea edx, [edx + 8]
  3653. sub ecx, 16
  3654. jg convertloop
  3655. pop edi
  3656. ret
  3657. }
  3658. }
  3659. __declspec(naked)
  3660. void UYVYToYRow_SSE2(const uint8* src_uyvy,
  3661. uint8* dst_y, int width) {
  3662. __asm {
  3663. mov eax, [esp + 4] // src_uyvy
  3664. mov edx, [esp + 8] // dst_y
  3665. mov ecx, [esp + 12] // width
  3666. convertloop:
  3667. movdqu xmm0, [eax]
  3668. movdqu xmm1, [eax + 16]
  3669. lea eax, [eax + 32]
  3670. psrlw xmm0, 8 // odd bytes are Y
  3671. psrlw xmm1, 8
  3672. packuswb xmm0, xmm1
  3673. movdqu [edx], xmm0
  3674. lea edx, [edx + 16]
  3675. sub ecx, 16
  3676. jg convertloop
  3677. ret
  3678. }
  3679. }
  3680. __declspec(naked)
  3681. void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  3682. uint8* dst_u, uint8* dst_v, int width) {
  3683. __asm {
  3684. push esi
  3685. push edi
3686. mov eax, [esp + 8 + 4] // src_uyvy
3687. mov esi, [esp + 8 + 8] // stride_uyvy
  3688. mov edx, [esp + 8 + 12] // dst_u
  3689. mov edi, [esp + 8 + 16] // dst_v
  3690. mov ecx, [esp + 8 + 20] // width
  3691. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3692. psrlw xmm5, 8
  3693. sub edi, edx
  3694. convertloop:
  3695. movdqu xmm0, [eax]
  3696. movdqu xmm1, [eax + 16]
  3697. movdqu xmm2, [eax + esi]
  3698. movdqu xmm3, [eax + esi + 16]
  3699. lea eax, [eax + 32]
  3700. pavgb xmm0, xmm2
  3701. pavgb xmm1, xmm3
  3702. pand xmm0, xmm5 // UYVY -> UVUV
  3703. pand xmm1, xmm5
  3704. packuswb xmm0, xmm1
  3705. movdqa xmm1, xmm0
  3706. pand xmm0, xmm5 // U
  3707. packuswb xmm0, xmm0
  3708. psrlw xmm1, 8 // V
  3709. packuswb xmm1, xmm1
  3710. movq qword ptr [edx], xmm0
  3711. movq qword ptr [edx + edi], xmm1
  3712. lea edx, [edx + 8]
  3713. sub ecx, 16
  3714. jg convertloop
  3715. pop edi
  3716. pop esi
  3717. ret
  3718. }
  3719. }
  3720. __declspec(naked)
  3721. void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  3722. uint8* dst_u, uint8* dst_v, int width) {
  3723. __asm {
  3724. push edi
3725. mov eax, [esp + 4 + 4] // src_uyvy
  3726. mov edx, [esp + 4 + 8] // dst_u
  3727. mov edi, [esp + 4 + 12] // dst_v
  3728. mov ecx, [esp + 4 + 16] // width
  3729. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3730. psrlw xmm5, 8
  3731. sub edi, edx
  3732. convertloop:
  3733. movdqu xmm0, [eax]
  3734. movdqu xmm1, [eax + 16]
  3735. lea eax, [eax + 32]
  3736. pand xmm0, xmm5 // UYVY -> UVUV
  3737. pand xmm1, xmm5
  3738. packuswb xmm0, xmm1
  3739. movdqa xmm1, xmm0
  3740. pand xmm0, xmm5 // U
  3741. packuswb xmm0, xmm0
  3742. psrlw xmm1, 8 // V
  3743. packuswb xmm1, xmm1
  3744. movq qword ptr [edx], xmm0
  3745. movq qword ptr [edx + edi], xmm1
  3746. lea edx, [edx + 8]
  3747. sub ecx, 16
  3748. jg convertloop
  3749. pop edi
  3750. ret
  3751. }
  3752. }
  3753. #endif // HAS_YUY2TOYROW_SSE2
  3754. #ifdef HAS_BLENDPLANEROW_SSSE3
  3755. // Blend 8 pixels at a time.
  3756. // unsigned version of math
  3757. // =((A2*C2)+(B2*(255-C2))+255)/256
  3758. // signed version of math
  3759. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3760. __declspec(naked)
  3761. void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
  3762. const uint8* alpha, uint8* dst, int width) {
  3763. __asm {
  3764. push esi
  3765. push edi
  3766. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3767. psllw xmm5, 8
  3768. mov eax, 0x80808080 // 128 for biasing image to signed.
  3769. movd xmm6, eax
  3770. pshufd xmm6, xmm6, 0x00
  3771. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3772. movd xmm7, eax
  3773. pshufd xmm7, xmm7, 0x00
  3774. mov eax, [esp + 8 + 4] // src0
  3775. mov edx, [esp + 8 + 8] // src1
  3776. mov esi, [esp + 8 + 12] // alpha
  3777. mov edi, [esp + 8 + 16] // dst
  3778. mov ecx, [esp + 8 + 20] // width
  3779. sub eax, esi
  3780. sub edx, esi
  3781. sub edi, esi
  3782. // 8 pixel loop.
  3783. convertloop8:
  3784. movq xmm0, qword ptr [esi] // alpha
  3785. punpcklbw xmm0, xmm0
  3786. pxor xmm0, xmm5 // a, 255-a
  3787. movq xmm1, qword ptr [eax + esi] // src0
  3788. movq xmm2, qword ptr [edx + esi] // src1
  3789. punpcklbw xmm1, xmm2
  3790. psubb xmm1, xmm6 // bias src0/1 - 128
  3791. pmaddubsw xmm0, xmm1
  3792. paddw xmm0, xmm7 // unbias result - 32768 and round.
  3793. psrlw xmm0, 8
  3794. packuswb xmm0, xmm0
  3795. movq qword ptr [edi + esi], xmm0
  3796. lea esi, [esi + 8]
  3797. sub ecx, 8
  3798. jg convertloop8
  3799. pop edi
  3800. pop esi
  3801. ret
  3802. }
  3803. }
  3804. #endif // HAS_BLENDPLANEROW_SSSE3
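// Scalar sketch of the blend formula quoted above (not part of the original
// source).  In the asm, src0 carries the alpha weight and src1 the 255-alpha
// weight; the +255 bias rounds before the divide by 256.
static __inline uint8 BlendPlanePixel_Ref(uint8 s0, uint8 s1, uint8 a) {
  return (uint8)((s0 * a + s1 * (255 - a) + 255) >> 8);
}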
  3805. #ifdef HAS_BLENDPLANEROW_AVX2
  3806. // Blend 32 pixels at a time.
  3807. // unsigned version of math
  3808. // =((A2*C2)+(B2*(255-C2))+255)/256
  3809. // signed version of math
  3810. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3811. __declspec(naked)
  3812. void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
  3813. const uint8* alpha, uint8* dst, int width) {
  3814. __asm {
  3815. push esi
  3816. push edi
  3817. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
  3818. vpsllw ymm5, ymm5, 8
  3819. mov eax, 0x80808080 // 128 for biasing image to signed.
  3820. vmovd xmm6, eax
  3821. vbroadcastss ymm6, xmm6
  3822. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3823. vmovd xmm7, eax
  3824. vbroadcastss ymm7, xmm7
  3825. mov eax, [esp + 8 + 4] // src0
  3826. mov edx, [esp + 8 + 8] // src1
  3827. mov esi, [esp + 8 + 12] // alpha
  3828. mov edi, [esp + 8 + 16] // dst
  3829. mov ecx, [esp + 8 + 20] // width
  3830. sub eax, esi
  3831. sub edx, esi
  3832. sub edi, esi
  3833. // 32 pixel loop.
  3834. convertloop32:
  3835. vmovdqu ymm0, [esi] // alpha
  3836. vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
  3837. vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
  3838. vpxor ymm3, ymm3, ymm5 // a, 255-a
  3839. vpxor ymm0, ymm0, ymm5 // a, 255-a
  3840. vmovdqu ymm1, [eax + esi] // src0
  3841. vmovdqu ymm2, [edx + esi] // src1
  3842. vpunpckhbw ymm4, ymm1, ymm2
  3843. vpunpcklbw ymm1, ymm1, ymm2
  3844. vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
  3845. vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
  3846. vpmaddubsw ymm3, ymm3, ymm4
  3847. vpmaddubsw ymm0, ymm0, ymm1
  3848. vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
  3849. vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
  3850. vpsrlw ymm3, ymm3, 8
  3851. vpsrlw ymm0, ymm0, 8
  3852. vpackuswb ymm0, ymm0, ymm3
  3853. vmovdqu [edi + esi], ymm0
  3854. lea esi, [esi + 32]
  3855. sub ecx, 32
  3856. jg convertloop32
  3857. pop edi
  3858. pop esi
  3859. vzeroupper
  3860. ret
  3861. }
  3862. }
  3863. #endif // HAS_BLENDPLANEROW_AVX2
  3864. #ifdef HAS_ARGBBLENDROW_SSSE3
  3865. // Shuffle table for isolating alpha.
  3866. static const uvec8 kShuffleAlpha = {
  3867. 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3868. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  3869. };
  3870. // Blend 8 pixels at a time.
  3871. __declspec(naked)
  3872. void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  3873. uint8* dst_argb, int width) {
  3874. __asm {
  3875. push esi
  3876. mov eax, [esp + 4 + 4] // src_argb0
  3877. mov esi, [esp + 4 + 8] // src_argb1
  3878. mov edx, [esp + 4 + 12] // dst_argb
  3879. mov ecx, [esp + 4 + 16] // width
  3880. pcmpeqb xmm7, xmm7 // generate constant 0x0001
  3881. psrlw xmm7, 15
  3882. pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
  3883. psrlw xmm6, 8
  3884. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3885. psllw xmm5, 8
  3886. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  3887. pslld xmm4, 24
  3888. sub ecx, 4
  3889. jl convertloop4b // less than 4 pixels?
  3890. // 4 pixel loop.
  3891. convertloop4:
  3892. movdqu xmm3, [eax] // src argb
  3893. lea eax, [eax + 16]
  3894. movdqa xmm0, xmm3 // src argb
  3895. pxor xmm3, xmm4 // ~alpha
  3896. movdqu xmm2, [esi] // _r_b
  3897. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3898. pand xmm2, xmm6 // _r_b
  3899. paddw xmm3, xmm7 // 256 - alpha
  3900. pmullw xmm2, xmm3 // _r_b * alpha
  3901. movdqu xmm1, [esi] // _a_g
  3902. lea esi, [esi + 16]
  3903. psrlw xmm1, 8 // _a_g
  3904. por xmm0, xmm4 // set alpha to 255
  3905. pmullw xmm1, xmm3 // _a_g * alpha
  3906. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3907. paddusb xmm0, xmm2 // + src argb
  3908. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3909. paddusb xmm0, xmm1 // + src argb
  3910. movdqu [edx], xmm0
  3911. lea edx, [edx + 16]
  3912. sub ecx, 4
  3913. jge convertloop4
  3914. convertloop4b:
  3915. add ecx, 4 - 1
  3916. jl convertloop1b
  3917. // 1 pixel loop.
  3918. convertloop1:
  3919. movd xmm3, [eax] // src argb
  3920. lea eax, [eax + 4]
  3921. movdqa xmm0, xmm3 // src argb
  3922. pxor xmm3, xmm4 // ~alpha
  3923. movd xmm2, [esi] // _r_b
  3924. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3925. pand xmm2, xmm6 // _r_b
  3926. paddw xmm3, xmm7 // 256 - alpha
  3927. pmullw xmm2, xmm3 // _r_b * alpha
  3928. movd xmm1, [esi] // _a_g
  3929. lea esi, [esi + 4]
  3930. psrlw xmm1, 8 // _a_g
  3931. por xmm0, xmm4 // set alpha to 255
  3932. pmullw xmm1, xmm3 // _a_g * alpha
  3933. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3934. paddusb xmm0, xmm2 // + src argb
  3935. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3936. paddusb xmm0, xmm1 // + src argb
  3937. movd [edx], xmm0
  3938. lea edx, [edx + 4]
  3939. sub ecx, 1
  3940. jge convertloop1
  3941. convertloop1b:
  3942. pop esi
  3943. ret
  3944. }
  3945. }
  3946. #endif // HAS_ARGBBLENDROW_SSSE3
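// Per-channel sketch of the blend performed above (not part of the original
// source): src_argb0 is composited over src_argb1, i.e.
//   out_c = sat(src_c + ((dst_c * (256 - src_a)) >> 8))
// and the output alpha is forced to 255 by the por with the 0xff000000 mask.
static __inline uint8 ARGBBlendChannel_Ref(uint8 src_c, uint8 src_a,
                                           uint8 dst_c) {
  int b = src_c + ((dst_c * (256 - src_a)) >> 8);
  return (uint8)(b > 255 ? 255 : b);  // paddusb saturation
}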
  3947. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3948. // Shuffle table duplicating alpha.
  3949. static const uvec8 kShuffleAlpha0 = {
  3950. 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  3951. };
  3952. static const uvec8 kShuffleAlpha1 = {
  3953. 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3954. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  3955. };
  3956. __declspec(naked)
  3957. void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3958. __asm {
  3959. mov eax, [esp + 4] // src_argb0
  3960. mov edx, [esp + 8] // dst_argb
  3961. mov ecx, [esp + 12] // width
  3962. pcmpeqb xmm3, xmm3 // generate mask 0xff000000
  3963. pslld xmm3, 24
  3964. movdqa xmm4, xmmword ptr kShuffleAlpha0
  3965. movdqa xmm5, xmmword ptr kShuffleAlpha1
  3966. convertloop:
  3967. movdqu xmm0, [eax] // read 4 pixels
  3968. pshufb xmm0, xmm4 // isolate first 2 alphas
  3969. movdqu xmm1, [eax] // read 4 pixels
  3970. punpcklbw xmm1, xmm1 // first 2 pixel rgbs
  3971. pmulhuw xmm0, xmm1 // rgb * a
  3972. movdqu xmm1, [eax] // read 4 pixels
  3973. pshufb xmm1, xmm5 // isolate next 2 alphas
  3974. movdqu xmm2, [eax] // read 4 pixels
  3975. punpckhbw xmm2, xmm2 // next 2 pixel rgbs
  3976. pmulhuw xmm1, xmm2 // rgb * a
  3977. movdqu xmm2, [eax] // mask original alpha
  3978. lea eax, [eax + 16]
  3979. pand xmm2, xmm3
  3980. psrlw xmm0, 8
  3981. psrlw xmm1, 8
  3982. packuswb xmm0, xmm1
  3983. por xmm0, xmm2 // copy original alpha
  3984. movdqu [edx], xmm0
  3985. lea edx, [edx + 16]
  3986. sub ecx, 4
  3987. jg convertloop
  3988. ret
  3989. }
  3990. }
  3991. #endif // HAS_ARGBATTENUATEROW_SSSE3
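// Scalar sketch of the attenuate math (not part of the original source): each
// color channel is scaled by its alpha, roughly c * a / 255, via the
// duplicated-byte pmulhuw trick; the original alpha byte is OR'd back in.
static __inline uint8 Attenuate_Ref(uint8 c, uint8 a) {
  return (uint8)((((uint32)c * 257u) * ((uint32)a * 257u)) >> 24);
}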
  3992. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3993. // Shuffle table duplicating alpha.
  3994. static const uvec8 kShuffleAlpha_AVX2 = {
  3995. 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
  3996. };
  3997. __declspec(naked)
  3998. void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  3999. __asm {
  4000. mov eax, [esp + 4] // src_argb0
  4001. mov edx, [esp + 8] // dst_argb
  4002. mov ecx, [esp + 12] // width
  4003. sub edx, eax
  4004. vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
  4005. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
  4006. vpslld ymm5, ymm5, 24
  4007. convertloop:
  4008. vmovdqu ymm6, [eax] // read 8 pixels.
  4009. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4010. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4011. vpshufb ymm2, ymm0, ymm4 // low 4 alphas
  4012. vpshufb ymm3, ymm1, ymm4 // high 4 alphas
  4013. vpmulhuw ymm0, ymm0, ymm2 // rgb * a
  4014. vpmulhuw ymm1, ymm1, ymm3 // rgb * a
  4015. vpand ymm6, ymm6, ymm5 // isolate alpha
  4016. vpsrlw ymm0, ymm0, 8
  4017. vpsrlw ymm1, ymm1, 8
  4018. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4019. vpor ymm0, ymm0, ymm6 // copy original alpha
  4020. vmovdqu [eax + edx], ymm0
  4021. lea eax, [eax + 32]
  4022. sub ecx, 8
  4023. jg convertloop
  4024. vzeroupper
  4025. ret
  4026. }
  4027. }
  4028. #endif // HAS_ARGBATTENUATEROW_AVX2
  4029. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  4030. // Unattenuate 4 pixels at a time.
  4031. __declspec(naked)
  4032. void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  4033. int width) {
  4034. __asm {
  4035. push ebx
  4036. push esi
  4037. push edi
  4038. mov eax, [esp + 12 + 4] // src_argb
  4039. mov edx, [esp + 12 + 8] // dst_argb
  4040. mov ecx, [esp + 12 + 12] // width
  4041. lea ebx, fixed_invtbl8
  4042. convertloop:
  4043. movdqu xmm0, [eax] // read 4 pixels
  4044. movzx esi, byte ptr [eax + 3] // first alpha
  4045. movzx edi, byte ptr [eax + 7] // second alpha
  4046. punpcklbw xmm0, xmm0 // first 2
  4047. movd xmm2, dword ptr [ebx + esi * 4]
  4048. movd xmm3, dword ptr [ebx + edi * 4]
  4049. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
  4050. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4051. movlhps xmm2, xmm3
  4052. pmulhuw xmm0, xmm2 // rgb * a
  4053. movdqu xmm1, [eax] // read 4 pixels
  4054. movzx esi, byte ptr [eax + 11] // third alpha
4055. movzx edi, byte ptr [eax + 15] // fourth alpha
  4056. punpckhbw xmm1, xmm1 // next 2
  4057. movd xmm2, dword ptr [ebx + esi * 4]
  4058. movd xmm3, dword ptr [ebx + edi * 4]
  4059. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
  4060. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4061. movlhps xmm2, xmm3
  4062. pmulhuw xmm1, xmm2 // rgb * a
  4063. lea eax, [eax + 16]
  4064. packuswb xmm0, xmm1
  4065. movdqu [edx], xmm0
  4066. lea edx, [edx + 16]
  4067. sub ecx, 4
  4068. jg convertloop
  4069. pop edi
  4070. pop esi
  4071. pop ebx
  4072. ret
  4073. }
  4074. }
  4075. #endif // HAS_ARGBUNATTENUATEROW_SSE2
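// Conceptual sketch only (not part of the original source): unattenuate is
// the inverse of attenuate, roughly c * 255 / a clamped to 255.  The SIMD
// code uses the fixed_invtbl8 reciprocal table defined earlier in this file,
// so its rounding and its a == 0 handling differ from this exact form.
static __inline uint8 Unattenuate_Ref(uint8 c, uint8 a) {
  unsigned v = a ? ((unsigned)c * 255u) / a : 0u;  // a == 0 is a placeholder
  return (uint8)(v > 255u ? 255u : v);
}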
  4076. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4077. // Shuffle table duplicating alpha.
  4078. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4079. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
  4080. };
  4081. // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
  4082. // USE_GATHER is not on by default, due to being a slow instruction.
  4083. #ifdef USE_GATHER
  4084. __declspec(naked)
  4085. void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  4086. int width) {
  4087. __asm {
  4088. mov eax, [esp + 4] // src_argb0
  4089. mov edx, [esp + 8] // dst_argb
  4090. mov ecx, [esp + 12] // width
  4091. sub edx, eax
  4092. vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
  4093. convertloop:
  4094. vmovdqu ymm6, [eax] // read 8 pixels.
  4095. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
  4096. vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
  4097. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4098. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4099. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
  4100. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4101. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4102. vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
  4103. vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
  4104. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4105. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4106. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4107. vmovdqu [eax + edx], ymm0
  4108. lea eax, [eax + 32]
  4109. sub ecx, 8
  4110. jg convertloop
  4111. vzeroupper
  4112. ret
  4113. }
  4114. }
  4115. #else // USE_GATHER
  4116. __declspec(naked)
  4117. void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  4118. int width) {
  4119. __asm {
  4120. push ebx
  4121. push esi
  4122. push edi
  4123. mov eax, [esp + 12 + 4] // src_argb
  4124. mov edx, [esp + 12 + 8] // dst_argb
  4125. mov ecx, [esp + 12 + 12] // width
  4126. sub edx, eax
  4127. lea ebx, fixed_invtbl8
  4128. vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
  4129. convertloop:
  4130. // replace VPGATHER
  4131. movzx esi, byte ptr [eax + 3] // alpha0
  4132. movzx edi, byte ptr [eax + 7] // alpha1
  4133. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
  4134. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
  4135. movzx esi, byte ptr [eax + 11] // alpha2
  4136. movzx edi, byte ptr [eax + 15] // alpha3
  4137. vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
  4138. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
  4139. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
  4140. movzx esi, byte ptr [eax + 19] // alpha4
  4141. movzx edi, byte ptr [eax + 23] // alpha5
  4142. vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
  4143. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
  4144. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
  4145. movzx esi, byte ptr [eax + 27] // alpha6
  4146. movzx edi, byte ptr [eax + 31] // alpha7
  4147. vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
  4148. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
  4149. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
  4150. vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
  4151. vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
  4152. vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
  4153. vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
  4154. // end of VPGATHER
  4155. vmovdqu ymm6, [eax] // read 8 pixels.
  4156. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4157. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4158. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4159. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4160. vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
  4161. vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
  4162. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4163. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4164. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4165. vmovdqu [eax + edx], ymm0
  4166. lea eax, [eax + 32]
  4167. sub ecx, 8
  4168. jg convertloop
  4169. pop edi
  4170. pop esi
  4171. pop ebx
  4172. vzeroupper
  4173. ret
  4174. }
  4175. }
  4176. #endif // USE_GATHER
  4177. #endif // HAS_ARGBATTENUATEROW_AVX2
  4178. #ifdef HAS_ARGBGRAYROW_SSSE3
  4179. // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
  4180. __declspec(naked)
  4181. void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  4182. __asm {
  4183. mov eax, [esp + 4] /* src_argb */
  4184. mov edx, [esp + 8] /* dst_argb */
  4185. mov ecx, [esp + 12] /* width */
  4186. movdqa xmm4, xmmword ptr kARGBToYJ
  4187. movdqa xmm5, xmmword ptr kAddYJ64
  4188. convertloop:
  4189. movdqu xmm0, [eax] // G
  4190. movdqu xmm1, [eax + 16]
  4191. pmaddubsw xmm0, xmm4
  4192. pmaddubsw xmm1, xmm4
  4193. phaddw xmm0, xmm1
  4194. paddw xmm0, xmm5 // Add .5 for rounding.
  4195. psrlw xmm0, 7
  4196. packuswb xmm0, xmm0 // 8 G bytes
  4197. movdqu xmm2, [eax] // A
  4198. movdqu xmm3, [eax + 16]
  4199. lea eax, [eax + 32]
  4200. psrld xmm2, 24
  4201. psrld xmm3, 24
  4202. packuswb xmm2, xmm3
  4203. packuswb xmm2, xmm2 // 8 A bytes
  4204. movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
  4205. punpcklbw xmm0, xmm0 // 8 GG words
  4206. punpcklbw xmm3, xmm2 // 8 GA words
  4207. movdqa xmm1, xmm0
  4208. punpcklwd xmm0, xmm3 // GGGA first 4
  4209. punpckhwd xmm1, xmm3 // GGGA next 4
  4210. movdqu [edx], xmm0
  4211. movdqu [edx + 16], xmm1
  4212. lea edx, [edx + 32]
  4213. sub ecx, 8
  4214. jg convertloop
  4215. ret
  4216. }
  4217. }
  4218. #endif // HAS_ARGBGRAYROW_SSSE3
  4219. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4220. // b = (r * 35 + g * 68 + b * 17) >> 7
  4221. // g = (r * 45 + g * 88 + b * 22) >> 7
  4222. // r = (r * 50 + g * 98 + b * 24) >> 7
  4223. // Constant for ARGB color to sepia tone.
  4224. static const vec8 kARGBToSepiaB = {
  4225. 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
  4226. };
  4227. static const vec8 kARGBToSepiaG = {
  4228. 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
  4229. };
  4230. static const vec8 kARGBToSepiaR = {
  4231. 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
  4232. };
  4233. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  4234. __declspec(naked)
  4235. void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  4236. __asm {
  4237. mov eax, [esp + 4] /* dst_argb */
  4238. mov ecx, [esp + 8] /* width */
  4239. movdqa xmm2, xmmword ptr kARGBToSepiaB
  4240. movdqa xmm3, xmmword ptr kARGBToSepiaG
  4241. movdqa xmm4, xmmword ptr kARGBToSepiaR
  4242. convertloop:
  4243. movdqu xmm0, [eax] // B
  4244. movdqu xmm6, [eax + 16]
  4245. pmaddubsw xmm0, xmm2
  4246. pmaddubsw xmm6, xmm2
  4247. phaddw xmm0, xmm6
  4248. psrlw xmm0, 7
  4249. packuswb xmm0, xmm0 // 8 B values
  4250. movdqu xmm5, [eax] // G
  4251. movdqu xmm1, [eax + 16]
  4252. pmaddubsw xmm5, xmm3
  4253. pmaddubsw xmm1, xmm3
  4254. phaddw xmm5, xmm1
  4255. psrlw xmm5, 7
  4256. packuswb xmm5, xmm5 // 8 G values
  4257. punpcklbw xmm0, xmm5 // 8 BG values
  4258. movdqu xmm5, [eax] // R
  4259. movdqu xmm1, [eax + 16]
  4260. pmaddubsw xmm5, xmm4
  4261. pmaddubsw xmm1, xmm4
  4262. phaddw xmm5, xmm1
  4263. psrlw xmm5, 7
  4264. packuswb xmm5, xmm5 // 8 R values
  4265. movdqu xmm6, [eax] // A
  4266. movdqu xmm1, [eax + 16]
  4267. psrld xmm6, 24
  4268. psrld xmm1, 24
  4269. packuswb xmm6, xmm1
  4270. packuswb xmm6, xmm6 // 8 A values
  4271. punpcklbw xmm5, xmm6 // 8 RA values
  4272. movdqa xmm1, xmm0 // Weave BG, RA together
  4273. punpcklwd xmm0, xmm5 // BGRA first 4
  4274. punpckhwd xmm1, xmm5 // BGRA next 4
  4275. movdqu [eax], xmm0
  4276. movdqu [eax + 16], xmm1
  4277. lea eax, [eax + 32]
  4278. sub ecx, 8
  4279. jg convertloop
  4280. ret
  4281. }
  4282. }
  4283. #endif // HAS_ARGBSEPIAROW_SSSE3
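// Scalar sketch of the sepia transform above (not part of the original
// source): all three outputs are computed from the original b/g/r and then
// saturated by packuswb; alpha is preserved.
static __inline void Sepia_Ref(uint8* b, uint8* g, uint8* r) {
  int b0 = *b, g0 = *g, r0 = *r;
  int nb = (r0 * 35 + g0 * 68 + b0 * 17) >> 7;
  int ng = (r0 * 45 + g0 * 88 + b0 * 22) >> 7;
  int nr = (r0 * 50 + g0 * 98 + b0 * 24) >> 7;
  *b = (uint8)(nb > 255 ? 255 : nb);
  *g = (uint8)(ng > 255 ? 255 : ng);
  *r = (uint8)(nr > 255 ? 255 : nr);
}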
  4284. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4285. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  4286. // Same as Sepia except matrix is provided.
  4287. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
  4288. // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
  4289. __declspec(naked)
  4290. void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  4291. const int8* matrix_argb, int width) {
  4292. __asm {
  4293. mov eax, [esp + 4] /* src_argb */
  4294. mov edx, [esp + 8] /* dst_argb */
  4295. mov ecx, [esp + 12] /* matrix_argb */
  4296. movdqu xmm5, [ecx]
  4297. pshufd xmm2, xmm5, 0x00
  4298. pshufd xmm3, xmm5, 0x55
  4299. pshufd xmm4, xmm5, 0xaa
  4300. pshufd xmm5, xmm5, 0xff
  4301. mov ecx, [esp + 16] /* width */
  4302. convertloop:
  4303. movdqu xmm0, [eax] // B
  4304. movdqu xmm7, [eax + 16]
  4305. pmaddubsw xmm0, xmm2
  4306. pmaddubsw xmm7, xmm2
  4307. movdqu xmm6, [eax] // G
  4308. movdqu xmm1, [eax + 16]
  4309. pmaddubsw xmm6, xmm3
  4310. pmaddubsw xmm1, xmm3
  4311. phaddsw xmm0, xmm7 // B
  4312. phaddsw xmm6, xmm1 // G
  4313. psraw xmm0, 6 // B
  4314. psraw xmm6, 6 // G
  4315. packuswb xmm0, xmm0 // 8 B values
  4316. packuswb xmm6, xmm6 // 8 G values
  4317. punpcklbw xmm0, xmm6 // 8 BG values
  4318. movdqu xmm1, [eax] // R
  4319. movdqu xmm7, [eax + 16]
  4320. pmaddubsw xmm1, xmm4
  4321. pmaddubsw xmm7, xmm4
  4322. phaddsw xmm1, xmm7 // R
  4323. movdqu xmm6, [eax] // A
  4324. movdqu xmm7, [eax + 16]
  4325. pmaddubsw xmm6, xmm5
  4326. pmaddubsw xmm7, xmm5
  4327. phaddsw xmm6, xmm7 // A
  4328. psraw xmm1, 6 // R
  4329. psraw xmm6, 6 // A
  4330. packuswb xmm1, xmm1 // 8 R values
  4331. packuswb xmm6, xmm6 // 8 A values
  4332. punpcklbw xmm1, xmm6 // 8 RA values
  4333. movdqa xmm6, xmm0 // Weave BG, RA together
  4334. punpcklwd xmm0, xmm1 // BGRA first 4
  4335. punpckhwd xmm6, xmm1 // BGRA next 4
  4336. movdqu [edx], xmm0
  4337. movdqu [edx + 16], xmm6
  4338. lea eax, [eax + 32]
  4339. lea edx, [edx + 32]
  4340. sub ecx, 8
  4341. jg convertloop
  4342. ret
  4343. }
  4344. }
  4345. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
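// Reference sketch (not compiled): the same color-matrix transform in scalar
// form. matrix_argb is read above as four rows of four signed coefficients,
// one row per output channel in B, G, R, A order; each output is a dot
// product of the input (b, g, r, a) with that row, shifted right by 6 and
// saturated (psraw 6 + packuswb). The 16-bit saturation inside
// pmaddubsw/phaddsw is ignored here.
#if 0
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
    for (int ch = 0; ch < 4; ++ch) {
      const int8* m = matrix_argb + ch * 4;  // coefficient row for channel ch
      int v = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6;
      dst_argb[ch] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
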
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
__declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    movd xmm2, [esp + 8] /* scale */
    movd xmm3, [esp + 12] /* interval_size */
    movd xmm4, [esp + 16] /* interval_offset */
    mov ecx, [esp + 20] /* width */
    pshuflw xmm2, xmm2, 040h
    pshufd xmm2, xmm2, 044h
    pshuflw xmm3, xmm3, 040h
    pshufd xmm3, xmm3, 044h
    pshuflw xmm4, xmm4, 040h
    pshufd xmm4, xmm4, 044h
    pxor xmm5, xmm5 // constant 0
    pcmpeqb xmm6, xmm6 // generate mask 0xff000000
    pslld xmm6, 24
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    punpcklbw xmm0, xmm5 // first 2 pixels
    pmulhuw xmm0, xmm2 // pixel * scale >> 16
    movdqu xmm1, [eax] // read 4 pixels
    punpckhbw xmm1, xmm5 // next 2 pixels
    pmulhuw xmm1, xmm2
    pmullw xmm0, xmm3 // * interval_size
    movdqu xmm7, [eax] // read 4 pixels
    pmullw xmm1, xmm3
    pand xmm7, xmm6 // mask alpha
    paddw xmm0, xmm4 // + interval_size / 2
    paddw xmm1, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm7
    movdqu [eax], xmm0
    lea eax, [eax + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBQUANTIZEROW_SSE2

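// Reference sketch (not compiled): per-channel posterization as performed by
// the row above. Each of B, G, R becomes ((v * scale) >> 16) * interval_size
// + interval_offset; alpha is carried through unchanged via the 0xff000000
// mask. Name and layout are illustrative only.
#if 0
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 3; ++ch) {  // B, G, R; alpha (index 3) untouched
      int v = dst_argb[ch];
      dst_argb[ch] = (uint8)((v * scale >> 16) * interval_size +
                             interval_offset);
    }
    dst_argb += 4;
  }
}
#endif
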
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
__declspec(naked)
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    movd xmm2, [esp + 16] // value
    punpcklbw xmm2, xmm2
    punpcklqdq xmm2, xmm2
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    pmulhuw xmm0, xmm2 // argb * value
    pmulhuw xmm1, xmm2 // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBSHADEROW_SSE2

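// Reference sketch (not compiled): the shade operation in scalar form. Both
// the pixel byte and the matching byte of 'value' are widened by duplication
// (x * 0x101, which is what punpcklbw reg,reg does), multiplied, and the top
// byte of the 32-bit product kept (pmulhuw >> 16 followed by psrlw 8), which
// behaves like dst = src * value / 255 with truncation.
#if 0
static void ARGBShadeRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                int width, uint32 value) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      uint32 p = src_argb[ch];
      uint32 v = (value >> (ch * 8)) & 0xff;  // B, G, R, A bytes of value
      dst_argb[ch] = (uint8)(((p * 0x101u) * (v * 0x101u)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
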
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked)
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pxor xmm5, xmm5 // constant 0
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    movdqu xmm2, [esi] // read 4 pixels from src_argb1
    movdqu xmm1, xmm0
    movdqu xmm3, xmm2
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    punpcklbw xmm2, xmm5 // first 2
    punpckhbw xmm3, xmm5 // next 2
    pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_SSE2

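// Reference sketch (not compiled): the multiply blend in scalar form. One
// source byte is widened by duplication (p * 0x101), the other is used as-is,
// and pmulhuw keeps the top 16 bits, so dst is approximately
// src0 * src1 / 255 with truncation. The AVX2 variant below does the same on
// 8 pixels per iteration.
#if 0
static void ARGBMultiplyRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1, uint8* dst_argb,
                                   int width) {
  for (int i = 0; i < width * 4; ++i) {
    uint32 p = src_argb0[i];
    uint32 q = src_argb1[i];
    dst_argb[i] = (uint8)((p * 0x101u * q) >> 16);
  }
}
#endif
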
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked)
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub ecx, 4
    jl convertloop49
 convertloop4:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4
 convertloop49:
    add ecx, 4 - 1
    jl convertloop19
 convertloop1:
    movd xmm0, [eax] // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi] // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1
 convertloop19:
    pop esi
    ret
  }
}
#endif // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
__declspec(naked)
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1 // src_argb0 - src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked)
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    vpxor ymm5, ymm5, ymm5 // constant 0
 convertloop:
    vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1 // low 4
    vpunpckhbw ymm1, ymm1, ymm1 // high 4
    vpunpcklbw ymm2, ymm3, ymm5 // low 4
    vpunpckhbw ymm3, ymm3, ymm5 // high 4
    vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked)
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
 convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
__declspec(naked)
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
 convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
__declspec(naked)
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y0
    mov esi, [esp + 8 + 8] // src_y1
    mov edi, [esp + 8 + 12] // src_y2
    mov edx, [esp + 8 + 16] // dst_sobelx
    mov ecx, [esp + 8 + 20] // width
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0
 convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_SOBELXROW_SSE2

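// Reference sketch (not compiled): scalar form of the SobelX filter above.
// Each output is |(y0[i] - y0[i+2]) + 2*(y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
// clamped to 255 (packuswb does the clamping in the SIMD path). SobelYRow
// below is the same computation with the offsets taken along the row instead
// of down the column.
#if 0
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int sobel = (src_y0[i] - src_y0[i + 2]) +
                2 * (src_y1[i] - src_y1[i + 2]) +
                (src_y2[i] - src_y2[i + 2]);
    if (sobel < 0) sobel = -sobel;  // pmaxsw(x, -x)
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}
#endif
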
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
__declspec(naked)
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_y0
    mov esi, [esp + 4 + 8] // src_y1
    mov edx, [esp + 4 + 12] // dst_sobely
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0
 convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked)
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
    pslld xmm5, 24 // 0xff000000
 convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqa xmm2, xmm0 // GG
    punpcklbw xmm2, xmm0 // First 8
    punpckhbw xmm0, xmm0 // Next 8
    movdqa xmm1, xmm2 // GGGG
    punpcklwd xmm1, xmm2 // First 4
    punpckhwd xmm2, xmm2 // Next 4
    por xmm1, xmm5 // GGGA
    por xmm2, xmm5
    movdqa xmm3, xmm0 // GGGG
    punpcklwd xmm3, xmm0 // Next 4
    punpckhwd xmm0, xmm0 // Last 4
    por xmm3, xmm5 // GGGA
    por xmm0, xmm5
    movdqu [edx], xmm1
    movdqu [edx + 16], xmm2
    movdqu [edx + 32], xmm3
    movdqu [edx + 48], xmm0
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELROW_SSE2

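// Reference sketch (not compiled): the packing done above in scalar form.
// sobel is the saturating sum of the X and Y magnitudes, replicated into
// B, G and R with alpha forced to 255.
#if 0
static void SobelRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                            uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8 g = (uint8)(s > 255 ? 255 : s);  // paddusb saturates
    dst_argb[0] = g;     // B
    dst_argb[1] = g;     // G
    dst_argb[2] = g;     // R
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}
#endif
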
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked)
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_y
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
 convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked)
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
 convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1 // sobel = sobelx + sobely
    movdqa xmm3, xmm0 // XA
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1 // YS
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4 // YSXA
    punpcklwd xmm6, xmm3 // First 4
    punpckhwd xmm4, xmm3 // Next 4
    movdqa xmm7, xmm1 // YSXA
    punpcklwd xmm7, xmm0 // Next 4
    punpckhwd xmm1, xmm0 // Last 4
    movdqu [edx], xmm6
    movdqu [edx + 16], xmm4
    movdqu [edx + 32], xmm7
    movdqu [edx + 48], xmm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELXYROW_SSE2

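// Reference sketch (not compiled): the SobelXY packing in scalar form,
// matching the channel assignment documented above (B = Sobel Y,
// G = combined Sobel, R = Sobel X, A = 255).
#if 0
static void SobelXYRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                              uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];               // B
    dst_argb[1] = (uint8)(s > 255 ? 255 : s);  // G
    dst_argb[2] = src_sobelx[i];               // R
    dst_argb[3] = 255u;                        // A
    dst_argb += 4;
  }
}
#endif
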
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov eax, topleft // eax topleft
    mov esi, botleft // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5 // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b
    cmp area, 128 // 128 pixels will not overflow 15 bits.
    ja l4
    pshufd xmm5, xmm5, 0 // area
    pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6 // (65536.0 + area - 1)
    mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5 // 0.16 fixed point
    packssdw xmm5, xmm5 // 16 bit shorts
    // 4 pixel loop small blocks.
 s4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]
    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]
    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]
    packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3
    pmulhuw xmm0, xmm5
    pmulhuw xmm2, xmm5
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4
    jmp l4b
    // 4 pixel loop
 l4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]
    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]
    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]
    cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4
 l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
 l1:
    movdqu xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
 l1b:
  }
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

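// Reference sketch (not compiled): what one output pixel of the loops above
// computes. topleft/botleft walk an integral (cumulative-sum) image holding
// 4 int32 channel sums per pixel; the boxed sum is TL - TR - BL + BR, and the
// average is that sum divided by 'area'. 'width' is in int32 units as
// documented above. The small-block path folds 1/area into a 16-bit
// fixed-point multiplier instead of using floats; rounding shown here is
// approximate.
#if 0
static void CumulativeSumToAverage_Sketch(const int32* topleft,
                                          const int32* botleft, int width,
                                          int area, uint8* dst, int count) {
  for (int i = 0; i < count; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      int32 sum = topleft[ch] - topleft[width + ch] -
                  botleft[ch] + botleft[width + ch];
      int v = (int)(sum / (float)area + 0.5f);
      dst[ch] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    topleft += 4;  // advance one pixel (16 bytes), as the asm does
    botleft += 4;
    dst += 4;
  }
}
#endif
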
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0
    pxor xmm1, xmm1
    sub ecx, 4
    jl l4b
    test edx, 15
    jne l4b
    // 4 pixel loop
 l4:
    movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2
    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1
    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi] // previous row above.
    paddd xmm2, xmm0
    paddd xmm0, xmm3
    movdqu xmm3, [esi + 16]
    paddd xmm3, xmm0
    paddd xmm0, xmm4
    movdqu xmm4, [esi + 32]
    paddd xmm4, xmm0
    paddd xmm0, xmm5
    movdqu xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    movdqu [edx + 32], xmm4
    movdqu [edx + 48], xmm5
    lea edx, [edx + 64]
    sub ecx, 4
    jge l4
 l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
 l1:
    movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1
 l1b:
  }
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2

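// Reference sketch (not compiled): scalar form of the cumulative-sum row.
// Each of the 4 ARGB channels accumulates left to right along the row, and
// the running sums are added to the corresponding entries of the row above
// (previous_cumsum), so every output holds the sum of everything above and
// to the left of it.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {
      row_sum[ch] += row[x * 4 + ch];
      cumsum[x * 4 + ch] = row_sum[ch] + previous_cumsum[x * 4 + ch];
    }
  }
}
#endif
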
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked)
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 12] // src_argb
    mov esi, [esp + 16] // stride
    mov edx, [esp + 20] // dst_argb
    mov ecx, [esp + 24] // pointer to uv_dudv
    movq xmm2, qword ptr [ecx] // uv
    movq xmm7, qword ptr [ecx + 8] // dudv
    mov ecx, [esp + 28] // width
    shl esi, 16 // 4, stride
    add esi, 4
    movd xmm5, esi
    sub ecx, 4
    jl l4b
    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44 // dup dudv
    pshufd xmm5, xmm5, 0 // dup 4, stride
    movdqa xmm0, xmm2 // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4 // dudv *= 2
    movdqa xmm3, xmm2 // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4 // dudv *= 4
    // 4 pixel loop
 l4:
    cvttps2dq xmm0, xmm2 // x, y float to int first 2
    cvttps2dq xmm1, xmm3 // x, y float to int next 2
    packssdw xmm0, xmm1 // x, y as 8 shorts
    pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd xmm1, [eax + esi] // read pixel 0
    movd xmm6, [eax + edi] // read pixel 1
    punpckldq xmm1, xmm6 // combine pixel 0 and 1
    addps xmm2, xmm4 // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi] // read pixel 2
    movd xmm0, [eax + edi] // read pixel 3
    punpckldq xmm6, xmm0 // combine pixel 2 and 3
    addps xmm3, xmm4 // x, y += dx, dy next 2
    movq qword ptr 8[edx], xmm6
    lea edx, [edx + 16]
    sub ecx, 4
    jge l4
 l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
 l1:
    cvttps2dq xmm0, xmm2 // x, y float to int
    packssdw xmm0, xmm0 // x, y as shorts
    pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
    addps xmm2, xmm7 // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi] // copy a pixel
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge l1
 l1b:
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBAFFINEROW_SSE2

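// Reference sketch (not compiled): the affine walk above in scalar form.
// uv_dudv packs {u, v, du, dv}. For each destination pixel the source texel
// at (int)u, (int)v is copied and the coordinates are stepped by (du, dv);
// src_argb_stride is in bytes, matching the pmaddwd offset = x * 4 + y * stride.
#if 0
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // cvttps2dq truncates toward zero
    int y = (int)v;
    const uint8* src = src_argb + y * src_argb_stride + x * 4;
    dst_argb[0] = src[0];  // copy one ARGB pixel
    dst_argb[1] = src[1];
    dst_argb[2] = src[2];
    dst_argb[3] = src[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif
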
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
__declspec(naked)
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    sub edi, esi
    cmp eax, 128
    je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
    vmovd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    vmovd xmm5, eax // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5
    mov eax, 0x80808080 // 128b for bias and rounding.
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
 xloop:
    vmovdqu ymm0, [esi]
    vmovdqu ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2 // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb ymm1, ymm1, ymm4 // bias to signed image
    vpsubb ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw ymm1, ymm1, ymm4 // unbias and round
    vpaddw ymm0, ymm0, ymm4
    vpsrlw ymm1, ymm1, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutates
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop
    jmp xloop99
    // Blend 50 / 50.
 xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop50
    jmp xloop99
    // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    rep movsb
 xloop99:
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_INTERPOLATEROW_AVX2

// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    sub edi, esi
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    cmp eax, 128
    je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
    movd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    movd xmm5, eax // low fraction 256..1
    punpcklbw xmm5, xmm0
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0
    mov eax, 0x80808080 // 128 for biasing image to signed.
    movd xmm4, eax
    pshufd xmm4, xmm4, 0x00
 xloop:
    movdqu xmm0, [esi]
    movdqu xmm2, [esi + edx]
    movdqu xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    psubb xmm0, xmm4 // bias image by -128
    psubb xmm1, xmm4
    movdqa xmm2, xmm5
    movdqa xmm3, xmm5
    pmaddubsw xmm2, xmm0
    pmaddubsw xmm3, xmm1
    paddw xmm2, xmm4
    paddw xmm3, xmm4
    psrlw xmm2, 8
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [esi + edi], xmm2
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop
    jmp xloop99
    // Blend 50 / 50.
 xloop50:
    movdqu xmm0, [esi]
    movdqu xmm1, [esi + edx]
    pavgb xmm0, xmm1
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop50
    jmp xloop99
    // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    movdqu xmm0, [esi]
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop100
 xloop99:
    pop edi
    pop esi
    ret
  }
}

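// Reference sketch (not compiled): the blend both interpolate rows perform.
// source_y_fraction selects how much of the second row (src_ptr + src_stride)
// to mix in, in 1/256 units; 0 degenerates to a copy and 128 to an average,
// which is why those cases are special-cased above. The SIMD paths bias the
// bytes by 128 so pmaddubsw can be used; arithmetically that is equivalent to
// this, up to the rounding shown.
#if 0
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int y1 = source_y_fraction;  // weight of the lower row, 0..255
  int y0 = 256 - y1;           // weight of the upper row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 +
                          src_ptr[x + src_stride] * y1 + 128) >> 8);
  }
}
#endif
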
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // shuffler
    movdqu xmm5, [ecx]
    mov ecx, [esp + 16] // width
 wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pshufb xmm0, xmm5
    pshufb xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}

#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // shuffler
    vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
    mov ecx, [esp + 16] // width
 wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpshufb ymm0, ymm0, ymm5
    vpshufb ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg wloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSHUFFLEROW_AVX2

__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  __asm {
    push ebx
    push esi
    mov eax, [esp + 8 + 4] // src_argb
    mov edx, [esp + 8 + 8] // dst_argb
    mov esi, [esp + 8 + 12] // shuffler
    mov ecx, [esp + 8 + 16] // width
    pxor xmm5, xmm5
    mov ebx, [esi] // shuffler
    cmp ebx, 0x03000102
    je shuf_3012
    cmp ebx, 0x00010203
    je shuf_0123
    cmp ebx, 0x00030201
    je shuf_0321
    cmp ebx, 0x02010003
    je shuf_2103
    // TODO(fbarchard): Use one source pointer and 3 offsets.
 shuf_any1:
    movzx ebx, byte ptr [esi]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx], bl
    movzx ebx, byte ptr [esi + 1]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx + 1], bl
    movzx ebx, byte ptr [esi + 2]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx + 2], bl
    movzx ebx, byte ptr [esi + 3]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx + 3], bl
    lea eax, [eax + 4]
    lea edx, [edx + 4]
    sub ecx, 1
    jg shuf_any1
    jmp shuf99
 shuf_0123:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw xmm0, xmm0, 01Bh
    pshufhw xmm1, xmm1, 01Bh
    pshuflw xmm1, xmm1, 01Bh
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_0123
    jmp shuf99
 shuf_0321:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw xmm0, xmm0, 039h
    pshufhw xmm1, xmm1, 039h
    pshuflw xmm1, xmm1, 039h
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_0321
    jmp shuf99
 shuf_2103:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw xmm0, xmm0, 093h
    pshufhw xmm1, xmm1, 093h
    pshuflw xmm1, xmm1, 093h
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_2103
    jmp shuf99
 shuf_3012:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw xmm0, xmm0, 0C6h
    pshufhw xmm1, xmm1, 0C6h
    pshuflw xmm1, xmm1, 0C6h
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_3012
 shuf99:
    pop esi
    pop ebx
    ret
  }
}

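// Reference sketch (not compiled): what every ARGBShuffleRow variant computes.
// 'shuffler' tells each of the 4 output byte lanes which input byte of the
// same pixel to take; the full 16-byte pshufb control repeats that pattern
// per pixel, so the first four entries (reduced modulo 4 here) describe it.
#if 0
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = src_argb[shuffler[0] & 3];
    dst_argb[1] = src_argb[shuffler[1] & 3];
    dst_argb[2] = src_argb[shuffler[2] & 3];
    dst_argb[3] = src_argb[shuffler[3] & 3];
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
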
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
__declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y
    mov esi, [esp + 8 + 8] // src_u
    mov edx, [esp + 8 + 12] // src_v
    mov edi, [esp + 8 + 16] // dst_frame
    mov ecx, [esp + 8 + 20] // width
    sub edx, esi
 convertloop:
    movq xmm2, qword ptr [esi] // U
    movq xmm3, qword ptr [esi + edx] // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3 // UV
    movdqu xmm0, [eax] // Y
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2 // YUYV
    punpckhbw xmm1, xmm2
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm1
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop
    pop edi
    pop esi
    ret
  }
}

__declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y
    mov esi, [esp + 8 + 8] // src_u
    mov edx, [esp + 8 + 12] // src_v
    mov edi, [esp + 8 + 16] // dst_frame
    mov ecx, [esp + 8 + 20] // width
    sub edx, esi
 convertloop:
    movq xmm2, qword ptr [esi] // U
    movq xmm3, qword ptr [esi + edx] // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3 // UV
    movdqu xmm0, [eax] // Y
    movdqa xmm1, xmm2
    lea eax, [eax + 16]
    punpcklbw xmm1, xmm0 // UYVY
    punpckhbw xmm2, xmm0
    movdqu [edi], xmm1
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop
    pop edi
    pop esi
    ret
  }
}

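// Reference sketch (not compiled): the byte order the two packers above
// produce. Each macro-pixel covers two Y samples sharing one U and one V;
// the sketch assumes an even width, as the SIMD paths process pairs.
#if 0
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];  // YUY2: Y0 U Y1 V.  (UYVY emits U Y0 V Y1.)
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}
#endif
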
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* src_argb */
    mov edx, [esp + 4 + 8] /* dst_argb */
    mov esi, [esp + 4 + 12] /* poly */
    mov ecx, [esp + 4 + 16] /* width */
    pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
    // 2 pixel loop.
 convertloop:
    // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
    // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
    movq xmm0, qword ptr [eax] // BGRABGRA
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm3
    movdqa xmm4, xmm0
    punpcklwd xmm0, xmm3 // pixel 0
    punpckhwd xmm4, xmm3 // pixel 1
    cvtdq2ps xmm0, xmm0 // 4 floats
    cvtdq2ps xmm4, xmm4
    movdqa xmm1, xmm0 // X
    movdqa xmm5, xmm4
    mulps xmm0, [esi + 16] // C1 * X
    mulps xmm4, [esi + 16]
    addps xmm0, [esi] // result = C0 + C1 * X
    addps xmm4, [esi]
    movdqa xmm2, xmm1
    movdqa xmm6, xmm5
    mulps xmm2, xmm1 // X * X
    mulps xmm6, xmm5
    mulps xmm1, xmm2 // X * X * X
    mulps xmm5, xmm6
    mulps xmm2, [esi + 32] // C2 * X * X
    mulps xmm6, [esi + 32]
    mulps xmm1, [esi + 48] // C3 * X * X * X
    mulps xmm5, [esi + 48]
    addps xmm0, xmm2 // result += C2 * X * X
    addps xmm4, xmm6
    addps xmm0, xmm1 // result += C3 * X * X * X
    addps xmm4, xmm5
    cvttps2dq xmm0, xmm0
    cvttps2dq xmm4, xmm4
    packuswb xmm0, xmm4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* poly */
    vbroadcastf128 ymm4, [ecx] // C0
    vbroadcastf128 ymm5, [ecx + 16] // C1
    vbroadcastf128 ymm6, [ecx + 32] // C2
    vbroadcastf128 ymm7, [ecx + 48] // C3
    mov ecx, [esp + 16] /* width */
    // 2 pixel loop.
 convertloop:
    vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
    lea eax, [eax + 8]
    vcvtdq2ps ymm0, ymm0 // X 8 floats
    vmulps ymm2, ymm0, ymm0 // X * X
    vmulps ymm3, ymm0, ymm7 // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
    vcvttps2dq ymm0, ymm0
    vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
    vmovq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2

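// Reference sketch (not compiled): the cubic applied per channel by both
// polynomial rows. poly is read above as 4 vectors of 4 floats (C0..C3), one
// coefficient per channel in B, G, R, A order, so channel ch of every pixel
// becomes C0[ch] + C1[ch]*x + C2[ch]*x^2 + C3[ch]*x^3, truncated and
// saturated back to a byte (cvttps2dq + packuswb).
#if 0
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      float x = (float)src_argb[ch];
      float v = poly[ch] + poly[4 + ch] * x + poly[8 + ch] * x * x +
                poly[12 + ch] * x * x * x;
      int iv = (int)v;  // cvttps2dq truncates
      dst_argb[ch] = (uint8)(iv < 0 ? 0 : (iv > 255 ? 255 : iv));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
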
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* dst_argb */
    mov esi, [esp + 4 + 8] /* table_argb */
    mov ecx, [esp + 4 + 12] /* width */
    // 1 pixel loop.
 convertloop:
    movzx edx, byte ptr [eax]
    lea eax, [eax + 4]
    movzx edx, byte ptr [esi + edx * 4]
    mov byte ptr [eax - 4], dl
    movzx edx, byte ptr [eax - 4 + 1]
    movzx edx, byte ptr [esi + edx * 4 + 1]
    mov byte ptr [eax - 4 + 1], dl
    movzx edx, byte ptr [eax - 4 + 2]
    movzx edx, byte ptr [esi + edx * 4 + 2]
    mov byte ptr [eax - 4 + 2], dl
    movzx edx, byte ptr [eax - 4 + 3]
    movzx edx, byte ptr [esi + edx * 4 + 3]
    mov byte ptr [eax - 4 + 3], dl
    dec ecx
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* dst_argb */
    mov esi, [esp + 4 + 8] /* table_argb */
    mov ecx, [esp + 4 + 12] /* width */
    // 1 pixel loop.
 convertloop:
    movzx edx, byte ptr [eax]
    lea eax, [eax + 4]
    movzx edx, byte ptr [esi + edx * 4]
    mov byte ptr [eax - 4], dl
    movzx edx, byte ptr [eax - 4 + 1]
    movzx edx, byte ptr [esi + edx * 4 + 1]
    mov byte ptr [eax - 4 + 1], dl
    movzx edx, byte ptr [eax - 4 + 2]
    movzx edx, byte ptr [esi + edx * 4 + 2]
    mov byte ptr [eax - 4 + 2], dl
    dec ecx
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_RGBCOLORTABLEROW_X86

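// Reference sketch (not compiled): both table rows above in scalar form. Each
// channel indexes its own interleaved sub-table (stride 4 in table_argb); the
// RGB variant simply leaves alpha untouched.
#if 0
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A (skipped by RGB variant)
    dst_argb += 4;
  }
}
#endif
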
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] /* src_argb */
    mov edi, [esp + 8 + 8] /* dst_argb */
    mov ecx, [esp + 8 + 12] /* width */
    movd xmm2, dword ptr [esp + 8 + 16] // luma table
    movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
    pshufd xmm2, xmm2, 0
    pshufd xmm3, xmm3, 0
    pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
    psllw xmm4, 8
    pxor xmm5, xmm5
    // 4 pixel loop.
 convertloop:
    movdqu xmm0, xmmword ptr [eax] // generate luma ptr
    pmaddubsw xmm0, xmm3
    phaddw xmm0, xmm0
    pand xmm0, xmm4 // mask out low bits
    punpcklwd xmm0, xmm5
    paddd xmm0, xmm2 // add table base
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi], dl
    movzx edx, byte ptr [eax + 1]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 1], dl
    movzx edx, byte ptr [eax + 2]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 2], dl
    movzx edx, byte ptr [eax + 3] // copy alpha.
    mov byte ptr [edi + 3], dl
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax + 4]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 4], dl
    movzx edx, byte ptr [eax + 5]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 5], dl
    movzx edx, byte ptr [eax + 6]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 6], dl
    movzx edx, byte ptr [eax + 7] // copy alpha.
    mov byte ptr [edi + 7], dl
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax + 8]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 8], dl
    movzx edx, byte ptr [eax + 9]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 9], dl
    movzx edx, byte ptr [eax + 10]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 10], dl
    movzx edx, byte ptr [eax + 11] // copy alpha.
    mov byte ptr [edi + 11], dl
    movd esi, xmm0
    movzx edx, byte ptr [eax + 12]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 12], dl
    movzx edx, byte ptr [eax + 13]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 13], dl
    movzx edx, byte ptr [eax + 14]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 14], dl
    movzx edx, byte ptr [eax + 15] // copy alpha.
    mov byte ptr [edi + 15], dl
    lea eax, [eax + 16]
    lea edi, [edi + 16]
    sub ecx, 4
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3

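// Reference sketch (not compiled): the per-pixel lookup above. lumacoeff is
// treated here as four small non-negative byte weights in B, G, R, A order
// (little-endian, matching how pmaddubsw consumes the broadcast dword); their
// weighted sum, masked to its high byte, selects a 256-byte sub-table of
// 'luma', through which B, G and R are remapped while alpha is copied. The
// 16-bit saturation of pmaddubsw/phaddw is ignored in this sketch.
#if 0
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma, uint32 lumacoeff) {
  const uint8* lc = (const uint8*)&lumacoeff;  // B, G, R, A weights
  for (int i = 0; i < width; ++i) {
    uint32 sum = src_argb[0] * lc[0] + src_argb[1] * lc[1] +
                 src_argb[2] * lc[2] + src_argb[3] * lc[3];
    const uint8* table = luma + (sum & 0xff00);  // 256-byte sub-table
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // copy alpha
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
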
#endif // defined(_M_X64)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))