// VERSION 2
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
static vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
// JPeg full range.
static vec8 kARGBToYJ = {
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
static vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
static vec8 kARGBToUJ = {
127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};
static vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
static vec8 kARGBToVJ = {
-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
// Constants for BGRA
static vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
static vec8 kBGRAToU = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
static vec8 kBGRAToV = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
static vec8 kABGRToY = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
static vec8 kABGRToU = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
static vec8 kABGRToV = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
// Constants for RGBA.
static vec8 kRGBAToY = {
0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};
static vec8 kRGBAToU = {
0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};
static vec8 kRGBAToV = {
0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};
static uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
static uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
static uvec16 kAddUVJ128 = {
0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
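// The tables above hold fixed-point BT.601 RGB-to-YUV weights, permuted to
// match each format's byte order in memory (ARGB is stored B,G,R,A; BGRA is
// A,R,G,B; and so on) and replicated so pmaddubsw can weigh four pixels per
// 128-bit register. The luma tables are scaled by 128 (removed with psrlw $7);
// the chroma tables are scaled by 256 (removed with psraw $8).
// Illustrative scalar model of one pmaddubsw + phaddw pair; WeighPixel_Sketch
// is a hypothetical name, not part of libyuv. The intermediate 16-bit
// saturation of pmaddubsw is ignored here; these coefficients cannot trigger
// it.
static inline int WeighPixel_Sketch(const uint8* pixel,
                                    const signed char* coeff) {
  return pixel[0] * coeff[0] + pixel[1] * coeff[1] +
         pixel[2] * coeff[2] + pixel[3] * coeff[3];
}
// For example, ((WeighPixel_Sketch(argb, (const signed char*)&kARGBToY) >> 7)
// + 16) reproduces the studio-range luma computed by ARGBToYRow below.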
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};
// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};
// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};
// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif // HAS_RGB24TOARGBROW_SSSE3
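// Each shuffle table above is a pshufb control mask: destination byte i takes
// source byte table[i], and any entry with the high bit set (the 128u values)
// produces a zero byte instead. Illustrative scalar model of that operation;
// Shuffle16_Sketch is a hypothetical name, not part of libyuv.
static inline void Shuffle16_Sketch(const uint8* src, const uint8* shuffler,
                                    uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (shuffler[i] & 0x80) ? 0u : src[shuffler[i] & 0x0f];
  }
}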
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_J400TOARGBROW_SSE2
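// Illustrative scalar model of J400ToARGBRow_SSE2 (not part of libyuv): each
// luma byte is replicated into B, G and R and alpha is forced to 0xff, which
// is what the punpck unpacks plus the por with the 0xff000000 mask compute.
static inline void J400ToARGBRow_Sketch(const uint8* src_y, uint8* dst_argb,
                                        int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8 y = src_y[x];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}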
#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
"lea " MEMLEA(0x30,0) ",%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleMaskRGB24ToARGB) // %3
: "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
"lea " MEMLEA(0x30,0) ",%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleMaskRAWToARGB) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile (
"movdqa %3,%%xmm3 \n"
"movdqa %4,%%xmm4 \n"
"movdqa %5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
"lea " MEMLEA(0x18,0) ",%0 \n"
"pshufb %%xmm3,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"pshufb %%xmm5,%%xmm2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"movq %%xmm1," MEMACCESS2(0x8,1) " \n"
"movq %%xmm2," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x18,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
: "m"(kShuffleMaskRAWToRGB24_0), // %3
"m"(kShuffleMaskRAWToRGB24_1), // %4
"m"(kShuffleMaskRAWToRGB24_2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"mov $0x1080108,%%eax \n"
"movd %%eax,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"mov $0x20802080,%%eax \n"
"movd %%eax,%%xmm6 \n"
"pshufd $0x0,%%xmm6,%%xmm6 \n"
"pcmpeqb %%xmm3,%%xmm3 \n"
"psllw $0xb,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psllw $0xa,%%xmm4 \n"
"psrlw $0x5,%%xmm4 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pand %%xmm3,%%xmm1 \n"
"psllw $0xb,%%xmm2 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm2 \n"
"psllw $0x8,%%xmm1 \n"
"por %%xmm2,%%xmm1 \n"
"pand %%xmm4,%%xmm0 \n"
"pmulhuw %%xmm6,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
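// The pmulhuw multiplies above implement bit replication: a 5-bit field grows
// to 8 bits as (v << 3) | (v >> 2) and a 6-bit field as (v << 2) | (v >> 4),
// which multiplying by 0x0108 or 0x2080 computes once the field sits at the
// top of a 16-bit lane. Illustrative scalar model for one pixel; the name is
// hypothetical and not part of libyuv.
static inline void RGB565ToARGBPixel_Sketch(uint16 rgb565, uint8* dst_argb) {
  uint8 b5 = (uint8)(rgb565 & 0x1f);
  uint8 g6 = (uint8)((rgb565 >> 5) & 0x3f);
  uint8 r5 = (uint8)(rgb565 >> 11);
  dst_argb[0] = (uint8)((b5 << 3) | (b5 >> 2));  // B
  dst_argb[1] = (uint8)((g6 << 2) | (g6 >> 4));  // G
  dst_argb[2] = (uint8)((r5 << 3) | (r5 >> 2));  // R
  dst_argb[3] = 255u;                            // A
}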
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"mov $0x1080108,%%eax \n"
"movd %%eax,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"mov $0x42004200,%%eax \n"
"movd %%eax,%%xmm6 \n"
"pshufd $0x0,%%xmm6,%%xmm6 \n"
"pcmpeqb %%xmm3,%%xmm3 \n"
"psllw $0xb,%%xmm3 \n"
"movdqa %%xmm3,%%xmm4 \n"
"psrlw $0x6,%%xmm4 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psllw $0x1,%%xmm1 \n"
"psllw $0xb,%%xmm2 \n"
"pand %%xmm3,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm2 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"psllw $0x8,%%xmm1 \n"
"por %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pand %%xmm4,%%xmm0 \n"
"psraw $0x8,%%xmm2 \n"
"pmulhuw %%xmm6,%%xmm0 \n"
"pand %%xmm7,%%xmm2 \n"
"por %%xmm2,%%xmm0 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"mov $0xf0f0f0f,%%eax \n"
"movd %%eax,%%xmm4 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"pslld $0x4,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pand %%xmm4,%%xmm0 \n"
"pand %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm3 \n"
"psllw $0x4,%%xmm1 \n"
"psrlw $0x4,%%xmm3 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm3,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"pshufb %%xmm6,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"pshufb %%xmm6,%%xmm2 \n"
"pshufb %%xmm6,%%xmm3 \n"
"movdqa %%xmm1,%%xmm4 \n"
"psrldq $0x4,%%xmm1 \n"
"pslldq $0xc,%%xmm4 \n"
"movdqa %%xmm2,%%xmm5 \n"
"por %%xmm4,%%xmm0 \n"
"pslldq $0x8,%%xmm5 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"psrldq $0x8,%%xmm2 \n"
"pslldq $0x4,%%xmm3 \n"
"por %%xmm3,%%xmm2 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x30,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleMaskARGBToRGB24) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"pshufb %%xmm6,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"pshufb %%xmm6,%%xmm2 \n"
"pshufb %%xmm6,%%xmm3 \n"
"movdqa %%xmm1,%%xmm4 \n"
"psrldq $0x4,%%xmm1 \n"
"pslldq $0xc,%%xmm4 \n"
"movdqa %%xmm2,%%xmm5 \n"
"por %%xmm4,%%xmm0 \n"
"pslldq $0x8,%%xmm5 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"psrldq $0x8,%%xmm2 \n"
"pslldq $0x4,%%xmm3 \n"
"por %%xmm3,%%xmm2 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x30,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleMaskARGBToRAW) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"pcmpeqb %%xmm3,%%xmm3 \n"
"psrld $0x1b,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrld $0x1a,%%xmm4 \n"
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
"psrld $0x3,%%xmm1 \n"
"psrld $0x5,%%xmm2 \n"
"psrad $0x10,%%xmm0 \n"
"pand %%xmm3,%%xmm1 \n"
"pand %%xmm4,%%xmm2 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
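// Illustrative scalar model of the 5:6:5 packing above (not part of libyuv):
// each channel is truncated to its field width and OR'ed into one 16-bit
// value, which is what the shift/mask/por/packssdw sequence computes.
static inline uint16 ARGBToRGB565Pixel_Sketch(const uint8* src_argb) {
  uint8 b = src_argb[0];
  uint8 g = src_argb[1];
  uint8 r = src_argb[2];
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}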
void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
const uint32 dither4, int width) {
asm volatile (
"movd %3,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm6 \n"
"movdqa %%xmm6,%%xmm7 \n"
"punpcklwd %%xmm6,%%xmm6 \n"
"punpckhwd %%xmm7,%%xmm7 \n"
"pcmpeqb %%xmm3,%%xmm3 \n"
"psrld $0x1b,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrld $0x1a,%%xmm4 \n"
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"paddusb %%xmm6,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
"psrld $0x3,%%xmm1 \n"
"psrld $0x5,%%xmm2 \n"
"psrad $0x10,%%xmm0 \n"
"pand %%xmm3,%%xmm1 \n"
"pand %%xmm4,%%xmm2 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(dither4) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
const uint32 dither4, int width) {
asm volatile (
"vbroadcastss %3,%%xmm6 \n"
"vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
"vpermq $0xd8,%%ymm6,%%ymm6 \n"
"vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
"vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
"vpsrld $0x1b,%%ymm3,%%ymm3 \n"
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $0x1a,%%ymm4,%%ymm4 \n"
"vpslld $0x5,%%ymm4,%%ymm4 \n"
"vpslld $0xb,%%ymm3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
"vpsrld $0x5,%%ymm0,%%ymm2 \n"
"vpsrld $0x3,%%ymm0,%%ymm1 \n"
"vpsrld $0x8,%%ymm0,%%ymm0 \n"
"vpand %%ymm4,%%ymm2,%%ymm2 \n"
"vpand %%ymm3,%%ymm1,%%ymm1 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpor %%ymm2,%%ymm1,%%ymm1 \n"
"vpor %%ymm1,%%ymm0,%%ymm0 \n"
"vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"lea 0x20(%0),%0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(dither4) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBTORGB565DITHERROW_AVX2
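// In the dithering variants above, dither4 packs one dither byte per output
// column, repeating every four pixels. The byte is added with unsigned
// saturation (paddusb / vpaddusb) to every channel before the 5:6:5
// truncation. Illustrative scalar model (not part of libyuv), reusing the
// hypothetical ARGBToRGB565Pixel_Sketch helper sketched earlier:
static inline uint16 ARGBToRGB565DitherPixel_Sketch(const uint8* src_argb,
                                                    uint32 dither4, int x) {
  uint8 d = (uint8)(dither4 >> ((x & 3) * 8));
  uint8 argb[4];
  int i;
  for (i = 0; i < 4; ++i) {
    int v = src_argb[i] + d;
    argb[i] = (uint8)(v > 255 ? 255 : v);  // saturating add, like paddusb
  }
  return ARGBToRGB565Pixel_Sketch(argb);
}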
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrld $0x1b,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"pslld $0x5,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"pslld $0xa,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
"psrad $0x10,%%xmm0 \n"
"psrld $0x3,%%xmm1 \n"
"psrld $0x6,%%xmm2 \n"
"psrld $0x9,%%xmm3 \n"
"pand %%xmm7,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm6,%%xmm3 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm3,%%xmm2 \n"
"por %%xmm2,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psllw $0xc,%%xmm4 \n"
"movdqa %%xmm4,%%xmm3 \n"
"psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
"psrlq $0x4,%%xmm0 \n"
"psrlq $0x8,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
"paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBTOYJROW_SSSE3
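// Illustrative scalar model of the two luma rows above (not part of libyuv).
// ARGBToYRow truncates the 7-bit fixed-point sum and adds the studio-range
// bias of 16; ARGBToYJRow instead adds 64 (0.5 in 7-bit fixed point) before
// the shift to round, and applies no bias (full-range JPEG luma).
static inline void ARGBToYRows_Sketch(const uint8* src_argb, uint8* dst_y,
                                      uint8* dst_yj, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint8)(((13 * src_argb[0] + 65 * src_argb[1] +
                         33 * src_argb[2]) >> 7) + 16);
    dst_yj[x] = (uint8)((15 * src_argb[0] + 75 * src_argb[1] +
                         38 * src_argb[2] + 64) >> 7);
    src_argb += 4;
  }
}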
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
0, 4, 1, 5, 2, 6, 3, 7
};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
"vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea " MEMLEA(0x80,0) ",%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
#endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
"vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea " MEMLEA(0x80,0) ",%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
"vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
#endif // HAS_ARGBTOYJROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToV), // %5
"m"(kARGBToU), // %6
"m"(kAddUV128) // %7
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBTOUVROW_SSSE3
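// Illustrative scalar model of the chroma row above (not part of libyuv):
// each U,V pair comes from a 2x2 block of ARGB pixels averaged across two
// rows (src_stride apart) and two columns, weighted by the 8-bit fixed-point
// kARGBToU/kARGBToV coefficients and biased by 128. The pavgb cascade rounds
// slightly differently than the plain mean used here.
static inline void ARGBToUVBlock_Sketch(const uint8* row0, const uint8* row1,
                                        uint8* dst_u, uint8* dst_v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *dst_u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}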
  898. #ifdef HAS_ARGBTOUVROW_AVX2
  899. // vpshufb for vphaddw + vpackuswb packed to shorts.
  900. static const lvec8 kShufARGBToUV_AVX = {
  901. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  902. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
  903. };
  904. void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  905. uint8* dst_u, uint8* dst_v, int width) {
  906. asm volatile (
  907. "vbroadcastf128 %5,%%ymm5 \n"
  908. "vbroadcastf128 %6,%%ymm6 \n"
  909. "vbroadcastf128 %7,%%ymm7 \n"
  910. "sub %1,%2 \n"
  911. LABELALIGN
  912. "1: \n"
  913. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  914. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  915. "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
  916. "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
  917. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  918. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  919. VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
  920. VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
  921. "lea " MEMLEA(0x80,0) ",%0 \n"
  922. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  923. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  924. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  925. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  926. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  927. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  928. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  929. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  930. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  931. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  932. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  933. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  934. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  935. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  936. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  937. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  938. "vpshufb %8,%%ymm0,%%ymm0 \n"
  939. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  940. "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
  941. VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
  942. "lea " MEMLEA(0x10,1) ",%1 \n"
  943. "sub $0x20,%3 \n"
  944. "jg 1b \n"
  945. "vzeroupper \n"
  946. : "+r"(src_argb0), // %0
  947. "+r"(dst_u), // %1
  948. "+r"(dst_v), // %2
  949. "+rm"(width) // %3
  950. : "r"((intptr_t)(src_stride_argb)), // %4
  951. "m"(kAddUV128), // %5
  952. "m"(kARGBToV), // %6
  953. "m"(kARGBToU), // %7
  954. "m"(kShufARGBToUV_AVX) // %8
  955. : "memory", "cc", NACL_R14
  956. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  957. );
  958. }
  959. #endif // HAS_ARGBTOUVROW_AVX2
  960. #ifdef HAS_ARGBTOUVJROW_AVX2
  961. void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  962. uint8* dst_u, uint8* dst_v, int width) {
  963. asm volatile (
  964. "vbroadcastf128 %5,%%ymm5 \n"
  965. "vbroadcastf128 %6,%%ymm6 \n"
  966. "vbroadcastf128 %7,%%ymm7 \n"
  967. "sub %1,%2 \n"
  968. LABELALIGN
  969. "1: \n"
  970. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  971. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  972. "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
  973. "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
  974. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  975. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  976. VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
  977. VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
  978. "lea " MEMLEA(0x80,0) ",%0 \n"
  979. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  980. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  981. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  982. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  983. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  984. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  985. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  986. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  987. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  988. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  989. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  990. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  991. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
  992. "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
  993. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  994. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  995. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  996. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  997. "vpshufb %8,%%ymm0,%%ymm0 \n"
  998. "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
  999. VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
  1000. "lea " MEMLEA(0x10,1) ",%1 \n"
  1001. "sub $0x20,%3 \n"
  1002. "jg 1b \n"
  1003. "vzeroupper \n"
  1004. : "+r"(src_argb0), // %0
  1005. "+r"(dst_u), // %1
  1006. "+r"(dst_v), // %2
  1007. "+rm"(width) // %3
  1008. : "r"((intptr_t)(src_stride_argb)), // %4
  1009. "m"(kAddUVJ128), // %5
  1010. "m"(kARGBToVJ), // %6
  1011. "m"(kARGBToUJ), // %7
  1012. "m"(kShufARGBToUV_AVX) // %8
  1013. : "memory", "cc", NACL_R14
  1014. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  1015. );
  1016. }
  1017. #endif // HAS_ARGBTOUVJROW_AVX2
  1018. #ifdef HAS_ARGBTOUVJROW_SSSE3
  1019. void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1020. uint8* dst_u, uint8* dst_v, int width) {
  1021. asm volatile (
  1022. "movdqa %5,%%xmm3 \n"
  1023. "movdqa %6,%%xmm4 \n"
  1024. "movdqa %7,%%xmm5 \n"
  1025. "sub %1,%2 \n"
  1026. LABELALIGN
  1027. "1: \n"
  1028. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1029. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1030. "pavgb %%xmm7,%%xmm0 \n"
  1031. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1032. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1033. "pavgb %%xmm7,%%xmm1 \n"
  1034. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1035. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1036. "pavgb %%xmm7,%%xmm2 \n"
  1037. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1038. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1039. "pavgb %%xmm7,%%xmm6 \n"
  1040. "lea " MEMLEA(0x40,0) ",%0 \n"
  1041. "movdqa %%xmm0,%%xmm7 \n"
  1042. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1043. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1044. "pavgb %%xmm7,%%xmm0 \n"
  1045. "movdqa %%xmm2,%%xmm7 \n"
  1046. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1047. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1048. "pavgb %%xmm7,%%xmm2 \n"
  1049. "movdqa %%xmm0,%%xmm1 \n"
  1050. "movdqa %%xmm2,%%xmm6 \n"
  1051. "pmaddubsw %%xmm4,%%xmm0 \n"
  1052. "pmaddubsw %%xmm4,%%xmm2 \n"
  1053. "pmaddubsw %%xmm3,%%xmm1 \n"
  1054. "pmaddubsw %%xmm3,%%xmm6 \n"
  1055. "phaddw %%xmm2,%%xmm0 \n"
  1056. "phaddw %%xmm6,%%xmm1 \n"
  1057. "paddw %%xmm5,%%xmm0 \n"
  1058. "paddw %%xmm5,%%xmm1 \n"
  1059. "psraw $0x8,%%xmm0 \n"
  1060. "psraw $0x8,%%xmm1 \n"
  1061. "packsswb %%xmm1,%%xmm0 \n"
  1062. "movlps %%xmm0," MEMACCESS(1) " \n"
  1063. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1064. "lea " MEMLEA(0x8,1) ",%1 \n"
  1065. "sub $0x10,%3 \n"
  1066. "jg 1b \n"
  1067. : "+r"(src_argb0), // %0
  1068. "+r"(dst_u), // %1
  1069. "+r"(dst_v), // %2
  1070. "+rm"(width) // %3
  1071. : "r"((intptr_t)(src_stride_argb)), // %4
  1072. "m"(kARGBToVJ), // %5
  1073. "m"(kARGBToUJ), // %6
  1074. "m"(kAddUVJ128) // %7
  1075. : "memory", "cc", NACL_R14
  1076. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1077. );
  1078. }
  1079. #endif // HAS_ARGBTOUVJROW_SSSE3
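// Illustrative scalar sketch (comment only, not compiled) of what the
// 2x2-subsampled *ToUVRow kernels in this file compute. UB/UG/UR and
// VB/VG/VR stand in for the signed weights packed into kARGBToU/kARGBToV
// (and the BGRA/ABGR/RGBA/J variants); they are symbolic names here, not
// the actual values. AVG is the pavgb rounding average:
//   #define AVG(a, b) (((a) + (b) + 1) >> 1)
// For each 2x2 block (rows averaged first, then columns, as in the asm):
//   uint8 b = AVG(AVG(b00, b10), AVG(b01, b11));
//   uint8 g = AVG(AVG(g00, g10), AVG(g01, g11));
//   uint8 r = AVG(AVG(r00, r10), AVG(r01, r11));
//   dst_u[i] = (uint8)(((b * UB + g * UG + r * UR) >> 8) + 128);
//   dst_v[i] = (uint8)(((b * VB + g * VG + r * VR) >> 8) + 128);
// up to rounding: the J variants add the kAddUVJ128 bias as 16-bit words
// before the shift, the others add 128 after packing (kAddUV128), and the
// 444 variant below applies the same weighting per pixel with no averaging.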
  1080. #ifdef HAS_ARGBTOUV444ROW_SSSE3
  1081. void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
  1082. int width) {
  1083. asm volatile (
  1084. "movdqa %4,%%xmm3 \n"
  1085. "movdqa %5,%%xmm4 \n"
  1086. "movdqa %6,%%xmm5 \n"
  1087. "sub %1,%2 \n"
  1088. LABELALIGN
  1089. "1: \n"
  1090. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1091. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1092. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1093. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1094. "pmaddubsw %%xmm4,%%xmm0 \n"
  1095. "pmaddubsw %%xmm4,%%xmm1 \n"
  1096. "pmaddubsw %%xmm4,%%xmm2 \n"
  1097. "pmaddubsw %%xmm4,%%xmm6 \n"
  1098. "phaddw %%xmm1,%%xmm0 \n"
  1099. "phaddw %%xmm6,%%xmm2 \n"
  1100. "psraw $0x8,%%xmm0 \n"
  1101. "psraw $0x8,%%xmm2 \n"
  1102. "packsswb %%xmm2,%%xmm0 \n"
  1103. "paddb %%xmm5,%%xmm0 \n"
  1104. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1105. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1106. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1107. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1108. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1109. "pmaddubsw %%xmm3,%%xmm0 \n"
  1110. "pmaddubsw %%xmm3,%%xmm1 \n"
  1111. "pmaddubsw %%xmm3,%%xmm2 \n"
  1112. "pmaddubsw %%xmm3,%%xmm6 \n"
  1113. "phaddw %%xmm1,%%xmm0 \n"
  1114. "phaddw %%xmm6,%%xmm2 \n"
  1115. "psraw $0x8,%%xmm0 \n"
  1116. "psraw $0x8,%%xmm2 \n"
  1117. "packsswb %%xmm2,%%xmm0 \n"
  1118. "paddb %%xmm5,%%xmm0 \n"
  1119. "lea " MEMLEA(0x40,0) ",%0 \n"
  1120. MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
  1121. "lea " MEMLEA(0x10,1) ",%1 \n"
  1122. "sub $0x10,%3 \n"
  1123. "jg 1b \n"
  1124. : "+r"(src_argb), // %0
  1125. "+r"(dst_u), // %1
  1126. "+r"(dst_v), // %2
  1127. "+rm"(width) // %3
  1128. : "m"(kARGBToV), // %4
  1129. "m"(kARGBToU), // %5
  1130. "m"(kAddUV128) // %6
  1131. : "memory", "cc", NACL_R14
  1132. "xmm0", "xmm1", "xmm2", "xmm6"
  1133. );
  1134. }
  1135. #endif // HAS_ARGBTOUV444ROW_SSSE3
  1136. void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  1137. asm volatile (
  1138. "movdqa %4,%%xmm5 \n"
  1139. "movdqa %3,%%xmm4 \n"
  1140. LABELALIGN
  1141. "1: \n"
  1142. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1143. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1144. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1145. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1146. "pmaddubsw %%xmm4,%%xmm0 \n"
  1147. "pmaddubsw %%xmm4,%%xmm1 \n"
  1148. "pmaddubsw %%xmm4,%%xmm2 \n"
  1149. "pmaddubsw %%xmm4,%%xmm3 \n"
  1150. "lea " MEMLEA(0x40,0) ",%0 \n"
  1151. "phaddw %%xmm1,%%xmm0 \n"
  1152. "phaddw %%xmm3,%%xmm2 \n"
  1153. "psrlw $0x7,%%xmm0 \n"
  1154. "psrlw $0x7,%%xmm2 \n"
  1155. "packuswb %%xmm2,%%xmm0 \n"
  1156. "paddb %%xmm5,%%xmm0 \n"
  1157. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1158. "lea " MEMLEA(0x10,1) ",%1 \n"
  1159. "sub $0x10,%2 \n"
  1160. "jg 1b \n"
  1161. : "+r"(src_bgra), // %0
  1162. "+r"(dst_y), // %1
  1163. "+r"(width) // %2
  1164. : "m"(kBGRAToY), // %3
  1165. "m"(kAddY16) // %4
  1166. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1167. );
  1168. }
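// Illustrative scalar sketch (comment only, not compiled) of the
// *ToYRow_SSSE3 pattern used above and below. YB/YG/YR stand in for the
// per-channel weights packed into kBGRAToY/kABGRToY/kRGBAToY (symbolic
// names, not the actual values); pmaddubsw + phaddw form the weighted sum,
// psrlw $0x7 scales it, and paddb with kAddY16 adds the +16 luma offset:
//   dst_y[i] = (uint8)(((b * YB + g * YG + r * YR) >> 7) + 16);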
  1169. void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
  1170. uint8* dst_u, uint8* dst_v, int width) {
  1171. asm volatile (
  1172. "movdqa %5,%%xmm3 \n"
  1173. "movdqa %6,%%xmm4 \n"
  1174. "movdqa %7,%%xmm5 \n"
  1175. "sub %1,%2 \n"
  1176. LABELALIGN
  1177. "1: \n"
  1178. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1179. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1180. "pavgb %%xmm7,%%xmm0 \n"
  1181. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1182. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1183. "pavgb %%xmm7,%%xmm1 \n"
  1184. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1185. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1186. "pavgb %%xmm7,%%xmm2 \n"
  1187. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1188. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1189. "pavgb %%xmm7,%%xmm6 \n"
  1190. "lea " MEMLEA(0x40,0) ",%0 \n"
  1191. "movdqa %%xmm0,%%xmm7 \n"
  1192. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1193. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1194. "pavgb %%xmm7,%%xmm0 \n"
  1195. "movdqa %%xmm2,%%xmm7 \n"
  1196. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1197. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1198. "pavgb %%xmm7,%%xmm2 \n"
  1199. "movdqa %%xmm0,%%xmm1 \n"
  1200. "movdqa %%xmm2,%%xmm6 \n"
  1201. "pmaddubsw %%xmm4,%%xmm0 \n"
  1202. "pmaddubsw %%xmm4,%%xmm2 \n"
  1203. "pmaddubsw %%xmm3,%%xmm1 \n"
  1204. "pmaddubsw %%xmm3,%%xmm6 \n"
  1205. "phaddw %%xmm2,%%xmm0 \n"
  1206. "phaddw %%xmm6,%%xmm1 \n"
  1207. "psraw $0x8,%%xmm0 \n"
  1208. "psraw $0x8,%%xmm1 \n"
  1209. "packsswb %%xmm1,%%xmm0 \n"
  1210. "paddb %%xmm5,%%xmm0 \n"
  1211. "movlps %%xmm0," MEMACCESS(1) " \n"
  1212. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1213. "lea " MEMLEA(0x8,1) ",%1 \n"
  1214. "sub $0x10,%3 \n"
  1215. "jg 1b \n"
  1216. : "+r"(src_bgra0), // %0
  1217. "+r"(dst_u), // %1
  1218. "+r"(dst_v), // %2
  1219. "+rm"(width) // %3
  1220. : "r"((intptr_t)(src_stride_bgra)), // %4
  1221. "m"(kBGRAToV), // %5
  1222. "m"(kBGRAToU), // %6
  1223. "m"(kAddUV128) // %7
  1224. : "memory", "cc", NACL_R14
  1225. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1226. );
  1227. }
  1228. void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  1229. asm volatile (
  1230. "movdqa %4,%%xmm5 \n"
  1231. "movdqa %3,%%xmm4 \n"
  1232. LABELALIGN
  1233. "1: \n"
  1234. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1235. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1236. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1237. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1238. "pmaddubsw %%xmm4,%%xmm0 \n"
  1239. "pmaddubsw %%xmm4,%%xmm1 \n"
  1240. "pmaddubsw %%xmm4,%%xmm2 \n"
  1241. "pmaddubsw %%xmm4,%%xmm3 \n"
  1242. "lea " MEMLEA(0x40,0) ",%0 \n"
  1243. "phaddw %%xmm1,%%xmm0 \n"
  1244. "phaddw %%xmm3,%%xmm2 \n"
  1245. "psrlw $0x7,%%xmm0 \n"
  1246. "psrlw $0x7,%%xmm2 \n"
  1247. "packuswb %%xmm2,%%xmm0 \n"
  1248. "paddb %%xmm5,%%xmm0 \n"
  1249. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1250. "lea " MEMLEA(0x10,1) ",%1 \n"
  1251. "sub $0x10,%2 \n"
  1252. "jg 1b \n"
  1253. : "+r"(src_abgr), // %0
  1254. "+r"(dst_y), // %1
  1255. "+r"(width) // %2
  1256. : "m"(kABGRToY), // %3
  1257. "m"(kAddY16) // %4
  1258. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1259. );
  1260. }
  1261. void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  1262. asm volatile (
  1263. "movdqa %4,%%xmm5 \n"
  1264. "movdqa %3,%%xmm4 \n"
  1265. LABELALIGN
  1266. "1: \n"
  1267. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1268. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1269. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1270. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1271. "pmaddubsw %%xmm4,%%xmm0 \n"
  1272. "pmaddubsw %%xmm4,%%xmm1 \n"
  1273. "pmaddubsw %%xmm4,%%xmm2 \n"
  1274. "pmaddubsw %%xmm4,%%xmm3 \n"
  1275. "lea " MEMLEA(0x40,0) ",%0 \n"
  1276. "phaddw %%xmm1,%%xmm0 \n"
  1277. "phaddw %%xmm3,%%xmm2 \n"
  1278. "psrlw $0x7,%%xmm0 \n"
  1279. "psrlw $0x7,%%xmm2 \n"
  1280. "packuswb %%xmm2,%%xmm0 \n"
  1281. "paddb %%xmm5,%%xmm0 \n"
  1282. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1283. "lea " MEMLEA(0x10,1) ",%1 \n"
  1284. "sub $0x10,%2 \n"
  1285. "jg 1b \n"
  1286. : "+r"(src_rgba), // %0
  1287. "+r"(dst_y), // %1
  1288. "+r"(width) // %2
  1289. : "m"(kRGBAToY), // %3
  1290. "m"(kAddY16) // %4
  1291. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1292. );
  1293. }
  1294. void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
  1295. uint8* dst_u, uint8* dst_v, int width) {
  1296. asm volatile (
  1297. "movdqa %5,%%xmm3 \n"
  1298. "movdqa %6,%%xmm4 \n"
  1299. "movdqa %7,%%xmm5 \n"
  1300. "sub %1,%2 \n"
  1301. LABELALIGN
  1302. "1: \n"
  1303. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1304. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1305. "pavgb %%xmm7,%%xmm0 \n"
  1306. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1307. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1308. "pavgb %%xmm7,%%xmm1 \n"
  1309. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1310. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1311. "pavgb %%xmm7,%%xmm2 \n"
  1312. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1313. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1314. "pavgb %%xmm7,%%xmm6 \n"
  1315. "lea " MEMLEA(0x40,0) ",%0 \n"
  1316. "movdqa %%xmm0,%%xmm7 \n"
  1317. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1318. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1319. "pavgb %%xmm7,%%xmm0 \n"
  1320. "movdqa %%xmm2,%%xmm7 \n"
  1321. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1322. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1323. "pavgb %%xmm7,%%xmm2 \n"
  1324. "movdqa %%xmm0,%%xmm1 \n"
  1325. "movdqa %%xmm2,%%xmm6 \n"
  1326. "pmaddubsw %%xmm4,%%xmm0 \n"
  1327. "pmaddubsw %%xmm4,%%xmm2 \n"
  1328. "pmaddubsw %%xmm3,%%xmm1 \n"
  1329. "pmaddubsw %%xmm3,%%xmm6 \n"
  1330. "phaddw %%xmm2,%%xmm0 \n"
  1331. "phaddw %%xmm6,%%xmm1 \n"
  1332. "psraw $0x8,%%xmm0 \n"
  1333. "psraw $0x8,%%xmm1 \n"
  1334. "packsswb %%xmm1,%%xmm0 \n"
  1335. "paddb %%xmm5,%%xmm0 \n"
  1336. "movlps %%xmm0," MEMACCESS(1) " \n"
  1337. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1338. "lea " MEMLEA(0x8,1) ",%1 \n"
  1339. "sub $0x10,%3 \n"
  1340. "jg 1b \n"
  1341. : "+r"(src_abgr0), // %0
  1342. "+r"(dst_u), // %1
  1343. "+r"(dst_v), // %2
  1344. "+rm"(width) // %3
  1345. : "r"((intptr_t)(src_stride_abgr)), // %4
  1346. "m"(kABGRToV), // %5
  1347. "m"(kABGRToU), // %6
  1348. "m"(kAddUV128) // %7
  1349. : "memory", "cc", NACL_R14
  1350. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1351. );
  1352. }
  1353. void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
  1354. uint8* dst_u, uint8* dst_v, int width) {
  1355. asm volatile (
  1356. "movdqa %5,%%xmm3 \n"
  1357. "movdqa %6,%%xmm4 \n"
  1358. "movdqa %7,%%xmm5 \n"
  1359. "sub %1,%2 \n"
  1360. LABELALIGN
  1361. "1: \n"
  1362. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1363. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1364. "pavgb %%xmm7,%%xmm0 \n"
  1365. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1366. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1367. "pavgb %%xmm7,%%xmm1 \n"
  1368. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1369. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1370. "pavgb %%xmm7,%%xmm2 \n"
  1371. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1372. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1373. "pavgb %%xmm7,%%xmm6 \n"
  1374. "lea " MEMLEA(0x40,0) ",%0 \n"
  1375. "movdqa %%xmm0,%%xmm7 \n"
  1376. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1377. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1378. "pavgb %%xmm7,%%xmm0 \n"
  1379. "movdqa %%xmm2,%%xmm7 \n"
  1380. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1381. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1382. "pavgb %%xmm7,%%xmm2 \n"
  1383. "movdqa %%xmm0,%%xmm1 \n"
  1384. "movdqa %%xmm2,%%xmm6 \n"
  1385. "pmaddubsw %%xmm4,%%xmm0 \n"
  1386. "pmaddubsw %%xmm4,%%xmm2 \n"
  1387. "pmaddubsw %%xmm3,%%xmm1 \n"
  1388. "pmaddubsw %%xmm3,%%xmm6 \n"
  1389. "phaddw %%xmm2,%%xmm0 \n"
  1390. "phaddw %%xmm6,%%xmm1 \n"
  1391. "psraw $0x8,%%xmm0 \n"
  1392. "psraw $0x8,%%xmm1 \n"
  1393. "packsswb %%xmm1,%%xmm0 \n"
  1394. "paddb %%xmm5,%%xmm0 \n"
  1395. "movlps %%xmm0," MEMACCESS(1) " \n"
  1396. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1397. "lea " MEMLEA(0x8,1) ",%1 \n"
  1398. "sub $0x10,%3 \n"
  1399. "jg 1b \n"
  1400. : "+r"(src_rgba0), // %0
  1401. "+r"(dst_u), // %1
  1402. "+r"(dst_v), // %2
  1403. "+rm"(width) // %3
  1404. : "r"((intptr_t)(src_stride_rgba)), // %4
  1405. "m"(kRGBAToV), // %5
  1406. "m"(kRGBAToU), // %6
  1407. "m"(kAddUV128) // %7
  1408. : "memory", "cc", NACL_R14
  1409. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1410. );
  1411. }
  1412. #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
  1413. // Read 8 UV from 444
  1414. #define READYUV444 \
  1415. "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1416. MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1417. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1418. "punpcklbw %%xmm1,%%xmm0 \n" \
  1419. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1420. "punpcklbw %%xmm4,%%xmm4 \n" \
  1421. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1422. // Read 4 UV from 422, upsample to 8 UV
  1423. #define READYUV422 \
  1424. "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1425. MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1426. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1427. "punpcklbw %%xmm1,%%xmm0 \n" \
  1428. "punpcklwd %%xmm0,%%xmm0 \n" \
  1429. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1430. "punpcklbw %%xmm4,%%xmm4 \n" \
  1431. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1432. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  1433. #define READYUVA422 \
  1434. "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1435. MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1436. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1437. "punpcklbw %%xmm1,%%xmm0 \n" \
  1438. "punpcklwd %%xmm0,%%xmm0 \n" \
  1439. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1440. "punpcklbw %%xmm4,%%xmm4 \n" \
  1441. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
  1442. "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  1443. "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
  1444. // Read 2 UV from 411, upsample to 8 UV.
1445. // reading 4 bytes (the movd form below) is an msan violation.
  1446. // "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
  1447. // MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
  1448. // pinsrw fails with drmemory
  1449. // __asm pinsrw xmm0, [esi], 0 /* U */
  1450. // __asm pinsrw xmm1, [esi + edi], 0 /* V */
  1451. #define READYUV411_TEMP \
  1452. "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
  1453. "movd %[temp],%%xmm0 \n" \
  1454. MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
  1455. "movd %[temp],%%xmm1 \n" \
  1456. "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
  1457. "punpcklbw %%xmm1,%%xmm0 \n" \
  1458. "punpcklwd %%xmm0,%%xmm0 \n" \
  1459. "punpckldq %%xmm0,%%xmm0 \n" \
  1460. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1461. "punpcklbw %%xmm4,%%xmm4 \n" \
  1462. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1463. // Read 4 UV from NV12, upsample to 8 UV
  1464. #define READNV12 \
  1465. "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  1466. "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
  1467. "punpcklwd %%xmm0,%%xmm0 \n" \
  1468. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1469. "punpcklbw %%xmm4,%%xmm4 \n" \
  1470. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1471. // Read 4 VU from NV21, upsample to 8 UV
  1472. #define READNV21 \
  1473. "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  1474. "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
  1475. "pshufb %[kShuffleNV21], %%xmm0 \n" \
  1476. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1477. "punpcklbw %%xmm4,%%xmm4 \n" \
  1478. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1479. // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
  1480. #define READYUY2 \
  1481. "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
  1482. "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  1483. "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
  1484. "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  1485. "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
  1486. // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
  1487. #define READUYVY \
  1488. "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
  1489. "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  1490. "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
  1491. "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  1492. "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
  1493. #if defined(__x86_64__)
  1494. #define YUVTORGB_SETUP(yuvconstants) \
  1495. "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
  1496. "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
  1497. "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
  1498. "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
  1499. "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
  1500. "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
  1501. "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
  1502. // Convert 8 pixels: 8 UV and 8 Y
  1503. #define YUVTORGB(yuvconstants) \
  1504. "movdqa %%xmm0,%%xmm1 \n" \
  1505. "movdqa %%xmm0,%%xmm2 \n" \
  1506. "movdqa %%xmm0,%%xmm3 \n" \
  1507. "movdqa %%xmm11,%%xmm0 \n" \
  1508. "pmaddubsw %%xmm8,%%xmm1 \n" \
  1509. "psubw %%xmm1,%%xmm0 \n" \
  1510. "movdqa %%xmm12,%%xmm1 \n" \
  1511. "pmaddubsw %%xmm9,%%xmm2 \n" \
  1512. "psubw %%xmm2,%%xmm1 \n" \
  1513. "movdqa %%xmm13,%%xmm2 \n" \
  1514. "pmaddubsw %%xmm10,%%xmm3 \n" \
  1515. "psubw %%xmm3,%%xmm2 \n" \
  1516. "pmulhuw %%xmm14,%%xmm4 \n" \
  1517. "paddsw %%xmm4,%%xmm0 \n" \
  1518. "paddsw %%xmm4,%%xmm1 \n" \
  1519. "paddsw %%xmm4,%%xmm2 \n" \
  1520. "psraw $0x6,%%xmm0 \n" \
  1521. "psraw $0x6,%%xmm1 \n" \
  1522. "psraw $0x6,%%xmm2 \n" \
  1523. "packuswb %%xmm0,%%xmm0 \n" \
  1524. "packuswb %%xmm1,%%xmm1 \n" \
  1525. "packuswb %%xmm2,%%xmm2 \n"
  1526. #define YUVTORGB_REGS \
  1527. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1528. #else
  1529. #define YUVTORGB_SETUP(yuvconstants)
  1530. // Convert 8 pixels: 8 UV and 8 Y
  1531. #define YUVTORGB(yuvconstants) \
  1532. "movdqa %%xmm0,%%xmm1 \n" \
  1533. "movdqa %%xmm0,%%xmm2 \n" \
  1534. "movdqa %%xmm0,%%xmm3 \n" \
  1535. "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
  1536. "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
  1537. "psubw %%xmm1,%%xmm0 \n" \
  1538. "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
  1539. "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
  1540. "psubw %%xmm2,%%xmm1 \n" \
  1541. "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
  1542. "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
  1543. "psubw %%xmm3,%%xmm2 \n" \
  1544. "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
  1545. "paddsw %%xmm4,%%xmm0 \n" \
  1546. "paddsw %%xmm4,%%xmm1 \n" \
  1547. "paddsw %%xmm4,%%xmm2 \n" \
  1548. "psraw $0x6,%%xmm0 \n" \
  1549. "psraw $0x6,%%xmm1 \n" \
  1550. "psraw $0x6,%%xmm2 \n" \
  1551. "packuswb %%xmm0,%%xmm0 \n" \
  1552. "packuswb %%xmm1,%%xmm1 \n" \
  1553. "packuswb %%xmm2,%%xmm2 \n"
  1554. #define YUVTORGB_REGS
  1555. #endif
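// Illustrative scalar sketch (comment only, not compiled) of the per-pixel
// math done by YUVTORGB above. UB..VR are the signed coefficient pairs,
// BB/BG/BR the bias words and YG the Y gain held in YuvConstants (symbolic
// names); Clamp() clamps to [0, 255] as packuswb does, and the 16-bit
// saturation of paddsw is ignored here:
//   int y1 = ((y * 0x0101) * YG) >> 16;             // punpcklbw + pmulhuw
//   b = Clamp((y1 + BB - (u * UB + v * VB)) >> 6);  // pmaddubsw, psubw,
//   g = Clamp((y1 + BG - (u * UG + v * VG)) >> 6);  //   paddsw, psraw $6,
//   r = Clamp((y1 + BR - (u * UR + v * VR)) >> 6);  //   packuswb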
  1556. // Store 8 ARGB values.
  1557. #define STOREARGB \
  1558. "punpcklbw %%xmm1,%%xmm0 \n" \
  1559. "punpcklbw %%xmm5,%%xmm2 \n" \
  1560. "movdqa %%xmm0,%%xmm1 \n" \
  1561. "punpcklwd %%xmm2,%%xmm0 \n" \
  1562. "punpckhwd %%xmm2,%%xmm1 \n" \
  1563. "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
  1564. "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
  1565. "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
  1566. // Store 8 RGBA values.
  1567. #define STORERGBA \
  1568. "pcmpeqb %%xmm5,%%xmm5 \n" \
  1569. "punpcklbw %%xmm2,%%xmm1 \n" \
  1570. "punpcklbw %%xmm0,%%xmm5 \n" \
  1571. "movdqa %%xmm5,%%xmm0 \n" \
  1572. "punpcklwd %%xmm1,%%xmm5 \n" \
  1573. "punpckhwd %%xmm1,%%xmm0 \n" \
  1574. "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
  1575. "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
  1576. "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
  1577. void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
  1578. const uint8* u_buf,
  1579. const uint8* v_buf,
  1580. uint8* dst_argb,
  1581. const struct YuvConstants* yuvconstants,
  1582. int width) {
  1583. asm volatile (
  1584. YUVTORGB_SETUP(yuvconstants)
  1585. "sub %[u_buf],%[v_buf] \n"
  1586. "pcmpeqb %%xmm5,%%xmm5 \n"
  1587. LABELALIGN
  1588. "1: \n"
  1589. READYUV444
  1590. YUVTORGB(yuvconstants)
  1591. STOREARGB
  1592. "sub $0x8,%[width] \n"
  1593. "jg 1b \n"
  1594. : [y_buf]"+r"(y_buf), // %[y_buf]
  1595. [u_buf]"+r"(u_buf), // %[u_buf]
  1596. [v_buf]"+r"(v_buf), // %[v_buf]
  1597. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1598. [width]"+rm"(width) // %[width]
  1599. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1600. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1601. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1602. );
  1603. }
  1604. void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
  1605. const uint8* u_buf,
  1606. const uint8* v_buf,
  1607. uint8* dst_rgb24,
  1608. const struct YuvConstants* yuvconstants,
  1609. int width) {
  1610. asm volatile (
  1611. YUVTORGB_SETUP(yuvconstants)
  1612. "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  1613. "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  1614. "sub %[u_buf],%[v_buf] \n"
  1615. LABELALIGN
  1616. "1: \n"
  1617. READYUV422
  1618. YUVTORGB(yuvconstants)
  1619. "punpcklbw %%xmm1,%%xmm0 \n"
  1620. "punpcklbw %%xmm2,%%xmm2 \n"
  1621. "movdqa %%xmm0,%%xmm1 \n"
  1622. "punpcklwd %%xmm2,%%xmm0 \n"
  1623. "punpckhwd %%xmm2,%%xmm1 \n"
  1624. "pshufb %%xmm5,%%xmm0 \n"
  1625. "pshufb %%xmm6,%%xmm1 \n"
  1626. "palignr $0xc,%%xmm0,%%xmm1 \n"
  1627. "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
  1628. "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
  1629. "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
  1630. "subl $0x8,%[width] \n"
  1631. "jg 1b \n"
  1632. : [y_buf]"+r"(y_buf), // %[y_buf]
  1633. [u_buf]"+r"(u_buf), // %[u_buf]
  1634. [v_buf]"+r"(v_buf), // %[v_buf]
  1635. [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
  1636. #if defined(__i386__) && defined(__pic__)
  1637. [width]"+m"(width) // %[width]
  1638. #else
  1639. [width]"+rm"(width) // %[width]
  1640. #endif
  1641. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1642. [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  1643. [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  1644. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1645. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1646. );
  1647. }
  1648. void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
  1649. const uint8* u_buf,
  1650. const uint8* v_buf,
  1651. uint8* dst_argb,
  1652. const struct YuvConstants* yuvconstants,
  1653. int width) {
  1654. asm volatile (
  1655. YUVTORGB_SETUP(yuvconstants)
  1656. "sub %[u_buf],%[v_buf] \n"
  1657. "pcmpeqb %%xmm5,%%xmm5 \n"
  1658. LABELALIGN
  1659. "1: \n"
  1660. READYUV422
  1661. YUVTORGB(yuvconstants)
  1662. STOREARGB
  1663. "sub $0x8,%[width] \n"
  1664. "jg 1b \n"
  1665. : [y_buf]"+r"(y_buf), // %[y_buf]
  1666. [u_buf]"+r"(u_buf), // %[u_buf]
  1667. [v_buf]"+r"(v_buf), // %[v_buf]
  1668. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1669. [width]"+rm"(width) // %[width]
  1670. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1671. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1672. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1673. );
  1674. }
  1675. #ifdef HAS_I422ALPHATOARGBROW_SSSE3
  1676. void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
  1677. const uint8* u_buf,
  1678. const uint8* v_buf,
  1679. const uint8* a_buf,
  1680. uint8* dst_argb,
  1681. const struct YuvConstants* yuvconstants,
  1682. int width) {
  1683. asm volatile (
  1684. YUVTORGB_SETUP(yuvconstants)
  1685. "sub %[u_buf],%[v_buf] \n"
  1686. LABELALIGN
  1687. "1: \n"
  1688. READYUVA422
  1689. YUVTORGB(yuvconstants)
  1690. STOREARGB
  1691. "subl $0x8,%[width] \n"
  1692. "jg 1b \n"
  1693. : [y_buf]"+r"(y_buf), // %[y_buf]
  1694. [u_buf]"+r"(u_buf), // %[u_buf]
  1695. [v_buf]"+r"(v_buf), // %[v_buf]
  1696. [a_buf]"+r"(a_buf), // %[a_buf]
  1697. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1698. #if defined(__i386__) && defined(__pic__)
  1699. [width]"+m"(width) // %[width]
  1700. #else
  1701. [width]"+rm"(width) // %[width]
  1702. #endif
  1703. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1704. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1705. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1706. );
  1707. }
  1708. #endif // HAS_I422ALPHATOARGBROW_SSSE3
  1709. #ifdef HAS_I411TOARGBROW_SSSE3
  1710. void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
  1711. const uint8* u_buf,
  1712. const uint8* v_buf,
  1713. uint8* dst_argb,
  1714. const struct YuvConstants* yuvconstants,
  1715. int width) {
  1716. int temp;
  1717. asm volatile (
  1718. YUVTORGB_SETUP(yuvconstants)
  1719. "sub %[u_buf],%[v_buf] \n"
  1720. "pcmpeqb %%xmm5,%%xmm5 \n"
  1721. LABELALIGN
  1722. "1: \n"
  1723. READYUV411_TEMP
  1724. YUVTORGB(yuvconstants)
  1725. STOREARGB
  1726. "subl $0x8,%[width] \n"
  1727. "jg 1b \n"
  1728. : [y_buf]"+r"(y_buf), // %[y_buf]
  1729. [u_buf]"+r"(u_buf), // %[u_buf]
  1730. [v_buf]"+r"(v_buf), // %[v_buf]
  1731. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1732. [temp]"=&r"(temp), // %[temp]
  1733. #if defined(__i386__) && defined(__pic__)
  1734. [width]"+m"(width) // %[width]
  1735. #else
  1736. [width]"+rm"(width) // %[width]
  1737. #endif
  1738. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1739. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1740. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1741. );
  1742. }
  1743. #endif
  1744. void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
  1745. const uint8* uv_buf,
  1746. uint8* dst_argb,
  1747. const struct YuvConstants* yuvconstants,
  1748. int width) {
  1749. asm volatile (
  1750. YUVTORGB_SETUP(yuvconstants)
  1751. "pcmpeqb %%xmm5,%%xmm5 \n"
  1752. LABELALIGN
  1753. "1: \n"
  1754. READNV12
  1755. YUVTORGB(yuvconstants)
  1756. STOREARGB
  1757. "sub $0x8,%[width] \n"
  1758. "jg 1b \n"
  1759. : [y_buf]"+r"(y_buf), // %[y_buf]
  1760. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  1761. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1762. [width]"+rm"(width) // %[width]
  1763. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1764. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1765. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1766. );
  1767. }
  1768. void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
  1769. const uint8* vu_buf,
  1770. uint8* dst_argb,
  1771. const struct YuvConstants* yuvconstants,
  1772. int width) {
  1773. asm volatile (
  1774. YUVTORGB_SETUP(yuvconstants)
  1775. "pcmpeqb %%xmm5,%%xmm5 \n"
  1776. LABELALIGN
  1777. "1: \n"
  1778. READNV21
  1779. YUVTORGB(yuvconstants)
  1780. STOREARGB
  1781. "sub $0x8,%[width] \n"
  1782. "jg 1b \n"
  1783. : [y_buf]"+r"(y_buf), // %[y_buf]
  1784. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  1785. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1786. [width]"+rm"(width) // %[width]
  1787. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1788. [kShuffleNV21]"m"(kShuffleNV21)
  1789. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1790. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1791. );
  1792. }
  1793. void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
  1794. uint8* dst_argb,
  1795. const struct YuvConstants* yuvconstants,
  1796. int width) {
  1797. asm volatile (
  1798. YUVTORGB_SETUP(yuvconstants)
  1799. "pcmpeqb %%xmm5,%%xmm5 \n"
  1800. LABELALIGN
  1801. "1: \n"
  1802. READYUY2
  1803. YUVTORGB(yuvconstants)
  1804. STOREARGB
  1805. "sub $0x8,%[width] \n"
  1806. "jg 1b \n"
  1807. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  1808. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1809. [width]"+rm"(width) // %[width]
  1810. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1811. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  1812. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  1813. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1814. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1815. );
  1816. }
  1817. void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
  1818. uint8* dst_argb,
  1819. const struct YuvConstants* yuvconstants,
  1820. int width) {
  1821. asm volatile (
  1822. YUVTORGB_SETUP(yuvconstants)
  1823. "pcmpeqb %%xmm5,%%xmm5 \n"
  1824. LABELALIGN
  1825. "1: \n"
  1826. READUYVY
  1827. YUVTORGB(yuvconstants)
  1828. STOREARGB
  1829. "sub $0x8,%[width] \n"
  1830. "jg 1b \n"
  1831. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  1832. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1833. [width]"+rm"(width) // %[width]
  1834. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1835. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  1836. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  1837. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1838. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1839. );
  1840. }
  1841. void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
  1842. const uint8* u_buf,
  1843. const uint8* v_buf,
  1844. uint8* dst_rgba,
  1845. const struct YuvConstants* yuvconstants,
  1846. int width) {
  1847. asm volatile (
  1848. YUVTORGB_SETUP(yuvconstants)
  1849. "sub %[u_buf],%[v_buf] \n"
  1850. "pcmpeqb %%xmm5,%%xmm5 \n"
  1851. LABELALIGN
  1852. "1: \n"
  1853. READYUV422
  1854. YUVTORGB(yuvconstants)
  1855. STORERGBA
  1856. "sub $0x8,%[width] \n"
  1857. "jg 1b \n"
  1858. : [y_buf]"+r"(y_buf), // %[y_buf]
  1859. [u_buf]"+r"(u_buf), // %[u_buf]
  1860. [v_buf]"+r"(v_buf), // %[v_buf]
  1861. [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
  1862. [width]"+rm"(width) // %[width]
  1863. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1864. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1865. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1866. );
  1867. }
  1868. #endif // HAS_I422TOARGBROW_SSSE3
  1869. // Read 16 UV from 444
  1870. #define READYUV444_AVX2 \
  1871. "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1872. MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1873. "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
  1874. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1875. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  1876. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1877. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1878. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1879. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1880. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1881. // Read 8 UV from 422, upsample to 16 UV.
  1882. #define READYUV422_AVX2 \
  1883. "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1884. MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1885. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1886. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1887. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1888. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1889. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1890. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1891. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1892. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1893. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  1894. #define READYUVA422_AVX2 \
  1895. "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1896. MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1897. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1898. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1899. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1900. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1901. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1902. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1903. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1904. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
  1905. "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  1906. "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  1907. "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
  1908. // Read 4 UV from 411, upsample to 16 UV.
  1909. #define READYUV411_AVX2 \
  1910. "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1911. MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1912. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1913. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1914. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1915. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1916. "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
  1917. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1918. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1919. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1920. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1921. // Read 8 UV from NV12, upsample to 16 UV.
  1922. #define READNV12_AVX2 \
  1923. "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  1924. "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
  1925. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1926. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1927. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1928. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1929. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1930. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1931. // Read 8 VU from NV21, upsample to 16 UV.
  1932. #define READNV21_AVX2 \
  1933. "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  1934. "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
  1935. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1936. "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  1937. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1938. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1939. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1940. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1941. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  1942. #define READYUY2_AVX2 \
  1943. "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
  1944. "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  1945. "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
  1946. "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  1947. "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
  1948. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  1949. #define READUYVY_AVX2 \
  1950. "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
  1951. "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  1952. "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
  1953. "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  1954. "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
  1955. #if defined(__x86_64__)
  1956. #define YUVTORGB_SETUP_AVX2(yuvconstants) \
  1957. "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
  1958. "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
  1959. "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
  1960. "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
  1961. "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
  1962. "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
  1963. "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
  1964. #define YUVTORGB_AVX2(yuvconstants) \
  1965. "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  1966. "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  1967. "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  1968. "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  1969. "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  1970. "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  1971. "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  1972. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  1973. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  1974. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  1975. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  1976. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  1977. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  1978. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  1979. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  1980. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  1981. #define YUVTORGB_REGS_AVX2 \
  1982. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1983. #else // Convert 16 pixels: 16 UV and 16 Y.
  1984. #define YUVTORGB_SETUP_AVX2(yuvconstants)
  1985. #define YUVTORGB_AVX2(yuvconstants) \
  1986. "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
  1987. "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
  1988. "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
  1989. "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
  1990. "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
  1991. "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
  1992. "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
  1993. "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
  1994. "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
  1995. "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
  1996. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  1997. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  1998. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  1999. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  2000. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  2001. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  2002. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2003. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  2004. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  2005. #define YUVTORGB_REGS_AVX2
  2006. #endif
  2007. // Store 16 ARGB values.
  2008. #define STOREARGB_AVX2 \
  2009. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2010. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2011. "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  2012. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  2013. "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  2014. "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  2015. "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
  2016. "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
  2017. "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
  2018. #ifdef HAS_I444TOARGBROW_AVX2
  2019. // 16 pixels
  2020. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  2021. void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
  2022. const uint8* u_buf,
  2023. const uint8* v_buf,
  2024. uint8* dst_argb,
  2025. const struct YuvConstants* yuvconstants,
  2026. int width) {
  2027. asm volatile (
  2028. YUVTORGB_SETUP_AVX2(yuvconstants)
  2029. "sub %[u_buf],%[v_buf] \n"
  2030. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2031. LABELALIGN
  2032. "1: \n"
  2033. READYUV444_AVX2
  2034. YUVTORGB_AVX2(yuvconstants)
  2035. STOREARGB_AVX2
  2036. "sub $0x10,%[width] \n"
  2037. "jg 1b \n"
  2038. "vzeroupper \n"
  2039. : [y_buf]"+r"(y_buf), // %[y_buf]
  2040. [u_buf]"+r"(u_buf), // %[u_buf]
  2041. [v_buf]"+r"(v_buf), // %[v_buf]
  2042. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2043. [width]"+rm"(width) // %[width]
  2044. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2045. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2046. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2047. );
  2048. }
  2049. #endif // HAS_I444TOARGBROW_AVX2
  2050. #ifdef HAS_I411TOARGBROW_AVX2
  2051. // 16 pixels
  2052. // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2053. void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
  2054. const uint8* u_buf,
  2055. const uint8* v_buf,
  2056. uint8* dst_argb,
  2057. const struct YuvConstants* yuvconstants,
  2058. int width) {
  2059. asm volatile (
  2060. YUVTORGB_SETUP_AVX2(yuvconstants)
  2061. "sub %[u_buf],%[v_buf] \n"
  2062. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2063. LABELALIGN
  2064. "1: \n"
  2065. READYUV411_AVX2
  2066. YUVTORGB_AVX2(yuvconstants)
  2067. STOREARGB_AVX2
  2068. "sub $0x10,%[width] \n"
  2069. "jg 1b \n"
  2070. "vzeroupper \n"
  2071. : [y_buf]"+r"(y_buf), // %[y_buf]
  2072. [u_buf]"+r"(u_buf), // %[u_buf]
  2073. [v_buf]"+r"(v_buf), // %[v_buf]
  2074. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2075. [width]"+rm"(width) // %[width]
  2076. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2077. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2078. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2079. );
  2080. }
  2081. #endif // HAS_I411TOARGBROW_AVX2
  2082. #if defined(HAS_I422TOARGBROW_AVX2)
  2083. // 16 pixels
  2084. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2085. void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
  2086. const uint8* u_buf,
  2087. const uint8* v_buf,
  2088. uint8* dst_argb,
  2089. const struct YuvConstants* yuvconstants,
  2090. int width) {
  2091. asm volatile (
  2092. YUVTORGB_SETUP_AVX2(yuvconstants)
  2093. "sub %[u_buf],%[v_buf] \n"
  2094. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2095. LABELALIGN
  2096. "1: \n"
  2097. READYUV422_AVX2
  2098. YUVTORGB_AVX2(yuvconstants)
  2099. STOREARGB_AVX2
  2100. "sub $0x10,%[width] \n"
  2101. "jg 1b \n"
  2102. "vzeroupper \n"
  2103. : [y_buf]"+r"(y_buf), // %[y_buf]
  2104. [u_buf]"+r"(u_buf), // %[u_buf]
  2105. [v_buf]"+r"(v_buf), // %[v_buf]
  2106. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2107. [width]"+rm"(width) // %[width]
  2108. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2109. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2110. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2111. );
  2112. }
  2113. #endif // HAS_I422TOARGBROW_AVX2
  2114. #if defined(HAS_I422ALPHATOARGBROW_AVX2)
  2115. // 16 pixels
  2116. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  2117. void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
  2118. const uint8* u_buf,
  2119. const uint8* v_buf,
  2120. const uint8* a_buf,
  2121. uint8* dst_argb,
  2122. const struct YuvConstants* yuvconstants,
  2123. int width) {
  2124. asm volatile (
  2125. YUVTORGB_SETUP_AVX2(yuvconstants)
  2126. "sub %[u_buf],%[v_buf] \n"
  2127. LABELALIGN
  2128. "1: \n"
  2129. READYUVA422_AVX2
  2130. YUVTORGB_AVX2(yuvconstants)
  2131. STOREARGB_AVX2
  2132. "subl $0x10,%[width] \n"
  2133. "jg 1b \n"
  2134. "vzeroupper \n"
  2135. : [y_buf]"+r"(y_buf), // %[y_buf]
  2136. [u_buf]"+r"(u_buf), // %[u_buf]
  2137. [v_buf]"+r"(v_buf), // %[v_buf]
  2138. [a_buf]"+r"(a_buf), // %[a_buf]
  2139. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2140. #if defined(__i386__) && defined(__pic__)
  2141. [width]"+m"(width) // %[width]
  2142. #else
  2143. [width]"+rm"(width) // %[width]
  2144. #endif
  2145. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2146. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2147. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2148. );
  2149. }
  2150. #endif // HAS_I422ALPHATOARGBROW_AVX2
  2151. #if defined(HAS_I422TORGBAROW_AVX2)
  2152. // 16 pixels
  2153. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2154. void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
  2155. const uint8* u_buf,
  2156. const uint8* v_buf,
  2157. uint8* dst_argb,
  2158. const struct YuvConstants* yuvconstants,
  2159. int width) {
  2160. asm volatile (
  2161. YUVTORGB_SETUP_AVX2(yuvconstants)
  2162. "sub %[u_buf],%[v_buf] \n"
  2163. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2164. LABELALIGN
  2165. "1: \n"
  2166. READYUV422_AVX2
  2167. YUVTORGB_AVX2(yuvconstants)
  2168. // Step 3: Weave into RGBA
  2169. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  2170. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2171. "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
  2172. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2173. "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
  2174. "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
  2175. "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
  2176. "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
  2177. "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
  2178. "sub $0x10,%[width] \n"
  2179. "jg 1b \n"
  2180. "vzeroupper \n"
  2181. : [y_buf]"+r"(y_buf), // %[y_buf]
  2182. [u_buf]"+r"(u_buf), // %[u_buf]
  2183. [v_buf]"+r"(v_buf), // %[v_buf]
  2184. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2185. [width]"+rm"(width) // %[width]
  2186. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2187. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2188. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2189. );
  2190. }
  2191. #endif // HAS_I422TORGBAROW_AVX2
  2192. #if defined(HAS_NV12TOARGBROW_AVX2)
  2193. // 16 pixels.
  2194. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2195. void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
  2196. const uint8* uv_buf,
  2197. uint8* dst_argb,
  2198. const struct YuvConstants* yuvconstants,
  2199. int width) {
  2200. asm volatile (
  2201. YUVTORGB_SETUP_AVX2(yuvconstants)
  2202. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2203. LABELALIGN
  2204. "1: \n"
  2205. READNV12_AVX2
  2206. YUVTORGB_AVX2(yuvconstants)
  2207. STOREARGB_AVX2
  2208. "sub $0x10,%[width] \n"
  2209. "jg 1b \n"
  2210. "vzeroupper \n"
  2211. : [y_buf]"+r"(y_buf), // %[y_buf]
  2212. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2213. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2214. [width]"+rm"(width) // %[width]
  2215. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2216. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2217. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2218. );
  2219. }
  2220. #endif // HAS_NV12TOARGBROW_AVX2
  2221. #if defined(HAS_NV21TOARGBROW_AVX2)
  2222. // 16 pixels.
  2223. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2224. void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
  2225. const uint8* vu_buf,
  2226. uint8* dst_argb,
  2227. const struct YuvConstants* yuvconstants,
  2228. int width) {
  2229. asm volatile (
  2230. YUVTORGB_SETUP_AVX2(yuvconstants)
  2231. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2232. LABELALIGN
  2233. "1: \n"
  2234. READNV21_AVX2
  2235. YUVTORGB_AVX2(yuvconstants)
  2236. STOREARGB_AVX2
  2237. "sub $0x10,%[width] \n"
  2238. "jg 1b \n"
  2239. "vzeroupper \n"
  2240. : [y_buf]"+r"(y_buf), // %[y_buf]
  2241. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2242. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2243. [width]"+rm"(width) // %[width]
  2244. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2245. [kShuffleNV21]"m"(kShuffleNV21)
  2246. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2247. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2248. );
  2249. }
  2250. #endif // HAS_NV21TOARGBROW_AVX2
  2251. #if defined(HAS_YUY2TOARGBROW_AVX2)
  2252. // 16 pixels.
  2253. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2254. void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
  2255. uint8* dst_argb,
  2256. const struct YuvConstants* yuvconstants,
  2257. int width) {
  2258. asm volatile (
  2259. YUVTORGB_SETUP_AVX2(yuvconstants)
  2260. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2261. LABELALIGN
  2262. "1: \n"
  2263. READYUY2_AVX2
  2264. YUVTORGB_AVX2(yuvconstants)
  2265. STOREARGB_AVX2
  2266. "sub $0x10,%[width] \n"
  2267. "jg 1b \n"
  2268. "vzeroupper \n"
  2269. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2270. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2271. [width]"+rm"(width) // %[width]
  2272. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2273. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2274. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2275. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2276. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2277. );
  2278. }
  2279. #endif // HAS_YUY2TOARGBROW_AVX2
  2280. #if defined(HAS_UYVYTOARGBROW_AVX2)
  2281. // 16 pixels.
  2282. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2283. void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
  2284. uint8* dst_argb,
  2285. const struct YuvConstants* yuvconstants,
  2286. int width) {
  2287. asm volatile (
  2288. YUVTORGB_SETUP_AVX2(yuvconstants)
  2289. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2290. LABELALIGN
  2291. "1: \n"
  2292. READUYVY_AVX2
  2293. YUVTORGB_AVX2(yuvconstants)
  2294. STOREARGB_AVX2
  2295. "sub $0x10,%[width] \n"
  2296. "jg 1b \n"
  2297. "vzeroupper \n"
  2298. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2299. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2300. [width]"+rm"(width) // %[width]
  2301. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2302. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2303. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2304. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2305. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2306. );
  2307. }
  2308. #endif // HAS_UYVYTOARGBROW_AVX2
  2309. #ifdef HAS_I400TOARGBROW_SSE2
  2310. void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  2311. asm volatile (
  2312. "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
  2313. "movd %%eax,%%xmm2 \n"
  2314. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  2315. "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
  2316. "movd %%eax,%%xmm3 \n"
  2317. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  2318. "pcmpeqb %%xmm4,%%xmm4 \n"
  2319. "pslld $0x18,%%xmm4 \n"
  2320. LABELALIGN
  2321. "1: \n"
  2322. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2323. "movq " MEMACCESS(0) ",%%xmm0 \n"
  2324. "lea " MEMLEA(0x8,0) ",%0 \n"
  2325. "punpcklbw %%xmm0,%%xmm0 \n"
  2326. "pmulhuw %%xmm2,%%xmm0 \n"
  2327. "psubusw %%xmm3,%%xmm0 \n"
  2328. "psrlw $6, %%xmm0 \n"
  2329. "packuswb %%xmm0,%%xmm0 \n"
  2330. // Step 2: Weave into ARGB
  2331. "punpcklbw %%xmm0,%%xmm0 \n"
  2332. "movdqa %%xmm0,%%xmm1 \n"
  2333. "punpcklwd %%xmm0,%%xmm0 \n"
  2334. "punpckhwd %%xmm1,%%xmm1 \n"
  2335. "por %%xmm4,%%xmm0 \n"
  2336. "por %%xmm4,%%xmm1 \n"
  2337. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2338. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  2339. "lea " MEMLEA(0x20,1) ",%1 \n"
  2340. "sub $0x8,%2 \n"
  2341. "jg 1b \n"
  2342. : "+r"(y_buf), // %0
  2343. "+r"(dst_argb), // %1
  2344. "+rm"(width) // %2
  2345. :
  2346. : "memory", "cc", "eax"
  2347. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2348. );
  2349. }
  2350. #endif // HAS_I400TOARGBROW_SSE2
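// Worked example of the I400 fixed-point scaling above (the AVX2 version
// below uses the same constants). punpcklbw y,y makes y * 0x0101 = y * 257,
// pmulhuw by 0x4a35 (18997) keeps the high 16 bits, psubusw subtracts
// 0x0488 (1160) saturating at 0, and psrlw $6 divides by 64:
//   g = ((((y * 257) * 18997) >> 16) - 1160) >> 6    // ~ (y - 16) * 1.164
//   y = 16  -> ((4112 * 18997) >> 16) = 1191;  (1191 - 1160) >> 6 = 0
//   y = 235 -> ((60395 * 18997) >> 16) = 17506; (17506 - 1160) >> 6 = 255
// The result is then splatted to B, G and R and OR'ed with 0xff alpha.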
  2351. #ifdef HAS_I400TOARGBROW_AVX2
  2352. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2353. // note: vpunpcklbw mutates and vpackuswb unmutates.
  2354. void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  2355. asm volatile (
  2356. "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
  2357. "vmovd %%eax,%%xmm2 \n"
  2358. "vbroadcastss %%xmm2,%%ymm2 \n"
  2359. "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
  2360. "vmovd %%eax,%%xmm3 \n"
  2361. "vbroadcastss %%xmm3,%%ymm3 \n"
  2362. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
  2363. "vpslld $0x18,%%ymm4,%%ymm4 \n"
  2364. LABELALIGN
  2365. "1: \n"
  2366. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2367. "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
  2368. "lea " MEMLEA(0x10,0) ",%0 \n"
  2369. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2370. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  2371. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  2372. "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
  2373. "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
  2374. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  2375. "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
  2376. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2377. "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
  2378. "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
  2379. "vpor %%ymm4,%%ymm0,%%ymm0 \n"
  2380. "vpor %%ymm4,%%ymm1,%%ymm1 \n"
  2381. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2382. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  2383. "lea " MEMLEA(0x40,1) ",%1 \n"
  2384. "sub $0x10,%2 \n"
  2385. "jg 1b \n"
  2386. "vzeroupper \n"
  2387. : "+r"(y_buf), // %0
  2388. "+r"(dst_argb), // %1
  2389. "+rm"(width) // %2
  2390. :
  2391. : "memory", "cc", "eax"
  2392. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2393. );
  2394. }
  2395. #endif // HAS_I400TOARGBROW_AVX2
  2396. #ifdef HAS_MIRRORROW_SSSE3
  2397. // Shuffle table for reversing the bytes.
  2398. static uvec8 kShuffleMirror = {
  2399. 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2400. };
  2401. void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  2402. intptr_t temp_width = (intptr_t)(width);
  2403. asm volatile (
  2404. "movdqa %3,%%xmm5 \n"
  2405. LABELALIGN
  2406. "1: \n"
  2407. MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
  2408. "pshufb %%xmm5,%%xmm0 \n"
  2409. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2410. "lea " MEMLEA(0x10,1) ",%1 \n"
  2411. "sub $0x10,%2 \n"
  2412. "jg 1b \n"
  2413. : "+r"(src), // %0
  2414. "+r"(dst), // %1
  2415. "+r"(temp_width) // %2
  2416. : "m"(kShuffleMirror) // %3
  2417. : "memory", "cc", NACL_R14
  2418. "xmm0", "xmm5"
  2419. );
  2420. }
  2421. #endif // HAS_MIRRORROW_SSSE3
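// Scalar equivalent of MirrorRow (the SSSE3 version above and the AVX2
// version below do this 16 or 32 bytes at a time, loading from
// src + width - N and byte-reversing with pshufb/vpshufb):
//   for (int i = 0; i < width; ++i) dst[i] = src[width - 1 - i];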
  2422. #ifdef HAS_MIRRORROW_AVX2
  2423. void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2424. intptr_t temp_width = (intptr_t)(width);
  2425. asm volatile (
  2426. "vbroadcastf128 %3,%%ymm5 \n"
  2427. LABELALIGN
  2428. "1: \n"
  2429. MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
  2430. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  2431. "vpermq $0x4e,%%ymm0,%%ymm0 \n"
  2432. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2433. "lea " MEMLEA(0x20,1) ",%1 \n"
  2434. "sub $0x20,%2 \n"
  2435. "jg 1b \n"
  2436. "vzeroupper \n"
  2437. : "+r"(src), // %0
  2438. "+r"(dst), // %1
  2439. "+r"(temp_width) // %2
  2440. : "m"(kShuffleMirror) // %3
  2441. : "memory", "cc", NACL_R14
  2442. "xmm0", "xmm5"
  2443. );
  2444. }
  2445. #endif // HAS_MIRRORROW_AVX2
  2446. #ifdef HAS_MIRRORUVROW_SSSE3
  2447. // Shuffle table for reversing the bytes of UV channels.
  2448. static uvec8 kShuffleMirrorUV = {
  2449. 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
  2450. };
  2451. void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
  2452. int width) {
  2453. intptr_t temp_width = (intptr_t)(width);
  2454. asm volatile (
  2455. "movdqa %4,%%xmm1 \n"
  2456. "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
  2457. "sub %1,%2 \n"
  2458. LABELALIGN
  2459. "1: \n"
  2460. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2461. "lea " MEMLEA(-0x10,0) ",%0 \n"
  2462. "pshufb %%xmm1,%%xmm0 \n"
  2463. "movlpd %%xmm0," MEMACCESS(1) " \n"
  2464. MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
  2465. "lea " MEMLEA(0x8,1) ",%1 \n"
  2466. "sub $8,%3 \n"
  2467. "jg 1b \n"
  2468. : "+r"(src), // %0
  2469. "+r"(dst_u), // %1
  2470. "+r"(dst_v), // %2
  2471. "+r"(temp_width) // %3
  2472. : "m"(kShuffleMirrorUV) // %4
  2473. : "memory", "cc", NACL_R14
  2474. "xmm0", "xmm1"
  2475. );
  2476. }
  2477. #endif // HAS_MIRRORUVROW_SSSE3
  2478. #ifdef HAS_ARGBMIRRORROW_SSE2
  2479. void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  2480. intptr_t temp_width = (intptr_t)(width);
  2481. asm volatile (
  2482. "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
  2483. LABELALIGN
  2484. "1: \n"
  2485. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2486. "pshufd $0x1b,%%xmm0,%%xmm0 \n"
  2487. "lea " MEMLEA(-0x10,0) ",%0 \n"
  2488. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2489. "lea " MEMLEA(0x10,1) ",%1 \n"
  2490. "sub $0x4,%2 \n"
  2491. "jg 1b \n"
  2492. : "+r"(src), // %0
  2493. "+r"(dst), // %1
  2494. "+r"(temp_width) // %2
  2495. :
  2496. : "memory", "cc"
  2497. , "xmm0"
  2498. );
  2499. }
  2500. #endif // HAS_ARGBMIRRORROW_SSE2
  2501. #ifdef HAS_ARGBMIRRORROW_AVX2
  2502. // Shuffle table for reversing the bytes.
  2503. static const ulvec32 kARGBShuffleMirror_AVX2 = {
  2504. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2505. };
  2506. void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2507. intptr_t temp_width = (intptr_t)(width);
  2508. asm volatile (
  2509. "vmovdqu %3,%%ymm5 \n"
  2510. LABELALIGN
  2511. "1: \n"
  2512. VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
  2513. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2514. "lea " MEMLEA(0x20,1) ",%1 \n"
  2515. "sub $0x8,%2 \n"
  2516. "jg 1b \n"
  2517. "vzeroupper \n"
  2518. : "+r"(src), // %0
  2519. "+r"(dst), // %1
  2520. "+r"(temp_width) // %2
  2521. : "m"(kARGBShuffleMirror_AVX2) // %3
  2522. : "memory", "cc", NACL_R14
  2523. "xmm0", "xmm5"
  2524. );
  2525. }
  2526. #endif // HAS_ARGBMIRRORROW_AVX2
  2527. #ifdef HAS_SPLITUVROW_AVX2
  2528. void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  2529. int width) {
  2530. asm volatile (
  2531. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2532. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  2533. "sub %1,%2 \n"
  2534. LABELALIGN
  2535. "1: \n"
  2536. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2537. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  2538. "lea " MEMLEA(0x40,0) ",%0 \n"
  2539. "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
  2540. "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
  2541. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  2542. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  2543. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  2544. "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
  2545. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2546. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2547. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2548. MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
  2549. "lea " MEMLEA(0x20,1) ",%1 \n"
  2550. "sub $0x20,%3 \n"
  2551. "jg 1b \n"
  2552. "vzeroupper \n"
  2553. : "+r"(src_uv), // %0
  2554. "+r"(dst_u), // %1
  2555. "+r"(dst_v), // %2
  2556. "+r"(width) // %3
  2557. :
  2558. : "memory", "cc", NACL_R14
  2559. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2560. );
  2561. }
  2562. #endif // HAS_SPLITUVROW_AVX2
  2563. #ifdef HAS_SPLITUVROW_SSE2
  2564. void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  2565. int width) {
  2566. asm volatile (
  2567. "pcmpeqb %%xmm5,%%xmm5 \n"
  2568. "psrlw $0x8,%%xmm5 \n"
  2569. "sub %1,%2 \n"
  2570. LABELALIGN
  2571. "1: \n"
  2572. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2573. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2574. "lea " MEMLEA(0x20,0) ",%0 \n"
  2575. "movdqa %%xmm0,%%xmm2 \n"
  2576. "movdqa %%xmm1,%%xmm3 \n"
  2577. "pand %%xmm5,%%xmm0 \n"
  2578. "pand %%xmm5,%%xmm1 \n"
  2579. "packuswb %%xmm1,%%xmm0 \n"
  2580. "psrlw $0x8,%%xmm2 \n"
  2581. "psrlw $0x8,%%xmm3 \n"
  2582. "packuswb %%xmm3,%%xmm2 \n"
  2583. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2584. MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
  2585. "lea " MEMLEA(0x10,1) ",%1 \n"
  2586. "sub $0x10,%3 \n"
  2587. "jg 1b \n"
  2588. : "+r"(src_uv), // %0
  2589. "+r"(dst_u), // %1
  2590. "+r"(dst_v), // %2
  2591. "+r"(width) // %3
  2592. :
  2593. : "memory", "cc", NACL_R14
  2594. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2595. );
  2596. }
  2597. #endif // HAS_SPLITUVROW_SSE2
  2598. #ifdef HAS_MERGEUVROW_AVX2
  2599. void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  2600. int width) {
  2601. asm volatile (
  2602. "sub %0,%1 \n"
  2603. LABELALIGN
  2604. "1: \n"
  2605. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2606. MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
  2607. "lea " MEMLEA(0x20,0) ",%0 \n"
  2608. "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
  2609. "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
  2610. "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
  2611. "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
  2612. "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
  2613. "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
  2614. "lea " MEMLEA(0x40,2) ",%2 \n"
  2615. "sub $0x20,%3 \n"
  2616. "jg 1b \n"
  2617. "vzeroupper \n"
  2618. : "+r"(src_u), // %0
  2619. "+r"(src_v), // %1
  2620. "+r"(dst_uv), // %2
  2621. "+r"(width) // %3
  2622. :
  2623. : "memory", "cc", NACL_R14
  2624. "xmm0", "xmm1", "xmm2"
  2625. );
  2626. }
  2627. #endif // HAS_MERGEUVROW_AVX2
  2628. #ifdef HAS_MERGEUVROW_SSE2
  2629. void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  2630. int width) {
  2631. asm volatile (
  2632. "sub %0,%1 \n"
  2633. LABELALIGN
  2634. "1: \n"
  2635. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2636. MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
  2637. "lea " MEMLEA(0x10,0) ",%0 \n"
  2638. "movdqa %%xmm0,%%xmm2 \n"
  2639. "punpcklbw %%xmm1,%%xmm0 \n"
  2640. "punpckhbw %%xmm1,%%xmm2 \n"
  2641. "movdqu %%xmm0," MEMACCESS(2) " \n"
  2642. "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
  2643. "lea " MEMLEA(0x20,2) ",%2 \n"
  2644. "sub $0x10,%3 \n"
  2645. "jg 1b \n"
  2646. : "+r"(src_u), // %0
  2647. "+r"(src_v), // %1
  2648. "+r"(dst_uv), // %2
  2649. "+r"(width) // %3
  2650. :
  2651. : "memory", "cc", NACL_R14
  2652. "xmm0", "xmm1", "xmm2"
  2653. );
  2654. }
  2655. #endif // HAS_MERGEUVROW_SSE2
  2656. #ifdef HAS_COPYROW_SSE2
  2657. void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  2658. asm volatile (
  2659. "test $0xf,%0 \n"
  2660. "jne 2f \n"
  2661. "test $0xf,%1 \n"
  2662. "jne 2f \n"
  2663. LABELALIGN
  2664. "1: \n"
  2665. "movdqa " MEMACCESS(0) ",%%xmm0 \n"
  2666. "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2667. "lea " MEMLEA(0x20,0) ",%0 \n"
  2668. "movdqa %%xmm0," MEMACCESS(1) " \n"
  2669. "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
  2670. "lea " MEMLEA(0x20,1) ",%1 \n"
  2671. "sub $0x20,%2 \n"
  2672. "jg 1b \n"
  2673. "jmp 9f \n"
  2674. LABELALIGN
  2675. "2: \n"
  2676. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2677. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2678. "lea " MEMLEA(0x20,0) ",%0 \n"
  2679. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2680. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  2681. "lea " MEMLEA(0x20,1) ",%1 \n"
  2682. "sub $0x20,%2 \n"
  2683. "jg 2b \n"
  2684. "9: \n"
  2685. : "+r"(src), // %0
  2686. "+r"(dst), // %1
  2687. "+r"(count) // %2
  2688. :
  2689. : "memory", "cc"
  2690. , "xmm0", "xmm1"
  2691. );
  2692. }
  2693. #endif // HAS_COPYROW_SSE2
  2694. #ifdef HAS_COPYROW_AVX
  2695. void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  2696. asm volatile (
  2697. LABELALIGN
  2698. "1: \n"
  2699. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2700. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  2701. "lea " MEMLEA(0x40,0) ",%0 \n"
  2702. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2703. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  2704. "lea " MEMLEA(0x40,1) ",%1 \n"
  2705. "sub $0x40,%2 \n"
  2706. "jg 1b \n"
  2707. : "+r"(src), // %0
  2708. "+r"(dst), // %1
  2709. "+r"(count) // %2
  2710. :
  2711. : "memory", "cc"
  2712. , "xmm0", "xmm1"
  2713. );
  2714. }
  2715. #endif // HAS_COPYROW_AVX
  2716. #ifdef HAS_COPYROW_ERMS
  2717. // Multiple of 1.
  2718. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  2719. size_t width_tmp = (size_t)(width);
  2720. asm volatile (
  2721. "rep movsb " MEMMOVESTRING(0,1) " \n"
  2722. : "+S"(src), // %0
  2723. "+D"(dst), // %1
  2724. "+c"(width_tmp) // %2
  2725. :
  2726. : "memory", "cc"
  2727. );
  2728. }
  2729. #endif // HAS_COPYROW_ERMS
  2730. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  2731. // width in pixels
  2732. void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  2733. asm volatile (
  2734. "pcmpeqb %%xmm0,%%xmm0 \n"
  2735. "pslld $0x18,%%xmm0 \n"
  2736. "pcmpeqb %%xmm1,%%xmm1 \n"
  2737. "psrld $0x8,%%xmm1 \n"
  2738. LABELALIGN
  2739. "1: \n"
  2740. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  2741. "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
  2742. "lea " MEMLEA(0x20,0) ",%0 \n"
  2743. "movdqu " MEMACCESS(1) ",%%xmm4 \n"
  2744. "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
  2745. "pand %%xmm0,%%xmm2 \n"
  2746. "pand %%xmm0,%%xmm3 \n"
  2747. "pand %%xmm1,%%xmm4 \n"
  2748. "pand %%xmm1,%%xmm5 \n"
  2749. "por %%xmm4,%%xmm2 \n"
  2750. "por %%xmm5,%%xmm3 \n"
  2751. "movdqu %%xmm2," MEMACCESS(1) " \n"
  2752. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  2753. "lea " MEMLEA(0x20,1) ",%1 \n"
  2754. "sub $0x8,%2 \n"
  2755. "jg 1b \n"
  2756. : "+r"(src), // %0
  2757. "+r"(dst), // %1
  2758. "+r"(width) // %2
  2759. :
  2760. : "memory", "cc"
  2761. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2762. );
  2763. }
  2764. #endif // HAS_ARGBCOPYALPHAROW_SSE2
  2765. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  2766. // width in pixels
  2767. void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  2768. asm volatile (
  2769. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  2770. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  2771. LABELALIGN
  2772. "1: \n"
  2773. "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
  2774. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
  2775. "lea " MEMLEA(0x40,0) ",%0 \n"
  2776. "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
  2777. "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
  2778. "vmovdqu %%ymm1," MEMACCESS(1) " \n"
  2779. "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
  2780. "lea " MEMLEA(0x40,1) ",%1 \n"
  2781. "sub $0x10,%2 \n"
  2782. "jg 1b \n"
  2783. "vzeroupper \n"
  2784. : "+r"(src), // %0
  2785. "+r"(dst), // %1
  2786. "+r"(width) // %2
  2787. :
  2788. : "memory", "cc"
  2789. , "xmm0", "xmm1", "xmm2"
  2790. );
  2791. }
  2792. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  2793. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  2794. // width in pixels
  2795. void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  2796. asm volatile (
  2797. LABELALIGN
  2798. "1: \n"
  2799. "movdqu " MEMACCESS(0) ", %%xmm0 \n"
  2800. "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
  2801. "lea " MEMLEA(0x20, 0) ", %0 \n"
  2802. "psrld $0x18, %%xmm0 \n"
  2803. "psrld $0x18, %%xmm1 \n"
  2804. "packssdw %%xmm1, %%xmm0 \n"
  2805. "packuswb %%xmm0, %%xmm0 \n"
  2806. "movq %%xmm0," MEMACCESS(1) " \n"
  2807. "lea " MEMLEA(0x8, 1) ", %1 \n"
  2808. "sub $0x8, %2 \n"
  2809. "jg 1b \n"
  2810. : "+r"(src_argb), // %0
  2811. "+r"(dst_a), // %1
  2812. "+rm"(width) // %2
  2813. :
  2814. : "memory", "cc"
  2815. , "xmm0", "xmm1"
  2816. );
  2817. }
  2818. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  2819. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  2820. // width in pixels
  2821. void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  2822. asm volatile (
  2823. "pcmpeqb %%xmm0,%%xmm0 \n"
  2824. "pslld $0x18,%%xmm0 \n"
  2825. "pcmpeqb %%xmm1,%%xmm1 \n"
  2826. "psrld $0x8,%%xmm1 \n"
  2827. LABELALIGN
  2828. "1: \n"
  2829. "movq " MEMACCESS(0) ",%%xmm2 \n"
  2830. "lea " MEMLEA(0x8,0) ",%0 \n"
  2831. "punpcklbw %%xmm2,%%xmm2 \n"
  2832. "punpckhwd %%xmm2,%%xmm3 \n"
  2833. "punpcklwd %%xmm2,%%xmm2 \n"
  2834. "movdqu " MEMACCESS(1) ",%%xmm4 \n"
  2835. "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
  2836. "pand %%xmm0,%%xmm2 \n"
  2837. "pand %%xmm0,%%xmm3 \n"
  2838. "pand %%xmm1,%%xmm4 \n"
  2839. "pand %%xmm1,%%xmm5 \n"
  2840. "por %%xmm4,%%xmm2 \n"
  2841. "por %%xmm5,%%xmm3 \n"
  2842. "movdqu %%xmm2," MEMACCESS(1) " \n"
  2843. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  2844. "lea " MEMLEA(0x20,1) ",%1 \n"
  2845. "sub $0x8,%2 \n"
  2846. "jg 1b \n"
  2847. : "+r"(src), // %0
  2848. "+r"(dst), // %1
  2849. "+r"(width) // %2
  2850. :
  2851. : "memory", "cc"
  2852. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2853. );
  2854. }
  2855. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  2856. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  2857. // width in pixels
  2858. void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  2859. asm volatile (
  2860. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  2861. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  2862. LABELALIGN
  2863. "1: \n"
  2864. "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
  2865. "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
  2866. "lea " MEMLEA(0x10,0) ",%0 \n"
  2867. "vpslld $0x18,%%ymm1,%%ymm1 \n"
  2868. "vpslld $0x18,%%ymm2,%%ymm2 \n"
  2869. "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
  2870. "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
  2871. "vmovdqu %%ymm1," MEMACCESS(1) " \n"
  2872. "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
  2873. "lea " MEMLEA(0x40,1) ",%1 \n"
  2874. "sub $0x10,%2 \n"
  2875. "jg 1b \n"
  2876. "vzeroupper \n"
  2877. : "+r"(src), // %0
  2878. "+r"(dst), // %1
  2879. "+r"(width) // %2
  2880. :
  2881. : "memory", "cc"
  2882. , "xmm0", "xmm1", "xmm2"
  2883. );
  2884. }
  2885. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  2886. #ifdef HAS_SETROW_X86
  2887. void SetRow_X86(uint8* dst, uint8 v8, int width) {
  2888. size_t width_tmp = (size_t)(width >> 2);
  2889. const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
  2890. asm volatile (
  2891. "rep stosl " MEMSTORESTRING(eax,0) " \n"
  2892. : "+D"(dst), // %0
  2893. "+c"(width_tmp) // %1
  2894. : "a"(v32) // %2
  2895. : "memory", "cc");
  2896. }
  2897. void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  2898. size_t width_tmp = (size_t)(width);
  2899. asm volatile (
  2900. "rep stosb " MEMSTORESTRING(al,0) " \n"
  2901. : "+D"(dst), // %0
  2902. "+c"(width_tmp) // %1
  2903. : "a"(v8) // %2
  2904. : "memory", "cc");
  2905. }
  2906. void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  2907. size_t width_tmp = (size_t)(width);
  2908. asm volatile (
  2909. "rep stosl " MEMSTORESTRING(eax,0) " \n"
  2910. : "+D"(dst_argb), // %0
  2911. "+c"(width_tmp) // %1
  2912. : "a"(v32) // %2
  2913. : "memory", "cc");
  2914. }
  2915. #endif // HAS_SETROW_X86
  2916. #ifdef HAS_YUY2TOYROW_SSE2
  2917. void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  2918. asm volatile (
  2919. "pcmpeqb %%xmm5,%%xmm5 \n"
  2920. "psrlw $0x8,%%xmm5 \n"
  2921. LABELALIGN
  2922. "1: \n"
  2923. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2924. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2925. "lea " MEMLEA(0x20,0) ",%0 \n"
  2926. "pand %%xmm5,%%xmm0 \n"
  2927. "pand %%xmm5,%%xmm1 \n"
  2928. "packuswb %%xmm1,%%xmm0 \n"
  2929. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2930. "lea " MEMLEA(0x10,1) ",%1 \n"
  2931. "sub $0x10,%2 \n"
  2932. "jg 1b \n"
  2933. : "+r"(src_yuy2), // %0
  2934. "+r"(dst_y), // %1
  2935. "+r"(width) // %2
  2936. :
  2937. : "memory", "cc"
  2938. , "xmm0", "xmm1", "xmm5"
  2939. );
  2940. }
  2941. void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  2942. uint8* dst_u, uint8* dst_v, int width) {
  2943. asm volatile (
  2944. "pcmpeqb %%xmm5,%%xmm5 \n"
  2945. "psrlw $0x8,%%xmm5 \n"
  2946. "sub %1,%2 \n"
  2947. LABELALIGN
  2948. "1: \n"
  2949. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2950. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2951. MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
  2952. MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
  2953. "lea " MEMLEA(0x20,0) ",%0 \n"
  2954. "pavgb %%xmm2,%%xmm0 \n"
  2955. "pavgb %%xmm3,%%xmm1 \n"
  2956. "psrlw $0x8,%%xmm0 \n"
  2957. "psrlw $0x8,%%xmm1 \n"
  2958. "packuswb %%xmm1,%%xmm0 \n"
  2959. "movdqa %%xmm0,%%xmm1 \n"
  2960. "pand %%xmm5,%%xmm0 \n"
  2961. "packuswb %%xmm0,%%xmm0 \n"
  2962. "psrlw $0x8,%%xmm1 \n"
  2963. "packuswb %%xmm1,%%xmm1 \n"
  2964. "movq %%xmm0," MEMACCESS(1) " \n"
  2965. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  2966. "lea " MEMLEA(0x8,1) ",%1 \n"
  2967. "sub $0x10,%3 \n"
  2968. "jg 1b \n"
  2969. : "+r"(src_yuy2), // %0
  2970. "+r"(dst_u), // %1
  2971. "+r"(dst_v), // %2
  2972. "+r"(width) // %3
  2973. : "r"((intptr_t)(stride_yuy2)) // %4
  2974. : "memory", "cc", NACL_R14
  2975. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2976. );
  2977. }
  2978. void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  2979. uint8* dst_u, uint8* dst_v, int width) {
  2980. asm volatile (
  2981. "pcmpeqb %%xmm5,%%xmm5 \n"
  2982. "psrlw $0x8,%%xmm5 \n"
  2983. "sub %1,%2 \n"
  2984. LABELALIGN
  2985. "1: \n"
  2986. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2987. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2988. "lea " MEMLEA(0x20,0) ",%0 \n"
  2989. "psrlw $0x8,%%xmm0 \n"
  2990. "psrlw $0x8,%%xmm1 \n"
  2991. "packuswb %%xmm1,%%xmm0 \n"
  2992. "movdqa %%xmm0,%%xmm1 \n"
  2993. "pand %%xmm5,%%xmm0 \n"
  2994. "packuswb %%xmm0,%%xmm0 \n"
  2995. "psrlw $0x8,%%xmm1 \n"
  2996. "packuswb %%xmm1,%%xmm1 \n"
  2997. "movq %%xmm0," MEMACCESS(1) " \n"
  2998. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  2999. "lea " MEMLEA(0x8,1) ",%1 \n"
  3000. "sub $0x10,%3 \n"
  3001. "jg 1b \n"
  3002. : "+r"(src_yuy2), // %0
  3003. "+r"(dst_u), // %1
  3004. "+r"(dst_v), // %2
  3005. "+r"(width) // %3
  3006. :
  3007. : "memory", "cc", NACL_R14
  3008. "xmm0", "xmm1", "xmm5"
  3009. );
  3010. }
  3011. void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
  3012. asm volatile (
  3013. LABELALIGN
  3014. "1: \n"
  3015. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3016. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3017. "lea " MEMLEA(0x20,0) ",%0 \n"
  3018. "psrlw $0x8,%%xmm0 \n"
  3019. "psrlw $0x8,%%xmm1 \n"
  3020. "packuswb %%xmm1,%%xmm0 \n"
  3021. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3022. "lea " MEMLEA(0x10,1) ",%1 \n"
  3023. "sub $0x10,%2 \n"
  3024. "jg 1b \n"
  3025. : "+r"(src_uyvy), // %0
  3026. "+r"(dst_y), // %1
  3027. "+r"(width) // %2
  3028. :
  3029. : "memory", "cc"
  3030. , "xmm0", "xmm1"
  3031. );
  3032. }
  3033. void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  3034. uint8* dst_u, uint8* dst_v, int width) {
  3035. asm volatile (
  3036. "pcmpeqb %%xmm5,%%xmm5 \n"
  3037. "psrlw $0x8,%%xmm5 \n"
  3038. "sub %1,%2 \n"
  3039. LABELALIGN
  3040. "1: \n"
  3041. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3042. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3043. MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
  3044. MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
  3045. "lea " MEMLEA(0x20,0) ",%0 \n"
  3046. "pavgb %%xmm2,%%xmm0 \n"
  3047. "pavgb %%xmm3,%%xmm1 \n"
  3048. "pand %%xmm5,%%xmm0 \n"
  3049. "pand %%xmm5,%%xmm1 \n"
  3050. "packuswb %%xmm1,%%xmm0 \n"
  3051. "movdqa %%xmm0,%%xmm1 \n"
  3052. "pand %%xmm5,%%xmm0 \n"
  3053. "packuswb %%xmm0,%%xmm0 \n"
  3054. "psrlw $0x8,%%xmm1 \n"
  3055. "packuswb %%xmm1,%%xmm1 \n"
  3056. "movq %%xmm0," MEMACCESS(1) " \n"
  3057. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3058. "lea " MEMLEA(0x8,1) ",%1 \n"
  3059. "sub $0x10,%3 \n"
  3060. "jg 1b \n"
  3061. : "+r"(src_uyvy), // %0
  3062. "+r"(dst_u), // %1
  3063. "+r"(dst_v), // %2
  3064. "+r"(width) // %3
  3065. : "r"((intptr_t)(stride_uyvy)) // %4
  3066. : "memory", "cc", NACL_R14
  3067. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3068. );
  3069. }
  3070. void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  3071. uint8* dst_u, uint8* dst_v, int width) {
  3072. asm volatile (
  3073. "pcmpeqb %%xmm5,%%xmm5 \n"
  3074. "psrlw $0x8,%%xmm5 \n"
  3075. "sub %1,%2 \n"
  3076. LABELALIGN
  3077. "1: \n"
  3078. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3079. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3080. "lea " MEMLEA(0x20,0) ",%0 \n"
  3081. "pand %%xmm5,%%xmm0 \n"
  3082. "pand %%xmm5,%%xmm1 \n"
  3083. "packuswb %%xmm1,%%xmm0 \n"
  3084. "movdqa %%xmm0,%%xmm1 \n"
  3085. "pand %%xmm5,%%xmm0 \n"
  3086. "packuswb %%xmm0,%%xmm0 \n"
  3087. "psrlw $0x8,%%xmm1 \n"
  3088. "packuswb %%xmm1,%%xmm1 \n"
  3089. "movq %%xmm0," MEMACCESS(1) " \n"
  3090. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3091. "lea " MEMLEA(0x8,1) ",%1 \n"
  3092. "sub $0x10,%3 \n"
  3093. "jg 1b \n"
  3094. : "+r"(src_uyvy), // %0
  3095. "+r"(dst_u), // %1
  3096. "+r"(dst_v), // %2
  3097. "+r"(width) // %3
  3098. :
  3099. : "memory", "cc", NACL_R14
  3100. "xmm0", "xmm1", "xmm5"
  3101. );
  3102. }
  3103. #endif // HAS_YUY2TOYROW_SSE2
  3104. #ifdef HAS_YUY2TOYROW_AVX2
  3105. void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  3106. asm volatile (
  3107. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3108. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3109. LABELALIGN
  3110. "1: \n"
  3111. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3112. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3113. "lea " MEMLEA(0x40,0) ",%0 \n"
  3114. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3115. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3116. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3117. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3118. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3119. "lea " MEMLEA(0x20,1) ",%1 \n"
  3120. "sub $0x20,%2 \n"
  3121. "jg 1b \n"
  3122. "vzeroupper \n"
  3123. : "+r"(src_yuy2), // %0
  3124. "+r"(dst_y), // %1
  3125. "+r"(width) // %2
  3126. :
  3127. : "memory", "cc"
  3128. , "xmm0", "xmm1", "xmm5"
  3129. );
  3130. }
  3131. void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
  3132. uint8* dst_u, uint8* dst_v, int width) {
  3133. asm volatile (
  3134. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3135. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3136. "sub %1,%2 \n"
  3137. LABELALIGN
  3138. "1: \n"
  3139. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3140. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3141. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  3142. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  3143. "lea " MEMLEA(0x40,0) ",%0 \n"
  3144. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3145. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3146. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3147. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3148. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3149. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3150. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3151. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3152. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3153. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3154. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3155. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3156. "lea " MEMLEA(0x10,1) ",%1 \n"
  3157. "sub $0x20,%3 \n"
  3158. "jg 1b \n"
  3159. "vzeroupper \n"
  3160. : "+r"(src_yuy2), // %0
  3161. "+r"(dst_u), // %1
  3162. "+r"(dst_v), // %2
  3163. "+r"(width) // %3
  3164. : "r"((intptr_t)(stride_yuy2)) // %4
  3165. : "memory", "cc", NACL_R14
  3166. "xmm0", "xmm1", "xmm5"
  3167. );
  3168. }
  3169. void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  3170. uint8* dst_u, uint8* dst_v, int width) {
  3171. asm volatile (
  3172. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3173. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3174. "sub %1,%2 \n"
  3175. LABELALIGN
  3176. "1: \n"
  3177. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3178. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3179. "lea " MEMLEA(0x40,0) ",%0 \n"
  3180. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3181. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3182. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3183. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3184. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3185. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3186. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3187. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3188. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3189. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3190. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3191. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3192. "lea " MEMLEA(0x10,1) ",%1 \n"
  3193. "sub $0x20,%3 \n"
  3194. "jg 1b \n"
  3195. "vzeroupper \n"
  3196. : "+r"(src_yuy2), // %0
  3197. "+r"(dst_u), // %1
  3198. "+r"(dst_v), // %2
  3199. "+r"(width) // %3
  3200. :
  3201. : "memory", "cc", NACL_R14
  3202. "xmm0", "xmm1", "xmm5"
  3203. );
  3204. }
  3205. void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  3206. asm volatile (
  3207. LABELALIGN
  3208. "1: \n"
  3209. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3210. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3211. "lea " MEMLEA(0x40,0) ",%0 \n"
  3212. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3213. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3214. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3215. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3216. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3217. "lea " MEMLEA(0x20,1) ",%1 \n"
  3218. "sub $0x20,%2 \n"
  3219. "jg 1b \n"
  3220. "vzeroupper \n"
  3221. : "+r"(src_uyvy), // %0
  3222. "+r"(dst_y), // %1
  3223. "+r"(width) // %2
  3224. :
  3225. : "memory", "cc"
  3226. , "xmm0", "xmm1", "xmm5"
  3227. );
  3228. }
  3229. void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
  3230. uint8* dst_u, uint8* dst_v, int width) {
  3231. asm volatile (
  3232. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3233. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3234. "sub %1,%2 \n"
  3235. LABELALIGN
  3236. "1: \n"
  3237. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3238. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3239. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  3240. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  3241. "lea " MEMLEA(0x40,0) ",%0 \n"
  3242. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3243. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3244. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3245. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3246. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3247. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3248. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3249. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3250. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3251. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3252. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3253. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3254. "lea " MEMLEA(0x10,1) ",%1 \n"
  3255. "sub $0x20,%3 \n"
  3256. "jg 1b \n"
  3257. "vzeroupper \n"
  3258. : "+r"(src_uyvy), // %0
  3259. "+r"(dst_u), // %1
  3260. "+r"(dst_v), // %2
  3261. "+r"(width) // %3
  3262. : "r"((intptr_t)(stride_uyvy)) // %4
  3263. : "memory", "cc", NACL_R14
  3264. "xmm0", "xmm1", "xmm5"
  3265. );
  3266. }
  3267. void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  3268. uint8* dst_u, uint8* dst_v, int width) {
  3269. asm volatile (
  3270. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3271. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3272. "sub %1,%2 \n"
  3273. LABELALIGN
  3274. "1: \n"
  3275. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3276. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3277. "lea " MEMLEA(0x40,0) ",%0 \n"
  3278. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3279. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3280. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3281. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3282. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3283. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3284. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3285. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3286. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3287. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3288. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3289. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3290. "lea " MEMLEA(0x10,1) ",%1 \n"
  3291. "sub $0x20,%3 \n"
  3292. "jg 1b \n"
  3293. "vzeroupper \n"
  3294. : "+r"(src_uyvy), // %0
  3295. "+r"(dst_u), // %1
  3296. "+r"(dst_v), // %2
  3297. "+r"(width) // %3
  3298. :
  3299. : "memory", "cc", NACL_R14
  3300. "xmm0", "xmm1", "xmm5"
  3301. );
  3302. }
  3303. #endif // HAS_YUY2TOYROW_AVX2
  3304. #ifdef HAS_ARGBBLENDROW_SSSE3
  3305. // Shuffle table for isolating alpha.
  3306. static uvec8 kShuffleAlpha = {
  3307. 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3308. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  3309. };
  3310. // Blend 8 pixels at a time
  3311. void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  3312. uint8* dst_argb, int width) {
  3313. asm volatile (
  3314. "pcmpeqb %%xmm7,%%xmm7 \n"
  3315. "psrlw $0xf,%%xmm7 \n"
  3316. "pcmpeqb %%xmm6,%%xmm6 \n"
  3317. "psrlw $0x8,%%xmm6 \n"
  3318. "pcmpeqb %%xmm5,%%xmm5 \n"
  3319. "psllw $0x8,%%xmm5 \n"
  3320. "pcmpeqb %%xmm4,%%xmm4 \n"
  3321. "pslld $0x18,%%xmm4 \n"
  3322. "sub $0x4,%3 \n"
  3323. "jl 49f \n"
  3324. // 4 pixel loop.
  3325. LABELALIGN
  3326. "40: \n"
  3327. "movdqu " MEMACCESS(0) ",%%xmm3 \n"
  3328. "lea " MEMLEA(0x10,0) ",%0 \n"
  3329. "movdqa %%xmm3,%%xmm0 \n"
  3330. "pxor %%xmm4,%%xmm3 \n"
  3331. "movdqu " MEMACCESS(1) ",%%xmm2 \n"
  3332. "pshufb %4,%%xmm3 \n"
  3333. "pand %%xmm6,%%xmm2 \n"
  3334. "paddw %%xmm7,%%xmm3 \n"
  3335. "pmullw %%xmm3,%%xmm2 \n"
  3336. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  3337. "lea " MEMLEA(0x10,1) ",%1 \n"
  3338. "psrlw $0x8,%%xmm1 \n"
  3339. "por %%xmm4,%%xmm0 \n"
  3340. "pmullw %%xmm3,%%xmm1 \n"
  3341. "psrlw $0x8,%%xmm2 \n"
  3342. "paddusb %%xmm2,%%xmm0 \n"
  3343. "pand %%xmm5,%%xmm1 \n"
  3344. "paddusb %%xmm1,%%xmm0 \n"
  3345. "movdqu %%xmm0," MEMACCESS(2) " \n"
  3346. "lea " MEMLEA(0x10,2) ",%2 \n"
  3347. "sub $0x4,%3 \n"
  3348. "jge 40b \n"
  3349. "49: \n"
  3350. "add $0x3,%3 \n"
  3351. "jl 99f \n"
  3352. // 1 pixel loop.
  3353. "91: \n"
  3354. "movd " MEMACCESS(0) ",%%xmm3 \n"
  3355. "lea " MEMLEA(0x4,0) ",%0 \n"
  3356. "movdqa %%xmm3,%%xmm0 \n"
  3357. "pxor %%xmm4,%%xmm3 \n"
  3358. "movd " MEMACCESS(1) ",%%xmm2 \n"
  3359. "pshufb %4,%%xmm3 \n"
  3360. "pand %%xmm6,%%xmm2 \n"
  3361. "paddw %%xmm7,%%xmm3 \n"
  3362. "pmullw %%xmm3,%%xmm2 \n"
  3363. "movd " MEMACCESS(1) ",%%xmm1 \n"
  3364. "lea " MEMLEA(0x4,1) ",%1 \n"
  3365. "psrlw $0x8,%%xmm1 \n"
  3366. "por %%xmm4,%%xmm0 \n"
  3367. "pmullw %%xmm3,%%xmm1 \n"
  3368. "psrlw $0x8,%%xmm2 \n"
  3369. "paddusb %%xmm2,%%xmm0 \n"
  3370. "pand %%xmm5,%%xmm1 \n"
  3371. "paddusb %%xmm1,%%xmm0 \n"
  3372. "movd %%xmm0," MEMACCESS(2) " \n"
  3373. "lea " MEMLEA(0x4,2) ",%2 \n"
  3374. "sub $0x1,%3 \n"
  3375. "jge 91b \n"
  3376. "99: \n"
  3377. : "+r"(src_argb0), // %0
  3378. "+r"(src_argb1), // %1
  3379. "+r"(dst_argb), // %2
  3380. "+r"(width) // %3
  3381. : "m"(kShuffleAlpha) // %4
  3382. : "memory", "cc"
  3383. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3384. );
  3385. }
  3386. #endif // HAS_ARGBBLENDROW_SSSE3
  3387. #ifdef HAS_BLENDPLANEROW_SSSE3
  3388. // Blend 8 pixels at a time.
  3389. // unsigned version of math
  3390. // =((A2*C2)+(B2*(255-C2))+255)/256
  3391. // signed version of math
  3392. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3393. void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
  3394. const uint8* alpha, uint8* dst, int width) {
  3395. asm volatile (
  3396. "pcmpeqb %%xmm5,%%xmm5 \n"
  3397. "psllw $0x8,%%xmm5 \n"
  3398. "mov $0x80808080,%%eax \n"
  3399. "movd %%eax,%%xmm6 \n"
  3400. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  3401. "mov $0x807f807f,%%eax \n"
  3402. "movd %%eax,%%xmm7 \n"
  3403. "pshufd $0x0,%%xmm7,%%xmm7 \n"
  3404. "sub %2,%0 \n"
  3405. "sub %2,%1 \n"
  3406. "sub %2,%3 \n"
  3407. // 8 pixel loop.
  3408. LABELALIGN
  3409. "1: \n"
  3410. "movq (%2),%%xmm0 \n"
  3411. "punpcklbw %%xmm0,%%xmm0 \n"
  3412. "pxor %%xmm5,%%xmm0 \n"
  3413. "movq (%0,%2,1),%%xmm1 \n"
  3414. "movq (%1,%2,1),%%xmm2 \n"
  3415. "punpcklbw %%xmm2,%%xmm1 \n"
  3416. "psubb %%xmm6,%%xmm1 \n"
  3417. "pmaddubsw %%xmm1,%%xmm0 \n"
  3418. "paddw %%xmm7,%%xmm0 \n"
  3419. "psrlw $0x8,%%xmm0 \n"
  3420. "packuswb %%xmm0,%%xmm0 \n"
  3421. "movq %%xmm0,(%3,%2,1) \n"
  3422. "lea 0x8(%2),%2 \n"
  3423. "sub $0x8,%4 \n"
  3424. "jg 1b \n"
  3425. : "+r"(src0), // %0
  3426. "+r"(src1), // %1
  3427. "+r"(alpha), // %2
  3428. "+r"(dst), // %3
  3429. "+rm"(width) // %4
  3430. :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
  3431. );
  3432. }
  3433. #endif // HAS_BLENDPLANEROW_SSSE3
  3434. #ifdef HAS_BLENDPLANEROW_AVX2
  3435. // Blend 32 pixels at a time.
  3436. // unsigned version of math
  3437. // =((A2*C2)+(B2*(255-C2))+255)/256
  3438. // signed version of math
  3439. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3440. void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
  3441. const uint8* alpha, uint8* dst, int width) {
  3442. asm volatile (
  3443. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3444. "vpsllw $0x8,%%ymm5,%%ymm5 \n"
  3445. "mov $0x80808080,%%eax \n"
  3446. "vmovd %%eax,%%xmm6 \n"
  3447. "vbroadcastss %%xmm6,%%ymm6 \n"
  3448. "mov $0x807f807f,%%eax \n"
  3449. "vmovd %%eax,%%xmm7 \n"
  3450. "vbroadcastss %%xmm7,%%ymm7 \n"
  3451. "sub %2,%0 \n"
  3452. "sub %2,%1 \n"
  3453. "sub %2,%3 \n"
  3454. // 32 pixel loop.
  3455. LABELALIGN
  3456. "1: \n"
  3457. "vmovdqu (%2),%%ymm0 \n"
  3458. "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
  3459. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  3460. "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
  3461. "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
  3462. "vmovdqu (%0,%2,1),%%ymm1 \n"
  3463. "vmovdqu (%1,%2,1),%%ymm2 \n"
  3464. "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
  3465. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  3466. "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
  3467. "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
  3468. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  3469. "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
  3470. "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
  3471. "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
  3472. "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
  3473. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3474. "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
  3475. "vmovdqu %%ymm0,(%3,%2,1) \n"
  3476. "lea 0x20(%2),%2 \n"
  3477. "sub $0x20,%4 \n"
  3478. "jg 1b \n"
  3479. "vzeroupper \n"
  3480. : "+r"(src0), // %0
  3481. "+r"(src1), // %1
  3482. "+r"(alpha), // %2
  3483. "+r"(dst), // %3
  3484. "+rm"(width) // %4
  3485. :: "memory", "cc", "eax",
  3486. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3487. );
  3488. }
  3489. #endif // HAS_BLENDPLANEROW_AVX2
  3490. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3491. // Shuffle table duplicating alpha
  3492. static uvec8 kShuffleAlpha0 = {
  3493. 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
  3494. };
  3495. static uvec8 kShuffleAlpha1 = {
  3496. 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3497. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
  3498. };
  3499. // Attenuate 4 pixels at a time.
  3500. void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3501. asm volatile (
  3502. "pcmpeqb %%xmm3,%%xmm3 \n"
  3503. "pslld $0x18,%%xmm3 \n"
  3504. "movdqa %3,%%xmm4 \n"
  3505. "movdqa %4,%%xmm5 \n"
  3506. // 4 pixel loop.
  3507. LABELALIGN
  3508. "1: \n"
  3509. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3510. "pshufb %%xmm4,%%xmm0 \n"
  3511. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3512. "punpcklbw %%xmm1,%%xmm1 \n"
  3513. "pmulhuw %%xmm1,%%xmm0 \n"
  3514. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3515. "pshufb %%xmm5,%%xmm1 \n"
  3516. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3517. "punpckhbw %%xmm2,%%xmm2 \n"
  3518. "pmulhuw %%xmm2,%%xmm1 \n"
  3519. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3520. "lea " MEMLEA(0x10,0) ",%0 \n"
  3521. "pand %%xmm3,%%xmm2 \n"
  3522. "psrlw $0x8,%%xmm0 \n"
  3523. "psrlw $0x8,%%xmm1 \n"
  3524. "packuswb %%xmm1,%%xmm0 \n"
  3525. "por %%xmm2,%%xmm0 \n"
  3526. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3527. "lea " MEMLEA(0x10,1) ",%1 \n"
  3528. "sub $0x4,%2 \n"
  3529. "jg 1b \n"
  3530. : "+r"(src_argb), // %0
  3531. "+r"(dst_argb), // %1
  3532. "+r"(width) // %2
  3533. : "m"(kShuffleAlpha0), // %3
  3534. "m"(kShuffleAlpha1) // %4
  3535. : "memory", "cc"
  3536. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3537. );
  3538. }
  3539. #endif // HAS_ARGBATTENUATEROW_SSSE3
  3540. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3541. // Shuffle table duplicating alpha.
  3542. static const uvec8 kShuffleAlpha_AVX2 = {
  3543. 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
  3544. };
  3545. // Attenuate 8 pixels at a time.
  3546. void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  3547. asm volatile (
  3548. "vbroadcastf128 %3,%%ymm4 \n"
  3549. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3550. "vpslld $0x18,%%ymm5,%%ymm5 \n"
  3551. "sub %0,%1 \n"
  3552. // 8 pixel loop.
  3553. LABELALIGN
  3554. "1: \n"
  3555. "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
  3556. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  3557. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  3558. "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
  3559. "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
  3560. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3561. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  3562. "vpand %%ymm5,%%ymm6,%%ymm6 \n"
  3563. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3564. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3565. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3566. "vpor %%ymm6,%%ymm0,%%ymm0 \n"
  3567. MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
  3568. "lea " MEMLEA(0x20,0) ",%0 \n"
  3569. "sub $0x8,%2 \n"
  3570. "jg 1b \n"
  3571. "vzeroupper \n"
  3572. : "+r"(src_argb), // %0
  3573. "+r"(dst_argb), // %1
  3574. "+r"(width) // %2
  3575. : "m"(kShuffleAlpha_AVX2) // %3
  3576. : "memory", "cc"
  3577. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  3578. );
  3579. }
  3580. #endif // HAS_ARGBATTENUATEROW_AVX2
  3581. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  3582. // Unattenuate 4 pixels at a time.
  3583. void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  3584. int width) {
  3585. uintptr_t alpha;
  3586. asm volatile (
  3587. // 4 pixel loop.
  3588. LABELALIGN
  3589. "1: \n"
  3590. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3591. "movzb " MEMACCESS2(0x03,0) ",%3 \n"
  3592. "punpcklbw %%xmm0,%%xmm0 \n"
  3593. MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
  3594. "movzb " MEMACCESS2(0x07,0) ",%3 \n"
  3595. MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
  3596. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3597. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3598. "movlhps %%xmm3,%%xmm2 \n"
  3599. "pmulhuw %%xmm2,%%xmm0 \n"
  3600. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3601. "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
  3602. "punpckhbw %%xmm1,%%xmm1 \n"
  3603. MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
  3604. "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
  3605. MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
  3606. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3607. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3608. "movlhps %%xmm3,%%xmm2 \n"
  3609. "pmulhuw %%xmm2,%%xmm1 \n"
  3610. "lea " MEMLEA(0x10,0) ",%0 \n"
  3611. "packuswb %%xmm1,%%xmm0 \n"
  3612. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3613. "lea " MEMLEA(0x10,1) ",%1 \n"
  3614. "sub $0x4,%2 \n"
  3615. "jg 1b \n"
  3616. : "+r"(src_argb), // %0
  3617. "+r"(dst_argb), // %1
  3618. "+r"(width), // %2
  3619. "=&r"(alpha) // %3
  3620. : "r"(fixed_invtbl8) // %4
  3621. : "memory", "cc", NACL_R14
  3622. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3623. );
  3624. }
  3625. #endif // HAS_ARGBUNATTENUATEROW_SSE2
  3626. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  3627. // Shuffle table duplicating alpha.
  3628. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  3629. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
  3630. };
  3631. // Unattenuate 8 pixels at a time.
  3632. void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  3633. int width) {
  3634. uintptr_t alpha;
  3635. asm volatile (
  3636. "sub %0,%1 \n"
  3637. "vbroadcastf128 %5,%%ymm5 \n"
  3638. // 8 pixel loop.
  3639. LABELALIGN
  3640. "1: \n"
  3641. // replace VPGATHER
  3642. "movzb " MEMACCESS2(0x03,0) ",%3 \n"
  3643. MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
  3644. "movzb " MEMACCESS2(0x07,0) ",%3 \n"
  3645. MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
  3646. "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
  3647. "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
  3648. MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
  3649. "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
  3650. MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
  3651. "movzb " MEMACCESS2(0x13,0) ",%3 \n"
  3652. "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
  3653. MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
  3654. "movzb " MEMACCESS2(0x17,0) ",%3 \n"
  3655. MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
  3656. "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
  3657. "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
  3658. MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
  3659. "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
  3660. MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
  3661. "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
  3662. "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
  3663. "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
  3664. "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
  3665. // end of VPGATHER
  3666. "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
  3667. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  3668. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  3669. "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
  3670. "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
  3671. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  3672. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  3673. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3674. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  3675. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3676. MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
  3677. "lea " MEMLEA(0x20,0) ",%0 \n"
  3678. "sub $0x8,%2 \n"
  3679. "jg 1b \n"
  3680. "vzeroupper \n"
  3681. : "+r"(src_argb), // %0
  3682. "+r"(dst_argb), // %1
  3683. "+r"(width), // %2
  3684. "=&r"(alpha) // %3
  3685. : "r"(fixed_invtbl8), // %4
  3686. "m"(kUnattenShuffleAlpha_AVX2) // %5
  3687. : "memory", "cc", NACL_R14
  3688. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3689. );
  3690. }
  3691. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  3692. #ifdef HAS_ARGBGRAYROW_SSSE3
  3693. // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
  3694. void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3695. asm volatile (
  3696. "movdqa %3,%%xmm4 \n"
  3697. "movdqa %4,%%xmm5 \n"
  3698. // 8 pixel loop.
  3699. LABELALIGN
  3700. "1: \n"
  3701. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3702. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3703. "pmaddubsw %%xmm4,%%xmm0 \n"
  3704. "pmaddubsw %%xmm4,%%xmm1 \n"
  3705. "phaddw %%xmm1,%%xmm0 \n"
  3706. "paddw %%xmm5,%%xmm0 \n"
  3707. "psrlw $0x7,%%xmm0 \n"
  3708. "packuswb %%xmm0,%%xmm0 \n"
  3709. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3710. "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
  3711. "lea " MEMLEA(0x20,0) ",%0 \n"
  3712. "psrld $0x18,%%xmm2 \n"
  3713. "psrld $0x18,%%xmm3 \n"
  3714. "packuswb %%xmm3,%%xmm2 \n"
  3715. "packuswb %%xmm2,%%xmm2 \n"
  3716. "movdqa %%xmm0,%%xmm3 \n"
  3717. "punpcklbw %%xmm0,%%xmm0 \n"
  3718. "punpcklbw %%xmm2,%%xmm3 \n"
  3719. "movdqa %%xmm0,%%xmm1 \n"
  3720. "punpcklwd %%xmm3,%%xmm0 \n"
  3721. "punpckhwd %%xmm3,%%xmm1 \n"
  3722. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3723. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  3724. "lea " MEMLEA(0x20,1) ",%1 \n"
  3725. "sub $0x8,%2 \n"
  3726. "jg 1b \n"
  3727. : "+r"(src_argb), // %0
  3728. "+r"(dst_argb), // %1
  3729. "+r"(width) // %2
  3730. : "m"(kARGBToYJ), // %3
  3731. "m"(kAddYJ64) // %4
  3732. : "memory", "cc"
  3733. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3734. );
  3735. }
  3736. #endif // HAS_ARGBGRAYROW_SSSE3
  3737. #ifdef HAS_ARGBSEPIAROW_SSSE3
  3738. // b = (r * 35 + g * 68 + b * 17) >> 7
  3739. // g = (r * 45 + g * 88 + b * 22) >> 7
  3740. // r = (r * 50 + g * 98 + b * 24) >> 7
  3741. // Constant for ARGB color to sepia tone
  3742. static vec8 kARGBToSepiaB = {
  3743. 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
  3744. };
  3745. static vec8 kARGBToSepiaG = {
  3746. 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
  3747. };
  3748. static vec8 kARGBToSepiaR = {
  3749. 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
  3750. };
  3751. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  3752. void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  3753. asm volatile (
  3754. "movdqa %2,%%xmm2 \n"
  3755. "movdqa %3,%%xmm3 \n"
  3756. "movdqa %4,%%xmm4 \n"
  3757. // 8 pixel loop.
  3758. LABELALIGN
  3759. "1: \n"
  3760. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3761. "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
  3762. "pmaddubsw %%xmm2,%%xmm0 \n"
  3763. "pmaddubsw %%xmm2,%%xmm6 \n"
  3764. "phaddw %%xmm6,%%xmm0 \n"
  3765. "psrlw $0x7,%%xmm0 \n"
  3766. "packuswb %%xmm0,%%xmm0 \n"
  3767. "movdqu " MEMACCESS(0) ",%%xmm5 \n"
  3768. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3769. "pmaddubsw %%xmm3,%%xmm5 \n"
  3770. "pmaddubsw %%xmm3,%%xmm1 \n"
  3771. "phaddw %%xmm1,%%xmm5 \n"
  3772. "psrlw $0x7,%%xmm5 \n"
  3773. "packuswb %%xmm5,%%xmm5 \n"
  3774. "punpcklbw %%xmm5,%%xmm0 \n"
  3775. "movdqu " MEMACCESS(0) ",%%xmm5 \n"
  3776. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3777. "pmaddubsw %%xmm4,%%xmm5 \n"
  3778. "pmaddubsw %%xmm4,%%xmm1 \n"
  3779. "phaddw %%xmm1,%%xmm5 \n"
  3780. "psrlw $0x7,%%xmm5 \n"
  3781. "packuswb %%xmm5,%%xmm5 \n"
  3782. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  3783. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3784. "psrld $0x18,%%xmm6 \n"
  3785. "psrld $0x18,%%xmm1 \n"
  3786. "packuswb %%xmm1,%%xmm6 \n"
  3787. "packuswb %%xmm6,%%xmm6 \n"
  3788. "punpcklbw %%xmm6,%%xmm5 \n"
  3789. "movdqa %%xmm0,%%xmm1 \n"
  3790. "punpcklwd %%xmm5,%%xmm0 \n"
  3791. "punpckhwd %%xmm5,%%xmm1 \n"
  3792. "movdqu %%xmm0," MEMACCESS(0) " \n"
  3793. "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
  3794. "lea " MEMLEA(0x20,0) ",%0 \n"
  3795. "sub $0x8,%1 \n"
  3796. "jg 1b \n"
  3797. : "+r"(dst_argb), // %0
  3798. "+r"(width) // %1
  3799. : "m"(kARGBToSepiaB), // %2
  3800. "m"(kARGBToSepiaG), // %3
  3801. "m"(kARGBToSepiaR) // %4
  3802. : "memory", "cc"
  3803. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  3804. );
  3805. }
  3806. #endif // HAS_ARGBSEPIAROW_SSSE3
  3807. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
  3808. // Tranform 8 ARGB pixels (32 bytes) with color matrix.
  3809. // Same as Sepia except matrix is provided.
  3810. void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  3811. const int8* matrix_argb, int width) {
  3812. asm volatile (
  3813. "movdqu " MEMACCESS(3) ",%%xmm5 \n"
  3814. "pshufd $0x00,%%xmm5,%%xmm2 \n"
  3815. "pshufd $0x55,%%xmm5,%%xmm3 \n"
  3816. "pshufd $0xaa,%%xmm5,%%xmm4 \n"
  3817. "pshufd $0xff,%%xmm5,%%xmm5 \n"
  3818. // 8 pixel loop.
  3819. LABELALIGN
  3820. "1: \n"
  3821. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3822. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  3823. "pmaddubsw %%xmm2,%%xmm0 \n"
  3824. "pmaddubsw %%xmm2,%%xmm7 \n"
  3825. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  3826. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3827. "pmaddubsw %%xmm3,%%xmm6 \n"
  3828. "pmaddubsw %%xmm3,%%xmm1 \n"
  3829. "phaddsw %%xmm7,%%xmm0 \n"
  3830. "phaddsw %%xmm1,%%xmm6 \n"
  3831. "psraw $0x6,%%xmm0 \n"
  3832. "psraw $0x6,%%xmm6 \n"
  3833. "packuswb %%xmm0,%%xmm0 \n"
  3834. "packuswb %%xmm6,%%xmm6 \n"
  3835. "punpcklbw %%xmm6,%%xmm0 \n"
  3836. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3837. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  3838. "pmaddubsw %%xmm4,%%xmm1 \n"
  3839. "pmaddubsw %%xmm4,%%xmm7 \n"
  3840. "phaddsw %%xmm7,%%xmm1 \n"
  3841. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  3842. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  3843. "pmaddubsw %%xmm5,%%xmm6 \n"
  3844. "pmaddubsw %%xmm5,%%xmm7 \n"
  3845. "phaddsw %%xmm7,%%xmm6 \n"
  3846. "psraw $0x6,%%xmm1 \n"
  3847. "psraw $0x6,%%xmm6 \n"
  3848. "packuswb %%xmm1,%%xmm1 \n"
  3849. "packuswb %%xmm6,%%xmm6 \n"
  3850. "punpcklbw %%xmm6,%%xmm1 \n"
  3851. "movdqa %%xmm0,%%xmm6 \n"
  3852. "punpcklwd %%xmm1,%%xmm0 \n"
  3853. "punpckhwd %%xmm1,%%xmm6 \n"
  3854. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3855. "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
  3856. "lea " MEMLEA(0x20,0) ",%0 \n"
  3857. "lea " MEMLEA(0x20,1) ",%1 \n"
  3858. "sub $0x8,%2 \n"
  3859. "jg 1b \n"
  3860. : "+r"(src_argb), // %0
  3861. "+r"(dst_argb), // %1
  3862. "+r"(width) // %2
  3863. : "r"(matrix_argb) // %3
  3864. : "memory", "cc"
  3865. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3866. );
  3867. }
  3868. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
  3869. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  3870. // Quantize 4 ARGB pixels (16 bytes).
  3871. void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
  3872. int interval_offset, int width) {
  3873. asm volatile (
  3874. "movd %2,%%xmm2 \n"
  3875. "movd %3,%%xmm3 \n"
  3876. "movd %4,%%xmm4 \n"
  3877. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3878. "pshufd $0x44,%%xmm2,%%xmm2 \n"
  3879. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3880. "pshufd $0x44,%%xmm3,%%xmm3 \n"
  3881. "pshuflw $0x40,%%xmm4,%%xmm4 \n"
  3882. "pshufd $0x44,%%xmm4,%%xmm4 \n"
  3883. "pxor %%xmm5,%%xmm5 \n"
  3884. "pcmpeqb %%xmm6,%%xmm6 \n"
  3885. "pslld $0x18,%%xmm6 \n"
  3886. // 4 pixel loop.
  3887. LABELALIGN
  3888. "1: \n"
  3889. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3890. "punpcklbw %%xmm5,%%xmm0 \n"
  3891. "pmulhuw %%xmm2,%%xmm0 \n"
  3892. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3893. "punpckhbw %%xmm5,%%xmm1 \n"
  3894. "pmulhuw %%xmm2,%%xmm1 \n"
  3895. "pmullw %%xmm3,%%xmm0 \n"
  3896. "movdqu " MEMACCESS(0) ",%%xmm7 \n"
  3897. "pmullw %%xmm3,%%xmm1 \n"
  3898. "pand %%xmm6,%%xmm7 \n"
  3899. "paddw %%xmm4,%%xmm0 \n"
  3900. "paddw %%xmm4,%%xmm1 \n"
  3901. "packuswb %%xmm1,%%xmm0 \n"
  3902. "por %%xmm7,%%xmm0 \n"
  3903. "movdqu %%xmm0," MEMACCESS(0) " \n"
  3904. "lea " MEMLEA(0x10,0) ",%0 \n"
  3905. "sub $0x4,%1 \n"
  3906. "jg 1b \n"
  3907. : "+r"(dst_argb), // %0
  3908. "+r"(width) // %1
  3909. : "r"(scale), // %2
  3910. "r"(interval_size), // %3
  3911. "r"(interval_offset) // %4
  3912. : "memory", "cc"
  3913. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3914. );
  3915. }
  3916. #endif // HAS_ARGBQUANTIZEROW_SSE2
  3917. #ifdef HAS_ARGBSHADEROW_SSE2
  3918. // Shade 4 pixels at a time by specified value.
  3919. void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
  3920. uint32 value) {
  3921. asm volatile (
  3922. "movd %3,%%xmm2 \n"
  3923. "punpcklbw %%xmm2,%%xmm2 \n"
  3924. "punpcklqdq %%xmm2,%%xmm2 \n"
  3925. // 4 pixel loop.
  3926. LABELALIGN
  3927. "1: \n"
  3928. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3929. "lea " MEMLEA(0x10,0) ",%0 \n"
  3930. "movdqa %%xmm0,%%xmm1 \n"
  3931. "punpcklbw %%xmm0,%%xmm0 \n"
  3932. "punpckhbw %%xmm1,%%xmm1 \n"
  3933. "pmulhuw %%xmm2,%%xmm0 \n"
  3934. "pmulhuw %%xmm2,%%xmm1 \n"
  3935. "psrlw $0x8,%%xmm0 \n"
  3936. "psrlw $0x8,%%xmm1 \n"
  3937. "packuswb %%xmm1,%%xmm0 \n"
  3938. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3939. "lea " MEMLEA(0x10,1) ",%1 \n"
  3940. "sub $0x4,%2 \n"
  3941. "jg 1b \n"
  3942. : "+r"(src_argb), // %0
  3943. "+r"(dst_argb), // %1
  3944. "+r"(width) // %2
  3945. : "r"(value) // %3
  3946. : "memory", "cc"
  3947. , "xmm0", "xmm1", "xmm2"
  3948. );
  3949. }
  3950. #endif // HAS_ARGBSHADEROW_SSE2
  3951. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  3952. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  3953. void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  3954. uint8* dst_argb, int width) {
  3955. asm volatile (
  3956. "pxor %%xmm5,%%xmm5 \n"
  3957. // 4 pixel loop.
  3958. LABELALIGN
  3959. "1: \n"
  3960. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3961. "lea " MEMLEA(0x10,0) ",%0 \n"
  3962. "movdqu " MEMACCESS(1) ",%%xmm2 \n"
  3963. "lea " MEMLEA(0x10,1) ",%1 \n"
  3964. "movdqu %%xmm0,%%xmm1 \n"
  3965. "movdqu %%xmm2,%%xmm3 \n"
  3966. "punpcklbw %%xmm0,%%xmm0 \n"
  3967. "punpckhbw %%xmm1,%%xmm1 \n"
  3968. "punpcklbw %%xmm5,%%xmm2 \n"
  3969. "punpckhbw %%xmm5,%%xmm3 \n"
  3970. "pmulhuw %%xmm2,%%xmm0 \n"
  3971. "pmulhuw %%xmm3,%%xmm1 \n"
  3972. "packuswb %%xmm1,%%xmm0 \n"
  3973. "movdqu %%xmm0," MEMACCESS(2) " \n"
  3974. "lea " MEMLEA(0x10,2) ",%2 \n"
  3975. "sub $0x4,%3 \n"
  3976. "jg 1b \n"
  3977. : "+r"(src_argb0), // %0
  3978. "+r"(src_argb1), // %1
  3979. "+r"(dst_argb), // %2
  3980. "+r"(width) // %3
  3981. :
  3982. : "memory", "cc"
  3983. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3984. );
  3985. }
  3986. #endif // HAS_ARGBMULTIPLYROW_SSE2
  3987. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  3988. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  3989. void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  3990. uint8* dst_argb, int width) {
  3991. asm volatile (
  3992. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
  3993. // 4 pixel loop.
  3994. LABELALIGN
  3995. "1: \n"
  3996. "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
  3997. "lea " MEMLEA(0x20,0) ",%0 \n"
  3998. "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
  3999. "lea " MEMLEA(0x20,1) ",%1 \n"
  4000. "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
  4001. "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
  4002. "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
  4003. "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
  4004. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4005. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4006. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4007. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4008. "lea " MEMLEA(0x20,2) ",%2 \n"
  4009. "sub $0x8,%3 \n"
  4010. "jg 1b \n"
  4011. "vzeroupper \n"
  4012. : "+r"(src_argb0), // %0
  4013. "+r"(src_argb1), // %1
  4014. "+r"(dst_argb), // %2
  4015. "+r"(width) // %3
  4016. :
  4017. : "memory", "cc"
  4018. #if defined(__AVX2__)
  4019. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4020. #endif
  4021. );
  4022. }
  4023. #endif // HAS_ARGBMULTIPLYROW_AVX2
  4024. #ifdef HAS_ARGBADDROW_SSE2
  4025. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  4026. void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  4027. uint8* dst_argb, int width) {
  4028. asm volatile (
  4029. // 4 pixel loop.
  4030. LABELALIGN
  4031. "1: \n"
  4032. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4033. "lea " MEMLEA(0x10,0) ",%0 \n"
  4034. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  4035. "lea " MEMLEA(0x10,1) ",%1 \n"
  4036. "paddusb %%xmm1,%%xmm0 \n"
  4037. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4038. "lea " MEMLEA(0x10,2) ",%2 \n"
  4039. "sub $0x4,%3 \n"
  4040. "jg 1b \n"
  4041. : "+r"(src_argb0), // %0
  4042. "+r"(src_argb1), // %1
  4043. "+r"(dst_argb), // %2
  4044. "+r"(width) // %3
  4045. :
  4046. : "memory", "cc"
  4047. , "xmm0", "xmm1"
  4048. );
  4049. }
  4050. #endif // HAS_ARGBADDROW_SSE2
  4051. #ifdef HAS_ARGBADDROW_AVX2
  4052. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  4053. void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  4054. uint8* dst_argb, int width) {
  4055. asm volatile (
  4056. // 4 pixel loop.
  4057. LABELALIGN
  4058. "1: \n"
  4059. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  4060. "lea " MEMLEA(0x20,0) ",%0 \n"
  4061. "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
  4062. "lea " MEMLEA(0x20,1) ",%1 \n"
  4063. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4064. "lea " MEMLEA(0x20,2) ",%2 \n"
  4065. "sub $0x8,%3 \n"
  4066. "jg 1b \n"
  4067. "vzeroupper \n"
  4068. : "+r"(src_argb0), // %0
  4069. "+r"(src_argb1), // %1
  4070. "+r"(dst_argb), // %2
  4071. "+r"(width) // %3
  4072. :
  4073. : "memory", "cc"
  4074. , "xmm0"
  4075. );
  4076. }
  4077. #endif // HAS_ARGBADDROW_AVX2
  4078. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  4079. // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
  4080. void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  4081. uint8* dst_argb, int width) {
  4082. asm volatile (
  4083. // 4 pixel loop.
  4084. LABELALIGN
  4085. "1: \n"
  4086. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4087. "lea " MEMLEA(0x10,0) ",%0 \n"
  4088. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  4089. "lea " MEMLEA(0x10,1) ",%1 \n"
  4090. "psubusb %%xmm1,%%xmm0 \n"
  4091. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4092. "lea " MEMLEA(0x10,2) ",%2 \n"
  4093. "sub $0x4,%3 \n"
  4094. "jg 1b \n"
  4095. : "+r"(src_argb0), // %0
  4096. "+r"(src_argb1), // %1
  4097. "+r"(dst_argb), // %2
  4098. "+r"(width) // %3
  4099. :
  4100. : "memory", "cc"
  4101. , "xmm0", "xmm1"
  4102. );
  4103. }
  4104. #endif // HAS_ARGBSUBTRACTROW_SSE2
  4105. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  4106. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  4107. void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  4108. uint8* dst_argb, int width) {
  4109. asm volatile (
  4110. // 4 pixel loop.
  4111. LABELALIGN
  4112. "1: \n"
  4113. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  4114. "lea " MEMLEA(0x20,0) ",%0 \n"
  4115. "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
  4116. "lea " MEMLEA(0x20,1) ",%1 \n"
  4117. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4118. "lea " MEMLEA(0x20,2) ",%2 \n"
  4119. "sub $0x8,%3 \n"
  4120. "jg 1b \n"
  4121. "vzeroupper \n"
  4122. : "+r"(src_argb0), // %0
  4123. "+r"(src_argb1), // %1
  4124. "+r"(dst_argb), // %2
  4125. "+r"(width) // %3
  4126. :
  4127. : "memory", "cc"
  4128. , "xmm0"
  4129. );
  4130. }
  4131. #endif // HAS_ARGBSUBTRACTROW_AVX2
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src_y0), // %0
    "+r"(src_y1), // %1
    "+r"(src_y2), // %2
    "+r"(dst_sobelx), // %3
    "+r"(width) // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELXROW_SSE2
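
// Scalar sketch of the SobelX kernel above: for each output pixel it takes
// the horizontal differences of the three input rows, weights the middle row
// by 2, and stores the saturated absolute value. Like the SSE2 code it reads
// two pixels beyond `width` on every row. The name is hypothetical; this is
// an illustrative equivalent, not libyuv's C fallback.
static inline void SobelXRow_Sketch_C(const uint8* src_y0, const uint8* src_y1,
                                      const uint8* src_y2, uint8* dst_sobelx,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];      // top row
    int b = src_y1[i] - src_y1[i + 2];      // middle row, weighted by 2
    int c = src_y2[i] - src_y2[i + 2];      // bottom row
    int sobel = a + b + b + c;
    if (sobel < 0) sobel = -sobel;          // pxor/psubw/pmaxsw absolute value
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);  // packuswb saturates
  }
}
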
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
  : "+r"(src_y0), // %0
    "+r"(src_y1), // %1
    "+r"(dst_sobely), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELYROW_SSE2
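
// Scalar sketch of the SobelY kernel above: vertical differences between the
// two rows at offsets 0, 1 and 2, with the middle column weighted by 2, then
// the saturated absolute value. Like the SSE2 code it reads two pixels beyond
// `width`. Hypothetical name, illustrative only.
static inline void SobelYRow_Sketch_C(const uint8* src_y0, const uint8* src_y1,
                                      uint8* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y1[i];
    int b = src_y0[i + 1] - src_y1[i + 1];  // middle column, weighted by 2
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + b + b + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}
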
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    "movdqu %%xmm1," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELROW_SSE2
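
// Scalar sketch of the packing above: the saturated sum of the two Sobel
// planes is replicated into B, G and R, and the 0xff000000 mask in xmm5
// supplies an opaque alpha. Hypothetical name, illustrative only.
static inline void SobelRow_Sketch_C(const uint8* src_sobelx,
                                     const uint8* src_sobely,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8 g = (uint8)(s > 255 ? 255 : s);  // paddusb saturates
    dst_argb[0] = g;                       // B
    dst_argb[1] = g;                       // G
    dst_argb[2] = g;                       // R
    dst_argb[3] = 255u;                    // A
    dst_argb += 4;
  }
}
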
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_y), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_SOBELTOPLANEROW_SSE2
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm6," MEMACCESS(2) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_SOBELXYROW_SSE2
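
// Scalar sketch of the channel layout produced above: B carries Sobel Y,
// G the saturated sum of both planes, R Sobel X, and A is forced to 255.
// Hypothetical name, illustrative only.
static inline void SobelXYRow_Sketch_C(const uint8* src_sobelx,
                                       const uint8* src_sobely,
                                       uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];               // B = Sobel Y
    dst_argb[1] = (uint8)(s > 255 ? 255 : s);  // G = Sobel (saturated)
    dst_argb[2] = src_sobelx[i];               // R = Sobel X
    dst_argb[3] = 255u;                        // A
    dst_argb += 4;
  }
}
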
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"
    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
    "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"
    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"
    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movd " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(row), // %0
    "+r"(cumsum), // %1
    "+r"(previous_cumsum), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
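
// Scalar sketch of the row pass above: a running per-channel sum along the
// row is added to the corresponding entry of the previous row's cumulative
// sums, yielding an integral image. Hypothetical name, illustrative only.
static inline void ComputeCumulativeSumRow_Sketch_C(
    const uint8* row, int32* cumsum, const int32* previous_cumsum, int width) {
  int32 row_sum[4] = {0, 0, 0, 0};  // running B, G, R, A sums for this row
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}
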
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    "movd %5,%%xmm5 \n"
    "cvtdq2ps %%xmm5,%%xmm5 \n"
    "rcpss %%xmm5,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "cmpl $0x80,%5 \n"
    "ja 40f \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrld $0x10,%%xmm6 \n"
    "cvtdq2ps %%xmm6,%%xmm6 \n"
    "addps %%xmm6,%%xmm5 \n"
    "mulps %%xmm4,%%xmm5 \n"
    "cvtps2dq %%xmm5,%%xmm5 \n"
    "packssdw %%xmm5,%%xmm5 \n"
    // 4 pixel small loop.
    LABELALIGN
    "4: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 4b \n"
    "jmp 49f \n"
    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"
    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"
    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(topleft), // %0
    "+r"(botleft), // %1
    "+r"(dst), // %2
    "+rm"(count) // %3
  : "r"((intptr_t)(width)), // %4
    "rm"(area) // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
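
// Scalar sketch of the box filter above: each output channel is the integral-
// image difference (topleft - topleft[+width] - botleft + botleft[+width])
// scaled by 1/area. As in the (%0,%4,4) addressing above, `width` is taken
// here as an offset in int32 lanes (4 per pixel of box width). Hypothetical
// name, illustrative only.
static inline void CumulativeSumToAverageRow_Sketch_C(const int32* topleft,
                                                      const int32* botleft,
                                                      int width, int area,
                                                      uint8* dst, int count) {
  float ooa = 1.0f / (float)area;  // reciprocal of the box area, like rcpss
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      float v = (float)sum * ooa;
      dst[c] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
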
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq " MEMACCESS(3) ",%%xmm2 \n"
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"
    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"
    "49: \n"
    "add $0x3,%4 \n"
    "jl 19f \n"
    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "sub $0x1,%4 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(src_argb), // %0
    "+r"(src_argb_stride_temp), // %1
    "+r"(dst_argb), // %2
    "+r"(src_dudv), // %3
    "+rm"(width), // %4
    "=&r"(temp) // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBAFFINEROW_SSE2
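
// Scalar sketch of the affine walk above: src_dudv holds the starting (u, v)
// source coordinate followed by the per-pixel (du, dv) step, and each output
// pixel copies the 4 bytes at src_argb + (int)v * stride + (int)u * 4.
// Hypothetical name, illustrative only.
static inline void ARGBAffineRow_Sketch_C(const uint8* src_argb,
                                          int src_argb_stride,
                                          uint8* dst_argb,
                                          const float* src_dudv, int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  const float du = src_dudv[2];
  const float dv = src_dudv[3];
  for (int i = 0; i < width; ++i) {
    const uint8* s = src_argb + (int)v * src_argb_stride + (int)u * 4;
    dst_argb[0] = s[0];  // truncation of u and v matches cvttps2dq above
    dst_argb[1] = s[1];
    dst_argb[2] = s[2];
    dst_argb[3] = s[3];
    dst_argb += 4;
    u += du;
    v += dv;
  }
}
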
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm4,%%xmm0 \n"
    "psubb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm5,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm3 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "pmaddubsw %%xmm1,%%xmm3 \n"
    "paddw %%xmm4,%%xmm2 \n"
    "paddw %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"
    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 100b \n"
    "99: \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+rm"(dst_width), // %2
    "+r"(source_y_fraction) // %3
  : "r"((intptr_t)(src_stride)) // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_INTERPOLATEROW_SSSE3
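
// Scalar sketch of the row blend above: each output byte is the weighted
// average of the same byte in the two source rows, with source_y_fraction
// (0..256) giving the weight of the second row. The 0 and 128 special cases
// above are just the copy and pavgb shortcuts of the same formula.
// Hypothetical name, illustrative only.
static inline void InterpolateRow_Sketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                           ptrdiff_t src_stride, int dst_width,
                                           int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;  // second row
  int y1 = source_y_fraction;                    // weight of the second row
  int y0 = 256 - y1;                             // weight of the first row
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * y0 + src_ptr1[i] * y1) >> 8);
  }
}
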
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "sub %1,%0 \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"
    "vmovd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "vmovd %3,%%xmm5 \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
    "vbroadcastss %%xmm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm4 \n"
    "vbroadcastss %%xmm4,%%ymm4 \n"
    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
    "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"
    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "rep movsb " MEMMOVESTRING(1,0) " \n"
    "jmp 999f \n"
    "99: \n"
    "vzeroupper \n"
    "999: \n"
  : "+D"(dst_ptr), // %0
    "+S"(src_ptr), // %1
    "+cm"(dst_width), // %2
    "+r"(source_y_fraction) // %3
  : "r"((intptr_t)(src_stride)) // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif // HAS_INTERPOLATEROW_AVX2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(shuffler) // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
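
// Scalar sketch of the byte shuffle above: the 16-byte pshufb mask repeats
// the same 4-entry pattern for every pixel, so per pixel each destination
// channel simply picks the source channel named by shuffler[0..3].
// Hypothetical name, illustrative only.
static inline void ARGBShuffleRow_Sketch_C(const uint8* src_argb,
                                           uint8* dst_argb,
                                           const uint8* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}
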
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(shuffler) // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"
    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"
    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"
    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"
    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"
    "99: \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "=&d"(pixel_temp), // %2
    "+r"(width) // %3
  : "r"(shuffler) // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSE2
#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(src_u), // %1
    "+r"(src_v), // %2
    "+r"(dst_frame), // %3
    "+rm"(width) // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif // HAS_I422TOYUY2ROW_SSE2
#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(src_u), // %1
    "+r"(src_v), // %2
    "+r"(dst_frame), // %3
    "+rm"(width) // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif // HAS_I422TOUYVYROW_SSE2
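
// Scalar sketch of the 4:2:2 packing above: every pair of Y samples shares
// one U and one V sample, written as Y0 U Y1 V for YUY2 and U Y0 V Y1 for
// UYVY. Hypothetical name, illustrative only (YUY2 order shown; swap the
// positions of the luma and chroma bytes for UYVY).
static inline void I422ToYUY2Row_Sketch_C(const uint8* src_y,
                                          const uint8* src_u,
                                          const uint8* src_v,
                                          uint8* dst_frame, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U shared by both pixels
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V shared by both pixels
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}
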
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"
    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(poly) // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
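
// Scalar sketch of the per-channel cubic above: poly holds 4 coefficients per
// channel laid out as C0[4], C1[4], C2[4], C3[4], and each output byte is
// clamp(C0 + C1*x + C2*x^2 + C3*x^3) for the channel value x. Hypothetical
// name, illustrative only.
static inline void ARGBPolynomialRow_Sketch_C(const uint8* src_argb,
                                              uint8* dst_argb,
                                              const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      dst_argb[c] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
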
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(poly) // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb), // %0
    "=&d"(pixel_temp), // %1
    "+r"(width) // %2
  : "r"(table_argb) // %3
  : "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
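
// Scalar sketch of the in-place lookup above: the table holds 256 ARGB
// entries, and channel c of each pixel is replaced by table_argb[value*4 + c].
// (The RGB variant below does the same but leaves alpha untouched.)
// Hypothetical name, illustrative only.
static inline void ARGBColorTableRow_Sketch_C(uint8* dst_argb,
                                              const uint8* table_argb,
                                              int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
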
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb), // %0
    "=&d"(pixel_temp), // %1
    "+r"(width) // %2
  : "r"(table_argb) // %3
  : "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "sub $0x4,%4 \n"
    "jg 1b \n"
  : "=&d"(pixel_temp), // %0
    "=&a"(table_temp), // %1
    "+r"(src_argb), // %2
    "+r"(dst_argb), // %3
    "+rm"(width) // %4
  : "r"(luma), // %5
    "rm"(lumacoeff) // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif