430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
78641786517866178671786817869178701787117872178731787417875178761787717878178791788017881178821788317884178851788617887178881788917890178911789217893178941789517896178971789817899179001790117902179031790417905179061790717908179091791017911179121791317914179151791617917179181791917920179211792217923179241792517926179271792817929179301793117932179331793417935179361793717938179391794017941179421794317944179451794617947179481794917950179511795217953179541795517956179571795817959179601796117962179631796417965179661796717968179691797017971179721797317974179751797617977179781797917980179811798217983179841798517986179871798817989179901799117992179931799417995179961799717998179991800018001180021800318004180051800618007180081800918010180111801218013180141801518016180171801818019180201802118022180231802418025180261802718028180291803018031180321803318034180351803618037180381803918040180411804218043180441804518046180471804818049180501805118052180531805418055180561805718058180591806018061180621806318064180651806618067180681806918070180711807218073180741807518076180771807818079180801808118082180831808418085180861808718088 |
- /*
- ===========================================================================
- Doom 3 GPL Source Code
- Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
- Doom 3 Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #include "../precompiled.h"
- #pragma hdrstop
- #include "Simd_Generic.h"
- #include "Simd_MMX.h"
- #include "Simd_SSE.h"
- //===============================================================
- // M
- // SSE implementation of idSIMDProcessor MrE
- // E
- //===============================================================
- #if defined(MACOS_X) && defined(__i386__)
- #include <xmmintrin.h>
- #define DRAWVERT_SIZE 60
- #define DRAWVERT_XYZ_OFFSET (0*4)
- #define DRAWVERT_ST_OFFSET (3*4)
- #define DRAWVERT_NORMAL_OFFSET (5*4)
- #define DRAWVERT_TANGENT0_OFFSET (8*4)
- #define DRAWVERT_TANGENT1_OFFSET (11*4)
- #define DRAWVERT_COLOR_OFFSET (14*4)
- #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
- #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
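- // Note: R_SHUFFLEPS simply reverses the argument order of SHUFFLEPS so the
- // lane list reads left-to-right in memory order; e.g. R_SHUFFLEPS( 0, 1, 2, 3 )
- // evaluates to 0xE4, the identity immediate for shufps. A minimal sketch
- // (illustration only, not compiled):
- #if 0
- __m128 v = _mm_set_ps( 3.0f, 2.0f, 1.0f, 0.0f ); // lanes 0..3 hold 0,1,2,3
- v = _mm_shuffle_ps( v, v, R_SHUFFLEPS( 0, 1, 2, 3 ) ); // identity (0xE4)
- v = _mm_shuffle_ps( v, v, R_SHUFFLEPS( 3, 2, 1, 0 ) ); // lanes reversed (0x1B)
- #endif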
- /*
- ============
- idSIMD_SSE::GetName
- ============
- */
- const char * idSIMD_SSE::GetName( void ) const {
- return "MMX & SSE";
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant.Normal() * src[i].xyz + constant[3];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
- // 0, 1, 2
- // 3, 4, 5
- // 6, 7, 8
- // 9, 10, 11
- /*
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- */
- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers.
- int count_l4 = count; // count_l4 = eax
- int count_l1 = count; // count_l1 = edx
- char *constant_p = (char *)&constant; // constant_p = edi
- char *src_p = (char *) src; // src_p = esi
- char *dst_p = (char *) dst; // dst_p = ecx
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
-
- /*
- and eax, ~3
- movss xmm4, [edi+0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm5, [edi+4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [edi+12]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- */
- count_l4 = count_l4 & ~3;
- xmm4 = _mm_load_ss((float *) (constant_p));
- xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
- xmm5 = _mm_load_ss((float *) (constant_p + 4));
- xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
- xmm6 = _mm_load_ss((float *) (constant_p + 8));
- xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
- xmm7 = _mm_load_ss((float *) (constant_p + 12));
- xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
-
- /*
- jz startVert1
- */
- if(count_l4 != 0) {
- /*
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- */
- count_l4 = count_l4 * DRAWVERT_SIZE;
- src_p = src_p + count_l4;
- count_l4 = -count_l4;
- /*
- loopVert4:
- */
- do {
- /*
- movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
- movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
- movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
- movaps xmm1, xmm0 // 3, X, 0, 1
- */
- xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X
- xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X
- xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1
- xmm1 = xmm0; // 3, X, 0, 1
-
- /*
- movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
- shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
- */
- xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1
- xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5
-
- /*
- movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
- movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
- shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
- */
- xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X
- xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7
- xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9
- /*
- movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
- shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
- */
- xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7
- xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10
- /*
- movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
- */
- xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X
- xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11
-
- /*
- add ecx, 16
- add eax, 4*DRAWVERT_SIZE
- */
- dst_p = dst_p + 16;
- count_l4 = count_l4 + 4*DRAWVERT_SIZE;
-
- /*
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- addps xmm0, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm2
- */
- xmm0 = _mm_mul_ps(xmm0, xmm4);
- xmm1 = _mm_mul_ps(xmm1, xmm5);
- xmm2 = _mm_mul_ps(xmm2, xmm6);
- xmm0 = _mm_add_ps(xmm0, xmm7);
- xmm0 = _mm_add_ps(xmm0, xmm1);
- xmm0 = _mm_add_ps(xmm0, xmm2);
-
- /*
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loopVert4
- */
- _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
- _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
- } while(count_l4 < 0);
- }
-
- /*
- startVert1:
- and edx, 3
- jz done
- */
- count_l1 = count_l1 & 3;
- if(count_l1 != 0) {
- /*
- loopVert1:
- movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
- movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
- movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
- mulss xmm0, xmm4
- mulss xmm1, xmm5
- mulss xmm2, xmm6
- addss xmm0, xmm7
- add ecx, 4
- addss xmm0, xmm1
- add eax, DRAWVERT_SIZE
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loopVert1
- */
- do {
- xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
- xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
- xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
- xmm0 = _mm_mul_ss(xmm0, xmm4);
- xmm1 = _mm_mul_ss(xmm1, xmm5);
- xmm2 = _mm_mul_ss(xmm2, xmm6);
- xmm0 = _mm_add_ss(xmm0, xmm7);
- dst_p = dst_p + 4;
- xmm0 = _mm_add_ss(xmm0, xmm1);
- count_l4 = count_l4 + DRAWVERT_SIZE;
- xmm0 = _mm_add_ss(xmm0, xmm2);
- count_l1 = count_l1 - 1;
- _mm_store_ss((float *) (dst_p-4), xmm0);
- } while( count_l1 != 0);
- }
- /*
- done:
- */
- }
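- // For reference, a scalar sketch of what the SIMD path above computes
- // (Dot_Scalar is a hypothetical helper, not part of the original file):
- #if 0
- static void Dot_Scalar( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
- for ( int i = 0; i < count; i++ ) {
- dst[i] = constant[0] * src[i].xyz[0] + constant[1] * src[i].xyz[1] + constant[2] * src[i].xyz[2] + constant[3];
- }
- }
- #endif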
- /*
- ============
- idSIMD_SSE::MinMax
- ============
- */
- void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- char *indexes_p;
- char *src_p;
- int count_l;
- int edx;
- char *min_p;
- char *max_p;
-
- /*
- movss xmm0, idMath::INFINITY
- xorps xmm1, xmm1
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- subps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, xmm1
- */
- xmm0 = _mm_load_ss(&idMath::INFINITY);
- // The asm zeroes xmm1 with "xorps xmm1, xmm1", but xmm1 is still uninitialized here, so to satisfy the compiler xor xmm0 with itself instead.
- xmm1 = _mm_xor_ps(xmm0, xmm0);
- xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
- xmm1 = _mm_sub_ps(xmm1, xmm0);
- xmm2 = xmm0;
- xmm3 = xmm1;
- /*
- mov edi, indexes
- mov esi, src
- mov eax, count
- and eax, ~3
- jz done4
- */
- indexes_p = (char *) indexes;
- src_p = (char *) src;
- count_l = count;
- count_l = count_l & ~3;
- if(count_l != 0) {
- /*
- shl eax, 2
- add edi, eax
- neg eax
- */
- count_l = count_l << 2;
- indexes_p = indexes_p + count_l;
- count_l = -count_l;
- /*
- loop4:
- // prefetchnta [edi+128]
- // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
- */
- do {
- /*
- mov edx, [edi+eax+0]
- imul edx, DRAWVERT_SIZE
- movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- */
- edx = *((int*)(indexes_p+count_l+0));
- edx = edx * DRAWVERT_SIZE;
- xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
- xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
- xmm0 = _mm_min_ps(xmm0, xmm4);
- xmm1 = _mm_max_ps(xmm1, xmm4);
-
- /*
- mov edx, [edi+eax+4]
- imul edx, DRAWVERT_SIZE
- movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
- minps xmm2, xmm5
- maxps xmm3, xmm5
- */
- edx = *((int*)(indexes_p+count_l+4));
- edx = edx * DRAWVERT_SIZE;
- xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
- xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
- xmm2 = _mm_min_ps(xmm2, xmm5);
- xmm3 = _mm_max_ps(xmm3, xmm5);
-
- /*
- mov edx, [edi+eax+8]
- imul edx, DRAWVERT_SIZE
- movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm6
- maxps xmm1, xmm6
- */
- edx = *((int*)(indexes_p+count_l+8));
- edx = edx * DRAWVERT_SIZE;
- xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
- xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
- xmm0 = _mm_min_ps(xmm0, xmm6);
- xmm1 = _mm_max_ps(xmm1, xmm6);
-
- /*
- mov edx, [edi+eax+12]
- imul edx, DRAWVERT_SIZE
- movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
- minps xmm2, xmm7
- maxps xmm3, xmm7
- */
- edx = *((int*)(indexes_p+count_l+12));
- edx = edx * DRAWVERT_SIZE;
- xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
- xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
- xmm2 = _mm_min_ps(xmm2, xmm7);
- xmm3 = _mm_max_ps(xmm3, xmm7);
- /*
- add eax, 4*4
- jl loop4
- */
- count_l = count_l + 4*4;
- } while (count_l < 0);
- }
- /*
- done4:
- mov eax, count
- and eax, 3
- jz done1
- */
- count_l = count;
- count_l = count_l & 3;
- if(count_l != 0) {
- /*
- shl eax, 2
- add edi, eax
- neg eax
- */
- count_l = count_l << 2;
- indexes_p = indexes_p + count_l;
- count_l = -count_l;
- /*
- loop1:
- */
- do{
- /*
- mov edx, [edi+eax+0]
- imul edx, DRAWVERT_SIZE;
- movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- */
- edx = *((int*)(indexes_p+count_l+0));
- edx = edx * DRAWVERT_SIZE;
- xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
- xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
- xmm0 = _mm_min_ps(xmm0, xmm4);
- xmm1 = _mm_max_ps(xmm1, xmm4);
-
- /*
- add eax, 4
- jl loop1
- */
- count_l = count_l + 4;
- } while (count_l < 0);
-
- }
- /*
- done1:
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
- minps xmm0, xmm2
- maxps xmm1, xmm3
- mov esi, min
- movhps [esi], xmm0
- movss [esi+8], xmm0
- mov edi, max
- movhps [edi], xmm1
- movss [edi+8], xmm1
- */
- xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
- xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
- xmm0 = _mm_min_ps(xmm0, xmm2);
- xmm1 = _mm_max_ps(xmm1, xmm3);
- min_p = (char *) &min;
- _mm_storeh_pi((__m64 *)(min_p), xmm0);
- _mm_store_ss((float *)(min_p+8), xmm0);
- max_p = (char *) &max;
- _mm_storeh_pi((__m64 *)(max_p), xmm1);
- _mm_store_ss((float *)(max_p+8), xmm1);
- }
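- // A scalar sketch of the reduction above (MinMax_Scalar is a hypothetical helper):
- #if 0
- static void MinMax_Scalar( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
- min[0] = min[1] = min[2] = idMath::INFINITY;
- max[0] = max[1] = max[2] = -idMath::INFINITY;
- for ( int i = 0; i < count; i++ ) {
- const idVec3 &v = src[indexes[i]].xyz;
- for ( int j = 0; j < 3; j++ ) {
- if ( v[j] < min[j] ) { min[j] = v[j]; }
- if ( v[j] > max[j] ) { max[j] = v[j]; }
- }
- }
- }
- #endif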
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant * src[i].Normal() + src[i][3];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
- int count_l4;
- int count_l1;
- char *constant_p;
- char *src_p;
- char *dst_p;
- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- /*
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- and eax, ~3
- */
- count_l4 = count;
- constant_p = (char *) &constant;
- count_l1 = count_l4;
- src_p = (char *) src;
- dst_p = (char *) dst;
- count_l4 = count_l4 & ~3;
- /*
- movss xmm5, [edi+0]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [edi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- */
- xmm5 = _mm_load_ss((float *) (constant_p+0));
- xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
- xmm6 = _mm_load_ss((float *) (constant_p+4));
- xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
- xmm7 = _mm_load_ss((float *) (constant_p+8));
- xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
-
- /*
- jz startVert1
- */
- if(count_l4 != 0) { // the asm's "jz startVert1" tests the masked count (eax), so counts 1-3 must skip the 4-wide loop
- /*
- imul eax, 16
- add esi, eax
- neg eax
- */
- count_l4 = count_l4 * 16;
- src_p = src_p + count_l4;
- count_l4 = -count_l4;
- /*
- loopVert4:
- */
- do {
- /*
- movlps xmm1, [esi+eax+ 0]
- movlps xmm3, [esi+eax+ 8]
- movhps xmm1, [esi+eax+16]
- movhps xmm3, [esi+eax+24]
- movlps xmm2, [esi+eax+32]
- movlps xmm4, [esi+eax+40]
- movhps xmm2, [esi+eax+48]
- movhps xmm4, [esi+eax+56]
- movaps xmm0, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
- movaps xmm2, xmm3
- shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
- */
- xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
- xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
- xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
- xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
- xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
- xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
- xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
- xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
- xmm0 = xmm1;
- xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
- xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
- xmm2 = xmm3;
- xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
- xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
-
- /*
- add ecx, 16
- add eax, 4*16
- */
- dst_p = dst_p + 16;
- count_l4 = count_l4 + 4*16;
-
- /*
- mulps xmm0, xmm5
- mulps xmm1, xmm6
- mulps xmm2, xmm7
- addps xmm0, xmm3
- addps xmm0, xmm1
- addps xmm0, xmm2
- */
- xmm0 = _mm_mul_ps(xmm0, xmm5);
- xmm1 = _mm_mul_ps(xmm1, xmm6);
- xmm2 = _mm_mul_ps(xmm2, xmm7);
- xmm0 = _mm_add_ps(xmm0, xmm3);
- xmm0 = _mm_add_ps(xmm0, xmm1);
- xmm0 = _mm_add_ps(xmm0, xmm2);
-
- /*
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loopVert4
- */
- _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
- _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
- } while (count_l4 < 0);
- }
- /*
- startVert1:
- and edx, 3
- jz done
- */
- count_l1 = count_l1 & 3;
- if(count_l1 != 0) {
- /*
- loopVert1:
- */
- do {
- /*
- movss xmm0, [esi+eax+0]
- movss xmm1, [esi+eax+4]
- movss xmm2, [esi+eax+8]
- mulss xmm0, xmm5
- mulss xmm1, xmm6
- mulss xmm2, xmm7
- addss xmm0, [esi+eax+12]
- add ecx, 4
- addss xmm0, xmm1
- add eax, 16
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loopVert1
- */
- xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
- xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
- xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
- xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
-
- xmm0 = _mm_mul_ss(xmm0, xmm5);
- xmm1 = _mm_mul_ss(xmm1, xmm6);
- xmm2 = _mm_mul_ss(xmm2, xmm7);
-
- xmm0 = _mm_add_ss(xmm0, xmm3);
- dst_p = dst_p + 4;
- xmm0 = _mm_add_ss(xmm0, xmm1);
- count_l4 = count_l4 + 16;
- xmm0 = _mm_add_ss(xmm0, xmm2);
- count_l1 = count_l1 - 1;
- _mm_store_ss((float *) (dst_p-4), xmm0);
- } while (count_l1 != 0);
- }
- /*
- done:
- */
- }
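- // Scalar sketch of the plane variant above (DotPlane_Scalar is a hypothetical helper):
- #if 0
- static void DotPlane_Scalar( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
- for ( int i = 0; i < count; i++ ) {
- dst[i] = constant[0] * src[i][0] + constant[1] * src[i][1] + constant[2] * src[i][2] + src[i][3];
- }
- }
- #endif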
- #elif defined(_WIN32)
- #include <xmmintrin.h>
- #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
- #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
- // transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
- #define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
- __asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
- __asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
- __asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
- __asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
- __asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
- __asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
- __asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
- __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \
- __asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \
- __asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
- __asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \
- __asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */
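- // The register form above is functionally equivalent to the stock
- // _MM_TRANSPOSE4_PS( reg0, reg1, reg2, reg3 ) macro from xmmintrin.h, which
- // performs an equivalent unpack/shuffle sequence; the hand-written version
- // presumably keeps the temporary register explicit for manual scheduling.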
- // transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
- #define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
- __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
- __asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \
- __asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \
- __asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \
- __asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \
- __asm movlps reg4, [address+40] /* reg4 = 10, 11, X, X */ \
- __asm movhps reg2, [address+48] /* reg2 = 8, 9, 12, 13 */ \
- __asm movhps reg4, [address+56] /* reg4 = 10, 11, 14, 15 */ \
- __asm movaps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
- __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg0 = 0, 4, 8, 12 */ \
- __asm shufps reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg1 = 1, 5, 9, 13 */ \
- __asm movaps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
- __asm shufps reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg2 = 2, 6, 10, 14 */ \
- __asm shufps reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg3 = 3, 7, 11, 15 */
- // transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
- #define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
- __asm movaps reg4, reg0 /* reg4 = 0, 4, 8, 12 */ \
- __asm unpcklps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
- __asm unpckhps reg4, reg1 /* reg4 = 8, 9, 12, 13 */ \
- __asm movaps reg1, reg2 /* reg1 = 2, 6, 10, 14 */ \
- __asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
- __asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \
- __asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
- __asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
- __asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \
- __asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \
- __asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \
- __asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \
- __asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \
- __asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */
- // transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
- #define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
- __asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \
- __asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \
- __asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \
- __asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \
- __asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \
- __asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \
- __asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \
- __asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */
- // transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
- #define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
- __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
- __asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \
- __asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \
- __asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \
- __asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \
- __asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \
- __asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
- __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \
- __asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \
- __asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */
- // transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
- #define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
- __asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \
- __asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
- __asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \
- __asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \
- __asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
- __asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
- __asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \
- __asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \
- __asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \
- __asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */
- // with alignment
- #define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
- #define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
- #define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )
- #define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
- __asm mov ecx,DST \
- __asm shr ecx,2 \
- __asm mov ebx,COUNT \
- __asm neg ecx \
- __asm mov edx,SRC0 \
- __asm and ecx,3 \
- __asm mov esi,SRC1 \
- __asm sub ebx,ecx \
- __asm jge noUnderFlow \
- __asm xor ebx,ebx \
- __asm mov ecx,COUNT \
- __asm noUnderFlow: \
- __asm mov PRE,ecx \
- __asm mov eax,ebx \
- __asm mov edi,DST \
- __asm and eax,8-1 \
- __asm mov POST,eax \
- __asm and ebx,0xfffffff8 \
- __asm jle done \
- __asm shl ebx,2 \
- __asm lea ecx,[ecx*4+ebx] \
- __asm neg ebx \
- __asm add edx,ecx \
- __asm add esi,ecx \
- __asm add edi,ecx \
- __asm mov eax,edx \
- __asm or eax,esi
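- // A plain-C sketch of the bookkeeping above (a sketch only; assumes DST is a
- // float pointer, and the variable names here are illustrative):
- #if 0
- int pre = ( -(int)( (size_t)dst >> 2 ) ) & 3; // floats until dst is 16-byte aligned
- int rest = count - pre;
- if ( rest < 0 ) { pre = count; rest = 0; } // fewer than "pre" elements: everything is pre
- int post = rest & 7; // tail elements, handled later by KFLOATOPER
- int main8 = rest & ~7; // element count for the unrolled 8-float main loop
- #endif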
- // without alignment (pre==0)
- #define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
- #define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
- #define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
- #define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
- __asm mov eax,COUNT \
- __asm mov PRE,0 \
- __asm and eax,8-1 \
- __asm mov ebx,COUNT \
- __asm mov POST,eax \
- __asm and ebx,0xfffffff8 \
- __asm je done \
- __asm shl ebx,2 \
- __asm mov edx,SRC0 \
- __asm mov esi,SRC1 \
- __asm mov edi,DST \
- __asm add edx,ebx \
- __asm add esi,ebx \
- __asm add edi,ebx \
- __asm mov eax,edx \
- __asm or eax,esi \
- __asm or eax,edi \
- __asm neg ebx
- /*
- when OPER is called:
- edx = s0
- esi = s1
- edi = d
- ebx = index*4
- xmm0 & xmm1 must not be trashed
- */
- #define KMOVDS1( DST, SRC0 ) \
- __asm movss xmm2,SRC0 \
- __asm movss DST,xmm2
- #define KMOVDS4( DST, SRC0 ) \
- __asm movups xmm2,SRC0 \
- __asm movups DST,xmm2
- #define KMINDS1( DST, SRC0 ) \
- __asm movss xmm2,SRC0 \
- __asm minss DST,xmm2
- #define KMAXDS1( DST, SRC0 ) \
- __asm movss xmm2,SRC0 \
- __asm maxss DST,xmm2
- // general ALU operation
- #define KALUDSS1( OP, DST, SRC0, SRC1 ) \
- __asm movss xmm2,SRC0 \
- __asm OP##ss xmm2,SRC1 \
- __asm movss DST,xmm2
- #define KALUDSS4( OP, DST, SRC0, SRC1 ) \
- __asm movups xmm2,SRC0 \
- __asm movups xmm3,SRC1 \
- __asm OP##ps xmm2,xmm3 \
- __asm movups DST,xmm2
- #define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 )
- #define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 )
- #define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 )
- #define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 )
- #define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 )
- #define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 )
- #define KDIVDSS1( DST, SRC0, SRC1 ) \
- __asm movss xmm2,SRC1 \
- __asm rcpss xmm3,xmm2 \
- __asm mulss xmm2,xmm3 \
- __asm mulss xmm2,xmm3 \
- __asm addss xmm3,xmm3 \
- __asm subss xmm3,xmm2 \
- __asm mulss xmm3,SRC0 \
- __asm movss DST,xmm3
- #define KDIVDSS4( DST, SRC0, SRC1 ) \
- __asm movups xmm2,SRC1 \
- __asm rcpps xmm3,xmm2 \
- __asm mulps xmm2,xmm3 \
- __asm mulps xmm2,xmm3 \
- __asm addps xmm3,xmm3 \
- __asm subps xmm3,xmm2 \
- __asm movups xmm2,SRC0 \
- __asm mulps xmm3,xmm2 \
- __asm movups DST,xmm3
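- // KDIVDSS1/KDIVDSS4 refine the ~12-bit rcpss/rcpps estimate r0 with one
- // Newton-Raphson step, r1 = r0 * ( 2 - d * r0 ), before multiplying by the
- // numerator; in scalar form (sketch): dst = src * ( 2.0f * r0 - d * r0 * r0 ).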
- #define KF2IDS1( SRC0 ) \
- __asm movss xmm2,SRC0 \
- __asm cvttps2pi mm2,xmm2 \
- __asm movd [edi+ebx],mm2
- #define KF2IDS4( SRC0 ) \
- __asm movups xmm2,SRC0 \
- __asm cvttps2pi mm2,xmm2 \
- __asm movq [edi+ebx+0],mm2 \
- __asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \
- __asm cvttps2pi mm2,xmm2 \
- __asm movq [edi+ebx+8],mm2
- #define KISQRTDS1( DST,SRC0 ) \
- __asm movss xmm2,SRC0 \
- __asm rsqrtss xmm3,xmm2 \
- __asm mulss xmm2,xmm3 \
- __asm mulss xmm2,xmm3 \
- __asm subss xmm2,xmm1 \
- __asm mulss xmm3,xmm0 \
- __asm mulss xmm3,xmm2 \
- __asm movss DST,xmm3
- #define KISQRTDS4( DST,SRC0 ) \
- __asm movups xmm2,SRC0 \
- __asm rsqrtps xmm3,xmm2 \
- __asm mulps xmm2,xmm3 \
- __asm mulps xmm2,xmm3 \
- __asm subps xmm2,xmm1 \
- __asm mulps xmm3,xmm0 \
- __asm mulps xmm3,xmm2 \
- __asm movups DST,xmm3
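- // KISQRTDS1/KISQRTDS4 presumably expect the caller to preload xmm0 = -0.5
- // (SIMD_SP_rsqrt_c1) and xmm1 = 3.0 (SIMD_SP_rsqrt_c0); see the register notes
- // above. That gives the refinement r1 = -0.5f * r0 * ( x * r0 * r0 - 3.0f ),
- // the same Newton-Raphson step SSE_InvSqrt below spells out.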
- // this is used in vector4 implementation to shift constant V4
- #define KANDREGDSV( DST, SRC0, VALUE ) \
- __asm mov DST,SRC0 \
- __asm and DST,VALUE
- // this is used in vector4 code to operate with float arrays as sources
- #define KEXPANDFLOAT( DST, SRC ) \
- __asm movss DST,SRC \
- __asm shufps DST,DST,0
- #define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC )
- #define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC )
- #define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC )
- #define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC )
- #define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC )
- #define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC )
- #define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC )
- #define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC )
- // handles pre & post leftovers
- #define KFLOATOPER( OPER, OPER4, COUNT ) \
- __asm mov ecx,pre \
- __asm mov ebx,COUNT \
- __asm cmp ebx,ecx \
- __asm cmovl ecx,COUNT \
- __asm test ecx,ecx \
- __asm je preDone \
- __asm xor ebx,ebx \
- __asm lpPre: \
- OPER \
- __asm add ebx,4 \
- __asm dec ecx \
- __asm jg lpPre \
- __asm preDone: \
- __asm mov ecx,post \
- __asm mov ebx,COUNT \
- __asm sub ebx,ecx \
- __asm shl ebx,2 \
- __asm cmp ecx,4 \
- __asm jl post4Done \
- OPER4 \
- __asm sub ecx,4 \
- __asm add ebx,4*4 \
- __asm post4Done: \
- __asm test ecx,ecx \
- __asm je postDone \
- __asm lpPost: \
- OPER \
- __asm add ebx,4 \
- __asm dec ecx \
- __asm jg lpPost \
- __asm postDone:
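- // In plain terms (a sketch of the control flow above): run OPER once per
- // element for the first min( pre, COUNT ) elements, then from byte offset
- // ( COUNT - post ) * 4 run OPER4 once if at least four tail elements remain,
- // and finish the leftovers with OPER.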
- // operate on a constant and a float array
- #define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \
- int pre,post; \
- __asm movss xmm0,CONSTANT \
- __asm shufps xmm0,xmm0,0 \
- KFLOATINITDS( DST, SRC, COUNT, pre, post ) \
- __asm and eax,15 \
- __asm jne lpNA \
- __asm jmp lpA \
- __asm align 16 \
- __asm lpA: \
- __asm prefetchnta [edx+ebx+64] \
- __asm movaps xmm1,xmm0 \
- __asm movaps xmm2,xmm0 \
- __asm ALUOP##ps xmm1,[edx+ebx] \
- __asm ALUOP##ps xmm2,[edx+ebx+16] \
- __asm movaps [edi+ebx],xmm1 \
- __asm movaps [edi+ebx+16],xmm2 \
- __asm add ebx,16*2 \
- __asm jl lpA \
- __asm jmp done \
- __asm align 16 \
- __asm lpNA: \
- __asm prefetchnta [edx+ebx+64] \
- __asm movaps xmm1,xmm0 \
- __asm movaps xmm2,xmm0 \
- __asm movups xmm3,[edx+ebx] \
- __asm movups xmm4,[edx+ebx+16] \
- __asm ALUOP##ps xmm1,xmm3 \
- __asm ALUOP##ps xmm2,xmm4 \
- __asm movaps [edi+ebx],xmm1 \
- __asm movaps [edi+ebx+16],xmm2 \
- __asm add ebx,16*2 \
- __asm jl lpNA \
- __asm done: \
- __asm mov edx,SRC \
- __asm mov edi,DST \
- KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \
- KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
- // operate on two float arrays
- #define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \
- int pre,post; \
- KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \
- __asm and eax,15 \
- __asm jne lpNA \
- __asm jmp lpA \
- __asm align 16 \
- __asm lpA: \
- __asm movaps xmm1,[edx+ebx] \
- __asm movaps xmm2,[edx+ebx+16] \
- __asm ALUOP##ps xmm1,[esi+ebx] \
- __asm ALUOP##ps xmm2,[esi+ebx+16] \
- __asm prefetchnta [edx+ebx+64] \
- __asm prefetchnta [esi+ebx+64] \
- __asm movaps [edi+ebx],xmm1 \
- __asm movaps [edi+ebx+16],xmm2 \
- __asm add ebx,16*2 \
- __asm jl lpA \
- __asm jmp done \
- __asm align 16 \
- __asm lpNA: \
- __asm movups xmm1,[edx+ebx] \
- __asm movups xmm2,[edx+ebx+16] \
- __asm movups xmm3,[esi+ebx] \
- __asm movups xmm4,[esi+ebx+16] \
- __asm prefetchnta [edx+ebx+64] \
- __asm prefetchnta [esi+ebx+64] \
- __asm ALUOP##ps xmm1,xmm3 \
- __asm ALUOP##ps xmm2,xmm4 \
- __asm movaps [edi+ebx],xmm1 \
- __asm movaps [edi+ebx+16],xmm2 \
- __asm add ebx,16*2 \
- __asm jl lpNA \
- __asm done: \
- __asm mov edx,SRC0 \
- __asm mov esi,SRC1 \
- __asm mov edi,DST \
- KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \
- KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
- #define DRAWVERT_SIZE 60
- #define DRAWVERT_XYZ_OFFSET (0*4)
- #define DRAWVERT_ST_OFFSET (3*4)
- #define DRAWVERT_NORMAL_OFFSET (5*4)
- #define DRAWVERT_TANGENT0_OFFSET (8*4)
- #define DRAWVERT_TANGENT1_OFFSET (11*4)
- #define DRAWVERT_COLOR_OFFSET (14*4)
- #define JOINTQUAT_SIZE (7*4)
- #define JOINTMAT_SIZE (4*3*4)
- #define JOINTWEIGHT_SIZE (4*4)
- #define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
- #define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
- #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
- ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
- ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
- ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
- ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
- ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
- ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );
- ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
- ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
- ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
- ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
- ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );
- ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
- ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
- ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
- ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
- ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
- ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
- ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
- ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
- ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
- ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
- ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
- ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
- ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
- ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
- ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
- ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
- ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );
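- // polynomial coefficients (applied Horner-style by the routines further down)
- // for the range-reduced sin/cos/atan approximations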
- ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
- ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
- ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
- ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
- ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );
- ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
- ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
- ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
- ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
- ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );
- ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
- ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
- ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
- ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
- ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
- ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
- ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
- ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
- /*
- ============
- SSE_InvSqrt
- ============
- */
- float SSE_InvSqrt( float x ) {
- float y;
- __asm {
- movss xmm0, x
- rsqrtss xmm1, xmm0
- mulss xmm0, xmm1
- mulss xmm0, xmm1
- subss xmm0, SIMD_SP_rsqrt_c0
- mulss xmm1, SIMD_SP_rsqrt_c1
- mulss xmm0, xmm1
- movss y, xmm0
- }
- return y;
- }
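- // Scalar equivalent of the refinement above (sketch): with r = rsqrtss( x ),
- // return ( x * r * r - 3.0f ) * ( r * -0.5f ); one Newton-Raphson step takes
- // the ~12-bit hardware estimate to near full single precision.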
- /*
- ============
- SSE_InvSqrt4
- ============
- */
- void SSE_InvSqrt4( float x[4] ) {
- __asm {
- mov edi, x
- movaps xmm0, [edi]
- rsqrtps xmm1, xmm0
- mulps xmm0, xmm1
- mulps xmm0, xmm1
- subps xmm0, SIMD_SP_rsqrt_c0
- mulps xmm1, SIMD_SP_rsqrt_c1
- mulps xmm0, xmm1
- movaps [edi], xmm0
- }
- }
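- /*
- ============
- Usage sketch for SSE_InvSqrt4 ( illustrative ): the movaps load/store above
- requires the array to be 16-byte aligned, e.g. declared through ALIGN16.
- ============
- */
- static void SSE_InvSqrt4_Example( void ) {
- ALIGN16( float v[4] ) = { 1.0f, 4.0f, 9.0f, 16.0f };
- SSE_InvSqrt4( v ); // v ~= { 1.0f, 0.5f, 0.333f, 0.25f }
- }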
- /*
- ============
- SSE_SinZeroHalfPI
- The angle must be between zero and half PI.
- ============
- */
- float SSE_SinZeroHalfPI( float a ) {
- #if 1
- float t;
- assert( a >= 0.0f && a <= idMath::HALF_PI );
- __asm {
- movss xmm0, a
- movss xmm1, xmm0
- mulss xmm1, xmm1
- movss xmm2, SIMD_SP_sin_c0
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c1
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c2
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c3
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c4
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_one
- mulss xmm2, xmm0
- movss t, xmm2
- }
- return t;
- #else
- float s, t;
- assert( a >= 0.0f && a <= idMath::HALF_PI );
- s = a * a;
- t = -2.39e-08f;
- t *= s;
- t += 2.7526e-06f;
- t *= s;
- t += -1.98409e-04f;
- t *= s;
- t += 8.3333315e-03f;
- t *= s;
- t += -1.666666664e-01f;
- t *= s;
- t += 1.0f;
- t *= a;
- return t;
- #endif
- }
- /*
- ============
- SSE_Sin4ZeroHalfPI
- The angle must be between zero and half PI.
- ============
- */
- void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
- __asm {
- mov edi, a
- mov esi, s
- movaps xmm0, [edi]
- movaps xmm1, xmm0
- mulps xmm1, xmm1
- movaps xmm2, SIMD_SP_sin_c0
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c1
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c2
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c3
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c4
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_one
- mulps xmm2, xmm0
- movaps [esi], xmm2
- }
- }
- /*
- ============
- SSE_Sin
- Valid for any angle: the input is wrapped into [0, 2PI) and folded into
- [-PI/2, PI/2] using sin( a ) = sin( PI - a ) before the polynomial is
- evaluated.
- ============
- */
- float SSE_Sin( float a ) {
- #if 1
- float t;
- __asm {
- movss xmm1, a
- movss xmm2, xmm1
- movss xmm3, xmm1
- mulss xmm2, SIMD_SP_oneOverTwoPI
- cvttss2si ecx, xmm2
- cmpltss xmm3, SIMD_SP_zero
- andps xmm3, SIMD_SP_one
- cvtsi2ss xmm2, ecx
- subss xmm2, xmm3
- mulss xmm2, SIMD_SP_twoPI
- subss xmm1, xmm2
- movss xmm0, SIMD_SP_PI // xmm0 = PI
- subss xmm0, xmm1 // xmm0 = PI - a
- movss xmm1, xmm0 // xmm1 = PI - a
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
- movss xmm2, xmm0 // xmm2 = PI - a
- xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
- cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
- movss xmm3, SIMD_SP_PI // xmm3 = PI
- xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
- andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
- andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
- xorps xmm0, xmm2
- addps xmm0, xmm3
- movss xmm1, xmm0
- mulss xmm1, xmm1
- movss xmm2, SIMD_SP_sin_c0
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c1
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c2
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c3
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_sin_c4
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_one
- mulss xmm2, xmm0
- movss t, xmm2
- }
- return t;
- #else
- float s, t;
- if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
- a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
- }
- a = idMath::PI - a;
- if ( fabs( a ) >= idMath::HALF_PI ) {
- a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
- }
- s = a * a;
- t = -2.39e-08f;
- t *= s;
- t += 2.7526e-06f;
- t *= s;
- t += -1.98409e-04f;
- t *= s;
- t += 8.3333315e-03f;
- t *= s;
- t += -1.666666664e-01f;
- t *= s;
- t += 1.0f;
- t *= a;
- return t;
- #endif
- }
- /*
- ============
- SSE_Sin4
- ============
- */
- void SSE_Sin4( float a[4], float s[4] ) {
- __asm {
- mov edi, a
- mov esi, s
- movaps xmm1, [edi]
- movaps xmm2, xmm1
- mulps xmm2, SIMD_SP_oneOverTwoPI
- movhlps xmm3, xmm2
- cvttss2si ecx, xmm2
- cvtsi2ss xmm2, ecx
- cvttss2si edx, xmm3
- cvtsi2ss xmm3, edx
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
- cvttss2si ecx, xmm2
- cvtsi2ss xmm2, ecx
- cvttss2si edx, xmm3
- cvtsi2ss xmm3, edx
- shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
- movaps xmm3, xmm1
- cmpltps xmm3, SIMD_SP_zero
- andps xmm3, SIMD_SP_one
- subps xmm2, xmm3
- mulps xmm2, SIMD_SP_twoPI
- subps xmm1, xmm2
- movaps xmm0, SIMD_SP_PI // xmm0 = PI
- subps xmm0, xmm1 // xmm0 = PI - a
- movaps xmm1, xmm0 // xmm1 = PI - a
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
- movaps xmm2, xmm0 // xmm2 = PI - a
- xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
- cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
- movaps xmm3, SIMD_SP_PI // xmm3 = PI
- xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
- andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
- andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
- xorps xmm0, xmm2
- addps xmm0, xmm3
- movaps xmm1, xmm0
- mulps xmm1, xmm1
- movaps xmm2, SIMD_SP_sin_c0
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c1
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c2
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c3
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_sin_c4
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_one
- mulps xmm2, xmm0
- movaps [esi], xmm2
- }
- }
- /*
- ============
- SSE_CosZeroHalfPI
- The angle must be between zero and half PI.
- ============
- */
- float SSE_CosZeroHalfPI( float a ) {
- #if 1
- float t;
- assert( a >= 0.0f && a <= idMath::HALF_PI );
- __asm {
- movss xmm0, a
- mulss xmm0, xmm0
- movss xmm1, SIMD_SP_cos_c0
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c1
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c2
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c3
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c4
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_one
- movss t, xmm1
- }
- return t;
- #else
- float s, t;
- assert( a >= 0.0f && a <= idMath::HALF_PI );
- s = a * a;
- t = -2.605e-07f;
- t *= s;
- t += 2.47609e-05f;
- t *= s;
- t += -1.3888397e-03f;
- t *= s;
- t += 4.16666418e-02f;
- t *= s;
- t += -4.999999963e-01f;
- t *= s;
- t += 1.0f;
- return t;
- #endif
- }
- /*
- ============
- SSE_Cos4ZeroHalfPI
- The angle must be between zero and half PI.
- ============
- */
- void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
- __asm {
- mov edi, a
- mov esi, c
- movaps xmm0, [edi]
- mulps xmm0, xmm0
- movaps xmm1, SIMD_SP_cos_c0
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c1
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c2
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c3
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c4
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_one
- movaps [esi], xmm1
- }
- }
- /*
- ============
- SSE_Cos
- Uses the same range reduction as SSE_Sin; since the cosine polynomial is
- even in the reduced angle, only the sign of the result has to be restored
- at the end.
- ============
- */
- float SSE_Cos( float a ) {
- #if 1
- float t;
- __asm {
- movss xmm1, a
- movss xmm2, xmm1
- movss xmm3, xmm1
- mulss xmm2, SIMD_SP_oneOverTwoPI
- cvttss2si ecx, xmm2
- cmpltss xmm3, SIMD_SP_zero
- andps xmm3, SIMD_SP_one
- cvtsi2ss xmm2, ecx
- subss xmm2, xmm3
- mulss xmm2, SIMD_SP_twoPI
- subss xmm1, xmm2
- movss xmm0, SIMD_SP_PI // xmm0 = PI
- subss xmm0, xmm1 // xmm0 = PI - a
- movss xmm1, xmm0 // xmm1 = PI - a
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
- movss xmm2, xmm0 // xmm2 = PI - a
- xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
- cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
- movss xmm3, SIMD_SP_PI // xmm3 = PI
- xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
- andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
- andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
- xorps xmm0, xmm2
- addps xmm0, xmm3
- mulss xmm0, xmm0
- movss xmm1, SIMD_SP_cos_c0
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c1
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c2
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c3
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_cos_c4
- mulss xmm1, xmm0
- addss xmm1, SIMD_SP_one
- xorps xmm2, SIMD_SP_signBitMask
- xorps xmm1, xmm2
- movss t, xmm1
- }
- return t;
- #else
- float d, s, t;
- if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
- a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
- }
- a = idMath::PI - a;
- if ( fabs( a ) >= idMath::HALF_PI ) {
- a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
- d = 1.0f;
- } else {
- d = -1.0f;
- }
- s = a * a;
- t = -2.605e-07f;
- t *= s;
- t += 2.47609e-05f;
- t *= s;
- t += -1.3888397e-03f;
- t *= s;
- t += 4.16666418e-02f;
- t *= s;
- t += -4.999999963e-01f;
- t *= s;
- t += 1.0f;
- t *= d;
- return t;
- #endif
- }
- /*
- ============
- SSE_Cos4
- ============
- */
- void SSE_Cos4( float a[4], float c[4] ) {
- __asm {
- mov edi, a
- mov esi, c
- movaps xmm1, [edi]
- movaps xmm2, xmm1
- mulps xmm2, SIMD_SP_oneOverTwoPI
- movhlps xmm3, xmm2
- cvttss2si ecx, xmm2
- cvtsi2ss xmm2, ecx
- cvttss2si edx, xmm3
- cvtsi2ss xmm3, edx
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
- cvttss2si ecx, xmm2
- cvtsi2ss xmm2, ecx
- cvttss2si edx, xmm3
- cvtsi2ss xmm3, edx
- shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
- movaps xmm3, xmm1
- cmpltps xmm3, SIMD_SP_zero
- andps xmm3, SIMD_SP_one
- subps xmm2, xmm3
- mulps xmm2, SIMD_SP_twoPI
- subps xmm1, xmm2
- movaps xmm0, SIMD_SP_PI // xmm0 = PI
- subps xmm0, xmm1 // xmm0 = PI - a
- movaps xmm1, xmm0 // xmm1 = PI - a
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
- movaps xmm2, xmm0 // xmm2 = PI - a
- xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
- cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
- movaps xmm3, SIMD_SP_PI // xmm3 = PI
- xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
- andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
- andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
- xorps xmm0, xmm2
- addps xmm0, xmm3
- mulps xmm0, xmm0
- movaps xmm1, SIMD_SP_cos_c0
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c1
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c2
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c3
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_cos_c4
- mulps xmm1, xmm0
- addps xmm1, SIMD_SP_one
- xorps xmm2, SIMD_SP_signBitMask
- xorps xmm1, xmm2
- movaps [esi], xmm1
- }
- }
- /*
- ============
- SSE_SinCos
- ============
- */
- void SSE_SinCos( float a, float &s, float &c ) {
- __asm {
- mov edi, s
- mov esi, c
- movss xmm1, a
- movss xmm2, xmm1
- movss xmm3, xmm1
- mulss xmm2, SIMD_SP_oneOverTwoPI
- cvttss2si ecx, xmm2
- cmpltss xmm3, SIMD_SP_zero
- andps xmm3, SIMD_SP_one
- cvtsi2ss xmm2, ecx
- subss xmm2, xmm3
- mulss xmm2, SIMD_SP_twoPI
- subss xmm1, xmm2
- movss xmm0, SIMD_SP_PI // xmm0 = PI
- subss xmm0, xmm1 // xmm0 = PI - a
- movss xmm1, xmm0 // xmm1 = PI - a
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
- movss xmm2, xmm0 // xmm2 = PI - a
- xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
- cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
- movss xmm3, SIMD_SP_PI // xmm3 = PI
- xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
- andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
- andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
- xorps xmm0, xmm2
- addps xmm0, xmm3
- movss xmm1, xmm0
- mulss xmm1, xmm1
- movss xmm3, SIMD_SP_sin_c0
- movss xmm4, SIMD_SP_cos_c0
- mulss xmm3, xmm1
- mulss xmm4, xmm1
- addss xmm3, SIMD_SP_sin_c1
- addss xmm4, SIMD_SP_cos_c1
- mulss xmm3, xmm1
- mulss xmm4, xmm1
- addss xmm3, SIMD_SP_sin_c2
- addss xmm4, SIMD_SP_cos_c2
- mulss xmm3, xmm1
- mulss xmm4, xmm1
- addss xmm3, SIMD_SP_sin_c3
- addss xmm4, SIMD_SP_cos_c3
- mulss xmm3, xmm1
- mulss xmm4, xmm1
- addss xmm3, SIMD_SP_sin_c4
- addss xmm4, SIMD_SP_cos_c4
- mulss xmm3, xmm1
- mulss xmm4, xmm1
- addss xmm3, SIMD_SP_one
- addss xmm4, SIMD_SP_one
- mulss xmm3, xmm0
- xorps xmm2, SIMD_SP_signBitMask
- xorps xmm4, xmm2
- movss [edi], xmm3
- movss [esi], xmm4
- }
- }
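- /*
- ============
- SinCos_Reference
- Scalar sketch of SSE_SinCos, assembled from the C fallbacks of SSE_Sin and
- SSE_Cos above ( illustrative only, not part of the original interface ).
- After the range reduction only the cosine needs its sign restored.
- ============
- */
- static void SinCos_Reference( float a, float &s, float &c ) {
- float d, t;
- if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
- a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
- }
- a = idMath::PI - a;
- if ( fabs( a ) >= idMath::HALF_PI ) {
- a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
- d = 1.0f;
- } else {
- d = -1.0f;
- }
- t = a * a;
- s = a * ( 1.0f + t * ( -1.666666664e-01f + t * ( 8.3333315e-03f + t * ( -1.98409e-04f + t * ( 2.7526e-06f + t * -2.39e-08f ) ) ) ) );
- c = d * ( 1.0f + t * ( -4.999999963e-01f + t * ( 4.16666418e-02f + t * ( -1.3888397e-03f + t * ( 2.47609e-05f + t * -2.605e-07f ) ) ) ) );
- }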
- /*
- ============
- SSE_SinCos4
- ============
- */
- void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
- __asm {
- mov eax, a
- mov edi, s
- mov esi, c
- movaps xmm1, [eax]
- movaps xmm2, xmm1
- mulps xmm2, SIMD_SP_oneOverTwoPI
- movhlps xmm3, xmm2
- cvttss2si ecx, xmm2
- cvtsi2ss xmm2, ecx
- cvttss2si edx, xmm3
- cvtsi2ss xmm3, edx
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
- cvttss2si ecx, xmm2
- cvtsi2ss xmm2, ecx
- cvttss2si edx, xmm3
- cvtsi2ss xmm3, edx
- shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
- movaps xmm3, xmm1
- cmpltps xmm3, SIMD_SP_zero
- andps xmm3, SIMD_SP_one
- subps xmm2, xmm3
- mulps xmm2, SIMD_SP_twoPI
- subps xmm1, xmm2
- movaps xmm0, SIMD_SP_PI // xmm0 = PI
- subps xmm0, xmm1 // xmm0 = PI - a
- movaps xmm1, xmm0 // xmm1 = PI - a
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
- movaps xmm2, xmm0 // xmm2 = PI - a
- xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
- cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
- movaps xmm3, SIMD_SP_PI // xmm3 = PI
- xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
- andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
- andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
- xorps xmm0, xmm2
- addps xmm0, xmm3
- movaps xmm1, xmm0
- mulps xmm1, xmm1
- movaps xmm3, SIMD_SP_sin_c0
- movaps xmm4, SIMD_SP_cos_c0
- mulps xmm3, xmm1
- mulps xmm4, xmm1
- addps xmm3, SIMD_SP_sin_c1
- addps xmm4, SIMD_SP_cos_c1
- mulps xmm3, xmm1
- mulps xmm4, xmm1
- addps xmm3, SIMD_SP_sin_c2
- addps xmm4, SIMD_SP_cos_c2
- mulps xmm3, xmm1
- mulps xmm4, xmm1
- addps xmm3, SIMD_SP_sin_c3
- addps xmm4, SIMD_SP_cos_c3
- mulps xmm3, xmm1
- mulps xmm4, xmm1
- addps xmm3, SIMD_SP_sin_c4
- addps xmm4, SIMD_SP_cos_c4
- mulps xmm3, xmm1
- mulps xmm4, xmm1
- addps xmm3, SIMD_SP_one
- addps xmm4, SIMD_SP_one
- mulps xmm3, xmm0
- xorps xmm2, SIMD_SP_signBitMask
- xorps xmm4, xmm2
- movaps [edi], xmm3
- movaps [esi], xmm4
- }
- }
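- /*
- ============
- Usage sketch for the 4-wide variants ( illustrative ): all arrays passed to
- SSE_Sin4, SSE_Cos4 and SSE_SinCos4 must be 16-byte aligned because the
- routines load and store with movaps.
- ============
- */
- static void SSE_SinCos4_Example( void ) {
- ALIGN16( float a[4] ) = { 0.0f, 0.5f, 1.0f, 2.0f };
- ALIGN16( float s[4] );
- ALIGN16( float c[4] );
- SSE_SinCos4( a, s, c );
- }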
- /*
- ============
- SSE_ATanPositive
- Both 'x' and 'y' must be positive.
- ============
- */
- float SSE_ATanPositive( float y, float x ) {
- #if 1
- float t;
- assert( y >= 0.0f && x >= 0.0f );
- __asm {
- movss xmm0, x
- movss xmm3, xmm0
- movss xmm1, y
- minss xmm0, xmm1
- maxss xmm1, xmm3
- cmpeqss xmm3, xmm0
- rcpss xmm2, xmm1
- mulss xmm1, xmm2
- mulss xmm1, xmm2
- addss xmm2, xmm2
- subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
- mulss xmm0, xmm2 // xmm0 = x / y or y / x
- movss xmm1, xmm3
- andps xmm1, SIMD_SP_signBitMask
- xorps xmm0, xmm1 // xmm0 = -x / y or y / x
- andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
- movss xmm1, xmm0
- mulss xmm1, xmm1 // xmm1 = s
- movss xmm2, SIMD_SP_atan_c0
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c1
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c2
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c3
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c4
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c5
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c6
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c7
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_one
- mulss xmm2, xmm0
- addss xmm2, xmm3
- movss t, xmm2
- }
- return t;
- #else
- float a, d, s, t;
- assert( y >= 0.0f && x >= 0.0f );
- if ( y > x ) {
- a = -x / y;
- d = idMath::HALF_PI;
- } else {
- a = y / x;
- d = 0.0f;
- }
- s = a * a;
- t = 0.0028662257f;
- t *= s;
- t += -0.0161657367f;
- t *= s;
- t += 0.0429096138f;
- t *= s;
- t += -0.0752896400f;
- t *= s;
- t += 0.1065626393f;
- t *= s;
- t += -0.1420889944f;
- t *= s;
- t += 0.1999355085f;
- t *= s;
- t += -0.3333314528f;
- t *= s;
- t += 1.0f;
- t *= a;
- t += d;
- return t;
- #endif
- }
- /*
- ============
- SSE_ATan4Positive
- Both 'x' and 'y' must be positive.
- ============
- */
- void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
- __asm {
- mov esi, x
- mov edi, y
- mov edx, at
- movaps xmm0, [esi]
- movaps xmm3, xmm0
- movaps xmm1, [edi]
- minps xmm0, xmm1
- maxps xmm1, xmm3
- cmpeqps xmm3, xmm0
- rcpps xmm2, xmm1
- mulps xmm1, xmm2
- mulps xmm1, xmm2
- addps xmm2, xmm2
- subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
- mulps xmm0, xmm2 // xmm0 = x / y or y / x
- movaps xmm1, xmm3
- andps xmm1, SIMD_SP_signBitMask
- xorps xmm0, xmm1 // xmm0 = -x / y or y / x
- andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
- movaps xmm1, xmm0
- mulps xmm1, xmm1 // xmm1 = s
- movaps xmm2, SIMD_SP_atan_c0
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c1
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c2
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c3
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c4
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c5
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c6
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c7
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_one
- mulps xmm2, xmm0
- addps xmm2, xmm3
- movaps [edx], xmm2
- }
- }
- /*
- ============
- SSE_ATan
- ============
- */
- float SSE_ATan( float y, float x ) {
- #if 1
- float t;
- __asm {
- movss xmm0, x
- movss xmm3, xmm0
- movss xmm4, xmm0
- andps xmm0, SIMD_SP_absMask
- movss xmm1, y
- xorps xmm4, xmm1
- andps xmm1, SIMD_SP_absMask
- andps xmm4, SIMD_SP_signBitMask
- minss xmm0, xmm1
- maxss xmm1, xmm3
- cmpeqss xmm3, xmm0
- rcpss xmm2, xmm1
- mulss xmm1, xmm2
- mulss xmm1, xmm2
- addss xmm2, xmm2
- subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
- mulss xmm0, xmm2 // xmm0 = x / y or y / x
- xorps xmm0, xmm4
- movss xmm1, xmm3
- andps xmm1, SIMD_SP_signBitMask
- xorps xmm0, xmm1 // xmm0 = -x / y or y / x
- orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
- andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
- movss xmm1, xmm0
- mulss xmm1, xmm1 // xmm1 = s
- movss xmm2, SIMD_SP_atan_c0
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c1
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c2
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c3
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c4
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c5
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c6
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_atan_c7
- mulss xmm2, xmm1
- addss xmm2, SIMD_SP_one
- mulss xmm2, xmm0
- addss xmm2, xmm3
- movss t, xmm2
- }
- return t;
- #else
- float a, d, s, t;
- if ( fabs( y ) > fabs( x ) ) {
- a = -x / y;
- d = idMath::HALF_PI;
- *((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31);
- } else {
- a = y / x;
- d = 0.0f;
- }
- s = a * a;
- t = 0.0028662257f;
- t *= s;
- t += -0.0161657367f;
- t *= s;
- t += 0.0429096138f;
- t *= s;
- t += -0.0752896400f;
- t *= s;
- t += 0.1065626393f;
- t *= s;
- t += -0.1420889944f;
- t *= s;
- t += 0.1999355085f;
- t *= s;
- t += -0.3333314528f;
- t *= s;
- t += 1.0f;
- t *= a;
- t += d;
- return t;
- #endif
- }
- /*
- ============
- SSE_ATan4
- ============
- */
- void SSE_ATan4( float y[4], float x[4], float at[4] ) {
- __asm {
- mov esi, x
- mov edi, y
- mov edx, at
- movaps xmm0, [esi]
- movaps xmm3, xmm0
- movaps xmm4, xmm0
- andps xmm0, SIMD_SP_absMask
- movaps xmm1, [edi]
- xorps xmm4, xmm1
- andps xmm1, SIMD_SP_absMask
- andps xmm4, SIMD_SP_signBitMask
- minps xmm0, xmm1
- maxps xmm1, xmm3
- cmpeqps xmm3, xmm0
- rcpps xmm2, xmm1
- mulps xmm1, xmm2
- mulps xmm1, xmm2
- addps xmm2, xmm2
- subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
- mulps xmm0, xmm2 // xmm0 = x / y or y / x
- xorps xmm0, xmm4
- movaps xmm1, xmm3
- andps xmm1, SIMD_SP_signBitMask
- xorps xmm0, xmm1 // xmm0 = -x / y or y / x
- orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
- andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
- movaps xmm1, xmm0
- mulps xmm1, xmm1 // xmm1 = s
- movaps xmm2, SIMD_SP_atan_c0
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c1
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c2
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c3
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c4
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c5
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c6
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_atan_c7
- mulps xmm2, xmm1
- addps xmm2, SIMD_SP_one
- mulps xmm2, xmm0
- addps xmm2, xmm3
- movaps [edx], xmm2
- }
- }
- /*
- ============
- SSE_TestTrigonometry
- ============
- */
- void SSE_TestTrigonometry( void ) {
- int i;
- float a, s1, s2, c1, c2;
- for ( i = 0; i < 100; i++ ) {
- a = i * idMath::HALF_PI / 100.0f;
- s1 = sin( a );
- s2 = SSE_SinZeroHalfPI( a );
- if ( fabs( s1 - s2 ) > 1e-7f ) {
- assert( 0 );
- }
- c1 = cos( a );
- c2 = SSE_CosZeroHalfPI( a );
- if ( fabs( c1 - c2 ) > 1e-7f ) {
- assert( 0 );
- }
- }
- for ( i = -200; i < 200; i++ ) {
- a = i * idMath::TWO_PI / 100.0f;
- s1 = sin( a );
- s2 = SSE_Sin( a );
- if ( fabs( s1 - s2 ) > 1e-6f ) {
- assert( 0 );
- }
- c1 = cos( a );
- c2 = SSE_Cos( a );
- if ( fabs( c1 - c2 ) > 1e-6f ) {
- assert( 0 );
- }
- SSE_SinCos( a, s2, c2 );
- if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
- assert( 0 );
- }
- }
- }
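- /*
- ============
- SSE_TestATan_Example
- A similar sanity check for the atan routines ( illustrative sketch; the
- inputs stay positive so atan2 and SSE_ATan agree, and the tolerance is
- chosen loosely to cover the rcpss-based division ).
- ============
- */
- static void SSE_TestATan_Example( void ) {
- int i;
- float y, x, t1, t2;
- for ( i = 1; i < 100; i++ ) {
- y = i * 0.1f;
- x = ( 100 - i ) * 0.1f;
- t1 = atan2( y, x );
- t2 = SSE_ATan( y, x );
- if ( fabs( t1 - t2 ) > 1e-5f ) {
- assert( 0 );
- }
- }
- }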
- /*
- ============
- idSIMD_SSE::GetName
- ============
- */
- const char * idSIMD_SSE::GetName( void ) const {
- return "MMX & SSE";
- }
- /*
- ============
- idSIMD_SSE::Add
- dst[i] = constant + src[i];
- ============
- */
- void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
- KFLOAT_CA( add, dst, src, constant, count )
- }
- /*
- ============
- idSIMD_SSE::Add
- dst[i] = src0[i] + src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
- KFLOAT_AA( add, dst, src0, src1, count )
- }
- /*
- ============
- idSIMD_SSE::Sub
- dst[i] = constant - src[i];
- ============
- */
- void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
- KFLOAT_CA( sub, dst, src, constant, count )
- }
- /*
- ============
- idSIMD_SSE::Sub
- dst[i] = src0[i] - src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
- KFLOAT_AA( sub, dst, src0, src1, count )
- }
- /*
- ============
- idSIMD_SSE::Mul
- dst[i] = constant * src[i];
- ============
- */
- void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
- KFLOAT_CA( mul, dst, src, constant, count )
- }
- /*
- ============
- idSIMD_SSE::Mul
- dst[i] = src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
- KFLOAT_AA( mul, dst, src0, src1, count )
- }
- /*
- ============
- idSIMD_SSE::Div
- dst[i] = constant / src[i];
- ============
- */
- void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
- int pre, post;
- // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
- __asm
- {
- movss xmm1,constant
- shufps xmm1,xmm1,0
- KFLOATINITDS( dst, src, count, pre, post )
- and eax,15
- jne lpNA
- jmp lpA
- align 16
- lpA:
- movaps xmm2,[edx+ebx]
- movaps xmm3,[edx+ebx+16]
- rcpps xmm4,xmm2
- rcpps xmm5,xmm3
- prefetchnta [edx+ebx+64]
- mulps xmm2,xmm4
- mulps xmm2,xmm4
- mulps xmm3,xmm5
- mulps xmm3,xmm5
- addps xmm4,xmm4
- addps xmm5,xmm5
- subps xmm4,xmm2
- subps xmm5,xmm3
- mulps xmm4,xmm1
- mulps xmm5,xmm1
- movaps [edi+ebx],xmm4
- movaps [edi+ebx+16],xmm5
- add ebx,16*2
- jl lpA
- jmp done
- align 16
- lpNA:
- movups xmm2,[edx+ebx]
- movups xmm3,[edx+ebx+16]
- rcpps xmm4,xmm2
- rcpps xmm5,xmm3
- prefetchnta [edx+ebx+64]
- mulps xmm2,xmm4
- mulps xmm2,xmm4
- mulps xmm3,xmm5
- mulps xmm3,xmm5
- addps xmm4,xmm4
- addps xmm5,xmm5
- subps xmm4,xmm2
- subps xmm5,xmm3
- mulps xmm4,xmm1
- mulps xmm5,xmm1
- movaps [edi+ebx],xmm4
- movaps [edi+ebx+16],xmm5
- add ebx,16*2
- jl lpNA
- done:
- mov edx,src
- mov edi,dst
- KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
- KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
- }
- }
- /*
- ============
- idSIMD_SSE::Div
- dst[i] = src0[i] / src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
- int pre,post;
- // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
- __asm
- {
- KFLOATINITDSS( dst, src0, src1, count, pre, post )
- and eax,15
- jne lpNA
- jmp lpA
- align 16
- lpA:
- movaps xmm2,[esi+ebx]
- movaps xmm3,[esi+ebx+16]
- rcpps xmm4,xmm2
- rcpps xmm5,xmm3
- prefetchnta [esi+ebx+64]
- mulps xmm2,xmm4
- mulps xmm2,xmm4
- mulps xmm3,xmm5
- mulps xmm3,xmm5
- addps xmm4,xmm4
- addps xmm5,xmm5
- subps xmm4,xmm2
- subps xmm5,xmm3
- mulps xmm4,[edx+ebx]
- mulps xmm5,[edx+ebx+16]
- movaps [edi+ebx],xmm4
- movaps [edi+ebx+16],xmm5
- add ebx,16*2
- jl lpA
- jmp done
- align 16
- lpNA:
- movups xmm2,[esi+ebx]
- movups xmm3,[esi+ebx+16]
- rcpps xmm4,xmm2
- rcpps xmm5,xmm3
- prefetchnta [esi+ebx+64]
- mulps xmm2,xmm4
- mulps xmm2,xmm4
- mulps xmm3,xmm5
- mulps xmm3,xmm5
- addps xmm4,xmm4
- addps xmm5,xmm5
- subps xmm4,xmm2
- subps xmm5,xmm3
- movups xmm2,[edx+ebx]
- movups xmm3,[edx+ebx+16]
- mulps xmm4,xmm2
- mulps xmm5,xmm3
- movaps [edi+ebx],xmm4
- movaps [edi+ebx+16],xmm5
- add ebx,16*2
- jl lpNA
- done:
- mov edx,src0
- mov esi,src1
- mov edi,dst
- KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
- KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
- }
- }
- /*
- ============
- Simd_MulAdd
- assumes count >= 7
- ============
- */
- static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
- __asm mov esi, dst
- __asm mov edi, src
- __asm mov eax, count
- __asm shl eax, 2
- __asm mov ecx, esi
- __asm mov edx, eax
- __asm or ecx, edi
- __asm fld constant
- __asm and ecx, 15
- __asm jz SimdMulAdd16
- __asm and ecx, 3
- __asm jnz SimdMulAdd8
- __asm mov ecx, esi
- __asm xor ecx, edi
- __asm and ecx, 15
- __asm jnz MulAdd8
- __asm mov ecx, esi
- __asm and ecx, 15
- __asm neg ecx
- __asm add ecx, 16
- __asm sub eax, ecx
- __asm add edi, ecx
- __asm add esi, ecx
- __asm neg ecx
- __asm mov edx, eax
- __asm loopPreMulAdd16:
- __asm fld st
- __asm fmul dword ptr [edi+ecx]
- __asm fadd dword ptr [esi+ecx]
- __asm fstp dword ptr [esi+ecx]
- __asm add ecx, 4
- __asm jl loopPreMulAdd16
- __asm SimdMulAdd16:
- __asm and eax, ~15
- __asm movss xmm1, constant
- __asm shufps xmm1, xmm1, 0x00
- __asm add esi, eax
- __asm add edi, eax
- __asm neg eax
- __asm align 16
- __asm loopMulAdd16:
- __asm movaps xmm0, [edi+eax]
- __asm mulps xmm0, xmm1
- __asm addps xmm0, [esi+eax]
- __asm movaps [esi+eax], xmm0
- __asm add eax, 16
- __asm jl loopMulAdd16
- __asm jmp postMulAdd
- __asm MulAdd8:
- __asm mov ecx, esi
- __asm and ecx, 7
- __asm jz SimdMulAdd8
- __asm sub eax, ecx
- __asm add esi, ecx
- __asm add edi, ecx
- __asm neg ecx
- __asm mov edx, eax
- __asm loopPreMulAdd8:
- __asm fld st
- __asm fmul dword ptr [edi+ecx]
- __asm fadd dword ptr [esi+ecx]
- __asm fstp dword ptr [esi+ecx]
- __asm add ecx, 4
- __asm jl loopPreMulAdd8
- __asm SimdMulAdd8:
- __asm and eax, ~15
- __asm movss xmm1, constant
- __asm shufps xmm1, xmm1, 0x00
- __asm add esi, eax
- __asm add edi, eax
- __asm neg eax
- __asm align 16
- __asm loopMulAdd8:
- __asm movlps xmm0, [edi+eax]
- __asm movhps xmm0, [edi+eax+8]
- __asm mulps xmm0, xmm1
- __asm movlps xmm2, [esi+eax]
- __asm movhps xmm2, [esi+eax+8]
- __asm addps xmm0, xmm2
- __asm movlps [esi+eax], xmm0
- __asm movhps [esi+eax+8], xmm0
- __asm add eax, 16
- __asm jl loopMulAdd8
- __asm jmp postMulAdd
- __asm postMulAdd:
- __asm and edx, 15
- __asm jz MulAddDone
- __asm add esi, edx
- __asm add edi, edx
- __asm neg edx
- __asm loopPostMulAdd:
- __asm fld st
- __asm fmul dword ptr [edi+edx]
- __asm fadd dword ptr [esi+edx]
- __asm fstp dword ptr [esi+edx]
- __asm add edx, 4
- __asm jl loopPostMulAdd
- __asm MulAddDone:
- __asm fstp st
- }
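- /*
- ============
- Simd_MulAdd_Reference
- Scalar sketch of what Simd_MulAdd computes ( illustrative ): the x87
- pre/post loops and the SSE core above all evaluate this loop, differing
- only in how they reach and keep 16-byte alignment on 'dst' and 'src'.
- ============
- */
- static void Simd_MulAdd_Reference( float *dst, const float constant, const float *src, const int count ) {
- for ( int i = 0; i < count; i++ ) {
- dst[i] += constant * src[i];
- }
- }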
- #define MULADD_FEW( OPER ) \
- switch( count ) { \
- case 0: \
- return; \
- case 1: \
- dst[0] OPER c * src[0]; \
- return; \
- case 2: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
- return; \
- case 3: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
- return; \
- case 4: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- return; \
- case 5: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; \
- return; \
- case 6: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
- return; \
- case 7: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
- return; \
- case 8: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
- return; \
- case 9: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
- dst[8] OPER c * src[8]; \
- return; \
- case 10: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
- dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
- return; \
- case 11: \
- dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
- dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
- dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
- return; \
- }
- /*
- ============
- idSIMD_SSE::MulAdd
- dst[i] += constant * src[i];
- ============
- */
- void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
- float c = constant;
- MULADD_FEW( += )
- Simd_MulAdd( dst, constant, src, count );
- }
- /*
- ============
- idSIMD_SSE::MulAdd
- dst[i] += src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
- for ( int i = 0; i < count; i++ ) {
- dst[i] += src0[i] * src1[i];
- }
- }
- /*
- ============
- idSIMD_SSE::MulSub
- dst[i] -= constant * src[i];
- ============
- */
- void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
- float c = constant;
- MULADD_FEW( -= )
- Simd_MulAdd( dst, -constant, src, count );
- }
- /*
- ============
- idSIMD_SSE::MulSub
- dst[i] -= src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
- for ( int i = 0; i < count; i++ ) {
- dst[i] -= src0[i] * src1[i];
- }
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant * src[i];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
- __asm
- {
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- and eax, ~3
- movss xmm4, [edi+0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm5, [edi+4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- jz done4
- imul eax, 12
- add esi, eax
- neg eax
- loop4:
- movlps xmm1, [esi+eax+ 0]
- movlps xmm2, [esi+eax+ 8]
- movlps xmm3, [esi+eax+16]
- movhps xmm1, [esi+eax+24]
- movhps xmm2, [esi+eax+32]
- movhps xmm3, [esi+eax+40]
- movaps xmm0, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
- shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
- add ecx, 16
- add eax, 4*12
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- addps xmm0, xmm1
- addps xmm0, xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loop4
- done4:
- and edx, 3
- jz done1
- loop1:
- movss xmm0, [esi+eax+0]
- movss xmm1, [esi+eax+4]
- movss xmm2, [esi+eax+8]
- mulss xmm0, xmm4
- mulss xmm1, xmm5
- mulss xmm2, xmm6
- add ecx, 4
- addss xmm0, xmm1
- add eax, 12
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loop1
- done1:
- }
- }
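- /*
- ============
- Dot_Vec3Constant_Reference
- Scalar sketch of the routine above ( illustrative ): the SIMD loop handles
- four vectors per iteration by shuffling their x, y and z components into
- separate registers before multiplying.
- ============
- */
- static void Dot_Vec3Constant_Reference( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
- for ( int i = 0; i < count; i++ ) {
- dst[i] = constant.x * src[i].x + constant.y * src[i].y + constant.z * src[i].z;
- }
- }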
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant * src[i].Normal() + src[i][3];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
- __asm {
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- and eax, ~3
- movss xmm5, [edi+0]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [edi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- jz startVert1
- imul eax, 16
- add esi, eax
- neg eax
- loopVert4:
- movlps xmm1, [esi+eax+ 0]
- movlps xmm3, [esi+eax+ 8]
- movhps xmm1, [esi+eax+16]
- movhps xmm3, [esi+eax+24]
- movlps xmm2, [esi+eax+32]
- movlps xmm4, [esi+eax+40]
- movhps xmm2, [esi+eax+48]
- movhps xmm4, [esi+eax+56]
- movaps xmm0, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
- movaps xmm2, xmm3
- shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
- add ecx, 16
- add eax, 4*16
- mulps xmm0, xmm5
- mulps xmm1, xmm6
- mulps xmm2, xmm7
- addps xmm0, xmm3
- addps xmm0, xmm1
- addps xmm0, xmm2
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loopVert4
- startVert1:
- and edx, 3
- jz done
- loopVert1:
- movss xmm0, [esi+eax+0]
- movss xmm1, [esi+eax+4]
- movss xmm2, [esi+eax+8]
- mulss xmm0, xmm5
- mulss xmm1, xmm6
- mulss xmm2, xmm7
- addss xmm0, [esi+eax+12]
- add ecx, 4
- addss xmm0, xmm1
- add eax, 16
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loopVert1
- done:
- }
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant * src[i].xyz;
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- // 0, 1, 2
- // 3, 4, 5
- // 6, 7, 8
- // 9, 10, 11
- __asm {
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- and eax, ~3
- movss xmm4, [edi+0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm5, [edi+4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- jz startVert1
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- loopVert4:
- movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
- movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
- movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
- movaps xmm1, xmm0 // 3, X, 0, 1
- movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
- shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
- movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
- movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
- shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
- movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
- shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
- movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
- add ecx, 16
- add eax, 4*DRAWVERT_SIZE
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- addps xmm0, xmm1
- addps xmm0, xmm2
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loopVert4
- startVert1:
- and edx, 3
- jz done
- loopVert1:
- movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
- movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
- movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
- mulss xmm0, xmm4
- mulss xmm1, xmm5
- mulss xmm2, xmm6
- add ecx, 4
- addss xmm0, xmm1
- add eax, DRAWVERT_SIZE
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loopVert1
- done:
- }
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant.Normal() * src[i] + constant[3];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
- __asm
- {
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- and eax, ~3
- movss xmm4, [edi+0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm5, [edi+4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [edi+12]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- jz done4
- imul eax, 12
- add esi, eax
- neg eax
- loop4:
- movlps xmm1, [esi+eax+ 0]
- movlps xmm2, [esi+eax+ 8]
- movlps xmm3, [esi+eax+16]
- movhps xmm1, [esi+eax+24]
- movhps xmm2, [esi+eax+32]
- movhps xmm3, [esi+eax+40]
- movaps xmm0, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
- shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
- add ecx, 16
- add eax, 4*12
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- addps xmm0, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loop4
- done4:
- and edx, 3
- jz done1
- loop1:
- movss xmm0, [esi+eax+0]
- movss xmm1, [esi+eax+4]
- movss xmm2, [esi+eax+8]
- mulss xmm0, xmm4
- mulss xmm1, xmm5
- mulss xmm2, xmm6
- addss xmm0, xmm7
- add ecx, 4
- addss xmm0, xmm1
- add eax, 12
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loop1
- done1:
- }
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
- #define SINGLE_OP(SRC, DEST) \
- __asm movlps xmm0,[SRC] \
- __asm movlps xmm1,[SRC+8] \
- __asm mulps xmm0,xmm4 \
- __asm mulps xmm1,xmm5 \
- __asm addps xmm0,xmm1 \
- __asm movaps xmm1,xmm0 \
- __asm shufps xmm1,xmm1,SHUFFLEPS(1,1,1,1) \
- __asm addss xmm0,xmm1 \
- __asm movss [DEST],xmm0 \
- __asm add SRC,16 \
- __asm add DEST,4
- #define DUAL_OP(SRC, DEST) \
- __asm movlps xmm0,[SRC] \
- __asm movlps xmm1,[SRC+8] \
- __asm movhps xmm0,[SRC+16] \
- __asm movhps xmm1,[SRC+24] \
- __asm mulps xmm0,xmm4 \
- __asm mulps xmm1,xmm5 \
- __asm addps xmm0,xmm1 \
- __asm shufps xmm1,xmm0,SHUFFLEPS(2,0,1,0) \
- __asm shufps xmm0,xmm0,SHUFFLEPS(3,1,2,0) \
- __asm addps xmm0,xmm1 \
- __asm movhps [DEST],xmm0 \
- __asm add SRC,32 \
- __asm add DEST,8
- __asm {
- mov edx, dst
- mov eax, src
- mov ebx, constant
- mov ecx, count
- movlps xmm4, [ebx]
- shufps xmm4, xmm4, SHUFFLEPS(1,0,1,0)
- movlps xmm5, [ebx+8]
- shufps xmm5, xmm5, SHUFFLEPS(1,0,1,0)
- xorps xmm0, xmm0
- xorps xmm1, xmm1
- _lpAlignDest:
- test edx, 0x0f
- jz _destAligned
- SINGLE_OP(eax,edx)
- dec ecx
- jnz _lpAlignDest
- jmp _vpExit
- _destAligned:
- push ecx
- cmp ecx, 4
- jl _post
- and ecx, ~3
- shl ecx, 2
- lea eax, [eax+ecx*4]
- add edx, ecx
- neg ecx
- movlps xmm0, [eax+ecx*4]
- movhps xmm0, [eax+ecx*4+16]
- movlps xmm2, [eax+ecx*4+32]
- movhps xmm2, [eax+ecx*4+48]
- jmp _lpStart
- align 16
- _lp:
- prefetchnta [eax+ecx*4+128]
- addps xmm1, xmm0
- movlps xmm0, [eax+ecx*4]
- movhps xmm0, [eax+ecx*4+16]
- movlps xmm2, [eax+ecx*4+32]
- movhps xmm2, [eax+ecx*4+48]
- movaps [edx+ecx-16],xmm1
- _lpStart:
- movlps xmm1, [eax+ecx*4+8]
- movhps xmm1, [eax+ecx*4+24]
- movlps xmm3, [eax+ecx*4+40]
- movhps xmm3, [eax+ecx*4+56]
- add ecx, 16
- mulps xmm1, xmm5
- mulps xmm2, xmm4
- mulps xmm3, xmm5
- addps xmm2, xmm3 // y3+w3 x3+z3 y2+w2 x2+z2
- mulps xmm0, xmm4
- addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0
- movaps xmm1, xmm0
- shufps xmm0, xmm2, SHUFFLEPS(2,0,2,0) // x3+z3 x2+z2 x1+z1 x0+z0
- shufps xmm1, xmm2, SHUFFLEPS(3,1,3,1) // y3+w3 y2+w2 y1+w1 y0+w0
- js _lp
- addps xmm1, xmm0
- movaps [edx+ecx-16], xmm1
- _post:
- pop ecx
- and ecx, 0x3
- cmp ecx, 2
- jl _post1
- DUAL_OP(eax,edx)
- sub ecx, 2
- _post1:
- cmp ecx, 1
- jne _vpExit
- SINGLE_OP(eax,edx)
- _vpExit:
- }
- #undef DUAL_OP
- #undef SINGLE_OP
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = constant.Normal() * src[i].xyz + constant[3];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- // 0, 1, 2
- // 3, 4, 5
- // 6, 7, 8
- // 9, 10, 11
- __asm {
- mov eax, count
- mov edi, constant
- mov edx, eax
- mov esi, src
- mov ecx, dst
- and eax, ~3
- movss xmm4, [edi+0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm5, [edi+4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [edi+8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [edi+12]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- jz startVert1
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- loopVert4:
- movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
- movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
- movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
- movaps xmm1, xmm0 // 3, X, 0, 1
- movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
- shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
- movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
- movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
- shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
- movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
- shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
- movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
- add ecx, 16
- add eax, 4*DRAWVERT_SIZE
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- addps xmm0, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm2
- movlps [ecx-16+0], xmm0
- movhps [ecx-16+8], xmm0
- jl loopVert4
- startVert1:
- and edx, 3
- jz done
- loopVert1:
- movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
- movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
- movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
- mulss xmm0, xmm4
- mulss xmm1, xmm5
- mulss xmm2, xmm6
- addss xmm0, xmm7
- add ecx, 4
- addss xmm0, xmm1
- add eax, DRAWVERT_SIZE
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loopVert1
- done:
- }
- }
- /*
- ============
- idSIMD_SSE::Dot
- dst[i] = src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
- __asm
- {
- mov eax, count
- mov edi, src0
- mov edx, eax
- mov esi, src1
- mov ecx, dst
- and eax, ~3
- jz done4
- imul eax, 12
- add edi, eax
- add esi, eax
- neg eax
- loop4:
- movlps xmm0, [esi+eax] // 0, 1, X, X
- movlps xmm3, [edi+eax] // 0, 1, X, X
- movlps xmm1, [esi+eax+8] // 2, 3, X, X
- movlps xmm4, [edi+eax+8] // 2, 3, X, X
- movhps xmm0, [esi+eax+24] // 0, 1, 6, 7
- movhps xmm3, [edi+eax+24] // 0, 1, 6, 7
- movhps xmm1, [esi+eax+32] // 2, 3, 8, 9
- movhps xmm4, [edi+eax+32] // 2, 3, 8, 9
- movlps xmm2, [esi+eax+16] // 4, 5, X, X
- movlps xmm5, [edi+eax+16] // 4, 5, X, X
- movhps xmm2, [esi+eax+40] // 4, 5, 10, 11
- movhps xmm5, [edi+eax+40] // 4, 5, 10, 11
- add ecx, 16
- add eax, 48
- mulps xmm0, xmm3
- mulps xmm1, xmm4
- mulps xmm2, xmm5
- movaps xmm7, xmm0
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) // 0, 6, 3, 9
- shufps xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 ) // 1, 7, 4, 10
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) // 2, 8, 5, 11
- addps xmm7, xmm0
- addps xmm7, xmm1
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
- movlps [ecx-16+0], xmm7
- movhps [ecx-16+8], xmm7
- jl loop4
- done4:
- and edx, 3
- jz done1
- loop1:
- movss xmm0, [esi+eax+0]
- movss xmm3, [edi+eax+0]
- movss xmm1, [esi+eax+4]
- movss xmm4, [edi+eax+4]
- movss xmm2, [esi+eax+8]
- movss xmm5, [edi+eax+8]
- mulss xmm0, xmm3
- mulss xmm1, xmm4
- mulss xmm2, xmm5
- add ecx, 4
- addss xmm0, xmm1
- add eax, 12
- addss xmm0, xmm2
- dec edx
- movss [ecx-4], xmm0
- jnz loop1
- done1:
- }
- }
- /*
- ============
- idSIMD_SSE::Dot
- dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
- ============
- */
- void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
- switch( count ) {
- case 0:
- dot = 0.0f;
- return;
- case 1:
- dot = src1[0] * src2[0];
- return;
- case 2:
- dot = src1[0] * src2[0] + src1[1] * src2[1];
- return;
- case 3:
- dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
- return;
- default:
- __asm {
- mov ecx, src1
- mov edx, src2
- mov eax, ecx
- or eax, edx
- and eax, 15
- jz alignedDot
- // unaligned
- mov eax, count
- shr eax, 2
- shl eax, 4
- add ecx, eax
- add edx, eax
- neg eax
- movups xmm0, [ecx+eax]
- movups xmm1, [edx+eax]
- mulps xmm0, xmm1
- add eax, 16
- jz doneDot
- loopUnalignedDot:
- movups xmm1, [ecx+eax]
- movups xmm2, [edx+eax]
- mulps xmm1, xmm2
- addps xmm0, xmm1
- add eax, 16
- jl loopUnalignedDot
- jmp doneDot
- // aligned
- alignedDot:
- mov eax, count
- shr eax, 2
- shl eax, 4
- add ecx, eax
- add edx, eax
- neg eax
- movaps xmm0, [ecx+eax]
- movaps xmm1, [edx+eax]
- mulps xmm0, xmm1
- add eax, 16
- jz doneDot
- loopAlignedDot:
- movaps xmm1, [ecx+eax]
- movaps xmm2, [edx+eax]
- mulps xmm1, xmm2
- addps xmm0, xmm1
- add eax, 16
- jl loopAlignedDot
- doneDot:
- }
- switch( count & 3 ) {
- case 1:
- __asm {
- movss xmm1, [ecx]
- movss xmm2, [edx]
- mulss xmm1, xmm2
- addss xmm0, xmm1
- }
- break;
- case 2:
- __asm {
- xorps xmm2, xmm2
- movlps xmm1, [ecx]
- movlps xmm2, [edx]
- mulps xmm1, xmm2
- addps xmm0, xmm1
- }
- break;
- case 3:
- __asm {
- movss xmm1, [ecx]
- movhps xmm1, [ecx+4]
- movss xmm2, [edx]
- movhps xmm2, [edx+4]
- mulps xmm1, xmm2
- addps xmm0, xmm1
- }
- break;
- }
- __asm {
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm1
- mov eax, dot
- movss [eax], xmm0
- }
- return;
- }
- }
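- /*
- ============
- HorizontalAdd4_Reference
- The movhlps/addps then shufps/addss tail above is a horizontal add of the
- four partial sums accumulated in xmm0; a scalar sketch of the same
- reduction ( illustrative ):
- ============
- */
- static float HorizontalAdd4_Reference( const float p[4] ) {
- return ( p[0] + p[2] ) + ( p[1] + p[3] ); // movhlps+addps, then shufps+addss
- }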
- //
- // cmpeqps == Equal
- // cmpneqps != Not Equal
- // cmpltps < Less Than
- // cmpnltps >= Not Less Than
- // cmpnleps > Not Less Or Equal
- //
- // FLIP inverts the 4-bit movmskps result ( not al ), which is how CmpLE
- // is built from cmpnleps below; the other compares use NOFLIP.
- //
- #define FLIP not al
- #define NOFLIP
- #define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
- int i, cnt, pre, post; \
- float *aligned; \
- \
- /* if the float array is not aligned on a 4 byte boundary */ \
- if ( ((int) SRC0) & 3 ) { \
- /* unaligned memory access */ \
- pre = 0; \
- cnt = COUNT >> 2; \
- post = COUNT - (cnt<<2); \
- __asm mov edx, cnt \
- __asm test edx, edx \
- __asm je doneCmp \
- __asm push ebx \
- __asm neg edx \
- __asm mov esi, SRC0 \
- __asm prefetchnta [esi+64] \
- __asm movss xmm1, CONSTANT \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mov edi, DST \
- __asm mov ecx, 0x01010101 \
- __asm loopNA: \
- __asm movups xmm0, [esi] \
- __asm prefetchnta [esi+128] \
- __asm CMPSIMD xmm0, xmm1 \
- __asm movmskps eax, xmm0 \
- __asm DOFLIP \
- __asm mov ah, al \
- __asm shr ah, 1 \
- __asm mov bx, ax \
- __asm shl ebx, 14 \
- __asm mov bx, ax \
- __asm and ebx, ecx \
- __asm mov dword ptr [edi], ebx \
- __asm add esi, 16 \
- __asm add edi, 4 \
- __asm inc edx \
- __asm jl loopNA \
- __asm pop ebx \
- } \
- else { \
- /* aligned memory access */ \
- aligned = (float *) ((((int) SRC0) + 15) & ~15); \
- if ( (int)aligned > ((int)src0) + COUNT ) { \
- pre = COUNT; \
- post = 0; \
- } \
- else { \
- pre = aligned - SRC0; \
- cnt = (COUNT - pre) >> 2; \
- post = COUNT - pre - (cnt<<2); \
- __asm mov edx, cnt \
- __asm test edx, edx \
- __asm je doneCmp \
- __asm push ebx \
- __asm neg edx \
- __asm mov esi, aligned \
- __asm prefetchnta [esi+64] \
- __asm movss xmm1, CONSTANT \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mov edi, DST \
- __asm add edi, pre \
- __asm mov ecx, 0x01010101 \
- __asm loopA: \
- __asm movaps xmm0, [esi] \
- __asm prefetchnta [esi+128] \
- __asm CMPSIMD xmm0, xmm1 \
- __asm movmskps eax, xmm0 \
- __asm DOFLIP \
- __asm mov ah, al \
- __asm shr ah, 1 \
- __asm mov bx, ax \
- __asm shl ebx, 14 \
- __asm mov bx, ax \
- __asm and ebx, ecx \
- __asm mov dword ptr [edi], ebx \
- __asm add esi, 16 \
- __asm add edi, 4 \
- __asm inc edx \
- __asm jl loopA \
- __asm pop ebx \
- } \
- } \
- doneCmp: \
- double c = constant; \
- for ( i = 0; i < pre; i++ ) { \
- dst[i] = src0[i] CMP c; \
- } \
- for ( i = count - post; i < count; i++ ) { \
- dst[i] = src0[i] CMP c; \
- }
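- /*
- ============
- SpreadMask4_Reference
- The mov/shr/shl/and sequence after movmskps in the macros above fans the
- 4-bit compare mask out into four 0/1 bytes so that one 32-bit store writes
- four results; a scalar sketch of the mapping ( illustrative ):
- ============
- */
- static void SpreadMask4_Reference( byte *dst, int mask ) {
- dst[0] = (byte)( ( mask >> 0 ) & 1 );
- dst[1] = (byte)( ( mask >> 1 ) & 1 );
- dst[2] = (byte)( ( mask >> 2 ) & 1 );
- dst[3] = (byte)( ( mask >> 3 ) & 1 );
- }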
- #define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
- int i, cnt, pre, post; \
- float *aligned; \
- \
- /* if the float array is not aligned on a 4 byte boundary */ \
- if ( ((int) SRC0) & 3 ) { \
- /* unaligned memory access */ \
- pre = 0; \
- cnt = COUNT >> 2; \
- post = COUNT - (cnt<<2); \
- __asm mov edx, cnt \
- __asm test edx, edx \
- __asm je doneCmp \
- __asm push ebx \
- __asm neg edx \
- __asm mov esi, SRC0 \
- __asm prefetchnta [esi+64] \
- __asm movss xmm1, CONSTANT \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mov edi, DST \
- __asm mov cl, bitNum \
- __asm loopNA: \
- __asm movups xmm0, [esi] \
- __asm prefetchnta [esi+128] \
- __asm CMPSIMD xmm0, xmm1 \
- __asm movmskps eax, xmm0 \
- __asm DOFLIP \
- __asm mov ah, al \
- __asm shr ah, 1 \
- __asm mov bx, ax \
- __asm shl ebx, 14 \
- __asm mov bx, ax \
- __asm and ebx, 0x01010101 \
- __asm shl ebx, cl \
- __asm or ebx, dword ptr [edi] \
- __asm mov dword ptr [edi], ebx \
- __asm add esi, 16 \
- __asm add edi, 4 \
- __asm inc edx \
- __asm jl loopNA \
- __asm pop ebx \
- } \
- else { \
- /* aligned memory access */ \
- aligned = (float *) ((((int) SRC0) + 15) & ~15); \
- if ( (int)aligned > ((int)src0) + COUNT ) { \
- pre = COUNT; \
- post = 0; \
- } \
- else { \
- pre = aligned - SRC0; \
- cnt = (COUNT - pre) >> 2; \
- post = COUNT - pre - (cnt<<2); \
- __asm mov edx, cnt \
- __asm test edx, edx \
- __asm je doneCmp \
- __asm push ebx \
- __asm neg edx \
- __asm mov esi, aligned \
- __asm prefetchnta [esi+64] \
- __asm movss xmm1, CONSTANT \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mov edi, DST \
- __asm add edi, pre \
- __asm mov cl, bitNum \
- __asm loopA: \
- __asm movaps xmm0, [esi] \
- __asm prefetchnta [esi+128] \
- __asm CMPSIMD xmm0, xmm1 \
- __asm movmskps eax, xmm0 \
- __asm DOFLIP \
- __asm mov ah, al \
- __asm shr ah, 1 \
- __asm mov bx, ax \
- __asm shl ebx, 14 \
- __asm mov bx, ax \
- __asm and ebx, 0x01010101 \
- __asm shl ebx, cl \
- __asm or ebx, dword ptr [edi] \
- __asm mov dword ptr [edi], ebx \
- __asm add esi, 16 \
- __asm add edi, 4 \
- __asm inc edx \
- __asm jl loopA \
- __asm pop ebx \
- } \
- } \
- doneCmp: \
- float c = constant; \
- for ( i = 0; i < pre; i++ ) { \
- dst[i] |= ( src0[i] CMP c ) << BITNUM; \
- } \
- for ( i = count - post; i < count; i++ ) { \
- dst[i] |= ( src0[i] CMP c ) << BITNUM; \
- }
- /*
- ============
- idSIMD_SSE::CmpGT
- dst[i] = src0[i] > constant;
- ============
- */
- void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
- COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpGT
- dst[i] |= ( src0[i] > constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpGE
- dst[i] = src0[i] >= constant;
- ============
- */
- void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
- COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpGE
- dst[i] |= ( src0[i] >= constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpLT
- dst[i] = src0[i] < constant;
- ============
- */
- void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
- COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpLT
- dst[i] |= ( src0[i] < constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpLE
- dst[i] = src0[i] <= constant;
- ============
- */
- void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
- COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
- }
- /*
- ============
- idSIMD_SSE::CmpLE
- dst[i] |= ( src0[i] <= constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
- }
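- /*
- ============
- Usage sketch for the bitNum variants ( illustrative; CullExample, the bit
- assignments and the use of the global SIMDProcessor pointer are assumptions,
- not code from this file ): dst must be zeroed first because each call ORs
- its bit into the existing bytes.
- ============
- */
- static void CullExample( byte *cull, const float *dist, const int count ) {
- memset( cull, 0, count );
- SIMDProcessor->CmpLT( cull, 0, dist, 0.0f, count ); // bit 0: behind the plane
- SIMDProcessor->CmpGT( cull, 1, dist, 1.0f, count ); // bit 1: beyond unit distance
- }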
- /*
- ============
- idSIMD_SSE::MinMax
- ============
- */
- void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
- int i, pre, post;
- min = idMath::INFINITY; max = -idMath::INFINITY;
- __asm
- {
- push ebx
- mov eax, min
- mov ebx, max
- movss xmm0, [eax]
- movss xmm1, [ebx]
- shufps xmm0, xmm0, 0
- shufps xmm1, xmm1, 0
- KFLOATINITS( src, count, pre, post )
- and eax, 15
- jz lpA
- jmp lpNA
- align 16
- lpNA:
- movups xmm2, [edx+ebx]
- movups xmm3, [edx+ebx+16]
- minps xmm0, xmm2
- maxps xmm1, xmm2
- prefetchnta [edx+ebx+64]
- minps xmm0, xmm3
- maxps xmm1, xmm3
- add ebx, 16*2
- jl lpNA
- jmp done2
- lpA:
- movaps xmm2, [edx+ebx]
- movaps xmm3, [edx+ebx+16]
- minps xmm0, xmm2
- maxps xmm1, xmm2
- prefetchnta [edx+ebx+64]
- minps xmm0, xmm3
- maxps xmm1, xmm3
- add ebx, 16*2
- jl lpA
- jmp done2
- align 16
- done2:
- movaps xmm2, xmm0
- movaps xmm3, xmm1
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
- minss xmm0, xmm2
- maxss xmm1, xmm3
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
- minss xmm0, xmm2
- maxss xmm1, xmm3
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
- minss xmm0, xmm2
- maxss xmm1, xmm3
- mov eax, min
- mov ebx, max
- movss [eax], xmm0
- movss [ebx], xmm1
- done:
- pop ebx
- }
- for ( i = 0; i < pre; i++ ) {
- float tmp = src[i];
- if ( tmp > max ) {
- max = tmp;
- }
- if ( tmp < min ) {
- min = tmp;
- }
- }
- for ( i = count - post; i < count; i++ ) {
- float tmp = src[i];
- if ( tmp > max ) {
- max = tmp;
- }
- if ( tmp < min ) {
- min = tmp;
- }
- }
- }
- /*
- ============
- idSIMD_SSE::MinMax
- ============
- */
- void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
- __asm {
- mov eax, count
- test eax, eax
- movss xmm0, idMath::INFINITY
- xorps xmm1, xmm1
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- subps xmm1, xmm0
- jz done
- mov ecx, eax
- and ecx, 1
- mov esi, src
- jz startLoop
- movlps xmm2, [esi]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
- dec eax
- add esi, 2*4
- minps xmm0, xmm2
- maxps xmm1, xmm2
- startLoop:
- imul eax, 2*4
- add esi, eax
- neg eax
- loopVert:
- movlps xmm2, [esi+eax]
- movhps xmm2, [esi+eax+8]
- add eax, 4*4
- minps xmm0, xmm2
- maxps xmm1, xmm2
- jl loopVert
- done:
- movaps xmm2, xmm0
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
- minps xmm0, xmm2
- mov esi, min
- movlps [esi], xmm0
- movaps xmm3, xmm1
- shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
- maxps xmm1, xmm3
- mov edi, max
- movlps [edi], xmm1
- }
- }
- /*
- ============
- idSIMD_SSE::MinMax
- ============
- */
- void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
- __asm {
- movss xmm0, idMath::INFINITY
- xorps xmm1, xmm1
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- subps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, xmm1
- mov esi, src
- mov eax, count
- and eax, ~3
- jz done4
- imul eax, 12
- add esi, eax
- neg eax
- loop4:
- // prefetchnta [esi+4*12]
- movss xmm4, [esi+eax+0*12+8]
- movhps xmm4, [esi+eax+0*12+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- movss xmm5, [esi+eax+1*12+0]
- movhps xmm5, [esi+eax+1*12+4]
- minps xmm2, xmm5
- maxps xmm3, xmm5
- movss xmm6, [esi+eax+2*12+8]
- movhps xmm6, [esi+eax+2*12+0]
- minps xmm0, xmm6
- maxps xmm1, xmm6
- movss xmm7, [esi+eax+3*12+0]
- movhps xmm7, [esi+eax+3*12+4]
- minps xmm2, xmm7
- maxps xmm3, xmm7
- add eax, 4*12
- jl loop4
- done4:
- mov eax, count
- and eax, 3
- jz done1
- imul eax, 12
- add esi, eax
- neg eax
- loop1:
- movss xmm4, [esi+eax+0*12+8]
- movhps xmm4, [esi+eax+0*12+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- add eax, 12
- jl loop1
- done1:
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
- minps xmm0, xmm2
- maxps xmm1, xmm3
- mov esi, min
- movhps [esi], xmm0
- movss [esi+8], xmm0
- mov edi, max
- movhps [edi], xmm1
- movss [edi+8], xmm1
- }
- }
- /*
- ============
- idSIMD_SSE::MinMax
- ============
- */
- void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
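- // the asm below hard-codes the vertex stride and xyz offset, so the asserts
- // above catch any change to the idDrawVert layout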
- __asm {
- movss xmm0, idMath::INFINITY
- xorps xmm1, xmm1
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- subps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, xmm1
- mov esi, src
- mov eax, count
- and eax, ~3
- jz done4
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- loop4:
- // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
- movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- minps xmm2, xmm5
- maxps xmm3, xmm5
- movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm6
- maxps xmm1, xmm6
- movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- minps xmm2, xmm7
- maxps xmm3, xmm7
- add eax, 4*DRAWVERT_SIZE
- jl loop4
- done4:
- mov eax, count
- and eax, 3
- jz done1
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- loop1:
- movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- add eax, DRAWVERT_SIZE
- jl loop1
- done1:
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
- minps xmm0, xmm2
- maxps xmm1, xmm3
- mov esi, min
- movhps [esi], xmm0
- movss [esi+8], xmm0
- mov edi, max
- movhps [edi], xmm1
- movss [edi+8], xmm1
- }
- }
- /*
- ============
- idSIMD_SSE::MinMax
- ============
- */
- void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- __asm {
- movss xmm0, idMath::INFINITY
- xorps xmm1, xmm1
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- subps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, xmm1
- mov edi, indexes
- mov esi, src
- mov eax, count
- and eax, ~3
- jz done4
- shl eax, 2
- add edi, eax
- neg eax
- loop4:
- // prefetchnta [edi+128]
- // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
- mov edx, [edi+eax+0]
- imul edx, DRAWVERT_SIZE
- movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- mov edx, [edi+eax+4]
- imul edx, DRAWVERT_SIZE
- movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
- minps xmm2, xmm5
- maxps xmm3, xmm5
- mov edx, [edi+eax+8]
- imul edx, DRAWVERT_SIZE
- movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm6
- maxps xmm1, xmm6
- mov edx, [edi+eax+12]
- imul edx, DRAWVERT_SIZE
- movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
- minps xmm2, xmm7
- maxps xmm3, xmm7
- add eax, 4*4
- jl loop4
- done4:
- mov eax, count
- and eax, 3
- jz done1
- shl eax, 2
- add edi, eax
- neg eax
- loop1:
- mov edx, [edi+eax+0]
- imul edx, DRAWVERT_SIZE
- movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
- minps xmm0, xmm4
- maxps xmm1, xmm4
- add eax, 4
- jl loop1
- done1:
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
- minps xmm0, xmm2
- maxps xmm1, xmm3
- mov esi, min
- movhps [esi], xmm0
- movss [esi+8], xmm0
- mov edi, max
- movhps [edi], xmm1
- movss [edi+8], xmm1
- }
- }
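- /*
- A hedged scalar sketch of the indexed variant above (MinMaxIndexed_Scalar is
- a hypothetical name): the bounds are taken over the vertices selected by
- indexes, so a vertex may be visited more than once or not at all.
- */
- #if 0
- static void MinMaxIndexed_Scalar( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
- min.Set( idMath::INFINITY, idMath::INFINITY, idMath::INFINITY );
- max = -min;
- for ( int i = 0; i < count; i++ ) {
- const idVec3 &v = src[indexes[i]].xyz;
- if ( v.x < min.x ) { min.x = v.x; } if ( v.x > max.x ) { max.x = v.x; }
- if ( v.y < min.y ) { min.y = v.y; } if ( v.y > max.y ) { max.y = v.y; }
- if ( v.z < min.z ) { min.z = v.z; } if ( v.z > max.z ) { max.z = v.z; }
- }
- }
- #endif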
- /*
- ============
- idSIMD_SSE::Clamp
- ============
- */
- void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
- int i, pre, post;
- __asm
- {
- movss xmm0,min
- movss xmm1,max
- shufps xmm0,xmm0,0
- shufps xmm1,xmm1,0
- KFLOATINITDS( dst, src, count, pre, post )
- and eax,15
- jne lpNA
- jmp lpA
- align 16
- lpA:
- movaps xmm2,[edx+ebx]
- movaps xmm3,[edx+ebx+16]
- maxps xmm2,xmm0
- maxps xmm3,xmm0
- prefetchnta [edx+ebx+64]
- minps xmm2,xmm1
- minps xmm3,xmm1
- movaps [edi+ebx],xmm2
- movaps [edi+ebx+16],xmm3
- add ebx,16*2
- jl lpA
- jmp done
- align 16
- lpNA:
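- // only the source may be misaligned on this path: KFLOATINITDS aligns the
- // destination, which is why the stores below can stay movaps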
- movups xmm2,[edx+ebx]
- movups xmm3,[edx+ebx+16]
- maxps xmm2,xmm0
- maxps xmm3,xmm0
- prefetchnta [edx+ebx+64]
- minps xmm2,xmm1
- minps xmm3,xmm1
- movaps [edi+ebx],xmm2
- movaps [edi+ebx+16],xmm3
- add ebx,16*2
- jl lpNA
- done:
- }
- for ( i = 0; i < pre; i++ ) {
- if ( src[i] < min )
- dst[i] = min;
- else if ( src[i] > max )
- dst[i] = max;
- else
- dst[i] = src[i];
- }
- for( i = count - post; i < count; i++ ) {
- if ( src[i] < min )
- dst[i] = min;
- else if ( src[i] > max )
- dst[i] = max;
- else
- dst[i] = src[i];
- }
- }
- /*
- ============
- idSIMD_SSE::ClampMin
- ============
- */
- void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
- int i, pre, post;
- __asm
- {
- movss xmm0,min
- shufps xmm0,xmm0,0
- KFLOATINITDS( dst, src, count, pre, post )
- and eax,15
- jne lpNA
- jmp lpA
- align 16
- lpA:
- movaps xmm2,[edx+ebx]
- movaps xmm3,[edx+ebx+16]
- maxps xmm2,xmm0
- prefetchnta [edx+ebx+64]
- maxps xmm3,xmm0
- movaps [edi+ebx],xmm2
- movaps [edi+ebx+16],xmm3
- add ebx,16*2
- jl lpA
- jmp done
- align 16
- lpNA:
- movups xmm2,[edx+ebx]
- movups xmm3,[edx+ebx+16]
- maxps xmm2,xmm0
- prefetchnta [edx+ebx+64]
- maxps xmm3,xmm0
- movaps [edi+ebx],xmm2
- movaps [edi+ebx+16],xmm3
- add ebx,16*2
- jl lpNA
- done:
- }
- for( i = 0; i < pre; i++ ) {
- if ( src[i] < min )
- dst[i] = min;
- else
- dst[i] = src[i];
- }
- for( i = count - post; i < count; i++ ) {
- if ( src[i] < min )
- dst[i] = min;
- else
- dst[i] = src[i];
- }
- }
- /*
- ============
- idSIMD_SSE::ClampMax
- ============
- */
- void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
- int i, pre, post;
- __asm
- {
- movss xmm1,max
- shufps xmm1,xmm1,0
- KFLOATINITDS( dst, src, count, pre, post )
- and eax,15
- jne lpNA
- jmp lpA
- align 16
- lpA:
- movaps xmm2,[edx+ebx]
- movaps xmm3,[edx+ebx+16]
- minps xmm2,xmm1
- prefetchnta [edx+ebx+64]
- minps xmm3,xmm1
- movaps [edi+ebx],xmm2
- movaps [edi+ebx+16],xmm3
- add ebx,16*2
- jl lpA
- jmp done
- align 16
- lpNA:
- movups xmm2,[edx+ebx]
- movups xmm3,[edx+ebx+16]
- minps xmm2,xmm1
- prefetchnta [edx+ebx+64]
- minps xmm3,xmm1
- movaps [edi+ebx],xmm2
- movaps [edi+ebx+16],xmm3
- add ebx,16*2
- jl lpNA
- done:
- }
- for( i = 0; i < pre; i++ ) {
- if ( src[i] > max )
- dst[i] = max;
- else
- dst[i] = src[i];
- }
- for( i = count - post; i < count; i++ ) {
- if ( src[i] > max )
- dst[i] = max;
- else
- dst[i] = src[i];
- }
- }
- /*
- ============
- idSIMD_SSE::Zero16
- ============
- */
- void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
- __asm {
- mov edx, dst
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneZero16
- shl eax, 4
- add edx, eax
- neg eax
- xorps xmm0, xmm0
- loopZero16:
- movaps [edx+eax], xmm0
- add eax, 16
- jl loopZero16
- doneZero16:
- }
- }
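- // All the *16 routines assume operands that are 16-byte aligned and padded
- // to a multiple of 4 floats: "add eax, 3 / shr eax, 2" rounds count up to
- // whole 4-float blocks, e.g. count = 6 gives ( 6 + 3 ) >> 2 = 2 blocks,
- // i.e. 8 floats written.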
- /*
- ============
- idSIMD_SSE::Negate16
- ============
- */
- void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
- __asm {
- mov edx, dst
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneNegate16
- shl eax, 4
- add edx, eax
- neg eax
- movss xmm0, SIMD_SP_signBitMask
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
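- // negation is a pure sign-bit flip: XOR each float with the replicated
- // sign-bit mask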
- loopNegate16:
- movaps xmm1, [edx+eax]
- xorps xmm1, xmm0
- movaps [edx+eax], xmm1
- add eax, 16
- jl loopNegate16
- doneNegate16:
- }
- }
- /*
- ============
- idSIMD_SSE::Copy16
- ============
- */
- void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
- __asm {
- mov ecx, src
- mov edx, dst
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneCopy16
- shl eax, 4
- add ecx, eax
- add edx, eax
- neg eax
- loopCopy16:
- movaps xmm0, [ecx+eax]
- movaps [edx+eax], xmm0
- add eax, 16
- jl loopCopy16
- doneCopy16:
- }
- }
- /*
- ============
- idSIMD_SSE::Add16
- ============
- */
- void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
- __asm {
- mov ecx, src1
- mov edx, src2
- mov esi, dst
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneAdd16
- shl eax, 4
- add esi, eax
- add ecx, eax
- add edx, eax
- neg eax
- loopAdd16:
- movaps xmm0, [ecx+eax]
- addps xmm0, [edx+eax]
- movaps [esi+eax], xmm0
- add eax, 16
- jl loopAdd16
- doneAdd16:
- }
- }
- /*
- ============
- idSIMD_SSE::Sub16
- ============
- */
- void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
- __asm {
- mov ecx, src1
- mov edx, src2
- mov esi, dst
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneSub16
- shl eax, 4
- add esi, eax
- add ecx, eax
- add edx, eax
- neg eax
- loopSub16:
- movaps xmm0, [ecx+eax]
- subps xmm0, [edx+eax]
- movaps [esi+eax], xmm0
- add eax, 16
- jl loopSub16
- doneSub16:
- }
- }
- /*
- ============
- idSIMD_SSE::Mul16
- ============
- */
- void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
- __asm {
- mov ecx, dst
- mov edx, src1
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneMulScalar16
- movss xmm1, constant
- shl eax, 4
- add ecx, eax
- add edx, eax
- neg eax
- shufps xmm1, xmm1, 0x00
- loopMulScalar16:
- movaps xmm0, [edx+eax]
- mulps xmm0, xmm1
- movaps [ecx+eax], xmm0
- add eax, 16
- jl loopMulScalar16
- doneMulScalar16:
- }
- }
- /*
- ============
- idSIMD_SSE::AddAssign16
- ============
- */
- void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
- __asm {
- mov ecx, dst
- mov edx, src
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneAddAssign16
- shl eax, 4
- add ecx, eax
- add edx, eax
- neg eax
- loopAddAssign16:
- movaps xmm0, [ecx+eax]
- addps xmm0, [edx+eax]
- movaps [ecx+eax], xmm0
- add eax, 16
- jl loopAddAssign16
- doneAddAssign16:
- }
- }
- /*
- ============
- idSIMD_SSE::SubAssign16
- ============
- */
- void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
- __asm {
- mov ecx, dst
- mov edx, src
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneSubAssign16
- shl eax, 4
- add ecx, eax
- add edx, eax
- neg eax
- loopSubAssign16:
- movaps xmm0, [ecx+eax]
- subps xmm0, [edx+eax]
- movaps [ecx+eax], xmm0
- add eax, 16
- jl loopSubAssign16
- doneSubAssign16:
- }
- }
- /*
- ============
- idSIMD_SSE::MulAssign16
- ============
- */
- void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
- __asm {
- mov ecx, dst
- mov eax, count
- add eax, 3
- shr eax, 2
- jz doneMulAssign16
- movss xmm1, constant
- shl eax, 4
- add ecx, eax
- neg eax
- shufps xmm1, xmm1, 0x00
- loopMulAssign16:
- movaps xmm0, [ecx+eax]
- mulps xmm0, xmm1
- movaps [ecx+eax], xmm0
- add eax, 16
- jl loopMulAssign16
- doneMulAssign16:
- }
- }
- /*
- ============
- idSIMD_SSE::MatX_MultiplyVecX
- optimizes the following matrix multiplications:
- NxN * Nx1
- Nx6 * 6x1
- 6xN * Nx1
- with N in the range [1-6]
- ============
- */
- void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- #define STORE1( offset, reg1, reg2 ) \
- __asm movss [eax+offset], reg1
- #define STORE2LO( offset, reg1, reg2 ) \
- __asm movlps [eax+offset], reg1
- #define STORE2HI( offset, reg1, reg2 ) \
- __asm movhps [eax+offset], reg1
- #define STORE4( offset, reg1, reg2 ) \
- __asm movlps [eax+offset], reg1 \
- __asm movhps [eax+offset+8], reg1
- #define STOREC =
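- // the STORE1/STORE2LO/STORE2HI/STORE4 macros, together with STOREC, factor
- // the write-back out of the SIMD bodies; the MultiplyAdd and MultiplySub
- // variants below redefine them to read-modify-write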
- int numRows;
- const float *mPtr, *vPtr;
- float *dstPtr;
- assert( vec.GetSize() >= mat.GetNumColumns() );
- assert( dst.GetSize() >= mat.GetNumRows() );
- mPtr = mat.ToFloatPtr();
- vPtr = vec.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- numRows = mat.GetNumRows();
- switch( mat.GetNumColumns() ) {
- case 1: {
- switch( numRows ) {
- case 1: { // 1x1 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- mulss xmm0, [edi]
- STORE1( 0, xmm0, xmm1 )
- }
- return;
- }
- case 6: { // 6x1 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- STORE4( 0, xmm0, xmm2 )
- STORE2LO( 16, xmm1, xmm2 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0];
- mPtr++;
- }
- return;
- }
- }
- break;
- }
- case 2: {
- switch( numRows ) {
- case 2: { // 2x2 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- movss xmm1, [esi+4]
- movss xmm2, [edi]
- mulss xmm2, xmm0
- movss xmm3, [edi+4]
- mulss xmm3, xmm1
- addss xmm2, xmm3
- STORE1( 0, xmm2, xmm4 )
- mulss xmm0, [edi+8]
- mulss xmm1, [edi+8+4]
- addss xmm0, xmm1
- STORE1( 4, xmm0, xmm4 )
- }
- return;
- }
- case 6: { // 6x2 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm7, [esi]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movaps xmm0, [edi]
- mulps xmm0, xmm7
- movaps xmm1, [edi+16]
- mulps xmm1, xmm7
- movaps xmm2, xmm0
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- movaps xmm3, [edi+32]
- addps xmm0, xmm2
- mulps xmm3, xmm7
- STORE4( 0, xmm0, xmm4 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm1, xmm3
- addps xmm3, xmm1
- STORE2LO( 16, xmm3, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
- mPtr += 2;
- }
- return;
- }
- }
- break;
- }
- case 3: {
- switch( numRows ) {
- case 3: { // 3x3 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- movss xmm4, [edi]
- mulss xmm4, xmm0
- movss xmm1, [esi+4]
- movss xmm5, [edi+4]
- mulss xmm5, xmm1
- addss xmm4, xmm5
- movss xmm2, [esi+8]
- movss xmm6, [edi+8]
- mulss xmm6, xmm2
- addss xmm4, xmm6
- movss xmm3, [edi+12]
- mulss xmm3, xmm0
- STORE1( 0, xmm4, xmm7 );
- movss xmm5, [edi+12+4]
- mulss xmm5, xmm1
- addss xmm3, xmm5
- movss xmm6, [edi+12+8]
- mulss xmm6, xmm2
- addss xmm3, xmm6
- mulss xmm0, [edi+24]
- mulss xmm1, [edi+24+4]
- STORE1( 4, xmm3, xmm7 );
- addss xmm0, xmm1
- mulss xmm2, [edi+24+8]
- addss xmm0, xmm2
- STORE1( 8, xmm0, xmm7 );
- }
- return;
- }
- case 6: { // 6x3 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm5, [esi]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [esi+4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [esi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
- movlps xmm1, [edi+4*4]
- shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
- movlps xmm2, [edi+6*4]
- movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
- mulps xmm0, xmm5
- movlps xmm3, [edi+10*4]
- shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
- movaps xmm3, xmm1
- shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
- mulps xmm1, xmm6
- shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
- mulps xmm3, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm3
- STORE4( 0, xmm0, xmm4 )
- movss xmm1, [edi+12*4]
- mulss xmm1, xmm5
- movss xmm2, [edi+13*4]
- mulss xmm2, xmm6
- movss xmm3, [edi+14*4]
- mulss xmm3, xmm7
- addss xmm1, xmm2
- addss xmm1, xmm3
- STORE1( 16, xmm1, xmm4 )
- mulss xmm5, [edi+15*4]
- mulss xmm6, [edi+16*4]
- mulss xmm7, [edi+17*4]
- addss xmm5, xmm6
- addss xmm5, xmm7
- STORE1( 20, xmm5, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
- mPtr += 3;
- }
- return;
- }
- }
- break;
- }
- case 4: {
- switch( numRows ) {
- case 4: { // 4x4 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, qword ptr [esi ]
- movlps xmm0, qword ptr [edi ]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm0, qword ptr [edi+16]
- mulps xmm0, xmm6
- movlps xmm7, qword ptr [esi+ 8]
- movlps xmm2, qword ptr [edi+ 8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm2, qword ptr [edi+24]
- mulps xmm2, xmm7
- movlps xmm1, qword ptr [edi+32]
- movhps xmm1, qword ptr [edi+48]
- mulps xmm1, xmm6
- movlps xmm3, qword ptr [edi+40]
- addps xmm0, xmm2
- movhps xmm3, qword ptr [edi+56]
- mulps xmm3, xmm7
- movaps xmm4, xmm0
- addps xmm1, xmm3
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm4
- STORE4( 0, xmm0, xmm2 )
- }
- return;
- }
- case 6: { // 6x4 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, qword ptr [esi+ 0]
- movlps xmm0, qword ptr [edi+ 0]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm0, qword ptr [edi+16]
- mulps xmm0, xmm6
- movlps xmm7, qword ptr [esi+ 8]
- movlps xmm2, qword ptr [edi+ 8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm2, qword ptr [edi+24]
- mulps xmm2, xmm7
- movlps xmm1, qword ptr [edi+32]
- movhps xmm1, qword ptr [edi+48]
- mulps xmm1, xmm6
- movlps xmm3, qword ptr [edi+40]
- addps xmm0, xmm2
- movhps xmm3, qword ptr [edi+56]
- mulps xmm3, xmm7
- movaps xmm4, xmm0
- addps xmm1, xmm3
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm4
- movlps xmm1, qword ptr [edi+64]
- movhps xmm1, qword ptr [edi+80]
- STORE4( 0, xmm0, xmm4 )
- mulps xmm1, xmm6
- movlps xmm2, qword ptr [edi+72]
- movhps xmm2, qword ptr [edi+88]
- mulps xmm2, xmm7
- addps xmm1, xmm2
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm3, xmm1
- addps xmm1, xmm3
- STORE2LO( 16, xmm1, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
- mPtr += 4;
- }
- return;
- }
- }
- break;
- }
- case 5: {
- switch( numRows ) {
- case 5: { // 5x5 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
- movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
- movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
- movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
- movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
- shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
- movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
- movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
- movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
- shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
- movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
- movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
- movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
- shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
- movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
- movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
- movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
- movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
- shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
- movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
- shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
- movss xmm7, [esi+0*4]
- shufps xmm7, xmm7, 0
- mulps xmm0, xmm7
- movss xmm5, [esi+1*4]
- shufps xmm5, xmm5, 0
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movss xmm6, [esi+2*4]
- shufps xmm6, xmm6, 0
- mulps xmm2, xmm6
- addps xmm0, xmm2
- movss xmm1, [esi+3*4]
- shufps xmm1, xmm1, 0
- mulps xmm3, xmm1
- addps xmm0, xmm3
- movss xmm2, [esi+4*4]
- shufps xmm2, xmm2, 0
- mulps xmm4, xmm2
- addps xmm0, xmm4
- mulss xmm7, [edi+20*4]
- mulss xmm5, [edi+21*4]
- addps xmm7, xmm5
- mulss xmm6, [edi+22*4]
- addps xmm7, xmm6
- mulss xmm1, [edi+23*4]
- addps xmm7, xmm1
- mulss xmm2, [edi+24*4]
- addps xmm7, xmm2
- STORE4( 0, xmm0, xmm3 )
- STORE1( 16, xmm7, xmm4 )
- }
- return;
- }
- case 6: { // 6x5 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, [esi]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movlps xmm7, [esi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movlps xmm0, [edi]
- movhps xmm3, [edi+8]
- movaps xmm1, [edi+16]
- movlps xmm2, [edi+32]
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
- shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
- mulps xmm0, xmm6
- mulps xmm3, xmm7
- movlps xmm2, [edi+40]
- addps xmm0, xmm3 // xmm0 += xmm3
- movhps xmm5, [edi+40+8]
- movlps xmm3, [edi+40+16]
- movhps xmm3, [edi+40+24]
- movlps xmm4, [edi+40+32]
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
- shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
- shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
- mulps xmm2, xmm6
- mulps xmm5, xmm7
- addps xmm2, xmm5 // xmm2 += xmm5
- movss xmm5, [esi+16]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm4, xmm0
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
- shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
- addps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- STORE4( 0, xmm0, xmm2 )
- movlps xmm4, [edi+80]
- movhps xmm3, [edi+80+8]
- movaps xmm1, [edi+80+16]
- movlps xmm2, [edi+80+32]
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
- shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
- mulps xmm4, xmm6
- mulps xmm3, xmm7
- mulps xmm1, xmm5
- addps xmm4, xmm3 // xmm4 += xmm3
- shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
- addps xmm4, xmm1
- shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
- addps xmm4, xmm1
- STORE2LO( 16, xmm4, xmm2 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
- mPtr += 5;
- }
- return;
- }
- }
- break;
- }
- case 6: {
- switch( numRows ) {
- case 1: { // 1x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- mulss xmm0, [edi]
- movss xmm1, [esi+4]
- mulss xmm1, [edi+4]
- movss xmm2, [esi+8]
- addss xmm0, xmm1
- mulss xmm2, [edi+8]
- movss xmm3, [esi+12]
- addss xmm0, xmm2
- mulss xmm3, [edi+12]
- movss xmm4, [esi+16]
- addss xmm0, xmm3
- mulss xmm4, [edi+16]
- movss xmm5, [esi+20]
- addss xmm0, xmm4
- mulss xmm5, [edi+20]
- movss xmm6, [esi+24]
- addss xmm0, xmm5
- mulss xmm6, [edi+24]
- addss xmm0, xmm6
- STORE1( 0, xmm0, xmm7 )
- }
- return;
- }
- case 2: { // 2x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- STORE2LO( 0, xmm0, xmm3 )
- }
- return;
- }
- case 3: { // 3x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- STORE2LO( 0, xmm0, xmm3 )
- // row 2
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm1
- STORE1( 8, xmm0, xmm3 )
- }
- return;
- }
- case 4: { // 4x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm7, xmm0
- movlhps xmm7, xmm2
- addps xmm7, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm7, xmm0
- // row 2 and 3
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- movaps xmm2, [edi+48+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- // last 4 additions for the first 4 rows and store result
- movaps xmm0, xmm7
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm7
- STORE4( 0, xmm0, xmm4 )
- }
- return;
- }
- case 5: { // 5x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm7, xmm0
- movlhps xmm7, xmm2
- addps xmm7, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm7, xmm0
- // row 2 and 3
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- movaps xmm2, [edi+48+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- // last 4 additions for the first 4 rows and store result
- movaps xmm0, xmm7
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm7
- STORE4( 0, xmm0, xmm3 )
- // row 4 (fifth and last row)
- movaps xmm0, [edi+96]
- movaps xmm1, [edi+96+16]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, 0x01
- addss xmm0, xmm1
- STORE1( 16, xmm0, xmm3 )
- }
- return;
- }
- case 6: { // 6x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm7, qword ptr [esi]
- movlps xmm6, qword ptr [esi+8]
- shufps xmm7, xmm7, 0x44
- shufps xmm6, xmm6, 0x44
- movlps xmm0, qword ptr [edi ]
- movhps xmm0, qword ptr [edi+ 24]
- mulps xmm0, xmm7
- movlps xmm3, qword ptr [edi+ 8]
- movhps xmm3, qword ptr [edi+ 32]
- mulps xmm3, xmm6
- movlps xmm1, qword ptr [edi+ 48]
- movhps xmm1, qword ptr [edi+ 72]
- mulps xmm1, xmm7
- movlps xmm2, qword ptr [edi+ 96]
- movhps xmm2, qword ptr [edi+120]
- mulps xmm2, xmm7
- movlps xmm4, qword ptr [edi+ 56]
- movhps xmm4, qword ptr [edi+ 80]
- movlps xmm5, qword ptr [edi+104]
- movhps xmm5, qword ptr [edi+128]
- mulps xmm4, xmm6
- movlps xmm7, qword ptr [esi+16]
- addps xmm0, xmm3
- shufps xmm7, xmm7, 0x44
- mulps xmm5, xmm6
- addps xmm1, xmm4
- movlps xmm3, qword ptr [edi+ 16]
- movhps xmm3, qword ptr [edi+ 40]
- addps xmm2, xmm5
- movlps xmm4, qword ptr [edi+ 64]
- movhps xmm4, qword ptr [edi+ 88]
- mulps xmm3, xmm7
- movlps xmm5, qword ptr [edi+112]
- movhps xmm5, qword ptr [edi+136]
- addps xmm0, xmm3
- mulps xmm4, xmm7
- mulps xmm5, xmm7
- addps xmm1, xmm4
- addps xmm2, xmm5
- movaps xmm6, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm6, xmm1, 0xDD
- movaps xmm7, xmm2
- shufps xmm7, xmm2, 0x88
- shufps xmm2, xmm2, 0xDD
- addps xmm0, xmm6
- addps xmm2, xmm7
- STORE4( 0, xmm0, xmm3 )
- STORE2LO( 16, xmm2, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
- mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
- mPtr += 6;
- }
- return;
- }
- }
- break;
- }
- default: {
- int numColumns = mat.GetNumColumns();
- for ( int i = 0; i < numRows; i++ ) {
- float sum = mPtr[0] * vPtr[0];
- for ( int j = 1; j < numColumns; j++ ) {
- sum += mPtr[j] * vPtr[j];
- }
- dstPtr[i] STOREC sum;
- mPtr += numColumns;
- }
- break;
- }
- }
- #undef STOREC
- #undef STORE4
- #undef STORE2HI
- #undef STORE2LO
- #undef STORE1
- }
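- /*
- For reference, a hedged scalar sketch of the whole routine above; the
- generic fallback cases already spell out the same dot-product-per-row
- semantics. MatX_MultiplyVecX_Scalar is a hypothetical name.
- */
- #if 0
- static void MatX_MultiplyVecX_Scalar( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- const float *mPtr = mat.ToFloatPtr();
- const float *vPtr = vec.ToFloatPtr();
- float *dstPtr = dst.ToFloatPtr();
- const int numRows = mat.GetNumRows();
- const int numColumns = mat.GetNumColumns();
- for ( int i = 0; i < numRows; i++ ) {
- float sum = 0.0f;
- for ( int j = 0; j < numColumns; j++ ) {
- sum += mPtr[j] * vPtr[j];
- }
- dstPtr[i] = sum;
- mPtr += numColumns;
- }
- }
- #endif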
- /*
- ============
- idSIMD_SSE::MatX_MultiplyAddVecX
- optimizes the following matrix multiplications:
- NxN * Nx1
- Nx6 * 6x1
- 6xN * Nx1
- with N in the range [1-6]
- ============
- */
- void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- #define STORE1( offset, reg1, reg2 ) \
- __asm movss reg2, [eax+offset] \
- __asm addss reg2, reg1 \
- __asm movss [eax+offset], reg2
- #define STORE2LO( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm addps reg2, reg1 \
- __asm movlps [eax+offset], reg2
- #define STORE2HI( offset, reg1, reg2 ) \
- __asm movhps reg2, [eax+offset] \
- __asm addps reg2, reg1 \
- __asm movhps [eax+offset], reg2
- #define STORE4( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm movhps reg2, [eax+offset+8] \
- __asm addps reg2, reg1 \
- __asm movlps [eax+offset], reg2 \
- __asm movhps [eax+offset+8], reg2
- #define STOREC +=
- int numRows;
- const float *mPtr, *vPtr;
- float *dstPtr;
- assert( vec.GetSize() >= mat.GetNumColumns() );
- assert( dst.GetSize() >= mat.GetNumRows() );
- mPtr = mat.ToFloatPtr();
- vPtr = vec.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- numRows = mat.GetNumRows();
- switch( mat.GetNumColumns() ) {
- case 1: {
- switch( numRows ) {
- case 1: { // 1x1 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- mulss xmm0, [edi]
- STORE1( 0, xmm0, xmm1 )
- }
- return;
- }
- case 6: { // 6x1 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- STORE4( 0, xmm0, xmm2 )
- STORE2LO( 16, xmm1, xmm2 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0];
- mPtr++;
- }
- return;
- }
- }
- break;
- }
- case 2: {
- switch( numRows ) {
- case 2: { // 2x2 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- movss xmm1, [esi+4]
- movss xmm2, [edi]
- mulss xmm2, xmm0
- movss xmm3, [edi+4]
- mulss xmm3, xmm1
- addss xmm2, xmm3
- STORE1( 0, xmm2, xmm4 )
- mulss xmm0, [edi+8]
- mulss xmm1, [edi+8+4]
- addss xmm0, xmm1
- STORE1( 4, xmm0, xmm4 )
- }
- return;
- }
- case 6: { // 6x2 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm7, [esi]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movaps xmm0, [edi]
- mulps xmm0, xmm7
- movaps xmm1, [edi+16]
- mulps xmm1, xmm7
- movaps xmm2, xmm0
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- movaps xmm3, [edi+32]
- addps xmm0, xmm2
- mulps xmm3, xmm7
- STORE4( 0, xmm0, xmm4 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm1, xmm3
- addps xmm3, xmm1
- STORE2LO( 16, xmm3, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
- mPtr += 2;
- }
- return;
- }
- }
- break;
- }
- case 3: {
- switch( numRows ) {
- case 3: { // 3x3 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- movss xmm4, [edi]
- mulss xmm4, xmm0
- movss xmm1, [esi+4]
- movss xmm5, [edi+4]
- mulss xmm5, xmm1
- addss xmm4, xmm5
- movss xmm2, [esi+8]
- movss xmm6, [edi+8]
- mulss xmm6, xmm2
- addss xmm4, xmm6
- movss xmm3, [edi+12]
- mulss xmm3, xmm0
- STORE1( 0, xmm4, xmm7 );
- movss xmm5, [edi+12+4]
- mulss xmm5, xmm1
- addss xmm3, xmm5
- movss xmm6, [edi+12+8]
- mulss xmm6, xmm2
- addss xmm3, xmm6
- mulss xmm0, [edi+24]
- mulss xmm1, [edi+24+4]
- STORE1( 4, xmm3, xmm7 );
- addss xmm0, xmm1
- mulss xmm2, [edi+24+8]
- addss xmm0, xmm2
- STORE1( 8, xmm0, xmm7 );
- }
- return;
- }
- case 6: { // 6x3 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm5, [esi]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [esi+4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [esi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
- movlps xmm1, [edi+4*4]
- shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
- movlps xmm2, [edi+6*4]
- movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
- mulps xmm0, xmm5
- movlps xmm3, [edi+10*4]
- shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
- movaps xmm3, xmm1
- shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
- mulps xmm1, xmm6
- shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
- mulps xmm3, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm3
- STORE4( 0, xmm0, xmm4 )
- movss xmm1, [edi+12*4]
- mulss xmm1, xmm5
- movss xmm2, [edi+13*4]
- mulss xmm2, xmm6
- movss xmm3, [edi+14*4]
- mulss xmm3, xmm7
- addss xmm1, xmm2
- addss xmm1, xmm3
- STORE1( 16, xmm1, xmm4 )
- mulss xmm5, [edi+15*4]
- mulss xmm6, [edi+16*4]
- mulss xmm7, [edi+17*4]
- addss xmm5, xmm6
- addss xmm5, xmm7
- STORE1( 20, xmm5, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
- mPtr += 3;
- }
- return;
- }
- }
- break;
- }
- case 4: {
- switch( numRows ) {
- case 4: { // 4x4 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, qword ptr [esi ]
- movlps xmm0, qword ptr [edi ]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm0, qword ptr [edi+16]
- mulps xmm0, xmm6
- movlps xmm7, qword ptr [esi+ 8]
- movlps xmm2, qword ptr [edi+ 8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm2, qword ptr [edi+24]
- mulps xmm2, xmm7
- movlps xmm1, qword ptr [edi+32]
- movhps xmm1, qword ptr [edi+48]
- mulps xmm1, xmm6
- movlps xmm3, qword ptr [edi+40]
- addps xmm0, xmm2
- movhps xmm3, qword ptr [edi+56]
- mulps xmm3, xmm7
- movaps xmm4, xmm0
- addps xmm1, xmm3
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm4
- STORE4( 0, xmm0, xmm2 )
- }
- return;
- }
- case 6: { // 6x4 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, qword ptr [esi+ 0]
- movlps xmm0, qword ptr [edi+ 0]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm0, qword ptr [edi+16]
- mulps xmm0, xmm6
- movlps xmm7, qword ptr [esi+ 8]
- movlps xmm2, qword ptr [edi+ 8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm2, qword ptr [edi+24]
- mulps xmm2, xmm7
- movlps xmm1, qword ptr [edi+32]
- movhps xmm1, qword ptr [edi+48]
- mulps xmm1, xmm6
- movlps xmm3, qword ptr [edi+40]
- addps xmm0, xmm2
- movhps xmm3, qword ptr [edi+56]
- mulps xmm3, xmm7
- movaps xmm4, xmm0
- addps xmm1, xmm3
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm4
- movlps xmm1, qword ptr [edi+64]
- movhps xmm1, qword ptr [edi+80]
- STORE4( 0, xmm0, xmm4 )
- mulps xmm1, xmm6
- movlps xmm2, qword ptr [edi+72]
- movhps xmm2, qword ptr [edi+88]
- mulps xmm2, xmm7
- addps xmm1, xmm2
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm3, xmm1
- addps xmm1, xmm3
- STORE2LO( 16, xmm1, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
- mPtr += 4;
- }
- return;
- }
- }
- break;
- }
- case 5: {
- switch( numRows ) {
- case 5: { // 5x5 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
- movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
- movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
- movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
- movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
- shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
- movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
- movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
- movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
- shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
- movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
- movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
- movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
- shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
- movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
- movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
- movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
- movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
- shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
- movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
- shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
- movss xmm7, [esi+0*4]
- shufps xmm7, xmm7, 0
- mulps xmm0, xmm7
- movss xmm5, [esi+1*4]
- shufps xmm5, xmm5, 0
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movss xmm6, [esi+2*4]
- shufps xmm6, xmm6, 0
- mulps xmm2, xmm6
- addps xmm0, xmm2
- movss xmm1, [esi+3*4]
- shufps xmm1, xmm1, 0
- mulps xmm3, xmm1
- addps xmm0, xmm3
- movss xmm2, [esi+4*4]
- shufps xmm2, xmm2, 0
- mulps xmm4, xmm2
- addps xmm0, xmm4
- mulss xmm7, [edi+20*4]
- mulss xmm5, [edi+21*4]
- addps xmm7, xmm5
- mulss xmm6, [edi+22*4]
- addps xmm7, xmm6
- mulss xmm1, [edi+23*4]
- addps xmm7, xmm1
- mulss xmm2, [edi+24*4]
- addps xmm7, xmm2
- STORE4( 0, xmm0, xmm3 )
- STORE1( 16, xmm7, xmm4 )
- }
- return;
- }
- case 6: { // 6x5 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, [esi]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movlps xmm7, [esi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movlps xmm0, [edi]
- movhps xmm3, [edi+8]
- movaps xmm1, [edi+16]
- movlps xmm2, [edi+32]
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
- shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
- mulps xmm0, xmm6
- mulps xmm3, xmm7
- movlps xmm2, [edi+40]
- addps xmm0, xmm3 // xmm0 += xmm3
- movhps xmm5, [edi+40+8]
- movlps xmm3, [edi+40+16]
- movhps xmm3, [edi+40+24]
- movlps xmm4, [edi+40+32]
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
- shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
- shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
- mulps xmm2, xmm6
- mulps xmm5, xmm7
- addps xmm2, xmm5 // xmm2 += xmm5
- movss xmm5, [esi+16]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm4, xmm0
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
- shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
- addps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- STORE4( 0, xmm0, xmm2 )
- movlps xmm4, [edi+80]
- movhps xmm3, [edi+80+8]
- movaps xmm1, [edi+80+16]
- movlps xmm2, [edi+80+32]
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
- shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
- mulps xmm4, xmm6
- mulps xmm3, xmm7
- mulps xmm1, xmm5
- addps xmm4, xmm3 // xmm4 += xmm3
- shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
- addps xmm4, xmm1
- shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
- addps xmm4, xmm1
- STORE2LO( 16, xmm4, xmm2 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
- mPtr += 5;
- }
- return;
- }
- }
- break;
- }
- case 6: {
- switch( numRows ) {
- case 1: { // 1x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- mulss xmm0, [edi]
- movss xmm1, [esi+4]
- mulss xmm1, [edi+4]
- movss xmm2, [esi+8]
- addss xmm0, xmm1
- mulss xmm2, [edi+8]
- movss xmm3, [esi+12]
- addss xmm0, xmm2
- mulss xmm3, [edi+12]
- movss xmm4, [esi+16]
- addss xmm0, xmm3
- mulss xmm4, [edi+16]
- movss xmm5, [esi+20]
- addss xmm0, xmm4
- mulss xmm5, [edi+20]
- movss xmm6, [esi+24]
- addss xmm0, xmm5
- mulss xmm6, [edi+24]
- addss xmm0, xmm6
- STORE1( 0, xmm0, xmm7 )
- }
- return;
- }
- case 2: { // 2x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- STORE2LO( 0, xmm0, xmm3 )
- }
- return;
- }
- case 3: { // 3x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- STORE2LO( 0, xmm0, xmm3 )
- // row 2
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm1
- STORE1( 8, xmm0, xmm3 )
- }
- return;
- }
- case 4: { // 4x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm7, xmm0
- movlhps xmm7, xmm2
- addps xmm7, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm7, xmm0
- // row 2 and 3
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- movaps xmm2, [edi+48+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- // last 4 additions for the first 4 rows and store result
- movaps xmm0, xmm7
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm7
- STORE4( 0, xmm0, xmm4 )
- }
- return;
- }
- case 5: { // 5x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm7, xmm0
- movlhps xmm7, xmm2
- addps xmm7, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm7, xmm0
- // row 2 and 3
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- movaps xmm2, [edi+48+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- // last 4 additions for the first 4 rows and store result
- movaps xmm0, xmm7
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm7
- STORE4( 0, xmm0, xmm3 )
- // row 4 (fifth and last row)
- movaps xmm0, [edi+96]
- movaps xmm1, [edi+96+16]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, 0x01
- addss xmm0, xmm1
- STORE1( 16, xmm0, xmm3 )
- }
- return;
- }
- case 6: { // 6x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm7, qword ptr [esi]
- movlps xmm6, qword ptr [esi+8]
- shufps xmm7, xmm7, 0x44
- shufps xmm6, xmm6, 0x44
- movlps xmm0, qword ptr [edi ]
- movhps xmm0, qword ptr [edi+ 24]
- mulps xmm0, xmm7
- movlps xmm3, qword ptr [edi+ 8]
- movhps xmm3, qword ptr [edi+ 32]
- mulps xmm3, xmm6
- movlps xmm1, qword ptr [edi+ 48]
- movhps xmm1, qword ptr [edi+ 72]
- mulps xmm1, xmm7
- movlps xmm2, qword ptr [edi+ 96]
- movhps xmm2, qword ptr [edi+120]
- mulps xmm2, xmm7
- movlps xmm4, qword ptr [edi+ 56]
- movhps xmm4, qword ptr [edi+ 80]
- movlps xmm5, qword ptr [edi+104]
- movhps xmm5, qword ptr [edi+128]
- mulps xmm4, xmm6
- movlps xmm7, qword ptr [esi+16]
- addps xmm0, xmm3
- shufps xmm7, xmm7, 0x44
- mulps xmm5, xmm6
- addps xmm1, xmm4
- movlps xmm3, qword ptr [edi+ 16]
- movhps xmm3, qword ptr [edi+ 40]
- addps xmm2, xmm5
- movlps xmm4, qword ptr [edi+ 64]
- movhps xmm4, qword ptr [edi+ 88]
- mulps xmm3, xmm7
- movlps xmm5, qword ptr [edi+112]
- movhps xmm5, qword ptr [edi+136]
- addps xmm0, xmm3
- mulps xmm4, xmm7
- mulps xmm5, xmm7
- addps xmm1, xmm4
- addps xmm2, xmm5
- movaps xmm6, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm6, xmm1, 0xDD
- movaps xmm7, xmm2
- shufps xmm7, xmm2, 0x88
- shufps xmm2, xmm2, 0xDD
- addps xmm0, xmm6
- addps xmm2, xmm7
- STORE4( 0, xmm0, xmm3 )
- STORE2LO( 16, xmm2, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
- mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
- mPtr += 6;
- }
- return;
- }
- }
- break;
- }
- default: {
- int numColumns = mat.GetNumColumns();
- for ( int i = 0; i < numRows; i++ ) {
- float sum = mPtr[0] * vPtr[0];
- for ( int j = 1; j < numColumns; j++ ) {
- sum += mPtr[j] * vPtr[j];
- }
- dstPtr[i] STOREC sum;
- mPtr += numColumns;
- }
- break;
- }
- }
- #undef STOREC
- #undef STORE4
- #undef STORE2HI
- #undef STORE2LO
- #undef STORE1
- }
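- // Note: MatX_MultiplyAddVecX above and MatX_MultiplySubVecX below reuse the
- // MatX_MultiplyVecX body verbatim; only the STORE* macros (read-modify-write
- // instead of plain stores) and STOREC (+= / -=) change, so dst accumulates
- // rather than being overwritten.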
- /*
- ============
- idSIMD_SSE::MatX_MultiplySubVecX
- optimizes the following matrix multiplications:
- NxN * Nx1
- Nx6 * 6x1
- 6xN * Nx1
- with N in the range [1-6]
- ============
- */
- void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- #define STORE1( offset, reg1, reg2 ) \
- __asm movss reg2, [eax+offset] \
- __asm subss reg2, reg1 \
- __asm movss [eax+offset], reg2
- #define STORE2LO( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm subps reg2, reg1 \
- __asm movlps [eax+offset], reg2
- #define STORE2HI( offset, reg1, reg2 ) \
- __asm movhps reg2, [eax+offset] \
- __asm subps reg2, reg1 \
- __asm movhps [eax+offset], reg2
- #define STORE4( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm movhps reg2, [eax+offset+8] \
- __asm subps reg2, reg1 \
- __asm movlps [eax+offset], reg2 \
- __asm movhps [eax+offset+8], reg2
- #define STOREC -=
- int numRows;
- const float *mPtr, *vPtr;
- float *dstPtr;
- assert( vec.GetSize() >= mat.GetNumColumns() );
- assert( dst.GetSize() >= mat.GetNumRows() );
- mPtr = mat.ToFloatPtr();
- vPtr = vec.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- numRows = mat.GetNumRows();
- switch( mat.GetNumColumns() ) {
- case 1: {
- switch( numRows ) {
- case 1: { // 1x1 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- mulss xmm0, [edi]
- STORE1( 0, xmm0, xmm1 )
- }
- return;
- }
- case 6: { // 6x1 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- STORE4( 0, xmm0, xmm2 )
- STORE2LO( 16, xmm1, xmm2 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0];
- mPtr++;
- }
- return;
- }
- }
- break;
- }
- case 2: {
- switch( numRows ) {
- case 2: { // 2x2 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- movss xmm1, [esi+4]
- movss xmm2, [edi]
- mulss xmm2, xmm0
- movss xmm3, [edi+4]
- mulss xmm3, xmm1
- addss xmm2, xmm3
- STORE1( 0, xmm2, xmm4 )
- mulss xmm0, [edi+8]
- mulss xmm1, [edi+8+4]
- addss xmm0, xmm1
- STORE1( 4, xmm0, xmm4 )
- }
- return;
- }
- case 6: { // 6x2 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm7, [esi]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movaps xmm0, [edi]
- mulps xmm0, xmm7
- movaps xmm1, [edi+16]
- mulps xmm1, xmm7
- movaps xmm2, xmm0
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- movaps xmm3, [edi+32]
- addps xmm0, xmm2
- mulps xmm3, xmm7
- STORE4( 0, xmm0, xmm4 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm1, xmm3
- addps xmm3, xmm1
- STORE2LO( 16, xmm3, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
- mPtr += 2;
- }
- return;
- }
- }
- break;
- }
- case 3: {
- switch( numRows ) {
- case 3: { // 3x3 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- movss xmm4, [edi]
- mulss xmm4, xmm0
- movss xmm1, [esi+4]
- movss xmm5, [edi+4]
- mulss xmm5, xmm1
- addss xmm4, xmm5
- movss xmm2, [esi+8]
- movss xmm6, [edi+8]
- mulss xmm6, xmm2
- addss xmm4, xmm6
- movss xmm3, [edi+12]
- mulss xmm3, xmm0
- STORE1( 0, xmm4, xmm7 );
- movss xmm5, [edi+12+4]
- mulss xmm5, xmm1
- addss xmm3, xmm5
- movss xmm6, [edi+12+8]
- mulss xmm6, xmm2
- addss xmm3, xmm6
- mulss xmm0, [edi+24]
- mulss xmm1, [edi+24+4]
- STORE1( 4, xmm3, xmm7 );
- addss xmm0, xmm1
- mulss xmm2, [edi+24+8]
- addss xmm0, xmm2
- STORE1( 8, xmm0, xmm7 );
- }
- return;
- }
- case 6: { // 6x3 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm5, [esi]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [esi+4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm7, [esi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
- movlps xmm1, [edi+4*4]
- shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
- movlps xmm2, [edi+6*4]
- movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
- mulps xmm0, xmm5
- movlps xmm3, [edi+10*4]
- shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
- movaps xmm3, xmm1
- shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
- mulps xmm1, xmm6
- shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
- mulps xmm3, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm3
- STORE4( 0, xmm0, xmm4 )
- movss xmm1, [edi+12*4]
- mulss xmm1, xmm5
- movss xmm2, [edi+13*4]
- mulss xmm2, xmm6
- movss xmm3, [edi+14*4]
- mulss xmm3, xmm7
- addss xmm1, xmm2
- addss xmm1, xmm3
- STORE1( 16, xmm1, xmm4 )
- mulss xmm5, [edi+15*4]
- mulss xmm6, [edi+16*4]
- mulss xmm7, [edi+17*4]
- addss xmm5, xmm6
- addss xmm5, xmm7
- STORE1( 20, xmm5, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
- mPtr += 3;
- }
- return;
- }
- }
- break;
- }
- case 4: {
- switch( numRows ) {
- case 4: { // 4x4 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, qword ptr [esi ]
- movlps xmm0, qword ptr [edi ]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm0, qword ptr [edi+16]
- mulps xmm0, xmm6
- movlps xmm7, qword ptr [esi+ 8]
- movlps xmm2, qword ptr [edi+ 8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm2, qword ptr [edi+24]
- mulps xmm2, xmm7
- movlps xmm1, qword ptr [edi+32]
- movhps xmm1, qword ptr [edi+48]
- mulps xmm1, xmm6
- movlps xmm3, qword ptr [edi+40]
- addps xmm0, xmm2
- movhps xmm3, qword ptr [edi+56]
- mulps xmm3, xmm7
- movaps xmm4, xmm0
- addps xmm1, xmm3
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm4
- STORE4( 0, xmm0, xmm2 )
- }
- return;
- }
- case 6: { // 6x4 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, qword ptr [esi+ 0]
- movlps xmm0, qword ptr [edi+ 0]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm0, qword ptr [edi+16]
- mulps xmm0, xmm6
- movlps xmm7, qword ptr [esi+ 8]
- movlps xmm2, qword ptr [edi+ 8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movhps xmm2, qword ptr [edi+24]
- mulps xmm2, xmm7
- movlps xmm1, qword ptr [edi+32]
- movhps xmm1, qword ptr [edi+48]
- mulps xmm1, xmm6
- movlps xmm3, qword ptr [edi+40]
- addps xmm0, xmm2
- movhps xmm3, qword ptr [edi+56]
- mulps xmm3, xmm7
- movaps xmm4, xmm0
- addps xmm1, xmm3
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm4
- movlps xmm1, qword ptr [edi+64]
- movhps xmm1, qword ptr [edi+80]
- STORE4( 0, xmm0, xmm4 )
- mulps xmm1, xmm6
- movlps xmm2, qword ptr [edi+72]
- movhps xmm2, qword ptr [edi+88]
- mulps xmm2, xmm7
- addps xmm1, xmm2
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm3, xmm1
- addps xmm1, xmm3
- STORE2LO( 16, xmm1, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
- mPtr += 4;
- }
- return;
- }
- }
- break;
- }
- case 5: {
- switch( numRows ) {
- case 5: { // 5x5 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
- movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
- movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
- movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
- movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
- shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
- movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
- movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
- movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
- shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
- movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
- movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
- movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
- shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
- movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
- movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
- movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
- movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
- shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
- movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
- shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
- movss xmm7, [esi+0*4]
- shufps xmm7, xmm7, 0
- mulps xmm0, xmm7
- movss xmm5, [esi+1*4]
- shufps xmm5, xmm5, 0
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movss xmm6, [esi+2*4]
- shufps xmm6, xmm6, 0
- mulps xmm2, xmm6
- addps xmm0, xmm2
- movss xmm1, [esi+3*4]
- shufps xmm1, xmm1, 0
- mulps xmm3, xmm1
- addps xmm0, xmm3
- movss xmm2, [esi+4*4]
- shufps xmm2, xmm2, 0
- mulps xmm4, xmm2
- addps xmm0, xmm4
- mulss xmm7, [edi+20*4]
- mulss xmm5, [edi+21*4]
- addps xmm7, xmm5
- mulss xmm6, [edi+22*4]
- addps xmm7, xmm6
- mulss xmm1, [edi+23*4]
- addps xmm7, xmm1
- mulss xmm2, [edi+24*4]
- addps xmm7, xmm2
- STORE4( 0, xmm0, xmm3 )
- STORE1( 16, xmm7, xmm4 )
- }
- return;
- }
- case 6: { // 6x5 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, [esi]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- movlps xmm7, [esi+8]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
- movlps xmm0, [edi]
- movhps xmm3, [edi+8]
- movaps xmm1, [edi+16]
- movlps xmm2, [edi+32]
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
- shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
- mulps xmm0, xmm6
- mulps xmm3, xmm7
- movlps xmm2, [edi+40]
- addps xmm0, xmm3 // xmm0 = partial sums for rows 0 and 1
- movhps xmm5, [edi+40+8]
- movlps xmm3, [edi+40+16]
- movhps xmm3, [edi+40+24]
- movlps xmm4, [edi+40+32]
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
- shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
- shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
- mulps xmm2, xmm6
- mulps xmm5, xmm7
- addps xmm2, xmm5 // xmm2 = partial sums for rows 2 and 3
- movss xmm5, [esi+16]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm4, xmm0
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
- shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
- addps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- STORE4( 0, xmm0, xmm2 )
- movlps xmm4, [edi+80]
- movhps xmm3, [edi+80+8]
- movaps xmm1, [edi+80+16]
- movlps xmm2, [edi+80+32]
- shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
- shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
- mulps xmm4, xmm6
- mulps xmm3, xmm7
- mulps xmm1, xmm5
- addps xmm4, xmm3 // xmm4 = partial sums for rows 4 and 5
- shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
- addps xmm4, xmm1
- shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
- addps xmm4, xmm1
- STORE2LO( 16, xmm4, xmm2 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
- mPtr += 5;
- }
- return;
- }
- }
- break;
- }
- case 6: {
- switch( numRows ) {
- case 1: { // 1x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
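- // plain dot product: accumulate m[i] * v[i] for i = 0..5 in xmm0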
- movss xmm0, [esi]
- mulss xmm0, [edi]
- movss xmm1, [esi+4]
- mulss xmm1, [edi+4]
- movss xmm2, [esi+8]
- addss xmm0, xmm1
- mulss xmm2, [edi+8]
- movss xmm3, [esi+12]
- addss xmm0, xmm2
- mulss xmm3, [edi+12]
- movss xmm4, [esi+16]
- addss xmm0, xmm3
- mulss xmm4, [edi+16]
- movss xmm5, [esi+20]
- addss xmm0, xmm4
- mulss xmm5, [edi+20]
- movss xmm6, [esi+24]
- addss xmm0, xmm5
- mulss xmm6, [edi+24]
- addss xmm0, xmm6
- STORE1( 0, xmm0, xmm7 )
- }
- return;
- }
- case 2: { // 2x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
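- // xmm4 = v0 v1 v2 v3, xmm5 = v4 v5 v0 v1, xmm6 = v2 v3 v4 v5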
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- STORE2LO( 0, xmm0, xmm3 )
- }
- return;
- }
- case 3: { // 3x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- STORE2LO( 0, xmm0, xmm3 )
- // row 2
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm1
- STORE1( 8, xmm0, xmm3 )
- }
- return;
- }
- case 4: { // 4x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm7, xmm0
- movlhps xmm7, xmm2
- addps xmm7, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm7, xmm0
- // row 2 and 3
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- movaps xmm2, [edi+48+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- // last 4 additions for the first 4 rows and store result
- movaps xmm0, xmm7
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm7
- STORE4( 0, xmm0, xmm4 )
- }
- return;
- }
- case 5: { // 5x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- // load idVecX
- movlps xmm4, [esi]
- movhps xmm4, [esi+8]
- movlps xmm5, [esi+16]
- movlhps xmm5, xmm4
- movhlps xmm6, xmm4
- movlhps xmm6, xmm5
- // row 0 and 1
- movaps xmm0, [edi]
- movaps xmm1, [edi+16]
- movaps xmm2, [edi+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm7, xmm0
- movlhps xmm7, xmm2
- addps xmm7, xmm1
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm7, xmm0
- // row 2 and 3
- movaps xmm0, [edi+48]
- movaps xmm1, [edi+48+16]
- movaps xmm2, [edi+48+32]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- mulps xmm2, xmm6
- movhlps xmm3, xmm0
- movlhps xmm3, xmm2
- addps xmm1, xmm3
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
- addps xmm1, xmm0
- // last 4 additions for the first 4 rows and store result
- movaps xmm0, xmm7
- shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
- shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
- addps xmm0, xmm7
- STORE4( 0, xmm0, xmm3 )
- // row 4
- movaps xmm0, [edi+96]
- movaps xmm1, [edi+96+16]
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, 0x01
- addss xmm0, xmm1
- STORE1( 16, xmm0, xmm3 )
- }
- return;
- }
- case 6: { // 6x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm7, qword ptr [esi]
- movlps xmm6, qword ptr [esi+8]
- shufps xmm7, xmm7, 0x44
- shufps xmm6, xmm6, 0x44
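- // 0x44 = R_SHUFFLEPS( 0, 1, 0, 1 ): repeat the two low floats in both halves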
- movlps xmm0, qword ptr [edi ]
- movhps xmm0, qword ptr [edi+ 24]
- mulps xmm0, xmm7
- movlps xmm3, qword ptr [edi+ 8]
- movhps xmm3, qword ptr [edi+ 32]
- mulps xmm3, xmm6
- movlps xmm1, qword ptr [edi+ 48]
- movhps xmm1, qword ptr [edi+ 72]
- mulps xmm1, xmm7
- movlps xmm2, qword ptr [edi+ 96]
- movhps xmm2, qword ptr [edi+120]
- mulps xmm2, xmm7
- movlps xmm4, qword ptr [edi+ 56]
- movhps xmm4, qword ptr [edi+ 80]
- movlps xmm5, qword ptr [edi+104]
- movhps xmm5, qword ptr [edi+128]
- mulps xmm4, xmm6
- movlps xmm7, qword ptr [esi+16]
- addps xmm0, xmm3
- shufps xmm7, xmm7, 0x44
- mulps xmm5, xmm6
- addps xmm1, xmm4
- movlps xmm3, qword ptr [edi+ 16]
- movhps xmm3, qword ptr [edi+ 40]
- addps xmm2, xmm5
- movlps xmm4, qword ptr [edi+ 64]
- movhps xmm4, qword ptr [edi+ 88]
- mulps xmm3, xmm7
- movlps xmm5, qword ptr [edi+112]
- movhps xmm5, qword ptr [edi+136]
- addps xmm0, xmm3
- mulps xmm4, xmm7
- mulps xmm5, xmm7
- addps xmm1, xmm4
- addps xmm2, xmm5
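- // 0x88 = R_SHUFFLEPS( 0, 2, 0, 2 ), 0xDD = R_SHUFFLEPS( 1, 3, 1, 3 ): split even/odd lanes for the horizontal adds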
- movaps xmm6, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm6, xmm1, 0xDD
- movaps xmm7, xmm2
- shufps xmm7, xmm2, 0x88
- shufps xmm2, xmm2, 0xDD
- addps xmm0, xmm6
- addps xmm2, xmm7
- STORE4( 0, xmm0, xmm3 )
- STORE2LO( 16, xmm2, xmm4 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numRows; i++ ) {
- dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
- mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
- mPtr += 6;
- }
- return;
- }
- }
- break;
- }
- default: {
- int numColumns = mat.GetNumColumns();
- for ( int i = 0; i < numRows; i++ ) {
- float sum = mPtr[0] * vPtr[0];
- for ( int j = 1; j < numColumns; j++ ) {
- sum += mPtr[j] * vPtr[j];
- }
- dstPtr[i] STOREC sum;
- mPtr += numColumns;
- }
- break;
- }
- }
- #undef STOREC
- #undef STORE4
- #undef STORE2HI
- #undef STORE2LO
- #undef STORE1
- }
- /*
- ============
- idSIMD_SSE::MatX_TransposeMultiplyVecX
- optimizes the following matrix multiplications:
- Nx6 * Nx1
- 6xN * 6x1
- with N in the range [1-6]
- ============
- */
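- // i.e. for each column i: dst[i] = sum over rows j of mat[j][i] * vec[j],
- // walking the matrix with stride numColumns as in the default cases below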
- void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- #define STORE1( offset, reg1, reg2 ) \
- __asm movss [eax+offset], reg1
- #define STORE2LO( offset, reg1, reg2 ) \
- __asm movlps [eax+offset], reg1
- #define STORE2HI( offset, reg1, reg2 ) \
- __asm movhps [eax+offset], reg1
- #define STORE4( offset, reg1, reg2 ) \
- __asm movlps [eax+offset], reg1 \
- __asm movhps [eax+offset+8], reg1
- #define STOREC =
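- // STOREC is the store operator used by the C fallbacks; the STOREn macros are the asm counterparts (plain stores here, read-modify-write in the Add/Sub variants)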
- int numColumns;
- const float *mPtr, *vPtr;
- float *dstPtr;
- assert( vec.GetSize() >= mat.GetNumRows() );
- assert( dst.GetSize() >= mat.GetNumColumns() );
- mPtr = mat.ToFloatPtr();
- vPtr = vec.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- numColumns = mat.GetNumColumns();
- switch( mat.GetNumRows() ) {
- case 1:
- switch( numColumns ) {
- case 6: { // 1x6 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- STORE4( 0, xmm0, xmm2 )
- STORE2LO( 16, xmm1, xmm3 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 2:
- switch( numColumns ) {
- case 6: { // 2x6 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
- movaps xmm2, [edi]
- mulps xmm2, xmm0
- movlps xmm3, [edi+24]
- movhps xmm3, [edi+32]
- mulps xmm3, xmm1
- addps xmm2, xmm3
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm4, [edi+16]
- movhps xmm4, [edi+40]
- mulps xmm4, xmm0
- movhlps xmm3, xmm4
- addps xmm3, xmm4
- STORE4( 0, xmm2, xmm5 )
- STORE2LO( 16, xmm3, xmm6 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 3:
- switch( numColumns ) {
- case 6: { // 3x6 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movss xmm1, [esi+2*4]
- movlps xmm3, [edi+(0*6+0)*4]
- movhps xmm3, [edi+(0*6+2)*4]
- movaps xmm4, xmm0
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm4
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*6+0)*4]
- movhps xmm4, [edi+(2*6+2)*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm1
- addps xmm3, xmm4
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- movlps xmm5, [edi+(2*6+4)*4]
- mulps xmm5, xmm1
- addps xmm3, xmm5
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 4:
- switch( numColumns ) {
- case 6: { // 4x6 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*6+0)*4]
- movhps xmm4, [edi+(2*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 5:
- switch( numColumns ) {
- case 6: { // 5x6 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movss xmm2, [esi+4*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(2*6+0)*4]
- addps xmm3, xmm6
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm4, xmm2
- mulps xmm4, [edi+(4*6+0)*4]
- addps xmm3, xmm4
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- movlps xmm5, [edi+(4*6+4)*4]
- mulps xmm5, xmm2
- addps xmm3, xmm5
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 6:
- switch( numColumns ) {
- case 1: { // 6x1 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi]
- movhps xmm0, [esi+8]
- movlps xmm1, [esi+16]
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
- addps xmm0, xmm1
- movhlps xmm2, xmm0
- addss xmm2, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm2, xmm0
- STORE1( 0, xmm2, xmm3 )
- }
- return;
- }
- case 2: { // 6x2 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm6, [edi+0*4]
- mulps xmm6, xmm0
- movlps xmm1, [esi+2*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm7, [edi+4*4]
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movlps xmm2, [esi+4*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm7, [edi+8*4]
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movhlps xmm3, xmm6
- addps xmm3, xmm6
- STORE2LO( 0, xmm3, xmm7 )
- }
- return;
- }
- case 3: { // 6x3 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
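- // each matrix row (3 floats) is spread into lanes 0, 2 and 3 (lane 1 is unused), scaled by v[i] and accumulated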
- movss xmm0, [edi+(0*3+2)*4]
- movhps xmm0, [edi+(0*3+0)*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm6, [esi+0*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movss xmm1, [edi+(1*3+0)*4]
- movhps xmm1, [edi+(1*3+1)*4]
- movss xmm7, [esi+1*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movss xmm2, [edi+(2*3+2)*4]
- movhps xmm2, [edi+(2*3+0)*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm7, [esi+2*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movss xmm3, [edi+(3*3+0)*4]
- movhps xmm3, [edi+(3*3+1)*4]
- movss xmm7, [esi+3*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm3
- addps xmm6, xmm7
- movss xmm4, [edi+(4*3+2)*4]
- movhps xmm4, [edi+(4*3+0)*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm7, [esi+4*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movss xmm5, [edi+(5*3+0)*4]
- movhps xmm5, [edi+(5*3+1)*4]
- movss xmm7, [esi+5*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- STORE1( 0, xmm6, xmm7 )
- STORE2HI( 4, xmm6, xmm7 )
- }
- return;
- }
- case 4: { // 6x4 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm3, [edi+(0*4+0)*4]
- movhps xmm3, [edi+(0*4+2)*4]
- movss xmm4, [esi+0*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm4
- movlps xmm5, [edi+(1*4+0)*4]
- movhps xmm5, [edi+(1*4+2)*4]
- movss xmm6, [esi+1*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*4+0)*4]
- movhps xmm4, [edi+(2*4+2)*4]
- movss xmm6, [esi+2*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(3*4+0)*4]
- movhps xmm5, [edi+(3*4+2)*4]
- movss xmm6, [esi+3*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(4*4+0)*4]
- movhps xmm4, [edi+(4*4+2)*4]
- movss xmm6, [esi+4*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(5*4+0)*4]
- movhps xmm5, [edi+(5*4+2)*4]
- movss xmm6, [esi+5*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- }
- return;
- }
- case 5: { // 6x5 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, [edi+(0*5+0)*4]
- movhps xmm6, [edi+(0*5+2)*4]
- movss xmm0, [esi+0*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movlps xmm7, [edi+(1*5+0)*4]
- movhps xmm7, [edi+(1*5+2)*4]
- movss xmm1, [esi+1*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movlps xmm7, [edi+(2*5+0)*4]
- movhps xmm7, [edi+(2*5+2)*4]
- movss xmm2, [esi+2*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movlps xmm7, [edi+(3*5+0)*4]
- movhps xmm7, [edi+(3*5+2)*4]
- movss xmm3, [esi+3*4]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm3
- addps xmm6, xmm7
- movlps xmm7, [edi+(4*5+0)*4]
- movhps xmm7, [edi+(4*5+2)*4]
- movss xmm4, [esi+4*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movlps xmm7, [edi+(5*5+0)*4]
- movhps xmm7, [edi+(5*5+2)*4]
- movss xmm5, [esi+5*4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- STORE4( 0, xmm6, xmm7 )
- movss xmm6, [edi+(0*5+4)*4]
- mulss xmm6, xmm0
- movss xmm7, [edi+(1*5+4)*4]
- mulss xmm7, xmm1
- addss xmm6, xmm7
- movss xmm7, [edi+(2*5+4)*4]
- mulss xmm7, xmm2
- addss xmm6, xmm7
- movss xmm7, [edi+(3*5+4)*4]
- mulss xmm7, xmm3
- addss xmm6, xmm7
- movss xmm7, [edi+(4*5+4)*4]
- mulss xmm7, xmm4
- addss xmm6, xmm7
- movss xmm7, [edi+(5*5+4)*4]
- mulss xmm7, xmm5
- addss xmm6, xmm7
- STORE1( 16, xmm6, xmm7 )
- }
- return;
- }
- case 6: { // 6x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movlps xmm2, [esi+4*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(2*6+0)*4]
- addps xmm3, xmm6
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm2
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(4*6+0)*4]
- addps xmm3, xmm6
- movaps xmm6, xmm2
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- movlps xmm5, [edi+(5*6+0)*4]
- movhps xmm5, [edi+(5*6+2)*4]
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movlps xmm5, [edi+(4*6+4)*4]
- movhps xmm5, [edi+(5*6+4)*4]
- mulps xmm5, xmm2
- addps xmm3, xmm5
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
- mPtr++;
- }
- return;
- }
- }
- break;
- default:
- int numRows = mat.GetNumRows();
- for ( int i = 0; i < numColumns; i++ ) {
- mPtr = mat.ToFloatPtr() + i;
- float sum = mPtr[0] * vPtr[0];
- for ( int j = 1; j < numRows; j++ ) {
- mPtr += numColumns;
- sum += mPtr[0] * vPtr[j];
- }
- dstPtr[i] STOREC sum;
- }
- break;
- }
- #undef STOREC
- #undef STORE4
- #undef STORE2HI
- #undef STORE2LO
- #undef STORE1
- }
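- // A minimal scalar sketch of the transpose multiply above, for reference only;
- // the helper name and signature are illustrative, not part of the idSIMD interface.
- static void TransposeMultiplyVecX_Sketch( float *dst, const float *m, const float *v, int numRows, int numColumns ) {
- for ( int i = 0; i < numColumns; i++ ) {
- float sum = 0.0f;
- for ( int j = 0; j < numRows; j++ ) {
- sum += m[j * numColumns + i] * v[j]; // column i of the matrix dotted with vec
- }
- dst[i] = sum;
- }
- }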
- /*
- ============
- idSIMD_SSE::MatX_TransposeMultiplyAddVecX
- optimizes the following matrix multiplications:
- Nx6 * Nx1
- 6xN * 6x1
- with N in the range [1-6]
- ============
- */
- void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- #define STORE1( offset, reg1, reg2 ) \
- __asm movss reg2, [eax+offset] \
- __asm addss reg2, reg1 \
- __asm movss [eax+offset], reg2
- #define STORE2LO( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm addps reg2, reg1 \
- __asm movlps [eax+offset], reg2
- #define STORE2HI( offset, reg1, reg2 ) \
- __asm movhps reg2, [eax+offset] \
- __asm addps reg2, reg1 \
- __asm movhps [eax+offset], reg2
- #define STORE4( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm movhps reg2, [eax+offset+8] \
- __asm addps reg2, reg1 \
- __asm movlps [eax+offset], reg2 \
- __asm movhps [eax+offset+8], reg2
- #define STOREC +=
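- // same dispatch as MatX_TransposeMultiplyVecX; every store first loads the destination, adds, then writes back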
- int numColumns;
- const float *mPtr, *vPtr;
- float *dstPtr;
- assert( vec.GetSize() >= mat.GetNumRows() );
- assert( dst.GetSize() >= mat.GetNumColumns() );
- mPtr = mat.ToFloatPtr();
- vPtr = vec.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- numColumns = mat.GetNumColumns();
- switch( mat.GetNumRows() ) {
- case 1:
- switch( numColumns ) {
- case 6: { // 1x6 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- STORE4( 0, xmm0, xmm2 )
- STORE2LO( 16, xmm1, xmm3 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 2:
- switch( numColumns ) {
- case 6: { // 2x6 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
- movaps xmm2, [edi]
- mulps xmm2, xmm0
- movlps xmm3, [edi+24]
- movhps xmm3, [edi+32]
- mulps xmm3, xmm1
- addps xmm2, xmm3
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm4, [edi+16]
- movhps xmm4, [edi+40]
- mulps xmm4, xmm0
- movhlps xmm3, xmm4
- addps xmm3, xmm4
- STORE4( 0, xmm2, xmm5 )
- STORE2LO( 16, xmm3, xmm6 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 3:
- switch( numColumns ) {
- case 6: { // 3x6 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movss xmm1, [esi+2*4]
- movlps xmm3, [edi+(0*6+0)*4]
- movhps xmm3, [edi+(0*6+2)*4]
- movaps xmm4, xmm0
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm4
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*6+0)*4]
- movhps xmm4, [edi+(2*6+2)*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm1
- addps xmm3, xmm4
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- movlps xmm5, [edi+(2*6+4)*4]
- mulps xmm5, xmm1
- addps xmm3, xmm5
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 4:
- switch( numColumns ) {
- case 6: { // 4x6 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*6+0)*4]
- movhps xmm4, [edi+(2*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 5:
- switch( numColumns ) {
- case 6: { // 5x6 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movss xmm2, [esi+4*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(2*6+0)*4]
- addps xmm3, xmm6
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm4, xmm2
- mulps xmm4, [edi+(4*6+0)*4]
- addps xmm3, xmm4
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- movlps xmm5, [edi+(4*6+4)*4]
- mulps xmm5, xmm2
- addps xmm3, xmm5
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 6:
- switch( numColumns ) {
- case 1: { // 6x1 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi]
- movhps xmm0, [esi+8]
- movlps xmm1, [esi+16]
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
- addps xmm0, xmm1
- movhlps xmm2, xmm0
- addss xmm2, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm2, xmm0
- STORE1( 0, xmm2, xmm3 )
- }
- return;
- }
- case 2: { // 6x2 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm6, [edi+0*4]
- mulps xmm6, xmm0
- movlps xmm1, [esi+2*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm7, [edi+4*4]
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movlps xmm2, [esi+4*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm7, [edi+8*4]
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movhlps xmm3, xmm6
- addps xmm3, xmm6
- STORE2LO( 0, xmm3, xmm7 )
- }
- return;
- }
- case 3: { // 6x3 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [edi+(0*3+2)*4]
- movhps xmm0, [edi+(0*3+0)*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm6, [esi+0*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movss xmm1, [edi+(1*3+0)*4]
- movhps xmm1, [edi+(1*3+1)*4]
- movss xmm7, [esi+1*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movss xmm2, [edi+(2*3+2)*4]
- movhps xmm2, [edi+(2*3+0)*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm7, [esi+2*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movss xmm3, [edi+(3*3+0)*4]
- movhps xmm3, [edi+(3*3+1)*4]
- movss xmm7, [esi+3*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm3
- addps xmm6, xmm7
- movss xmm4, [edi+(4*3+2)*4]
- movhps xmm4, [edi+(4*3+0)*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm7, [esi+4*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movss xmm5, [edi+(5*3+0)*4]
- movhps xmm5, [edi+(5*3+1)*4]
- movss xmm7, [esi+5*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- STORE1( 0, xmm6, xmm7 )
- STORE2HI( 4, xmm6, xmm7 )
- }
- return;
- }
- case 4: { // 6x4 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm3, [edi+(0*4+0)*4]
- movhps xmm3, [edi+(0*4+2)*4]
- movss xmm4, [esi+0*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm4
- movlps xmm5, [edi+(1*4+0)*4]
- movhps xmm5, [edi+(1*4+2)*4]
- movss xmm6, [esi+1*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*4+0)*4]
- movhps xmm4, [edi+(2*4+2)*4]
- movss xmm6, [esi+2*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(3*4+0)*4]
- movhps xmm5, [edi+(3*4+2)*4]
- movss xmm6, [esi+3*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(4*4+0)*4]
- movhps xmm4, [edi+(4*4+2)*4]
- movss xmm6, [esi+4*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(5*4+0)*4]
- movhps xmm5, [edi+(5*4+2)*4]
- movss xmm6, [esi+5*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- }
- return;
- }
- case 5: { // 6x5 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, [edi+(0*5+0)*4]
- movhps xmm6, [edi+(0*5+2)*4]
- movss xmm0, [esi+0*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movlps xmm7, [edi+(1*5+0)*4]
- movhps xmm7, [edi+(1*5+2)*4]
- movss xmm1, [esi+1*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movlps xmm7, [edi+(2*5+0)*4]
- movhps xmm7, [edi+(2*5+2)*4]
- movss xmm2, [esi+2*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movlps xmm7, [edi+(3*5+0)*4]
- movhps xmm7, [edi+(3*5+2)*4]
- movss xmm3, [esi+3*4]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm3
- addps xmm6, xmm7
- movlps xmm7, [edi+(4*5+0)*4]
- movhps xmm7, [edi+(4*5+2)*4]
- movss xmm4, [esi+4*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movlps xmm7, [edi+(5*5+0)*4]
- movhps xmm7, [edi+(5*5+2)*4]
- movss xmm5, [esi+5*4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- STORE4( 0, xmm6, xmm7 )
- movss xmm6, [edi+(0*5+4)*4]
- mulss xmm6, xmm0
- movss xmm7, [edi+(1*5+4)*4]
- mulss xmm7, xmm1
- addss xmm6, xmm7
- movss xmm7, [edi+(2*5+4)*4]
- mulss xmm7, xmm2
- addss xmm6, xmm7
- movss xmm7, [edi+(3*5+4)*4]
- mulss xmm7, xmm3
- addss xmm6, xmm7
- movss xmm7, [edi+(4*5+4)*4]
- mulss xmm7, xmm4
- addss xmm6, xmm7
- movss xmm7, [edi+(5*5+4)*4]
- mulss xmm7, xmm5
- addss xmm6, xmm7
- STORE1( 16, xmm6, xmm7 )
- }
- return;
- }
- case 6: { // 6x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movlps xmm2, [esi+4*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(2*6+0)*4]
- addps xmm3, xmm6
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm2
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(4*6+0)*4]
- addps xmm3, xmm6
- movaps xmm6, xmm2
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- movlps xmm5, [edi+(5*6+0)*4]
- movhps xmm5, [edi+(5*6+2)*4]
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movlps xmm5, [edi+(4*6+4)*4]
- movhps xmm5, [edi+(5*6+4)*4]
- mulps xmm5, xmm2
- addps xmm3, xmm5
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
- mPtr++;
- }
- return;
- }
- }
- break;
- default:
- int numRows = mat.GetNumRows();
- for ( int i = 0; i < numColumns; i++ ) {
- mPtr = mat.ToFloatPtr() + i;
- float sum = mPtr[0] * vPtr[0];
- for ( int j = 1; j < numRows; j++ ) {
- mPtr += numColumns;
- sum += mPtr[0] * vPtr[j];
- }
- dstPtr[i] STOREC sum;
- }
- break;
- }
- #undef STOREC
- #undef STORE4
- #undef STORE2HI
- #undef STORE2LO
- #undef STORE1
- }
- /*
- ============
- idSIMD_SSE::MatX_TransposeMultiplySubVecX
- optimizes the following matrix multiplications:
- Nx6 * Nx1
- 6xN * 6x1
- with N in the range [1-6]
- ============
- */
- void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
- #define STORE1( offset, reg1, reg2 ) \
- __asm movss reg2, [eax+offset] \
- __asm subss reg2, reg1 \
- __asm movss [eax+offset], reg2
- #define STORE2LO( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm subps reg2, reg1 \
- __asm movlps [eax+offset], reg2
- #define STORE2HI( offset, reg1, reg2 ) \
- __asm movhps reg2, [eax+offset] \
- __asm subps reg2, reg1 \
- __asm movhps [eax+offset], reg2
- #define STORE4( offset, reg1, reg2 ) \
- __asm movlps reg2, [eax+offset] \
- __asm movhps reg2, [eax+offset+8] \
- __asm subps reg2, reg1 \
- __asm movlps [eax+offset], reg2 \
- __asm movhps [eax+offset+8], reg2
- #define STOREC -=
- int numColumns;
- const float *mPtr, *vPtr;
- float *dstPtr;
- assert( vec.GetSize() >= mat.GetNumRows() );
- assert( dst.GetSize() >= mat.GetNumColumns() );
- mPtr = mat.ToFloatPtr();
- vPtr = vec.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- numColumns = mat.GetNumColumns();
- switch( mat.GetNumRows() ) {
- case 1:
- switch( numColumns ) {
- case 6: { // 1x6 * 1x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- STORE4( 0, xmm0, xmm2 )
- STORE2LO( 16, xmm1, xmm3 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 2:
- switch( numColumns ) {
- case 6: { // 2x6 * 2x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
- movaps xmm2, [edi]
- mulps xmm2, xmm0
- movlps xmm3, [edi+24]
- movhps xmm3, [edi+32]
- mulps xmm3, xmm1
- addps xmm2, xmm3
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm4, [edi+16]
- movhps xmm4, [edi+40]
- mulps xmm4, xmm0
- movhlps xmm3, xmm4
- addps xmm3, xmm4
- STORE4( 0, xmm2, xmm5 )
- STORE2LO( 16, xmm3, xmm6 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 3:
- switch( numColumns ) {
- case 6: { // 3x6 * 3x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movss xmm1, [esi+2*4]
- movlps xmm3, [edi+(0*6+0)*4]
- movhps xmm3, [edi+(0*6+2)*4]
- movaps xmm4, xmm0
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm4
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*6+0)*4]
- movhps xmm4, [edi+(2*6+2)*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm1
- addps xmm3, xmm4
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- movlps xmm5, [edi+(2*6+4)*4]
- mulps xmm5, xmm1
- addps xmm3, xmm5
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 4:
- switch( numColumns ) {
- case 6: { // 4x6 * 4x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*6+0)*4]
- movhps xmm4, [edi+(2*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 5:
- switch( numColumns ) {
- case 6: { // 5x6 * 5x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movss xmm2, [esi+4*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(2*6+0)*4]
- addps xmm3, xmm6
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm4, xmm2
- mulps xmm4, [edi+(4*6+0)*4]
- addps xmm3, xmm4
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- movlps xmm5, [edi+(4*6+4)*4]
- mulps xmm5, xmm2
- addps xmm3, xmm5
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
- mPtr++;
- }
- return;
- }
- }
- break;
- case 6:
- switch( numColumns ) {
- case 1: { // 6x1 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi]
- movhps xmm0, [esi+8]
- movlps xmm1, [esi+16]
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
- addps xmm0, xmm1
- movhlps xmm2, xmm0
- addss xmm2, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm2, xmm0
- STORE1( 0, xmm2, xmm3 )
- }
- return;
- }
- case 2: { // 6x2 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm6, [edi+0*4]
- mulps xmm6, xmm0
- movlps xmm1, [esi+2*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm7, [edi+4*4]
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movlps xmm2, [esi+4*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm7, [edi+8*4]
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movhlps xmm3, xmm6
- addps xmm3, xmm6
- STORE2LO( 0, xmm3, xmm7 )
- }
- return;
- }
- case 3: { // 6x3 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movss xmm0, [edi+(0*3+2)*4]
- movhps xmm0, [edi+(0*3+0)*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm6, [esi+0*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movss xmm1, [edi+(1*3+0)*4]
- movhps xmm1, [edi+(1*3+1)*4]
- movss xmm7, [esi+1*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movss xmm2, [edi+(2*3+2)*4]
- movhps xmm2, [edi+(2*3+0)*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm7, [esi+2*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movss xmm3, [edi+(3*3+0)*4]
- movhps xmm3, [edi+(3*3+1)*4]
- movss xmm7, [esi+3*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm3
- addps xmm6, xmm7
- movss xmm4, [edi+(4*3+2)*4]
- movhps xmm4, [edi+(4*3+0)*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
- movss xmm7, [esi+4*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movss xmm5, [edi+(5*3+0)*4]
- movhps xmm5, [edi+(5*3+1)*4]
- movss xmm7, [esi+5*4]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- STORE1( 0, xmm6, xmm7 )
- STORE2HI( 4, xmm6, xmm7 )
- }
- return;
- }
- case 4: { // 6x4 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm3, [edi+(0*4+0)*4]
- movhps xmm3, [edi+(0*4+2)*4]
- movss xmm4, [esi+0*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm4
- movlps xmm5, [edi+(1*4+0)*4]
- movhps xmm5, [edi+(1*4+2)*4]
- movss xmm6, [esi+1*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(2*4+0)*4]
- movhps xmm4, [edi+(2*4+2)*4]
- movss xmm6, [esi+2*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(3*4+0)*4]
- movhps xmm5, [edi+(3*4+2)*4]
- movss xmm6, [esi+3*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movlps xmm4, [edi+(4*4+0)*4]
- movhps xmm4, [edi+(4*4+2)*4]
- movss xmm6, [esi+4*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movlps xmm5, [edi+(5*4+0)*4]
- movhps xmm5, [edi+(5*4+2)*4]
- movss xmm6, [esi+5*4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- }
- return;
- }
- case 5: { // 6x5 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm6, [edi+(0*5+0)*4]
- movhps xmm6, [edi+(0*5+2)*4]
- movss xmm0, [esi+0*4]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movlps xmm7, [edi+(1*5+0)*4]
- movhps xmm7, [edi+(1*5+2)*4]
- movss xmm1, [esi+1*4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movlps xmm7, [edi+(2*5+0)*4]
- movhps xmm7, [edi+(2*5+2)*4]
- movss xmm2, [esi+2*4]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- movlps xmm7, [edi+(3*5+0)*4]
- movhps xmm7, [edi+(3*5+2)*4]
- movss xmm3, [esi+3*4]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm3
- addps xmm6, xmm7
- movlps xmm7, [edi+(4*5+0)*4]
- movhps xmm7, [edi+(4*5+2)*4]
- movss xmm4, [esi+4*4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movlps xmm7, [edi+(5*5+0)*4]
- movhps xmm7, [edi+(5*5+2)*4]
- movss xmm5, [esi+5*4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- STORE4( 0, xmm6, xmm7 )
- movss xmm6, [edi+(0*5+4)*4]
- mulss xmm6, xmm0
- movss xmm7, [edi+(1*5+4)*4]
- mulss xmm7, xmm1
- addss xmm6, xmm7
- movss xmm7, [edi+(2*5+4)*4]
- mulss xmm7, xmm2
- addss xmm6, xmm7
- movss xmm7, [edi+(3*5+4)*4]
- mulss xmm7, xmm3
- addss xmm6, xmm7
- movss xmm7, [edi+(4*5+4)*4]
- mulss xmm7, xmm4
- addss xmm6, xmm7
- movss xmm7, [edi+(5*5+4)*4]
- mulss xmm7, xmm5
- addss xmm6, xmm7
- STORE1( 16, xmm6, xmm7 )
- }
- return;
- }
- case 6: { // 6x6 * 6x1
- __asm {
- mov esi, vPtr
- mov edi, mPtr
- mov eax, dstPtr
- movlps xmm0, [esi+0*4]
- movlps xmm1, [esi+2*4]
- movlps xmm2, [esi+4*4]
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, [edi+(0*6+0)*4]
- movlps xmm5, [edi+(1*6+0)*4]
- movhps xmm5, [edi+(1*6+2)*4]
- movaps xmm6, xmm0
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(2*6+0)*4]
- addps xmm3, xmm6
- movaps xmm6, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- movlps xmm5, [edi+(3*6+0)*4]
- movhps xmm5, [edi+(3*6+2)*4]
- mulps xmm5, xmm6
- addps xmm3, xmm5
- movaps xmm6, xmm2
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, [edi+(4*6+0)*4]
- addps xmm3, xmm6
- movaps xmm6, xmm2
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- movlps xmm5, [edi+(5*6+0)*4]
- movhps xmm5, [edi+(5*6+2)*4]
- mulps xmm5, xmm6
- addps xmm3, xmm5
- STORE4( 0, xmm3, xmm7 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- movlps xmm3, [edi+(0*6+4)*4]
- movhps xmm3, [edi+(1*6+4)*4]
- mulps xmm3, xmm0
- movlps xmm4, [edi+(2*6+4)*4]
- movhps xmm4, [edi+(3*6+4)*4]
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movlps xmm5, [edi+(4*6+4)*4]
- movhps xmm5, [edi+(5*6+4)*4]
- mulps xmm5, xmm2
- addps xmm3, xmm5
- movhlps xmm4, xmm3
- addps xmm3, xmm4
- STORE2LO( 16, xmm3, xmm7 )
- }
- return;
- }
- default: {
- for ( int i = 0; i < numColumns; i++ ) {
- dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
- *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
- mPtr++;
- }
- return;
- }
- }
- break;
- default:
- int numRows = mat.GetNumRows();
- for ( int i = 0; i < numColumns; i++ ) {
- mPtr = mat.ToFloatPtr() + i;
- float sum = mPtr[0] * vPtr[0];
- for ( int j = 1; j < numRows; j++ ) {
- mPtr += numColumns;
- sum += mPtr[0] * vPtr[j];
- }
- dstPtr[i] STOREC sum;
- }
- break;
- }
- #undef STOREC
- #undef STORE4
- #undef STORE2HI
- #undef STORE2LO
- #undef STORE1
- }
- /*
- ============
- idSIMD_SSE::MatX_MultiplyMatX
- optimizes the following matrix multiplications:
- NxN * Nx6
- 6xN * Nx6
- Nx6 * 6xN
- 6x6 * 6xN
- with N in the range [1-6].
- The hot-cache clock cycle counts are generally better for the SIMD version than for
- the FPU version, at times up to 40% fewer clock cycles on a P3. In practice, however,
- the results are poor, probably due to memory access.
- ============
- */
- void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
- int i, j, k, l, n;
- float *dstPtr;
- const float *m1Ptr, *m2Ptr;
- double sum;
- assert( m1.GetNumColumns() == m2.GetNumRows() );
- dstPtr = dst.ToFloatPtr();
- m1Ptr = m1.ToFloatPtr();
- m2Ptr = m2.ToFloatPtr();
- k = m1.GetNumRows();
- l = m2.GetNumColumns();
- n = m1.GetNumColumns();
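- // k = rows of m1 and dst, l = columns of m2 and dst, n = the shared inner dimension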
- switch( n ) {
- case 1: {
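- // !(l^6) is simply a test for l == 6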
- if ( !(l^6) ) {
- switch( k ) {
- case 1: { // 1x1 * 1x6, no precision loss compared to FPU version
- __asm {
- mov esi, m2Ptr
- mov edi, m1Ptr
- mov eax, dstPtr
- movss xmm0, [edi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, [esi]
- mulps xmm1, xmm0
- movaps [eax], xmm1
- movlps xmm2, [esi+16]
- mulps xmm2, xmm0
- movlps [eax+16], xmm2
- }
- return;
- }
- case 6: { // 6x1 * 1x6, no precision loss compared to FPU version
- __asm {
- mov esi, m2Ptr
- mov edi, m1Ptr
- mov eax, dstPtr
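- // outer product: m2 (the 1x6 row) is cycled through xmm0-xmm2, each m1 component is broadcast via xmm3 and multiplied in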
- xorps xmm1, xmm1
- movaps xmm0, [esi]
- movlps xmm1, [esi+16]
- movlhps xmm1, xmm0
- movhlps xmm2, xmm0
- movlhps xmm2, xmm1
- // row 0 and 1
- movaps xmm3, [edi]
- movaps xmm4, xmm3
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm5, xmm3
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
- movaps xmm6, xmm3
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm6, xmm2
- movaps [eax], xmm4
- movaps [eax+16], xmm5
- movaps [eax+32], xmm6
- // row 2 and 3
- movaps xmm4, xmm3
- shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
- movaps xmm5, xmm3
- shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm3, xmm2
- movaps [eax+48], xmm4
- movaps [eax+64], xmm5
- movaps [eax+80], xmm3
- // row 4 and 5
- movlps xmm3, [edi+16]
- movaps xmm4, xmm3
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm5, xmm3
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm3, xmm2
- movaps [eax+96], xmm4
- movaps [eax+112], xmm5
- movaps [eax+128], xmm3
- }
- return;
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- }
- case 2: {
- if ( !(l^6) ) {
- switch( k ) {
- case 2: { // 2x2 * 2x6
- #define MUL_Nx2_2x6_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movaps xmm0, [esi] \
- __asm movlps xmm1, [esi+16] \
- __asm movhps xmm1, [esi+40] \
- __asm movlps xmm2, [esi+24] \
- __asm movhps xmm2, [esi+32]
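- // after INIT: xmm0 = m2 row 0 cols 0-3, xmm1 = cols 4,5 of rows 0 and 1, xmm2 = m2 row 1 cols 0-3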
- #define MUL_Nx2_2x6_ROW2( row ) \
- __asm movaps xmm3, [edi+row*16] \
- __asm movaps xmm5, xmm0 \
- __asm movaps xmm4, xmm3 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm4 \
- __asm movaps xmm4, xmm3 \
- __asm movaps xmm6, xmm2 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm5, xmm6 \
- __asm movaps [eax+row*48], xmm5 \
- __asm movaps xmm4, xmm3 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm movaps xmm7, xmm1 \
- __asm mulps xmm7, xmm4 \
- __asm movaps xmm4, xmm3 \
- __asm movaps xmm5, xmm0 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
- __asm mulps xmm5, xmm4 \
- __asm movaps xmm4, xmm3 \
- __asm movaps xmm6, xmm2 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm5, xmm6 \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
- __asm movaps xmm6, xmm1 \
- __asm mulps xmm6, xmm3 \
- __asm movaps xmm4, xmm7 \
- __asm movlhps xmm7, xmm6 \
- __asm movhlps xmm6, xmm4 \
- __asm addps xmm6, xmm7 \
- __asm movlps [eax+row*48+16], xmm6 \
- __asm movlps [eax+row*48+24], xmm5 \
- __asm movhps [eax+row*48+32], xmm5 \
- __asm movhps [eax+row*48+40], xmm6
- MUL_Nx2_2x6_INIT
- MUL_Nx2_2x6_ROW2( 0 )
- return;
- }
- case 6: { // 6x2 * 2x6
- MUL_Nx2_2x6_INIT
- MUL_Nx2_2x6_ROW2( 0 )
- MUL_Nx2_2x6_ROW2( 1 )
- MUL_Nx2_2x6_ROW2( 2 )
- return;
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
- m2Ptr++;
- }
- m1Ptr += 2;
- }
- break;
- }
- case 3: {
- if ( !(l^6) ) {
- switch( k ) {
- case 3: { // 3x3 * 3x6
- __asm {
- mov esi, m2Ptr
- mov edi, m1Ptr
- mov eax, dstPtr
- movaps xmm5, xmmword ptr [esi]
- movlps xmm6, qword ptr [esi+24]
- movhps xmm6, qword ptr [esi+32]
- movaps xmm7, xmmword ptr [esi+48]
- movss xmm0, dword ptr [edi]
- shufps xmm0, xmm0, 0
- mulps xmm0, xmm5
- movss xmm1, dword ptr [edi+4]
- shufps xmm1, xmm1, 0
- mulps xmm1, xmm6
- movss xmm2, dword ptr [edi+8]
- shufps xmm2, xmm2, 0
- mulps xmm2, xmm7
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps xmmword ptr [eax], xmm0
- movss xmm3, dword ptr [edi+12]
- shufps xmm3, xmm3, 0
- mulps xmm3, xmm5
- movss xmm4, dword ptr [edi+16]
- shufps xmm4, xmm4, 0
- mulps xmm4, xmm6
- movss xmm0, dword ptr [edi+20]
- shufps xmm0, xmm0, 0
- mulps xmm0, xmm7
- addps xmm3, xmm4
- addps xmm0, xmm3
- movlps qword ptr [eax+24], xmm0
- movhps qword ptr [eax+32], xmm0
- movss xmm1, dword ptr [edi+24]
- shufps xmm1, xmm1, 0
- mulps xmm1, xmm5
- movss xmm2, dword ptr [edi+28]
- shufps xmm2, xmm2, 0
- mulps xmm2, xmm6
- movss xmm3, dword ptr [edi+32]
- shufps xmm3, xmm3, 0
- mulps xmm3, xmm7
- addps xmm1, xmm2
- addps xmm1, xmm3
- movaps xmmword ptr [eax+48], xmm1
- movlps xmm5, qword ptr [esi+16]
- movlps xmm6, qword ptr [esi+40]
- movlps xmm7, qword ptr [esi+64]
- shufps xmm5, xmm5, 0x44
- shufps xmm6, xmm6, 0x44
- shufps xmm7, xmm7, 0x44
- movaps xmm3, xmmword ptr [edi]
- movlps xmm4, qword ptr [edi+16]
- movaps xmm0, xmm3
- shufps xmm0, xmm0, 0xF0
- mulps xmm0, xmm5
- movaps xmm1, xmm3
- shufps xmm1, xmm4, 0x05
- mulps xmm1, xmm6
- shufps xmm3, xmm4, 0x5A
- mulps xmm3, xmm7
- addps xmm1, xmm0
- addps xmm1, xmm3
- movlps qword ptr [eax+16], xmm1
- movhps qword ptr [eax+40], xmm1
- movss xmm0, dword ptr [edi+24]
- shufps xmm0, xmm0, 0
- mulps xmm0, xmm5
- movss xmm2, dword ptr [edi+28]
- shufps xmm2, xmm2, 0
- mulps xmm2, xmm6
- movss xmm4, dword ptr [edi+32]
- shufps xmm4, xmm4, 0
- mulps xmm4, xmm7
- addps xmm0, xmm2
- addps xmm0, xmm4
- movlps qword ptr [eax+64], xmm0
- }
- return;
- }
- case 6: { // 6x3 * 3x6
- #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movlps xmm0, [esi+ 0*4] \
- __asm movhps xmm0, [esi+ 2*4] \
- __asm movlps xmm1, [esi+ 6*4] \
- __asm movhps xmm1, [esi+ 8*4] \
- __asm movlps xmm2, [esi+12*4] \
- __asm movhps xmm2, [esi+14*4]
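- // INIT loads columns 0-3 of the three m2 rows into xmm0-xmm2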
- #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
- __asm movss xmm3, [edi+(row*3+0)*4] \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm3, xmm0 \
- __asm movss xmm4, [edi+(row*3+1)*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm4, xmm1 \
- __asm addps xmm3, xmm4 \
- __asm movss xmm5, [edi+(row*3+2)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm3, xmm5 \
- __asm movlps [eax+(row*6+0)*4], xmm3 \
- __asm movhps [eax+(row*6+2)*4], xmm3
- #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
- __asm movlps xmm0, [esi+ 4*4] \
- __asm movlps xmm1, [esi+10*4] \
- __asm movlps xmm2, [esi+16*4] \
- __asm shufps xmm0, xmm0, 0x44 \
- __asm shufps xmm1, xmm1, 0x44 \
- __asm shufps xmm2, xmm2, 0x44 \
- __asm movlps xmm3, [edi+0*4] \
- __asm movhps xmm3, [edi+2*4] \
- __asm movaps xmm4, xmm3 \
- __asm movaps xmm5, xmm3 \
- __asm shufps xmm3, xmm3, 0xF0 \
- __asm mulps xmm3, xmm0 \
- __asm movlps xmm6, [edi+4*4] \
- __asm movhps xmm6, [edi+6*4] \
- __asm shufps xmm4, xmm6, 0x05 \
- __asm mulps xmm4, xmm1 \
- __asm addps xmm3, xmm4 \
- __asm shufps xmm5, xmm6, 0x5A \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm3, xmm5 \
- __asm movlps [eax+4*4], xmm3 \
- __asm movhps [eax+10*4], xmm3 \
- __asm movaps xmm5, xmm6 \
- __asm movlps xmm3, [edi+8*4] \
- __asm movhps xmm3, [edi+10*4] \
- __asm movaps xmm4, xmm3 \
- __asm shufps xmm5, xmm3, 0x5A \
- __asm mulps xmm5, xmm0 \
- __asm shufps xmm6, xmm3, 0xAF \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm5, xmm6 \
- __asm shufps xmm4, xmm4, 0xF0 \
- __asm mulps xmm4, xmm2 \
- __asm addps xmm4, xmm5 \
- __asm movlps [eax+16*4], xmm4 \
- __asm movhps [eax+22*4], xmm4 \
- __asm movlps xmm6, [edi+12*4] \
- __asm movhps xmm6, [edi+14*4] \
- __asm movaps xmm5, xmm6 \
- __asm movaps xmm4, xmm6 \
- __asm shufps xmm6, xmm6, 0xF0 \
- __asm mulps xmm6, xmm0 \
- __asm movlps xmm3, [edi+16*4] \
- __asm shufps xmm5, xmm3, 0x05 \
- __asm mulps xmm5, xmm1 \
- __asm addps xmm5, xmm6 \
- __asm shufps xmm4, xmm3, 0x5A \
- __asm mulps xmm4, xmm2 \
- __asm addps xmm4, xmm5 \
- __asm movlps [eax+28*4], xmm4 \
- __asm movhps [eax+34*4], xmm4
- MUL_Nx3_3x6_FIRST4COLUMNS_INIT
- MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
- MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
- MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
- MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
- MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
- MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
- MUL_Nx3_3x6_LAST2COLUMNS_ROW6
- return;
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
- m2Ptr++;
- }
- m1Ptr += 3;
- }
- break;
- }
- case 4: {
- if ( !(l^6) ) {
- switch( k ) {
- case 4: { // 4x4 * 4x6
- #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movlps xmm0, [esi+ 0*4] \
- __asm movhps xmm0, [esi+ 2*4] \
- __asm movlps xmm1, [esi+ 6*4] \
- __asm movhps xmm1, [esi+ 8*4] \
- __asm movlps xmm2, [esi+12*4] \
- __asm movhps xmm2, [esi+14*4] \
- __asm movlps xmm3, [esi+18*4] \
- __asm movhps xmm3, [esi+20*4]
- #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
- __asm movss xmm4, [edi+row*16+0*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm4, xmm0 \
- __asm movss xmm5, [edi+row*16+1*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm1 \
- __asm addps xmm4, xmm5 \
- __asm movss xmm6, [edi+row*16+2*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm4, xmm6 \
- __asm movss xmm7, [edi+row*16+3*4] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm3 \
- __asm addps xmm4, xmm7 \
- __asm movlps [eax+row*24+0], xmm4 \
- __asm movhps [eax+row*24+8], xmm4
- #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
- __asm movlps xmm0, [esi+ 4*4] \
- __asm movlps xmm1, [esi+10*4] \
- __asm movlps xmm2, [esi+16*4] \
- __asm movlps xmm3, [esi+22*4] \
- __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
- #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
- __asm movlps xmm7, [edi+row*32+ 0*4] \
- __asm movhps xmm7, [edi+row*32+ 4*4] \
- __asm movaps xmm6, xmm7 \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
- __asm mulps xmm6, xmm0 \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
- __asm mulps xmm7, xmm1 \
- __asm addps xmm6, xmm7 \
- __asm movlps xmm4, [edi+row*32+ 2*4] \
- __asm movhps xmm4, [edi+row*32+ 6*4] \
- __asm movaps xmm5, xmm4 \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm6, xmm5 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
- __asm mulps xmm4, xmm3 \
- __asm addps xmm6, xmm4 \
- __asm movlps [eax+row*48+ 4*4], xmm6 \
- __asm movhps [eax+row*48+10*4], xmm6
- MUL_Nx4_4x6_FIRST4COLUMNS_INIT
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
- MUL_Nx4_4x6_LAST2COLUMNS_INIT
- MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
- MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
- return;
- }
- case 6: { // 6x4 * 4x6
- MUL_Nx4_4x6_FIRST4COLUMNS_INIT
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
- MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
- MUL_Nx4_4x6_LAST2COLUMNS_INIT
- MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
- MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
- MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
- return;
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
- m1Ptr[3] * m2Ptr[3*l];
- m2Ptr++;
- }
- m1Ptr += 4;
- }
- break;
- }
- case 5: {
- if ( !(l^6) ) {
- switch( k ) {
- case 5: { // 5x5 * 5x6
- #define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movlps xmm0, [esi+ 0*4] \
- __asm movhps xmm0, [esi+ 2*4] \
- __asm movlps xmm1, [esi+ 6*4] \
- __asm movhps xmm1, [esi+ 8*4] \
- __asm movlps xmm2, [esi+12*4] \
- __asm movhps xmm2, [esi+14*4] \
- __asm movlps xmm3, [esi+18*4] \
- __asm movhps xmm3, [esi+20*4] \
- __asm movlps xmm4, [esi+24*4] \
- __asm movhps xmm4, [esi+26*4]
- #define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
- __asm movss xmm6, [edi+row*20+0*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm0 \
- __asm movss xmm5, [edi+row*20+1*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm1 \
- __asm addps xmm6, xmm5 \
- __asm movss xmm5, [edi+row*20+2*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm6, xmm5 \
- __asm movss xmm5, [edi+row*20+3*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm3 \
- __asm addps xmm6, xmm5 \
- __asm movss xmm5, [edi+row*20+4*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm4 \
- __asm addps xmm6, xmm5 \
- __asm movlps [eax+row*24+0], xmm6 \
- __asm movhps [eax+row*24+8], xmm6
- #define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
- __asm movlps xmm0, [esi+ 4*4] \
- __asm movlps xmm1, [esi+10*4] \
- __asm movlps xmm2, [esi+16*4] \
- __asm movlps xmm3, [esi+22*4] \
- __asm movlps xmm4, [esi+28*4] \
- __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
- #define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
- __asm movlps xmm7, [edi+row*40+ 0*4] \
- __asm movhps xmm7, [edi+row*40+ 6*4] \
- __asm movaps xmm6, xmm7 \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
- __asm mulps xmm6, xmm0 \
- __asm movaps xmm5, xmm7 \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
- __asm mulps xmm5, xmm1 \
- __asm addps xmm6, xmm5 \
- __asm movlps xmm7, [edi+row*40+ 2*4] \
- __asm movhps xmm7, [edi+row*40+ 8*4] \
- __asm movaps xmm5, xmm7 \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm6, xmm5 \
- __asm movaps xmm5, xmm7 \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
- __asm mulps xmm5, xmm3 \
- __asm addps xmm6, xmm5 \
- __asm movlps xmm5, [edi+row*40+ 4*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm5, xmm4 \
- __asm addps xmm6, xmm5 \
- __asm movlps [eax+row*48+ 4*4], xmm6 \
- __asm movhps [eax+row*48+10*4], xmm6
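- // note: the loads below hardcode m1 row 4 ( edi+20*4 ); the row argument
- // only positions the store, so the macro is only valid for the final row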
- #define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \
- __asm movlps xmm6, [edi+20*4+0*4] \
- __asm unpcklps xmm6, xmm6 \
- __asm mulps xmm6, xmm0 \
- __asm movlps xmm5, [edi+20*4+2*4] \
- __asm unpcklps xmm5, xmm5 \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm6, xmm5 \
- __asm movss xmm5, [edi+20*4+4*4] \
- __asm unpcklps xmm5, xmm5 \
- __asm mulps xmm5, xmm4 \
- __asm addps xmm6, xmm5 \
- __asm movhlps xmm7, xmm6 \
- __asm addps xmm6, xmm7 \
- __asm movlps [eax+row*24+4*4], xmm6
- MUL_Nx5_5x6_FIRST4COLUMNS_INIT
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
- MUL_Nx5_5x6_LAST2COLUMNS_INIT
- MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
- MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
- MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
- return;
- }
- case 6: { // 6x5 * 5x6
- MUL_Nx5_5x6_FIRST4COLUMNS_INIT
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
- MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
- MUL_Nx5_5x6_LAST2COLUMNS_INIT
- MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
- MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
- MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
- return;
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
- m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
- m2Ptr++;
- }
- m1Ptr += 5;
- }
- break;
- }
- case 6: {
- switch( k ) {
- case 1: {
- if ( !(l^1) ) { // 1x6 * 6x1
- dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
- m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
- return;
- }
- break;
- }
- case 2: {
- if ( !(l^2) ) { // 2x6 * 6x2
- #define MUL_Nx6_6x2_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movaps xmm0, [esi] \
- __asm movaps xmm1, [esi+16] \
- __asm movaps xmm2, [esi+32]
- #define MUL_Nx6_6x2_ROW2( row ) \
- __asm movaps xmm7, [edi+row*48+0*4] \
- __asm movaps xmm6, xmm7 \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm7, xmm0 \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movaps xmm6, [edi+row*48+4*4] \
- __asm movaps xmm5, xmm6 \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
- __asm mulps xmm5, xmm0 \
- __asm movaps xmm6, [edi+row*48+24+2*4] \
- __asm movaps xmm4, xmm6 \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm5, xmm6 \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
- __asm mulps xmm4, xmm2 \
- __asm addps xmm5, xmm4 \
- __asm movaps xmm4, xmm5 \
- __asm movhlps xmm5, xmm7 \
- __asm movlhps xmm7, xmm4 \
- __asm addps xmm7, xmm5 \
- __asm movaps [eax+row*16], xmm7
- MUL_Nx6_6x2_INIT
- MUL_Nx6_6x2_ROW2( 0 )
- return;
- }
- break;
- }
- case 3: {
- if ( !(l^3) ) { // 3x6 * 6x3
- #define MUL_Nx6_6x3_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movss xmm0, [esi+ 0*4] \
- __asm movhps xmm0, [esi+ 1*4] \
- __asm movss xmm1, [esi+ 3*4] \
- __asm movhps xmm1, [esi+ 4*4] \
- __asm movss xmm2, [esi+ 6*4] \
- __asm movhps xmm2, [esi+ 7*4] \
- __asm movss xmm3, [esi+ 9*4] \
- __asm movhps xmm3, [esi+10*4] \
- __asm movss xmm4, [esi+12*4] \
- __asm movhps xmm4, [esi+13*4] \
- __asm movss xmm5, [esi+15*4] \
- __asm movhps xmm5, [esi+16*4]
- #define MUL_Nx6_6x3_ROW( row ) \
- __asm movss xmm7, [edi+row*24+0] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm0 \
- __asm movss xmm6, [edi+row*24+4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+8] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+12] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+16] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+20] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm5 \
- __asm addps xmm7, xmm6 \
- __asm movss [eax+row*12+0], xmm7 \
- __asm movhps [eax+row*12+4], xmm7
- MUL_Nx6_6x3_INIT
- MUL_Nx6_6x3_ROW( 0 )
- MUL_Nx6_6x3_ROW( 1 )
- MUL_Nx6_6x3_ROW( 2 )
- return;
- }
- break;
- }
- case 4: {
- if ( !(l^4) ) { // 4x6 * 6x4
- #define MUL_Nx6_6x4_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movaps xmm0, [esi] \
- __asm movaps xmm1, [esi+16] \
- __asm movaps xmm2, [esi+32] \
- __asm movaps xmm3, [esi+48] \
- __asm movaps xmm4, [esi+64] \
- __asm movaps xmm5, [esi+80]
- #define MUL_Nx6_6x4_ROW( row ) \
- __asm movss xmm7, [edi+row*24+0] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm0 \
- __asm movss xmm6, [edi+row*24+4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+8] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+12] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+16] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+row*24+20] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm5 \
- __asm addps xmm7, xmm6 \
- __asm movaps [eax+row*16], xmm7
- MUL_Nx6_6x4_INIT
- MUL_Nx6_6x4_ROW( 0 )
- MUL_Nx6_6x4_ROW( 1 )
- MUL_Nx6_6x4_ROW( 2 )
- MUL_Nx6_6x4_ROW( 3 )
- return;
- }
- break;
- }
- case 5: {
- if ( !(l^5) ) { // 5x6 * 6x5
- #define MUL_Nx6_6x5_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movaps xmm0, [esi] \
- __asm movlps xmm1, [esi+20] \
- __asm movhps xmm1, [esi+28] \
- __asm movlps xmm2, [esi+40] \
- __asm movhps xmm2, [esi+48] \
- __asm movlps xmm3, [esi+60] \
- __asm movhps xmm3, [esi+68] \
- __asm movaps xmm4, [esi+80] \
- __asm movlps xmm5, [esi+100] \
- __asm movhps xmm5, [esi+108]
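- // each 6x5 output row holds five floats: the first four accumulate in SSE
- // registers, while the fifth accumulates in parallel on the x87 stack
- // ( fld/fmul/faddp ) and is stored with fstp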
- #define MUL_Nx6_6x5_ROW( row ) \
- __asm movss xmm7, [edi+row*24+0] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm0 \
- __asm fld dword ptr [edi+(row*6+0)*4] \
- __asm fmul dword ptr [esi+(4+0*5)*4] \
- __asm movss xmm6, [edi+row*24+4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm fld dword ptr [edi+(row*6+1)*4] \
- __asm fmul dword ptr [esi+(4+1*5)*4] \
- __asm faddp st(1),st \
- __asm movss xmm6, [edi+row*24+8] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm fld dword ptr [edi+(row*6+2)*4] \
- __asm fmul dword ptr [esi+(4+2*5)*4] \
- __asm faddp st(1),st \
- __asm movss xmm6, [edi+row*24+12] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm fld dword ptr [edi+(row*6+3)*4] \
- __asm fmul dword ptr [esi+(4+3*5)*4] \
- __asm faddp st(1),st \
- __asm movss xmm6, [edi+row*24+16] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm7, xmm6 \
- __asm fld dword ptr [edi+(row*6+4)*4] \
- __asm fmul dword ptr [esi+(4+4*5)*4] \
- __asm faddp st(1),st \
- __asm movss xmm6, [edi+row*24+20] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm5 \
- __asm addps xmm7, xmm6 \
- __asm fld dword ptr [edi+(row*6+5)*4] \
- __asm fmul dword ptr [esi+(4+5*5)*4] \
- __asm faddp st(1),st \
- __asm fstp dword ptr [eax+(row*5+4)*4] \
- __asm movlps [eax+row*20], xmm7 \
- __asm movhps [eax+row*20+8], xmm7
- MUL_Nx6_6x5_INIT
- MUL_Nx6_6x5_ROW( 0 )
- MUL_Nx6_6x5_ROW( 1 )
- MUL_Nx6_6x5_ROW( 2 )
- MUL_Nx6_6x5_ROW( 3 )
- MUL_Nx6_6x5_ROW( 4 )
- return;
- }
- break;
- }
- case 6: {
- switch( l ) {
- case 1: { // 6x6 * 6x1
- __asm {
- mov esi, m2Ptr
- mov edi, m1Ptr
- mov eax, dstPtr
- movlps xmm7, qword ptr [esi]
- movlps xmm6, qword ptr [esi+8]
- shufps xmm7, xmm7, 0x44
- shufps xmm6, xmm6, 0x44
- movlps xmm0, qword ptr [edi ]
- movhps xmm0, qword ptr [edi+ 24]
- mulps xmm0, xmm7
- movlps xmm3, qword ptr [edi+ 8]
- movhps xmm3, qword ptr [edi+ 32]
- mulps xmm3, xmm6
- movlps xmm1, qword ptr [edi+ 48]
- movhps xmm1, qword ptr [edi+ 72]
- mulps xmm1, xmm7
- movlps xmm2, qword ptr [edi+ 96]
- movhps xmm2, qword ptr [edi+120]
- mulps xmm2, xmm7
- movlps xmm4, qword ptr [edi+ 56]
- movhps xmm4, qword ptr [edi+ 80]
- movlps xmm5, qword ptr [edi+104]
- movhps xmm5, qword ptr [edi+128]
- mulps xmm4, xmm6
- movlps xmm7, qword ptr [esi+16]
- addps xmm0, xmm3
- shufps xmm7, xmm7, 0x44
- mulps xmm5, xmm6
- addps xmm1, xmm4
- movlps xmm3, qword ptr [edi+ 16]
- movhps xmm3, qword ptr [edi+ 40]
- addps xmm2, xmm5
- movlps xmm4, qword ptr [edi+ 64]
- movhps xmm4, qword ptr [edi+ 88]
- mulps xmm3, xmm7
- movlps xmm5, qword ptr [edi+112]
- movhps xmm5, qword ptr [edi+136]
- addps xmm0, xmm3
- mulps xmm4, xmm7
- mulps xmm5, xmm7
- addps xmm1, xmm4
- addps xmm2, xmm5
- movaps xmm6, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm6, xmm1, 0xDD
- movaps xmm7, xmm2
- shufps xmm7, xmm2, 0x88
- shufps xmm2, xmm2, 0xDD
- addps xmm0, xmm6
- addps xmm2, xmm7
- movlps [eax], xmm0
- movhps [eax+8], xmm0
- movlps [eax+16], xmm2
- }
- return;
- }
- case 2: { // 6x6 * 6x2
- MUL_Nx6_6x2_INIT
- MUL_Nx6_6x2_ROW2( 0 )
- MUL_Nx6_6x2_ROW2( 1 )
- MUL_Nx6_6x2_ROW2( 2 )
- return;
- }
- case 3: { // 6x6 * 6x3
- MUL_Nx6_6x3_INIT
- MUL_Nx6_6x3_ROW( 0 )
- MUL_Nx6_6x3_ROW( 1 )
- MUL_Nx6_6x3_ROW( 2 )
- MUL_Nx6_6x3_ROW( 3 )
- MUL_Nx6_6x3_ROW( 4 )
- MUL_Nx6_6x3_ROW( 5 )
- return;
- }
- case 4: { // 6x6 * 6x4
- MUL_Nx6_6x4_INIT
- MUL_Nx6_6x4_ROW( 0 )
- MUL_Nx6_6x4_ROW( 1 )
- MUL_Nx6_6x4_ROW( 2 )
- MUL_Nx6_6x4_ROW( 3 )
- MUL_Nx6_6x4_ROW( 4 )
- MUL_Nx6_6x4_ROW( 5 )
- return;
- }
- case 5: { // 6x6 * 6x5
- MUL_Nx6_6x5_INIT
- MUL_Nx6_6x5_ROW( 0 )
- MUL_Nx6_6x5_ROW( 1 )
- MUL_Nx6_6x5_ROW( 2 )
- MUL_Nx6_6x5_ROW( 3 )
- MUL_Nx6_6x5_ROW( 4 )
- MUL_Nx6_6x5_ROW( 5 )
- return;
- }
- case 6: { // 6x6 * 6x6
- __asm {
- mov ecx, dword ptr m2Ptr
- movlps xmm3, qword ptr [ecx+72]
- mov edx, dword ptr m1Ptr
- // Loading first 4 columns (upper 4 rows) of m2Ptr.
- movaps xmm0, xmmword ptr [ecx]
- movlps xmm1, qword ptr [ecx+24]
- movhps xmm1, qword ptr [ecx+32]
- movaps xmm2, xmmword ptr [ecx+48]
- movhps xmm3, qword ptr [ecx+80]
- // Calculating first 4 elements in the first row of the destination matrix.
- movss xmm4, dword ptr [edx]
- movss xmm5, dword ptr [edx+4]
- mov eax, dword ptr dstPtr
- shufps xmm4, xmm4, 0
- movss xmm6, dword ptr [edx+8]
- shufps xmm5, xmm5, 0
- movss xmm7, dword ptr [edx+12]
- mulps xmm4, xmm0
- shufps xmm6, xmm6, 0
- shufps xmm7, xmm7, 0
- mulps xmm5, xmm1
- mulps xmm6, xmm2
- addps xmm5, xmm4
- mulps xmm7, xmm3
- addps xmm6, xmm5
- addps xmm7, xmm6
- movaps xmmword ptr [eax], xmm7
- // Calculating first 4 elements in the second row of the destination matrix.
- movss xmm4, dword ptr [edx+24]
- shufps xmm4, xmm4, 0
- mulps xmm4, xmm0
- movss xmm5, dword ptr [edx+28]
- shufps xmm5, xmm5, 0
- mulps xmm5, xmm1
- movss xmm6, dword ptr [edx+32]
- shufps xmm6, xmm6, 0
- movss xmm7, dword ptr [edx+36]
- shufps xmm7, xmm7, 0
- mulps xmm6, xmm2
- mulps xmm7, xmm3
- addps xmm7, xmm6
- addps xmm5, xmm4
- addps xmm7, xmm5
- // Calculating first 4 elements in the third row of the destination matrix.
- movss xmm4, dword ptr [edx+48]
- movss xmm5, dword ptr [edx+52]
- movlps qword ptr [eax+24], xmm7 ; save 2nd
- movhps qword ptr [eax+32], xmm7 ; row
- movss xmm6, dword ptr [edx+56]
- movss xmm7, dword ptr [edx+60]
- shufps xmm4, xmm4, 0
- shufps xmm5, xmm5, 0
- shufps xmm6, xmm6, 0
- shufps xmm7, xmm7, 0
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm6, xmm2
- mulps xmm7, xmm3
- addps xmm5, xmm4
- addps xmm7, xmm6
- addps xmm7, xmm5
- movaps xmmword ptr [eax+48], xmm7
- // Calculating first 4 elements in the fourth row of the destination matrix.
- movss xmm4, dword ptr [edx+72]
- movss xmm5, dword ptr [edx+76]
- movss xmm6, dword ptr [edx+80]
- movss xmm7, dword ptr [edx+84]
- shufps xmm4, xmm4, 0
- shufps xmm5, xmm5, 0
- shufps xmm6, xmm6, 0
- shufps xmm7, xmm7, 0
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm6, xmm2
- mulps xmm7, xmm3
- addps xmm4, xmm5
- addps xmm6, xmm4
- addps xmm7, xmm6
- movlps qword ptr [eax+72], xmm7
- movhps qword ptr [eax+80], xmm7
- // Calculating first 4 elements in the fifth row of the destination matrix.
- movss xmm4, dword ptr [edx+96]
- movss xmm5, dword ptr [edx+100]
- movss xmm6, dword ptr [edx+104]
- movss xmm7, dword ptr [edx+108]
- shufps xmm4, xmm4, 0
- shufps xmm5, xmm5, 0
- shufps xmm6, xmm6, 0
- shufps xmm7, xmm7, 0
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm6, xmm2
- mulps xmm7, xmm3
- addps xmm5, xmm4
- addps xmm7, xmm6
- addps xmm7, xmm5
- movaps xmmword ptr [eax+96], xmm7
- // Calculating first 4 elements in the sixth row of the destination matrix.
- movss xmm4, dword ptr [edx+120]
- movss xmm5, dword ptr [edx+124]
- movss xmm6, dword ptr [edx+128]
- movss xmm7, dword ptr [edx+132]
- shufps xmm4, xmm4, 0
- shufps xmm5, xmm5, 0
- shufps xmm6, xmm6, 0
- shufps xmm7, xmm7, 0
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm6, xmm2
- mulps xmm7, xmm3
- addps xmm4, xmm5
- addps xmm6, xmm4
- addps xmm7, xmm6
- movhps qword ptr [eax+128], xmm7
- movlps qword ptr [eax+120], xmm7
- // Loading first 4 columns (lower 2 rows) of m2Ptr.
- movlps xmm0, qword ptr [ecx+96]
- movhps xmm0, qword ptr [ecx+104]
- movlps xmm1, qword ptr [ecx+120]
- movhps xmm1, qword ptr [ecx+128]
- // Calculating first 4 elements in the first row of the destination matrix.
- movss xmm2, dword ptr [edx+16]
- shufps xmm2, xmm2, 0
- movss xmm4, dword ptr [edx+40]
- movss xmm3, dword ptr [edx+20]
- movss xmm5, dword ptr [edx+44]
- movaps xmm6, xmmword ptr [eax]
- movlps xmm7, qword ptr [eax+24]
- shufps xmm3, xmm3, 0
- shufps xmm5, xmm5, 0
- movhps xmm7, qword ptr [eax+32]
- shufps xmm4, xmm4, 0
- mulps xmm5, xmm1
- mulps xmm2, xmm0
- mulps xmm3, xmm1
- mulps xmm4, xmm0
- addps xmm6, xmm2
- addps xmm7, xmm4
- addps xmm7, xmm5
- addps xmm6, xmm3
- movlps qword ptr [eax+24], xmm7
- movaps xmmword ptr [eax], xmm6
- movhps qword ptr [eax+32], xmm7
- // Calculating first 4 elements in the third row of the destination matrix.
- movss xmm2, dword ptr [edx+64]
- movss xmm4, dword ptr [edx+88]
- movss xmm5, dword ptr [edx+92]
- movss xmm3, dword ptr [edx+68]
- movaps xmm6, xmmword ptr [eax+48]
- movlps xmm7, qword ptr [eax+72]
- movhps xmm7, qword ptr [eax+80]
- shufps xmm2, xmm2, 0
- shufps xmm4, xmm4, 0
- shufps xmm5, xmm5, 0
- shufps xmm3, xmm3, 0
- mulps xmm2, xmm0
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- mulps xmm3, xmm1
- addps xmm6, xmm2
- addps xmm6, xmm3
- addps xmm7, xmm4
- addps xmm7, xmm5
- movlps qword ptr [eax+72], xmm7
- movaps xmmword ptr [eax+48], xmm6
- movhps qword ptr [eax+80], xmm7
- // Calculating first 4 elements in the fifth row of the destination matrix.
- movss xmm2, dword ptr [edx+112]
- movss xmm3, dword ptr [edx+116]
- movaps xmm6, xmmword ptr [eax+96]
- shufps xmm2, xmm2, 0
- shufps xmm3, xmm3, 0
- mulps xmm2, xmm0
- mulps xmm3, xmm1
- addps xmm6, xmm2
- addps xmm6, xmm3
- movaps xmmword ptr [eax+96], xmm6
- // Calculating first 4 elements in the sixth row of the destination matrix.
- movss xmm4, dword ptr [edx+136]
- movss xmm5, dword ptr [edx+140]
- movhps xmm7, qword ptr [eax+128]
- movlps xmm7, qword ptr [eax+120]
- shufps xmm4, xmm4, 0
- shufps xmm5, xmm5, 0
- mulps xmm4, xmm0
- mulps xmm5, xmm1
- addps xmm7, xmm4
- addps xmm7, xmm5
- // Calculating last 2 columns of the destination matrix.
- movlps xmm0, qword ptr [ecx+16]
- movhps xmm0, qword ptr [ecx+40]
- movhps qword ptr [eax+128], xmm7
- movlps qword ptr [eax+120], xmm7
- movlps xmm2, qword ptr [ecx+64]
- movhps xmm2, qword ptr [ecx+88]
- movaps xmm3, xmm2
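- // shufps immediate 4Eh = R_SHUFFLEPS( 2, 3, 0, 1 ): swap low and high halves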
- shufps xmm3, xmm3, 4Eh
- movlps xmm4, qword ptr [ecx+112]
- movhps xmm4, qword ptr [ecx+136]
- movaps xmm5, xmm4
- shufps xmm5, xmm5, 4Eh
- movlps xmm6, qword ptr [edx]
- movhps xmm6, qword ptr [edx+24]
- movaps xmm7, xmm6
- shufps xmm7, xmm7, 0F0h
- mulps xmm7, xmm0
- shufps xmm6, xmm6, 0A5h
- movaps xmm1, xmm0
- shufps xmm1, xmm1, 4Eh
- mulps xmm1, xmm6
- addps xmm7, xmm1
- movlps xmm6, qword ptr [edx+8]
- movhps xmm6, qword ptr [edx+32]
- movaps xmm1, xmm6
- shufps xmm1, xmm1, 0F0h
- shufps xmm6, xmm6, 0A5h
- mulps xmm1, xmm2
- mulps xmm6, xmm3
- addps xmm7, xmm1
- addps xmm7, xmm6
- movhps xmm6, qword ptr [edx+40]
- movlps xmm6, qword ptr [edx+16]
- movaps xmm1, xmm6
- shufps xmm1, xmm1, 0F0h
- shufps xmm6, xmm6, 0A5h
- mulps xmm1, xmm4
- mulps xmm6, xmm5
- addps xmm7, xmm1
- addps xmm7, xmm6
- movlps qword ptr [eax+16], xmm7
- movhps qword ptr [eax+40], xmm7
- movlps xmm6, qword ptr [edx+48]
- movhps xmm6, qword ptr [edx+72]
- movaps xmm7, xmm6
- shufps xmm7, xmm7, 0F0h
- mulps xmm7, xmm0
- shufps xmm6, xmm6, 0A5h
- movaps xmm1, xmm0
- shufps xmm1, xmm1, 4Eh
- mulps xmm1, xmm6
- addps xmm7, xmm1
- movhps xmm6, qword ptr [edx+80]
- movlps xmm6, qword ptr [edx+56]
- movaps xmm1, xmm6
- shufps xmm1, xmm1, 0F0h
- shufps xmm6, xmm6, 0A5h
- mulps xmm1, xmm2
- mulps xmm6, xmm3
- addps xmm7, xmm1
- addps xmm7, xmm6
- movlps xmm6, qword ptr [edx+64]
- movhps xmm6, qword ptr [edx+88]
- movaps xmm1, xmm6
- shufps xmm1, xmm1, 0F0h
- shufps xmm6, xmm6, 0A5h
- mulps xmm1, xmm4
- mulps xmm6, xmm5
- addps xmm7, xmm1
- addps xmm7, xmm6
- movlps qword ptr [eax+64], xmm7
- movhps qword ptr [eax+88], xmm7
- movlps xmm6, qword ptr [edx+96]
- movhps xmm6, qword ptr [edx+120]
- movaps xmm7, xmm6
- shufps xmm7, xmm7, 0F0h
- mulps xmm7, xmm0
- shufps xmm6, xmm6, 0A5h
- movaps xmm1, xmm0
- shufps xmm1, xmm1, 4Eh
- mulps xmm1, xmm6
- addps xmm7, xmm1
- movlps xmm6, qword ptr [edx+104]
- movhps xmm6, qword ptr [edx+128]
- movaps xmm1, xmm6
- shufps xmm1, xmm1, 0F0h
- shufps xmm6, xmm6, 0A5h
- mulps xmm1, xmm2
- mulps xmm6, xmm3
- addps xmm7, xmm1
- addps xmm7, xmm6
- movlps xmm6, qword ptr [edx+112]
- movhps xmm6, qword ptr [edx+136]
- movaps xmm1, xmm6
- shufps xmm1, xmm1, 0F0h
- shufps xmm6, xmm6, 0A5h
- mulps xmm1, xmm4
- mulps xmm6, xmm5
- addps xmm7, xmm1
- addps xmm7, xmm6
- movlps qword ptr [eax+112], xmm7
- movhps qword ptr [eax+136], xmm7
- }
- return;
- }
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
- m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
- m2Ptr++;
- }
- m1Ptr += 6;
- }
- break;
- }
- default: {
- for ( i = 0; i < k; i++ ) {
- for ( j = 0; j < l; j++ ) {
- m2Ptr = m2.ToFloatPtr() + j;
- sum = m1Ptr[0] * m2Ptr[0];
- for ( n = 1; n < m1.GetNumColumns(); n++ ) {
- m2Ptr += l;
- sum += m1Ptr[n] * m2Ptr[0];
- }
- *dstPtr++ = sum;
- }
- m1Ptr += m1.GetNumColumns();
- }
- break;
- }
- }
- }
- /*
- ============
- idSIMD_SSE::MatX_TransposeMultiplyMatX
- optimizes the following transpose matrix multiplications:
- Nx6 * NxN
- 6xN * 6x6
- with N in the range [1-6].
- ============
- */
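- /*
- reference scalar form of the product computed below, dst = m1' * m2
- ( an illustrative sketch of what every optimized path must match ):
- 
- for ( i = 0; i < m1.GetNumColumns(); i++ ) {
- for ( j = 0; j < m2.GetNumColumns(); j++ ) {
- sum = 0.0f;
- for ( n = 0; n < m1.GetNumRows(); n++ ) {
- sum += m1[n][i] * m2[n][j];
- }
- dst[i][j] = sum;
- }
- }
- */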
- void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
- int i, j, k, l, n;
- float *dstPtr;
- const float *m1Ptr, *m2Ptr;
- double sum;
- assert( m1.GetNumRows() == m2.GetNumRows() );
- m1Ptr = m1.ToFloatPtr();
- m2Ptr = m2.ToFloatPtr();
- dstPtr = dst.ToFloatPtr();
- k = m1.GetNumColumns();
- l = m2.GetNumColumns();
- switch( m1.GetNumRows() ) {
- case 1:
- if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1
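- // ((k^6)|(l^1)) is zero iff k == 6 && l == 1: both XOR terms must be zero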
- __asm {
- mov esi, m2Ptr
- mov edi, m1Ptr
- mov eax, dstPtr
- movss xmm0, [esi]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm1, xmm0
- mulps xmm0, [edi]
- mulps xmm1, [edi+16]
- movaps [eax], xmm0
- movlps [eax+16], xmm1
- }
- return;
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- case 2:
- if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2
- #define MUL_2xN_2x2_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movlps xmm0, [esi] \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm movlps xmm1, [esi+8] \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
- #define MUL_2xN_2x2_ROW2( N, row ) \
- __asm movlps xmm6, [edi+(row+0*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm movlps xmm7, [edi+(row+1*N)*4] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm0 \
- __asm mulps xmm7, xmm1 \
- __asm addps xmm6, xmm7 \
- __asm movaps [eax+(row*2)*4], xmm6
- MUL_2xN_2x2_INIT
- MUL_2xN_2x2_ROW2( 6, 0 )
- MUL_2xN_2x2_ROW2( 6, 2 )
- MUL_2xN_2x2_ROW2( 6, 4 )
- return;
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- case 3:
- if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3
- #define MUL_3xN_3x3_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movss xmm0, [esi+(0*3+0)*4] \
- __asm movhps xmm0, [esi+(0*3+1)*4] \
- __asm movss xmm1, [esi+(1*3+0)*4] \
- __asm movhps xmm1, [esi+(1*3+1)*4] \
- __asm movss xmm2, [esi+(2*3+0)*4] \
- __asm movhps xmm2, [esi+(2*3+1)*4]
- #define MUL_3xN_3x3_INIT_ROW4 \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \
- __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
- #define MUL_3xN_3x3_ROW4( N, row ) \
- __asm movlps xmm3, [edi+(row+0*N+0)*4] \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \
- __asm movlps xmm4, [edi+(row+1*N+0)*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \
- __asm movlps xmm5, [edi+(row+2*N+0)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \
- __asm mulps xmm3, xmm0 \
- __asm mulps xmm4, xmm1 \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm3, xmm4 \
- __asm addps xmm3, xmm5 \
- __asm movaps [eax+(row*3+0)*4], xmm3 \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
- __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
- __asm movlps xmm3, [edi+(row+0*N+1)*4] \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm movlps xmm4, [edi+(row+1*N+1)*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm movlps xmm5, [edi+(row+2*N+1)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm3, xmm0 \
- __asm mulps xmm4, xmm1 \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm3, xmm4 \
- __asm addps xmm3, xmm5 \
- __asm movaps [eax+(row*3+4)*4], xmm3 \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
- __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
- __asm movlps xmm3, [edi+(row+0*N+2)*4] \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \
- __asm movlps xmm4, [edi+(row+1*N+2)*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \
- __asm movlps xmm5, [edi+(row+2*N+2)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \
- __asm mulps xmm3, xmm0 \
- __asm mulps xmm4, xmm1 \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm3, xmm4 \
- __asm addps xmm3, xmm5 \
- __asm movaps [eax+(row*3+8)*4], xmm3
- #define MUL_3xN_3x3_INIT_ROW4_ROW4 \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \
- __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- #define MUL_3xN_3x3_INIT_ROW4_ROW \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \
- __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
- #define MUL_3xN_3x3_ROW( N, row ) \
- __asm movss xmm3, [edi+(row+0*N)*4] \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm movss xmm4, [edi+(row+1*N)*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm movss xmm5, [edi+(row+2*N)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm3, xmm0 \
- __asm mulps xmm4, xmm1 \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm3, xmm4 \
- __asm addps xmm3, xmm5 \
- __asm movss [eax+(row*3+0)*4], xmm3 \
- __asm movhps [eax+(row*3+1)*4], xmm3
- MUL_3xN_3x3_INIT
- MUL_3xN_3x3_INIT_ROW4
- MUL_3xN_3x3_ROW4( 6, 0 )
- MUL_3xN_3x3_INIT_ROW4_ROW
- MUL_3xN_3x3_ROW( 6, 4 )
- MUL_3xN_3x3_ROW( 6, 5 )
- return;
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- case 4:
- if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4
- #define MUL_4xN_4x4_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movaps xmm0, [esi] \
- __asm movaps xmm1, [esi+16] \
- __asm movaps xmm2, [esi+32] \
- __asm movaps xmm3, [esi+48]
- #define MUL_4xN_4x4_ROW( N, row ) \
- __asm movss xmm7, [edi+(row+0*N)*4] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm0 \
- __asm movss xmm6, [edi+(row+1*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(row+2*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(row+3*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm movaps [eax+row*16], xmm7
- MUL_4xN_4x4_INIT
- MUL_4xN_4x4_ROW( 6, 0 )
- MUL_4xN_4x4_ROW( 6, 1 )
- MUL_4xN_4x4_ROW( 6, 2 )
- MUL_4xN_4x4_ROW( 6, 3 )
- MUL_4xN_4x4_ROW( 6, 4 )
- MUL_4xN_4x4_ROW( 6, 5 )
- return;
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
- m1Ptr[3*k] * m2Ptr[3*l];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- case 5:
- if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5
- #define MUL_5xN_5x5_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movlps xmm0, [esi+ 0*4] \
- __asm movhps xmm0, [esi+ 2*4] \
- __asm movlps xmm1, [esi+ 5*4] \
- __asm movhps xmm1, [esi+ 7*4] \
- __asm movlps xmm2, [esi+10*4] \
- __asm movhps xmm2, [esi+12*4] \
- __asm movlps xmm3, [esi+15*4] \
- __asm movhps xmm3, [esi+17*4] \
- __asm movlps xmm4, [esi+20*4] \
- __asm movhps xmm4, [esi+22*4]
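- // the fifth element of each output row accumulates on the x87 stack
- // ( fld/fmul/faddp ) in parallel with the SSE work, as in MUL_Nx6_6x5_ROW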
- #define MUL_5xN_5x5_ROW( N, row ) \
- __asm movss xmm6, [edi+(row+0*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm0 \
- __asm fld dword ptr [edi+(row+0*N)*4] \
- __asm fmul dword ptr [esi+ 4*4] \
- __asm movss xmm5, [edi+(row+1*N)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm1 \
- __asm addps xmm6, xmm5 \
- __asm fld dword ptr [edi+(row+1*N)*4] \
- __asm fmul dword ptr [esi+ 9*4] \
- __asm faddp st(1),st \
- __asm movss xmm5, [edi+(row+2*N)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm2 \
- __asm addps xmm6, xmm5 \
- __asm fld dword ptr [edi+(row+2*N)*4] \
- __asm fmul dword ptr [esi+14*4] \
- __asm faddp st(1),st \
- __asm movss xmm5, [edi+(row+3*N)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm3 \
- __asm addps xmm6, xmm5 \
- __asm fld dword ptr [edi+(row+3*N)*4] \
- __asm fmul dword ptr [esi+19*4] \
- __asm faddp st(1),st \
- __asm movss xmm5, [edi+(row+4*N)*4] \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm5, xmm4 \
- __asm addps xmm6, xmm5 \
- __asm fld dword ptr [edi+(row+4*N)*4] \
- __asm fmul dword ptr [esi+24*4] \
- __asm faddp st(1),st \
- __asm fstp dword ptr [eax+(row*5+4)*4] \
- __asm movlps [eax+(row*5+0)*4], xmm6 \
- __asm movhps [eax+(row*5+2)*4], xmm6
- MUL_5xN_5x5_INIT
- MUL_5xN_5x5_ROW( 6, 0 )
- MUL_5xN_5x5_ROW( 6, 1 )
- MUL_5xN_5x5_ROW( 6, 2 )
- MUL_5xN_5x5_ROW( 6, 3 )
- MUL_5xN_5x5_ROW( 6, 4 )
- MUL_5xN_5x5_ROW( 6, 5 )
- return;
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
- m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- case 6:
- if ( !(l^6) ) {
- switch( k ) {
- case 1: { // 6x1 * 6x6
- #define MUL_6xN_6x6_FIRST4COLUMNS_INIT \
- __asm mov esi, m2Ptr \
- __asm mov edi, m1Ptr \
- __asm mov eax, dstPtr \
- __asm movlps xmm0, [esi+ 0*4] \
- __asm movhps xmm0, [esi+ 2*4] \
- __asm movlps xmm1, [esi+ 6*4] \
- __asm movhps xmm1, [esi+ 8*4] \
- __asm movlps xmm2, [esi+12*4] \
- __asm movhps xmm2, [esi+14*4] \
- __asm movlps xmm3, [esi+18*4] \
- __asm movhps xmm3, [esi+20*4] \
- __asm movlps xmm4, [esi+24*4] \
- __asm movhps xmm4, [esi+26*4] \
- __asm movlps xmm5, [esi+30*4] \
- __asm movhps xmm5, [esi+32*4]
- #define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \
- __asm movss xmm7, [edi+(row+0*N)*4] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm0 \
- __asm movss xmm6, [edi+(row+1*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(row+2*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(row+3*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(row+4*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(row+5*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm5 \
- __asm addps xmm7, xmm6 \
- __asm movlps [eax+(row*6+0)*4], xmm7 \
- __asm movhps [eax+(row*6+2)*4], xmm7
- #define MUL_6xN_6x6_LAST2COLUMNS_INIT \
- __asm movlps xmm0, [esi+ 4*4] \
- __asm movlps xmm1, [esi+10*4] \
- __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm movlps xmm2, [esi+16*4] \
- __asm movlps xmm3, [esi+22*4] \
- __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm movlps xmm4, [esi+28*4] \
- __asm movlps xmm5, [esi+34*4] \
- __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
- __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
- #define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \
- __asm movlps xmm7, [edi+(row*2+0*N)*4] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm7, xmm0 \
- __asm movlps xmm6, [edi+(row*2+1*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movlps xmm6, [edi+(row*2+2*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm movlps xmm6, [edi+(row*2+3*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm movlps xmm6, [edi+(row*2+4*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm7, xmm6 \
- __asm movlps xmm6, [edi+(row*2+5*N)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
- __asm mulps xmm6, xmm5 \
- __asm addps xmm7, xmm6 \
- __asm movlps [eax+(row*12+ 4)*4], xmm7 \
- __asm movhps [eax+(row*12+10)*4], xmm7
- #define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \
- __asm movss xmm7, [edi+(1*N-1)*4] \
- __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm7, xmm0 \
- __asm movss xmm6, [edi+(2*N-1)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm1 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(3*N-1)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm2 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(4*N-1)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm3 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(5*N-1)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm4 \
- __asm addps xmm7, xmm6 \
- __asm movss xmm6, [edi+(6*N-1)*4] \
- __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
- __asm mulps xmm6, xmm5 \
- __asm addps xmm7, xmm6 \
- __asm movlps [eax+(row*6+4)*4], xmm7
- MUL_6xN_6x6_FIRST4COLUMNS_INIT
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
- MUL_6xN_6x6_LAST2COLUMNS_INIT
- MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
- return;
- }
- case 2: { // 6x2 * 6x6
- MUL_6xN_6x6_FIRST4COLUMNS_INIT
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
- MUL_6xN_6x6_LAST2COLUMNS_INIT
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
- return;
- }
- case 3: { // 6x3 * 6x6
- MUL_6xN_6x6_FIRST4COLUMNS_INIT
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
- MUL_6xN_6x6_LAST2COLUMNS_INIT
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
- MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
- return;
- }
- case 4: { // 6x4 * 6x6
- MUL_6xN_6x6_FIRST4COLUMNS_INIT
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
- MUL_6xN_6x6_LAST2COLUMNS_INIT
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
- return;
- }
- case 5: { // 6x5 * 6x6
- MUL_6xN_6x6_FIRST4COLUMNS_INIT
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
- MUL_6xN_6x6_LAST2COLUMNS_INIT
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
- MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
- return;
- }
- case 6: { // 6x6 * 6x6
- MUL_6xN_6x6_FIRST4COLUMNS_INIT
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
- MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
- MUL_6xN_6x6_LAST2COLUMNS_INIT
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
- MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
- return;
- }
- }
- }
- for ( i = 0; i < k; i++ ) {
- m2Ptr = m2.ToFloatPtr();
- for ( j = 0; j < l; j++ ) {
- *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
- m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
- m2Ptr++;
- }
- m1Ptr++;
- }
- break;
- default:
- for ( i = 0; i < k; i++ ) {
- for ( j = 0; j < l; j++ ) {
- m1Ptr = m1.ToFloatPtr() + i;
- m2Ptr = m2.ToFloatPtr() + j;
- sum = m1Ptr[0] * m2Ptr[0];
- for ( n = 1; n < m1.GetNumRows(); n++ ) {
- m1Ptr += k;
- m2Ptr += l;
- sum += m1Ptr[0] * m2Ptr[0];
- }
- *dstPtr++ = sum;
- }
- }
- break;
- }
- }
- /*
- ============
- idSIMD_SSE::MatX_LowerTriangularSolve
- solves x in Lx = b for the n * n sub-matrix of L
- if skip > 0 the first skip elements of x are assumed to be valid already
- L has to be a lower triangular matrix with (implicit) ones on the diagonal
- x == b is allowed
- ============
- */
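- /*
- scalar equivalent of the solve below ( illustrative sketch ): forward
- substitution; the unit diagonal is implicit, so no division is needed
- 
- for ( i = skip; i < n; i++ ) {
- sum = b[i];
- for ( j = 0; j < i; j++ ) {
- sum -= L[i][j] * x[j];
- }
- x[i] = sum;
- }
- */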
- void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
- int nc;
- const float *lptr;
- if ( skip >= n ) {
- return;
- }
- lptr = L.ToFloatPtr();
- nc = L.GetNumColumns();
- // unrolled cases for n < 8
- if ( n < 8 ) {
- #define NSKIP( n, s ) ((n<<3)|(s&7))
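- // NSKIP packs both values into a single switch key: n in bits 3 and up,
- // skip in the low 3 bits, letting the cases below fall through row by row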
- switch( NSKIP( n, skip ) ) {
- case NSKIP( 1, 0 ): x[0] = b[0];
- return;
- case NSKIP( 2, 0 ): x[0] = b[0];
- case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- return;
- case NSKIP( 3, 0 ): x[0] = b[0];
- case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- return;
- case NSKIP( 4, 0 ): x[0] = b[0];
- case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- return;
- case NSKIP( 5, 0 ): x[0] = b[0];
- case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- return;
- case NSKIP( 6, 0 ): x[0] = b[0];
- case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
- return;
- case NSKIP( 7, 0 ): x[0] = b[0];
- case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
- case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
- return;
- }
- return;
- }
- // process first 4 rows
- switch( skip ) {
- case 0: x[0] = b[0];
- case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
- case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- skip = 4;
- }
- lptr = L[skip];
- // this code assumes n > 4
- __asm {
- push ebx
- mov eax, skip // eax = i
- shl eax, 2 // eax = i*4
- mov edx, n // edx = n
- shl edx, 2 // edx = n*4
- mov esi, x // esi = x
- mov edi, lptr // edi = lptr
- add esi, eax
- add edi, eax
- mov ebx, b // ebx = b
- // check for aligned memory
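- // ( the OR of the byte stride and both pointers has its low 4 bits clear
- // only if all three are multiples of 16, i.e. only then is movaps safe )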
- mov ecx, nc
- shl ecx, 2
- or ecx, esi
- or ecx, edi
- and ecx, 15
- jnz loopurow
- // aligned
- looprow:
- mov ecx, eax
- neg ecx
- movaps xmm0, [esi+ecx]
- mulps xmm0, [edi+ecx]
- add ecx, 12*4
- jg donedot8
- dot8:
- movaps xmm1, [esi+ecx-(8*4)]
- mulps xmm1, [edi+ecx-(8*4)]
- addps xmm0, xmm1
- movaps xmm3, [esi+ecx-(4*4)]
- mulps xmm3, [edi+ecx-(4*4)]
- addps xmm0, xmm3
- add ecx, 8*4
- jle dot8
- donedot8:
- sub ecx, 4*4
- jg donedot4
- //dot4:
- movaps xmm1, [esi+ecx-(4*4)]
- mulps xmm1, [edi+ecx-(4*4)]
- addps xmm0, xmm1
- add ecx, 4*4
- donedot4:
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm1
- sub ecx, 4*4
- jz dot0
- add ecx, 4
- jz dot1
- add ecx, 4
- jz dot2
- //dot3:
- movss xmm1, [esi-(3*4)]
- mulss xmm1, [edi-(3*4)]
- addss xmm0, xmm1
- dot2:
- movss xmm3, [esi-(2*4)]
- mulss xmm3, [edi-(2*4)]
- addss xmm0, xmm3
- dot1:
- movss xmm5, [esi-(1*4)]
- mulss xmm5, [edi-(1*4)]
- addss xmm0, xmm5
- dot0:
- movss xmm1, [ebx+eax]
- subss xmm1, xmm0
- movss [esi], xmm1
- add eax, 4
- cmp eax, edx
- jge done
- add esi, 4
- mov ecx, nc
- shl ecx, 2
- add edi, ecx
- add edi, 4
- jmp looprow
- // unaligned
- loopurow:
- mov ecx, eax
- neg ecx
- movups xmm0, [esi+ecx]
- movups xmm1, [edi+ecx]
- mulps xmm0, xmm1
- add ecx, 12*4
- jg doneudot8
- udot8:
- movups xmm1, [esi+ecx-(8*4)]
- movups xmm2, [edi+ecx-(8*4)]
- mulps xmm1, xmm2
- addps xmm0, xmm1
- movups xmm3, [esi+ecx-(4*4)]
- movups xmm4, [edi+ecx-(4*4)]
- mulps xmm3, xmm4
- addps xmm0, xmm3
- add ecx, 8*4
- jle udot8
- doneudot8:
- sub ecx, 4*4
- jg doneudot4
- //udot4:
- movups xmm1, [esi+ecx-(4*4)]
- movups xmm2, [edi+ecx-(4*4)]
- mulps xmm1, xmm2
- addps xmm0, xmm1
- add ecx, 4*4
- doneudot4:
- movhlps xmm1, xmm0
- addps xmm0, xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm1
- sub ecx, 4*4
- jz udot0
- add ecx, 4
- jz udot1
- add ecx, 4
- jz udot2
- //udot3:
- movss xmm1, [esi-(3*4)]
- movss xmm2, [edi-(3*4)]
- mulss xmm1, xmm2
- addss xmm0, xmm1
- udot2:
- movss xmm3, [esi-(2*4)]
- movss xmm4, [edi-(2*4)]
- mulss xmm3, xmm4
- addss xmm0, xmm3
- udot1:
- movss xmm5, [esi-(1*4)]
- movss xmm6, [edi-(1*4)]
- mulss xmm5, xmm6
- addss xmm0, xmm5
- udot0:
- movss xmm1, [ebx+eax]
- subss xmm1, xmm0
- movss [esi], xmm1
- add eax, 4
- cmp eax, edx
- jge done
- add esi, 4
- mov ecx, nc
- shl ecx, 2
- add edi, ecx
- add edi, 4
- jmp loopurow
- done:
- pop ebx
- }
- }
- /*
- ============
- idSIMD_SSE::MatX_LowerTriangularSolveTranspose
- solves x in L'x = b for the n * n sub-matrix of L
- L has to be a lower triangular matrix with (implicit) ones on the diagonal
- x == b is allowed
- ============
- */
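- /*
- scalar equivalent of the solve below ( illustrative sketch ): back
- substitution, reading column i of L as row i of L'
- 
- for ( i = n - 1; i >= 0; i-- ) {
- sum = b[i];
- for ( j = i + 1; j < n; j++ ) {
- sum -= L[j][i] * x[j];
- }
- x[i] = sum;
- }
- */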
- void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
- int nc;
- const float *lptr;
- lptr = L.ToFloatPtr();
- nc = L.GetNumColumns();
- // unrolled cases for n < 8
- if ( n < 8 ) {
- switch( n ) {
- case 0:
- return;
- case 1:
- x[0] = b[0];
- return;
- case 2:
- x[1] = b[1];
- x[0] = b[0] - lptr[1*nc+0] * x[1];
- return;
- case 3:
- x[2] = b[2];
- x[1] = b[1] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 4:
- x[3] = b[3];
- x[2] = b[2] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 5:
- x[4] = b[4];
- x[3] = b[3] - lptr[4*nc+3] * x[4];
- x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 6:
- x[5] = b[5];
- x[4] = b[4] - lptr[5*nc+4] * x[5];
- x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
- x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 7:
- x[6] = b[6];
- x[5] = b[5] - lptr[6*nc+5] * x[6];
- x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
- x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
- x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- }
- return;
- }
- #if 1
- int i, j, m;
- float *xptr;
- double s0;
- // if the number of columns is not a multiple of 2 we're screwed for alignment.
- // however, if the number of columns is a multiple of 2 but the number of rows
- // to be processed is not a multiple of 2, we can still run 8-byte aligned
- m = n;
- if ( m & 1 ) {
- m--;
- x[m] = b[m];
- lptr = L.ToFloatPtr() + m * nc + m - 4;
- xptr = x + m;
- __asm {
- push ebx
- mov eax, m // eax = i
- mov esi, xptr // esi = xptr
- mov edi, lptr // edi = lptr
- mov ebx, b // ebx = b
- mov edx, nc // edx = nc*sizeof(float)
- shl edx, 2
- process4rows_1:
- movlps xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
- movhps xmm0, [ebx+eax*4-8] // load b[i-2], b[i-1]
- xor ecx, ecx
- sub eax, m
- neg eax
- jz done4x4_1
- process4x4_1: // process 4x4 blocks
- movlps xmm2, [edi+0]
- movhps xmm2, [edi+8]
- add edi, edx
- movss xmm1, [esi+4*ecx+0]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm3, [edi+0]
- movhps xmm3, [edi+8]
- add edi, edx
- mulps xmm1, xmm2
- subps xmm0, xmm1
- movss xmm1, [esi+4*ecx+4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm4, [edi+0]
- movhps xmm4, [edi+8]
- add edi, edx
- mulps xmm1, xmm3
- subps xmm0, xmm1
- movss xmm1, [esi+4*ecx+8]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm5, [edi+0]
- movhps xmm5, [edi+8]
- add edi, edx
- mulps xmm1, xmm4
- subps xmm0, xmm1
- movss xmm1, [esi+4*ecx+12]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- add ecx, 4
- cmp ecx, eax
- mulps xmm1, xmm5
- subps xmm0, xmm1
- jl process4x4_1
- done4x4_1: // process the leftover part of the 4 rows
- movlps xmm2, [edi+0]
- movhps xmm2, [edi+8]
- movss xmm1, [esi+4*ecx]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm1, xmm2
- subps xmm0, xmm1
- imul ecx, edx
- sub edi, ecx
- neg eax
- add eax, m
- sub eax, 4
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
- movaps xmm2, xmm0
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
- sub edi, edx
- movss [esi-4], xmm3 // xptr[-1] = s3
- movss xmm4, xmm3
- movss xmm5, xmm3
- mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
- mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
- mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
- subss xmm2, xmm3
- movss [esi-8], xmm2 // xptr[-2] = s2
- movss xmm6, xmm2
- sub edi, edx
- subss xmm0, xmm5
- subss xmm1, xmm4
- mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
- mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
- subss xmm1, xmm2
- movss [esi-12], xmm1 // xptr[-3] = s1
- subss xmm0, xmm6
- sub edi, edx
- cmp eax, 4
- mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
- subss xmm0, xmm1
- movss [esi-16], xmm0 // xptr[-4] = s0
- jl done4rows_1
- sub edi, edx
- sub edi, 16
- sub esi, 16
- jmp process4rows_1
- done4rows_1:
- pop ebx
- }
- } else {
- lptr = L.ToFloatPtr() + m * nc + m - 4;
- xptr = x + m;
- __asm {
- push ebx
- mov eax, m // eax = i
- mov esi, xptr // esi = xptr
- mov edi, lptr // edi = lptr
- mov ebx, b // ebx = b
- mov edx, nc // edx = nc*sizeof(float)
- shl edx, 2
- process4rows:
- movlps xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
- movhps xmm0, [ebx+eax*4-8] // load b[i-2], b[i-1]
- sub eax, m
- jz done4x4
- neg eax
- xor ecx, ecx
- process4x4: // process 4x4 blocks
- movlps xmm2, [edi+0]
- movhps xmm2, [edi+8]
- add edi, edx
- movss xmm1, [esi+4*ecx+0]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm3, [edi+0]
- movhps xmm3, [edi+8]
- add edi, edx
- mulps xmm1, xmm2
- subps xmm0, xmm1
- movss xmm1, [esi+4*ecx+4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm4, [edi+0]
- movhps xmm4, [edi+8]
- add edi, edx
- mulps xmm1, xmm3
- subps xmm0, xmm1
- movss xmm1, [esi+4*ecx+8]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps xmm5, [edi+0]
- movhps xmm5, [edi+8]
- add edi, edx
- mulps xmm1, xmm4
- subps xmm0, xmm1
- movss xmm1, [esi+4*ecx+12]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- add ecx, 4
- cmp ecx, eax
- mulps xmm1, xmm5
- subps xmm0, xmm1
- jl process4x4
- imul ecx, edx
- sub edi, ecx
- neg eax
- done4x4: // process the leftover part of the 4 rows
- add eax, m
- sub eax, 4
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
- movaps xmm2, xmm0
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
- movaps xmm3, xmm0
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
- sub edi, edx
- movss [esi-4], xmm3 // xptr[-1] = s3
- movss xmm4, xmm3
- movss xmm5, xmm3
- mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
- mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
- mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
- subss xmm2, xmm3
- movss [esi-8], xmm2 // xptr[-2] = s2
- movss xmm6, xmm2
- sub edi, edx
- subss xmm0, xmm5
- subss xmm1, xmm4
- mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
- mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
- subss xmm1, xmm2
- movss [esi-12], xmm1 // xptr[-3] = s1
- subss xmm0, xmm6
- sub edi, edx
- cmp eax, 4
- mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
- subss xmm0, xmm1
- movss [esi-16], xmm0 // xptr[-4] = s0
- jl done4rows
- sub edi, edx
- sub edi, 16
- sub esi, 16
- jmp process4rows
- done4rows:
- pop ebx
- }
- }
- // process left over rows
- for ( i = (m&3)-1; i >= 0; i-- ) {
- s0 = b[i];
- lptr = L[0] + i;
- for ( j = i + 1; j < n; j++ ) {
- s0 -= lptr[j*nc] * x[j];
- }
- x[i] = s0;
- }
- #else
- int i, j, m;
- double s0, s1, s2, s3, t;
- const float *lptr2;
- float *xptr, *xptr2;
- m = n;
- if ( m & 1 ) {
- m--;
- x[m] = b[m];
- lptr = L.ToFloatPtr() + m * nc + m - 4;
- xptr = x + m;
- // process 4 rows at a time
- for ( i = m; i >= 4; i -= 4 ) {
- s0 = b[i-4];
- s1 = b[i-3];
- s2 = b[i-2];
- s3 = b[i-1];
- // process 4x4 blocks
- xptr2 = xptr; // x + i;
- lptr2 = lptr; // ptr = L[i] + i - 4;
- for ( j = 0; j < m-i; j += 4 ) {
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- }
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- // process left over of the 4 rows
- lptr -= nc;
- s0 -= lptr[0] * s3;
- s1 -= lptr[1] * s3;
- s2 -= lptr[2] * s3;
- lptr -= nc;
- s0 -= lptr[0] * s2;
- s1 -= lptr[1] * s2;
- lptr -= nc;
- s0 -= lptr[0] * s1;
- lptr -= nc;
- // store result
- xptr[-4] = s0;
- xptr[-3] = s1;
- xptr[-2] = s2;
- xptr[-1] = s3;
- // update pointers for next four rows
- lptr -= 4;
- xptr -= 4;
- }
- } else {
- lptr = L.ToFloatPtr() + m * nc + m - 4;
- xptr = x + m;
- // process 4 rows at a time
- for ( i = m; i >= 4; i -= 4 ) {
- s0 = b[i-4];
- s1 = b[i-3];
- s2 = b[i-2];
- s3 = b[i-1];
- // process 4x4 blocks
- xptr2 = xptr; // x + i;
- lptr2 = lptr; // ptr = L[i] + i - 4;
- for ( j = 0; j < m-i; j += 4 ) {
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- t = xptr2[0];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- xptr2++;
- }
- // process left over of the 4 rows
- lptr -= nc;
- s0 -= lptr[0] * s3;
- s1 -= lptr[1] * s3;
- s2 -= lptr[2] * s3;
- lptr -= nc;
- s0 -= lptr[0] * s2;
- s1 -= lptr[1] * s2;
- lptr -= nc;
- s0 -= lptr[0] * s1;
- lptr -= nc;
- // store result
- xptr[-4] = s0;
- xptr[-3] = s1;
- xptr[-2] = s2;
- xptr[-1] = s3;
- // update pointers for next four rows
- lptr -= 4;
- xptr -= 4;
- }
- }
- // process left over rows
- for ( i--; i >= 0; i-- ) {
- s0 = b[i];
- lptr = L[0] + i;
- for ( j = i + 1; j < n; j++ ) { // j runs to n so the odd row (when n is odd) is included
- s0 -= lptr[j*nc] * x[j];
- }
- x[i] = s0;
- }
- #endif
- }
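- // A minimal scalar sketch of the recurrence the unrolled paths above
- // evaluate: back substitution on the transpose of L. Hypothetical
- // reference helper, not part of the original file; assumes L has a
- // unit diagonal and nc is the row stride of L, as in the code above.
- static void LowerTriangularSolveTransposeRef( const float *L, float *x, const float *b, int n, int nc ) {
- for ( int i = n - 1; i >= 0; i-- ) {
- double s = b[i];
- // column i of L couples x[i] to the already solved x[j], j > i
- for ( int j = i + 1; j < n; j++ ) {
- s -= L[j*nc+i] * x[j];
- }
- x[i] = (float) s;
- }
- }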
- /*
- ============
- idSIMD_SSE::MatX_LDLTFactor
- in-place LDL' factorization of the n * n sub-matrix of mat
- the reciprocals of the diagonal elements are stored in invDiag
- currently assumes the number of columns of mat is a multiple of 4
- (a scalar sketch of one factorization step follows this function)
- ============
- */
- bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
- #if 1
- int j, nc;
- float *v, *diag, *invDiagPtr, *mptr;
- double s0, s1, s2, sum, d;
- v = (float *) _alloca16( n * sizeof( float ) );
- diag = (float *) _alloca16( n * sizeof( float ) );
- invDiagPtr = invDiag.ToFloatPtr();
- nc = mat.GetNumColumns();
- assert( ( nc & 3 ) == 0 );
- if ( n <= 0 ) {
- return true;
- }
- mptr = mat[0];
- sum = mptr[0];
- if ( sum == 0.0f ) {
- return false;
- }
- diag[0] = sum;
- invDiagPtr[0] = d = 1.0f / sum;
- if ( n <= 1 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 1; j < n; j++ ) {
- mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
- }
- mptr = mat[1];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- sum = mptr[1] - s0;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[1][1] = sum;
- diag[1] = sum;
- invDiagPtr[1] = d = 1.0f / sum;
- if ( n <= 2 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 2; j < n; j++ ) {
- mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
- }
- mptr = mat[2];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- sum = mptr[2] - s0 - s1;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[2][2] = sum;
- diag[2] = sum;
- invDiagPtr[2] = d = 1.0f / sum;
- if ( n <= 3 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 3; j < n; j++ ) {
- mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
- }
- mptr = mat[3];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
- sum = mptr[3] - s0 - s1 - s2;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[3][3] = sum;
- diag[3] = sum;
- invDiagPtr[3] = d = 1.0f / sum;
- if ( n <= 4 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 4; j < n; j++ ) {
- mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
- }
- int ncf = nc * sizeof( float );
- mptr = mat[0];
- __asm {
- xorps xmm2, xmm2
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- push ebx
- mov ebx, 4
- loopRow:
- cmp ebx, n
- jge done
- mov ecx, ebx // ecx = i
- shl ecx, 2 // ecx = i * 4
- mov edx, diag // edx = diag
- add edx, ecx // edx = &diag[i]
- mov edi, ebx // edi = i
- imul edi, ncf // edi = i * nc * sizeof( float )
- add edi, mptr // edi = mat[i]
- add edi, ecx // edi = &mat[i][i]
- mov esi, v // esi = v
- add esi, ecx // esi = &v[i]
- mov eax, invDiagPtr // eax = invDiagPtr
- add eax, ecx // eax = &invDiagPtr[i]
- neg ecx
- movaps xmm0, [edx+ecx]
- mulps xmm0, [edi+ecx]
- movaps [esi+ecx], xmm0
- mulps xmm0, [edi+ecx]
- add ecx, 12*4
- jg doneDot8
- dot8:
- movaps xmm1, [edx+ecx-(8*4)]
- mulps xmm1, [edi+ecx-(8*4)]
- movaps [esi+ecx-(8*4)], xmm1
- mulps xmm1, [edi+ecx-(8*4)]
- addps xmm0, xmm1
- movaps xmm2, [edx+ecx-(4*4)]
- mulps xmm2, [edi+ecx-(4*4)]
- movaps [esi+ecx-(4*4)], xmm2
- mulps xmm2, [edi+ecx-(4*4)]
- addps xmm0, xmm2
- add ecx, 8*4
- jle dot8
- doneDot8:
- sub ecx, 4*4
- jg doneDot4
- movaps xmm1, [edx+ecx-(4*4)]
- mulps xmm1, [edi+ecx-(4*4)]
- movaps [esi+ecx-(4*4)], xmm1
- mulps xmm1, [edi+ecx-(4*4)]
- addps xmm0, xmm1
- add ecx, 4*4
- doneDot4:
- sub ecx, 2*4
- jg doneDot2
- movlps xmm3, [edx+ecx-(2*4)]
- movlps xmm4, [edi+ecx-(2*4)]
- mulps xmm3, xmm4
- movlps [esi+ecx-(2*4)], xmm3
- mulps xmm3, xmm4
- addps xmm0, xmm3
- add ecx, 2*4
- doneDot2:
- sub ecx, 1*4
- jg doneDot1
- movss xmm3, [edx+ecx-(1*4)]
- movss xmm4, [edi+ecx-(1*4)]
- mulss xmm3, xmm4
- movss [esi+ecx-(1*4)], xmm3
- mulss xmm3, xmm4
- addss xmm0, xmm3
- doneDot1:
- movhlps xmm2, xmm0
- addps xmm0, xmm2
- movaps xmm2, xmm0
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm2
- movss xmm1, [edi]
- subss xmm1, xmm0
- movss [edi], xmm1 // mptr[i] = sum;
- movss [edx], xmm1 // diag[i] = sum;
- // if ( sum == 0.0f ) return false;
- movaps xmm2, xmm1
- cmpeqss xmm2, SIMD_SP_zero
- andps xmm2, SIMD_SP_tiny
- orps xmm1, xmm2
- rcpss xmm7, xmm1
- mulss xmm1, xmm7
- mulss xmm1, xmm7
- addss xmm7, xmm7
- subss xmm7, xmm1
- movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum;
- mov edx, n // edx = n
- sub edx, ebx // edx = n - i
- dec edx // edx = n - i - 1
- jle doneSubRow // if ( i + 1 >= n ) return true;
- mov eax, ebx // eax = i
- shl eax, 2 // eax = i * 4
- neg eax
- loopSubRow:
- add edi, ncf
- mov ecx, eax
- movaps xmm0, [esi+ecx]
- mulps xmm0, [edi+ecx]
- add ecx, 12*4
- jg doneSubDot8
- subDot8:
- movaps xmm1, [esi+ecx-(8*4)]
- mulps xmm1, [edi+ecx-(8*4)]
- addps xmm0, xmm1
- movaps xmm2, [esi+ecx-(4*4)]
- mulps xmm2, [edi+ecx-(4*4)]
- addps xmm0, xmm2
- add ecx, 8*4
- jle subDot8
- doneSubDot8:
- sub ecx, 4*4
- jg doneSubDot4
- movaps xmm1, [esi+ecx-(4*4)]
- mulps xmm1, [edi+ecx-(4*4)]
- addps xmm0, xmm1
- add ecx, 4*4
- doneSubDot4:
- sub ecx, 2*4
- jg doneSubDot2
- movlps xmm3, [esi+ecx-(2*4)]
- movlps xmm4, [edi+ecx-(2*4)]
- mulps xmm3, xmm4
- addps xmm0, xmm3
- add ecx, 2*4
- doneSubDot2:
- sub ecx, 1*4
- jg doneSubDot1
- movss xmm3, [esi+ecx-(1*4)]
- movss xmm4, [edi+ecx-(1*4)]
- mulss xmm3, xmm4
- addss xmm0, xmm3
- doneSubDot1:
- movhlps xmm2, xmm0
- addps xmm0, xmm2
- movaps xmm2, xmm0
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
- addss xmm0, xmm2
- movss xmm1, [edi]
- subss xmm1, xmm0
- mulss xmm1, xmm7
- movss [edi], xmm1
- dec edx
- jg loopSubRow
- doneSubRow:
- inc ebx
- jmp loopRow
- done:
- pop ebx
- }
- return true;
- #else
- int i, j, k, nc;
- float *v, *diag, *mptr;
- double s0, s1, s2, s3, sum, d;
- v = (float *) _alloca16( n * sizeof( float ) );
- diag = (float *) _alloca16( n * sizeof( float ) );
- nc = mat.GetNumColumns();
- if ( n <= 0 ) {
- return true;
- }
- mptr = mat[0];
- sum = mptr[0];
- if ( sum == 0.0f ) {
- return false;
- }
- diag[0] = sum;
- invDiag[0] = d = 1.0f / sum;
- if ( n <= 1 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 1; j < n; j++ ) {
- mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
- }
- mptr = mat[1];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- sum = mptr[1] - s0;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[1][1] = sum;
- diag[1] = sum;
- invDiag[1] = d = 1.0f / sum;
- if ( n <= 2 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 2; j < n; j++ ) {
- mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
- }
- mptr = mat[2];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- sum = mptr[2] - s0 - s1;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[2][2] = sum;
- diag[2] = sum;
- invDiag[2] = d = 1.0f / sum;
- if ( n <= 3 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 3; j < n; j++ ) {
- mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
- }
- mptr = mat[3];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
- sum = mptr[3] - s0 - s1 - s2;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[3][3] = sum;
- diag[3] = sum;
- invDiag[3] = d = 1.0f / sum;
- if ( n <= 4 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 4; j < n; j++ ) {
- mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
- }
- for ( i = 4; i < n; i++ ) {
- mptr = mat[i];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
- v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
- for ( k = 4; k < i-3; k += 4 ) {
- v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
- v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
- v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
- v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
- }
- switch( i - k ) {
- case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
- case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
- case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
- }
- sum = s3;
- sum += s2;
- sum += s1;
- sum += s0;
- sum = mptr[i] - sum;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[i][i] = sum;
- diag[i] = sum;
- invDiag[i] = d = 1.0f / sum;
- if ( i + 1 >= n ) {
- return true;
- }
- mptr = mat[i+1];
- for ( j = i+1; j < n; j++ ) {
- s0 = mptr[0] * v[0];
- s1 = mptr[1] * v[1];
- s2 = mptr[2] * v[2];
- s3 = mptr[3] * v[3];
- for ( k = 4; k < i-7; k += 8 ) {
- s0 += mptr[k+0] * v[k+0];
- s1 += mptr[k+1] * v[k+1];
- s2 += mptr[k+2] * v[k+2];
- s3 += mptr[k+3] * v[k+3];
- s0 += mptr[k+4] * v[k+4];
- s1 += mptr[k+5] * v[k+5];
- s2 += mptr[k+6] * v[k+6];
- s3 += mptr[k+7] * v[k+7];
- }
- switch( i - k ) {
- case 7: s0 += mptr[k+6] * v[k+6];
- case 6: s1 += mptr[k+5] * v[k+5];
- case 5: s2 += mptr[k+4] * v[k+4];
- case 4: s3 += mptr[k+3] * v[k+3];
- case 3: s0 += mptr[k+2] * v[k+2];
- case 2: s1 += mptr[k+1] * v[k+1];
- case 1: s2 += mptr[k+0] * v[k+0];
- }
- sum = s3;
- sum += s2;
- sum += s1;
- sum += s0;
- mptr[i] = ( mptr[i] - sum ) * d;
- mptr += nc;
- }
- }
- return true;
- #endif
- }
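- // A scalar sketch of one step of the LDL' recurrence that the dot-product
- // loops above unroll. Hypothetical reference helper, not part of the
- // original file; mat is row-major with row stride nc, and diag/v/invDiag
- // are the same scratch and output arrays used above.
- static bool LDLTFactorStepRef( float *mat, float *invDiag, float *diag, float *v, int i, int n, int nc ) {
- float *rowI = mat + i * nc;
- double sum = rowI[i];
- for ( int k = 0; k < i; k++ ) {
- v[k] = diag[k] * rowI[k]; // v[k] = D[k] * L[i][k]
- sum -= v[k] * rowI[k]; // sum = mat[i][i] - sum of D[k] * L[i][k]^2
- }
- if ( sum == 0.0f ) {
- return false; // zero pivot, factorization fails
- }
- rowI[i] = diag[i] = (float) sum;
- invDiag[i] = (float) ( 1.0 / sum );
- // update column i of the rows below using the cached v[k]
- for ( int j = i + 1; j < n; j++ ) {
- float *rowJ = mat + j * nc;
- double s = rowJ[i];
- for ( int k = 0; k < i; k++ ) {
- s -= rowJ[k] * v[k];
- }
- rowJ[i] = (float) ( s * invDiag[i] );
- }
- return true;
- }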
- /*
- ============
- idSIMD_SSE::BlendJoints
- ============
- */
- #define REFINE_BLENDJOINTS_RECIPROCAL
- void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
- int i;
- if ( lerp <= 0.0f ) {
- return;
- } else if ( lerp >= 1.0f ) {
- for ( i = 0; i < numJoints; i++ ) {
- int j = index[i];
- joints[j] = blendJoints[j];
- }
- return;
- }
- for ( i = 0; i <= numJoints - 4; i += 4 ) {
- ALIGN16( float jointVert0[4] );
- ALIGN16( float jointVert1[4] );
- ALIGN16( float jointVert2[4] );
- ALIGN16( float blendVert0[4] );
- ALIGN16( float blendVert1[4] );
- ALIGN16( float blendVert2[4] );
- ALIGN16( float jointQuat0[4] );
- ALIGN16( float jointQuat1[4] );
- ALIGN16( float jointQuat2[4] );
- ALIGN16( float jointQuat3[4] );
- ALIGN16( float blendQuat0[4] );
- ALIGN16( float blendQuat1[4] );
- ALIGN16( float blendQuat2[4] );
- ALIGN16( float blendQuat3[4] );
- for ( int j = 0; j < 4; j++ ) {
- int n = index[i+j];
- jointVert0[j] = joints[n].t[0];
- jointVert1[j] = joints[n].t[1];
- jointVert2[j] = joints[n].t[2];
- blendVert0[j] = blendJoints[n].t[0];
- blendVert1[j] = blendJoints[n].t[1];
- blendVert2[j] = blendJoints[n].t[2];
- jointQuat0[j] = joints[n].q[0];
- jointQuat1[j] = joints[n].q[1];
- jointQuat2[j] = joints[n].q[2];
- jointQuat3[j] = joints[n].q[3];
- blendQuat0[j] = blendJoints[n].q[0];
- blendQuat1[j] = blendJoints[n].q[1];
- blendQuat2[j] = blendJoints[n].q[2];
- blendQuat3[j] = blendJoints[n].q[3];
- }
- #if 1
- __asm {
- // lerp translation
- movss xmm7, lerp
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- movaps xmm0, blendVert0
- subps xmm0, jointVert0
- mulps xmm0, xmm7
- addps xmm0, jointVert0
- movaps jointVert0, xmm0
- movaps xmm1, blendVert1
- subps xmm1, jointVert1
- mulps xmm1, xmm7
- addps xmm1, jointVert1
- movaps jointVert1, xmm1
- movaps xmm2, blendVert2
- subps xmm2, jointVert2
- mulps xmm2, xmm7
- addps xmm2, jointVert2
- movaps jointVert2, xmm2
- // lerp quaternions
- movaps xmm0, jointQuat0
- mulps xmm0, blendQuat0
- movaps xmm1, jointQuat1
- mulps xmm1, blendQuat1
- addps xmm0, xmm1
- movaps xmm2, jointQuat2
- mulps xmm2, blendQuat2
- addps xmm0, xmm2
- movaps xmm3, jointQuat3
- mulps xmm3, blendQuat3
- addps xmm0, xmm3 // xmm0 = cosom
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- andps xmm1, SIMD_SP_signBitMask // xmm1 = signBit
- xorps xmm0, xmm1
- mulps xmm2, xmm2
- xorps xmm4, xmm4
- movaps xmm3, SIMD_SP_one
- subps xmm3, xmm2 // xmm3 = scale0
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #ifdef REFINE_BLENDJOINTS_RECIPROCAL
- movaps xmm2, xmm3
- rsqrtps xmm4, xmm2
- mulps xmm2, xmm4
- mulps xmm2, xmm4
- subps xmm2, SIMD_SP_rsqrt_c0
- mulps xmm4, SIMD_SP_rsqrt_c1
- mulps xmm2, xmm4
- #else
- rsqrtps xmm2, xmm3 // xmm2 = sinom
- #endif
- mulps xmm3, xmm2 // xmm3 = sqrt( scale0 )
- // omega0 = atan2( xmm3, xmm0 )
- movaps xmm4, xmm0
- minps xmm0, xmm3
- maxps xmm3, xmm4
- cmpeqps xmm4, xmm0
- #ifdef REFINE_BLENDJOINTS_RECIPROCAL
- rcpps xmm5, xmm3
- mulps xmm3, xmm5
- mulps xmm3, xmm5
- addps xmm5, xmm5
- subps xmm5, xmm3 // xmm5 = 1 / y or 1 / x
- mulps xmm0, xmm5 // xmm0 = x / y or y / x
- #else
- rcpps xmm3, xmm3 // xmm3 = 1 / y or 1 / x
- mulps xmm0, xmm3 // xmm0 = x / y or y / x
- #endif
- movaps xmm3, xmm4
- andps xmm3, SIMD_SP_signBitMask
- xorps xmm0, xmm3 // xmm0 = -x / y or y / x
- andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f
- movaps xmm3, xmm0
- mulps xmm3, xmm3 // xmm3 = s
- movaps xmm5, SIMD_SP_atan_c0
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c1
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c2
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c3
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c4
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c5
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c6
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_atan_c7
- mulps xmm5, xmm3
- addps xmm5, SIMD_SP_one
- mulps xmm5, xmm0
- addps xmm5, xmm4 // xmm5 = omega0
- movaps xmm6, xmm7 // xmm6 = lerp
- mulps xmm6, xmm5 // xmm6 = omega1
- subps xmm5, xmm6 // xmm5 = omega0
- // scale0 = sin( xmm5 ) * xmm2
- // scale1 = sin( xmm6 ) * xmm2
- movaps xmm3, xmm5
- movaps xmm7, xmm6
- mulps xmm3, xmm3
- mulps xmm7, xmm7
- movaps xmm4, SIMD_SP_sin_c0
- movaps xmm0, SIMD_SP_sin_c0
- mulps xmm4, xmm3
- mulps xmm0, xmm7
- addps xmm4, SIMD_SP_sin_c1
- addps xmm0, SIMD_SP_sin_c1
- mulps xmm4, xmm3
- mulps xmm0, xmm7
- addps xmm4, SIMD_SP_sin_c2
- addps xmm0, SIMD_SP_sin_c2
- mulps xmm4, xmm3
- mulps xmm0, xmm7
- addps xmm4, SIMD_SP_sin_c3
- addps xmm0, SIMD_SP_sin_c3
- mulps xmm4, xmm3
- mulps xmm0, xmm7
- addps xmm4, SIMD_SP_sin_c4
- addps xmm0, SIMD_SP_sin_c4
- mulps xmm4, xmm3
- mulps xmm0, xmm7
- addps xmm4, SIMD_SP_one
- addps xmm0, SIMD_SP_one
- mulps xmm5, xmm4
- mulps xmm6, xmm0
- mulps xmm5, xmm2 // xmm5 = scale0
- mulps xmm6, xmm2 // xmm6 = scale1
- xorps xmm6, xmm1
- movaps xmm0, jointQuat0
- mulps xmm0, xmm5
- movaps xmm1, blendQuat0
- mulps xmm1, xmm6
- addps xmm0, xmm1
- movaps jointQuat0, xmm0
- movaps xmm1, jointQuat1
- mulps xmm1, xmm5
- movaps xmm2, blendQuat1
- mulps xmm2, xmm6
- addps xmm1, xmm2
- movaps jointQuat1, xmm1
- movaps xmm2, jointQuat2
- mulps xmm2, xmm5
- movaps xmm3, blendQuat2
- mulps xmm3, xmm6
- addps xmm2, xmm3
- movaps jointQuat2, xmm2
- movaps xmm3, jointQuat3
- mulps xmm3, xmm5
- movaps xmm4, blendQuat3
- mulps xmm4, xmm6
- addps xmm3, xmm4
- movaps jointQuat3, xmm3
- }
- #else
- jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
- jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
- jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
- jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );
- jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
- jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
- jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
- jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );
- jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
- jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
- jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
- jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );
- ALIGN16( float cosom[4] );
- ALIGN16( float sinom[4] );
- ALIGN16( float omega0[4] );
- ALIGN16( float omega1[4] );
- ALIGN16( float scale0[4] );
- ALIGN16( float scale1[4] );
- ALIGN16( unsigned long signBit[4] );
- cosom[0] = jointQuat0[0] * blendQuat0[0];
- cosom[1] = jointQuat0[1] * blendQuat0[1];
- cosom[2] = jointQuat0[2] * blendQuat0[2];
- cosom[3] = jointQuat0[3] * blendQuat0[3];
- cosom[0] += jointQuat1[0] * blendQuat1[0];
- cosom[1] += jointQuat1[1] * blendQuat1[1];
- cosom[2] += jointQuat1[2] * blendQuat1[2];
- cosom[3] += jointQuat1[3] * blendQuat1[3];
- cosom[0] += jointQuat2[0] * blendQuat2[0];
- cosom[1] += jointQuat2[1] * blendQuat2[1];
- cosom[2] += jointQuat2[2] * blendQuat2[2];
- cosom[3] += jointQuat2[3] * blendQuat2[3];
- cosom[0] += jointQuat3[0] * blendQuat3[0];
- cosom[1] += jointQuat3[1] * blendQuat3[1];
- cosom[2] += jointQuat3[2] * blendQuat3[2];
- cosom[3] += jointQuat3[3] * blendQuat3[3];
- signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 );
- signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 );
- signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 );
- signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 );
- (*(unsigned long *)&cosom[0]) ^= signBit[0];
- (*(unsigned long *)&cosom[1]) ^= signBit[1];
- (*(unsigned long *)&cosom[2]) ^= signBit[2];
- (*(unsigned long *)&cosom[3]) ^= signBit[3];
- scale0[0] = 1.0f - cosom[0] * cosom[0];
- scale0[1] = 1.0f - cosom[1] * cosom[1];
- scale0[2] = 1.0f - cosom[2] * cosom[2];
- scale0[3] = 1.0f - cosom[3] * cosom[3];
- scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
- scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
- scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
- scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];
- sinom[0] = idMath::RSqrt( scale0[0] );
- sinom[1] = idMath::RSqrt( scale0[1] );
- sinom[2] = idMath::RSqrt( scale0[2] );
- sinom[3] = idMath::RSqrt( scale0[3] );
- scale0[0] *= sinom[0];
- scale0[1] *= sinom[1];
- scale0[2] *= sinom[2];
- scale0[3] *= sinom[3];
- omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
- omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
- omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
- omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );
- omega1[0] = lerp * omega0[0];
- omega1[1] = lerp * omega0[1];
- omega1[2] = lerp * omega0[2];
- omega1[3] = lerp * omega0[3];
- omega0[0] -= omega1[0];
- omega0[1] -= omega1[1];
- omega0[2] -= omega1[2];
- omega0[3] -= omega1[3];
- scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
- scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
- scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
- scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];
- scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
- scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
- scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
- scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
- (*(unsigned long *)&scale1[0]) ^= signBit[0];
- (*(unsigned long *)&scale1[1]) ^= signBit[1];
- (*(unsigned long *)&scale1[2]) ^= signBit[2];
- (*(unsigned long *)&scale1[3]) ^= signBit[3];
- jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
- jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
- jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
- jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];
- jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
- jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
- jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
- jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];
- jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
- jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
- jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
- jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];
- jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
- jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
- jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
- jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
- #endif
- for ( int j = 0; j < 4; j++ ) {
- int n = index[i+j];
- joints[n].t[0] = jointVert0[j];
- joints[n].t[1] = jointVert1[j];
- joints[n].t[2] = jointVert2[j];
- joints[n].q[0] = jointQuat0[j];
- joints[n].q[1] = jointQuat1[j];
- joints[n].q[2] = jointQuat2[j];
- joints[n].q[3] = jointQuat3[j];
- }
- }
- for ( ; i < numJoints; i++ ) {
- int n = index[i];
- idVec3 &jointVert = joints[n].t;
- const idVec3 &blendVert = blendJoints[n].t;
- jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
- jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
- jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
- idQuat &jointQuat = joints[n].q;
- const idQuat &blendQuat = blendJoints[n].q;
- float cosom;
- float sinom;
- float omega;
- float scale0;
- float scale1;
- unsigned long signBit;
- cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
- signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );
- (*(unsigned long *)&cosom) ^= signBit;
- scale0 = 1.0f - cosom * cosom;
- scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
- sinom = idMath::InvSqrt( scale0 );
- omega = idMath::ATan16( scale0 * sinom, cosom );
- scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
- scale1 = idMath::Sin16( lerp * omega ) * sinom;
- (*(unsigned long *)&scale1) ^= signBit;
- jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
- jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
- jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
- jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
- }
- }
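- // Both the four-wide path and the scalar tail above evaluate the standard
- // slerp weights: slerp( q0, q1; t ) = ( sin( (1-t)*w )*q0 + sin( t*w )*q1 ) / sin( w ),
- // with cos( w ) = q0.q1. A minimal sketch using <math.h> instead of the
- // idMath approximations (hypothetical helper; assumes sin( w ) is not
- // vanishingly small, which the SIMD_SP_tiny clamp above guards against):
- static void SlerpWeights( float cosom, float t, float &scale0, float &scale1 ) {
- float omega = acosf( cosom ); // angle between the unit quaternions
- float sinom = sinf( omega );
- scale0 = sinf( ( 1.0f - t ) * omega ) / sinom;
- scale1 = sinf( t * omega ) / sinom;
- }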
- /*
- ============
- idSIMD_SSE::ConvertJointQuatsToJointMats
- ============
- */
- void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
- assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
- assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
- assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
- for ( int i = 0; i < numJoints; i++ ) {
- const float *q = jointQuats[i].q.ToFloatPtr();
- float *m = jointMats[i].ToFloatPtr();
- m[0*4+3] = q[4];
- m[1*4+3] = q[5];
- m[2*4+3] = q[6];
- float x2 = q[0] + q[0];
- float y2 = q[1] + q[1];
- float z2 = q[2] + q[2];
- {
- float xx = q[0] * x2;
- float yy = q[1] * y2;
- float zz = q[2] * z2;
- m[0*4+0] = 1.0f - yy - zz;
- m[1*4+1] = 1.0f - xx - zz;
- m[2*4+2] = 1.0f - xx - yy;
- }
- {
- float yz = q[1] * z2;
- float wx = q[3] * x2;
- m[2*4+1] = yz - wx;
- m[1*4+2] = yz + wx;
- }
- {
- float xy = q[0] * y2;
- float wz = q[3] * z2;
- m[1*4+0] = xy - wz;
- m[0*4+1] = xy + wz;
- }
- {
- float xz = q[0] * z2;
- float wy = q[3] * y2;
- m[0*4+2] = xz - wy;
- m[2*4+0] = xz + wy;
- }
- }
- }
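- /*
- The loop above expands the standard unit quaternion to rotation matrix
- conversion; with q = ( x, y, z, w ) and the doubled products computed above
- ( xx = 2xx, xy = 2xy, ... ), the matrix stored is:
- 
- m = | 1-2(yy+zz) 2(xy+wz) 2(xz-wy) tx |
-     | 2(xy-wz) 1-2(xx+zz) 2(yz+wx) ty |
-     | 2(xz+wy) 2(yz-wx) 1-2(xx+yy) tz |
- */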
- /*
- ============
- idSIMD_SSE::ConvertJointMatsToJointQuats
- ============
- */
- void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
- assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
- assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
- assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
- #if 1
- ALIGN16( byte shuffle[16] );
- __asm {
- mov eax, numJoints
- mov esi, jointMats
- mov edi, jointQuats
- and eax, ~3
- jz done4
- imul eax, JOINTMAT_SIZE
- add esi, eax
- neg eax
- loopMat4:
- movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
- movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
- movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
- movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
- movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]
- movss xmm5, xmm0
- movss xmm6, xmm1
- movss xmm7, xmm2
- shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
- movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
- movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]
- movss xmm5, xmm0
- movss xmm6, xmm1
- movss xmm7, xmm2
- shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
- movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
- movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
- movss xmm5, xmm0
- movss xmm6, xmm1
- movss xmm7, xmm2
- // -------------------
- movaps xmm0, xmm5
- addps xmm0, xmm6
- addps xmm0, xmm7
- cmpnltps xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] >= 0.0f
- movaps xmm1, xmm5
- movaps xmm2, xmm5
- cmpnltps xmm1, xmm6
- cmpnltps xmm2, xmm7
- andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]
- movaps xmm4, xmm6
- cmpnltps xmm4, xmm7 // xmm4 = m[1 * 4 + 1] >= m[2 * 4 + 2]
- movaps xmm1, xmm0
- andnps xmm1, xmm2
- orps xmm2, xmm0
- movaps xmm3, xmm2
- andnps xmm2, xmm4
- orps xmm3, xmm2
- xorps xmm3, SIMD_SP_not
- andps xmm0, SIMD_DW_mat2quatShuffle0
- movaps xmm4, xmm1
- andps xmm4, SIMD_DW_mat2quatShuffle1
- orps xmm0, xmm4
- movaps xmm4, xmm2
- andps xmm4, SIMD_DW_mat2quatShuffle2
- orps xmm0, xmm4
- movaps xmm4, xmm3
- andps xmm4, SIMD_DW_mat2quatShuffle3
- orps xmm4, xmm0
- movaps shuffle, xmm4
- movaps xmm0, xmm2
- orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0
- orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2
- orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1
- andps xmm0, SIMD_SP_signBitMask
- andps xmm1, SIMD_SP_signBitMask
- andps xmm2, SIMD_SP_signBitMask
- xorps xmm5, xmm0
- xorps xmm6, xmm1
- xorps xmm7, xmm2
- addps xmm5, xmm6
- addps xmm7, SIMD_SP_one
- addps xmm5, xmm7 // xmm5 = t
- movaps xmm7, xmm5 // xmm7 = t
- rsqrtps xmm6, xmm5
- mulps xmm5, xmm6
- mulps xmm5, xmm6
- subps xmm5, SIMD_SP_rsqrt_c0
- mulps xmm6, SIMD_SP_mat2quat_rsqrt_c1
- mulps xmm6, xmm5 // xmm6 = s
- mulps xmm7, xmm6 // xmm7 = s * t
- xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s
- // -------------------
- add edi, 4*JOINTQUAT_SIZE
- movzx ecx, byte ptr shuffle[0*4+0] // ecx = k0
- movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
- movzx edx, byte ptr shuffle[0*4+1] // edx = k1
- movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
- xorps xmm4, xmm2
- subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
- movzx ecx, byte ptr shuffle[0*4+2] // ecx = k2
- movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
- xorps xmm3, xmm1
- subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
- mulss xmm3, xmm6
- movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
- movzx edx, byte ptr shuffle[0*4+3] // edx = k3
- movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
- xorps xmm4, xmm0
- subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
- mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
- mov [edi-4*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
- mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
- mov [edi-4*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
- mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
- mov [edi-4*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movzx ecx, byte ptr shuffle[1*4+0] // ecx = k0
- movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
- movzx edx, byte ptr shuffle[1*4+1] // edx = k1
- movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
- xorps xmm4, xmm2
- subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
- movzx ecx, byte ptr shuffle[1*4+2] // ecx = k2
- movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
- xorps xmm3, xmm1
- subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
- mulss xmm3, xmm6
- movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
- movzx edx, byte ptr shuffle[1*4+3] // edx = k3
- movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
- xorps xmm4, xmm0
- subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
- mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
- mov [edi-3*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
- mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
- mov [edi-3*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
- mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
- mov [edi-3*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movzx ecx, byte ptr shuffle[2*4+0] // ecx = k0
- movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
- movzx edx, byte ptr shuffle[2*4+1] // edx = k1
- movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
- xorps xmm4, xmm2
- subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
- movzx ecx, byte ptr shuffle[2*4+2] // ecx = k2
- movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
- xorps xmm3, xmm1
- subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
- mulss xmm3, xmm6
- movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
- movzx edx, byte ptr shuffle[2*4+3] // edx = k3
- movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
- xorps xmm4, xmm0
- subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
- mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
- mov [edi-2*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
- mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
- mov [edi-2*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
- mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
- mov [edi-2*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movzx ecx, byte ptr shuffle[3*4+0] // ecx = k0
- movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
- movzx edx, byte ptr shuffle[3*4+1] // edx = k1
- movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
- xorps xmm4, xmm2
- subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
- movzx ecx, byte ptr shuffle[3*4+2] // ecx = k2
- movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
- xorps xmm3, xmm1
- subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
- mulss xmm3, xmm6
- movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
- movzx edx, byte ptr shuffle[3*4+3] // edx = k3
- movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
- xorps xmm4, xmm0
- subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
- mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
- mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
- mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
- mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
- mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
- mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
- add eax, 4*JOINTMAT_SIZE
- jl loopMat4
- done4:
- mov eax, numJoints
- and eax, 3
- jz done1
- imul eax, JOINTMAT_SIZE
- add esi, eax
- neg eax
- loopMat1:
- movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
- movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
- movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
- // -------------------
- movaps xmm0, xmm5
- addss xmm0, xmm6
- addss xmm0, xmm7
- cmpnltss xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] >= 0.0f
- movaps xmm1, xmm5
- movaps xmm2, xmm5
- cmpnltss xmm1, xmm6
- cmpnltss xmm2, xmm7
- andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]
- movaps xmm4, xmm6
- cmpnltss xmm4, xmm7 // xmm4 = m[1 * 4 + 1] >= m[2 * 4 + 2]
- movaps xmm1, xmm0
- andnps xmm1, xmm2
- orps xmm2, xmm0
- movaps xmm3, xmm2
- andnps xmm2, xmm4
- orps xmm3, xmm2
- xorps xmm3, SIMD_SP_not
- andps xmm0, SIMD_DW_mat2quatShuffle0
- movaps xmm4, xmm1
- andps xmm4, SIMD_DW_mat2quatShuffle1
- orps xmm0, xmm4
- movaps xmm4, xmm2
- andps xmm4, SIMD_DW_mat2quatShuffle2
- orps xmm0, xmm4
- movaps xmm4, xmm3
- andps xmm4, SIMD_DW_mat2quatShuffle3
- orps xmm4, xmm0
- movss shuffle, xmm4
- movaps xmm0, xmm2
- orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0
- orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2
- orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1
- andps xmm0, SIMD_SP_signBitMask
- andps xmm1, SIMD_SP_signBitMask
- andps xmm2, SIMD_SP_signBitMask
- xorps xmm5, xmm0
- xorps xmm6, xmm1
- xorps xmm7, xmm2
- addss xmm5, xmm6
- addss xmm7, SIMD_SP_one
- addss xmm5, xmm7 // xmm5 = t
- movss xmm7, xmm5 // xmm7 = t
- rsqrtss xmm6, xmm5
- mulss xmm5, xmm6
- mulss xmm5, xmm6
- subss xmm5, SIMD_SP_rsqrt_c0
- mulss xmm6, SIMD_SP_mat2quat_rsqrt_c1
- mulss xmm6, xmm5 // xmm6 = s
- mulss xmm7, xmm6 // xmm7 = s * t
- xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s
- // -------------------
- movzx ecx, byte ptr shuffle[0] // ecx = k0
- add edi, JOINTQUAT_SIZE
- movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
- movzx edx, byte ptr shuffle[1] // edx = k1
- movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
- xorps xmm4, xmm2
- subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
- movzx ecx, byte ptr shuffle[2] // ecx = k2
- movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
- xorps xmm3, xmm1
- subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
- mulss xmm3, xmm6
- movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
- movzx edx, byte ptr shuffle[3] // edx = k3
- movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
- xorps xmm4, xmm0
- subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
- mulss xmm4, xmm6
- movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
- mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
- mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
- mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
- mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
- mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
- mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
- add eax, JOINTMAT_SIZE
- jl loopMat1
- done1:
- }
- #elif 0
- for ( int i = 0; i < numJoints; i++ ) {
- float s0, s1, s2;
- int k0, k1, k2, k3;
- float *q = jointQuats[i].q.ToFloatPtr();
- const float *m = jointMats[i].ToFloatPtr();
- if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
- k0 = 3;
- k1 = 2;
- k2 = 1;
- k3 = 0;
- s0 = 1.0f;
- s1 = 1.0f;
- s2 = 1.0f;
- } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
- k0 = 0;
- k1 = 1;
- k2 = 2;
- k3 = 3;
- s0 = 1.0f;
- s1 = -1.0f;
- s2 = -1.0f;
- } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
- k0 = 1;
- k1 = 0;
- k2 = 3;
- k3 = 2;
- s0 = -1.0f;
- s1 = 1.0f;
- s2 = -1.0f;
- } else {
- k0 = 2;
- k1 = 3;
- k2 = 0;
- k3 = 1;
- s0 = -1.0f;
- s1 = -1.0f;
- s2 = 1.0f;
- }
- float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
- float s = idMath::InvSqrt( t ) * 0.5f;
- q[k0] = s * t;
- q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
- q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
- q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
- q[4] = m[0 * 4 + 3];
- q[5] = m[1 * 4 + 3];
- q[6] = m[2 * 4 + 3];
- }
- #elif 1
- for ( int i = 0; i < numJoints; i++ ) {
- float *q = jointQuats[i].q.ToFloatPtr();
- const float *m = jointMats[i].ToFloatPtr();
- if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
- float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
- float s = idMath::InvSqrt( t ) * 0.5f;
- q[3] = s * t;
- q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
- q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
- q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
- } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
- float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
- float s = idMath::InvSqrt( t ) * 0.5f;
- q[0] = s * t;
- q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
- q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
- q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
- } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
- float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
- float s = idMath::InvSqrt( t ) * 0.5f;
- q[1] = s * t;
- q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
- q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
- q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
- } else {
- float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
- float s = idMath::InvSqrt( t ) * 0.5f;
- q[2] = s * t;
- q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
- q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
- q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
- }
- q[4] = m[0 * 4 + 3];
- q[5] = m[1 * 4 + 3];
- q[6] = m[2 * 4 + 3];
- }
- #endif
- }
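- // The mask logic in the SSE path above picks one of the four cases of the
- // branchy version without branching: three compares yield all-ones/all-zeros
- // masks, and the andnps/orps chains turn them into four mutually exclusive
- // case masks that select the shuffle indices and the signs s0, s1, s2.
- // A scalar sketch of the same selection (hypothetical helper):
- static int Mat2QuatCase( const float *m ) {
- if ( m[0*4+0] + m[1*4+1] + m[2*4+2] >= 0.0f ) {
- return 0; // w is the largest quaternion component
- }
- if ( m[0*4+0] >= m[1*4+1] && m[0*4+0] >= m[2*4+2] ) {
- return 1; // x is the largest quaternion component
- }
- if ( m[1*4+1] >= m[2*4+2] ) {
- return 2; // y is the largest quaternion component
- }
- return 3; // z is the largest quaternion component
- }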
- /*
- ============
- idSIMD_SSE::TransformJoints
- ============
- */
- void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
- #if 1
- assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
- __asm {
- mov ecx, firstJoint
- mov eax, lastJoint
- sub eax, ecx
- jl done
- imul ecx, 4
- mov edi, parents
- add edi, ecx
- imul ecx, 12
- mov esi, jointMats
- imul eax, 4
- add edi, eax
- neg eax
- loopJoint:
- movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
- mov edx, [edi+eax]
- movaps xmm1, [esi+ecx+16] // xmm1 = m3, m4, m5, t1
- imul edx, JOINTMAT_SIZE
- movaps xmm2, [esi+ecx+32] // xmm2 = m6, m7, m8, t2
- movss xmm4, [esi+edx+ 0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm0
- movss xmm5, [esi+edx+ 4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm1
- addps xmm4, xmm5
- movss xmm6, [esi+edx+ 8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm2
- addps xmm4, xmm6
- movss xmm5, [esi+edx+16]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm0
- movss xmm7, [esi+edx+12]
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
- addps xmm4, xmm7
- movaps [esi+ecx+ 0], xmm4
- movss xmm6, [esi+edx+20]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm1
- addps xmm5, xmm6
- movss xmm7, [esi+edx+24]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm5, xmm7
- movss xmm6, [esi+edx+32]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movss xmm3, [esi+edx+28]
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
- addps xmm5, xmm3
- movaps [esi+ecx+16], xmm5
- movss xmm7, [esi+edx+36]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movss xmm3, [esi+edx+40]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm2
- addps xmm6, xmm3
- movss xmm7, [esi+edx+44]
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
- addps xmm6, xmm7
- movaps [esi+ecx+32], xmm6
- add ecx, JOINTMAT_SIZE
- add eax, 4
- jle loopJoint
- done:
- }
- #else
- int i;
- for( i = firstJoint; i <= lastJoint; i++ ) {
- assert( parents[i] < i );
- jointMats[i] *= jointMats[parents[i]];
- }
- #endif
- }
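- // Each iteration above concatenates a joint with its parent: a 3x4 by 3x4
- // affine multiply with an implicit ( 0, 0, 0, 1 ) bottom row, equivalent to
- // jointMats[i] = jointMats[parents[i]] * jointMats[i]. A scalar sketch for
- // one row of the result (hypothetical helper; p, c and r are row-major 3x4
- // matrices laid out like idJointMat, and r must not alias c):
- static void MulJointMatRow( float *r, const float *p, const float *c, int row ) {
- for ( int col = 0; col < 4; col++ ) {
- r[row*4+col] = p[row*4+0] * c[0*4+col] + p[row*4+1] * c[1*4+col] + p[row*4+2] * c[2*4+col];
- }
- r[row*4+3] += p[row*4+3]; // the implicit fourth row of c is ( 0, 0, 0, 1 )
- }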
- /*
- ============
- idSIMD_SSE::UntransformJoints
- ============
- */
- void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
- #if 1
- assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
- __asm {
- mov edx, firstJoint
- mov eax, lastJoint
- mov ecx, eax
- sub eax, edx
- jl done
- mov esi, jointMats
- imul ecx, JOINTMAT_SIZE
- imul edx, 4
- mov edi, parents
- add edi, edx
- imul eax, 4
- loopJoint:
- movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
- mov edx, [edi+eax]
- movaps xmm1, [esi+ecx+16] // xmm1 = m3, m4, m5, t1
- imul edx, JOINTMAT_SIZE
- movaps xmm2, [esi+ecx+32] // xmm2 = m6, m7, m8, t2
- movss xmm6, [esi+edx+12]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- subps xmm0, xmm6
- movss xmm7, [esi+edx+28]
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
- subps xmm1, xmm7
- movss xmm3, [esi+edx+44]
- shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
- subps xmm2, xmm3
- movss xmm4, [esi+edx+ 0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm4, xmm0
- movss xmm5, [esi+edx+16]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm1
- addps xmm4, xmm5
- movss xmm6, [esi+edx+32]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm2
- addps xmm4, xmm6
- movaps [esi+ecx+ 0], xmm4
- movss xmm5, [esi+edx+ 4]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm5, xmm0
- movss xmm6, [esi+edx+20]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm1
- addps xmm5, xmm6
- movss xmm7, [esi+edx+36]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm5, xmm7
- movaps [esi+ecx+16], xmm5
- movss xmm6, [esi+edx+ 8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movss xmm7, [esi+edx+24]
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movss xmm3, [esi+edx+40]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm3, xmm2
- addps xmm6, xmm3
- movaps [esi+ecx+32], xmm6
- sub ecx, JOINTMAT_SIZE
- sub eax, 4
- jge loopJoint
- done:
- }
- #else
- int i;
- for( i = lastJoint; i >= firstJoint; i-- ) {
- assert( parents[i] < i );
- jointMats[i] /= jointMats[parents[i]];
- }
- #endif
- }
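- // The inverse concatenation above relies on the 3x3 part of an idJointMat
- // being orthonormal, so its inverse is its transpose: each joint is mapped
- // back to parent-relative space as local = Rp^T * ( global - tp ), which is
- // why the parent translation is subtracted first and the broadcast loads
- // then walk the parent matrix by columns ( offsets 0, 16, 32 ) instead of rows.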
- /*
- ============
- idSIMD_SSE::TransformVerts
- ============
- */
- void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
- #if 1
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
- assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
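- // The index stream consumed below is a flat run of integer pairs, one pair
- // per weight: index[j*2+0] is the joint index pre-multiplied by JOINTMAT_SIZE,
- // usable directly as a byte offset (see the reference loop in the #else
- // branch), and index[j*2+1] is non-zero on the last weight of a vertex.
- // Each weights[j] is expected to pack the weight-scaled joint-space position
- // in xyz and the blend weight in w, so the transformed contributions of all
- // weights can simply be summed.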
- __asm
- {
- mov eax, numVerts
- test eax, eax
- jz done
- imul eax, DRAWVERT_SIZE
- mov ecx, verts
- mov edx, index
- mov esi, weights
- mov edi, joints
- add ecx, eax
- neg eax
- loopVert:
- mov ebx, [edx]
- movaps xmm2, [esi]
- add edx, 8
- movaps xmm0, xmm2
- add esi, JOINTWEIGHT_SIZE
- movaps xmm1, xmm2
- mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
- mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
- mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
- cmp dword ptr [edx-4], 0
- jne doneWeight
- loopWeight:
- mov ebx, [edx]
- movaps xmm5, [esi]
- add edx, 8
- movaps xmm3, xmm5
- add esi, JOINTWEIGHT_SIZE
- movaps xmm4, xmm5
- mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
- mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
- mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
- cmp dword ptr [edx-4], 0
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5
- je loopWeight
- doneWeight:
- add eax, DRAWVERT_SIZE
- movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
- unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
- unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
- addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
- movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
- movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
- movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
- addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
- movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6
- movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8
- addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2
- movss [ecx+eax-DRAWVERT_SIZE+8], xmm5
- jl loopVert
- done:
- }
- #else
- int i, j;
- const byte *jointsPtr = (byte *)joints;
- for( j = i = 0; i < numVerts; i++ ) {
- idVec3 v;
- v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
- while( index[j*2+1] == 0 ) {
- j++;
- v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
- }
- j++;
- verts[i].xyz = v;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::TracePointCull
- ============
- */
- void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
- #if 1
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- __asm {
- push ebx
- mov eax, numVerts
- test eax, eax
- jz done
- mov edi, planes
- movlps xmm1, [edi] // xmm1 = 0, 1, X, X
- movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5
- movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X
- movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7
- movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X
- movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13
- movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X
- movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15
- movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
- shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
- shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
- movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
- shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
- shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
- movss xmm7, radius
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- xor edx, edx
- mov esi, verts
- mov edi, cullBits
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- loopVert:
- movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
- mulps xmm4, xmm0
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
- mulps xmm5, xmm1
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- addps xmm4, xmm5
- mulps xmm6, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm6
- movaps xmm5, xmm4
- xorps xmm5, SIMD_SP_signBitMask
- cmpltps xmm4, xmm7
- movmskps ecx, xmm4
- cmpltps xmm5, xmm7
- movmskps ebx, xmm5
- shl cx, 4
- or cl, bl
- inc edi
- or dl, cl
- add eax, DRAWVERT_SIZE
- mov byte ptr [edi-1], cl
- jl loopVert
- done:
- mov esi, totalOr
- mov byte ptr [esi], dl
- pop ebx
- }
- #else
- int i;
- byte tOr;
- tOr = 0;
- for ( i = 0; i < numVerts; i++ ) {
- byte bits;
- float d0, d1, d2, d3, t;
- const idVec3 &v = verts[i].xyz;
- d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
- d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
- d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
- d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
- t = d0 + radius;
- bits = FLOATSIGNBITSET( t ) << 0;
- t = d1 + radius;
- bits |= FLOATSIGNBITSET( t ) << 1;
- t = d2 + radius;
- bits |= FLOATSIGNBITSET( t ) << 2;
- t = d3 + radius;
- bits |= FLOATSIGNBITSET( t ) << 3;
- t = d0 - radius;
- bits |= FLOATSIGNBITSET( t ) << 4;
- t = d1 - radius;
- bits |= FLOATSIGNBITSET( t ) << 5;
- t = d2 - radius;
- bits |= FLOATSIGNBITSET( t ) << 6;
- t = d3 - radius;
- bits |= FLOATSIGNBITSET( t ) << 7;
- bits ^= 0x0F; // flip lower four bits
- tOr |= bits;
- cullBits[i] = bits;
- }
- totalOr = tOr;
- #endif
- }
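- // Layout of the cull byte written per vertex above, with d the signed
- // distance of the point to plane p ( p = 0..3 ): bit p is set when
- // d >= -radius and bit p+4 is set when d < radius, so a vertex with both
- // bits set lies within radius of that plane; totalOr accumulates the bits
- // over all vertices.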
- /*
- ============
- idSIMD_SSE::DecalPointCull
- ============
- */
- void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
- #if 1
- ALIGN16( float p0[4] );
- ALIGN16( float p1[4] );
- ALIGN16( float p2[4] );
- ALIGN16( float p3[4] );
- ALIGN16( float p4[4] );
- ALIGN16( float p5[4] );
- ALIGN16( float p6[4] );
- ALIGN16( float p7[4] );
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- __asm {
- mov ecx, planes
- movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
- movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
- movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
- movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
- movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
- movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
- movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
- movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
- movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
- shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
- shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
- movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
- shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
- shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
- movaps p0, xmm0
- movaps p1, xmm1
- movaps p2, xmm2
- movaps p3, xmm3
- movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X
- movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51
- movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51
- movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X
- movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53
- movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52
- shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53
- movaps p4, xmm4
- movaps p5, xmm5
- movaps p6, xmm6
- movaps p7, xmm7
- mov esi, verts
- mov edi, cullBits
- mov eax, numVerts
- and eax, ~1
- jz done2
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- loopVert2:
- movaps xmm6, p0
- movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movaps xmm7, p1
- movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movaps xmm7, p2
- movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- addps xmm6, p3
- cmpnltps xmm6, SIMD_SP_zero
- movmskps ecx, xmm6
-
- movaps xmm6, p0
- movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm3
- movaps xmm7, p1
- movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm4
- addps xmm6, xmm7
- movaps xmm7, p2
- movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm5
- addps xmm6, xmm7
- addps xmm6, p3
- cmpnltps xmm6, SIMD_SP_zero
- movmskps edx, xmm6
- mov ch, dl
- shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm0, p4
- shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm1, p5
- addps xmm0, xmm1
- shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm2, p6
- addps xmm0, xmm2
- addps xmm0, p7
- cmpnltps xmm0, SIMD_SP_zero
- movmskps edx, xmm0
- add edi, 2
- mov dh, dl
- shl dl, 4
- shl dh, 2
- and edx, (3<<4)|(3<<12)
- or ecx, edx
- add eax, 2*DRAWVERT_SIZE
- mov word ptr [edi-2], cx
- jl loopVert2
- done2:
- mov eax, numVerts
- and eax, 1
- jz done
- movaps xmm6, p0
- movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm6, xmm0
- movaps xmm7, p1
- movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm1
- addps xmm6, xmm7
- movaps xmm7, p2
- movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm7, xmm2
- addps xmm6, xmm7
- addps xmm6, p3
- cmpnltps xmm6, SIMD_SP_zero
- movmskps ecx, xmm6
- mulps xmm0, p4
- mulps xmm1, p5
- addps xmm0, xmm1
- mulps xmm2, p6
- addps xmm0, xmm2
- addps xmm0, p7
- cmpnltps xmm0, SIMD_SP_zero
- movmskps edx, xmm0
- and edx, 3
- shl edx, 4
- or ecx, edx
- mov byte ptr [edi], cl
- done:
- }
- #else
- int i;
- for ( i = 0; i < numVerts; i += 2 ) {
- unsigned short bits0, bits1;
- float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
- const idVec3 &v0 = verts[i+0].xyz;
- const idVec3 &v1 = verts[i+1].xyz;
- d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
- d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
- d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
- d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
- d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
- d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
- d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
- d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
- d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
- d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
- d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
- d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
- bits0 = FLOATSIGNBITSET( d0 ) << (0+0);
- bits0 |= FLOATSIGNBITSET( d1 ) << (0+1);
- bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
- bits0 |= FLOATSIGNBITSET( d3 ) << (0+3);
- bits0 |= FLOATSIGNBITSET( d4 ) << (0+4);
- bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
- bits1 = FLOATSIGNBITSET( d6 ) << (8+0);
- bits1 |= FLOATSIGNBITSET( d7 ) << (8+1);
- bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
- bits1 |= FLOATSIGNBITSET( d9 ) << (8+3);
- bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
- bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
- *(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
- }
- if ( numVerts & 1 ) {
- byte bits;
- float d0, d1, d2, d3, d4, d5;
- const idVec3 &v = verts[numVerts - 1].xyz;
- d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
- d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
- d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
- d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
- d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
- d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
- bits = FLOATSIGNBITSET( d0 ) << 0;
- bits |= FLOATSIGNBITSET( d1 ) << 1;
- bits |= FLOATSIGNBITSET( d2 ) << 2;
- bits |= FLOATSIGNBITSET( d3 ) << 3;
- bits |= FLOATSIGNBITSET( d4 ) << 4;
- bits |= FLOATSIGNBITSET( d5 ) << 5;
- cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::OverlayPointCull
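- 
- Calculates the overlay texture coordinates and cull bits against the two overlay planes:
- bits 0-1 are set when the s/t coordinate is negative, bits 2-3 when it is greater than one.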
- ============
- */
- void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
- #if 1
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- __asm {
- mov eax, numVerts
- mov edx, verts
- mov esi, texCoords
- mov edi, cullBits
- mov ecx, planes
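- // transpose the two planes: xmm4 = ( p0.x, p1.x, p0.x, p1.x ), xmm5/xmm6/xmm7 the same for y, z and the plane distance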
- movss xmm4, [ecx+ 0]
- movss xmm5, [ecx+16]
- shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
- movss xmm5, [ecx+ 4]
- movss xmm6, [ecx+20]
- shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
- movss xmm6, [ecx+ 8]
- movss xmm7, [ecx+24]
- shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
- movss xmm7, [ecx+12]
- movss xmm0, [ecx+28]
- shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
- and eax, ~1
- jz done2
- add edi, eax
- neg eax
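- // two vertices per iteration: xmm0 ends up holding ( s0, t0, s1, t1 )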
- loopVert2:
- movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm0, xmm4
- movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm1, xmm5
- movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm2, xmm6
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm0, xmm7
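- // write both texcoord pairs, then test the coordinates against 0.0 and 1.0 via 1.0 - d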
- movaps [esi], xmm0
- movaps xmm1, xmm0
- movaps xmm2, SIMD_SP_one
- subps xmm2, xmm0
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
- shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
- add edx, 2*DRAWVERT_SIZE
- movmskps ecx, xmm0
- mov byte ptr [edi+eax+0], cl
- add esi, 4*4
- movmskps ecx, xmm1
- mov byte ptr [edi+eax+1], cl
- add eax, 2
- jl loopVert2
- done2:
- mov eax, numVerts
- and eax, 1
- jz done
- movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm0, xmm4
- movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm1, xmm5
- movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm2, xmm6
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm0, xmm7
- movlps [esi], xmm0
- movaps xmm1, xmm0
- movaps xmm2, SIMD_SP_one
- subps xmm2, xmm0
- shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
- movmskps ecx, xmm0
- mov byte ptr [edi], cl
- done:
- }
- #else
- const idPlane &p0 = planes[0];
- const idPlane &p1 = planes[1];
- for ( int i = 0; i < numVerts - 1; i += 2 ) {
- unsigned short bits;
- float d0, d1, d2, d3;
- const idVec3 &v0 = verts[i+0].xyz;
- const idVec3 &v1 = verts[i+1].xyz;
- d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
- d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
- d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
- d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
- texCoords[i+0][0] = d0;
- texCoords[i+0][1] = d1;
- texCoords[i+1][0] = d2;
- texCoords[i+1][1] = d3;
- bits = FLOATSIGNBITSET( d0 ) << 0;
- bits |= FLOATSIGNBITSET( d1 ) << 1;
- bits |= FLOATSIGNBITSET( d2 ) << 8;
- bits |= FLOATSIGNBITSET( d3 ) << 9;
- d0 = 1.0f - d0;
- d1 = 1.0f - d1;
- d2 = 1.0f - d2;
- d3 = 1.0f - d3;
- bits |= FLOATSIGNBITSET( d0 ) << 2;
- bits |= FLOATSIGNBITSET( d1 ) << 3;
- bits |= FLOATSIGNBITSET( d2 ) << 10;
- bits |= FLOATSIGNBITSET( d3 ) << 11;
- *(unsigned short *)(cullBits + i) = bits;
- }
- if ( numVerts & 1 ) {
- byte bits;
- float d0, d1;
- const idVec3 &v0 = verts[numVerts - 1].xyz;
- d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
- d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
- texCoords[numVerts - 1][0] = d0;
- texCoords[numVerts - 1][1] = d1;
- bits = FLOATSIGNBITSET( d0 ) << 0;
- bits |= FLOATSIGNBITSET( d1 ) << 1;
- d0 = 1.0f - d0;
- d1 = 1.0f - d1;
- bits |= FLOATSIGNBITSET( d0 ) << 2;
- bits |= FLOATSIGNBITSET( d1 ) << 3;
- cullBits[numVerts - 1] = bits;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::DeriveTriPlanes
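- 
- Derives a plane for each triangle: normal = ( c - a ) x ( b - a ), normalized with
- an approximate reciprocal square root and fitted through vertex a.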
- ============
- */
- void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
- #if 1
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- __asm {
- mov eax, numIndexes
- shl eax, 2
- mov esi, verts
- mov edi, indexes
- mov edx, planes
- add edi, eax
- neg eax
- add eax, 4*12
- jge done4
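- // four triangles per iteration; the edge deltas are rotated into the four lanes of xmm0-xmm5 one triangle at a time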
- loopPlane4:
- mov ebx, [edi+eax-4*12+4]
- imul ebx, DRAWVERT_SIZE
- mov ecx, [edi+eax-4*12+0]
- imul ecx, DRAWVERT_SIZE
- movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- mov ebx, [edi+eax-4*12+8]
- imul ebx, DRAWVERT_SIZE
- shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- mov ebx, [edi+eax-3*12+4]
- imul ebx, DRAWVERT_SIZE
- mov ecx, [edi+eax-3*12+0]
- imul ecx, DRAWVERT_SIZE
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm0, xmm6
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm1, xmm7
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm2, xmm6
- mov ebx, [edi+eax-3*12+8]
- imul ebx, DRAWVERT_SIZE
- shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm3, xmm7
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm4, xmm6
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm5, xmm7
- mov ebx, [edi+eax-2*12+4]
- imul ebx, DRAWVERT_SIZE
- mov ecx, [edi+eax-2*12+0]
- imul ecx, DRAWVERT_SIZE
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm0, xmm6
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm1, xmm7
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm2, xmm6
- mov ebx, [edi+eax-2*12+8]
- imul ebx, DRAWVERT_SIZE
- shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm3, xmm7
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm4, xmm6
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm5, xmm7
- mov ebx, [edi+eax-1*12+4]
- imul ebx, DRAWVERT_SIZE
- mov ecx, [edi+eax-1*12+0]
- imul ecx, DRAWVERT_SIZE
- shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
- shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm0, xmm6
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm1, xmm7
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm2, xmm6
- mov ebx, [edi+eax-1*12+8]
- imul ebx, DRAWVERT_SIZE
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm3, xmm7
- movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm4, xmm6
- movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm5, xmm7
- movaps xmm6, xmm4
- mulps xmm6, xmm2
- movaps xmm7, xmm5
- mulps xmm7, xmm1
- subps xmm6, xmm7
- mulps xmm5, xmm0
- mulps xmm2, xmm3
- subps xmm5, xmm2
- mulps xmm3, xmm1
- mulps xmm4, xmm0
- subps xmm3, xmm4
- movaps xmm0, xmm6
- mulps xmm6, xmm6
- movaps xmm1, xmm5
- mulps xmm5, xmm5
- movaps xmm2, xmm3
- mulps xmm3, xmm3
- addps xmm3, xmm5
- addps xmm3, xmm6
- rsqrtps xmm3, xmm3
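- // rsqrtps is only an approximation (roughly 12 bits of precision); no Newton-Raphson refinement is applied here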
- add edx, 4*16
- mov ecx, [edi+eax-1*12+0]
- imul ecx, DRAWVERT_SIZE
- mulps xmm0, xmm3
- mulps xmm1, xmm3
- mulps xmm2, xmm3
- movss [edx-1*16+0], xmm0
- movss [edx-1*16+4], xmm1
- movss [edx-1*16+8], xmm2
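- // plane distance: d = -( normal . a )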
- mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- xorps xmm0, SIMD_SP_singleSignBitMask
- subss xmm0, xmm1
- subss xmm0, xmm2
- movss [edx-1*16+12], xmm0
- mov ecx, [edi+eax-2*12+0]
- imul ecx, DRAWVERT_SIZE
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [edx-2*16+0], xmm0
- movss [edx-2*16+4], xmm1
- movss [edx-2*16+8], xmm2
- mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- xorps xmm0, SIMD_SP_singleSignBitMask
- subss xmm0, xmm1
- subss xmm0, xmm2
- movss [edx-2*16+12], xmm0
- mov ecx, [edi+eax-3*12+0]
- imul ecx, DRAWVERT_SIZE
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [edx-3*16+0], xmm0
- movss [edx-3*16+4], xmm1
- movss [edx-3*16+8], xmm2
- mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- xorps xmm0, SIMD_SP_singleSignBitMask
- subss xmm0, xmm1
- subss xmm0, xmm2
- movss [edx-3*16+12], xmm0
- mov ecx, [edi+eax-4*12+0]
- imul ecx, DRAWVERT_SIZE
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [edx-4*16+0], xmm0
- movss [edx-4*16+4], xmm1
- movss [edx-4*16+8], xmm2
- mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- xorps xmm0, SIMD_SP_singleSignBitMask
- subss xmm0, xmm1
- subss xmm0, xmm2
- movss [edx-4*16+12], xmm0
- add eax, 4*12
- jle loopPlane4
- done4:
- sub eax, 4*12
- jge done
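- // handle the remaining triangles one at a time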
- loopPlane1:
- mov ebx, [edi+eax+4]
- imul ebx, DRAWVERT_SIZE
- mov ecx, [edi+eax+0]
- imul ecx, DRAWVERT_SIZE
- movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- mov ebx, [edi+eax+8]
- imul ebx, DRAWVERT_SIZE
- movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
- subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
- subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
- subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- movss xmm6, xmm4
- mulss xmm6, xmm2
- movss xmm7, xmm5
- mulss xmm7, xmm1
- subss xmm6, xmm7
- mulss xmm5, xmm0
- mulss xmm2, xmm3
- subss xmm5, xmm2
- mulss xmm3, xmm1
- mulss xmm4, xmm0
- subss xmm3, xmm4
- movss xmm0, xmm6
- mulss xmm6, xmm6
- movss xmm1, xmm5
- mulss xmm5, xmm5
- movss xmm2, xmm3
- mulss xmm3, xmm3
- addss xmm3, xmm5
- addss xmm3, xmm6
- rsqrtss xmm3, xmm3
- add edx, 1*16
- mulss xmm0, xmm3
- mulss xmm1, xmm3
- mulss xmm2, xmm3
- movss [edx-1*16+0], xmm0
- movss [edx-1*16+4], xmm1
- movss [edx-1*16+8], xmm2
- mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
- mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
- mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
- xorps xmm0, SIMD_SP_singleSignBitMask
- subss xmm0, xmm1
- subss xmm0, xmm2
- movss [edx-1*16+12], xmm0
- add eax, 1*12
- jl loopPlane1
- done:
- }
- #else
- int i, j;
- for ( i = 0; i <= numIndexes - 12; i += 12 ) {
- ALIGN16( float d0[4] );
- ALIGN16( float d1[4] );
- ALIGN16( float d2[4] );
- ALIGN16( float d3[4] );
- ALIGN16( float d4[4] );
- ALIGN16( float d5[4] );
- ALIGN16( float n0[4] );
- ALIGN16( float n1[4] );
- ALIGN16( float n2[4] );
- for ( j = 0; j < 4; j++ ) {
- const idDrawVert *a, *b, *c;
- a = verts + indexes[i + j * 3 + 0];
- b = verts + indexes[i + j * 3 + 1];
- c = verts + indexes[i + j * 3 + 2];
- d0[j] = b->xyz[0] - a->xyz[0];
- d1[j] = b->xyz[1] - a->xyz[1];
- d2[j] = b->xyz[2] - a->xyz[2];
- d3[j] = c->xyz[0] - a->xyz[0];
- d4[j] = c->xyz[1] - a->xyz[1];
- d5[j] = c->xyz[2] - a->xyz[2];
- }
- ALIGN16( float tmp[4] );
- n0[0] = d4[0] * d2[0];
- n0[1] = d4[1] * d2[1];
- n0[2] = d4[2] * d2[2];
- n0[3] = d4[3] * d2[3];
- n0[0] -= d5[0] * d1[0];
- n0[1] -= d5[1] * d1[1];
- n0[2] -= d5[2] * d1[2];
- n0[3] -= d5[3] * d1[3];
- n1[0] = d5[0] * d0[0];
- n1[1] = d5[1] * d0[1];
- n1[2] = d5[2] * d0[2];
- n1[3] = d5[3] * d0[3];
- n1[0] -= d3[0] * d2[0];
- n1[1] -= d3[1] * d2[1];
- n1[2] -= d3[2] * d2[2];
- n1[3] -= d3[3] * d2[3];
- n2[0] = d3[0] * d1[0];
- n2[1] = d3[1] * d1[1];
- n2[2] = d3[2] * d1[2];
- n2[3] = d3[3] * d1[3];
- n2[0] -= d4[0] * d0[0];
- n2[1] -= d4[1] * d0[1];
- n2[2] -= d4[2] * d0[2];
- n2[3] -= d4[3] * d0[3];
- tmp[0] = n0[0] * n0[0];
- tmp[1] = n0[1] * n0[1];
- tmp[2] = n0[2] * n0[2];
- tmp[3] = n0[3] * n0[3];
- tmp[0] += n1[0] * n1[0];
- tmp[1] += n1[1] * n1[1];
- tmp[2] += n1[2] * n1[2];
- tmp[3] += n1[3] * n1[3];
- tmp[0] += n2[0] * n2[0];
- tmp[1] += n2[1] * n2[1];
- tmp[2] += n2[2] * n2[2];
- tmp[3] += n2[3] * n2[3];
- tmp[0] = idMath::RSqrt( tmp[0] );
- tmp[1] = idMath::RSqrt( tmp[1] );
- tmp[2] = idMath::RSqrt( tmp[2] );
- tmp[3] = idMath::RSqrt( tmp[3] );
- n0[0] *= tmp[0];
- n0[1] *= tmp[1];
- n0[2] *= tmp[2];
- n0[3] *= tmp[3];
- n1[0] *= tmp[0];
- n1[1] *= tmp[1];
- n1[2] *= tmp[2];
- n1[3] *= tmp[3];
- n2[0] *= tmp[0];
- n2[1] *= tmp[1];
- n2[2] *= tmp[2];
- n2[3] *= tmp[3];
- for ( j = 0; j < 4; j++ ) {
- const idDrawVert *a;
- a = verts + indexes[i + j * 3];
- planes->Normal()[0] = n0[j];
- planes->Normal()[1] = n1[j];
- planes->Normal()[2] = n2[j];
- planes->FitThroughPoint( a->xyz );
- planes++;
- }
- }
- for ( ; i < numIndexes; i += 3 ) {
- const idDrawVert *a, *b, *c;
- float d0, d1, d2, d3, d4, d5;
- float n0, n1, n2;
- a = verts + indexes[i + 0];
- b = verts + indexes[i + 1];
- c = verts + indexes[i + 2];
- d0 = b->xyz[0] - a->xyz[0];
- d1 = b->xyz[1] - a->xyz[1];
- d2 = b->xyz[2] - a->xyz[2];
- d3 = c->xyz[0] - a->xyz[0];
- d4 = c->xyz[1] - a->xyz[1];
- d5 = c->xyz[2] - a->xyz[2];
- float tmp;
- n0 = d4 * d2 - d5 * d1;
- n1 = d5 * d0 - d3 * d2;
- n2 = d3 * d1 - d4 * d0;
- tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
- n0 *= tmp;
- n1 *= tmp;
- n2 *= tmp;
- planes->Normal()[0] = n0;
- planes->Normal()[1] = n1;
- planes->Normal()[2] = n2;
- planes->FitThroughPoint( a->xyz );
- planes++;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::DeriveTangents
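- 
- Derives the triangle planes and accumulates per-vertex normals and texture-space
- tangents; the first triangle that touches a vertex assigns, later ones add.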
- ============
- */
- //#define REFINE_TANGENT_SQUAREROOT
- #define FIX_DEGENERATE_TANGENT
- void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
- int i;
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
- assert( planes != NULL );
- assert( verts != NULL );
- assert( numVerts >= 0 );
- #ifdef REFINE_TANGENT_SQUAREROOT
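- // preload the Newton-Raphson constants into xmm6/xmm7; assuming SIMD_SP_rsqrt_c0 = 3.0 and SIMD_SP_rsqrt_c1 = -0.5, the refinement blocks below compute y' = y * ( 3 - x * y * y ) / 2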
- __asm {
- movaps xmm6, SIMD_SP_rsqrt_c0
- movaps xmm7, SIMD_SP_rsqrt_c1
- }
- #endif
- bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
- memset( used, 0, numVerts * sizeof( used[0] ) );
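- // 'used' marks vertices that already received a contribution, so results are assigned on first touch and accumulated afterwards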
- for ( i = 0; i <= numIndexes - 12; i += 12 ) {
- idDrawVert *a, *b, *c;
- ALIGN16( unsigned long signBit[4] );
- ALIGN16( float d0[4] );
- ALIGN16( float d1[4] );
- ALIGN16( float d2[4] );
- ALIGN16( float d3[4] );
- ALIGN16( float d4[4] );
- ALIGN16( float d5[4] );
- ALIGN16( float d6[4] );
- ALIGN16( float d7[4] );
- ALIGN16( float d8[4] );
- ALIGN16( float d9[4] );
- ALIGN16( float n0[4] );
- ALIGN16( float n1[4] );
- ALIGN16( float n2[4] );
- ALIGN16( float t0[4] );
- ALIGN16( float t1[4] );
- ALIGN16( float t2[4] );
- ALIGN16( float t3[4] );
- ALIGN16( float t4[4] );
- ALIGN16( float t5[4] );
- for ( int j = 0; j < 4; j++ ) {
- a = verts + indexes[i + j * 3 + 0];
- b = verts + indexes[i + j * 3 + 1];
- c = verts + indexes[i + j * 3 + 2];
- d0[j] = b->xyz[0] - a->xyz[0];
- d1[j] = b->xyz[1] - a->xyz[1];
- d2[j] = b->xyz[2] - a->xyz[2];
- d3[j] = b->st[0] - a->st[0];
- d4[j] = b->st[1] - a->st[1];
- d5[j] = c->xyz[0] - a->xyz[0];
- d6[j] = c->xyz[1] - a->xyz[1];
- d7[j] = c->xyz[2] - a->xyz[2];
- d8[j] = c->st[0] - a->st[0];
- d9[j] = c->st[1] - a->st[1];
- }
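- // normal = ( c - a ) x ( b - a ); tangent = ( b - a ) * dt2 - ( c - a ) * dt1;
- // bitangent = ( c - a ) * ds1 - ( b - a ) * ds2; each scaled by the sign-corrected inverse length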
- #if 1
- __asm {
- // normal
- movaps xmm0, d6
- mulps xmm0, d2
- movaps xmm1, d7
- mulps xmm1, d1
- subps xmm0, xmm1
- movaps xmm1, d7
- mulps xmm1, d0
- movaps xmm2, d5
- mulps xmm2, d2
- subps xmm1, xmm2
- movaps xmm2, d5
- mulps xmm2, d1
- movaps xmm3, d6
- mulps xmm3, d0
- subps xmm2, xmm3
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm3, xmm4
- addps xmm3, xmm5
- #ifdef FIX_DEGENERATE_TANGENT
- xorps xmm4, xmm4
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #endif
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtps xmm4, xmm3
- mulps xmm3, xmm4
- mulps xmm3, xmm4
- subps xmm3, xmm6
- mulps xmm4, xmm7
- mulps xmm3, xmm4
- #else
- rsqrtps xmm3, xmm3
- #endif
- mulps xmm0, xmm3
- movaps n0, xmm0
- mulps xmm1, xmm3
- movaps n1, xmm1
- mulps xmm2, xmm3
- movaps n2, xmm2
- // area sign bit
- movaps xmm0, d3
- mulps xmm0, d9
- movaps xmm1, d4
- mulps xmm1, d8
- subps xmm0, xmm1
- andps xmm0, SIMD_SP_signBitMask
- movaps signBit, xmm0
- // first tangent
- movaps xmm0, d0
- mulps xmm0, d9
- movaps xmm1, d4
- mulps xmm1, d5
- subps xmm0, xmm1
- movaps xmm1, d1
- mulps xmm1, d9
- movaps xmm2, d4
- mulps xmm2, d6
- subps xmm1, xmm2
- movaps xmm2, d2
- mulps xmm2, d9
- movaps xmm3, d4
- mulps xmm3, d7
- subps xmm2, xmm3
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm3, xmm4
- addps xmm3, xmm5
- #ifdef FIX_DEGENERATE_TANGENT
- xorps xmm4, xmm4
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #endif
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtps xmm4, xmm3
- mulps xmm3, xmm4
- mulps xmm3, xmm4
- subps xmm3, xmm6
- mulps xmm4, xmm7
- mulps xmm3, xmm4
- #else
- rsqrtps xmm3, xmm3
- #endif
- xorps xmm3, signBit
- mulps xmm0, xmm3
- movaps t0, xmm0
- mulps xmm1, xmm3
- movaps t1, xmm1
- mulps xmm2, xmm3
- movaps t2, xmm2
- // second tangent
- movaps xmm0, d3
- mulps xmm0, d5
- movaps xmm1, d0
- mulps xmm1, d8
- subps xmm0, xmm1
- movaps xmm1, d3
- mulps xmm1, d6
- movaps xmm2, d1
- mulps xmm2, d8
- subps xmm1, xmm2
- movaps xmm2, d3
- mulps xmm2, d7
- movaps xmm3, d2
- mulps xmm3, d8
- subps xmm2, xmm3
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm3, xmm4
- addps xmm3, xmm5
- #ifdef FIX_DEGENERATE_TANGENT
- xorps xmm4, xmm4
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #endif
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtps xmm4, xmm3
- mulps xmm3, xmm4
- mulps xmm3, xmm4
- subps xmm3, xmm6
- mulps xmm4, xmm7
- mulps xmm3, xmm4
- #else
- rsqrtps xmm3, xmm3
- #endif
- xorps xmm3, signBit
- mulps xmm0, xmm3
- movaps t3, xmm0
- mulps xmm1, xmm3
- movaps t4, xmm1
- mulps xmm2, xmm3
- movaps t5, xmm2
- }
- #else
- ALIGN16( float tmp[4] );
- // normal
- n0[0] = d6[0] * d2[0];
- n0[1] = d6[1] * d2[1];
- n0[2] = d6[2] * d2[2];
- n0[3] = d6[3] * d2[3];
- n0[0] -= d7[0] * d1[0];
- n0[1] -= d7[1] * d1[1];
- n0[2] -= d7[2] * d1[2];
- n0[3] -= d7[3] * d1[3];
- n1[0] = d7[0] * d0[0];
- n1[1] = d7[1] * d0[1];
- n1[2] = d7[2] * d0[2];
- n1[3] = d7[3] * d0[3];
- n1[0] -= d5[0] * d2[0];
- n1[1] -= d5[1] * d2[1];
- n1[2] -= d5[2] * d2[2];
- n1[3] -= d5[3] * d2[3];
- n2[0] = d5[0] * d1[0];
- n2[1] = d5[1] * d1[1];
- n2[2] = d5[2] * d1[2];
- n2[3] = d5[3] * d1[3];
- n2[0] -= d6[0] * d0[0];
- n2[1] -= d6[1] * d0[1];
- n2[2] -= d6[2] * d0[2];
- n2[3] -= d6[3] * d0[3];
- tmp[0] = n0[0] * n0[0];
- tmp[1] = n0[1] * n0[1];
- tmp[2] = n0[2] * n0[2];
- tmp[3] = n0[3] * n0[3];
- tmp[0] += n1[0] * n1[0];
- tmp[1] += n1[1] * n1[1];
- tmp[2] += n1[2] * n1[2];
- tmp[3] += n1[3] * n1[3];
- tmp[0] += n2[0] * n2[0];
- tmp[1] += n2[1] * n2[1];
- tmp[2] += n2[2] * n2[2];
- tmp[3] += n2[3] * n2[3];
- tmp[0] = idMath::RSqrt( tmp[0] );
- tmp[1] = idMath::RSqrt( tmp[1] );
- tmp[2] = idMath::RSqrt( tmp[2] );
- tmp[3] = idMath::RSqrt( tmp[3] );
- n0[0] *= tmp[0];
- n0[1] *= tmp[1];
- n0[2] *= tmp[2];
- n0[3] *= tmp[3];
- n1[0] *= tmp[0];
- n1[1] *= tmp[1];
- n1[2] *= tmp[2];
- n1[3] *= tmp[3];
- n2[0] *= tmp[0];
- n2[1] *= tmp[1];
- n2[2] *= tmp[2];
- n2[3] *= tmp[3];
- // area sign bit
- tmp[0] = d3[0] * d9[0];
- tmp[1] = d3[1] * d9[1];
- tmp[2] = d3[2] * d9[2];
- tmp[3] = d3[3] * d9[3];
- tmp[0] -= d4[0] * d8[0];
- tmp[1] -= d4[1] * d8[1];
- tmp[2] -= d4[2] * d8[2];
- tmp[3] -= d4[3] * d8[3];
- signBit[0] = ( *(unsigned long *)&tmp[0] ) & ( 1 << 31 );
- signBit[1] = ( *(unsigned long *)&tmp[1] ) & ( 1 << 31 );
- signBit[2] = ( *(unsigned long *)&tmp[2] ) & ( 1 << 31 );
- signBit[3] = ( *(unsigned long *)&tmp[3] ) & ( 1 << 31 );
- // first tangent
- t0[0] = d0[0] * d9[0];
- t0[1] = d0[1] * d9[1];
- t0[2] = d0[2] * d9[2];
- t0[3] = d0[3] * d9[3];
- t0[0] -= d4[0] * d5[0];
- t0[1] -= d4[1] * d5[1];
- t0[2] -= d4[2] * d5[2];
- t0[3] -= d4[3] * d5[3];
- t1[0] = d1[0] * d9[0];
- t1[1] = d1[1] * d9[1];
- t1[2] = d1[2] * d9[2];
- t1[3] = d1[3] * d9[3];
- t1[0] -= d4[0] * d6[0];
- t1[1] -= d4[1] * d6[1];
- t1[2] -= d4[2] * d6[2];
- t1[3] -= d4[3] * d6[3];
- t2[0] = d2[0] * d9[0];
- t2[1] = d2[1] * d9[1];
- t2[2] = d2[2] * d9[2];
- t2[3] = d2[3] * d9[3];
- t2[0] -= d4[0] * d7[0];
- t2[1] -= d4[1] * d7[1];
- t2[2] -= d4[2] * d7[2];
- t2[3] -= d4[3] * d7[3];
- tmp[0] = t0[0] * t0[0];
- tmp[1] = t0[1] * t0[1];
- tmp[2] = t0[2] * t0[2];
- tmp[3] = t0[3] * t0[3];
- tmp[0] += t1[0] * t1[0];
- tmp[1] += t1[1] * t1[1];
- tmp[2] += t1[2] * t1[2];
- tmp[3] += t1[3] * t1[3];
- tmp[0] += t2[0] * t2[0];
- tmp[1] += t2[1] * t2[1];
- tmp[2] += t2[2] * t2[2];
- tmp[3] += t2[3] * t2[3];
- tmp[0] = idMath::RSqrt( tmp[0] );
- tmp[1] = idMath::RSqrt( tmp[1] );
- tmp[2] = idMath::RSqrt( tmp[2] );
- tmp[3] = idMath::RSqrt( tmp[3] );
- *(unsigned long *)&tmp[0] ^= signBit[0];
- *(unsigned long *)&tmp[1] ^= signBit[1];
- *(unsigned long *)&tmp[2] ^= signBit[2];
- *(unsigned long *)&tmp[3] ^= signBit[3];
- t0[0] *= tmp[0];
- t0[1] *= tmp[1];
- t0[2] *= tmp[2];
- t0[3] *= tmp[3];
- t1[0] *= tmp[0];
- t1[1] *= tmp[1];
- t1[2] *= tmp[2];
- t1[3] *= tmp[3];
- t2[0] *= tmp[0];
- t2[1] *= tmp[1];
- t2[2] *= tmp[2];
- t2[3] *= tmp[3];
- // second tangent
- t3[0] = d3[0] * d5[0];
- t3[1] = d3[1] * d5[1];
- t3[2] = d3[2] * d5[2];
- t3[3] = d3[3] * d5[3];
- t3[0] -= d0[0] * d8[0];
- t3[1] -= d0[1] * d8[1];
- t3[2] -= d0[2] * d8[2];
- t3[3] -= d0[3] * d8[3];
- t4[0] = d3[0] * d6[0];
- t4[1] = d3[1] * d6[1];
- t4[2] = d3[2] * d6[2];
- t4[3] = d3[3] * d6[3];
- t4[0] -= d1[0] * d8[0];
- t4[1] -= d1[1] * d8[1];
- t4[2] -= d1[2] * d8[2];
- t4[3] -= d1[3] * d8[3];
- t5[0] = d3[0] * d7[0];
- t5[1] = d3[1] * d7[1];
- t5[2] = d3[2] * d7[2];
- t5[3] = d3[3] * d7[3];
- t5[0] -= d2[0] * d8[0];
- t5[1] -= d2[1] * d8[1];
- t5[2] -= d2[2] * d8[2];
- t5[3] -= d2[3] * d8[3];
- tmp[0] = t3[0] * t3[0];
- tmp[1] = t3[1] * t3[1];
- tmp[2] = t3[2] * t3[2];
- tmp[3] = t3[3] * t3[3];
- tmp[0] += t4[0] * t4[0];
- tmp[1] += t4[1] * t4[1];
- tmp[2] += t4[2] * t4[2];
- tmp[3] += t4[3] * t4[3];
- tmp[0] += t5[0] * t5[0];
- tmp[1] += t5[1] * t5[1];
- tmp[2] += t5[2] * t5[2];
- tmp[3] += t5[3] * t5[3];
- tmp[0] = idMath::RSqrt( tmp[0] );
- tmp[1] = idMath::RSqrt( tmp[1] );
- tmp[2] = idMath::RSqrt( tmp[2] );
- tmp[3] = idMath::RSqrt( tmp[3] );
- *(unsigned long *)&tmp[0] ^= signBit[0];
- *(unsigned long *)&tmp[1] ^= signBit[1];
- *(unsigned long *)&tmp[2] ^= signBit[2];
- *(unsigned long *)&tmp[3] ^= signBit[3];
- t3[0] *= tmp[0];
- t3[1] *= tmp[1];
- t3[2] *= tmp[2];
- t3[3] *= tmp[3];
- t4[0] *= tmp[0];
- t4[1] *= tmp[1];
- t4[2] *= tmp[2];
- t4[3] *= tmp[3];
- t5[0] *= tmp[0];
- t5[1] *= tmp[1];
- t5[2] *= tmp[2];
- t5[3] *= tmp[3];
- #endif
- for ( int j = 0; j < 4; j++ ) {
- const int v0 = indexes[i + j * 3 + 0];
- const int v1 = indexes[i + j * 3 + 1];
- const int v2 = indexes[i + j * 3 + 2];
- a = verts + v0;
- b = verts + v1;
- c = verts + v2;
- planes->Normal()[0] = n0[j];
- planes->Normal()[1] = n1[j];
- planes->Normal()[2] = n2[j];
- planes->FitThroughPoint( a->xyz );
- planes++;
- if ( used[v0] ) {
- a->normal[0] += n0[j];
- a->normal[1] += n1[j];
- a->normal[2] += n2[j];
- a->tangents[0][0] += t0[j];
- a->tangents[0][1] += t1[j];
- a->tangents[0][2] += t2[j];
- a->tangents[1][0] += t3[j];
- a->tangents[1][1] += t4[j];
- a->tangents[1][2] += t5[j];
- } else {
- a->normal[0] = n0[j];
- a->normal[1] = n1[j];
- a->normal[2] = n2[j];
- a->tangents[0][0] = t0[j];
- a->tangents[0][1] = t1[j];
- a->tangents[0][2] = t2[j];
- a->tangents[1][0] = t3[j];
- a->tangents[1][1] = t4[j];
- a->tangents[1][2] = t5[j];
- used[v0] = true;
- }
- if ( used[v1] ) {
- b->normal[0] += n0[j];
- b->normal[1] += n1[j];
- b->normal[2] += n2[j];
- b->tangents[0][0] += t0[j];
- b->tangents[0][1] += t1[j];
- b->tangents[0][2] += t2[j];
- b->tangents[1][0] += t3[j];
- b->tangents[1][1] += t4[j];
- b->tangents[1][2] += t5[j];
- } else {
- b->normal[0] = n0[j];
- b->normal[1] = n1[j];
- b->normal[2] = n2[j];
- b->tangents[0][0] = t0[j];
- b->tangents[0][1] = t1[j];
- b->tangents[0][2] = t2[j];
- b->tangents[1][0] = t3[j];
- b->tangents[1][1] = t4[j];
- b->tangents[1][2] = t5[j];
- used[v1] = true;
- }
- if ( used[v2] ) {
- c->normal[0] += n0[j];
- c->normal[1] += n1[j];
- c->normal[2] += n2[j];
- c->tangents[0][0] += t0[j];
- c->tangents[0][1] += t1[j];
- c->tangents[0][2] += t2[j];
- c->tangents[1][0] += t3[j];
- c->tangents[1][1] += t4[j];
- c->tangents[1][2] += t5[j];
- } else {
- c->normal[0] = n0[j];
- c->normal[1] = n1[j];
- c->normal[2] = n2[j];
- c->tangents[0][0] = t0[j];
- c->tangents[0][1] = t1[j];
- c->tangents[0][2] = t2[j];
- c->tangents[1][0] = t3[j];
- c->tangents[1][1] = t4[j];
- c->tangents[1][2] = t5[j];
- used[v2] = true;
- }
- }
- }
- for ( ; i < numIndexes; i += 3 ) {
- idDrawVert *a, *b, *c;
- ALIGN16( unsigned long signBit[4] );
- float d0, d1, d2, d3, d4;
- float d5, d6, d7, d8, d9;
- float n0, n1, n2;
- float t0, t1, t2;
- float t3, t4, t5;
- const int v0 = indexes[i + 0];
- const int v1 = indexes[i + 1];
- const int v2 = indexes[i + 2];
- a = verts + v0;
- b = verts + v1;
- c = verts + v2;
- d0 = b->xyz[0] - a->xyz[0];
- d1 = b->xyz[1] - a->xyz[1];
- d2 = b->xyz[2] - a->xyz[2];
- d3 = b->st[0] - a->st[0];
- d4 = b->st[1] - a->st[1];
- d5 = c->xyz[0] - a->xyz[0];
- d6 = c->xyz[1] - a->xyz[1];
- d7 = c->xyz[2] - a->xyz[2];
- d8 = c->st[0] - a->st[0];
- d9 = c->st[1] - a->st[1];
- #if 1
- __asm {
- // normal
- movss xmm0, d6
- mulss xmm0, d2
- movss xmm1, d7
- mulss xmm1, d1
- subss xmm0, xmm1
- movss xmm1, d7
- mulss xmm1, d0
- movss xmm2, d5
- mulss xmm2, d2
- subss xmm1, xmm2
- movss xmm2, d5
- mulss xmm2, d1
- movss xmm3, d6
- mulss xmm3, d0
- subss xmm2, xmm3
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, xmm3
- mulss xmm4, xmm4
- mulss xmm5, xmm5
- addss xmm3, xmm4
- addss xmm3, xmm5
- #ifdef FIX_DEGENERATE_TANGENT
- xorps xmm4, xmm4
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #endif
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtss xmm4, xmm3
- mulss xmm3, xmm4
- mulss xmm3, xmm4
- subss xmm3, xmm6
- mulss xmm4, xmm7
- mulss xmm3, xmm4
- #else
- rsqrtss xmm3, xmm3
- #endif
- mulss xmm0, xmm3
- movss n0, xmm0
- mulss xmm1, xmm3
- movss n1, xmm1
- mulss xmm2, xmm3
- movss n2, xmm2
- // area sign bit
- movss xmm0, d3
- mulss xmm0, d9
- movss xmm1, d4
- mulss xmm1, d8
- subss xmm0, xmm1
- andps xmm0, SIMD_SP_signBitMask
- movaps signBit, xmm0
- // first tangent
- movss xmm0, d0
- mulss xmm0, d9
- movss xmm1, d4
- mulss xmm1, d5
- subss xmm0, xmm1
- movss xmm1, d1
- mulss xmm1, d9
- movss xmm2, d4
- mulss xmm2, d6
- subss xmm1, xmm2
- movss xmm2, d2
- mulss xmm2, d9
- movss xmm3, d4
- mulss xmm3, d7
- subss xmm2, xmm3
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, xmm3
- mulss xmm4, xmm4
- mulss xmm5, xmm5
- addss xmm3, xmm4
- addss xmm3, xmm5
- #ifdef FIX_DEGENERATE_TANGENT
- xorps xmm4, xmm4
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #endif
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtss xmm4, xmm3
- mulss xmm3, xmm4
- mulss xmm3, xmm4
- subss xmm3, xmm6
- mulss xmm4, xmm7
- mulss xmm3, xmm4
- #else
- rsqrtss xmm3, xmm3
- #endif
- xorps xmm3, signBit
- mulss xmm0, xmm3
- movss t0, xmm0
- mulss xmm1, xmm3
- movss t1, xmm1
- mulss xmm2, xmm3
- movss t2, xmm2
- // second tangent
- movss xmm0, d3
- mulss xmm0, d5
- movss xmm1, d0
- mulss xmm1, d8
- subss xmm0, xmm1
- movss xmm1, d3
- mulss xmm1, d6
- movss xmm2, d1
- mulss xmm2, d8
- subss xmm1, xmm2
- movss xmm2, d3
- mulss xmm2, d7
- movss xmm3, d2
- mulss xmm3, d8
- subss xmm2, xmm3
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, xmm3
- mulss xmm4, xmm4
- mulss xmm5, xmm5
- addss xmm3, xmm4
- addss xmm3, xmm5
- #ifdef FIX_DEGENERATE_TANGENT
- xorps xmm4, xmm4
- cmpeqps xmm4, xmm3
- andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
- andps xmm3, SIMD_SP_absMask // make sure the values are positive
- orps xmm3, xmm4
- #endif
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtss xmm4, xmm3
- mulss xmm3, xmm4
- mulss xmm3, xmm4
- subss xmm3, xmm6
- mulss xmm4, xmm7
- mulss xmm3, xmm4
- #else
- rsqrtss xmm3, xmm3
- #endif
- xorps xmm3, signBit
- mulss xmm0, xmm3
- movss t3, xmm0
- mulss xmm1, xmm3
- movss t4, xmm1
- mulss xmm2, xmm3
- movss t5, xmm2
- }
- #else
- float tmp;
- // normal
- n0 = d6 * d2 - d7 * d1;
- n1 = d7 * d0 - d5 * d2;
- n2 = d5 * d1 - d6 * d0;
- tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
- n0 *= tmp;
- n1 *= tmp;
- n2 *= tmp;
- // area sign bit
- tmp = d3 * d9 - d4 * d8;
- signBit[0] = ( *(unsigned long *)&tmp ) & ( 1 << 31 );
- // first tangent
- t0 = d0 * d9 - d4 * d5;
- t1 = d1 * d9 - d4 * d6;
- t2 = d2 * d9 - d4 * d7;
- tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
- *(unsigned long *)&tmp ^= signBit[0];
- t0 *= tmp;
- t1 *= tmp;
- t2 *= tmp;
- // second tangent
- t3 = d3 * d5 - d0 * d8;
- t4 = d3 * d6 - d1 * d8;
- t5 = d3 * d7 - d2 * d8;
- tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
- *(unsigned long *)&tmp ^= signBit[0];
- t3 *= tmp;
- t4 *= tmp;
- t5 *= tmp;
- #endif
- planes->Normal()[0] = n0;
- planes->Normal()[1] = n1;
- planes->Normal()[2] = n2;
- planes->FitThroughPoint( a->xyz );
- planes++;
- if ( used[v0] ) {
- a->normal[0] += n0;
- a->normal[1] += n1;
- a->normal[2] += n2;
- a->tangents[0][0] += t0;
- a->tangents[0][1] += t1;
- a->tangents[0][2] += t2;
- a->tangents[1][0] += t3;
- a->tangents[1][1] += t4;
- a->tangents[1][2] += t5;
- } else {
- a->normal[0] = n0;
- a->normal[1] = n1;
- a->normal[2] = n2;
- a->tangents[0][0] = t0;
- a->tangents[0][1] = t1;
- a->tangents[0][2] = t2;
- a->tangents[1][0] = t3;
- a->tangents[1][1] = t4;
- a->tangents[1][2] = t5;
- used[v0] = true;
- }
- if ( used[v1] ) {
- b->normal[0] += n0;
- b->normal[1] += n1;
- b->normal[2] += n2;
- b->tangents[0][0] += t0;
- b->tangents[0][1] += t1;
- b->tangents[0][2] += t2;
- b->tangents[1][0] += t3;
- b->tangents[1][1] += t4;
- b->tangents[1][2] += t5;
- } else {
- b->normal[0] = n0;
- b->normal[1] = n1;
- b->normal[2] = n2;
- b->tangents[0][0] = t0;
- b->tangents[0][1] = t1;
- b->tangents[0][2] = t2;
- b->tangents[1][0] = t3;
- b->tangents[1][1] = t4;
- b->tangents[1][2] = t5;
- used[v1] = true;
- }
- if ( used[v2] ) {
- c->normal[0] += n0;
- c->normal[1] += n1;
- c->normal[2] += n2;
- c->tangents[0][0] += t0;
- c->tangents[0][1] += t1;
- c->tangents[0][2] += t2;
- c->tangents[1][0] += t3;
- c->tangents[1][1] += t4;
- c->tangents[1][2] += t5;
- } else {
- c->normal[0] = n0;
- c->normal[1] = n1;
- c->normal[2] = n2;
- c->tangents[0][0] = t0;
- c->tangents[0][1] = t1;
- c->tangents[0][2] = t2;
- c->tangents[1][0] = t3;
- c->tangents[1][1] = t4;
- c->tangents[1][2] = t5;
- used[v2] = true;
- }
- }
- }
- /*
- ============
- idSIMD_SSE::DeriveUnsmoothedTangents
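- 
- Derives the normal and tangents for each vertex from its dominant triangle,
- using precomputed normalization scales instead of a square root per vertex.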
- ============
- */
- #define DERIVE_UNSMOOTHED_BITANGENT
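- // when defined, the bitangent is computed as ( tangent x normal ) * s1 instead of from the texture-space deltas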
- void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
- int i, j;
- for ( i = 0; i <= numVerts - 4; i += 4 ) {
- ALIGN16( float s0[4] );
- ALIGN16( float s1[4] );
- ALIGN16( float s2[4] );
- ALIGN16( float d0[4] );
- ALIGN16( float d1[4] );
- ALIGN16( float d2[4] );
- ALIGN16( float d3[4] );
- ALIGN16( float d4[4] );
- ALIGN16( float d5[4] );
- ALIGN16( float d6[4] );
- ALIGN16( float d7[4] );
- ALIGN16( float d8[4] );
- ALIGN16( float d9[4] );
- ALIGN16( float n0[4] );
- ALIGN16( float n1[4] );
- ALIGN16( float n2[4] );
- ALIGN16( float t0[4] );
- ALIGN16( float t1[4] );
- ALIGN16( float t2[4] );
- ALIGN16( float t3[4] );
- ALIGN16( float t4[4] );
- ALIGN16( float t5[4] );
- for ( j = 0; j < 4; j++ ) {
- const idDrawVert *a, *b, *c;
- const dominantTri_s &dt = dominantTris[i+j];
- s0[j] = dt.normalizationScale[0];
- s1[j] = dt.normalizationScale[1];
- s2[j] = dt.normalizationScale[2];
- a = verts + i + j;
- b = verts + dt.v2;
- c = verts + dt.v3;
- d0[j] = b->xyz[0] - a->xyz[0];
- d1[j] = b->xyz[1] - a->xyz[1];
- d2[j] = b->xyz[2] - a->xyz[2];
- d3[j] = b->st[0] - a->st[0];
- d4[j] = b->st[1] - a->st[1];
- d5[j] = c->xyz[0] - a->xyz[0];
- d6[j] = c->xyz[1] - a->xyz[1];
- d7[j] = c->xyz[2] - a->xyz[2];
- d8[j] = c->st[0] - a->st[0];
- d9[j] = c->st[1] - a->st[1];
- }
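- // process four vertices at a time, with the edge and texcoord deltas of their dominant triangles gathered into SoA form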
- #if 1
- __asm {
- movaps xmm0, d6
- mulps xmm0, d2
- movaps xmm1, d7
- mulps xmm1, d1
- movaps xmm2, d7
- mulps xmm2, d0
- movaps xmm3, d5
- mulps xmm3, d2
- movaps xmm4, d5
- mulps xmm4, d1
- movaps xmm5, d6
- mulps xmm5, d0
- subps xmm0, xmm1
- subps xmm2, xmm3
- movaps xmm7, s2
- subps xmm4, xmm5
- mulps xmm0, xmm7
- movaps n0, xmm0
- mulps xmm2, xmm7
- movaps n1, xmm2
- mulps xmm4, xmm7
- movaps n2, xmm4
- movaps xmm0, d0
- mulps xmm0, d9
- movaps xmm1, d4
- mulps xmm1, d5
- movaps xmm2, d1
- mulps xmm2, d9
- movaps xmm3, d4
- mulps xmm3, d6
- movaps xmm4, d2
- mulps xmm4, d9
- movaps xmm5, d4
- mulps xmm5, d7
- subps xmm0, xmm1
- subps xmm2, xmm3
- movaps xmm7, s0
- subps xmm4, xmm5
- mulps xmm0, xmm7
- movaps t0, xmm0
- mulps xmm2, xmm7
- movaps t1, xmm2
- mulps xmm4, xmm7
- movaps t2, xmm4
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- movaps xmm0, d3
- mulps xmm0, d5
- movaps xmm1, d0
- mulps xmm1, d8
- movaps xmm2, d3
- mulps xmm2, d6
- movaps xmm3, d1
- mulps xmm3, d8
- movaps xmm4, d3
- mulps xmm4, d7
- movaps xmm5, d2
- mulps xmm5, d8
- #else
- movaps xmm0, n2
- mulps xmm0, t1
- movaps xmm1, n1
- mulps xmm1, t2
- movaps xmm2, n0
- mulps xmm2, t2
- movaps xmm3, n2
- mulps xmm3, t0
- movaps xmm4, n1
- mulps xmm4, t0
- movaps xmm5, n0
- mulps xmm5, t1
- #endif
- subps xmm0, xmm1
- subps xmm2, xmm3
- movaps xmm7, s1
- subps xmm4, xmm5
- mulps xmm0, xmm7
- movaps t3, xmm0
- mulps xmm2, xmm7
- movaps t4, xmm2
- mulps xmm4, xmm7
- movaps t5, xmm4
- }
- #else
- n0[0] = d6[0] * d2[0];
- n0[1] = d6[1] * d2[1];
- n0[2] = d6[2] * d2[2];
- n0[3] = d6[3] * d2[3];
- n1[0] = d7[0] * d0[0];
- n1[1] = d7[1] * d0[1];
- n1[2] = d7[2] * d0[2];
- n1[3] = d7[3] * d0[3];
- n2[0] = d5[0] * d1[0];
- n2[1] = d5[1] * d1[1];
- n2[2] = d5[2] * d1[2];
- n2[3] = d5[3] * d1[3];
- n0[0] -= d7[0] * d1[0];
- n0[1] -= d7[1] * d1[1];
- n0[2] -= d7[2] * d1[2];
- n0[3] -= d7[3] * d1[3];
- n1[0] -= d5[0] * d2[0];
- n1[1] -= d5[1] * d2[1];
- n1[2] -= d5[2] * d2[2];
- n1[3] -= d5[3] * d2[3];
- n2[0] -= d6[0] * d0[0];
- n2[1] -= d6[1] * d0[1];
- n2[2] -= d6[2] * d0[2];
- n2[3] -= d6[3] * d0[3];
- n0[0] *= s2[0];
- n0[1] *= s2[1];
- n0[2] *= s2[2];
- n0[3] *= s2[3];
- n1[0] *= s2[0];
- n1[1] *= s2[1];
- n1[2] *= s2[2];
- n1[3] *= s2[3];
- n2[0] *= s2[0];
- n2[1] *= s2[1];
- n2[2] *= s2[2];
- n2[3] *= s2[3];
- t0[0] = d0[0] * d9[0];
- t0[1] = d0[1] * d9[1];
- t0[2] = d0[2] * d9[2];
- t0[3] = d0[3] * d9[3];
- t1[0] = d1[0] * d9[0];
- t1[1] = d1[1] * d9[1];
- t1[2] = d1[2] * d9[2];
- t1[3] = d1[3] * d9[3];
- t2[0] = d2[0] * d9[0];
- t2[1] = d2[1] * d9[1];
- t2[2] = d2[2] * d9[2];
- t2[3] = d2[3] * d9[3];
- t0[0] -= d4[0] * d5[0];
- t0[1] -= d4[1] * d5[1];
- t0[2] -= d4[2] * d5[2];
- t0[3] -= d4[3] * d5[3];
- t1[0] -= d4[0] * d6[0];
- t1[1] -= d4[1] * d6[1];
- t1[2] -= d4[2] * d6[2];
- t1[3] -= d4[3] * d6[3];
- t2[0] -= d4[0] * d7[0];
- t2[1] -= d4[1] * d7[1];
- t2[2] -= d4[2] * d7[2];
- t2[3] -= d4[3] * d7[3];
- t0[0] *= s0[0];
- t0[1] *= s0[1];
- t0[2] *= s0[2];
- t0[3] *= s0[3];
- t1[0] *= s0[0];
- t1[1] *= s0[1];
- t1[2] *= s0[2];
- t1[3] *= s0[3];
- t2[0] *= s0[0];
- t2[1] *= s0[1];
- t2[2] *= s0[2];
- t2[3] *= s0[3];
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- t3[0] = d3[0] * d5[0];
- t3[1] = d3[1] * d5[1];
- t3[2] = d3[2] * d5[2];
- t3[3] = d3[3] * d5[3];
- t4[0] = d3[0] * d6[0];
- t4[1] = d3[1] * d6[1];
- t4[2] = d3[2] * d6[2];
- t4[3] = d3[3] * d6[3];
- t5[0] = d3[0] * d7[0];
- t5[1] = d3[1] * d7[1];
- t5[2] = d3[2] * d7[2];
- t5[3] = d3[3] * d7[3];
- t3[0] -= d0[0] * d8[0];
- t3[1] -= d0[1] * d8[1];
- t3[2] -= d0[2] * d8[2];
- t3[3] -= d0[3] * d8[3];
- t4[0] -= d1[0] * d8[0];
- t4[1] -= d1[1] * d8[1];
- t4[2] -= d1[2] * d8[2];
- t4[3] -= d1[3] * d8[3];
- t5[0] -= d2[0] * d8[0];
- t5[1] -= d2[1] * d8[1];
- t5[2] -= d2[2] * d8[2];
- t5[3] -= d2[3] * d8[3];
- #else
- t3[0] = n2[0] * t1[0];
- t3[1] = n2[1] * t1[1];
- t3[2] = n2[2] * t1[2];
- t3[3] = n2[3] * t1[3];
- t4[0] = n0[0] * t2[0];
- t4[1] = n0[1] * t2[1];
- t4[2] = n0[2] * t2[2];
- t4[3] = n0[3] * t2[3];
- t5[0] = n1[0] * t0[0];
- t5[1] = n1[1] * t0[1];
- t5[2] = n1[2] * t0[2];
- t5[3] = n1[3] * t0[3];
- t3[0] -= n1[0] * t2[0];
- t3[1] -= n1[1] * t2[1];
- t3[2] -= n1[2] * t2[2];
- t3[3] -= n1[3] * t2[3];
- t4[0] -= n2[0] * t0[0];
- t4[1] -= n2[1] * t0[1];
- t4[2] -= n2[2] * t0[2];
- t4[3] -= n2[3] * t0[3];
- t5[0] -= n0[0] * t1[0];
- t5[1] -= n0[1] * t1[1];
- t5[2] -= n0[2] * t1[2];
- t5[3] -= n0[3] * t1[3];
- #endif
- t3[0] *= s1[0];
- t3[1] *= s1[1];
- t3[2] *= s1[2];
- t3[3] *= s1[3];
- t4[0] *= s1[0];
- t4[1] *= s1[1];
- t4[2] *= s1[2];
- t4[3] *= s1[3];
- t5[0] *= s1[0];
- t5[1] *= s1[1];
- t5[2] *= s1[2];
- t5[3] *= s1[3];
- #endif
- for ( j = 0; j < 4; j++ ) {
- idDrawVert *a;
- a = verts + i + j;
- a->normal[0] = n0[j];
- a->normal[1] = n1[j];
- a->normal[2] = n2[j];
- a->tangents[0][0] = t0[j];
- a->tangents[0][1] = t1[j];
- a->tangents[0][2] = t2[j];
- a->tangents[1][0] = t3[j];
- a->tangents[1][1] = t4[j];
- a->tangents[1][2] = t5[j];
- }
- }
- for ( ; i < numVerts; i++ ) {
- idDrawVert *a, *b, *c;
- float d0, d1, d2, d3, d4;
- float d5, d6, d7, d8, d9;
- float s0, s1, s2;
- float n0, n1, n2;
- float t0, t1, t2;
- float t3, t4, t5;
- const dominantTri_s &dt = dominantTris[i];
- s0 = dt.normalizationScale[0];
- s1 = dt.normalizationScale[1];
- s2 = dt.normalizationScale[2];
- a = verts + i;
- b = verts + dt.v2;
- c = verts + dt.v3;
- d0 = b->xyz[0] - a->xyz[0];
- d1 = b->xyz[1] - a->xyz[1];
- d2 = b->xyz[2] - a->xyz[2];
- d3 = b->st[0] - a->st[0];
- d4 = b->st[1] - a->st[1];
- d5 = c->xyz[0] - a->xyz[0];
- d6 = c->xyz[1] - a->xyz[1];
- d7 = c->xyz[2] - a->xyz[2];
- d8 = c->st[0] - a->st[0];
- d9 = c->st[1] - a->st[1];
- #if 1
- __asm {
- movss xmm0, d6
- mulss xmm0, d2
- movss xmm1, d7
- mulss xmm1, d1
- movss xmm2, d7
- mulss xmm2, d0
- movss xmm3, d5
- mulss xmm3, d2
- movss xmm4, d5
- mulss xmm4, d1
- movss xmm5, d6
- mulss xmm5, d0
- subss xmm0, xmm1
- subss xmm2, xmm3
- movss xmm7, s2
- subss xmm4, xmm5
- mulss xmm0, xmm7
- movss n0, xmm0
- mulss xmm2, xmm7
- movss n1, xmm2
- mulss xmm4, xmm7
- movss n2, xmm4
- movss xmm0, d0
- mulss xmm0, d9
- movss xmm1, d4
- mulss xmm1, d5
- movss xmm2, d1
- mulss xmm2, d9
- movss xmm3, d4
- mulss xmm3, d6
- movss xmm4, d2
- mulss xmm4, d9
- movss xmm5, d4
- mulss xmm5, d7
- subss xmm0, xmm1
- subss xmm2, xmm3
- movss xmm7, s0
- subss xmm4, xmm5
- mulss xmm0, xmm7
- movss t0, xmm0
- mulss xmm2, xmm7
- movss t1, xmm2
- mulss xmm4, xmm7
- movss t2, xmm4
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- movss xmm0, d3
- mulss xmm0, d5
- movss xmm1, d0
- mulss xmm1, d8
- movss xmm2, d3
- mulss xmm2, d6
- movss xmm3, d1
- mulss xmm3, d8
- movss xmm4, d3
- mulss xmm4, d7
- movss xmm5, d2
- mulss xmm5, d8
- #else
- movss xmm0, n2
- mulss xmm0, t1
- movss xmm1, n1
- mulss xmm1, t2
- movss xmm2, n0
- mulss xmm2, t2
- movss xmm3, n2
- mulss xmm3, t0
- movss xmm4, n1
- mulss xmm4, t0
- movss xmm5, n0
- mulss xmm5, t1
- #endif
- subss xmm0, xmm1
- subss xmm2, xmm3
- movss xmm7, s1
- subss xmm4, xmm5
- mulss xmm0, xmm7
- movss t3, xmm0
- mulss xmm2, xmm7
- movss t4, xmm2
- mulss xmm4, xmm7
- movss t5, xmm4
- }
- #else
- n0 = s2 * ( d6 * d2 - d7 * d1 );
- n1 = s2 * ( d7 * d0 - d5 * d2 );
- n2 = s2 * ( d5 * d1 - d6 * d0 );
- t0 = s0 * ( d0 * d9 - d4 * d5 );
- t1 = s0 * ( d1 * d9 - d4 * d6 );
- t2 = s0 * ( d2 * d9 - d4 * d7 );
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- t3 = s1 * ( d3 * d5 - d0 * d8 );
- t4 = s1 * ( d3 * d6 - d1 * d8 );
- t5 = s1 * ( d3 * d7 - d2 * d8 );
- #else
- t3 = s1 * ( n2 * t1 - n1 * t2 );
- t4 = s1 * ( n0 * t2 - n2 * t0 );
- t5 = s1 * ( n1 * t0 - n0 * t1 );
- #endif
- #endif
- a->normal[0] = n0;
- a->normal[1] = n1;
- a->normal[2] = n2;
- a->tangents[0][0] = t0;
- a->tangents[0][1] = t1;
- a->tangents[0][2] = t2;
- a->tangents[1][0] = t3;
- a->tangents[1][1] = t4;
- a->tangents[1][2] = t5;
- }
- }
- /*
- ============
- idSIMD_SSE::NormalizeTangents
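- 
- Normalizes the vertex normals, then makes both tangents orthogonal to the
- normal ( t -= ( t . n ) * n ) and renormalizes them.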
- ============
- */
- void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
- ALIGN16( float normal[12] );
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
- assert( verts != NULL );
- assert( numVerts >= 0 );
- __asm {
- mov eax, numVerts
- test eax, eax
- jz done
- #ifdef REFINE_TANGENT_SQUAREROOT
- movaps xmm6, SIMD_SP_rsqrt_c0
- movaps xmm7, SIMD_SP_rsqrt_c1
- #endif
- mov esi, verts
- imul eax, DRAWVERT_SIZE
- add esi, eax
- neg eax
- add eax, DRAWVERT_SIZE*4
- jle loopVert4
- sub eax, DRAWVERT_SIZE*4
- jl loopVert1
- loopVert4:
- sub eax, DRAWVERT_SIZE*4
- // normalize 4 idDrawVert::normal
- movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0] // 0, X, X, X
- movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0] // 0, X, 3, 4
- movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8] // 5, X, X, X
- movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4] // 5, X, 1, 2
- movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0] // 6, X, X, X
- movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0] // 6, X, 9, 10
- movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8] // 11, X, X, X
- movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4] // 11, X, 7, 8
- movaps xmm1, xmm0
- movaps xmm5, xmm2
- shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
- shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
- shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
- shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
- shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm3, xmm4
- addps xmm3, xmm5
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtps xmm4, xmm3
- mulps xmm3, xmm4
- mulps xmm3, xmm4
- subps xmm3, xmm6
- mulps xmm4, xmm7
- mulps xmm3, xmm4
- #else
- rsqrtps xmm3, xmm3
- #endif
- mulps xmm0, xmm3
- mulps xmm1, xmm3
- mulps xmm2, xmm3
- // save the 4 idDrawVert::normal to project the tangents
- movaps [normal+ 0], xmm0
- movaps [normal+16], xmm1
- movaps [normal+32], xmm2
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
- // project and normalize 4 idDrawVert::tangent[0]
- movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, X, X
- movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, 3, 4
- movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8] // 5, X, X, X
- movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4] // 5, X, 1, 2
- movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, X, X
- movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, 9, 10
- movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8] // 11, X, X, X
- movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4] // 11, X, 7, 8
- movaps xmm1, xmm0
- movaps xmm5, xmm2
- shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
- shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
- shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
- shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
- shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, [normal+ 0]
- mulps xmm4, [normal+16]
- mulps xmm5, [normal+32]
- addps xmm3, xmm4
- addps xmm3, xmm5
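- // xmm3 = tangent . normal; the next block subtracts ( t . n ) * n from the tangent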
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm3, [normal+ 0]
- mulps xmm4, [normal+16]
- mulps xmm5, [normal+32]
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm3, xmm4
- addps xmm3, xmm5
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtps xmm4, xmm3
- mulps xmm3, xmm4
- mulps xmm3, xmm4
- subps xmm3, xmm6
- mulps xmm4, xmm7
- mulps xmm3, xmm4
- #else
- rsqrtps xmm3, xmm3
- #endif
- mulps xmm0, xmm3
- mulps xmm1, xmm3
- mulps xmm2, xmm3
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
- // project and normalize 4 idDrawVert::tangent[1]
- movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, X, X
- movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, 3, 4
- movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8] // 5, X, X, X
- movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4] // 5, X, 1, 2
- movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, X, X
- movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, 9, 10
- movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8] // 11, X, X, X
- movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4] // 11, X, 7, 8
- movaps xmm1, xmm0
- movaps xmm5, xmm2
- shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
- shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
- shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
- shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
- shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, [normal+ 0]
- mulps xmm4, [normal+16]
- mulps xmm5, [normal+32]
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm3, [normal+ 0]
- mulps xmm4, [normal+16]
- mulps xmm5, [normal+32]
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm3, xmm4
- addps xmm3, xmm5
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtps xmm4, xmm3
- mulps xmm3, xmm4
- mulps xmm3, xmm4
- subps xmm3, xmm6
- mulps xmm4, xmm7
- mulps xmm3, xmm4
- #else
- rsqrtps xmm3, xmm3
- #endif
- mulps xmm0, xmm3
- mulps xmm1, xmm3
- mulps xmm2, xmm3
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
- shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
- add eax, DRAWVERT_SIZE*8
- jle loopVert4
- sub eax, DRAWVERT_SIZE*4
- jge done
- loopVert1:
- // normalize one idDrawVert::normal
- movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
- movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
- movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, xmm3
- mulss xmm4, xmm4
- mulss xmm5, xmm5
- addss xmm3, xmm4
- addss xmm3, xmm5
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtss xmm4, xmm3
- mulss xmm3, xmm4
- mulss xmm3, xmm4
- subss xmm3, xmm6
- mulss xmm4, xmm7
- mulss xmm3, xmm4
- #else
- rsqrtss xmm3, xmm3
- #endif
- mulss xmm0, xmm3
- mulss xmm1, xmm3
- mulss xmm2, xmm3
- movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
- // project and normalize one idDrawVert::tangent[0]
- movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
- movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
- movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
- mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
- mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
- addss xmm3, xmm4
- addss xmm3, xmm5
- movss xmm4, xmm3
- movss xmm5, xmm3
- mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
- mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
- mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
- subss xmm0, xmm3
- subss xmm1, xmm4
- subss xmm2, xmm5
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, xmm3
- mulss xmm4, xmm4
- mulss xmm5, xmm5
- addss xmm3, xmm4
- addss xmm3, xmm5
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtss xmm4, xmm3
- mulss xmm3, xmm4
- mulss xmm3, xmm4
- subss xmm3, xmm6
- mulss xmm4, xmm7
- mulss xmm3, xmm4
- #else
- rsqrtss xmm3, xmm3
- #endif
- mulss xmm0, xmm3
- mulss xmm1, xmm3
- mulss xmm2, xmm3
- movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
- // project and normalize one idDrawVert::tangent[1]
- movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
- movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
- movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
- mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
- mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
- addss xmm3, xmm4
- addss xmm3, xmm5
- movss xmm4, xmm3
- movss xmm5, xmm3
- mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
- mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
- mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
- subss xmm0, xmm3
- subss xmm1, xmm4
- subss xmm2, xmm5
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- mulss xmm3, xmm3
- mulss xmm4, xmm4
- mulss xmm5, xmm5
- addss xmm3, xmm4
- addss xmm3, xmm5
- #ifdef REFINE_TANGENT_SQUAREROOT
- rsqrtss xmm4, xmm3
- mulss xmm3, xmm4
- mulss xmm3, xmm4
- subss xmm3, xmm6
- mulss xmm4, xmm7
- mulss xmm3, xmm4
- #else
- rsqrtss xmm3, xmm3
- #endif
- mulss xmm0, xmm3
- mulss xmm1, xmm3
- mulss xmm2, xmm3
- movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
- movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
- movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
- add eax, DRAWVERT_SIZE
- jl loopVert1
- done:
- }
- }
- /*
- ============
- idSIMD_SSE::CreateTextureSpaceLightVectors
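- Transforms the vector from each referenced vertex to the light origin into that vertex's tangent space.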
- ============
- */
- void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
- bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
- memset( used, 0, numVerts * sizeof( used[0] ) );
- for ( int i = numIndexes - 1; i >= 0; i-- ) {
- used[indexes[i]] = true;
- }
- #if 0
- __asm {
- mov eax, numVerts
- mov esi, used
- add esi, eax
- mov edi, verts
- sub edi, DRAWVERT_SIZE
- neg eax
- dec eax
- mov ecx, lightOrigin
- movss xmm7, [ecx+0]
- movhps xmm7, [ecx+4]
- mov ecx, lightVectors
- sub ecx, 3*4
- loopVert:
- inc eax
- jge done
- add edi, DRAWVERT_SIZE
- add ecx, 3*4
- cmp byte ptr [esi+eax], 0
- je loopVert
- movaps xmm0, xmm7
- movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
- subps xmm0, xmm1
- // 0, X, 1, 2
- // 3, X, 4, 5
- // 6, X, 7, 8
- movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
- movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
- mulps xmm2, xmm0
- movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
- movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
- mulps xmm3, xmm0
- movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
- unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
- unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
- movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
- movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
- mulps xmm4, xmm0
- movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
- movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
- shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
- addps xmm5, xmm4
- addps xmm5, xmm2
- movlps [ecx+0], xmm5
- shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
- movss [ecx+8], xmm5
- jmp loopVert
- done:
- }
- #elif 1
- for ( int i = 0; i < numVerts; i++ ) {
- if ( !used[i] ) {
- continue;
- }
- const idDrawVert *v = &verts[i];
- idVec3 lightDir;
- lightDir[0] = lightOrigin[0] - v->xyz[0];
- lightDir[1] = lightOrigin[1] - v->xyz[1];
- lightDir[2] = lightOrigin[2] - v->xyz[2];
- lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
- lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
- lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
- }
- #elif 1
- ALIGN16( int usedVertNums[4] );
- ALIGN16( float lightDir0[4] );
- ALIGN16( float lightDir1[4] );
- ALIGN16( float lightDir2[4] );
- ALIGN16( float normal0[4] );
- ALIGN16( float normal1[4] );
- ALIGN16( float normal2[4] );
- ALIGN16( float tangent0[4] );
- ALIGN16( float tangent1[4] );
- ALIGN16( float tangent2[4] );
- ALIGN16( float tangent3[4] );
- ALIGN16( float tangent4[4] );
- ALIGN16( float tangent5[4] );
- idVec3 localLightOrigin = lightOrigin;
- __asm {
- xor ecx, ecx
- mov eax, numVerts
- mov esi, used
- add esi, eax
- mov edi, verts
- sub edi, DRAWVERT_SIZE
- neg eax
- dec eax
- loopVert4:
- inc eax
- jge done4
- add edi, DRAWVERT_SIZE
- cmp byte ptr [esi+eax], 0
- je loopVert4
- mov usedVertNums[ecx*4], eax
- inc ecx
- cmp ecx, 4
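- // this compare drives the jl at the end of the gather below; the intervening SSE moves and arithmetic leave EFLAGS untouched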
- movss xmm0, localLightOrigin[0]
- movss xmm1, localLightOrigin[4]
- movss xmm2, localLightOrigin[8]
- subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
- subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
- subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
- movss lightDir0[ecx*4-4], xmm0
- movss lightDir1[ecx*4-4], xmm1
- movss lightDir2[ecx*4-4], xmm2
- movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
- movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
- movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
- movss normal0[ecx*4-4], xmm3
- movss normal1[ecx*4-4], xmm4
- movss normal2[ecx*4-4], xmm5
- movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
- movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
- movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
- movss tangent0[ecx*4-4], xmm0
- movss tangent1[ecx*4-4], xmm1
- movss tangent2[ecx*4-4], xmm2
- movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
- movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
- movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
- movss tangent3[ecx*4-4], xmm3
- movss tangent4[ecx*4-4], xmm4
- movss tangent5[ecx*4-4], xmm5
- jl loopVert4
- movaps xmm0, lightDir0
- movaps xmm1, lightDir1
- movaps xmm2, lightDir2
- movaps xmm3, tangent0
- mulps xmm3, xmm0
- movaps xmm4, tangent1
- mulps xmm4, xmm1
- movaps xmm5, tangent2
- mulps xmm5, xmm2
- addps xmm3, xmm4
- addps xmm5, xmm3
- movaps xmm3, tangent3
- mulps xmm3, xmm0
- movaps xmm4, tangent4
- mulps xmm4, xmm1
- movaps xmm6, tangent5
- mulps xmm6, xmm2
- addps xmm3, xmm4
- addps xmm6, xmm3
- mulps xmm0, normal0
- mulps xmm1, normal1
- mulps xmm2, normal2
- addps xmm0, xmm1
- addps xmm0, xmm2
- mov ecx, numVerts
- imul ecx, 12
- mov edx, usedVertNums[0]
- add ecx, lightVectors
- imul edx, 12
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
- mov edx, usedVertNums[4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- imul edx, 12
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
- mov edx, usedVertNums[8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- imul edx, 12
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
- mov edx, usedVertNums[12]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- imul edx, 12
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- xor ecx, ecx
- jmp loopVert4
- done4:
- test ecx, ecx
- jz done
- xor eax, eax
- mov edi, numVerts
- imul edi, 12
- add edi, lightVectors
- loopVert1:
- movss xmm0, lightDir0[eax*4]
- movss xmm1, lightDir1[eax*4]
- movss xmm2, lightDir2[eax*4]
- mov edx, usedVertNums[eax*4]
- imul edx, 12
- movss xmm3, tangent0[eax*4]
- mulss xmm3, xmm0
- movss xmm4, tangent1[eax*4]
- mulss xmm4, xmm1
- movss xmm5, tangent2[eax*4]
- mulss xmm5, xmm2
- addss xmm3, xmm4
- addss xmm5, xmm3
- movss [edi+edx+0], xmm5
- movss xmm3, tangent3[eax*4]
- mulss xmm3, xmm0
- movss xmm4, tangent4[eax*4]
- mulss xmm4, xmm1
- movss xmm6, tangent5[eax*4]
- mulss xmm6, xmm2
- addss xmm3, xmm4
- addss xmm6, xmm3
- movss [edi+edx+4], xmm6
- mulss xmm0, normal0[eax*4]
- mulss xmm1, normal1[eax*4]
- mulss xmm2, normal2[eax*4]
- addss xmm0, xmm1
- addss xmm0, xmm2
- movss [edi+edx+8], xmm0
- inc eax
- dec ecx
- jg loopVert1
- done:
- }
- #else
- ALIGN16( int usedVertNums[4] );
- ALIGN16( float lightDir0[4] );
- ALIGN16( float lightDir1[4] );
- ALIGN16( float lightDir2[4] );
- ALIGN16( float normal0[4] );
- ALIGN16( float normal1[4] );
- ALIGN16( float normal2[4] );
- ALIGN16( float tangent0[4] );
- ALIGN16( float tangent1[4] );
- ALIGN16( float tangent2[4] );
- ALIGN16( float tangent3[4] );
- ALIGN16( float tangent4[4] );
- ALIGN16( float tangent5[4] );
- ALIGN16( float lightVectors0[4] );
- ALIGN16( float lightVectors1[4] );
- ALIGN16( float lightVectors2[4] );
- int numUsedVerts = 0;
- for ( int i = 0; i < numVerts; i++ ) {
- if ( !used[i] ) {
- continue;
- }
- const idDrawVert *v = &verts[i];
- lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
- lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
- lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
- normal0[numUsedVerts] = v->normal[0];
- normal1[numUsedVerts] = v->normal[1];
- normal2[numUsedVerts] = v->normal[2];
- tangent0[numUsedVerts] = v->tangents[0][0];
- tangent1[numUsedVerts] = v->tangents[0][1];
- tangent2[numUsedVerts] = v->tangents[0][2];
- tangent3[numUsedVerts] = v->tangents[1][0];
- tangent4[numUsedVerts] = v->tangents[1][1];
- tangent5[numUsedVerts] = v->tangents[1][2];
- usedVertNums[numUsedVerts++] = i;
- if ( numUsedVerts < 4 ) {
- continue;
- }
- lightVectors0[0] = lightDir0[0] * tangent0[0];
- lightVectors0[1] = lightDir0[1] * tangent0[1];
- lightVectors0[2] = lightDir0[2] * tangent0[2];
- lightVectors0[3] = lightDir0[3] * tangent0[3];
- lightVectors0[0] += lightDir1[0] * tangent1[0];
- lightVectors0[1] += lightDir1[1] * tangent1[1];
- lightVectors0[2] += lightDir1[2] * tangent1[2];
- lightVectors0[3] += lightDir1[3] * tangent1[3];
- lightVectors0[0] += lightDir2[0] * tangent2[0];
- lightVectors0[1] += lightDir2[1] * tangent2[1];
- lightVectors0[2] += lightDir2[2] * tangent2[2];
- lightVectors0[3] += lightDir2[3] * tangent2[3];
- lightVectors1[0] = lightDir0[0] * tangent3[0];
- lightVectors1[1] = lightDir0[1] * tangent3[1];
- lightVectors1[2] = lightDir0[2] * tangent3[2];
- lightVectors1[3] = lightDir0[3] * tangent3[3];
- lightVectors1[0] += lightDir1[0] * tangent4[0];
- lightVectors1[1] += lightDir1[1] * tangent4[1];
- lightVectors1[2] += lightDir1[2] * tangent4[2];
- lightVectors1[3] += lightDir1[3] * tangent4[3];
- lightVectors1[0] += lightDir2[0] * tangent5[0];
- lightVectors1[1] += lightDir2[1] * tangent5[1];
- lightVectors1[2] += lightDir2[2] * tangent5[2];
- lightVectors1[3] += lightDir2[3] * tangent5[3];
- lightVectors2[0] = lightDir0[0] * normal0[0];
- lightVectors2[1] = lightDir0[1] * normal0[1];
- lightVectors2[2] = lightDir0[2] * normal0[2];
- lightVectors2[3] = lightDir0[3] * normal0[3];
- lightVectors2[0] += lightDir1[0] * normal1[0];
- lightVectors2[1] += lightDir1[1] * normal1[1];
- lightVectors2[2] += lightDir1[2] * normal1[2];
- lightVectors2[3] += lightDir1[3] * normal1[3];
- lightVectors2[0] += lightDir2[0] * normal2[0];
- lightVectors2[1] += lightDir2[1] * normal2[1];
- lightVectors2[2] += lightDir2[2] * normal2[2];
- lightVectors2[3] += lightDir2[3] * normal2[3];
- for ( int j = 0; j < 4; j++ ) {
- int n = usedVertNums[j];
- lightVectors[n][0] = lightVectors0[j];
- lightVectors[n][1] = lightVectors1[j];
- lightVectors[n][2] = lightVectors2[j];
- }
- numUsedVerts = 0;
- }
- for ( int i = 0; i < numUsedVerts; i++ ) {
- lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
- lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
- lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
- int n = usedVertNums[i];
- lightVectors[n][0] = lightVectors0[i];
- lightVectors[n][1] = lightVectors1[i];
- lightVectors[n][2] = lightVectors2[i];
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::CreateSpecularTextureCoords
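- Per referenced vertex, adds the normalized light and view directions into a half-angle vector and transforms it into tangent space, storing 1.0 in the fourth component.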
- ============
- */
- void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
- assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
- bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
- memset( used, 0, numVerts * sizeof( used[0] ) );
- for ( int i = numIndexes - 1; i >= 0; i-- ) {
- used[indexes[i]] = true;
- }
- #if 0
- __asm {
- mov eax, numVerts
- mov esi, used
- add esi, eax
- mov edi, verts
- sub edi, DRAWVERT_SIZE
- neg eax
- dec eax
- mov ecx, viewOrigin
- movss xmm6, [ecx+0]
- movhps xmm6, [ecx+4]
- mov ecx, lightOrigin
- movss xmm7, [ecx+0]
- movhps xmm7, [ecx+4]
- mov ecx, texCoords
- sub ecx, 4*4
- loopVert:
- inc eax
- jge done
- add edi, DRAWVERT_SIZE
- add ecx, 4*4
- cmp byte ptr [esi+eax], 0
- je loopVert
- movaps xmm0, xmm7
- movaps xmm1, xmm6
- movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
- subps xmm0, xmm2
- subps xmm1, xmm2
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- // 0, X, 1, 2
- // 3, X, 4, 5
- movaps xmm5, xmm3 // xmm5 = 0, X, 1, 2
- unpcklps xmm5, xmm4 // xmm5 = 0, 3, X, X
- unpckhps xmm3, xmm4 // xmm3 = 1, 4, 2, 5
- movhlps xmm4, xmm3 // xmm4 = 2, 5, 4, 5
- addps xmm5, xmm3
- addps xmm5, xmm4
- shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
- rsqrtps xmm5, xmm5
- movaps xmm4, xmm5
- shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm0, xmm4
- mulps xmm1, xmm5
- addps xmm0, xmm1
- movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
- movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
- mulps xmm2, xmm0
- movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
- movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
- mulps xmm3, xmm0
- movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
- movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
- mulps xmm4, xmm0
- movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
- unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
- unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
- movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
- movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
- shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
- movaps xmm3, SIMD_SP_one
- addps xmm5, xmm4
- addps xmm5, xmm2
- movaps [ecx+0], xmm5
- movss [ecx+12], xmm3
- jmp loopVert
- done:
- }
- #elif 0
- for ( int i = 0; i < numVerts; i++ ) {
- if ( !used[i] ) {
- continue;
- }
- const idDrawVert *v = &verts[i];
- idVec3 lightDir = lightOrigin - v->xyz;
- idVec3 viewDir = viewOrigin - v->xyz;
- float ilength;
- ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
- lightDir[0] *= ilength;
- lightDir[1] *= ilength;
- lightDir[2] *= ilength;
- ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
- viewDir[0] *= ilength;
- viewDir[1] *= ilength;
- viewDir[2] *= ilength;
- lightDir += viewDir;
- texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
- texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
- texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
- texCoords[i][3] = 1.0f;
- }
- #elif 1
- ALIGN16( int usedVertNums[4] );
- ALIGN16( float lightDir0[4] );
- ALIGN16( float lightDir1[4] );
- ALIGN16( float lightDir2[4] );
- ALIGN16( float viewDir0[4] );
- ALIGN16( float viewDir1[4] );
- ALIGN16( float viewDir2[4] );
- ALIGN16( float normal0[4] );
- ALIGN16( float normal1[4] );
- ALIGN16( float normal2[4] );
- ALIGN16( float tangent0[4] );
- ALIGN16( float tangent1[4] );
- ALIGN16( float tangent2[4] );
- ALIGN16( float tangent3[4] );
- ALIGN16( float tangent4[4] );
- ALIGN16( float tangent5[4] );
- idVec3 localLightOrigin = lightOrigin;
- idVec3 localViewOrigin = viewOrigin;
- __asm {
- xor ecx, ecx
- mov eax, numVerts
- mov esi, used
- add esi, eax
- mov edi, verts
- sub edi, DRAWVERT_SIZE
- neg eax
- dec eax
- loopVert4:
- inc eax
- jge done4
- add edi, DRAWVERT_SIZE
- cmp byte ptr [esi+eax], 0
- je loopVert4
- mov usedVertNums[ecx*4], eax
- inc ecx
- cmp ecx, 4
- movss xmm3, localLightOrigin[0]
- movss xmm4, localLightOrigin[4]
- movss xmm5, localLightOrigin[8]
- subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
- subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
- subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
- movss lightDir0[ecx*4-4], xmm3
- movss lightDir1[ecx*4-4], xmm4
- movss lightDir2[ecx*4-4], xmm5
- movss xmm0, localViewOrigin[0]
- movss xmm1, localViewOrigin[4]
- movss xmm2, localViewOrigin[8]
- subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
- subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
- subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
- movss viewDir0[ecx*4-4], xmm0
- movss viewDir1[ecx*4-4], xmm1
- movss viewDir2[ecx*4-4], xmm2
- movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
- movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
- movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
- movss normal0[ecx*4-4], xmm3
- movss normal1[ecx*4-4], xmm4
- movss normal2[ecx*4-4], xmm5
- movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
- movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
- movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
- movss tangent0[ecx*4-4], xmm0
- movss tangent1[ecx*4-4], xmm1
- movss tangent2[ecx*4-4], xmm2
- movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
- movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
- movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
- movss tangent3[ecx*4-4], xmm3
- movss tangent4[ecx*4-4], xmm4
- movss tangent5[ecx*4-4], xmm5
- jl loopVert4
- movaps xmm6, lightDir0
- movaps xmm0, xmm6
- mulps xmm6, xmm6
- movaps xmm7, lightDir1
- movaps xmm1, xmm7
- mulps xmm7, xmm7
- addps xmm6, xmm7
- movaps xmm5, lightDir2
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- addps xmm6, xmm5
- rsqrtps xmm6, xmm6
- mulps xmm0, xmm6
- mulps xmm1, xmm6
- mulps xmm2, xmm6
- movaps xmm3, viewDir0
- movaps xmm7, xmm3
- mulps xmm7, xmm7
- movaps xmm4, viewDir1
- movaps xmm6, xmm4
- mulps xmm6, xmm6
- addps xmm7, xmm6
- movaps xmm5, viewDir2
- movaps xmm6, xmm5
- mulps xmm6, xmm6
- addps xmm7, xmm6
- rsqrtps xmm7, xmm7
- mulps xmm3, xmm7
- addps xmm0, xmm3
- mulps xmm4, xmm7
- addps xmm1, xmm4
- mulps xmm5, xmm7
- addps xmm2, xmm5
- movaps xmm3, tangent0
- mulps xmm3, xmm0
- movaps xmm4, tangent1
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movaps xmm5, tangent2
- mulps xmm5, xmm2
- addps xmm5, xmm3
- movaps xmm3, tangent3
- mulps xmm3, xmm0
- movaps xmm4, tangent4
- mulps xmm4, xmm1
- addps xmm3, xmm4
- movaps xmm6, tangent5
- mulps xmm6, xmm2
- addps xmm6, xmm3
- mulps xmm0, normal0
- mulps xmm1, normal1
- addps xmm0, xmm1
- mulps xmm2, normal2
- addps xmm0, xmm2
- mov ecx, numVerts
- shl ecx, 4
- mov edx, usedVertNums[0]
- add ecx, texCoords
- shl edx, 4
- movss xmm3, SIMD_SP_one
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- movss [ecx+edx+12], xmm3
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
- mov edx, usedVertNums[4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- shl edx, 4
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- movss [ecx+edx+12], xmm3
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
- mov edx, usedVertNums[8]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- shl edx, 4
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- movss [ecx+edx+12], xmm3
- shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
- mov edx, usedVertNums[12]
- shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
- shl edx, 4
- shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
- movss [ecx+edx+0], xmm5
- movss [ecx+edx+4], xmm6
- movss [ecx+edx+8], xmm0
- movss [ecx+edx+12], xmm3
- xor ecx, ecx
- jmp loopVert4
- done4:
- test ecx, ecx
- jz done
- xor eax, eax
- mov edi, numVerts
- shl edi, 4
- add edi, texCoords
- loopVert1:
- movss xmm6, lightDir0[eax*4]
- movss xmm0, xmm6
- mulss xmm6, xmm6
- movss xmm7, lightDir1[eax*4]
- movss xmm1, xmm7
- mulss xmm7, xmm7
- addss xmm6, xmm7
- movss xmm5, lightDir2[eax*4]
- movss xmm2, xmm5
- mulss xmm5, xmm5
- addss xmm6, xmm5
- rsqrtss xmm6, xmm6
- mulss xmm0, xmm6
- mulss xmm1, xmm6
- mulss xmm2, xmm6
- movss xmm3, viewDir0[eax*4]
- movss xmm7, xmm3
- mulss xmm7, xmm7
- movss xmm4, viewDir1[eax*4]
- movss xmm6, xmm4
- mulss xmm6, xmm6
- addss xmm7, xmm6
- movss xmm5, viewDir2[eax*4]
- movss xmm6, xmm5
- mulss xmm6, xmm6
- addss xmm7, xmm6
- rsqrtss xmm7, xmm7
- mulss xmm3, xmm7
- addss xmm0, xmm3
- mulss xmm4, xmm7
- addss xmm1, xmm4
- mulss xmm5, xmm7
- addss xmm2, xmm5
- mov edx, usedVertNums[eax*4]
- shl edx, 4
- movss xmm3, tangent0[eax*4]
- mulss xmm3, xmm0
- movss xmm4, tangent1[eax*4]
- mulss xmm4, xmm1
- addss xmm3, xmm4
- movss xmm5, tangent2[eax*4]
- mulss xmm5, xmm2
- addss xmm5, xmm3
- movss [edi+edx+0], xmm5
- movss xmm3, tangent3[eax*4]
- mulss xmm3, xmm0
- movss xmm4, tangent4[eax*4]
- mulss xmm4, xmm1
- addss xmm3, xmm4
- movss xmm6, tangent5[eax*4]
- mulss xmm6, xmm2
- addss xmm6, xmm3
- movss [edi+edx+4], xmm6
- mulss xmm0, normal0[eax*4]
- mulss xmm1, normal1[eax*4]
- addss xmm0, xmm1
- mulss xmm2, normal2[eax*4]
- addss xmm0, xmm2
- movss [edi+edx+8], xmm0
- movss xmm3, SIMD_SP_one
- movss [edi+edx+12], xmm3
- inc eax
- dec ecx
- jg loopVert1
- done:
- }
- #else
- ALIGN16( int usedVertNums[4] );
- ALIGN16( float lightDir0[4] );
- ALIGN16( float lightDir1[4] );
- ALIGN16( float lightDir2[4] );
- ALIGN16( float viewDir0[4] );
- ALIGN16( float viewDir1[4] );
- ALIGN16( float viewDir2[4] );
- ALIGN16( float normal0[4] );
- ALIGN16( float normal1[4] );
- ALIGN16( float normal2[4] );
- ALIGN16( float tangent0[4] );
- ALIGN16( float tangent1[4] );
- ALIGN16( float tangent2[4] );
- ALIGN16( float tangent3[4] );
- ALIGN16( float tangent4[4] );
- ALIGN16( float tangent5[4] );
- ALIGN16( float texCoords0[4] );
- ALIGN16( float texCoords1[4] );
- ALIGN16( float texCoords2[4] );
- idVec3 localLightOrigin = lightOrigin;
- idVec3 localViewOrigin = viewOrigin;
- int numUsedVerts = 0;
- for ( int i = 0; i < numVerts; i++ ) {
- if ( !used[i] ) {
- continue;
- }
- const idDrawVert *v = &verts[i];
- lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
- lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
- lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
- viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
- viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
- viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
- normal0[numUsedVerts] = v->normal[0];
- normal1[numUsedVerts] = v->normal[1];
- normal2[numUsedVerts] = v->normal[2];
- tangent0[numUsedVerts] = v->tangents[0][0];
- tangent1[numUsedVerts] = v->tangents[0][1];
- tangent2[numUsedVerts] = v->tangents[0][2];
- tangent3[numUsedVerts] = v->tangents[1][0];
- tangent4[numUsedVerts] = v->tangents[1][1];
- tangent5[numUsedVerts] = v->tangents[1][2];
- usedVertNums[numUsedVerts++] = i;
- if ( numUsedVerts < 4 ) {
- continue;
- }
- ALIGN16( float temp[4] );
- temp[0] = lightDir0[0] * lightDir0[0];
- temp[1] = lightDir0[1] * lightDir0[1];
- temp[2] = lightDir0[2] * lightDir0[2];
- temp[3] = lightDir0[3] * lightDir0[3];
- temp[0] += lightDir1[0] * lightDir1[0];
- temp[1] += lightDir1[1] * lightDir1[1];
- temp[2] += lightDir1[2] * lightDir1[2];
- temp[3] += lightDir1[3] * lightDir1[3];
- temp[0] += lightDir2[0] * lightDir2[0];
- temp[1] += lightDir2[1] * lightDir2[1];
- temp[2] += lightDir2[2] * lightDir2[2];
- temp[3] += lightDir2[3] * lightDir2[3];
- temp[0] = idMath::RSqrt( temp[0] );
- temp[1] = idMath::RSqrt( temp[1] );
- temp[2] = idMath::RSqrt( temp[2] );
- temp[3] = idMath::RSqrt( temp[3] );
- lightDir0[0] *= temp[0];
- lightDir0[1] *= temp[1];
- lightDir0[2] *= temp[2];
- lightDir0[3] *= temp[3];
- lightDir1[0] *= temp[0];
- lightDir1[1] *= temp[1];
- lightDir1[2] *= temp[2];
- lightDir1[3] *= temp[3];
- lightDir2[0] *= temp[0];
- lightDir2[1] *= temp[1];
- lightDir2[2] *= temp[2];
- lightDir2[3] *= temp[3];
- temp[0] = viewDir0[0] * viewDir0[0];
- temp[1] = viewDir0[1] * viewDir0[1];
- temp[2] = viewDir0[2] * viewDir0[2];
- temp[3] = viewDir0[3] * viewDir0[3];
- temp[0] += viewDir1[0] * viewDir1[0];
- temp[1] += viewDir1[1] * viewDir1[1];
- temp[2] += viewDir1[2] * viewDir1[2];
- temp[3] += viewDir1[3] * viewDir1[3];
- temp[0] += viewDir2[0] * viewDir2[0];
- temp[1] += viewDir2[1] * viewDir2[1];
- temp[2] += viewDir2[2] * viewDir2[2];
- temp[3] += viewDir2[3] * viewDir2[3];
- temp[0] = idMath::RSqrt( temp[0] );
- temp[1] = idMath::RSqrt( temp[1] );
- temp[2] = idMath::RSqrt( temp[2] );
- temp[3] = idMath::RSqrt( temp[3] );
- viewDir0[0] *= temp[0];
- viewDir0[1] *= temp[1];
- viewDir0[2] *= temp[2];
- viewDir0[3] *= temp[3];
- viewDir1[0] *= temp[0];
- viewDir1[1] *= temp[1];
- viewDir1[2] *= temp[2];
- viewDir1[3] *= temp[3];
- viewDir2[0] *= temp[0];
- viewDir2[1] *= temp[1];
- viewDir2[2] *= temp[2];
- viewDir2[3] *= temp[3];
- lightDir0[0] += viewDir0[0];
- lightDir0[1] += viewDir0[1];
- lightDir0[2] += viewDir0[2];
- lightDir0[3] += viewDir0[3];
- lightDir1[0] += viewDir1[0];
- lightDir1[1] += viewDir1[1];
- lightDir1[2] += viewDir1[2];
- lightDir1[3] += viewDir1[3];
- lightDir2[0] += viewDir2[0];
- lightDir2[1] += viewDir2[1];
- lightDir2[2] += viewDir2[2];
- lightDir2[3] += viewDir2[3];
- texCoords0[0] = lightDir0[0] * tangent0[0];
- texCoords0[1] = lightDir0[1] * tangent0[1];
- texCoords0[2] = lightDir0[2] * tangent0[2];
- texCoords0[3] = lightDir0[3] * tangent0[3];
- texCoords0[0] += lightDir1[0] * tangent1[0];
- texCoords0[1] += lightDir1[1] * tangent1[1];
- texCoords0[2] += lightDir1[2] * tangent1[2];
- texCoords0[3] += lightDir1[3] * tangent1[3];
- texCoords0[0] += lightDir2[0] * tangent2[0];
- texCoords0[1] += lightDir2[1] * tangent2[1];
- texCoords0[2] += lightDir2[2] * tangent2[2];
- texCoords0[3] += lightDir2[3] * tangent2[3];
- texCoords1[0] = lightDir0[0] * tangent3[0];
- texCoords1[1] = lightDir0[1] * tangent3[1];
- texCoords1[2] = lightDir0[2] * tangent3[2];
- texCoords1[3] = lightDir0[3] * tangent3[3];
- texCoords1[0] += lightDir1[0] * tangent4[0];
- texCoords1[1] += lightDir1[1] * tangent4[1];
- texCoords1[2] += lightDir1[2] * tangent4[2];
- texCoords1[3] += lightDir1[3] * tangent4[3];
- texCoords1[0] += lightDir2[0] * tangent5[0];
- texCoords1[1] += lightDir2[1] * tangent5[1];
- texCoords1[2] += lightDir2[2] * tangent5[2];
- texCoords1[3] += lightDir2[3] * tangent5[3];
- texCoords2[0] = lightDir0[0] * normal0[0];
- texCoords2[1] = lightDir0[1] * normal0[1];
- texCoords2[2] = lightDir0[2] * normal0[2];
- texCoords2[3] = lightDir0[3] * normal0[3];
- texCoords2[0] += lightDir1[0] * normal1[0];
- texCoords2[1] += lightDir1[1] * normal1[1];
- texCoords2[2] += lightDir1[2] * normal1[2];
- texCoords2[3] += lightDir1[3] * normal1[3];
- texCoords2[0] += lightDir2[0] * normal2[0];
- texCoords2[1] += lightDir2[1] * normal2[1];
- texCoords2[2] += lightDir2[2] * normal2[2];
- texCoords2[3] += lightDir2[3] * normal2[3];
- for ( int j = 0; j < 4; j++ ) {
- int n = usedVertNums[j];
- texCoords[n][0] = texCoords0[j];
- texCoords[n][1] = texCoords1[j];
- texCoords[n][2] = texCoords2[j];
- texCoords[n][3] = 1.0f;
- }
- numUsedVerts = 0;
- }
- for ( int i = 0; i < numUsedVerts; i++ ) {
- float temp;
- temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
- temp = idMath::RSqrt( temp );
- lightDir0[i] *= temp;
- lightDir1[i] *= temp;
- lightDir2[i] *= temp;
- temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
- temp = idMath::RSqrt( temp );
- viewDir0[i] *= temp;
- viewDir1[i] *= temp;
- viewDir2[i] *= temp;
- lightDir0[i] += viewDir0[i];
- lightDir1[i] += viewDir1[i];
- lightDir2[i] += viewDir2[i];
- texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
- texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
- texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
- int n = usedVertNums[i];
- texCoords[n][0] = texCoords0[i];
- texCoords[n][1] = texCoords1[i];
- texCoords[n][2] = texCoords2[i];
- texCoords[n][3] = 1.0f;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::CreateShadowCache
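- Emits two shadow-volume vertices per unmapped vertex: the point itself with w = 1 and the point minus the light origin with w = 0, filling in vertRemap as it goes.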
- ============
- */
- int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
- #if 1
- int outVerts;
- __asm {
- push ebx
- mov esi, lightOrigin
- movaps xmm5, SIMD_SP_lastOne
- movss xmm6, [esi+0]
- movhps xmm6, [esi+4]
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
- orps xmm6, SIMD_SP_lastOne
- movaps xmm7, xmm6
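- // xmm6 and xmm7 now hold ( lightOrigin.x, lightOrigin.y, lightOrigin.z, 1.0f ); subtracting this from a w = 1 vertex produces the w = 0 projected copy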
- xor ebx, ebx
- xor ecx, ecx
- mov edx, vertRemap
- mov esi, verts
- mov edi, vertexCache
- mov eax, numVerts
- and eax, ~3
- jz done4
- shl eax, 2
- add edx, eax
- neg eax
- loop4:
- prefetchnta [edx+128]
- prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
- cmp dword ptr [edx+eax+0], ebx
- jne skip1
- mov dword ptr [edx+eax+0], ecx
- movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- add ecx, 2
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
- orps xmm0, xmm5
- movaps [edi+0*16], xmm0
- subps xmm0, xmm6
- movaps [edi+1*16], xmm0
- add edi, 2*16
- skip1:
- cmp dword ptr [edx+eax+4], ebx
- jne skip2
- mov dword ptr [edx+eax+4], ecx
- movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- add ecx, 2
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
- orps xmm1, xmm5
- movaps [edi+0*16], xmm1
- subps xmm1, xmm7
- movaps [edi+1*16], xmm1
- add edi, 2*16
- skip2:
- cmp dword ptr [edx+eax+8], ebx
- jne skip3
- mov dword ptr [edx+eax+8], ecx
- movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- add ecx, 2
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
- orps xmm2, xmm5
- movaps [edi+0*16], xmm2
- subps xmm2, xmm6
- movaps [edi+1*16], xmm2
- add edi, 2*16
- skip3:
- cmp dword ptr [edx+eax+12], ebx
- jne skip4
- mov dword ptr [edx+eax+12], ecx
- movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- add ecx, 2
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
- orps xmm3, xmm5
- movaps [edi+0*16], xmm3
- subps xmm3, xmm7
- movaps [edi+1*16], xmm3
- add edi, 2*16
- skip4:
- add esi, 4*DRAWVERT_SIZE
- add eax, 4*4
- jl loop4
- done4:
- mov eax, numVerts
- and eax, 3
- jz done1
- shl eax, 2
- add edx, eax
- neg eax
- loop1:
- cmp dword ptr [edx+eax+0], ebx
- jne skip0
- mov dword ptr [edx+eax+0], ecx
- movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- add ecx, 2
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
- orps xmm0, xmm5
- movaps [edi+0*16], xmm0
- subps xmm0, xmm6
- movaps [edi+1*16], xmm0
- add edi, 2*16
- skip0:
- add esi, DRAWVERT_SIZE
- add eax, 4
- jl loop1
- done1:
- pop ebx
- mov outVerts, ecx
- }
- return outVerts;
- #else
- int outVerts = 0;
- for ( int i = 0; i < numVerts; i++ ) {
- if ( vertRemap[i] ) {
- continue;
- }
- const float *v = verts[i].xyz.ToFloatPtr();
- vertexCache[outVerts+0][0] = v[0];
- vertexCache[outVerts+0][1] = v[1];
- vertexCache[outVerts+0][2] = v[2];
- vertexCache[outVerts+0][3] = 1.0f;
- // R_SetupProjection() builds the projection matrix with a slight crunch
- // for depth, which keeps this w=0 division from rasterizing right at the
- // wrap around point and causing depth fighting with the rear caps
- vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
- vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
- vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
- vertexCache[outVerts+1][3] = 0.0f;
- vertRemap[i] = outVerts;
- outVerts += 2;
- }
- return outVerts;
- #endif
- }
- /*
- ============
- idSIMD_SSE::CreateVertexProgramShadowCache
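- Emits ( x, y, z, 1 ) and ( x, y, z, 0 ) for every vertex so a vertex program can project the w = 0 copies away from the light.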
- ============
- */
- int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
- #if 1
- __asm {
- movaps xmm4, SIMD_SP_lastOne
- movaps xmm5, xmm4
- movaps xmm6, xmm4
- movaps xmm7, xmm4
- mov esi, verts
- mov edi, vertexCache
- mov eax, numVerts
- and eax, ~3
- jz done4
- shl eax, 5
- add edi, eax
- neg eax
- loop4:
- prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
- movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
- movaps [edi+eax+1*16], xmm0
- orps xmm0, xmm4
- movaps [edi+eax+0*16], xmm0
- movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
- movaps [edi+eax+3*16], xmm1
- orps xmm1, xmm5
- movaps [edi+eax+2*16], xmm1
- movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
- movaps [edi+eax+5*16], xmm2
- orps xmm2, xmm6
- movaps [edi+eax+4*16], xmm2
- movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
- shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
- movaps [edi+eax+7*16], xmm3
- orps xmm3, xmm7
- movaps [edi+eax+6*16], xmm3
- add esi, 4*DRAWVERT_SIZE
- add eax, 4*8*4
- jl loop4
- done4:
- mov eax, numVerts
- and eax, 3
- jz done1
- shl eax, 5
- add edi, eax
- neg eax
- loop1:
- movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
- movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
- movaps [edi+eax+1*16], xmm0
- orps xmm0, xmm4
- movaps [edi+eax+0*16], xmm0
- add esi, DRAWVERT_SIZE
- add eax, 8*4
- jl loop1
- done1:
- }
- return numVerts * 2;
- #else
- for ( int i = 0; i < numVerts; i++ ) {
- const float *v = verts[i].xyz.ToFloatPtr();
- vertexCache[i*2+0][0] = v[0];
- vertexCache[i*2+0][1] = v[1];
- vertexCache[i*2+0][2] = v[2];
- vertexCache[i*2+0][3] = 1.0f;
- vertexCache[i*2+1][0] = v[0];
- vertexCache[i*2+1][1] = v[1];
- vertexCache[i*2+1][2] = v[2];
- vertexCache[i*2+1][3] = 0.0f;
- }
- return numVerts * 2;
- #endif
- }
- /*
- ============
- SSE_UpSample11kHzMonoPCMTo44kHz
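- Duplicates each 11kHz mono sample four times for 44kHz output.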
- ============
- */
- static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
- __asm {
- mov esi, src
- mov edi, dest
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 1
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 2*4*4
- movsx ecx, word ptr [esi+eax+0]
- cvtsi2ss xmm0, ecx
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi-2*4*4+0], xmm0
- movhps [edi-2*4*4+8], xmm0
- movsx edx, word ptr [esi+eax+2]
- cvtsi2ss xmm1, edx
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi-1*4*4+0], xmm1
- movhps [edi-1*4*4+8], xmm1
- add eax, 2*2
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movsx ecx, word ptr [esi]
- cvtsi2ss xmm0, ecx
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi+0], xmm0
- movhps [edi+8], xmm0
- done:
- }
- }
- /*
- ============
- SSE_UpSample11kHzStereoPCMTo44kHz
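- Duplicates each 11kHz stereo sample pair four times for 44kHz output.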
- ============
- */
- static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
- __asm {
- mov esi, src
- mov edi, dest
- mov eax, numSamples
- test eax, ~1
- jz done2
- shl eax, 1
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 8*4
- movsx ecx, word ptr [esi+eax+0]
- cvtsi2ss xmm0, ecx
- movsx edx, word ptr [esi+eax+2]
- cvtsi2ss xmm1, edx
- unpcklps xmm0, xmm1
- movlps [edi-8*4+0], xmm0
- movlps [edi-8*4+8], xmm0
- movlps [edi-4*4+0], xmm0
- movlps [edi-4*4+8], xmm0
- add eax, 2*2
- jl loop2
- done2:
- }
- }
- /*
- ============
- SSE_UpSample22kHzMonoPCMTo44kHz
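- Duplicates each 22kHz mono sample twice for 44kHz output.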
- ============
- */
- static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
- __asm {
- mov esi, src
- mov edi, dest
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 1
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 4*4
- movsx ecx, word ptr [esi+eax+0]
- cvtsi2ss xmm0, ecx
- movsx edx, word ptr [esi+eax+2]
- cvtsi2ss xmm1, edx
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi-4*4+0], xmm0
- movhps [edi-4*4+8], xmm0
- add eax, 2*2
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movsx ecx, word ptr [esi]
- cvtsi2ss xmm0, ecx
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi], xmm0
- done:
- }
- }
- /*
- ============
- SSE_UpSample22kHzStereoPCMTo44kHz
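- Duplicates each 22kHz stereo sample pair twice for 44kHz output.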
- ============
- */
- static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
- __asm {
- mov esi, src
- mov edi, dest
- mov eax, numSamples
- test eax, ~1
- jz done2
- shl eax, 1
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 4*4
- movsx ecx, word ptr [esi+eax+0]
- cvtsi2ss xmm0, ecx
- movss [edi-4*4], xmm0
- movss [edi-2*4], xmm0
- movsx edx, word ptr [esi+eax+2]
- cvtsi2ss xmm1, edx
- movss [edi-3*4], xmm1
- movss [edi-1*4], xmm1
- add eax, 2*2
- jl loop2
- done2:
- }
- }
- /*
- ============
- SSE_UpSample44kHzMonoPCMTo44kHz
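- Converts 44kHz mono samples to floats one to one.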
- ============
- */
- static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
- __asm {
- mov esi, src
- mov edi, dest
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 1
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 2*4
- movsx ecx, word ptr [esi+eax+0]
- cvtsi2ss xmm0, ecx
- movss [edi-2*4], xmm0
- movsx edx, word ptr [esi+eax+2]
- cvtsi2ss xmm1, edx
- movss [edi-1*4], xmm1
- add eax, 2*2
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movsx ecx, word ptr [esi]
- cvtsi2ss xmm0, ecx
- movss [edi], xmm0
- done:
- }
- }
- /*
- ============
- idSIMD_SSE::UpSamplePCMTo44kHz
- Duplicate samples for 44kHz output.
- ============
- */
- void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
- if ( kHz == 11025 ) {
- if ( numChannels == 1 ) {
- SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
- } else {
- SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
- }
- } else if ( kHz == 22050 ) {
- if ( numChannels == 1 ) {
- SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
- } else {
- SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
- }
- } else if ( kHz == 44100 ) {
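- // a one-to-one short-to-float conversion is channel-layout agnostic, so the mono routine covers stereo here as well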
- SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
- } else {
- assert( 0 );
- }
- }
- /*
- ============
- SSE_UpSample11kHzMonoOGGTo44kHz
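- Scales each 11kHz OGG sample by 32768 and duplicates it four times for 44kHz output.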
- ============
- */
- static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
- float constant = 32768.0f;
- __asm {
- mov esi, src
- mov edi, dest
- movss xmm7, constant
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 2
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 2*16
- movss xmm0, [esi+eax+0]
- mulss xmm0, xmm7
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi-32], xmm0
- movlps [edi-24], xmm0
- movss xmm1, [esi+eax+4]
- mulss xmm1, xmm7
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi-16], xmm1
- movlps [edi- 8], xmm1
- add eax, 2*4
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movss xmm0, [esi]
- mulss xmm0, xmm7
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi+0], xmm0
- movlps [edi+8], xmm0
- done:
- }
- }
- /*
- ============
- SSE_UpSample11kHzStereoOGGTo44kHz
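- Interleaves the two 11kHz OGG channels, scales by 32768, and duplicates each frame four times.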
- ============
- */
- static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
- float constant = 32768.0f;
- __asm {
- mov esi, src
- mov ecx, [esi+0]
- mov edx, [esi+4]
- mov edi, dest
- movss xmm7, constant
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 1
- add ecx, eax
- add edx, eax
- neg eax
- align 16
- loop2:
- add edi, 4*16
- movlps xmm0, [ecx+eax]
- movlps xmm1, [edx+eax]
- unpcklps xmm0, xmm1
- mulps xmm0, xmm7
- movlps [edi-8*8], xmm0
- movlps [edi-7*8], xmm0
- movlps [edi-6*8], xmm0
- movlps [edi-5*8], xmm0
- movhps [edi-4*8], xmm0
- movhps [edi-3*8], xmm0
- movhps [edi-2*8], xmm0
- movhps [edi-1*8], xmm0
- add eax, 2*4
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movss xmm0, [ecx]
- movss xmm1, [edx]
- unpcklps xmm0, xmm1
- mulps xmm0, xmm7
- movlps [edi+0*8], xmm0
- movlps [edi+1*8], xmm0
- movlps [edi+2*8], xmm0
- movlps [edi+3*8], xmm0
- done:
- }
- }
- /*
- ============
- SSE_UpSample22kHzMonoOGGTo44kHz
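- Scales each 22kHz OGG sample by 32768 and duplicates it twice for 44kHz output.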
- ============
- */
- static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
- float constant = 32768.0f;
- __asm {
- mov esi, src
- mov edi, dest
- movss xmm7, constant
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 2
- add esi, eax
- neg eax
- align 16
- loop2:
- add edi, 2*8
- movss xmm0, [esi+eax+0]
- movss xmm1, [esi+eax+4]
- shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm0, xmm7
- movlps [edi-16], xmm0
- movhps [edi- 8], xmm0
- add eax, 2*4
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movss xmm0, [esi]
- mulss xmm0, xmm7
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
- movlps [edi+0], xmm0
- done:
- }
- }
- /*
- ============
- SSE_UpSample22kHzStereoOGGTo44kHz
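- Interleaves the two 22kHz OGG channels, scales by 32768, and duplicates each frame twice.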
- ============
- */
- static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
- float constant = 32768.0f;
- __asm {
- mov esi, src
- mov ecx, [esi+0]
- mov edx, [esi+4]
- mov edi, dest
- movss xmm7, constant
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 1
- add ecx, eax
- add edx, eax
- neg eax
- align 16
- loop2:
- add edi, 2*16
- movlps xmm0, [ecx+eax]
- movlps xmm1, [edx+eax]
- unpcklps xmm0, xmm1
- mulps xmm0, xmm7
- movlps [edi-4*8], xmm0
- movlps [edi-3*8], xmm0
- movhps [edi-2*8], xmm0
- movhps [edi-1*8], xmm0
- add eax, 2*4
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movss xmm0, [ecx]
- movss xmm1, [edx]
- unpcklps xmm0, xmm1
- mulps xmm0, xmm7
- movlps [edi+0*8], xmm0
- movlps [edi+1*8], xmm0
- done:
- }
- }
- /*
- ============
- SSE_UpSample44kHzMonoOGGTo44kHz
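- Scales 44kHz OGG samples by 32768 one to one.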
- ============
- */
- static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
- float constant = 32768.0f;
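- // KFLOAT_CA is the constant-array SSE kernel macro defined earlier in this file; as used here it computes dest[i] = src[i] * constant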
- KFLOAT_CA( mul, dest, src, constant, numSamples )
- }
- /*
- ============
- SSE_UpSample44kHzStereoOGGTo44kHz
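- Interleaves the two 44kHz OGG channels one to one, scaling by 32768.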
- ============
- */
- static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
- float constant = 32768.0f;
- __asm {
- mov esi, src
- mov ecx, [esi+0]
- mov edx, [esi+4]
- mov edi, dest
- movss xmm7, constant
- shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
- mov eax, numSamples
- and eax, ~1
- jz done2
- shl eax, 1
- add ecx, eax
- add edx, eax
- neg eax
- align 16
- loop2:
- add edi, 16
- movlps xmm0, [ecx+eax]
- movlps xmm1, [edx+eax]
- unpcklps xmm0, xmm1
- mulps xmm0, xmm7
- movlps [edi-2*8], xmm0
- movhps [edi-1*8], xmm0
- add eax, 2*4
- jl loop2
- done2:
- mov eax, numSamples
- and eax, 1
- jz done
- movss xmm0, [ecx]
- movss xmm1, [edx]
- unpcklps xmm0, xmm1
- mulps xmm0, xmm7
- movlps [edi+0*8], xmm0
- done:
- }
- }
- /*
- ============
- idSIMD_SSE::UpSampleOGGTo44kHz
- Duplicate samples for 44kHz output.
- ============
- */
- void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
- if ( kHz == 11025 ) {
- if ( numChannels == 1 ) {
- SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
- } else {
- SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
- }
- } else if ( kHz == 22050 ) {
- if ( numChannels == 1 ) {
- SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
- } else {
- SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
- }
- } else if ( kHz == 44100 ) {
- if ( numChannels == 1 ) {
- SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
- } else {
- SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
- }
- } else {
- assert( 0 );
- }
- }
- /*
- ============
- idSIMD_SSE::MixSoundTwoSpeakerMono
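- Mixes a mono sample stream into an interleaved stereo mix buffer, ramping the two speaker volumes linearly from lastV to currentV across the buffer.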
- ============
- */
- void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
- #if 1
- ALIGN16( float incs[2] );
- assert( numSamples == MIXBUFFER_SAMPLES );
- incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- __asm {
- mov eax, MIXBUFFER_SAMPLES
- mov edi, mixBuffer
- mov esi, samples
- shl eax, 2
- add esi, eax
- neg eax
- mov ecx, lastV
- movlps xmm6, [ecx]
- xorps xmm7, xmm7
- movhps xmm7, incs
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- addps xmm6, xmm7
- shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
- addps xmm7, xmm7
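- // xmm6 = volumes for two consecutive frames ( L, R, L+incL, R+incR ); xmm7 = the per-frame increments doubled, so each addps advances xmm6 by two frames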
- loop16:
- add edi, 4*4*4
- movaps xmm0, [esi+eax+0*4*4]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
- mulps xmm0, xmm6
- addps xmm0, [edi-4*4*4]
- addps xmm6, xmm7
- movaps [edi-4*4*4], xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
- mulps xmm1, xmm6
- addps xmm1, [edi-3*4*4]
- addps xmm6, xmm7
- movaps [edi-3*4*4], xmm1
- movaps xmm2, [esi+eax+1*4*4]
- movaps xmm3, xmm2
- shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
- mulps xmm2, xmm6
- addps xmm2, [edi-2*4*4]
- addps xmm6, xmm7
- movaps [edi-2*4*4], xmm2
- shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
- mulps xmm3, xmm6
- addps xmm3, [edi-1*4*4]
- addps xmm6, xmm7
- movaps [edi-1*4*4], xmm3
- add eax, 2*4*4
- jl loop16
- }
- #else
- int i;
- float incL;
- float incR;
- float sL0, sL1;
- float sR0, sR1;
- assert( numSamples == MIXBUFFER_SAMPLES );
- incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- sL0 = lastV[0];
- sR0 = lastV[1];
- sL1 = lastV[0] + incL;
- sR1 = lastV[1] + incR;
- incL *= 2;
- incR *= 2;
- for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
- mixBuffer[i*2+0] += samples[i+0] * sL0;
- mixBuffer[i*2+1] += samples[i+0] * sR0;
- mixBuffer[i*2+2] += samples[i+1] * sL1;
- mixBuffer[i*2+3] += samples[i+1] * sR1;
- sL0 += incL;
- sR0 += incR;
- sL1 += incL;
- sR1 += incR;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::MixSoundTwoSpeakerStereo
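- Mixes an interleaved stereo sample stream into a stereo mix buffer with linearly ramped volumes.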
- ============
- */
- void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
- #if 1
- ALIGN16( float incs[2] );
- assert( numSamples == MIXBUFFER_SAMPLES );
- incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- __asm {
- mov eax, MIXBUFFER_SAMPLES
- mov edi, mixBuffer
- mov esi, samples
- shl eax, 3
- add esi, eax
- neg eax
- mov ecx, lastV
- movlps xmm6, [ecx]
- xorps xmm7, xmm7
- movhps xmm7, incs
- shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
- addps xmm6, xmm7
- shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
- addps xmm7, xmm7
- loop16:
- add edi, 4*4*4
- movaps xmm0, [esi+eax+0*4*4]
- mulps xmm0, xmm6
- addps xmm0, [edi-4*4*4]
- addps xmm6, xmm7
- movaps [edi-4*4*4], xmm0
- movaps xmm2, [esi+eax+1*4*4]
- mulps xmm2, xmm6
- addps xmm2, [edi-3*4*4]
- addps xmm6, xmm7
- movaps [edi-3*4*4], xmm2
- movaps xmm3, [esi+eax+2*4*4]
- mulps xmm3, xmm6
- addps xmm3, [edi-2*4*4]
- addps xmm6, xmm7
- movaps [edi-2*4*4], xmm3
- movaps xmm4, [esi+eax+3*4*4]
- mulps xmm4, xmm6
- addps xmm4, [edi-1*4*4]
- addps xmm6, xmm7
- movaps [edi-1*4*4], xmm4
- add eax, 4*4*4
- jl loop16
- }
- #else
- int i;
- float incL;
- float incR;
- float sL0, sL1;
- float sR0, sR1;
- assert( numSamples == MIXBUFFER_SAMPLES );
- incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- sL0 = lastV[0];
- sR0 = lastV[1];
- sL1 = lastV[0] + incL;
- sR1 = lastV[1] + incR;
- incL *= 2;
- incR *= 2;
- for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
- mixBuffer[i*2+0] += samples[i*2+0] * sL0;
- mixBuffer[i*2+1] += samples[i*2+1] * sR0;
- mixBuffer[i*2+2] += samples[i*2+2] * sL1;
- mixBuffer[i*2+3] += samples[i*2+3] * sR1;
- sL0 += incL;
- sR0 += incR;
- sL1 += incL;
- sR1 += incR;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::MixSoundSixSpeakerMono
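- Mixes a mono sample stream into an interleaved six-channel mix buffer, ramping all six speaker volumes linearly from lastV to currentV.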
- ============
- */
- void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
- #if 1
- ALIGN16( float incs[6] );
- assert( numSamples == MIXBUFFER_SAMPLES );
- incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
- __asm {
- mov eax, MIXBUFFER_SAMPLES
- mov edi, mixBuffer
- mov esi, samples
- shl eax, 2
- add esi, eax
- neg eax
- mov ecx, lastV
- movlps xmm2, [ecx+ 0]
- movhps xmm2, [ecx+ 8]
- movlps xmm3, [ecx+16]
- movaps xmm4, xmm2
- shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
- shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
- xorps xmm5, xmm5
- movhps xmm5, incs
- movlps xmm7, incs+8
- movhps xmm7, incs+16
- addps xmm3, xmm5
- addps xmm4, xmm7
- shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
- movaps xmm6, xmm7
- shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
- addps xmm5, xmm5
- addps xmm6, xmm6
- addps xmm7, xmm7
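- // xmm2..xmm4 hold the six channel volumes staggered across two frames ( f0 ch0-3 | f0 ch4-5, f1 ch0-1 | f1 ch2-5 ); xmm5..xmm7 hold the matching increments, doubled because each register is reused every second frame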
- loop24:
- add edi, 6*16
- movaps xmm0, [esi+eax]
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- mulps xmm1, xmm2
- addps xmm1, [edi-6*16]
- addps xmm2, xmm5
- movaps [edi-6*16], xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
- mulps xmm1, xmm3
- addps xmm1, [edi-5*16]
- addps xmm3, xmm6
- movaps [edi-5*16], xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
- mulps xmm1, xmm4
- addps xmm1, [edi-4*16]
- addps xmm4, xmm7
- movaps [edi-4*16], xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
- mulps xmm1, xmm2
- addps xmm1, [edi-3*16]
- addps xmm2, xmm5
- movaps [edi-3*16], xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
- mulps xmm1, xmm3
- addps xmm1, [edi-2*16]
- addps xmm3, xmm6
- movaps [edi-2*16], xmm1
- shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
- mulps xmm0, xmm4
- addps xmm0, [edi-1*16]
- addps xmm4, xmm7
- movaps [edi-1*16], xmm0
- add eax, 4*4
- jl loop24
- }
- #else
- int i;
- float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
- float incL0, incL1, incL2, incL3, incL4, incL5;
- assert( numSamples == MIXBUFFER_SAMPLES );
- incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
- sL0 = lastV[0];
- sL1 = lastV[1];
- sL2 = lastV[2];
- sL3 = lastV[3];
- sL4 = lastV[4];
- sL5 = lastV[5];
- sL6 = lastV[0] + incL0;
- sL7 = lastV[1] + incL1;
- sL8 = lastV[2] + incL2;
- sL9 = lastV[3] + incL3;
- sL10 = lastV[4] + incL4;
- sL11 = lastV[5] + incL5;
- incL0 *= 2;
- incL1 *= 2;
- incL2 *= 2;
- incL3 *= 2;
- incL4 *= 2;
- incL5 *= 2;
- for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
- mixBuffer[i*6+ 0] += samples[i+0] * sL0;
- mixBuffer[i*6+ 1] += samples[i+0] * sL1;
- mixBuffer[i*6+ 2] += samples[i+0] * sL2;
- mixBuffer[i*6+ 3] += samples[i+0] * sL3;
- mixBuffer[i*6+ 4] += samples[i+0] * sL4;
- mixBuffer[i*6+ 5] += samples[i+0] * sL5;
- mixBuffer[i*6+ 6] += samples[i+1] * sL6;
- mixBuffer[i*6+ 7] += samples[i+1] * sL7;
- mixBuffer[i*6+ 8] += samples[i+1] * sL8;
- mixBuffer[i*6+ 9] += samples[i+1] * sL9;
- mixBuffer[i*6+10] += samples[i+1] * sL10;
- mixBuffer[i*6+11] += samples[i+1] * sL11;
- sL0 += incL0;
- sL1 += incL1;
- sL2 += incL2;
- sL3 += incL3;
- sL4 += incL4;
- sL5 += incL5;
- sL6 += incL0;
- sL7 += incL1;
- sL8 += incL2;
- sL9 += incL3;
- sL10 += incL4;
- sL11 += incL5;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::MixSoundSixSpeakerStereo
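- Mixes an interleaved stereo stream into a six-channel mix buffer: left samples feed every channel except SPEAKER_RIGHT and SPEAKER_BACKRIGHT, with linearly ramped volumes.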
- ============
- */
- void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
- #if 1
- ALIGN16( float incs[6] );
- assert( numSamples == MIXBUFFER_SAMPLES );
- assert( SPEAKER_RIGHT == 1 );
- assert( SPEAKER_BACKRIGHT == 5 );
- incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
- __asm {
- mov eax, MIXBUFFER_SAMPLES
- mov edi, mixBuffer
- mov esi, samples
- shl eax, 3
- add esi, eax
- neg eax
- mov ecx, lastV
- movlps xmm2, [ecx+ 0]
- movhps xmm2, [ecx+ 8]
- movlps xmm3, [ecx+16]
- movaps xmm4, xmm2
- shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
- shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
- xorps xmm5, xmm5
- movhps xmm5, incs
- movlps xmm7, incs+ 8
- movhps xmm7, incs+16
- addps xmm3, xmm5
- addps xmm4, xmm7
- shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
- movaps xmm6, xmm7
- shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
- addps xmm5, xmm5
- addps xmm6, xmm6
- addps xmm7, xmm7
- loop12:
- add edi, 3*16
- movaps xmm0, [esi+eax+0]
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
- mulps xmm1, xmm2
- addps xmm1, [edi-3*16]
- addps xmm2, xmm5
- movaps [edi-3*16], xmm1
- movaps xmm1, xmm0
- shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
- mulps xmm1, xmm3
- addps xmm1, [edi-2*16]
- addps xmm3, xmm6
- movaps [edi-2*16], xmm1
- add eax, 4*4
- shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
- mulps xmm0, xmm4
- addps xmm0, [edi-1*16]
- addps xmm4, xmm7
- movaps [edi-1*16], xmm0
- jl loop12
- emms
- }
- #else
- int i;
- float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
- float incL0, incL1, incL2, incL3, incL4, incL5;
- assert( numSamples == MIXBUFFER_SAMPLES );
- assert( SPEAKER_RIGHT == 1 );
- assert( SPEAKER_BACKRIGHT == 5 );
- incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
- sL0 = lastV[0];
- sL1 = lastV[1];
- sL2 = lastV[2];
- sL3 = lastV[3];
- sL4 = lastV[4];
- sL5 = lastV[5];
- sL6 = lastV[0] + incL0;
- sL7 = lastV[1] + incL1;
- sL8 = lastV[2] + incL2;
- sL9 = lastV[3] + incL3;
- sL10 = lastV[4] + incL4;
- sL11 = lastV[5] + incL5;
- incL0 *= 2;
- incL1 *= 2;
- incL2 *= 2;
- incL3 *= 2;
- incL4 *= 2;
- incL5 *= 2;
- for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
- mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
- mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
- mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
- mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
- mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
- mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
- mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
- mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
- mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
- mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
- mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
- mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
- sL0 += incL0;
- sL1 += incL1;
- sL2 += incL2;
- sL3 += incL3;
- sL4 += incL4;
- sL5 += incL5;
- sL6 += incL0;
- sL7 += incL1;
- sL8 += incL2;
- sL9 += incL3;
- sL10 += incL4;
- sL11 += incL5;
- }
- #endif
- }
- /*
- ============
- idSIMD_SSE::MixedSoundToSamples
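- Converts the float mix buffer to 16-bit samples, clamping to [-32768, 32767].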
- ============
- */
- void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
- #if 1
- assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
- __asm {
- mov eax, numSamples
- mov edi, mixBuffer
- mov esi, samples
- shl eax, 2
- add edi, eax
- neg eax
- loop16:
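- // cvtps2pi rounds to 32-bit ints per MXCSR and packssdw saturates to [-32768, 32767], matching the clamped C fallback below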
- movaps xmm0, [edi+eax+0*16]
- movaps xmm2, [edi+eax+1*16]
- movaps xmm4, [edi+eax+2*16]
- movaps xmm6, [edi+eax+3*16]
- add esi, 4*4*2
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- movhlps xmm5, xmm4
- movhlps xmm7, xmm6
- prefetchnta [edi+eax+64]
- cvtps2pi mm0, xmm0
- cvtps2pi mm2, xmm2
- cvtps2pi mm4, xmm4
- cvtps2pi mm6, xmm6
- prefetchnta [edi+eax+128]
- cvtps2pi mm1, xmm1
- cvtps2pi mm3, xmm3
- cvtps2pi mm5, xmm5
- cvtps2pi mm7, xmm7
- add eax, 4*16
- packssdw mm0, mm1
- packssdw mm2, mm3
- packssdw mm4, mm5
- packssdw mm6, mm7
- movq [esi-4*4*2], mm0
- movq [esi-3*4*2], mm2
- movq [esi-2*4*2], mm4
- movq [esi-1*4*2], mm6
- jl loop16
- emms
- }
- #else
- for ( int i = 0; i < numSamples; i++ ) {
- if ( mixBuffer[i] <= -32768.0f ) {
- samples[i] = -32768;
- } else if ( mixBuffer[i] >= 32767.0f ) {
- samples[i] = 32767;
- } else {
- samples[i] = (short) mixBuffer[i];
- }
- }
- #endif
- }
- #endif /* _WIN32 */