Simd_SSE.cpp (480 KB, 14,308 lines)

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
78641786517866178671786817869178701787117872178731787417875178761787717878178791788017881178821788317884178851788617887178881788917890178911789217893178941789517896178971789817899179001790117902179031790417905179061790717908179091791017911179121791317914179151791617917179181791917920179211792217923179241792517926179271792817929179301793117932179331793417935179361793717938179391794017941179421794317944179451794617947179481794917950179511795217953179541795517956179571795817959179601796117962179631796417965179661796717968179691797017971179721797317974179751797617977179781797917980179811798217983179841798517986179871798817989179901799117992179931799417995179961799717998179991800018001180021800318004180051800618007180081800918010180111801218013180141801518016180171801818019180201802118022180231802418025180261802718028180291803018031180321803318034180351803618037180381803918040180411804218043180441804518046180471804818049180501805118052180531805418055180561805718058180591806018061180621806318064180651806618067180681806918070180711807218073180741807518076180771807818079180801808118082180831808418085180861808718088
/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
//===============================================================
//                                                        M
//  SSE implementation of idSIMDProcessor                MrE
//                                                        E
//===============================================================
#if defined(MACOS_X) && defined(__i386__)
#include <xmmintrin.h>
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
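// Illustrative note (added for clarity, not in the original id source):
// R_SHUFFLEPS lists the four selectors in memory order rather than Intel's
// high-to-low convention, so R_SHUFFLEPS( 0, 1, 2, 3 ) == SHUFFLEPS( 3, 2, 1, 0 )
// == 0xE4, the identity shuffle. A broadcast therefore reads naturally:
//
//     __m128 v = _mm_set_ps( 3.0f, 2.0f, 1.0f, 0.0f );        // v = { 0, 1, 2, 3 }
//     v = _mm_shuffle_ps( v, v, R_SHUFFLEPS( 0, 0, 0, 0 ) );  // v = { 0, 0, 0, 0 }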
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
    return "MMX & SSE";
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
    // 0, 1, 2
    // 3, 4, 5
    // 6, 7, 8
    // 9, 10, 11
    /*
        mov eax, count
        mov edi, constant
        mov edx, eax
        mov esi, src
        mov ecx, dst
    */
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers.
    int count_l4 = count;                   // count_l4 = eax
    int count_l1 = count;                   // count_l1 = edx
    char *constant_p = (char *)&constant;   // constant_p = edi
    char *src_p = (char *) src;             // src_p = esi
    char *dst_p = (char *) dst;             // dst_p = ecx
    assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
    assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
    /*
        and eax, ~3
        movss xmm4, [edi+0]
        shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss xmm5, [edi+4]
        shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss xmm6, [edi+8]
        shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss xmm7, [edi+12]
        shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
    */
    count_l4 = count_l4 & ~3;
    xmm4 = _mm_load_ss((float *) (constant_p));
    xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
    xmm5 = _mm_load_ss((float *) (constant_p + 4));
    xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
    xmm6 = _mm_load_ss((float *) (constant_p + 8));
    xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
    xmm7 = _mm_load_ss((float *) (constant_p + 12));
    xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
    /*
        jz startVert1
    */
    if(count_l4 != 0) {
        /*
            imul eax, DRAWVERT_SIZE
            add esi, eax
            neg eax
        */
        count_l4 = count_l4 * DRAWVERT_SIZE;
        src_p = src_p + count_l4;
        count_l4 = -count_l4;
        /*
        loopVert4:
        */
        do {
            /*
                movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]  // 3, X, X, X
                movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]  // 2, X, X, X
                movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
                movaps xmm1, xmm0                                            // 3, X, 0, 1
            */
            xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X
            xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X
            xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1
            xmm1 = xmm0; // 3, X, 0, 1
            /*
                movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
                shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )                 // 2, X, 4, 5
            */
            xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1
            xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5
            /*
                movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]  // 9, X, X, X
                movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
                shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )                 // 0, 3, 6, 9
            */
            xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X
            xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7
            xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9
            /*
                movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
                shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )                 // 1, 4, 7, 10
            */
            xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7
            xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10
            /*
                movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
                shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )                 // 2, 5, 8, 11
            */
            xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X
            xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11
            /*
                add ecx, 16
                add eax, 4*DRAWVERT_SIZE
            */
            dst_p = dst_p + 16;
            count_l4 = count_l4 + 4*DRAWVERT_SIZE;
            /*
                mulps xmm0, xmm4
                mulps xmm1, xmm5
                mulps xmm2, xmm6
                addps xmm0, xmm7
                addps xmm0, xmm1
                addps xmm0, xmm2
            */
            xmm0 = _mm_mul_ps(xmm0, xmm4);
            xmm1 = _mm_mul_ps(xmm1, xmm5);
            xmm2 = _mm_mul_ps(xmm2, xmm6);
            xmm0 = _mm_add_ps(xmm0, xmm7);
            xmm0 = _mm_add_ps(xmm0, xmm1);
            xmm0 = _mm_add_ps(xmm0, xmm2);
            /*
                movlps [ecx-16+0], xmm0
                movhps [ecx-16+8], xmm0
                jl loopVert4
            */
            _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
            _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
        } while(count_l4 < 0);
    }
    /*
    startVert1:
        and edx, 3
        jz done
    */
    count_l1 = count_l1 & 3;
    if(count_l1 != 0) {
        /*
        loopVert1:
            movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
            movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
            movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
            mulss xmm0, xmm4
            mulss xmm1, xmm5
            mulss xmm2, xmm6
            addss xmm0, xmm7
            add ecx, 4
            addss xmm0, xmm1
            add eax, DRAWVERT_SIZE
            addss xmm0, xmm2
            dec edx
            movss [ecx-4], xmm0
            jnz loopVert1
        */
        do {
            xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
            xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
            xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
            xmm0 = _mm_mul_ss(xmm0, xmm4);
            xmm1 = _mm_mul_ss(xmm1, xmm5);
            xmm2 = _mm_mul_ss(xmm2, xmm6);
            xmm0 = _mm_add_ss(xmm0, xmm7);
            dst_p = dst_p + 4;
            xmm0 = _mm_add_ss(xmm0, xmm1);
            count_l4 = count_l4 + DRAWVERT_SIZE;
            xmm0 = _mm_add_ss(xmm0, xmm2);
            count_l1 = count_l1 - 1;
            _mm_store_ss((float *) (dst_p-4), xmm0);
        } while( count_l1 != 0);
    }
    /*
    done:
    */
}
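// Illustrative scalar reference (added for clarity, not part of the original
// source): the two loops above together compute
//
//     for ( int i = 0; i < count; i++ ) {
//         dst[i] = constant[0] * src[i].xyz[0] + constant[1] * src[i].xyz[1] +
//                  constant[2] * src[i].xyz[2] + constant[3];
//     }
//
// The quad loop transposes four vertices' positions into x/y/z registers so
// all four dot products run in parallel; the single loop handles count & 3.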
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
    assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
    assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    char *indexes_p;
    char *src_p;
    int count_l;
    int edx;
    char *min_p;
    char *max_p;
    /*
        movss xmm0, idMath::INFINITY
        xorps xmm1, xmm1
        shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
        subps xmm1, xmm0
        movaps xmm2, xmm0
        movaps xmm3, xmm1
    */
    xmm0 = _mm_load_ss(&idMath::INFINITY);
    // The asm "xorps xmm1, xmm1" would read an uninitialized xmm1 through the
    // intrinsic; to satisfy the compiler, xor the already-initialized xmm0
    // with itself instead (the result is all zeros either way).
    xmm1 = _mm_xor_ps(xmm0, xmm0);
    xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
    xmm1 = _mm_sub_ps(xmm1, xmm0);
    xmm2 = xmm0;
    xmm3 = xmm1;
    /*
        mov edi, indexes
        mov esi, src
        mov eax, count
        and eax, ~3
        jz done4
    */
    indexes_p = (char *) indexes;
    src_p = (char *) src;
    count_l = count;
    count_l = count_l & ~3;
    if(count_l != 0) {
        /*
            shl eax, 2
            add edi, eax
            neg eax
        */
        count_l = count_l << 2;
        indexes_p = indexes_p + count_l;
        count_l = -count_l;
        /*
        loop4:
            // prefetchnta [edi+128]
            // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
        */
        do {
            /*
                mov edx, [edi+eax+0]
                imul edx, DRAWVERT_SIZE
                movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
                movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
                minps xmm0, xmm4
                maxps xmm1, xmm4
            */
            edx = *((int*)(indexes_p+count_l+0));
            edx = edx * DRAWVERT_SIZE;
            xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
            xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
            xmm0 = _mm_min_ps(xmm0, xmm4);
            xmm1 = _mm_max_ps(xmm1, xmm4);
            /*
                mov edx, [edi+eax+4]
                imul edx, DRAWVERT_SIZE
                movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
                movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
                minps xmm2, xmm5
                maxps xmm3, xmm5
            */
            edx = *((int*)(indexes_p+count_l+4));
            edx = edx * DRAWVERT_SIZE;
            xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
            xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
            xmm2 = _mm_min_ps(xmm2, xmm5);
            xmm3 = _mm_max_ps(xmm3, xmm5);
            /*
                mov edx, [edi+eax+8]
                imul edx, DRAWVERT_SIZE
                movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
                movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
                minps xmm0, xmm6
                maxps xmm1, xmm6
            */
            edx = *((int*)(indexes_p+count_l+8));
            edx = edx * DRAWVERT_SIZE;
            xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
            xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
            xmm0 = _mm_min_ps(xmm0, xmm6);
            xmm1 = _mm_max_ps(xmm1, xmm6);
            /*
                mov edx, [edi+eax+12]
                imul edx, DRAWVERT_SIZE
                movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
                movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
                minps xmm2, xmm7
                maxps xmm3, xmm7
            */
            edx = *((int*)(indexes_p+count_l+12));
            edx = edx * DRAWVERT_SIZE;
            xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
            xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
            xmm2 = _mm_min_ps(xmm2, xmm7);
            xmm3 = _mm_max_ps(xmm3, xmm7);
            /*
                add eax, 4*4
                jl loop4
            */
            count_l = count_l + 4*4;
        } while (count_l < 0);
    }
    /*
    done4:
        mov eax, count
        and eax, 3
        jz done1
    */
    count_l = count;
    count_l = count_l & 3;
    if(count_l != 0) {
        /*
            shl eax, 2
            add edi, eax
            neg eax
        */
        count_l = count_l << 2;
        indexes_p = indexes_p + count_l;
        count_l = -count_l;
        /*
        loop1:
        */
        do {
            /*
                mov edx, [edi+eax+0]
                imul edx, DRAWVERT_SIZE
                movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
                movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
                minps xmm0, xmm4
                maxps xmm1, xmm4
            */
            edx = *((int*)(indexes_p+count_l+0));
            edx = edx * DRAWVERT_SIZE;
            xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
            xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
            xmm0 = _mm_min_ps(xmm0, xmm4);
            xmm1 = _mm_max_ps(xmm1, xmm4);
            /*
                add eax, 4
                jl loop1
            */
            count_l = count_l + 4;
        } while (count_l < 0);
    }
    /*
    done1:
        shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
        shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
        minps xmm0, xmm2
        maxps xmm1, xmm3
        mov esi, min
        movhps [esi], xmm0
        movss [esi+8], xmm0
        mov edi, max
        movhps [edi], xmm1
        movss [edi+8], xmm1
    */
    xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
    xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
    xmm0 = _mm_min_ps(xmm0, xmm2);
    xmm1 = _mm_max_ps(xmm1, xmm3);
    min_p = (char *) &min;
    _mm_storeh_pi((__m64 *)(min_p), xmm0);
    _mm_store_ss((float *)(min_p+8), xmm0);
    max_p = (char *) &max;
    _mm_storeh_pi((__m64 *)(max_p), xmm1);
    _mm_store_ss((float *)(max_p+8), xmm1);
}
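// Illustrative scalar reference (added for clarity, not part of the original
// source): the routine above is equivalent to
//
//     min[0] = min[1] = min[2] =  idMath::INFINITY;
//     max[0] = max[1] = max[2] = -idMath::INFINITY;
//     for ( int i = 0; i < count; i++ ) {
//         const idVec3 &v = src[indexes[i]].xyz;
//         for ( int j = 0; j < 3; j++ ) {
//             if ( v[j] < min[j] ) { min[j] = v[j]; }
//             if ( v[j] > max[j] ) { max[j] = v[j]; }
//         }
//     }
//
// using two interleaved min/max accumulator pairs (xmm0/xmm1 and xmm2/xmm3,
// with different component layouts) that the final shuffles merge.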
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
    int count_l4;
    int count_l1;
    char *constant_p;
    char *src_p;
    char *dst_p;
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    /*
        mov eax, count
        mov edi, constant
        mov edx, eax
        mov esi, src
        mov ecx, dst
        and eax, ~3
    */
    count_l4 = count;
    constant_p = (char *) &constant;
    count_l1 = count_l4;
    src_p = (char *) src;
    dst_p = (char *) dst;
    count_l4 = count_l4 & ~3;
    /*
        movss xmm5, [edi+0]
        shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss xmm6, [edi+4]
        shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss xmm7, [edi+8]
        shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
    */
    xmm5 = _mm_load_ss((float *) (constant_p+0));
    xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
    xmm6 = _mm_load_ss((float *) (constant_p+4));
    xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
    xmm7 = _mm_load_ss((float *) (constant_p+8));
    xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
    /*
        jz startVert1
    */
    // The asm jz tests eax after "and eax, ~3", so check the masked count
    // here, not the raw count (testing count would run the quad loop once for
    // 0 < count < 4).
    if(count_l4 != 0) {
        /*
            imul eax, 16
            add esi, eax
            neg eax
        */
        count_l4 = count_l4 * 16;
        src_p = src_p + count_l4;
        count_l4 = -count_l4;
        /*
        loopVert4:
        */
        do {
            /*
                movlps xmm1, [esi+eax+ 0]
                movlps xmm3, [esi+eax+ 8]
                movhps xmm1, [esi+eax+16]
                movhps xmm3, [esi+eax+24]
                movlps xmm2, [esi+eax+32]
                movlps xmm4, [esi+eax+40]
                movhps xmm2, [esi+eax+48]
                movhps xmm4, [esi+eax+56]
                movaps xmm0, xmm1
                shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
                shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
                movaps xmm2, xmm3
                shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
                shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
            */
            xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
            xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
            xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
            xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
            xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
            xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
            xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
            xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
            xmm0 = xmm1;
            xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
            xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
            xmm2 = xmm3;
            xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
            xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
            /*
                add ecx, 16
                add eax, 4*16
            */
            dst_p = dst_p + 16;
            count_l4 = count_l4 + 4*16;
            /*
                mulps xmm0, xmm5
                mulps xmm1, xmm6
                mulps xmm2, xmm7
                addps xmm0, xmm3
                addps xmm0, xmm1
                addps xmm0, xmm2
            */
            xmm0 = _mm_mul_ps(xmm0, xmm5);
            xmm1 = _mm_mul_ps(xmm1, xmm6);
            xmm2 = _mm_mul_ps(xmm2, xmm7);
            xmm0 = _mm_add_ps(xmm0, xmm3);
            xmm0 = _mm_add_ps(xmm0, xmm1);
            xmm0 = _mm_add_ps(xmm0, xmm2);
            /*
                movlps [ecx-16+0], xmm0
                movhps [ecx-16+8], xmm0
                jl loopVert4
            */
            _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
            _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
        } while (count_l4 < 0);
    }
    /*
    startVert1:
        and edx, 3
        jz done
    */
    count_l1 = count_l1 & 3;
    if(count_l1 != 0) {
        /*
        loopVert1:
            movss xmm0, [esi+eax+0]
            movss xmm1, [esi+eax+4]
            movss xmm2, [esi+eax+8]
            mulss xmm0, xmm5
            mulss xmm1, xmm6
            mulss xmm2, xmm7
            addss xmm0, [esi+eax+12]
            add ecx, 4
            addss xmm0, xmm1
            add eax, 16
            addss xmm0, xmm2
            dec edx
            movss [ecx-4], xmm0
            jnz loopVert1
        */
        do {
            xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
            xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
            xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
            xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
            xmm0 = _mm_mul_ss(xmm0, xmm5);
            xmm1 = _mm_mul_ss(xmm1, xmm6);
            xmm2 = _mm_mul_ss(xmm2, xmm7);
            xmm0 = _mm_add_ss(xmm0, xmm3);
            dst_p = dst_p + 4;
            xmm0 = _mm_add_ss(xmm0, xmm1);
            count_l4 = count_l4 + 16;
            xmm0 = _mm_add_ss(xmm0, xmm2);
            count_l1 = count_l1 - 1;
            _mm_store_ss((float *) (dst_p-4), xmm0);
        } while (count_l1 != 0);
    }
    /*
    done:
    */
}
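// Illustrative scalar reference (added for clarity, not part of the original
// source): per plane, the loops above compute
//
//     for ( int i = 0; i < count; i++ ) {
//         dst[i] = constant[0] * src[i][0] + constant[1] * src[i][1] +
//                  constant[2] * src[i][2] + src[i][3];
//     }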
#elif defined(_WIN32)
#include <xmmintrin.h>
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
    __asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
    __asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
    __asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
    __asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
    __asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
    __asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
    __asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
    __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \
    __asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \
    __asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
    __asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \
    __asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */
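// Note (added for clarity): with intrinsics, the same in-register transpose
// could be written using the standard _MM_TRANSPOSE4_PS macro from
// <xmmintrin.h>, e.g.
//
//     __m128 r0, r1, r2, r3;                 // rows of the matrix
//     _MM_TRANSPOSE4_PS( r0, r1, r2, r3 );   // rows become columns
//
// The hand-written form above keeps an explicit temporary (reg4) because the
// inline-assembly macros name their registers directly.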
// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
    __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
    __asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \
    __asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \
    __asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \
    __asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \
    __asm movlps reg4, [address+40] /* reg4 = 10, 11, X, X */ \
    __asm movhps reg2, [address+48] /* reg2 = 8, 9, 12, 13 */ \
    __asm movhps reg4, [address+56] /* reg4 = 10, 11, 14, 15 */ \
    __asm movaps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
    __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg0 = 0, 4, 8, 12 */ \
    __asm shufps reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg1 = 1, 5, 9, 13 */ \
    __asm movaps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
    __asm shufps reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg2 = 2, 6, 10, 14 */ \
    __asm shufps reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg3 = 3, 7, 11, 15 */
// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
    __asm movaps reg4, reg0 /* reg4 = 0, 4, 8, 12 */ \
    __asm unpcklps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
    __asm unpckhps reg4, reg1 /* reg4 = 8, 9, 12, 13 */ \
    __asm movaps reg1, reg2 /* reg1 = 2, 6, 10, 14 */ \
    __asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
    __asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \
    __asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
    __asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
    __asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \
    __asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \
    __asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \
    __asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \
    __asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \
    __asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */
// transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
    __asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \
    __asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \
    __asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \
    __asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \
    __asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \
    __asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \
    __asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \
    __asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
    __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
    __asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \
    __asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \
    __asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \
    __asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \
    __asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \
    __asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
    __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \
    __asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \
    __asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
    __asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \
    __asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
    __asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \
    __asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \
    __asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
    __asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
    __asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \
    __asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \
    __asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \
    __asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */
// with alignment
#define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
    __asm mov ecx,DST \
    __asm shr ecx,2 \
    __asm mov ebx,COUNT \
    __asm neg ecx \
    __asm mov edx,SRC0 \
    __asm and ecx,3 \
    __asm mov esi,SRC1 \
    __asm sub ebx,ecx \
    __asm jge noUnderFlow \
    __asm xor ebx,ebx \
    __asm mov ecx,COUNT \
    __asm noUnderFlow: \
    __asm mov PRE,ecx \
    __asm mov eax,ebx \
    __asm mov edi,DST \
    __asm and eax,8-1 \
    __asm mov POST,eax \
    __asm and ebx,0xfffffff8 \
    __asm jle done \
    __asm shl ebx,2 \
    __asm lea ecx,[ecx*4+ebx] \
    __asm neg ebx \
    __asm add edx,ecx \
    __asm add esi,ecx \
    __asm add edi,ecx \
    __asm mov eax,edx \
    __asm or eax,esi
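// Note (added for clarity): KFLOATINITDSS computes PRE = the number of leading
// elements needed to bring DST up to 16-byte alignment and POST = the trailing
// ( COUNT - PRE ) & 7 leftover. It leaves ebx as the negative byte index of the
// 8-floats-per-iteration main loop and eax as the OR of the source pointers,
// so callers such as KFLOAT_CA / KFLOAT_AA can test "and eax,15" to choose the
// aligned (movaps) or unaligned (movups) loop.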
// without alignment (pre==0)
#define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
    __asm mov eax,COUNT \
    __asm mov PRE,0 \
    __asm and eax,8-1 \
    __asm mov ebx,COUNT \
    __asm mov POST,eax \
    __asm and ebx,0xfffffff8 \
    __asm je done \
    __asm shl ebx,2 \
    __asm mov edx,SRC0 \
    __asm mov esi,SRC1 \
    __asm mov edi,DST \
    __asm add edx,ebx \
    __asm add esi,ebx \
    __asm add edi,ebx \
    __asm mov eax,edx \
    __asm or eax,esi \
    __asm or eax,edi \
    __asm neg ebx
/*
when OPER is called:
    edx = s0
    esi = s1
    edi = d
    ebx = index*4
    xmm0 & xmm1 must not be trashed
*/
#define KMOVDS1( DST, SRC0 ) \
    __asm movss xmm2,SRC0 \
    __asm movss DST,xmm2
#define KMOVDS4( DST, SRC0 ) \
    __asm movups xmm2,SRC0 \
    __asm movups DST,xmm2
#define KMINDS1( DST, SRC0 ) \
    __asm movss xmm2,SRC0 \
    __asm minss DST,xmm2
#define KMAXDS1( DST, SRC0 ) \
    __asm movss xmm2,SRC0 \
    __asm maxss DST,xmm2
// general ALU operation
#define KALUDSS1( OP, DST, SRC0, SRC1 ) \
    __asm movss xmm2,SRC0 \
    __asm OP##ss xmm2,SRC1 \
    __asm movss DST,xmm2
#define KALUDSS4( OP, DST, SRC0, SRC1 ) \
    __asm movups xmm2,SRC0 \
    __asm movups xmm3,SRC1 \
    __asm OP##ps xmm2,xmm3 \
    __asm movups DST,xmm2
#define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 )
#define KDIVDSS1( DST, SRC0, SRC1 ) \
    __asm movss xmm2,SRC1 \
    __asm rcpss xmm3,xmm2 \
    __asm mulss xmm2,xmm3 \
    __asm mulss xmm2,xmm3 \
    __asm addss xmm3,xmm3 \
    __asm subss xmm3,xmm2 \
    __asm mulss xmm3,SRC0 \
    __asm movss DST,xmm3
#define KDIVDSS4( DST, SRC0, SRC1 ) \
    __asm movups xmm2,SRC1 \
    __asm rcpps xmm3,xmm2 \
    __asm mulps xmm2,xmm3 \
    __asm mulps xmm2,xmm3 \
    __asm addps xmm3,xmm3 \
    __asm subps xmm3,xmm2 \
    __asm movups xmm2,SRC0 \
    __asm mulps xmm3,xmm2 \
    __asm movups DST,xmm3
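// Note (added for clarity): KDIVDSS1/KDIVDSS4 refine the ~12-bit rcpss/rcpps
// estimate with one Newton-Raphson iteration before multiplying by the
// numerator, roughly doubling the accurate bits. In scalar form (rcp_estimate
// stands in for the rcpss instruction):
//
//     float x0 = rcp_estimate( d );        // ~12-bit approximation of 1/d
//     float x1 = 2.0f * x0 - d * x0 * x0;  // == x0 * ( 2.0f - d * x0 )
//     dst = src * x1;                      // src / d to near full single precision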
#define KF2IDS1( SRC0 ) \
    __asm movss xmm2,SRC0 \
    __asm cvttps2pi mm2,xmm2 \
    __asm movd [edi+ebx],mm2
#define KF2IDS4( SRC0 ) \
    __asm movups xmm2,SRC0 \
    __asm cvttps2pi mm2,xmm2 \
    __asm movq [edi+ebx+0],mm2 \
    __asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \
    __asm cvttps2pi mm2,xmm2 \
    __asm movq [edi+ebx+8],mm2
#define KISQRTDS1( DST,SRC0 ) \
    __asm movss xmm2,SRC0 \
    __asm rsqrtss xmm3,xmm2 \
    __asm mulss xmm2,xmm3 \
    __asm mulss xmm2,xmm3 \
    __asm subss xmm2,xmm1 \
    __asm mulss xmm3,xmm0 \
    __asm mulss xmm3,xmm2 \
    __asm movss DST,xmm3
#define KISQRTDS4( DST,SRC0 ) \
    __asm movups xmm2,SRC0 \
    __asm rsqrtps xmm3,xmm2 \
    __asm mulps xmm2,xmm3 \
    __asm mulps xmm2,xmm3 \
    __asm subps xmm2,xmm1 \
    __asm mulps xmm3,xmm0 \
    __asm mulps xmm3,xmm2 \
    __asm movups DST,xmm3
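// Note (added for clarity): KISQRTDS1/KISQRTDS4 expect xmm0 = -0.5f and
// xmm1 = 3.0f to be preloaded (see SIMD_SP_rsqrt_c1 / SIMD_SP_rsqrt_c0 below)
// and apply one Newton-Raphson step to the rsqrtss/rsqrtps estimate y0:
//
//     y1 = y0 * -0.5f * ( d * y0 * y0 - 3.0f );  // == y0 * 0.5f * ( 3.0f - d * y0 * y0 )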
// this is used in vector4 implementation to shift constant V4
#define KANDREGDSV( DST, SRC0, VALUE ) \
    __asm mov DST,SRC0 \
    __asm and DST,VALUE
// this is used in vector4 code to operate with float arrays as sources
#define KEXPANDFLOAT( DST, SRC ) \
    __asm movss DST,SRC \
    __asm shufps DST,DST,0
#define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC )
#define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC )
#define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC )
#define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC )
#define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC )
#define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC )
#define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC )
#define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC )
// handles pre & post leftovers
#define KFLOATOPER( OPER, OPER4, COUNT ) \
    __asm mov ecx,pre \
    __asm mov ebx,COUNT \
    __asm cmp ebx,ecx \
    __asm cmovl ecx,COUNT \
    __asm test ecx,ecx \
    __asm je preDone \
    __asm xor ebx,ebx \
    __asm lpPre: \
    OPER \
    __asm add ebx,4 \
    __asm dec ecx \
    __asm jg lpPre \
    __asm preDone: \
    __asm mov ecx,post \
    __asm mov ebx,COUNT \
    __asm sub ebx,ecx \
    __asm shl ebx,2 \
    __asm cmp ecx,4 \
    __asm jl post4Done \
    OPER4 \
    __asm sub ecx,4 \
    __asm add ebx,4*4 \
    __asm post4Done: \
    __asm test ecx,ecx \
    __asm je postDone \
    __asm lpPost: \
    OPER \
    __asm add ebx,4 \
    __asm dec ecx \
    __asm jg lpPost \
    __asm postDone:
// operate on a constant and a float array
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \
    int pre,post; \
    __asm movss xmm0,CONSTANT \
    __asm shufps xmm0,xmm0,0 \
    KFLOATINITDS( DST, SRC, COUNT, pre, post ) \
    __asm and eax,15 \
    __asm jne lpNA \
    __asm jmp lpA \
    __asm align 16 \
    __asm lpA: \
    __asm prefetchnta [edx+ebx+64] \
    __asm movaps xmm1,xmm0 \
    __asm movaps xmm2,xmm0 \
    __asm ALUOP##ps xmm1,[edx+ebx] \
    __asm ALUOP##ps xmm2,[edx+ebx+16] \
    __asm movaps [edi+ebx],xmm1 \
    __asm movaps [edi+ebx+16],xmm2 \
    __asm add ebx,16*2 \
    __asm jl lpA \
    __asm jmp done \
    __asm align 16 \
    __asm lpNA: \
    __asm prefetchnta [edx+ebx+64] \
    __asm movaps xmm1,xmm0 \
    __asm movaps xmm2,xmm0 \
    __asm movups xmm3,[edx+ebx] \
    __asm movups xmm4,[edx+ebx+16] \
    __asm ALUOP##ps xmm1,xmm3 \
    __asm ALUOP##ps xmm2,xmm4 \
    __asm movaps [edi+ebx],xmm1 \
    __asm movaps [edi+ebx+16],xmm2 \
    __asm add ebx,16*2 \
    __asm jl lpNA \
    __asm done: \
    __asm mov edx,SRC \
    __asm mov edi,DST \
    KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \
                KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
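// Illustrative scalar equivalent (added for clarity, not part of the original
// source): KFLOAT_CA expands to roughly
//
//     for ( int i = 0; i < COUNT; i++ ) {
//         DST[i] = CONSTANT ALUOP SRC[i];
//     }
//
// with an aligned SSE main loop (lpA, movaps loads), an unaligned fallback
// (lpNA, movups loads), and KFLOATOPER mopping up the scalar pre/post leftovers.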
// operate on two float arrays
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \
    int pre,post; \
    KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \
    __asm and eax,15 \
    __asm jne lpNA \
    __asm jmp lpA \
    __asm align 16 \
    __asm lpA: \
    __asm movaps xmm1,[edx+ebx] \
    __asm movaps xmm2,[edx+ebx+16] \
    __asm ALUOP##ps xmm1,[esi+ebx] \
    __asm ALUOP##ps xmm2,[esi+ebx+16] \
    __asm prefetchnta [edx+ebx+64] \
    __asm prefetchnta [esi+ebx+64] \
    __asm movaps [edi+ebx],xmm1 \
    __asm movaps [edi+ebx+16],xmm2 \
    __asm add ebx,16*2 \
    __asm jl lpA \
    __asm jmp done \
    __asm align 16 \
    __asm lpNA: \
    __asm movups xmm1,[edx+ebx] \
    __asm movups xmm2,[edx+ebx+16] \
    __asm movups xmm3,[esi+ebx] \
    __asm movups xmm4,[esi+ebx+16] \
    __asm prefetchnta [edx+ebx+64] \
    __asm prefetchnta [esi+ebx+64] \
    __asm ALUOP##ps xmm1,xmm3 \
    __asm ALUOP##ps xmm2,xmm4 \
    __asm movaps [edi+ebx],xmm1 \
    __asm movaps [edi+ebx+16],xmm2 \
    __asm add ebx,16*2 \
    __asm jl lpNA \
    __asm done: \
    __asm mov edx,SRC0 \
    __asm mov esi,SRC1 \
    __asm mov edi,DST \
    KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \
                KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)

#define ALIGN4_INIT1( X, INIT )				ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 )	ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT )				ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }

ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );

ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );

ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );

ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );

ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );

ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );

ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );

/*
============
SSE_InvSqrt
============
*/
float SSE_InvSqrt( float x ) {
	float y;

	__asm {
		movss	xmm0, x
		rsqrtss	xmm1, xmm0
		mulss	xmm0, xmm1
		mulss	xmm0, xmm1
		subss	xmm0, SIMD_SP_rsqrt_c0
		mulss	xmm1, SIMD_SP_rsqrt_c1
		mulss	xmm0, xmm1
		movss	y, xmm0
	}
	return y;
}
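
/*
============
SSE_InvSqrt_Reference
Scalar sketch (hypothetical, for illustration only) of what SSE_InvSqrt
computes: one Newton-Raphson step on the rsqrtss estimate r, giving
r * ( 3 - x * r * r ) * 0.5, which refines the roughly 12 bit hardware
estimate to near full single precision.
============
*/
static float SSE_InvSqrt_Reference( float x ) {
	float r = 1.0f / sqrtf( x );			// stand-in for the rsqrtss estimate
	return r * ( 3.0f - x * r * r ) * 0.5f;	// SIMD_SP_rsqrt_c0 = 3.0f, SIMD_SP_rsqrt_c1 = -0.5f
}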

/*
============
SSE_InvSqrt4
============
*/
void SSE_InvSqrt4( float x[4] ) {
	__asm {
		mov		edi, x
		movaps	xmm0, [edi]
		rsqrtps	xmm1, xmm0
		mulps	xmm0, xmm1
		mulps	xmm0, xmm1
		subps	xmm0, SIMD_SP_rsqrt_c0
		mulps	xmm1, SIMD_SP_rsqrt_c1
		mulps	xmm0, xmm1
		movaps	[edi], xmm0
	}
}

/*
============
SSE_SinZeroHalfPI
The angle must be between zero and half PI.
============
*/
float SSE_SinZeroHalfPI( float a ) {
#if 1
	float t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	__asm {
		movss	xmm0, a
		movss	xmm1, xmm0
		mulss	xmm1, xmm1
		movss	xmm2, SIMD_SP_sin_c0
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c1
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c2
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c3
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c4
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_one
		mulss	xmm2, xmm0
		movss	t, xmm2
	}
	return t;
#else
	float s, t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	s = a * a;
	t = -2.39e-08f;
	t *= s;
	t += 2.7526e-06f;
	t *= s;
	t += -1.98409e-04f;
	t *= s;
	t += 8.3333315e-03f;
	t *= s;
	t += -1.666666664e-01f;
	t *= s;
	t += 1.0f;
	t *= a;
	return t;
#endif
}

/*
============
SSE_Sin4ZeroHalfPI
The angle must be between zero and half PI.
============
*/
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
	__asm {
		mov		edi, a
		mov		esi, s
		movaps	xmm0, [edi]
		movaps	xmm1, xmm0
		mulps	xmm1, xmm1
		movaps	xmm2, SIMD_SP_sin_c0
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c1
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c2
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c3
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c4
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_one
		mulps	xmm2, xmm0
		movaps	[esi], xmm2
	}
}

/*
============
SSE_Sin
============
*/
float SSE_Sin( float a ) {
#if 1
	float t;

	__asm {
		movss	xmm1, a
		movss	xmm2, xmm1
		movss	xmm3, xmm1
		mulss	xmm2, SIMD_SP_oneOverTwoPI	// xmm2 = a / ( 2 * PI )
		cvttss2si	ecx, xmm2				// truncate towards zero
		cmpltss	xmm3, SIMD_SP_zero			// xmm3 = ( a < 0.0f ) ? 0xFFFFFFFF : 0x00000000
		andps	xmm3, SIMD_SP_one			// xmm3 = ( a < 0.0f ) ? 1.0f : 0.0f
		cvtsi2ss	xmm2, ecx
		subss	xmm2, xmm3					// xmm2 = floor( a / ( 2 * PI ) )
		mulss	xmm2, SIMD_SP_twoPI
		subss	xmm1, xmm2					// xmm1 = a reduced to [0, 2*PI)
		movss	xmm0, SIMD_SP_PI			// xmm0 = PI
		subss	xmm0, xmm1					// xmm0 = PI - a
		movss	xmm1, xmm0					// xmm1 = PI - a
		andps	xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movss	xmm2, xmm0					// xmm2 = PI - a
		xorps	xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss	xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps	xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps	xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps	xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps	xmm0, xmm2
		addps	xmm0, xmm3
		movss	xmm1, xmm0
		mulss	xmm1, xmm1
		movss	xmm2, SIMD_SP_sin_c0
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c1
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c2
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c3
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_sin_c4
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_one
		mulss	xmm2, xmm0
		movss	t, xmm2
	}
	return t;
#else
	float s, t;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}

	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
	}

	s = a * a;
	t = -2.39e-08f;
	t *= s;
	t += 2.7526e-06f;
	t *= s;
	t += -1.98409e-04f;
	t *= s;
	t += 8.3333315e-03f;
	t *= s;
	t += -1.666666664e-01f;
	t *= s;
	t += 1.0f;
	t *= a;
	return t;
#endif
}

/*
============
SSE_Sin4
============
*/
void SSE_Sin4( float a[4], float s[4] ) {
	__asm {
		mov		edi, a
		mov		esi, s
		movaps	xmm1, [edi]
		movaps	xmm2, xmm1
		mulps	xmm2, SIMD_SP_oneOverTwoPI
		// SSE1 has no packed truncating conversion ( cvttps2dq is SSE2 ), so each
		// lane is rotated into the low slot and truncated through an integer register
		movhlps	xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps	xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps	xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps	xmm3, xmm1
		cmpltps	xmm3, SIMD_SP_zero
		andps	xmm3, SIMD_SP_one
		subps	xmm2, xmm3
		mulps	xmm2, SIMD_SP_twoPI
		subps	xmm1, xmm2
		movaps	xmm0, SIMD_SP_PI			// xmm0 = PI
		subps	xmm0, xmm1					// xmm0 = PI - a
		movaps	xmm1, xmm0					// xmm1 = PI - a
		andps	xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps	xmm2, xmm0					// xmm2 = PI - a
		xorps	xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps	xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps	xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps	xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps	xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps	xmm0, xmm2
		addps	xmm0, xmm3
		movaps	xmm1, xmm0
		mulps	xmm1, xmm1
		movaps	xmm2, SIMD_SP_sin_c0
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c1
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c2
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c3
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_sin_c4
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_one
		mulps	xmm2, xmm0
		movaps	[esi], xmm2
	}
}

/*
============
SSE_CosZeroHalfPI
The angle must be between zero and half PI.
============
*/
float SSE_CosZeroHalfPI( float a ) {
#if 1
	float t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	__asm {
		movss	xmm0, a
		mulss	xmm0, xmm0
		movss	xmm1, SIMD_SP_cos_c0
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c1
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c2
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c3
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c4
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_one
		movss	t, xmm1
	}
	return t;
#else
	float s, t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	s = a * a;
	t = -2.605e-07f;
	t *= s;
	t += 2.47609e-05f;
	t *= s;
	t += -1.3888397e-03f;
	t *= s;
	t += 4.16666418e-02f;
	t *= s;
	t += -4.999999963e-01f;
	t *= s;
	t += 1.0f;
	return t;
#endif
}

/*
============
SSE_Cos4ZeroHalfPI
The angle must be between zero and half PI.
============
*/
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
	__asm {
		mov		edi, a
		mov		esi, c
		movaps	xmm0, [edi]
		mulps	xmm0, xmm0
		movaps	xmm1, SIMD_SP_cos_c0
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c1
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c2
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c3
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c4
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_one
		movaps	[esi], xmm1					// the polynomial result is in xmm1
	}
}

/*
============
SSE_Cos
============
*/
float SSE_Cos( float a ) {
#if 1
	float t;

	__asm {
		movss	xmm1, a
		movss	xmm2, xmm1
		movss	xmm3, xmm1
		mulss	xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss	xmm3, SIMD_SP_zero
		andps	xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss	xmm2, xmm3
		mulss	xmm2, SIMD_SP_twoPI
		subss	xmm1, xmm2
		movss	xmm0, SIMD_SP_PI			// xmm0 = PI
		subss	xmm0, xmm1					// xmm0 = PI - a
		movss	xmm1, xmm0					// xmm1 = PI - a
		andps	xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movss	xmm2, xmm0					// xmm2 = PI - a
		xorps	xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss	xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps	xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps	xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps	xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps	xmm0, xmm2
		addps	xmm0, xmm3
		mulss	xmm0, xmm0
		movss	xmm1, SIMD_SP_cos_c0
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c1
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c2
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c3
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_cos_c4
		mulss	xmm1, xmm0
		addss	xmm1, SIMD_SP_one
		xorps	xmm2, SIMD_SP_signBitMask	// xmm2 = sign bit set where fabs( PI - a ) < idMath::HALF_PI
		xorps	xmm1, xmm2					// negate there, since cos( a ) = -cos( PI - a )
		movss	t, xmm1
	}
	return t;
#else
	float d, s, t;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}

	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
		d = 1.0f;
	} else {
		d = -1.0f;
	}

	s = a * a;
	t = -2.605e-07f;
	t *= s;
	t += 2.47609e-05f;
	t *= s;
	t += -1.3888397e-03f;
	t *= s;
	t += 4.16666418e-02f;
	t *= s;
	t += -4.999999963e-01f;
	t *= s;
	t += 1.0f;
	t *= d;
	return t;
#endif
}

/*
============
SSE_Cos4
============
*/
void SSE_Cos4( float a[4], float c[4] ) {
	__asm {
		mov		edi, a
		mov		esi, c
		movaps	xmm1, [edi]
		movaps	xmm2, xmm1
		mulps	xmm2, SIMD_SP_oneOverTwoPI
		movhlps	xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps	xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps	xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps	xmm3, xmm1
		cmpltps	xmm3, SIMD_SP_zero
		andps	xmm3, SIMD_SP_one
		subps	xmm2, xmm3
		mulps	xmm2, SIMD_SP_twoPI
		subps	xmm1, xmm2
		movaps	xmm0, SIMD_SP_PI			// xmm0 = PI
		subps	xmm0, xmm1					// xmm0 = PI - a
		movaps	xmm1, xmm0					// xmm1 = PI - a
		andps	xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps	xmm2, xmm0					// xmm2 = PI - a
		xorps	xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps	xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps	xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps	xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps	xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps	xmm0, xmm2
		addps	xmm0, xmm3
		mulps	xmm0, xmm0
		movaps	xmm1, SIMD_SP_cos_c0
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c1
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c2
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c3
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_cos_c4
		mulps	xmm1, xmm0
		addps	xmm1, SIMD_SP_one
		xorps	xmm2, SIMD_SP_signBitMask
		xorps	xmm1, xmm2
		movaps	[esi], xmm1
	}
}

/*
============
SSE_SinCos
============
*/
void SSE_SinCos( float a, float &s, float &c ) {
	__asm {
		mov		edi, s
		mov		esi, c
		movss	xmm1, a
		movss	xmm2, xmm1
		movss	xmm3, xmm1
		mulss	xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss	xmm3, SIMD_SP_zero
		andps	xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss	xmm2, xmm3
		mulss	xmm2, SIMD_SP_twoPI
		subss	xmm1, xmm2
		movss	xmm0, SIMD_SP_PI			// xmm0 = PI
		subss	xmm0, xmm1					// xmm0 = PI - a
		movss	xmm1, xmm0					// xmm1 = PI - a
		andps	xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movss	xmm2, xmm0					// xmm2 = PI - a
		xorps	xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss	xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps	xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps	xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps	xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps	xmm0, xmm2
		addps	xmm0, xmm3
		movss	xmm1, xmm0
		mulss	xmm1, xmm1
		movss	xmm3, SIMD_SP_sin_c0
		movss	xmm4, SIMD_SP_cos_c0
		mulss	xmm3, xmm1
		mulss	xmm4, xmm1
		addss	xmm3, SIMD_SP_sin_c1
		addss	xmm4, SIMD_SP_cos_c1
		mulss	xmm3, xmm1
		mulss	xmm4, xmm1
		addss	xmm3, SIMD_SP_sin_c2
		addss	xmm4, SIMD_SP_cos_c2
		mulss	xmm3, xmm1
		mulss	xmm4, xmm1
		addss	xmm3, SIMD_SP_sin_c3
		addss	xmm4, SIMD_SP_cos_c3
		mulss	xmm3, xmm1
		mulss	xmm4, xmm1
		addss	xmm3, SIMD_SP_sin_c4
		addss	xmm4, SIMD_SP_cos_c4
		mulss	xmm3, xmm1
		mulss	xmm4, xmm1
		addss	xmm3, SIMD_SP_one
		addss	xmm4, SIMD_SP_one
		mulss	xmm3, xmm0					// xmm3 = sin( a )
		xorps	xmm2, SIMD_SP_signBitMask
		xorps	xmm4, xmm2					// xmm4 = cos( a )
		movss	[edi], xmm3
		movss	[esi], xmm4
	}
}

/*
============
SSE_SinCos4
============
*/
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
	__asm {
		mov		eax, a
		mov		edi, s
		mov		esi, c
		movaps	xmm1, [eax]
		movaps	xmm2, xmm1
		mulps	xmm2, SIMD_SP_oneOverTwoPI
		movhlps	xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps	xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps	xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps	xmm3, xmm1
		cmpltps	xmm3, SIMD_SP_zero
		andps	xmm3, SIMD_SP_one
		subps	xmm2, xmm3
		mulps	xmm2, SIMD_SP_twoPI
		subps	xmm1, xmm2
		movaps	xmm0, SIMD_SP_PI			// xmm0 = PI
		subps	xmm0, xmm1					// xmm0 = PI - a
		movaps	xmm1, xmm0					// xmm1 = PI - a
		andps	xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps	xmm2, xmm0					// xmm2 = PI - a
		xorps	xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps	xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps	xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps	xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps	xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps	xmm0, xmm2
		addps	xmm0, xmm3					// xmm0 = a folded into [ -HALF_PI, HALF_PI ]
		movaps	xmm1, xmm0
		mulps	xmm1, xmm1
		movaps	xmm3, SIMD_SP_sin_c0
		movaps	xmm4, SIMD_SP_cos_c0
		mulps	xmm3, xmm1
		mulps	xmm4, xmm1
		addps	xmm3, SIMD_SP_sin_c1
		addps	xmm4, SIMD_SP_cos_c1
		mulps	xmm3, xmm1
		mulps	xmm4, xmm1
		addps	xmm3, SIMD_SP_sin_c2
		addps	xmm4, SIMD_SP_cos_c2
		mulps	xmm3, xmm1
		mulps	xmm4, xmm1
		addps	xmm3, SIMD_SP_sin_c3
		addps	xmm4, SIMD_SP_cos_c3
		mulps	xmm3, xmm1
		mulps	xmm4, xmm1
		addps	xmm3, SIMD_SP_sin_c4
		addps	xmm4, SIMD_SP_cos_c4
		mulps	xmm3, xmm1
		mulps	xmm4, xmm1
		addps	xmm3, SIMD_SP_one
		addps	xmm4, SIMD_SP_one
		mulps	xmm3, xmm0
		xorps	xmm2, SIMD_SP_signBitMask
		xorps	xmm4, xmm2
		movaps	[edi], xmm3
		movaps	[esi], xmm4
	}
}
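
/*
============
SSE_SinCos4_UsageExample
A hypothetical usage sketch, not part of the original interface: the packed
routines load and store with movaps, so all three arrays must be 16 byte
aligned, assuming the idlib ALIGN16 macro works on locals as it does on the
statics above.
============
*/
static void SSE_SinCos4_UsageExample( void ) {
	ALIGN16( float a[4] ) = { 0.0f, 0.5f, 1.0f, 1.5f };
	ALIGN16( float s[4] );
	ALIGN16( float c[4] );

	SSE_SinCos4( a, s, c );
}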

/*
============
SSE_ATanPositive
Both 'x' and 'y' must be positive.
============
*/
float SSE_ATanPositive( float y, float x ) {
#if 1
	float t;

	assert( y >= 0.0f && x >= 0.0f );

	__asm {
		movss	xmm0, x
		movss	xmm3, xmm0
		movss	xmm1, y
		minss	xmm0, xmm1
		maxss	xmm1, xmm3
		cmpeqss	xmm3, xmm0
		rcpss	xmm2, xmm1
		mulss	xmm1, xmm2
		mulss	xmm1, xmm2
		addss	xmm2, xmm2
		subss	xmm2, xmm1					// xmm2 = 1 / y or 1 / x
		mulss	xmm0, xmm2					// xmm0 = x / y or y / x
		movss	xmm1, xmm3
		andps	xmm1, SIMD_SP_signBitMask
		xorps	xmm0, xmm1					// xmm0 = -x / y or y / x
		andps	xmm3, SIMD_SP_halfPI		// xmm3 = HALF_PI or 0.0f
		movss	xmm1, xmm0
		mulss	xmm1, xmm1					// xmm1 = s
		movss	xmm2, SIMD_SP_atan_c0
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c1
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c2
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c3
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c4
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c5
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c6
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c7
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_one
		mulss	xmm2, xmm0
		addss	xmm2, xmm3
		movss	t, xmm2
	}
	return t;
#else
	float a, d, s, t;

	assert( y >= 0.0f && x >= 0.0f );

	if ( y > x ) {
		a = -x / y;
		d = idMath::HALF_PI;
	} else {
		a = y / x;
		d = 0.0f;
	}

	s = a * a;
	t = 0.0028662257f;
	t *= s;
	t += -0.0161657367f;
	t *= s;
	t += 0.0429096138f;
	t *= s;
	t += -0.0752896400f;
	t *= s;
	t += 0.1065626393f;
	t *= s;
	t += -0.1420889944f;
	t *= s;
	t += 0.1999355085f;
	t *= s;
	t += -0.3333314528f;
	t *= s;
	t += 1.0f;
	t *= a;
	t += d;
	return t;
#endif
}

/*
============
SSE_ATan4Positive
Both 'x' and 'y' must be positive.
============
*/
void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
	__asm {
		mov		esi, x
		mov		edi, y
		mov		edx, at
		movaps	xmm0, [esi]
		movaps	xmm3, xmm0
		movaps	xmm1, [edi]
		minps	xmm0, xmm1
		maxps	xmm1, xmm3
		cmpeqps	xmm3, xmm0
		rcpps	xmm2, xmm1
		mulps	xmm1, xmm2
		mulps	xmm1, xmm2
		addps	xmm2, xmm2
		subps	xmm2, xmm1					// xmm2 = 1 / y or 1 / x
		mulps	xmm0, xmm2					// xmm0 = x / y or y / x
		movaps	xmm1, xmm3
		andps	xmm1, SIMD_SP_signBitMask
		xorps	xmm0, xmm1					// xmm0 = -x / y or y / x
		andps	xmm3, SIMD_SP_halfPI		// xmm3 = HALF_PI or 0.0f
		movaps	xmm1, xmm0
		mulps	xmm1, xmm1					// xmm1 = s
		movaps	xmm2, SIMD_SP_atan_c0
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c1
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c2
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c3
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c4
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c5
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c6
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c7
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_one
		mulps	xmm2, xmm0
		addps	xmm2, xmm3
		movaps	[edx], xmm2
	}
}

/*
============
SSE_ATan
============
*/
float SSE_ATan( float y, float x ) {
#if 1
	float t;

	__asm {
		movss	xmm0, x
		movss	xmm3, xmm0
		movss	xmm4, xmm0
		andps	xmm0, SIMD_SP_absMask
		movss	xmm1, y
		xorps	xmm4, xmm1
		andps	xmm1, SIMD_SP_absMask
		andps	xmm4, SIMD_SP_signBitMask
		minss	xmm0, xmm1
		maxss	xmm1, xmm3
		cmpeqss	xmm3, xmm0
		rcpss	xmm2, xmm1
		mulss	xmm1, xmm2
		mulss	xmm1, xmm2
		addss	xmm2, xmm2
		subss	xmm2, xmm1					// xmm2 = 1 / y or 1 / x
		mulss	xmm0, xmm2					// xmm0 = x / y or y / x
		xorps	xmm0, xmm4
		movss	xmm1, xmm3
		andps	xmm1, SIMD_SP_signBitMask
		xorps	xmm0, xmm1					// xmm0 = -x / y or y / x
		orps	xmm4, SIMD_SP_halfPI		// xmm4 = +/- HALF_PI
		andps	xmm3, xmm4					// xmm3 = +/- HALF_PI or 0.0f
		movss	xmm1, xmm0
		mulss	xmm1, xmm1					// xmm1 = s
		movss	xmm2, SIMD_SP_atan_c0
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c1
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c2
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c3
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c4
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c5
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c6
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_atan_c7
		mulss	xmm2, xmm1
		addss	xmm2, SIMD_SP_one
		mulss	xmm2, xmm0
		addss	xmm2, xmm3
		movss	t, xmm2
	}
	return t;
#else
	float a, d, s, t;

	if ( fabs( y ) > fabs( x ) ) {
		a = -x / y;
		d = idMath::HALF_PI;
		*((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31);
	} else {
		a = y / x;
		d = 0.0f;
	}

	s = a * a;
	t = 0.0028662257f;
	t *= s;
	t += -0.0161657367f;
	t *= s;
	t += 0.0429096138f;
	t *= s;
	t += -0.0752896400f;
	t *= s;
	t += 0.1065626393f;
	t *= s;
	t += -0.1420889944f;
	t *= s;
	t += 0.1999355085f;
	t *= s;
	t += -0.3333314528f;
	t *= s;
	t += 1.0f;
	t *= a;
	t += d;
	return t;
#endif
}

/*
============
SSE_ATan4
============
*/
void SSE_ATan4( float y[4], float x[4], float at[4] ) {
	__asm {
		mov		esi, x
		mov		edi, y
		mov		edx, at
		movaps	xmm0, [esi]
		movaps	xmm3, xmm0
		movaps	xmm4, xmm0
		andps	xmm0, SIMD_SP_absMask
		movaps	xmm1, [edi]
		xorps	xmm4, xmm1
		andps	xmm1, SIMD_SP_absMask
		andps	xmm4, SIMD_SP_signBitMask
		minps	xmm0, xmm1
		maxps	xmm1, xmm3
		cmpeqps	xmm3, xmm0
		rcpps	xmm2, xmm1
		mulps	xmm1, xmm2
		mulps	xmm1, xmm2
		addps	xmm2, xmm2
		subps	xmm2, xmm1					// xmm2 = 1 / y or 1 / x
		mulps	xmm0, xmm2					// xmm0 = x / y or y / x
		xorps	xmm0, xmm4
		movaps	xmm1, xmm3
		andps	xmm1, SIMD_SP_signBitMask
		xorps	xmm0, xmm1					// xmm0 = -x / y or y / x
		orps	xmm4, SIMD_SP_halfPI		// xmm4 = +/- HALF_PI
		andps	xmm3, xmm4					// xmm3 = +/- HALF_PI or 0.0f
		movaps	xmm1, xmm0
		mulps	xmm1, xmm1					// xmm1 = s
		movaps	xmm2, SIMD_SP_atan_c0
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c1
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c2
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c3
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c4
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c5
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c6
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_atan_c7
		mulps	xmm2, xmm1
		addps	xmm2, SIMD_SP_one
		mulps	xmm2, xmm0
		addps	xmm2, xmm3
		movaps	[edx], xmm2
	}
}

/*
============
SSE_TestTrigonometry
============
*/
void SSE_TestTrigonometry( void ) {
	int i;
	float a, s1, s2, c1, c2;

	for ( i = 0; i < 100; i++ ) {
		a = i * idMath::HALF_PI / 100.0f;

		s1 = sin( a );
		s2 = SSE_SinZeroHalfPI( a );
		if ( fabs( s1 - s2 ) > 1e-7f ) {
			assert( 0 );
		}

		c1 = cos( a );
		c2 = SSE_CosZeroHalfPI( a );
		if ( fabs( c1 - c2 ) > 1e-7f ) {
			assert( 0 );
		}
	}

	for ( i = -200; i < 200; i++ ) {
		a = i * idMath::TWO_PI / 100.0f;

		s1 = sin( a );
		s2 = SSE_Sin( a );
		if ( fabs( s1 - s2 ) > 1e-6f ) {
			assert( 0 );
		}

		c1 = cos( a );
		c2 = SSE_Cos( a );
		if ( fabs( c1 - c2 ) > 1e-6f ) {
			assert( 0 );
		}

		SSE_SinCos( a, s2, c2 );
		if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
			assert( 0 );
		}
	}
}
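
/*
============
SSE_TestATan
A hypothetical companion check for the atan approximations, mirroring
SSE_TestTrigonometry above; not part of the original interface. The 1e-6f
tolerance is an assumption chosen to match the full-range sin/cos tests.
============
*/
static void SSE_TestATan( void ) {
	int i;
	float y, x, t1, t2;

	for ( i = 0; i < 100; i++ ) {
		y = 0.01f + i * 0.1f;		// both arguments kept positive
		x = 10.0f - i * 0.1f;

		t1 = atan2( y, x );
		t2 = SSE_ATanPositive( y, x );
		if ( fabs( t1 - t2 ) > 1e-6f ) {
			assert( 0 );
		}

		t2 = SSE_ATan( y, x );
		if ( fabs( t1 - t2 ) > 1e-6f ) {
			assert( 0 );
		}
	}
}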

/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
	return "MMX & SSE";
}

/*
============
idSIMD_SSE::Add
dst[i] = constant + src[i];
============
*/
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
	KFLOAT_CA( add, dst, src, constant, count )
}

/*
============
idSIMD_SSE::Add
dst[i] = src0[i] + src1[i];
============
*/
void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
	KFLOAT_AA( add, dst, src0, src1, count )
}

/*
============
idSIMD_SSE::Sub
dst[i] = constant - src[i];
============
*/
void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
	KFLOAT_CA( sub, dst, src, constant, count )
}

/*
============
idSIMD_SSE::Sub
dst[i] = src0[i] - src1[i];
============
*/
void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
	KFLOAT_AA( sub, dst, src0, src1, count )
}

/*
============
idSIMD_SSE::Mul
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
	KFLOAT_CA( mul, dst, src, constant, count )
}

/*
============
idSIMD_SSE::Mul
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
	KFLOAT_AA( mul, dst, src0, src1, count )
}

/*
============
idSIMD_SSE::Div
dst[i] = constant / src[i];
============
*/
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
	int pre, post;

	//	1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	__asm
	{
		movss	xmm1,constant
		shufps	xmm1,xmm1,0
		KFLOATINITDS( dst, src, count, pre, post )
		and		eax,15
		jne		lpNA
		jmp		lpA
		align	16
	lpA:
		movaps	xmm2,[edx+ebx]
		movaps	xmm3,[edx+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[edx+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2
		subps	xmm5,xmm3
		mulps	xmm4,xmm1
		mulps	xmm5,xmm1
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpA
		jmp		done
		align	16
	lpNA:
		movups	xmm2,[edx+ebx]
		movups	xmm3,[edx+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[edx+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2
		subps	xmm5,xmm3
		mulps	xmm4,xmm1
		mulps	xmm5,xmm1
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpNA
	done:
		mov		edx,src
		mov		edi,dst
		KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
					KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
	}
}
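
/*
============
SSE_Rcp_Reference
Scalar sketch (hypothetical, for illustration) of the reciprocal refinement
both Div variants use: one Newton-Raphson step on the rcpps estimate r,
1/x ~= 2*r - x*r*r, exactly the identity quoted in the comment above each loop.
============
*/
static float SSE_Rcp_Reference( float x ) {
	float r = 1.0f / x;				// stand-in for the rcpps estimate
	return 2.0f * r - x * r * r;	// the value built up in xmm4/xmm5 above
}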

/*
============
idSIMD_SSE::Div
dst[i] = src0[i] / src1[i];
============
*/
void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
	int pre, post;

	//	1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	__asm
	{
		KFLOATINITDSS( dst, src0, src1, count, pre, post )
		and		eax,15
		jne		lpNA
		jmp		lpA
		align	16
	lpA:
		movaps	xmm2,[esi+ebx]
		movaps	xmm3,[esi+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2
		subps	xmm5,xmm3
		mulps	xmm4,[edx+ebx]
		mulps	xmm5,[edx+ebx+16]
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpA
		jmp		done
		align	16
	lpNA:
		movups	xmm2,[esi+ebx]
		movups	xmm3,[esi+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2
		subps	xmm5,xmm3
		movups	xmm2,[edx+ebx]
		movups	xmm3,[edx+ebx+16]
		mulps	xmm4,xmm2
		mulps	xmm5,xmm3
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpNA
	done:
		mov		edx,src0
		mov		esi,src1
		mov		edi,dst
		KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
					KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
	}
}

/*
============
Simd_MulAdd
assumes count >= 7
============
*/
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
	__asm	mov		esi, dst
	__asm	mov		edi, src
	__asm	mov		eax, count
	__asm	shl		eax, 2
	__asm	mov		ecx, esi
	__asm	mov		edx, eax
	__asm	or		ecx, edi
	__asm	fld		constant				// keep the constant in st(0) for the x87 loops
	__asm	and		ecx, 15
	__asm	jz		SimdMulAdd16			// both pointers already 16 byte aligned
	__asm	and		ecx, 3
	__asm	jnz		SimdMulAdd8
	__asm	mov		ecx, esi
	__asm	xor		ecx, edi
	__asm	and		ecx, 15
	__asm	jnz		MulAdd8					// pointers cannot be 16 byte aligned together
	__asm	mov		ecx, esi
	__asm	and		ecx, 15
	__asm	neg		ecx
	__asm	add		ecx, 16
	__asm	sub		eax, ecx
	__asm	add		edi, ecx
	__asm	add		esi, ecx
	__asm	neg		ecx
	__asm	mov		edx, eax
	__asm	loopPreMulAdd16:				// x87 loop until both pointers are 16 byte aligned
	__asm	fld		st
	__asm	fmul	dword ptr [edi+ecx]
	__asm	fadd	dword ptr [esi+ecx]
	__asm	fstp	dword ptr [esi+ecx]
	__asm	add		ecx, 4
	__asm	jl		loopPreMulAdd16
	__asm	SimdMulAdd16:
	__asm	and		eax, ~15
	__asm	movss	xmm1, constant
	__asm	shufps	xmm1, xmm1, 0x00
	__asm	add		esi, eax
	__asm	add		edi, eax
	__asm	neg		eax
	__asm	align	16
	__asm	loopMulAdd16:					// aligned SSE loop, four floats per iteration
	__asm	movaps	xmm0, [edi+eax]
	__asm	mulps	xmm0, xmm1
	__asm	addps	xmm0, [esi+eax]
	__asm	movaps	[esi+eax], xmm0
	__asm	add		eax, 16
	__asm	jl		loopMulAdd16
	__asm	jmp		postMulAdd
	__asm	MulAdd8:
	__asm	mov		ecx, esi
	__asm	and		ecx, 7
	__asm	jz		SimdMulAdd8
	__asm	sub		eax, ecx
	__asm	add		esi, ecx
	__asm	add		edi, ecx
	__asm	neg		ecx
	__asm	mov		edx, eax
	__asm	loopPreMulAdd8:					// x87 loop until the destination is 8 byte aligned
	__asm	fld		st
	__asm	fmul	dword ptr [edi+ecx]
	__asm	fadd	dword ptr [esi+ecx]
	__asm	fstp	dword ptr [esi+ecx]
	__asm	add		ecx, 4
	__asm	jl		loopPreMulAdd8
	__asm	SimdMulAdd8:
	__asm	and		eax, ~15
	__asm	movss	xmm1, constant
	__asm	shufps	xmm1, xmm1, 0x00
	__asm	add		esi, eax
	__asm	add		edi, eax
	__asm	neg		eax
	__asm	align	16
	__asm	loopMulAdd8:					// 8 byte aligned SSE loop using movlps/movhps
	__asm	movlps	xmm0, [edi+eax]
	__asm	movhps	xmm0, [edi+eax+8]
	__asm	mulps	xmm0, xmm1
	__asm	movlps	xmm2, [esi+eax]
	__asm	movhps	xmm2, [esi+eax+8]
	__asm	addps	xmm0, xmm2
	__asm	movlps	[esi+eax], xmm0
	__asm	movhps	[esi+eax+8], xmm0
	__asm	add		eax, 16
	__asm	jl		loopMulAdd8
	__asm	jmp		postMulAdd
	__asm	postMulAdd:						// x87 loop for the remaining tail
	__asm	and		edx, 15
	__asm	jz		MulAddDone
	__asm	add		esi, edx
	__asm	add		edi, edx
	__asm	neg		edx
	__asm	loopPostMulAdd:
	__asm	fld		st
	__asm	fmul	dword ptr [edi+edx]
	__asm	fadd	dword ptr [esi+edx]
	__asm	fstp	dword ptr [esi+edx]
	__asm	add		edx, 4
	__asm	jl		loopPostMulAdd
	__asm	MulAddDone:
	__asm	fstp	st						// pop the constant off the x87 stack
}

#define MULADD_FEW( OPER ) \
	switch( count ) { \
		case 0: \
			return; \
		case 1: \
			dst[0] OPER c * src[0]; \
			return; \
		case 2: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
			return; \
		case 3: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
			return; \
		case 4: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			return; \
		case 5: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; \
			return; \
		case 6: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
			return; \
		case 7: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
			return; \
		case 8: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			return; \
		case 9: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; \
			return; \
		case 10: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
			return; \
		case 11: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
			return; \
	}
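
// MULADD_FEW returns early for counts 0 through 11, so Simd_MulAdd, which
// assumes count >= 7 and pays a pre/post alignment setup cost, is only
// entered for arrays long enough to amortize that setup.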

/*
============
idSIMD_SSE::MulAdd
dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	float c = constant;
	MULADD_FEW( += )
	Simd_MulAdd( dst, constant, src, count );
}

/*
============
idSIMD_SSE::MulAdd
dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] += src0[i] * src1[i];
	}
}

/*
============
idSIMD_SSE::MulSub
dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
	float c = constant;
	MULADD_FEW( -= )
	Simd_MulAdd( dst, -constant, src, count );
}

/*
============
idSIMD_SSE::MulSub
dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] -= src0[i] * src1[i];
	}
}

/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	__asm
	{
		mov		eax, count
		mov		edi, constant
		mov		edx, eax
		mov		esi, src
		mov		ecx, dst
		and		eax, ~3
		movss	xmm4, [edi+0]
		shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm5, [edi+4]
		shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm6, [edi+8]
		shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		jz		done4
		imul	eax, 12
		add		esi, eax
		neg		eax
	loop4:
		// load four idVec3s ( 12 bytes each ) and transpose them into
		// xmm0 = x0 x2 x1 x3, xmm1 = y0 y2 y1 y3, xmm2 = z0 z2 z1 z3
		movlps	xmm1, [esi+eax+ 0]
		movlps	xmm2, [esi+eax+ 8]
		movlps	xmm3, [esi+eax+16]
		movhps	xmm1, [esi+eax+24]
		movhps	xmm2, [esi+eax+32]
		movhps	xmm3, [esi+eax+40]
		movaps	xmm0, xmm1
		shufps	xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps	xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps	xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
		add		ecx, 16
		add		eax, 4*12
		mulps	xmm0, xmm4
		mulps	xmm1, xmm5
		mulps	xmm2, xmm6
		addps	xmm0, xmm1
		addps	xmm0, xmm2
		shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )	// restore 0, 1, 2, 3 order
		movlps	[ecx-16+0], xmm0
		movhps	[ecx-16+8], xmm0
		jl		loop4
	done4:
		and		edx, 3
		jz		done1
	loop1:
		movss	xmm0, [esi+eax+0]
		movss	xmm1, [esi+eax+4]
		movss	xmm2, [esi+eax+8]
		mulss	xmm0, xmm4
		mulss	xmm1, xmm5
		mulss	xmm2, xmm6
		add		ecx, 4
		addss	xmm0, xmm1
		add		eax, 12
		addss	xmm0, xmm2
		dec		edx
		movss	[ecx-4], xmm0
		jnz		loop1
	done1:
	}
}
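
/*
============
Dot_Vec3Reference
Scalar sketch (hypothetical, for illustration) of the semantics shared by
the Dot variants over idVec3 arrays: each output element is the dot product
of the loop-invariant operand, kept broadcast in xmm4-xmm6 above, with one
source vector.
============
*/
static void Dot_Vec3Reference( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant.x * src[i].x + constant.y * src[i].y + constant.z * src[i].z;
	}
}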

/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	__asm {
		mov		eax, count
		mov		edi, constant
		mov		edx, eax
		mov		esi, src
		mov		ecx, dst
		and		eax, ~3
		movss	xmm5, [edi+0]
		shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm6, [edi+4]
		shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm7, [edi+8]
		shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		jz		startVert1
		imul	eax, 16
		add		esi, eax
		neg		eax
	loopVert4:
		movlps	xmm1, [esi+eax+ 0]
		movlps	xmm3, [esi+eax+ 8]
		movhps	xmm1, [esi+eax+16]
		movhps	xmm3, [esi+eax+24]
		movlps	xmm2, [esi+eax+32]
		movlps	xmm4, [esi+eax+40]
		movhps	xmm2, [esi+eax+48]
		movhps	xmm4, [esi+eax+56]
		movaps	xmm0, xmm1
		shufps	xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps	xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
		movaps	xmm2, xmm3
		shufps	xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps	xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
		add		ecx, 16
		add		eax, 4*16
		mulps	xmm0, xmm5
		mulps	xmm1, xmm6
		mulps	xmm2, xmm7
		addps	xmm0, xmm3
		addps	xmm0, xmm1
		addps	xmm0, xmm2
		movlps	[ecx-16+0], xmm0
		movhps	[ecx-16+8], xmm0
		jl		loopVert4
	startVert1:
		and		edx, 3
		jz		done
	loopVert1:
		movss	xmm0, [esi+eax+0]
		movss	xmm1, [esi+eax+4]
		movss	xmm2, [esi+eax+8]
		mulss	xmm0, xmm5
		mulss	xmm1, xmm6
		mulss	xmm2, xmm7
		addss	xmm0, [esi+eax+12]
		add		ecx, 4
		addss	xmm0, xmm1
		add		eax, 16
		addss	xmm0, xmm2
		dec		edx
		movss	[ecx-4], xmm0
		jnz		loopVert1
	done:
	}
}

/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	__asm {
		mov		eax, count
		mov		edi, constant
		mov		edx, eax
		mov		esi, src
		mov		ecx, dst
		and		eax, ~3
		movss	xmm4, [edi+0]
		shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm5, [edi+4]
		shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm6, [edi+8]
		shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		jz		startVert1
		imul	eax, DRAWVERT_SIZE
		add		esi, eax
		neg		eax
	loopVert4:
		movss	xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 3,  X,  X,  X
		movss	xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 2,  X,  X,  X
		movhps	xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 3,  X,  0,  1
		movaps	xmm1, xmm0												// 3,  X,  0,  1
		movlps	xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 4,  5,  0,  1
		shufps	xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					// 2,  X,  4,  5
		movss	xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 9,  X,  X,  X
		movhps	xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 9,  X,  6,  7
		shufps	xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					// 0,  3,  6,  9
		movlps	xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps	xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					// 1,  4,  7, 10
		movhps	xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps	xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					// 2,  5,  8, 11
		add		ecx, 16
		add		eax, 4*DRAWVERT_SIZE
		mulps	xmm0, xmm4
		mulps	xmm1, xmm5
		mulps	xmm2, xmm6
		addps	xmm0, xmm1
		addps	xmm0, xmm2
		movlps	[ecx-16+0], xmm0
		movhps	[ecx-16+8], xmm0
		jl		loopVert4
	startVert1:
		and		edx, 3
		jz		done
	loopVert1:
		movss	xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss	xmm0, xmm4
		mulss	xmm1, xmm5
		mulss	xmm2, xmm6
		add		ecx, 4
		addss	xmm0, xmm1
		add		eax, DRAWVERT_SIZE
		addss	xmm0, xmm2
		dec		edx
		movss	[ecx-4], xmm0
		jnz		loopVert1
	done:
	}
}

/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i] + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	__asm
	{
		mov		eax, count
		mov		edi, constant
		mov		edx, eax
		mov		esi, src
		mov		ecx, dst
		and		eax, ~3
		movss	xmm4, [edi+0]
		shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm5, [edi+4]
		shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm6, [edi+8]
		shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm7, [edi+12]
		shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		jz		done4
		imul	eax, 12
		add		esi, eax
		neg		eax
	loop4:
		movlps	xmm1, [esi+eax+ 0]
		movlps	xmm2, [esi+eax+ 8]
		movlps	xmm3, [esi+eax+16]
		movhps	xmm1, [esi+eax+24]
		movhps	xmm2, [esi+eax+32]
		movhps	xmm3, [esi+eax+40]
		movaps	xmm0, xmm1
		shufps	xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps	xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps	xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
		add		ecx, 16
		add		eax, 4*12
		mulps	xmm0, xmm4
		mulps	xmm1, xmm5
		mulps	xmm2, xmm6
		addps	xmm0, xmm7
		addps	xmm0, xmm1
		addps	xmm0, xmm2
		shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
		movlps	[ecx-16+0], xmm0
		movhps	[ecx-16+8], xmm0
		jl		loop4
	done4:
		and		edx, 3
		jz		done1
	loop1:
		movss	xmm0, [esi+eax+0]
		movss	xmm1, [esi+eax+4]
		movss	xmm2, [esi+eax+8]
		mulss	xmm0, xmm4
		mulss	xmm1, xmm5
		mulss	xmm2, xmm6
		addss	xmm0, xmm7
		add		ecx, 4
		addss	xmm0, xmm1
		add		eax, 12
		addss	xmm0, xmm2
		dec		edx
		movss	[ecx-4], xmm0
		jnz		loop1
	done1:
	}
}

/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {

#define SINGLE_OP(SRC, DEST) \
	__asm	movlps	xmm0,[SRC] \
	__asm	movlps	xmm1,[SRC+8] \
	__asm	mulps	xmm0,xmm4 \
	__asm	mulps	xmm1,xmm5 \
	__asm	addps	xmm0,xmm1 \
	__asm	movaps	xmm1,xmm0 \
	__asm	shufps	xmm1,xmm1,SHUFFLEPS(1,1,1,1) \
	__asm	addss	xmm0,xmm1 \
	__asm	movss	[DEST],xmm0 \
	__asm	add		SRC,16 \
	__asm	add		DEST,4

#define DUAL_OP(SRC, DEST) \
	__asm	movlps	xmm0,[SRC] \
	__asm	movlps	xmm1,[SRC+8] \
	__asm	movhps	xmm0,[SRC+16] \
	__asm	movhps	xmm1,[SRC+24] \
	__asm	mulps	xmm0,xmm4 \
	__asm	mulps	xmm1,xmm5 \
	__asm	addps	xmm0,xmm1 \
	__asm	shufps	xmm1,xmm0,SHUFFLEPS(2,0,1,0) \
	__asm	shufps	xmm0,xmm0,SHUFFLEPS(3,1,2,0) \
	__asm	addps	xmm0,xmm1 \
	__asm	movhps	[DEST],xmm0 \
	__asm	add		SRC,32 \
	__asm	add		DEST,8

	__asm {
		mov		edx, dst
		mov		eax, src
		mov		ebx, constant
		mov		ecx, count
		movlps	xmm4, [ebx]
		shufps	xmm4, xmm4, SHUFFLEPS(1,0,1,0)
		movlps	xmm5, [ebx+8]
		shufps	xmm5, xmm5, SHUFFLEPS(1,0,1,0)
		xorps	xmm0, xmm0
		xorps	xmm1, xmm1
	_lpAlignDest:
		test	edx, 0x0f
		jz		_destAligned
		SINGLE_OP(eax,edx)
		dec		ecx
		jnz		_lpAlignDest
		jmp		_vpExit
	_destAligned:
		push	ecx
		cmp		ecx, 4
		jl		_post
		and		ecx, ~3
		shl		ecx, 2
		lea		eax, [eax+ecx*4]
		add		edx, ecx
		neg		ecx
		movlps	xmm0, [eax+ecx*4]
		movhps	xmm0, [eax+ecx*4+16]
		movlps	xmm2, [eax+ecx*4+32]
		movhps	xmm2, [eax+ecx*4+48]
		jmp		_lpStart
		align	16
	_lp:
		prefetchnta	[eax+ecx*4+128]
		addps	xmm1, xmm0
		movlps	xmm0, [eax+ecx*4]
		movhps	xmm0, [eax+ecx*4+16]
		movlps	xmm2, [eax+ecx*4+32]
		movhps	xmm2, [eax+ecx*4+48]
		movaps	[edx+ecx-16],xmm1
	_lpStart:
		movlps	xmm1, [eax+ecx*4+8]
		movhps	xmm1, [eax+ecx*4+24]
		movlps	xmm3, [eax+ecx*4+40]
		movhps	xmm3, [eax+ecx*4+56]
		add		ecx, 16
		mulps	xmm1, xmm5
		mulps	xmm2, xmm4
		mulps	xmm3, xmm5
		addps	xmm2, xmm3						// y3+w3 x3+z3 y2+w2 x2+z2
		mulps	xmm0, xmm4
		addps	xmm0, xmm1						// y1+w1 x1+z1 y0+w0 x0+z0
		movaps	xmm1, xmm0
		shufps	xmm0, xmm2, SHUFFLEPS(2,0,2,0)	// x3+z3 x2+z2 x1+z1 x0+z0
		shufps	xmm1, xmm2, SHUFFLEPS(3,1,3,1)	// y3+w3 y2+w2 y1+w1 y0+w0
		js		_lp
		addps	xmm1, xmm0
		movaps	[edx+ecx-16], xmm1
	_post:
		pop		ecx
		and		ecx, 0x3
		cmp		ecx, 2
		jl		_post1
		DUAL_OP(eax,edx)
		sub		ecx, 2
	_post1:
		cmp		ecx, 1
		jne		_vpExit
		SINGLE_OP(eax,edx)
	_vpExit:
	}

#undef DUAL_OP
#undef SINGLE_OP
}

/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	__asm {
		mov		eax, count
		mov		edi, constant
		mov		edx, eax
		mov		esi, src
		mov		ecx, dst
		and		eax, ~3
		movss	xmm4, [edi+0]
		shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm5, [edi+4]
		shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm6, [edi+8]
		shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss	xmm7, [edi+12]
		shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		jz		startVert1
		imul	eax, DRAWVERT_SIZE
		add		esi, eax
		neg		eax
	loopVert4:
		movss	xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 3,  X,  X,  X
		movss	xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 2,  X,  X,  X
		movhps	xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 3,  X,  0,  1
		movaps	xmm1, xmm0												// 3,  X,  0,  1
		movlps	xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 4,  5,  0,  1
		shufps	xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					// 2,  X,  4,  5
		movss	xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 9,  X,  X,  X
		movhps	xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 9,  X,  6,  7
		shufps	xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					// 0,  3,  6,  9
		movlps	xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps	xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					// 1,  4,  7, 10
		movhps	xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps	xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					// 2,  5,  8, 11
		add		ecx, 16
		add		eax, 4*DRAWVERT_SIZE
		mulps	xmm0, xmm4
		mulps	xmm1, xmm5
		mulps	xmm2, xmm6
		addps	xmm0, xmm7
		addps	xmm0, xmm1
		addps	xmm0, xmm2
		movlps	[ecx-16+0], xmm0
		movhps	[ecx-16+8], xmm0
		jl		loopVert4
	startVert1:
		and		edx, 3
		jz		done
	loopVert1:
		movss	xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss	xmm0, xmm4
		mulss	xmm1, xmm5
		mulss	xmm2, xmm6
		addss	xmm0, xmm7
		add		ecx, 4
		addss	xmm0, xmm1
		add		eax, DRAWVERT_SIZE
		addss	xmm0, xmm2
		dec		edx
		movss	[ecx-4], xmm0
		jnz		loopVert1
	done:
	}
}
  2760. /*
  2761. ============
  2762. idSIMD_SSE::Dot
  2763. dst[i] = src0[i] * src1[i];
  2764. ============
  2765. */
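// Scalar sketch (documentation only, not built): idVec3::operator* is the
// dot product, so each output element is a 3-component dot.
#if 0
static void Dot_Vec3_Generic( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = src0[i] * src1[i];
	}
}
#endif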
  2766. void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
  2767. __asm
  2768. {
  2769. mov eax, count
  2770. mov edi, src0
  2771. mov edx, eax
  2772. mov esi, src1
  2773. mov ecx, dst
  2774. and eax, ~3
  2775. jz done4
  2776. imul eax, 12
  2777. add edi, eax
  2778. add esi, eax
  2779. neg eax
  2780. loop4:
  2781. movlps xmm0, [esi+eax] // 0, 1, X, X
  2782. movlps xmm3, [edi+eax] // 0, 1, X, X
  2783. movlps xmm1, [esi+eax+8] // 2, 3, X, X
  2784. movlps xmm4, [edi+eax+8] // 2, 3, X, X
  2785. movhps xmm0, [esi+eax+24] // 0, 1, 6, 7
  2786. movhps xmm3, [edi+eax+24] // 0, 1, 6, 7
  2787. movhps xmm1, [esi+eax+32] // 2, 3, 8, 9
  2788. movhps xmm4, [edi+eax+32] // 2, 3, 8, 9
  2789. movlps xmm2, [esi+eax+16] // 4, 5, X, X
  2790. movlps xmm5, [edi+eax+16] // 4, 5, X, X
  2791. movhps xmm2, [esi+eax+40] // 4, 5, 10, 11
  2792. movhps xmm5, [edi+eax+40] // 4, 5, 10, 11
  2793. add ecx, 16
  2794. add eax, 48
  2795. mulps xmm0, xmm3
  2796. mulps xmm1, xmm4
  2797. mulps xmm2, xmm5
  2798. movaps xmm7, xmm0
  2799. shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) // 0, 6, 3, 9
  2800. shufps xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 ) // 1, 7, 4, 10
  2801. shufps xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) // 2, 8, 5, 11
  2802. addps xmm7, xmm0
  2803. addps xmm7, xmm1
  2804. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
  2805. movlps [ecx-16+0], xmm7
  2806. movhps [ecx-16+8], xmm7
  2807. jl loop4
  2808. done4:
  2809. and edx, 3
  2810. jz done1
  2811. loop1:
  2812. movss xmm0, [esi+eax+0]
  2813. movss xmm3, [edi+eax+0]
  2814. movss xmm1, [esi+eax+4]
  2815. movss xmm4, [edi+eax+4]
  2816. movss xmm2, [esi+eax+8]
  2817. movss xmm5, [edi+eax+8]
  2818. mulss xmm0, xmm3
  2819. mulss xmm1, xmm4
  2820. mulss xmm2, xmm5
  2821. add ecx, 4
  2822. addss xmm0, xmm1
  2823. add eax, 12
  2824. addss xmm0, xmm2
  2825. dec edx
  2826. movss [ecx-4], xmm0
  2827. jnz loop1
  2828. done1:
  2829. }
  2830. }
  2831. /*
  2832. ============
  2833. idSIMD_SSE::Dot
  2834. dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
  2835. ============
  2836. */
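// Scalar sketch of the strategy used below (documentation only, not built):
// four partial sums are kept SIMD-style, the count & 3 tail is folded in,
// and the partials are reduced at the end; this matches the asm result up
// to float summation order.
#if 0
static void Dot_Sum_Generic( float &dot, const float *src1, const float *src2, const int count ) {
	float sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
	int i;
	for ( i = 0; i + 4 <= count; i += 4 ) {		// vector body (mulps/addps)
		sum[0] += src1[i+0] * src2[i+0];
		sum[1] += src1[i+1] * src2[i+1];
		sum[2] += src1[i+2] * src2[i+2];
		sum[3] += src1[i+3] * src2[i+3];
	}
	for ( ; i < count; i++ ) {					// count & 3 tail
		sum[0] += src1[i] * src2[i];
	}
	dot = ( sum[0] + sum[2] ) + ( sum[1] + sum[3] );	// movhlps/shufps reduction
}
#endif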
  2837. void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
  2838. switch( count ) {
  2839. case 0:
  2840. dot = 0.0f;
  2841. return;
  2842. case 1:
  2843. dot = src1[0] * src2[0];
  2844. return;
  2845. case 2:
  2846. dot = src1[0] * src2[0] + src1[1] * src2[1];
  2847. return;
  2848. case 3:
  2849. dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
  2850. return;
  2851. default:
  2852. __asm {
  2853. mov ecx, src1
  2854. mov edx, src2
  2855. mov eax, ecx
  2856. or eax, edx
  2857. and eax, 15
  2858. jz alignedDot
  2859. // unaligned
  2860. mov eax, count
  2861. shr eax, 2
  2862. shl eax, 4
  2863. add ecx, eax
  2864. add edx, eax
  2865. neg eax
  2866. movups xmm0, [ecx+eax]
  2867. movups xmm1, [edx+eax]
  2868. mulps xmm0, xmm1
  2869. add eax, 16
  2870. jz doneDot
  2871. loopUnalignedDot:
  2872. movups xmm1, [ecx+eax]
  2873. movups xmm2, [edx+eax]
  2874. mulps xmm1, xmm2
  2875. addps xmm0, xmm1
  2876. add eax, 16
  2877. jl loopUnalignedDot
  2878. jmp doneDot
  2879. // aligned
  2880. alignedDot:
  2881. mov eax, count
  2882. shr eax, 2
  2883. shl eax, 4
  2884. add ecx, eax
  2885. add edx, eax
  2886. neg eax
  2887. movaps xmm0, [ecx+eax]
  2888. movaps xmm1, [edx+eax]
  2889. mulps xmm0, xmm1
  2890. add eax, 16
  2891. jz doneDot
  2892. loopAlignedDot:
  2893. movaps xmm1, [ecx+eax]
  2894. movaps xmm2, [edx+eax]
  2895. mulps xmm1, xmm2
  2896. addps xmm0, xmm1
  2897. add eax, 16
  2898. jl loopAlignedDot
  2899. doneDot:
  2900. }
  2901. switch( count & 3 ) {
  2902. case 1:
  2903. __asm {
  2904. movss xmm1, [ecx]
  2905. movss xmm2, [edx]
  2906. mulss xmm1, xmm2
  2907. addss xmm0, xmm1
  2908. }
  2909. break;
  2910. case 2:
  2911. __asm {
  2912. xorps xmm2, xmm2
  2913. movlps xmm1, [ecx]
  2914. movlps xmm2, [edx]
  2915. mulps xmm1, xmm2
  2916. addps xmm0, xmm1
  2917. }
  2918. break;
  2919. case 3:
  2920. __asm {
  2921. movss xmm1, [ecx]
  2922. movhps xmm1, [ecx+4]
  2923. movss xmm2, [edx]
  2924. movhps xmm2, [edx+4]
  2925. mulps xmm1, xmm2
  2926. addps xmm0, xmm1
  2927. }
  2928. break;
  2929. }
  2930. __asm {
  2931. movhlps xmm1, xmm0
  2932. addps xmm0, xmm1
  2933. movaps xmm1, xmm0
  2934. shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
  2935. addss xmm0, xmm1
  2936. mov eax, dot
  2937. movss [eax], xmm0
  2938. }
  2939. return;
  2940. }
  2941. }
  2942. //
  2943. // cmpeqps == Equal
  2944. // cmpneqps != Not Equal
  2945. // cmpltps < Less Than
  2946. // cmpnltps >= Not Less Than
  2947. // cmpnleps > Not Less Or Equal
  2948. //
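//
// The compare kernels below convert the 4-bit movmskps result into four
// 0/1 output bytes without branching. In C terms (a sketch of the eax/ebx
// register dance; little-endian byte order assumed):
//
//	unsigned ax  = mask | ( ( mask >> 1 ) << 8 );	// mov ah, al / shr ah, 1
//	unsigned ebx = ( ax << 14 ) | ax;				// mov bx, ax / shl ebx, 14 / mov bx, ax
//	ebx &= 0x01010101;								// one 0/1 byte per compared lane
//
// CmpGT/CmpGE map to cmpnleps/cmpnltps, CmpLT to cmpltps, and CmpLE reuses
// cmpnleps with the FLIP (not al) mask inversion.
//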
  2949. #define FLIP not al
  2950. #define NOFLIP
  2951. #define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
  2952. int i, cnt, pre, post; \
  2953. float *aligned; \
  2954. \
  2955. /* if the float array is not aligned on a 4 byte boundary */ \
  2956. if ( ((int) SRC0) & 3 ) { \
  2957. /* unaligned memory access */ \
  2958. pre = 0; \
  2959. cnt = COUNT >> 2; \
  2960. post = COUNT - (cnt<<2); \
  2961. __asm mov edx, cnt \
  2962. __asm test edx, edx \
  2963. __asm je doneCmp \
  2964. __asm push ebx \
  2965. __asm neg edx \
  2966. __asm mov esi, SRC0 \
  2967. __asm prefetchnta [esi+64] \
  2968. __asm movss xmm1, CONSTANT \
  2969. __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  2970. __asm mov edi, DST \
  2971. __asm mov ecx, 0x01010101 \
  2972. __asm loopNA: \
  2973. __asm movups xmm0, [esi] \
  2974. __asm prefetchnta [esi+128] \
  2975. __asm CMPSIMD xmm0, xmm1 \
  2976. __asm movmskps eax, xmm0 \
  2977. __asm DOFLIP \
  2978. __asm mov ah, al \
  2979. __asm shr ah, 1 \
  2980. __asm mov bx, ax \
  2981. __asm shl ebx, 14 \
  2982. __asm mov bx, ax \
  2983. __asm and ebx, ecx \
  2984. __asm mov dword ptr [edi], ebx \
  2985. __asm add esi, 16 \
  2986. __asm add edi, 4 \
  2987. __asm inc edx \
  2988. __asm jl loopNA \
  2989. __asm pop ebx \
  2990. } \
  2991. else { \
  2992. /* aligned memory access */ \
  2993. aligned = (float *) ((((int) SRC0) + 15) & ~15); \
2994. if ( aligned > SRC0 + COUNT ) { \
  2995. pre = COUNT; \
  2996. post = 0; \
  2997. } \
  2998. else { \
  2999. pre = aligned - SRC0; \
  3000. cnt = (COUNT - pre) >> 2; \
  3001. post = COUNT - pre - (cnt<<2); \
  3002. __asm mov edx, cnt \
  3003. __asm test edx, edx \
  3004. __asm je doneCmp \
  3005. __asm push ebx \
  3006. __asm neg edx \
  3007. __asm mov esi, aligned \
  3008. __asm prefetchnta [esi+64] \
  3009. __asm movss xmm1, CONSTANT \
  3010. __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  3011. __asm mov edi, DST \
  3012. __asm add edi, pre \
  3013. __asm mov ecx, 0x01010101 \
  3014. __asm loopA: \
  3015. __asm movaps xmm0, [esi] \
  3016. __asm prefetchnta [esi+128] \
  3017. __asm CMPSIMD xmm0, xmm1 \
  3018. __asm movmskps eax, xmm0 \
  3019. __asm DOFLIP \
  3020. __asm mov ah, al \
  3021. __asm shr ah, 1 \
  3022. __asm mov bx, ax \
  3023. __asm shl ebx, 14 \
  3024. __asm mov bx, ax \
  3025. __asm and ebx, ecx \
  3026. __asm mov dword ptr [edi], ebx \
  3027. __asm add esi, 16 \
  3028. __asm add edi, 4 \
  3029. __asm inc edx \
  3030. __asm jl loopA \
  3031. __asm pop ebx \
  3032. } \
  3033. } \
  3034. doneCmp: \
3035. double c = CONSTANT; \
3036. for ( i = 0; i < pre; i++ ) { \
3037. DST[i] = SRC0[i] CMP c; \
3038. } \
3039. for ( i = COUNT - post; i < COUNT; i++ ) { \
3040. DST[i] = SRC0[i] CMP c; \
3041. }
  3042. #define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
  3043. int i, cnt, pre, post; \
  3044. float *aligned; \
  3045. \
  3046. /* if the float array is not aligned on a 4 byte boundary */ \
  3047. if ( ((int) SRC0) & 3 ) { \
  3048. /* unaligned memory access */ \
  3049. pre = 0; \
  3050. cnt = COUNT >> 2; \
  3051. post = COUNT - (cnt<<2); \
  3052. __asm mov edx, cnt \
  3053. __asm test edx, edx \
  3054. __asm je doneCmp \
  3055. __asm push ebx \
  3056. __asm neg edx \
  3057. __asm mov esi, SRC0 \
  3058. __asm prefetchnta [esi+64] \
  3059. __asm movss xmm1, CONSTANT \
  3060. __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  3061. __asm mov edi, DST \
3062. __asm mov cl, BITNUM \
  3063. __asm loopNA: \
  3064. __asm movups xmm0, [esi] \
  3065. __asm prefetchnta [esi+128] \
  3066. __asm CMPSIMD xmm0, xmm1 \
  3067. __asm movmskps eax, xmm0 \
  3068. __asm DOFLIP \
  3069. __asm mov ah, al \
  3070. __asm shr ah, 1 \
  3071. __asm mov bx, ax \
  3072. __asm shl ebx, 14 \
  3073. __asm mov bx, ax \
  3074. __asm and ebx, 0x01010101 \
  3075. __asm shl ebx, cl \
  3076. __asm or ebx, dword ptr [edi] \
  3077. __asm mov dword ptr [edi], ebx \
  3078. __asm add esi, 16 \
  3079. __asm add edi, 4 \
  3080. __asm inc edx \
  3081. __asm jl loopNA \
  3082. __asm pop ebx \
  3083. } \
  3084. else { \
  3085. /* aligned memory access */ \
  3086. aligned = (float *) ((((int) SRC0) + 15) & ~15); \
3087. if ( aligned > SRC0 + COUNT ) { \
  3088. pre = COUNT; \
  3089. post = 0; \
  3090. } \
  3091. else { \
  3092. pre = aligned - SRC0; \
  3093. cnt = (COUNT - pre) >> 2; \
  3094. post = COUNT - pre - (cnt<<2); \
  3095. __asm mov edx, cnt \
  3096. __asm test edx, edx \
  3097. __asm je doneCmp \
  3098. __asm push ebx \
  3099. __asm neg edx \
  3100. __asm mov esi, aligned \
  3101. __asm prefetchnta [esi+64] \
  3102. __asm movss xmm1, CONSTANT \
  3103. __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  3104. __asm mov edi, DST \
  3105. __asm add edi, pre \
3106. __asm mov cl, BITNUM \
  3107. __asm loopA: \
  3108. __asm movaps xmm0, [esi] \
  3109. __asm prefetchnta [esi+128] \
  3110. __asm CMPSIMD xmm0, xmm1 \
  3111. __asm movmskps eax, xmm0 \
  3112. __asm DOFLIP \
  3113. __asm mov ah, al \
  3114. __asm shr ah, 1 \
  3115. __asm mov bx, ax \
  3116. __asm shl ebx, 14 \
  3117. __asm mov bx, ax \
  3118. __asm and ebx, 0x01010101 \
  3119. __asm shl ebx, cl \
  3120. __asm or ebx, dword ptr [edi] \
  3121. __asm mov dword ptr [edi], ebx \
  3122. __asm add esi, 16 \
  3123. __asm add edi, 4 \
  3124. __asm inc edx \
  3125. __asm jl loopA \
  3126. __asm pop ebx \
  3127. } \
  3128. } \
  3129. doneCmp: \
3130. float c = CONSTANT; \
3131. for ( i = 0; i < pre; i++ ) { \
3132. DST[i] |= ( SRC0[i] CMP c ) << BITNUM; \
3133. } \
3134. for ( i = COUNT - post; i < COUNT; i++ ) { \
3135. DST[i] |= ( SRC0[i] CMP c ) << BITNUM; \
3136. }
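// COMPAREBITCONSTANT differs from COMPARECONSTANT only in the store: the
// per-lane result bits are shifted left by bitNum (shl ebx, cl) and OR-ed
// into the existing dst bytes, so successive compare passes can pack one
// flag per bit into the same byte array.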
  3137. /*
  3138. ============
  3139. idSIMD_SSE::CmpGT
  3140. dst[i] = src0[i] > constant;
  3141. ============
  3142. */
  3143. void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
  3144. COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
  3145. }
  3146. /*
  3147. ============
  3148. idSIMD_SSE::CmpGT
  3149. dst[i] |= ( src0[i] > constant ) << bitNum;
  3150. ============
  3151. */
  3152. void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  3153. COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
  3154. }
  3155. /*
  3156. ============
  3157. idSIMD_SSE::CmpGE
  3158. dst[i] = src0[i] >= constant;
  3159. ============
  3160. */
  3161. void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
  3162. COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
  3163. }
  3164. /*
  3165. ============
  3166. idSIMD_SSE::CmpGE
  3167. dst[i] |= ( src0[i] >= constant ) << bitNum;
  3168. ============
  3169. */
  3170. void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  3171. COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
  3172. }
  3173. /*
  3174. ============
  3175. idSIMD_SSE::CmpLT
  3176. dst[i] = src0[i] < constant;
  3177. ============
  3178. */
  3179. void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
  3180. COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
  3181. }
  3182. /*
  3183. ============
  3184. idSIMD_SSE::CmpLT
  3185. dst[i] |= ( src0[i] < constant ) << bitNum;
  3186. ============
  3187. */
  3188. void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  3189. COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
  3190. }
  3191. /*
  3192. ============
  3193. idSIMD_SSE::CmpLE
  3194. dst[i] = src0[i] <= constant;
  3195. ============
  3196. */
  3197. void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
  3198. COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
  3199. }
  3200. /*
  3201. ============
  3202. idSIMD_SSE::CmpLE
  3203. dst[i] |= ( src0[i] <= constant ) << bitNum;
  3204. ============
  3205. */
  3206. void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  3207. COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
  3208. }
  3209. /*
  3210. ============
  3211. idSIMD_SSE::MinMax
  3212. ============
  3213. */
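// KFLOATINITS (a setup macro defined earlier in this file) points the asm
// loop at the 16-byte aligned middle of the array and returns the head and
// tail element counts in pre/post; those stragglers are folded in by the
// scalar loops after the asm block. The loop keeps four running minima in
// xmm0 and four maxima in xmm1, reduced to scalars at done2.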
  3214. void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
  3215. int i, pre, post;
  3216. min = idMath::INFINITY; max = -idMath::INFINITY;
  3217. __asm
  3218. {
  3219. push ebx
  3220. mov eax, min
  3221. mov ebx, max
  3222. movss xmm0, [eax]
  3223. movss xmm1, [ebx]
  3224. shufps xmm0, xmm0, 0
  3225. shufps xmm1, xmm1, 0
  3226. KFLOATINITS( src, count, pre, post )
  3227. and eax, 15
  3228. jz lpA
  3229. jmp lpNA
  3230. align 16
  3231. lpNA:
  3232. movups xmm2, [edx+ebx]
  3233. movups xmm3, [edx+ebx+16]
  3234. minps xmm0, xmm2
  3235. maxps xmm1, xmm2
  3236. prefetchnta [edx+ebx+64]
  3237. minps xmm0, xmm3
  3238. maxps xmm1, xmm3
  3239. add ebx, 16*2
  3240. jl lpNA
  3241. jmp done2
  3242. lpA:
  3243. movaps xmm2, [edx+ebx]
  3244. movaps xmm3, [edx+ebx+16]
  3245. minps xmm0, xmm2
  3246. maxps xmm1, xmm2
  3247. prefetchnta [edx+ebx+64]
  3248. minps xmm0, xmm3
  3249. maxps xmm1, xmm3
  3250. add ebx, 16*2
  3251. jl lpA
  3252. jmp done2
  3253. align 16
  3254. done2:
  3255. movaps xmm2, xmm0
  3256. movaps xmm3, xmm1
  3257. shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
  3258. shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
  3259. minss xmm0, xmm2
  3260. maxss xmm1, xmm3
  3261. shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
  3262. shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
  3263. minss xmm0, xmm2
  3264. maxss xmm1, xmm3
  3265. shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
  3266. shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
  3267. minss xmm0, xmm2
  3268. maxss xmm1, xmm3
  3269. mov eax, min
  3270. mov ebx, max
  3271. movss [eax], xmm0
  3272. movss [ebx], xmm1
  3273. done:
  3274. pop ebx
  3275. }
  3276. for ( i = 0; i < pre; i++ ) {
  3277. float tmp = src[i];
  3278. if ( tmp > max ) {
  3279. max = tmp;
  3280. }
  3281. if ( tmp < min ) {
  3282. min = tmp;
  3283. }
  3284. }
  3285. for ( i = count - post; i < count; i++ ) {
  3286. float tmp = src[i];
  3287. if ( tmp > max ) {
  3288. max = tmp;
  3289. }
  3290. if ( tmp < min ) {
  3291. min = tmp;
  3292. }
  3293. }
  3294. }
  3295. /*
  3296. ============
  3297. idSIMD_SSE::MinMax
  3298. ============
  3299. */
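// If count is odd, the first point is loaded duplicated into both halves
// of xmm2 (movlps + shufps) so the main loop can assume an even count and
// process two idVec2s per iteration; the final shufps folds the two (x,y)
// pairs held in each accumulator together.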
  3300. void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
  3301. __asm {
  3302. mov eax, count
  3303. test eax, eax
  3304. movss xmm0, idMath::INFINITY
  3305. xorps xmm1, xmm1
  3306. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  3307. subps xmm1, xmm0
  3308. jz done
  3309. mov ecx, eax
  3310. and ecx, 1
  3311. mov esi, src
  3312. jz startLoop
  3313. movlps xmm2, [esi]
  3314. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
  3315. dec eax
  3316. add esi, 2*4
  3317. minps xmm0, xmm2
  3318. maxps xmm1, xmm2
  3319. startLoop:
  3320. imul eax, 2*4
  3321. add esi, eax
  3322. neg eax
  3323. loopVert:
  3324. movlps xmm2, [esi+eax]
  3325. movhps xmm2, [esi+eax+8]
  3326. add eax, 4*4
  3327. minps xmm0, xmm2
  3328. maxps xmm1, xmm2
  3329. jl loopVert
  3330. done:
  3331. movaps xmm2, xmm0
  3332. shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
  3333. minps xmm0, xmm2
  3334. mov esi, min
  3335. movlps [esi], xmm0
  3336. movaps xmm3, xmm1
  3337. shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
  3338. maxps xmm1, xmm3
  3339. mov edi, max
  3340. movlps [edi], xmm1
  3341. }
  3342. }
  3343. /*
  3344. ============
  3345. idSIMD_SSE::MinMax
  3346. ============
  3347. */
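// The 4-wide loop below keeps two accumulator pairs with staggered lane
// layouts: xmm0/xmm1 hold ( z, -, x, y ) and xmm2/xmm3 hold ( x, -, y, z ).
// Loading alternate vectors with movss/movhps in these two patterns avoids
// any shuffle inside the loop; the shufps at done1 rotates the second pair
// into the first pair's layout before the final minps/maxps and the
// movhps/movss store of ( x, y ) and z.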
  3348. void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
  3349. __asm {
  3350. movss xmm0, idMath::INFINITY
  3351. xorps xmm1, xmm1
  3352. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  3353. subps xmm1, xmm0
  3354. movaps xmm2, xmm0
  3355. movaps xmm3, xmm1
  3356. mov esi, src
  3357. mov eax, count
  3358. and eax, ~3
  3359. jz done4
  3360. imul eax, 12
  3361. add esi, eax
  3362. neg eax
  3363. loop4:
  3364. // prefetchnta [esi+4*12]
  3365. movss xmm4, [esi+eax+0*12+8]
  3366. movhps xmm4, [esi+eax+0*12+0]
  3367. minps xmm0, xmm4
  3368. maxps xmm1, xmm4
  3369. movss xmm5, [esi+eax+1*12+0]
  3370. movhps xmm5, [esi+eax+1*12+4]
  3371. minps xmm2, xmm5
  3372. maxps xmm3, xmm5
  3373. movss xmm6, [esi+eax+2*12+8]
  3374. movhps xmm6, [esi+eax+2*12+0]
  3375. minps xmm0, xmm6
  3376. maxps xmm1, xmm6
  3377. movss xmm7, [esi+eax+3*12+0]
  3378. movhps xmm7, [esi+eax+3*12+4]
  3379. minps xmm2, xmm7
  3380. maxps xmm3, xmm7
  3381. add eax, 4*12
  3382. jl loop4
  3383. done4:
  3384. mov eax, count
  3385. and eax, 3
  3386. jz done1
  3387. imul eax, 12
  3388. add esi, eax
  3389. neg eax
  3390. loop1:
  3391. movss xmm4, [esi+eax+0*12+8]
  3392. movhps xmm4, [esi+eax+0*12+0]
  3393. minps xmm0, xmm4
  3394. maxps xmm1, xmm4
  3395. add eax, 12
  3396. jl loop1
  3397. done1:
  3398. shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
  3399. shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
  3400. minps xmm0, xmm2
  3401. maxps xmm1, xmm3
  3402. mov esi, min
  3403. movhps [esi], xmm0
  3404. movss [esi+8], xmm0
  3405. mov edi, max
  3406. movhps [edi], xmm1
  3407. movss [edi+8], xmm1
  3408. }
  3409. }
  3410. /*
  3411. ============
  3412. idSIMD_SSE::MinMax
  3413. ============
  3414. */
  3415. void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
  3416. assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
  3417. assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
  3418. __asm {
  3419. movss xmm0, idMath::INFINITY
  3420. xorps xmm1, xmm1
  3421. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  3422. subps xmm1, xmm0
  3423. movaps xmm2, xmm0
  3424. movaps xmm3, xmm1
  3425. mov esi, src
  3426. mov eax, count
  3427. and eax, ~3
  3428. jz done4
  3429. imul eax, DRAWVERT_SIZE
  3430. add esi, eax
  3431. neg eax
  3432. loop4:
  3433. // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
  3434. movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
  3435. movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
  3436. minps xmm0, xmm4
  3437. maxps xmm1, xmm4
  3438. movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
  3439. movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
  3440. minps xmm2, xmm5
  3441. maxps xmm3, xmm5
  3442. movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
  3443. movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
  3444. minps xmm0, xmm6
  3445. maxps xmm1, xmm6
  3446. movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
  3447. movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
  3448. minps xmm2, xmm7
  3449. maxps xmm3, xmm7
  3450. add eax, 4*DRAWVERT_SIZE
  3451. jl loop4
  3452. done4:
  3453. mov eax, count
  3454. and eax, 3
  3455. jz done1
  3456. imul eax, DRAWVERT_SIZE
  3457. add esi, eax
  3458. neg eax
  3459. loop1:
  3460. movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
  3461. movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
  3462. minps xmm0, xmm4
  3463. maxps xmm1, xmm4
  3464. add eax, DRAWVERT_SIZE
  3465. jl loop1
  3466. done1:
  3467. shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
  3468. shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
  3469. minps xmm0, xmm2
  3470. maxps xmm1, xmm3
  3471. mov esi, min
  3472. movhps [esi], xmm0
  3473. movss [esi+8], xmm0
  3474. mov edi, max
  3475. movhps [edi], xmm1
  3476. movss [edi+8], xmm1
  3477. }
  3478. }
  3479. /*
  3480. ============
  3481. idSIMD_SSE::MinMax
  3482. ============
  3483. */
  3484. void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
  3485. assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
  3486. assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
  3487. __asm {
  3488. movss xmm0, idMath::INFINITY
  3489. xorps xmm1, xmm1
  3490. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  3491. subps xmm1, xmm0
  3492. movaps xmm2, xmm0
  3493. movaps xmm3, xmm1
  3494. mov edi, indexes
  3495. mov esi, src
  3496. mov eax, count
  3497. and eax, ~3
  3498. jz done4
  3499. shl eax, 2
  3500. add edi, eax
  3501. neg eax
  3502. loop4:
  3503. // prefetchnta [edi+128]
  3504. // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
  3505. mov edx, [edi+eax+0]
  3506. imul edx, DRAWVERT_SIZE
  3507. movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
  3508. movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
  3509. minps xmm0, xmm4
  3510. maxps xmm1, xmm4
  3511. mov edx, [edi+eax+4]
  3512. imul edx, DRAWVERT_SIZE
  3513. movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
  3514. movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
  3515. minps xmm2, xmm5
  3516. maxps xmm3, xmm5
  3517. mov edx, [edi+eax+8]
  3518. imul edx, DRAWVERT_SIZE
  3519. movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
  3520. movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
  3521. minps xmm0, xmm6
  3522. maxps xmm1, xmm6
  3523. mov edx, [edi+eax+12]
  3524. imul edx, DRAWVERT_SIZE
  3525. movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
  3526. movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
  3527. minps xmm2, xmm7
  3528. maxps xmm3, xmm7
  3529. add eax, 4*4
  3530. jl loop4
  3531. done4:
  3532. mov eax, count
  3533. and eax, 3
  3534. jz done1
  3535. shl eax, 2
  3536. add edi, eax
  3537. neg eax
  3538. loop1:
  3539. mov edx, [edi+eax+0]
3540. imul edx, DRAWVERT_SIZE
  3541. movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
  3542. movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
  3543. minps xmm0, xmm4
  3544. maxps xmm1, xmm4
  3545. add eax, 4
  3546. jl loop1
  3547. done1:
  3548. shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
  3549. shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
  3550. minps xmm0, xmm2
  3551. maxps xmm1, xmm3
  3552. mov esi, min
  3553. movhps [esi], xmm0
  3554. movss [esi+8], xmm0
  3555. mov edi, max
  3556. movhps [edi], xmm1
  3557. movss [edi+8], xmm1
  3558. }
  3559. }
  3560. /*
  3561. ============
  3562. idSIMD_SSE::Clamp
  3563. ============
  3564. */
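// Clamps as min( max( src[i], min ), max ): maxps applies the lower bound,
// minps the upper. KFLOATINITDS is the dst/src variant of the setup macro
// used by MinMax above; unaligned head/tail elements are again left to the
// scalar loops that follow the asm block.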
  3565. void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
  3566. int i, pre, post;
  3567. __asm
  3568. {
  3569. movss xmm0,min
  3570. movss xmm1,max
  3571. shufps xmm0,xmm0,0
  3572. shufps xmm1,xmm1,0
  3573. KFLOATINITDS( dst, src, count, pre, post )
  3574. and eax,15
  3575. jne lpNA
  3576. jmp lpA
  3577. align 16
  3578. lpA:
  3579. movaps xmm2,[edx+ebx]
  3580. movaps xmm3,[edx+ebx+16]
  3581. maxps xmm2,xmm0
  3582. maxps xmm3,xmm0
  3583. prefetchnta [edx+ebx+64]
  3584. minps xmm2,xmm1
  3585. minps xmm3,xmm1
  3586. movaps [edi+ebx],xmm2
  3587. movaps [edi+ebx+16],xmm3
  3588. add ebx,16*2
  3589. jl lpA
  3590. jmp done
  3591. align 16
  3592. lpNA:
  3593. movups xmm2,[edx+ebx]
  3594. movups xmm3,[edx+ebx+16]
  3595. maxps xmm2,xmm0
  3596. maxps xmm3,xmm0
  3597. prefetchnta [edx+ebx+64]
  3598. minps xmm2,xmm1
  3599. minps xmm3,xmm1
  3600. movaps [edi+ebx],xmm2
  3601. movaps [edi+ebx+16],xmm3
  3602. add ebx,16*2
  3603. jl lpNA
  3604. done:
  3605. }
  3606. for ( i = 0; i < pre; i++ ) {
  3607. if ( src[i] < min )
  3608. dst[i] = min;
  3609. else if ( src[i] > max )
  3610. dst[i] = max;
  3611. else
  3612. dst[i] = src[i];
  3613. }
  3614. for( i = count - post; i < count; i++ ) {
  3615. if ( src[i] < min )
  3616. dst[i] = min;
  3617. else if ( src[i] > max )
  3618. dst[i] = max;
  3619. else
  3620. dst[i] = src[i];
  3621. }
  3622. }
  3623. /*
  3624. ============
  3625. idSIMD_SSE::ClampMin
  3626. ============
  3627. */
  3628. void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
  3629. int i, pre, post;
  3630. __asm
  3631. {
  3632. movss xmm0,min
  3633. shufps xmm0,xmm0,0
  3634. KFLOATINITDS( dst, src, count, pre, post )
  3635. and eax,15
  3636. jne lpNA
  3637. jmp lpA
  3638. align 16
  3639. lpA:
  3640. movaps xmm2,[edx+ebx]
  3641. movaps xmm3,[edx+ebx+16]
  3642. maxps xmm2,xmm0
  3643. prefetchnta [edx+ebx+64]
  3644. maxps xmm3,xmm0
  3645. movaps [edi+ebx],xmm2
  3646. movaps [edi+ebx+16],xmm3
  3647. add ebx,16*2
  3648. jl lpA
  3649. jmp done
  3650. align 16
  3651. lpNA:
  3652. movups xmm2,[edx+ebx]
  3653. movups xmm3,[edx+ebx+16]
  3654. maxps xmm2,xmm0
  3655. prefetchnta [edx+ebx+64]
  3656. maxps xmm3,xmm0
  3657. movaps [edi+ebx],xmm2
  3658. movaps [edi+ebx+16],xmm3
  3659. add ebx,16*2
  3660. jl lpNA
  3661. done:
  3662. }
  3663. for( i = 0; i < pre; i++ ) {
  3664. if ( src[i] < min )
  3665. dst[i] = min;
  3666. else
  3667. dst[i] = src[i];
  3668. }
  3669. for( i = count - post; i < count; i++ ) {
  3670. if ( src[i] < min )
  3671. dst[i] = min;
  3672. else
  3673. dst[i] = src[i];
  3674. }
  3675. }
  3676. /*
  3677. ============
  3678. idSIMD_SSE::ClampMax
  3679. ============
  3680. */
  3681. void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
  3682. int i, pre, post;
  3683. __asm
  3684. {
  3685. movss xmm1,max
  3686. shufps xmm1,xmm1,0
  3687. KFLOATINITDS( dst, src, count, pre, post )
  3688. and eax,15
  3689. jne lpNA
  3690. jmp lpA
  3691. align 16
  3692. lpA:
  3693. movaps xmm2,[edx+ebx]
  3694. movaps xmm3,[edx+ebx+16]
  3695. minps xmm2,xmm1
  3696. prefetchnta [edx+ebx+64]
  3697. minps xmm3,xmm1
  3698. movaps [edi+ebx],xmm2
  3699. movaps [edi+ebx+16],xmm3
  3700. add ebx,16*2
  3701. jl lpA
  3702. jmp done
  3703. align 16
  3704. lpNA:
  3705. movups xmm2,[edx+ebx]
  3706. movups xmm3,[edx+ebx+16]
  3707. minps xmm2,xmm1
  3708. prefetchnta [edx+ebx+64]
  3709. minps xmm3,xmm1
  3710. movaps [edi+ebx],xmm2
  3711. movaps [edi+ebx+16],xmm3
  3712. add ebx,16*2
  3713. jl lpNA
  3714. done:
  3715. }
  3716. for( i = 0; i < pre; i++ ) {
  3717. if ( src[i] > max )
  3718. dst[i] = max;
  3719. else
  3720. dst[i] = src[i];
  3721. }
  3722. for( i = count - post; i < count; i++ ) {
  3723. if ( src[i] > max )
  3724. dst[i] = max;
  3725. else
  3726. dst[i] = src[i];
  3727. }
  3728. }
  3729. /*
  3730. ============
  3731. idSIMD_SSE::Zero16
  3732. ============
  3733. */
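// The *16 routines below assume 16-byte aligned buffers padded out to a
// multiple of 4 floats: add eax, 3 / shr eax, 2 rounds count up to whole
// 4-float groups, and the movaps accesses fault on unaligned pointers,
// which matches the padded, aligned storage idVecX/idMatX provide.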
  3734. void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
  3735. __asm {
  3736. mov edx, dst
  3737. mov eax, count
  3738. add eax, 3
  3739. shr eax, 2
  3740. jz doneZero16
  3741. shl eax, 4
  3742. add edx, eax
  3743. neg eax
  3744. xorps xmm0, xmm0
  3745. loopZero16:
  3746. movaps [edx+eax], xmm0
  3747. add eax, 16
  3748. jl loopZero16
  3749. doneZero16:
  3750. }
  3751. }
  3752. /*
  3753. ============
  3754. idSIMD_SSE::Negate16
  3755. ============
  3756. */
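// Negates in place by flipping the IEEE-754 sign bit: movss/shufps
// replicates the 0x80000000 constant SIMD_SP_signBitMask into all four
// lanes and xorps toggles four signs per iteration.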
  3757. void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
  3758. __asm {
  3759. mov edx, dst
  3760. mov eax, count
  3761. add eax, 3
  3762. shr eax, 2
  3763. jz doneNegate16
  3764. shl eax, 4
  3765. add edx, eax
  3766. neg eax
  3767. movss xmm0, SIMD_SP_signBitMask
  3768. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  3769. loopNegate16:
  3770. movaps xmm1, [edx+eax]
  3771. xorps xmm1, xmm0
  3772. movaps [edx+eax], xmm1
  3773. add eax, 16
  3774. jl loopNegate16
  3775. doneNegate16:
  3776. }
  3777. }
  3778. /*
  3779. ============
  3780. idSIMD_SSE::Copy16
  3781. ============
  3782. */
  3783. void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
  3784. __asm {
  3785. mov ecx, src
  3786. mov edx, dst
  3787. mov eax, count
  3788. add eax, 3
  3789. shr eax, 2
  3790. jz doneCopy16
  3791. shl eax, 4
  3792. add ecx, eax
  3793. add edx, eax
  3794. neg eax
  3795. loopCopy16:
  3796. movaps xmm0, [ecx+eax]
  3797. movaps [edx+eax], xmm0
  3798. add eax, 16
  3799. jl loopCopy16
  3800. doneCopy16:
  3801. }
  3802. }
  3803. /*
  3804. ============
  3805. idSIMD_SSE::Add16
  3806. ============
  3807. */
  3808. void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
  3809. __asm {
  3810. mov ecx, src1
  3811. mov edx, src2
  3812. mov esi, dst
  3813. mov eax, count
  3814. add eax, 3
  3815. shr eax, 2
  3816. jz doneAdd16
  3817. shl eax, 4
  3818. add esi, eax
  3819. add ecx, eax
  3820. add edx, eax
  3821. neg eax
  3822. loopAdd16:
  3823. movaps xmm0, [ecx+eax]
  3824. addps xmm0, [edx+eax]
  3825. movaps [esi+eax], xmm0
  3826. add eax, 16
  3827. jl loopAdd16
  3828. doneAdd16:
  3829. }
  3830. }
  3831. /*
  3832. ============
  3833. idSIMD_SSE::Sub16
  3834. ============
  3835. */
  3836. void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
  3837. __asm {
  3838. mov ecx, src1
  3839. mov edx, src2
  3840. mov esi, dst
  3841. mov eax, count
  3842. add eax, 3
  3843. shr eax, 2
  3844. jz doneSub16
  3845. shl eax, 4
  3846. add esi, eax
  3847. add ecx, eax
  3848. add edx, eax
  3849. neg eax
  3850. loopSub16:
  3851. movaps xmm0, [ecx+eax]
  3852. subps xmm0, [edx+eax]
  3853. movaps [esi+eax], xmm0
  3854. add eax, 16
  3855. jl loopSub16
  3856. doneSub16:
  3857. }
  3858. }
  3859. /*
  3860. ============
  3861. idSIMD_SSE::Mul16
  3862. ============
  3863. */
  3864. void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
  3865. __asm {
  3866. mov ecx, dst
  3867. mov edx, src1
  3868. mov eax, count
  3869. add eax, 3
  3870. shr eax, 2
  3871. jz doneMulScalar16
  3872. movss xmm1, constant
  3873. shl eax, 4
  3874. add ecx, eax
  3875. add edx, eax
  3876. neg eax
  3877. shufps xmm1, xmm1, 0x00
  3878. loopMulScalar16:
  3879. movaps xmm0, [edx+eax]
  3880. mulps xmm0, xmm1
  3881. movaps [ecx+eax], xmm0
  3882. add eax, 16
  3883. jl loopMulScalar16
  3884. doneMulScalar16:
  3885. }
  3886. }
  3887. /*
  3888. ============
  3889. idSIMD_SSE::AddAssign16
  3890. ============
  3891. */
  3892. void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
  3893. __asm {
  3894. mov ecx, dst
  3895. mov edx, src
  3896. mov eax, count
  3897. add eax, 3
  3898. shr eax, 2
  3899. jz doneAddAssign16
  3900. shl eax, 4
  3901. add ecx, eax
  3902. add edx, eax
  3903. neg eax
  3904. loopAddAssign16:
  3905. movaps xmm0, [ecx+eax]
  3906. addps xmm0, [edx+eax]
  3907. movaps [ecx+eax], xmm0
  3908. add eax, 16
  3909. jl loopAddAssign16
  3910. doneAddAssign16:
  3911. }
  3912. }
  3913. /*
  3914. ============
  3915. idSIMD_SSE::SubAssign16
  3916. ============
  3917. */
  3918. void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
  3919. __asm {
  3920. mov ecx, dst
  3921. mov edx, src
  3922. mov eax, count
  3923. add eax, 3
  3924. shr eax, 2
  3925. jz doneSubAssign16
  3926. shl eax, 4
  3927. add ecx, eax
  3928. add edx, eax
  3929. neg eax
  3930. loopSubAssign16:
  3931. movaps xmm0, [ecx+eax]
  3932. subps xmm0, [edx+eax]
  3933. movaps [ecx+eax], xmm0
  3934. add eax, 16
  3935. jl loopSubAssign16
  3936. doneSubAssign16:
  3937. }
  3938. }
  3939. /*
  3940. ============
  3941. idSIMD_SSE::MulAssign16
  3942. ============
  3943. */
  3944. void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
  3945. __asm {
  3946. mov ecx, dst
  3947. mov eax, count
  3948. add eax, 3
  3949. shr eax, 2
  3950. jz doneMulAssign16
  3951. movss xmm1, constant
  3952. shl eax, 4
  3953. add ecx, eax
  3954. neg eax
  3955. shufps xmm1, xmm1, 0x00
  3956. loopMulAssign16:
  3957. movaps xmm0, [ecx+eax]
  3958. mulps xmm0, xmm1
  3959. movaps [ecx+eax], xmm0
  3960. add eax, 16
  3961. jl loopMulAssign16
  3962. doneMulAssign16:
  3963. }
  3964. }
  3965. /*
  3966. ============
  3967. idSIMD_SSE::MatX_MultiplyVecX
  3968. optimizes the following matrix multiplications:
  3969. NxN * Nx1
  3970. Nx6 * 6x1
  3971. 6xN * Nx1
  3972. with N in the range [1-6]
  3973. ============
  3974. */
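// Each specialized case below writes its results only through the
// STORE1/STORE2LO/STORE2HI/STORE4 macros, and the generic fallbacks assign
// through STOREC. MatX_MultiplyAddVecX further down reuses the identical
// kernels by redefining these macros, so only the final store step differs
// between the = and += variants.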
  3975. void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  3976. #define STORE1( offset, reg1, reg2 ) \
  3977. __asm movss [eax+offset], reg1
  3978. #define STORE2LO( offset, reg1, reg2 ) \
  3979. __asm movlps [eax+offset], reg1
  3980. #define STORE2HI( offset, reg1, reg2 ) \
  3981. __asm movhps [eax+offset], reg1
  3982. #define STORE4( offset, reg1, reg2 ) \
  3983. __asm movlps [eax+offset], reg1 \
  3984. __asm movhps [eax+offset+8], reg1
  3985. #define STOREC =
  3986. int numRows;
  3987. const float *mPtr, *vPtr;
  3988. float *dstPtr;
  3989. assert( vec.GetSize() >= mat.GetNumColumns() );
  3990. assert( dst.GetSize() >= mat.GetNumRows() );
  3991. mPtr = mat.ToFloatPtr();
  3992. vPtr = vec.ToFloatPtr();
  3993. dstPtr = dst.ToFloatPtr();
  3994. numRows = mat.GetNumRows();
  3995. switch( mat.GetNumColumns() ) {
  3996. case 1: {
  3997. switch( numRows ) {
  3998. case 1: { // 1x1 * 1x1
  3999. __asm {
  4000. mov esi, vPtr
  4001. mov edi, mPtr
  4002. mov eax, dstPtr
  4003. movss xmm0, [esi]
  4004. mulss xmm0, [edi]
  4005. STORE1( 0, xmm0, xmm1 )
  4006. }
  4007. return;
  4008. }
  4009. case 6: { // 6x1 * 1x1
  4010. __asm {
  4011. mov esi, vPtr
  4012. mov edi, mPtr
  4013. mov eax, dstPtr
  4014. movss xmm0, [esi]
  4015. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  4016. movaps xmm1, xmm0
  4017. mulps xmm0, [edi]
  4018. mulps xmm1, [edi+16]
  4019. STORE4( 0, xmm0, xmm2 )
  4020. STORE2LO( 16, xmm1, xmm2 )
  4021. }
  4022. return;
  4023. }
  4024. default: {
  4025. for ( int i = 0; i < numRows; i++ ) {
  4026. dstPtr[i] STOREC mPtr[0] * vPtr[0];
  4027. mPtr++;
  4028. }
  4029. return;
  4030. }
  4031. }
  4032. break;
  4033. }
  4034. case 2: {
  4035. switch( numRows ) {
  4036. case 2: { // 2x2 * 2x1
  4037. __asm {
  4038. mov esi, vPtr
  4039. mov edi, mPtr
  4040. mov eax, dstPtr
  4041. movss xmm0, [esi]
  4042. movss xmm1, [esi+4]
  4043. movss xmm2, [edi]
  4044. mulss xmm2, xmm0
  4045. movss xmm3, [edi+4]
  4046. mulss xmm3, xmm1
  4047. addss xmm2, xmm3
  4048. STORE1( 0, xmm2, xmm4 )
  4049. mulss xmm0, [edi+8]
  4050. mulss xmm1, [edi+8+4]
  4051. addss xmm0, xmm1
  4052. STORE1( 4, xmm0, xmm4 )
  4053. }
  4054. return;
  4055. }
  4056. case 6: { // 6x2 * 2x1
  4057. __asm {
  4058. mov esi, vPtr
  4059. mov edi, mPtr
  4060. mov eax, dstPtr
  4061. movlps xmm7, [esi]
  4062. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
  4063. movaps xmm0, [edi]
  4064. mulps xmm0, xmm7
  4065. movaps xmm1, [edi+16]
  4066. mulps xmm1, xmm7
  4067. movaps xmm2, xmm0
  4068. shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
  4069. shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
  4070. movaps xmm3, [edi+32]
  4071. addps xmm0, xmm2
  4072. mulps xmm3, xmm7
  4073. STORE4( 0, xmm0, xmm4 )
  4074. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
  4075. movhlps xmm1, xmm3
  4076. addps xmm3, xmm1
  4077. STORE2LO( 16, xmm3, xmm4 )
  4078. }
  4079. return;
  4080. }
  4081. default: {
  4082. for ( int i = 0; i < numRows; i++ ) {
  4083. dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
  4084. mPtr += 2;
  4085. }
  4086. return;
  4087. }
  4088. }
  4089. break;
  4090. }
  4091. case 3: {
  4092. switch( numRows ) {
  4093. case 3: { // 3x3 * 3x1
  4094. __asm {
  4095. mov esi, vPtr
  4096. mov edi, mPtr
  4097. mov eax, dstPtr
  4098. movss xmm0, [esi]
  4099. movss xmm4, [edi]
  4100. mulss xmm4, xmm0
  4101. movss xmm1, [esi+4]
  4102. movss xmm5, [edi+4]
  4103. mulss xmm5, xmm1
  4104. addss xmm4, xmm5
  4105. movss xmm2, [esi+8]
  4106. movss xmm6, [edi+8]
  4107. mulss xmm6, xmm2
  4108. addss xmm4, xmm6
  4109. movss xmm3, [edi+12]
  4110. mulss xmm3, xmm0
  4111. STORE1( 0, xmm4, xmm7 );
  4112. movss xmm5, [edi+12+4]
  4113. mulss xmm5, xmm1
  4114. addss xmm3, xmm5
  4115. movss xmm6, [edi+12+8]
  4116. mulss xmm6, xmm2
  4117. addss xmm3, xmm6
  4118. mulss xmm0, [edi+24]
  4119. mulss xmm1, [edi+24+4]
  4120. STORE1( 4, xmm3, xmm7 );
  4121. addss xmm0, xmm1
  4122. mulss xmm2, [edi+24+8]
  4123. addss xmm0, xmm2
  4124. STORE1( 8, xmm0, xmm7 );
  4125. }
  4126. return;
  4127. }
  4128. case 6: { // 6x3 * 3x1
  4129. __asm {
  4130. mov esi, vPtr
  4131. mov edi, mPtr
  4132. mov eax, dstPtr
  4133. movss xmm5, [esi]
  4134. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
  4135. movss xmm6, [esi+4]
  4136. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  4137. movss xmm7, [esi+8]
  4138. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  4139. movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
  4140. movlps xmm1, [edi+4*4]
  4141. shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
  4142. movlps xmm2, [edi+6*4]
  4143. movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
  4144. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
  4145. mulps xmm0, xmm5
  4146. movlps xmm3, [edi+10*4]
  4147. shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
  4148. movaps xmm3, xmm1
  4149. shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
  4150. mulps xmm1, xmm6
  4151. shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
  4152. mulps xmm3, xmm7
  4153. addps xmm0, xmm1
  4154. addps xmm0, xmm3
  4155. STORE4( 0, xmm0, xmm4 )
  4156. movss xmm1, [edi+12*4]
  4157. mulss xmm1, xmm5
  4158. movss xmm2, [edi+13*4]
  4159. mulss xmm2, xmm6
  4160. movss xmm3, [edi+14*4]
  4161. mulss xmm3, xmm7
  4162. addss xmm1, xmm2
  4163. addss xmm1, xmm3
  4164. STORE1( 16, xmm1, xmm4 )
  4165. mulss xmm5, [edi+15*4]
  4166. mulss xmm6, [edi+16*4]
  4167. mulss xmm7, [edi+17*4]
  4168. addss xmm5, xmm6
  4169. addss xmm5, xmm7
  4170. STORE1( 20, xmm5, xmm4 )
  4171. }
  4172. return;
  4173. }
  4174. default: {
  4175. for ( int i = 0; i < numRows; i++ ) {
  4176. dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
  4177. mPtr += 3;
  4178. }
  4179. return;
  4180. }
  4181. }
  4182. break;
  4183. }
  4184. case 4: {
  4185. switch( numRows ) {
  4186. case 4: { // 4x4 * 4x1
  4187. __asm {
  4188. mov esi, vPtr
  4189. mov edi, mPtr
  4190. mov eax, dstPtr
  4191. movlps xmm6, qword ptr [esi ]
  4192. movlps xmm0, qword ptr [edi ]
  4193. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
  4194. movhps xmm0, qword ptr [edi+16]
  4195. mulps xmm0, xmm6
  4196. movlps xmm7, qword ptr [esi+ 8]
  4197. movlps xmm2, qword ptr [edi+ 8]
  4198. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
  4199. movhps xmm2, qword ptr [edi+24]
  4200. mulps xmm2, xmm7
  4201. movlps xmm1, qword ptr [edi+32]
  4202. movhps xmm1, qword ptr [edi+48]
  4203. mulps xmm1, xmm6
  4204. movlps xmm3, qword ptr [edi+40]
  4205. addps xmm0, xmm2
  4206. movhps xmm3, qword ptr [edi+56]
  4207. mulps xmm3, xmm7
  4208. movaps xmm4, xmm0
  4209. addps xmm1, xmm3
  4210. shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
  4211. shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
  4212. addps xmm0, xmm4
  4213. STORE4( 0, xmm0, xmm2 )
  4214. }
  4215. return;
  4216. }
  4217. case 6: { // 6x4 * 4x1
  4218. __asm {
  4219. mov esi, vPtr
  4220. mov edi, mPtr
  4221. mov eax, dstPtr
  4222. movlps xmm6, qword ptr [esi+ 0]
  4223. movlps xmm0, qword ptr [edi+ 0]
  4224. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
  4225. movhps xmm0, qword ptr [edi+16]
  4226. mulps xmm0, xmm6
  4227. movlps xmm7, qword ptr [esi+ 8]
  4228. movlps xmm2, qword ptr [edi+ 8]
  4229. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
  4230. movhps xmm2, qword ptr [edi+24]
  4231. mulps xmm2, xmm7
  4232. movlps xmm1, qword ptr [edi+32]
  4233. movhps xmm1, qword ptr [edi+48]
  4234. mulps xmm1, xmm6
  4235. movlps xmm3, qword ptr [edi+40]
  4236. addps xmm0, xmm2
  4237. movhps xmm3, qword ptr [edi+56]
  4238. mulps xmm3, xmm7
  4239. movaps xmm4, xmm0
  4240. addps xmm1, xmm3
  4241. shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
  4242. shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
  4243. addps xmm0, xmm4
  4244. movlps xmm1, qword ptr [edi+64]
  4245. movhps xmm1, qword ptr [edi+80]
  4246. STORE4( 0, xmm0, xmm4 )
  4247. mulps xmm1, xmm6
  4248. movlps xmm2, qword ptr [edi+72]
  4249. movhps xmm2, qword ptr [edi+88]
  4250. mulps xmm2, xmm7
  4251. addps xmm1, xmm2
  4252. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
  4253. movhlps xmm3, xmm1
  4254. addps xmm1, xmm3
  4255. STORE2LO( 16, xmm1, xmm4 )
  4256. }
  4257. return;
  4258. }
  4259. default: {
  4260. for ( int i = 0; i < numRows; i++ ) {
  4261. dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
  4262. mPtr += 4;
  4263. }
  4264. return;
  4265. }
  4266. }
  4267. break;
  4268. }
  4269. case 5: {
  4270. switch( numRows ) {
  4271. case 5: { // 5x5 * 5x1
  4272. __asm {
  4273. mov esi, vPtr
  4274. mov edi, mPtr
  4275. mov eax, dstPtr
  4276. movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
  4277. movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
2278. movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
  4279. movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
  4280. movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
  4281. shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
  4282. movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
  4283. movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
  4284. movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
  4285. shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
  4286. movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
  4287. movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
  4288. movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
  4289. shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
  4290. movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
  4291. movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
  4292. movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
  4293. movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
  4294. shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
2295. movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
  4296. shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
  4297. movss xmm7, [esi+0*4]
  4298. shufps xmm7, xmm7, 0
  4299. mulps xmm0, xmm7
  4300. movss xmm5, [esi+1*4]
  4301. shufps xmm5, xmm5, 0
  4302. mulps xmm1, xmm5
  4303. addps xmm0, xmm1
  4304. movss xmm6, [esi+2*4]
  4305. shufps xmm6, xmm6, 0
  4306. mulps xmm2, xmm6
  4307. addps xmm0, xmm2
  4308. movss xmm1, [esi+3*4]
  4309. shufps xmm1, xmm1, 0
  4310. mulps xmm3, xmm1
  4311. addps xmm0, xmm3
  4312. movss xmm2, [esi+4*4]
  4313. shufps xmm2, xmm2, 0
  4314. mulps xmm4, xmm2
  4315. addps xmm0, xmm4
  4316. mulss xmm7, [edi+20*4]
  4317. mulss xmm5, [edi+21*4]
  4318. addps xmm7, xmm5
  4319. mulss xmm6, [edi+22*4]
  4320. addps xmm7, xmm6
  4321. mulss xmm1, [edi+23*4]
  4322. addps xmm7, xmm1
  4323. mulss xmm2, [edi+24*4]
  4324. addps xmm7, xmm2
  4325. STORE4( 0, xmm0, xmm3 )
  4326. STORE1( 16, xmm7, xmm4 )
  4327. }
  4328. return;
  4329. }
  4330. case 6: { // 6x5 * 5x1
  4331. __asm {
  4332. mov esi, vPtr
  4333. mov edi, mPtr
  4334. mov eax, dstPtr
  4335. movlps xmm6, [esi]
  4336. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
  4337. movlps xmm7, [esi+8]
  4338. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
  4339. movlps xmm0, [edi]
  4340. movhps xmm3, [edi+8]
  4341. movaps xmm1, [edi+16]
  4342. movlps xmm2, [edi+32]
  4343. shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
  4344. shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
  4345. shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
  4346. mulps xmm0, xmm6
  4347. mulps xmm3, xmm7
  4348. movlps xmm2, [edi+40]
  4349. addps xmm0, xmm3 // xmm0 + xmm1
  4350. movhps xmm5, [edi+40+8]
  4351. movlps xmm3, [edi+40+16]
  4352. movhps xmm3, [edi+40+24]
  4353. movlps xmm4, [edi+40+32]
  4354. shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
  4355. shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
  4356. shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
  4357. mulps xmm2, xmm6
  4358. mulps xmm5, xmm7
  4359. addps xmm2, xmm5 // xmm2 + xmm3
  4360. movss xmm5, [esi+16]
  4361. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
  4362. movaps xmm4, xmm0
  4363. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
  4364. shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
  4365. shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
  4366. addps xmm0, xmm4
  4367. mulps xmm1, xmm5
  4368. addps xmm0, xmm1
  4369. STORE4( 0, xmm0, xmm2 )
  4370. movlps xmm4, [edi+80]
  4371. movhps xmm3, [edi+80+8]
  4372. movaps xmm1, [edi+80+16]
  4373. movlps xmm2, [edi+80+32]
  4374. shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
  4375. shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
  4376. shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
  4377. mulps xmm4, xmm6
  4378. mulps xmm3, xmm7
  4379. mulps xmm1, xmm5
  4380. addps xmm4, xmm3 // xmm4 + xmm1
  4381. shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
  4382. shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
  4383. addps xmm4, xmm1
  4384. shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
  4385. addps xmm4, xmm1
  4386. STORE2LO( 16, xmm4, xmm2 )
  4387. }
  4388. return;
  4389. }
  4390. default: {
  4391. for ( int i = 0; i < numRows; i++ ) {
  4392. dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
  4393. mPtr += 5;
  4394. }
  4395. return;
  4396. }
  4397. }
  4398. break;
  4399. }
  4400. case 6: {
  4401. switch( numRows ) {
  4402. case 1: { // 1x6 * 6x1
  4403. __asm {
  4404. mov esi, vPtr
  4405. mov edi, mPtr
  4406. mov eax, dstPtr
  4407. movss xmm0, [esi]
  4408. mulss xmm0, [edi]
  4409. movss xmm1, [esi+4]
  4410. mulss xmm1, [edi+4]
  4411. movss xmm2, [esi+8]
  4412. addss xmm0, xmm1
  4413. mulss xmm2, [edi+8]
  4414. movss xmm3, [esi+12]
  4415. addss xmm0, xmm2
  4416. mulss xmm3, [edi+12]
  4417. movss xmm4, [esi+16]
  4418. addss xmm0, xmm3
  4419. mulss xmm4, [edi+16]
  4420. movss xmm5, [esi+20]
  4421. addss xmm0, xmm4
  4422. mulss xmm5, [edi+20]
  4423. movss xmm6, [esi+24]
  4424. addss xmm0, xmm5
  4425. mulss xmm6, [edi+24]
  4426. addss xmm0, xmm6
  4427. STORE1( 0, xmm0, xmm7 )
  4428. }
  4429. return;
  4430. }
  4431. case 2: { // 2x6 * 6x1
  4432. __asm {
  4433. mov esi, vPtr
  4434. mov edi, mPtr
  4435. mov eax, dstPtr
  4436. // load idVecX
  4437. movlps xmm4, [esi]
  4438. movhps xmm4, [esi+8]
  4439. movlps xmm5, [esi+16]
  4440. movlhps xmm5, xmm4
  4441. movhlps xmm6, xmm4
  4442. movlhps xmm6, xmm5
  4443. // row 0 and 1
  4444. movaps xmm0, [edi]
  4445. movaps xmm1, [edi+16]
  4446. movaps xmm2, [edi+32]
  4447. mulps xmm0, xmm4
  4448. mulps xmm1, xmm5
  4449. mulps xmm2, xmm6
  4450. movhlps xmm3, xmm0
  4451. movlhps xmm3, xmm2
  4452. addps xmm1, xmm3
  4453. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
  4454. addps xmm1, xmm0
  4455. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
  4456. movhlps xmm0, xmm1
  4457. addps xmm0, xmm1
  4458. STORE2LO( 0, xmm0, xmm3 )
  4459. }
  4460. return;
  4461. }
  4462. case 3: { // 3x6 * 6x1
  4463. __asm {
  4464. mov esi, vPtr
  4465. mov edi, mPtr
  4466. mov eax, dstPtr
  4467. // load idVecX
  4468. movlps xmm4, [esi]
  4469. movhps xmm4, [esi+8]
  4470. movlps xmm5, [esi+16]
  4471. movlhps xmm5, xmm4
  4472. movhlps xmm6, xmm4
  4473. movlhps xmm6, xmm5
  4474. // row 0 and 1
  4475. movaps xmm0, [edi]
  4476. movaps xmm1, [edi+16]
  4477. movaps xmm2, [edi+32]
  4478. mulps xmm0, xmm4
  4479. mulps xmm1, xmm5
  4480. mulps xmm2, xmm6
  4481. movhlps xmm3, xmm0
  4482. movlhps xmm3, xmm2
  4483. addps xmm1, xmm3
  4484. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
  4485. addps xmm1, xmm0
  4486. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
  4487. movhlps xmm0, xmm1
  4488. addps xmm0, xmm1
  4489. STORE2LO( 0, xmm0, xmm3 )
  4490. // row 2
  4491. movaps xmm0, [edi+48]
  4492. movaps xmm1, [edi+48+16]
  4493. mulps xmm0, xmm4
  4494. mulps xmm1, xmm5
  4495. addps xmm0, xmm1
  4496. movhlps xmm1, xmm0
  4497. addps xmm0, xmm1
  4498. movaps xmm1, xmm0
  4499. shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
  4500. addss xmm0, xmm1
  4501. STORE1( 8, xmm0, xmm3 )
  4502. }
  4503. return;
  4504. }
  4505. case 4: { // 4x6 * 6x1
  4506. __asm {
  4507. mov esi, vPtr
  4508. mov edi, mPtr
  4509. mov eax, dstPtr
  4510. // load idVecX
  4511. movlps xmm4, [esi]
  4512. movhps xmm4, [esi+8]
  4513. movlps xmm5, [esi+16]
  4514. movlhps xmm5, xmm4
  4515. movhlps xmm6, xmm4
  4516. movlhps xmm6, xmm5
  4517. // row 0 and 1
  4518. movaps xmm0, [edi]
  4519. movaps xmm1, [edi+16]
  4520. movaps xmm2, [edi+32]
  4521. mulps xmm0, xmm4
  4522. mulps xmm1, xmm5
  4523. mulps xmm2, xmm6
  4524. movhlps xmm7, xmm0
  4525. movlhps xmm7, xmm2
  4526. addps xmm7, xmm1
  4527. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
  4528. addps xmm7, xmm0
  4529. // row 2 and 3
  4530. movaps xmm0, [edi+48]
  4531. movaps xmm1, [edi+48+16]
  4532. movaps xmm2, [edi+48+32]
  4533. mulps xmm0, xmm4
  4534. mulps xmm1, xmm5
  4535. mulps xmm2, xmm6
  4536. movhlps xmm3, xmm0
  4537. movlhps xmm3, xmm2
  4538. addps xmm1, xmm3
  4539. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
  4540. addps xmm1, xmm0
  4541. // last 4 additions for the first 4 rows and store result
  4542. movaps xmm0, xmm7
  4543. shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
  4544. shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
  4545. addps xmm0, xmm7
  4546. STORE4( 0, xmm0, xmm4 )
  4547. }
  4548. return;
  4549. }
  4550. case 5: { // 5x6 * 6x1
  4551. __asm {
  4552. mov esi, vPtr
  4553. mov edi, mPtr
  4554. mov eax, dstPtr
  4555. // load idVecX
  4556. movlps xmm4, [esi]
  4557. movhps xmm4, [esi+8]
  4558. movlps xmm5, [esi+16]
  4559. movlhps xmm5, xmm4
  4560. movhlps xmm6, xmm4
  4561. movlhps xmm6, xmm5
  4562. // row 0 and 1
  4563. movaps xmm0, [edi]
  4564. movaps xmm1, [edi+16]
  4565. movaps xmm2, [edi+32]
  4566. mulps xmm0, xmm4
  4567. mulps xmm1, xmm5
  4568. mulps xmm2, xmm6
  4569. movhlps xmm7, xmm0
  4570. movlhps xmm7, xmm2
  4571. addps xmm7, xmm1
  4572. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
  4573. addps xmm7, xmm0
  4574. // row 2 and 3
  4575. movaps xmm0, [edi+48]
  4576. movaps xmm1, [edi+48+16]
  4577. movaps xmm2, [edi+48+32]
  4578. mulps xmm0, xmm4
  4579. mulps xmm1, xmm5
  4580. mulps xmm2, xmm6
  4581. movhlps xmm3, xmm0
  4582. movlhps xmm3, xmm2
  4583. addps xmm1, xmm3
  4584. shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
  4585. addps xmm1, xmm0
  4586. // last 4 additions for the first 4 rows and store result
  4587. movaps xmm0, xmm7
  4588. shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
  4589. shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
  4590. addps xmm0, xmm7
  4591. STORE4( 0, xmm0, xmm3 )
  4592. // row 5
  4593. movaps xmm0, [edi+96]
  4594. movaps xmm1, [edi+96+16]
  4595. mulps xmm0, xmm4
  4596. mulps xmm1, xmm5
  4597. addps xmm0, xmm1
  4598. movhlps xmm1, xmm0
  4599. addps xmm0, xmm1
  4600. movaps xmm1, xmm0
  4601. shufps xmm1, xmm1, 0x01
  4602. addss xmm0, xmm1
  4603. STORE1( 16, xmm0, xmm3 )
  4604. }
  4605. return;
  4606. }
  4607. case 6: { // 6x6 * 6x1
  4608. __asm {
  4609. mov esi, vPtr
  4610. mov edi, mPtr
  4611. mov eax, dstPtr
  4612. movlps xmm7, qword ptr [esi]
  4613. movlps xmm6, qword ptr [esi+8]
  4614. shufps xmm7, xmm7, 0x44
  4615. shufps xmm6, xmm6, 0x44
  4616. movlps xmm0, qword ptr [edi ]
  4617. movhps xmm0, qword ptr [edi+ 24]
  4618. mulps xmm0, xmm7
  4619. movlps xmm3, qword ptr [edi+ 8]
  4620. movhps xmm3, qword ptr [edi+ 32]
  4621. mulps xmm3, xmm6
  4622. movlps xmm1, qword ptr [edi+ 48]
  4623. movhps xmm1, qword ptr [edi+ 72]
  4624. mulps xmm1, xmm7
  4625. movlps xmm2, qword ptr [edi+ 96]
  4626. movhps xmm2, qword ptr [edi+120]
  4627. mulps xmm2, xmm7
  4628. movlps xmm4, qword ptr [edi+ 56]
  4629. movhps xmm4, qword ptr [edi+ 80]
  4630. movlps xmm5, qword ptr [edi+104]
  4631. movhps xmm5, qword ptr [edi+128]
  4632. mulps xmm4, xmm6
  4633. movlps xmm7, qword ptr [esi+16]
  4634. addps xmm0, xmm3
  4635. shufps xmm7, xmm7, 0x44
  4636. mulps xmm5, xmm6
  4637. addps xmm1, xmm4
  4638. movlps xmm3, qword ptr [edi+ 16]
  4639. movhps xmm3, qword ptr [edi+ 40]
  4640. addps xmm2, xmm5
  4641. movlps xmm4, qword ptr [edi+ 64]
  4642. movhps xmm4, qword ptr [edi+ 88]
  4643. mulps xmm3, xmm7
  4644. movlps xmm5, qword ptr [edi+112]
  4645. movhps xmm5, qword ptr [edi+136]
  4646. addps xmm0, xmm3
  4647. mulps xmm4, xmm7
  4648. mulps xmm5, xmm7
  4649. addps xmm1, xmm4
  4650. addps xmm2, xmm5
  4651. movaps xmm6, xmm0
  4652. shufps xmm0, xmm1, 0x88
  4653. shufps xmm6, xmm1, 0xDD
  4654. movaps xmm7, xmm2
  4655. shufps xmm7, xmm2, 0x88
  4656. shufps xmm2, xmm2, 0xDD
  4657. addps xmm0, xmm6
  4658. addps xmm2, xmm7
  4659. STORE4( 0, xmm0, xmm3 )
  4660. STORE2LO( 16, xmm2, xmm4 )
  4661. }
  4662. return;
  4663. }
  4664. default: {
  4665. for ( int i = 0; i < numRows; i++ ) {
  4666. dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  4667. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
  4668. mPtr += 6;
  4669. }
  4670. return;
  4671. }
  4672. }
  4673. break;
  4674. }
  4675. default: {
  4676. int numColumns = mat.GetNumColumns();
  4677. for ( int i = 0; i < numRows; i++ ) {
  4678. float sum = mPtr[0] * vPtr[0];
  4679. for ( int j = 1; j < numColumns; j++ ) {
  4680. sum += mPtr[j] * vPtr[j];
  4681. }
  4682. dstPtr[i] STOREC sum;
  4683. mPtr += numColumns;
  4684. }
  4685. break;
  4686. }
  4687. }
  4688. #undef STOREC
  4689. #undef STORE4
  4690. #undef STORE2HI
  4691. #undef STORE2LO
  4692. #undef STORE1
  4693. }
  4694. /*
  4695. ============
  4696. idSIMD_SSE::MatX_MultiplyAddVecX
  4697. optimizes the following matrix multiplications:
  4698. NxN * Nx1
  4699. Nx6 * 6x1
  4700. 6xN * Nx1
  4701. with N in the range [1-6]
  4702. ============
  4703. */
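// Same kernels as MatX_MultiplyVecX above; only the STORE* macros differ,
// loading the current dst value into the scratch register (reg2), adding
// the freshly computed terms, and writing back, which turns every case
// into dst += mat * vec without touching the arithmetic.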
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm addss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC +=
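// the STORE* macros above are the read-modify-write tails of each SSE path: they
// load 1, 2 or 4 floats of the current dst contents through eax, add the computed
// result held in reg1 (reg2 is a scratch register), and write the sums back;
// STOREC is the matching C operator used by the scalar fallback loops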
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 + xmm1
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 + xmm3
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 + xmm1
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
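// after the six loads/moves above the three registers hold rotated copies of the
// 6-float vector: xmm4 = v0 v1 v2 v3, xmm5 = v4 v5 v0 v1, xmm6 = v2 v3 v4 v5,
// so two consecutive 6-float matrix rows can be consumed as three aligned 4-float chunks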
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 5
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
/*
============
idSIMD_SSE::MatX_MultiplySubVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
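// MatX_MultiplySubVecX mirrors MatX_MultiplyAddVecX above block for block; only the
// accumulation direction changes: the STORE* macros below use subss/subps instead of
// addss/addps and STOREC expands to -=, i.e. dst[i] -= dot( row_i(mat), vec )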
void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm subss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC -=
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 + xmm1
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 + xmm3
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 + xmm1
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 5
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
/*
============
idSIMD_SSE::MatX_TransposeMultiplyVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
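/*
	Reference semantics — a minimal scalar sketch, not part of the engine (the
	_Generic name is hypothetical): the transpose multiply computes dst = mat^T * vec,
	so dst has one entry per matrix column and the inner loop walks a column of the
	row-major matrix with stride numColumns:

	void MatX_TransposeMultiplyVecX_Generic( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
		const float *m = mat.ToFloatPtr();
		const float *v = vec.ToFloatPtr();
		float *d = dst.ToFloatPtr();
		int numColumns = mat.GetNumColumns();
		for ( int i = 0; i < numColumns; i++ ) {
			float sum = 0.0f;
			for ( int j = 0; j < mat.GetNumRows(); j++ ) {
				sum += m[j * numColumns + i] * v[j];	// column i dotted with vec
			}
			d[i] = sum;
		}
	}
*/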
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss [eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps [eax+offset], reg1
#define STORE4( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1 \
__asm movhps [eax+offset+8], reg1
#define STOREC =
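// note: unlike the add/sub variants, these STORE* macros overwrite dst with plain
// stores (no read-modify-write), matching STOREC expanding to =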
  6190. int numColumns;
  6191. const float *mPtr, *vPtr;
  6192. float *dstPtr;
  6193. assert( vec.GetSize() >= mat.GetNumRows() );
  6194. assert( dst.GetSize() >= mat.GetNumColumns() );
  6195. mPtr = mat.ToFloatPtr();
  6196. vPtr = vec.ToFloatPtr();
  6197. dstPtr = dst.ToFloatPtr();
  6198. numColumns = mat.GetNumColumns();
  6199. switch( mat.GetNumRows() ) {
  6200. case 1:
  6201. switch( numColumns ) {
  6202. case 6: { // 1x6 * 1x1
  6203. __asm {
  6204. mov esi, vPtr
  6205. mov edi, mPtr
  6206. mov eax, dstPtr
  6207. movss xmm0, [esi]
  6208. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  6209. movaps xmm1, xmm0
  6210. mulps xmm0, [edi]
  6211. mulps xmm1, [edi+16]
  6212. STORE4( 0, xmm0, xmm2 )
  6213. STORE2LO( 16, xmm1, xmm3 )
  6214. }
  6215. return;
  6216. }
  6217. default: {
  6218. for ( int i = 0; i < numColumns; i++ ) {
  6219. dstPtr[i] STOREC *(mPtr) * vPtr[0];
  6220. mPtr++;
  6221. }
  6222. return;
  6223. }
  6224. }
  6225. break;
  6226. case 2:
  6227. switch( numColumns ) {
  6228. case 6: { // 2x6 * 2x1
  6229. __asm {
  6230. mov esi, vPtr
  6231. mov edi, mPtr
  6232. mov eax, dstPtr
  6233. movlps xmm0, [esi]
  6234. movaps xmm1, xmm0
  6235. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  6236. shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
  6237. movaps xmm2, [edi]
  6238. mulps xmm2, xmm0
  6239. movlps xmm3, [edi+24]
  6240. movhps xmm3, [edi+32]
  6241. mulps xmm3, xmm1
  6242. addps xmm2, xmm3
  6243. shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  6244. movlps xmm4, [edi+16]
  6245. movhps xmm4, [edi+40]
  6246. mulps xmm4, xmm0
  6247. movhlps xmm3, xmm4
  6248. addps xmm3, xmm4
  6249. STORE4( 0, xmm2, xmm5 )
  6250. STORE2LO( 16, xmm3, xmm6 )
  6251. }
  6252. return;
  6253. }
  6254. default: {
  6255. for ( int i = 0; i < numColumns; i++ ) {
  6256. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
  6257. mPtr++;
  6258. }
  6259. return;
  6260. }
  6261. }
  6262. break;
  6263. case 3:
  6264. switch( numColumns ) {
  6265. case 6: { // 3x6 * 3x1
  6266. __asm {
  6267. mov esi, vPtr
  6268. mov edi, mPtr
  6269. mov eax, dstPtr
  6270. movlps xmm0, [esi+0*4]
  6271. movss xmm1, [esi+2*4]
  6272. movlps xmm3, [edi+(0*6+0)*4]
  6273. movhps xmm3, [edi+(0*6+2)*4]
  6274. movaps xmm4, xmm0
  6275. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  6276. mulps xmm3, xmm4
  6277. movlps xmm5, [edi+(1*6+0)*4]
  6278. movhps xmm5, [edi+(1*6+2)*4]
  6279. movaps xmm6, xmm0
  6280. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6281. mulps xmm5, xmm6
  6282. addps xmm3, xmm5
  6283. movlps xmm4, [edi+(2*6+0)*4]
  6284. movhps xmm4, [edi+(2*6+2)*4]
  6285. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  6286. mulps xmm4, xmm1
  6287. addps xmm3, xmm4
  6288. STORE4( 0, xmm3, xmm7 )
  6289. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6290. movlps xmm3, [edi+(0*6+4)*4]
  6291. movhps xmm3, [edi+(1*6+4)*4]
  6292. mulps xmm3, xmm0
  6293. movhlps xmm4, xmm3
  6294. addps xmm3, xmm4
  6295. movlps xmm5, [edi+(2*6+4)*4]
  6296. mulps xmm5, xmm1
  6297. addps xmm3, xmm5
  6298. STORE2LO( 16, xmm3, xmm7 )
  6299. }
  6300. return;
  6301. }
  6302. default: {
  6303. for ( int i = 0; i < numColumns; i++ ) {
  6304. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
  6305. mPtr++;
  6306. }
  6307. return;
  6308. }
  6309. }
  6310. break;
  6311. case 4:
  6312. switch( numColumns ) {
  6313. case 6: { // 4x6 * 4x1
  6314. __asm {
  6315. mov esi, vPtr
  6316. mov edi, mPtr
  6317. mov eax, dstPtr
  6318. movlps xmm0, [esi+0*4]
  6319. movlps xmm1, [esi+2*4]
  6320. movaps xmm3, xmm0
  6321. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  6322. mulps xmm3, [edi+(0*6+0)*4]
  6323. movlps xmm5, [edi+(1*6+0)*4]
  6324. movhps xmm5, [edi+(1*6+2)*4]
  6325. movaps xmm6, xmm0
  6326. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6327. mulps xmm5, xmm6
  6328. addps xmm3, xmm5
  6329. movlps xmm4, [edi+(2*6+0)*4]
  6330. movhps xmm4, [edi+(2*6+2)*4]
  6331. movaps xmm6, xmm1
  6332. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6333. mulps xmm4, xmm6
  6334. addps xmm3, xmm4
  6335. movlps xmm5, [edi+(3*6+0)*4]
  6336. movhps xmm5, [edi+(3*6+2)*4]
  6337. movaps xmm6, xmm1
  6338. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6339. mulps xmm5, xmm6
  6340. addps xmm3, xmm5
  6341. STORE4( 0, xmm3, xmm7 )
  6342. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6343. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  6344. movlps xmm3, [edi+(0*6+4)*4]
  6345. movhps xmm3, [edi+(1*6+4)*4]
  6346. mulps xmm3, xmm0
  6347. movlps xmm4, [edi+(2*6+4)*4]
  6348. movhps xmm4, [edi+(3*6+4)*4]
  6349. mulps xmm4, xmm1
  6350. addps xmm3, xmm4
  6351. movhlps xmm4, xmm3
  6352. addps xmm3, xmm4
  6353. STORE2LO( 16, xmm3, xmm7 )
  6354. }
  6355. return;
  6356. }
  6357. default: {
  6358. for ( int i = 0; i < numColumns; i++ ) {
  6359. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  6360. *(mPtr+3*numColumns) * vPtr[3];
  6361. mPtr++;
  6362. }
  6363. return;
  6364. }
  6365. }
  6366. break;
  6367. case 5:
  6368. switch( numColumns ) {
  6369. case 6: { // 5x6 * 5x1
  6370. __asm {
  6371. mov esi, vPtr
  6372. mov edi, mPtr
  6373. mov eax, dstPtr
  6374. movlps xmm0, [esi+0*4]
  6375. movlps xmm1, [esi+2*4]
  6376. movss xmm2, [esi+4*4]
  6377. movaps xmm3, xmm0
  6378. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  6379. mulps xmm3, [edi+(0*6+0)*4]
  6380. movlps xmm5, [edi+(1*6+0)*4]
  6381. movhps xmm5, [edi+(1*6+2)*4]
  6382. movaps xmm6, xmm0
  6383. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6384. mulps xmm5, xmm6
  6385. addps xmm3, xmm5
  6386. movaps xmm6, xmm1
  6387. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6388. mulps xmm6, [edi+(2*6+0)*4]
  6389. addps xmm3, xmm6
  6390. movlps xmm5, [edi+(3*6+0)*4]
  6391. movhps xmm5, [edi+(3*6+2)*4]
  6392. movaps xmm6, xmm1
  6393. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6394. mulps xmm5, xmm6
  6395. addps xmm3, xmm5
  6396. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
  6397. movaps xmm4, xmm2
  6398. mulps xmm4, [edi+(4*6+0)*4]
  6399. addps xmm3, xmm4
  6400. STORE4( 0, xmm3, xmm7 )
  6401. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6402. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  6403. movlps xmm3, [edi+(0*6+4)*4]
  6404. movhps xmm3, [edi+(1*6+4)*4]
  6405. mulps xmm3, xmm0
  6406. movlps xmm4, [edi+(2*6+4)*4]
  6407. movhps xmm4, [edi+(3*6+4)*4]
  6408. mulps xmm4, xmm1
  6409. addps xmm3, xmm4
  6410. movhlps xmm4, xmm3
  6411. addps xmm3, xmm4
  6412. movlps xmm5, [edi+(4*6+4)*4]
  6413. mulps xmm5, xmm2
  6414. addps xmm3, xmm5
  6415. STORE2LO( 16, xmm3, xmm7 )
  6416. }
  6417. return;
  6418. }
  6419. default: {
  6420. for ( int i = 0; i < numColumns; i++ ) {
  6421. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  6422. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
  6423. mPtr++;
  6424. }
  6425. return;
  6426. }
  6427. }
  6428. break;
  6429. case 6:
  6430. switch( numColumns ) {
  6431. case 1: { // 6x1 * 6x1
  6432. __asm {
  6433. mov esi, vPtr
  6434. mov edi, mPtr
  6435. mov eax, dstPtr
  6436. movlps xmm0, [esi]
  6437. movhps xmm0, [esi+8]
  6438. movlps xmm1, [esi+16]
  6439. mulps xmm0, [edi]
  6440. mulps xmm1, [edi+16]
  6441. shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
  6442. addps xmm0, xmm1
  6443. movhlps xmm2, xmm0
  6444. addss xmm2, xmm0
  6445. shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
  6446. addss xmm2, xmm0
  6447. STORE1( 0, xmm2, xmm3 )
  6448. }
  6449. return;
  6450. }
  6451. case 2: { // 6x2 * 6x1
  6452. __asm {
  6453. mov esi, vPtr
  6454. mov edi, mPtr
  6455. mov eax, dstPtr
  6456. movlps xmm0, [esi+0*4]
  6457. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6458. movaps xmm6, [edi+0*4]
  6459. mulps xmm6, xmm0
  6460. movlps xmm1, [esi+2*4]
  6461. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  6462. movaps xmm7, [edi+4*4]
  6463. mulps xmm7, xmm1
  6464. addps xmm6, xmm7
  6465. movlps xmm2, [esi+4*4]
  6466. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
  6467. movaps xmm7, [edi+8*4]
  6468. mulps xmm7, xmm2
  6469. addps xmm6, xmm7
  6470. movhlps xmm3, xmm6
  6471. addps xmm3, xmm6
  6472. STORE2LO( 0, xmm3, xmm7 )
  6473. }
  6474. return;
  6475. }
  6476. case 3: { // 6x3 * 6x1
  6477. __asm {
  6478. mov esi, vPtr
  6479. mov edi, mPtr
  6480. mov eax, dstPtr
  6481. movss xmm0, [edi+(0*3+2)*4]
  6482. movhps xmm0, [edi+(0*3+0)*4]
  6483. shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
  6484. movss xmm6, [esi+0*4]
  6485. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6486. mulps xmm6, xmm0
  6487. movss xmm1, [edi+(1*3+0)*4]
  6488. movhps xmm1, [edi+(1*3+1)*4]
  6489. movss xmm7, [esi+1*4]
  6490. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  6491. mulps xmm7, xmm1
  6492. addps xmm6, xmm7
  6493. movss xmm2, [edi+(2*3+2)*4]
  6494. movhps xmm2, [edi+(2*3+0)*4]
  6495. shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
  6496. movss xmm7, [esi+2*4]
  6497. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  6498. mulps xmm7, xmm2
  6499. addps xmm6, xmm7
  6500. movss xmm3, [edi+(3*3+0)*4]
  6501. movhps xmm3, [edi+(3*3+1)*4]
  6502. movss xmm7, [esi+3*4]
  6503. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  6504. mulps xmm7, xmm3
  6505. addps xmm6, xmm7
  6506. movss xmm4, [edi+(4*3+2)*4]
  6507. movhps xmm4, [edi+(4*3+0)*4]
  6508. shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
  6509. movss xmm7, [esi+4*4]
  6510. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  6511. mulps xmm7, xmm4
  6512. addps xmm6, xmm7
  6513. movss xmm5, [edi+(5*3+0)*4]
  6514. movhps xmm5, [edi+(5*3+1)*4]
  6515. movss xmm7, [esi+5*4]
  6516. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  6517. mulps xmm7, xmm5
  6518. addps xmm6, xmm7
  6519. STORE1( 0, xmm6, xmm7 )
  6520. STORE2HI( 4, xmm6, xmm7 )
  6521. }
  6522. return;
  6523. }
  6524. case 4: { // 6x4 * 6x1
  6525. __asm {
  6526. mov esi, vPtr
  6527. mov edi, mPtr
  6528. mov eax, dstPtr
  6529. movlps xmm3, [edi+(0*4+0)*4]
  6530. movhps xmm3, [edi+(0*4+2)*4]
  6531. movss xmm4, [esi+0*4]
  6532. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  6533. mulps xmm3, xmm4
  6534. movlps xmm5, [edi+(1*4+0)*4]
  6535. movhps xmm5, [edi+(1*4+2)*4]
  6536. movss xmm6, [esi+1*4]
  6537. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6538. mulps xmm5, xmm6
  6539. addps xmm3, xmm5
  6540. movlps xmm4, [edi+(2*4+0)*4]
  6541. movhps xmm4, [edi+(2*4+2)*4]
  6542. movss xmm6, [esi+2*4]
  6543. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6544. mulps xmm4, xmm6
  6545. addps xmm3, xmm4
  6546. movlps xmm5, [edi+(3*4+0)*4]
  6547. movhps xmm5, [edi+(3*4+2)*4]
  6548. movss xmm6, [esi+3*4]
  6549. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6550. mulps xmm5, xmm6
  6551. addps xmm3, xmm5
  6552. movlps xmm4, [edi+(4*4+0)*4]
  6553. movhps xmm4, [edi+(4*4+2)*4]
  6554. movss xmm6, [esi+4*4]
  6555. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6556. mulps xmm4, xmm6
  6557. addps xmm3, xmm4
  6558. movlps xmm5, [edi+(5*4+0)*4]
  6559. movhps xmm5, [edi+(5*4+2)*4]
  6560. movss xmm6, [esi+5*4]
  6561. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6562. mulps xmm5, xmm6
  6563. addps xmm3, xmm5
  6564. STORE4( 0, xmm3, xmm7 )
  6565. }
  6566. return;
  6567. }
  6568. case 5: { // 6x5 * 6x1
  6569. __asm {
  6570. mov esi, vPtr
  6571. mov edi, mPtr
  6572. mov eax, dstPtr
  6573. movlps xmm6, [edi+(0*5+0)*4]
  6574. movhps xmm6, [edi+(0*5+2)*4]
  6575. movss xmm0, [esi+0*4]
  6576. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  6577. mulps xmm6, xmm0
  6578. movlps xmm7, [edi+(1*5+0)*4]
  6579. movhps xmm7, [edi+(1*5+2)*4]
  6580. movss xmm1, [esi+1*4]
  6581. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  6582. mulps xmm7, xmm1
  6583. addps xmm6, xmm7
  6584. movlps xmm7, [edi+(2*5+0)*4]
  6585. movhps xmm7, [edi+(2*5+2)*4]
  6586. movss xmm2, [esi+2*4]
  6587. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
  6588. mulps xmm7, xmm2
  6589. addps xmm6, xmm7
  6590. movlps xmm7, [edi+(3*5+0)*4]
  6591. movhps xmm7, [edi+(3*5+2)*4]
  6592. movss xmm3, [esi+3*4]
  6593. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  6594. mulps xmm7, xmm3
  6595. addps xmm6, xmm7
  6596. movlps xmm7, [edi+(4*5+0)*4]
  6597. movhps xmm7, [edi+(4*5+2)*4]
  6598. movss xmm4, [esi+4*4]
  6599. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  6600. mulps xmm7, xmm4
  6601. addps xmm6, xmm7
  6602. movlps xmm7, [edi+(5*5+0)*4]
  6603. movhps xmm7, [edi+(5*5+2)*4]
  6604. movss xmm5, [esi+5*4]
  6605. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
  6606. mulps xmm7, xmm5
  6607. addps xmm6, xmm7
  6608. STORE4( 0, xmm6, xmm7 )
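// The fifth matrix column does not pack into the 4-wide work above, so dst[4]
// is accumulated separately with scalar mulss/addss (the broadcast vec
// elements are still sitting in xmm0-xmm5) and stored alone through STORE1.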
  6609. movss xmm6, [edi+(0*5+4)*4]
  6610. mulss xmm6, xmm0
  6611. movss xmm7, [edi+(1*5+4)*4]
  6612. mulss xmm7, xmm1
  6613. addss xmm6, xmm7
  6614. movss xmm7, [edi+(2*5+4)*4]
  6615. mulss xmm7, xmm2
  6616. addss xmm6, xmm7
  6617. movss xmm7, [edi+(3*5+4)*4]
  6618. mulss xmm7, xmm3
  6619. addss xmm6, xmm7
  6620. movss xmm7, [edi+(4*5+4)*4]
  6621. mulss xmm7, xmm4
  6622. addss xmm6, xmm7
  6623. movss xmm7, [edi+(5*5+4)*4]
  6624. mulss xmm7, xmm5
  6625. addss xmm6, xmm7
  6626. STORE1( 16, xmm6, xmm7 )
  6627. }
  6628. return;
  6629. }
  6630. case 6: { // 6x6 * 6x1
  6631. __asm {
  6632. mov esi, vPtr
  6633. mov edi, mPtr
  6634. mov eax, dstPtr
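// Two passes: the first accumulates result elements 0-3 in xmm3, one broadcast
// multiply per matrix row, and stores them with STORE4. The second pass pairs
// columns 4 and 5 of two matrix rows per load, multiplies by duplicated vec
// pairs (the 0,0,1,1 shuffles) and folds the halves together with movhlps.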
  6635. movlps xmm0, [esi+0*4]
  6636. movlps xmm1, [esi+2*4]
  6637. movlps xmm2, [esi+4*4]
  6638. movaps xmm3, xmm0
  6639. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  6640. mulps xmm3, [edi+(0*6+0)*4]
  6641. movlps xmm5, [edi+(1*6+0)*4]
  6642. movhps xmm5, [edi+(1*6+2)*4]
  6643. movaps xmm6, xmm0
  6644. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6645. mulps xmm5, xmm6
  6646. addps xmm3, xmm5
  6647. movaps xmm6, xmm1
  6648. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6649. mulps xmm6, [edi+(2*6+0)*4]
  6650. addps xmm3, xmm6
  6651. movaps xmm6, xmm1
  6652. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6653. movlps xmm5, [edi+(3*6+0)*4]
  6654. movhps xmm5, [edi+(3*6+2)*4]
  6655. mulps xmm5, xmm6
  6656. addps xmm3, xmm5
  6657. movaps xmm6, xmm2
  6658. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6659. mulps xmm6, [edi+(4*6+0)*4]
  6660. addps xmm3, xmm6
  6661. movaps xmm6, xmm2
  6662. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6663. movlps xmm5, [edi+(5*6+0)*4]
  6664. movhps xmm5, [edi+(5*6+2)*4]
  6665. mulps xmm5, xmm6
  6666. addps xmm3, xmm5
  6667. STORE4( 0, xmm3, xmm7 )
  6668. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6669. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  6670. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
  6671. movlps xmm3, [edi+(0*6+4)*4]
  6672. movhps xmm3, [edi+(1*6+4)*4]
  6673. mulps xmm3, xmm0
  6674. movlps xmm4, [edi+(2*6+4)*4]
  6675. movhps xmm4, [edi+(3*6+4)*4]
  6676. mulps xmm4, xmm1
  6677. addps xmm3, xmm4
  6678. movlps xmm5, [edi+(4*6+4)*4]
  6679. movhps xmm5, [edi+(5*6+4)*4]
  6680. mulps xmm5, xmm2
  6681. addps xmm3, xmm5
  6682. movhlps xmm4, xmm3
  6683. addps xmm3, xmm4
  6684. STORE2LO( 16, xmm3, xmm7 )
  6685. }
  6686. return;
  6687. }
  6688. default: {
  6689. for ( int i = 0; i < numColumns; i++ ) {
  6690. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  6691. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
  6692. mPtr++;
  6693. }
  6694. return;
  6695. }
  6696. }
  6697. break;
  6698. default:
  6699. int numRows = mat.GetNumRows();
  6700. for ( int i = 0; i < numColumns; i++ ) {
  6701. mPtr = mat.ToFloatPtr() + i;
  6702. float sum = mPtr[0] * vPtr[0];
  6703. for ( int j = 1; j < numRows; j++ ) {
  6704. mPtr += numColumns;
  6705. sum += mPtr[0] * vPtr[j];
  6706. }
  6707. dstPtr[i] STOREC sum;
  6708. }
  6709. break;
  6710. }
  6711. #undef STOREC
  6712. #undef STORE4
  6713. #undef STORE2HI
  6714. #undef STORE2LO
  6715. #undef STORE1
  6716. }
  6717. /*
  6718. ============
  6719. idSIMD_SSE::MatX_TransposeMultiplyAddVecX
  6720. optimizes the following matrix multiplications:
  6721. Nx6 * Nx1
  6722. 6xN * 6x1
  6723. with N in the range [1-6]
  6724. ============
  6725. */
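// A scalar reference for what follows (a sketch, not the shipped path): this
// routine accumulates the transpose product, dst += mat^T * vec, so dst[i] is
// the dot product of matrix column i with vec. Assuming idMatX stores its rows
// contiguously (which the pointer stepping below relies on), the equivalent
// plain loop is:
//
//   for ( int i = 0; i < mat.GetNumColumns(); i++ ) {
//       float sum = 0.0f;
//       for ( int j = 0; j < mat.GetNumRows(); j++ ) {
//           sum += mat.ToFloatPtr()[j * mat.GetNumColumns() + i] * vec[j];
//       }
//       dst[i] += sum;
//   }
//
// The SSE cases below unroll this per matrix shape; the default cases fall
// back to the same loop written with an incrementally advanced column pointer.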
  6726. void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  6727. #define STORE1( offset, reg1, reg2 ) \
  6728. __asm movss reg2, [eax+offset] \
  6729. __asm addss reg2, reg1 \
  6730. __asm movss [eax+offset], reg2
  6731. #define STORE2LO( offset, reg1, reg2 ) \
  6732. __asm movlps reg2, [eax+offset] \
  6733. __asm addps reg2, reg1 \
  6734. __asm movlps [eax+offset], reg2
  6735. #define STORE2HI( offset, reg1, reg2 ) \
  6736. __asm movhps reg2, [eax+offset] \
  6737. __asm addps reg2, reg1 \
  6738. __asm movhps [eax+offset], reg2
  6739. #define STORE4( offset, reg1, reg2 ) \
  6740. __asm movlps reg2, [eax+offset] \
  6741. __asm movhps reg2, [eax+offset+8] \
  6742. __asm addps reg2, reg1 \
  6743. __asm movlps [eax+offset], reg2 \
  6744. __asm movhps [eax+offset+8], reg2
  6745. #define STOREC +=
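// The STORE* macros fold the "+=" into the store: reg2 loads the current
// destination values, the freshly computed products in reg1 are added, and the
// result is written back. As a sketch, STORE4( 0, xmm3, xmm7 ) expands to:
//
//   movlps xmm7, [eax+0]     // load dst[0], dst[1]
//   movhps xmm7, [eax+8]     // load dst[2], dst[3]
//   addps  xmm7, xmm3        // accumulate the new results
//   movlps [eax+0], xmm7
//   movhps [eax+8], xmm7
//
// The movlps/movhps pairs impose no 16-byte alignment requirement, which the
// odd store offsets (e.g. STORE2HI( 4, ... ) in the 6x3 case) could not meet.
// STOREC gives the scalar fallback paths the same "+=" semantics.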
  6746. int numColumns;
  6747. const float *mPtr, *vPtr;
  6748. float *dstPtr;
  6749. assert( vec.GetSize() >= mat.GetNumRows() );
  6750. assert( dst.GetSize() >= mat.GetNumColumns() );
  6751. mPtr = mat.ToFloatPtr();
  6752. vPtr = vec.ToFloatPtr();
  6753. dstPtr = dst.ToFloatPtr();
  6754. numColumns = mat.GetNumColumns();
  6755. switch( mat.GetNumRows() ) {
  6756. case 1:
  6757. switch( numColumns ) {
  6758. case 6: { // 1x6 * 1x1
  6759. __asm {
  6760. mov esi, vPtr
  6761. mov edi, mPtr
  6762. mov eax, dstPtr
  6763. movss xmm0, [esi]
  6764. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  6765. movaps xmm1, xmm0
  6766. mulps xmm0, [edi]
  6767. mulps xmm1, [edi+16]
  6768. STORE4( 0, xmm0, xmm2 )
  6769. STORE2LO( 16, xmm1, xmm3 )
  6770. }
  6771. return;
  6772. }
  6773. default: {
  6774. for ( int i = 0; i < numColumns; i++ ) {
  6775. dstPtr[i] STOREC *(mPtr) * vPtr[0];
  6776. mPtr++;
  6777. }
  6778. return;
  6779. }
  6780. }
  6781. break;
  6782. case 2:
  6783. switch( numColumns ) {
  6784. case 6: { // 2x6 * 2x1
  6785. __asm {
  6786. mov esi, vPtr
  6787. mov edi, mPtr
  6788. mov eax, dstPtr
  6789. movlps xmm0, [esi]
  6790. movaps xmm1, xmm0
  6791. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  6792. shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
  6793. movaps xmm2, [edi]
  6794. mulps xmm2, xmm0
  6795. movlps xmm3, [edi+24]
  6796. movhps xmm3, [edi+32]
  6797. mulps xmm3, xmm1
  6798. addps xmm2, xmm3
  6799. shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  6800. movlps xmm4, [edi+16]
  6801. movhps xmm4, [edi+40]
  6802. mulps xmm4, xmm0
  6803. movhlps xmm3, xmm4
  6804. addps xmm3, xmm4
  6805. STORE4( 0, xmm2, xmm5 )
  6806. STORE2LO( 16, xmm3, xmm6 )
  6807. }
  6808. return;
  6809. }
  6810. default: {
  6811. for ( int i = 0; i < numColumns; i++ ) {
  6812. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
  6813. mPtr++;
  6814. }
  6815. return;
  6816. }
  6817. }
  6818. break;
  6819. case 3:
  6820. switch( numColumns ) {
  6821. case 6: { // 3x6 * 3x1
  6822. __asm {
  6823. mov esi, vPtr
  6824. mov edi, mPtr
  6825. mov eax, dstPtr
  6826. movlps xmm0, [esi+0*4]
  6827. movss xmm1, [esi+2*4]
  6828. movlps xmm3, [edi+(0*6+0)*4]
  6829. movhps xmm3, [edi+(0*6+2)*4]
  6830. movaps xmm4, xmm0
  6831. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  6832. mulps xmm3, xmm4
  6833. movlps xmm5, [edi+(1*6+0)*4]
  6834. movhps xmm5, [edi+(1*6+2)*4]
  6835. movaps xmm6, xmm0
  6836. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6837. mulps xmm5, xmm6
  6838. addps xmm3, xmm5
  6839. movlps xmm4, [edi+(2*6+0)*4]
  6840. movhps xmm4, [edi+(2*6+2)*4]
  6841. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  6842. mulps xmm4, xmm1
  6843. addps xmm3, xmm4
  6844. STORE4( 0, xmm3, xmm7 )
  6845. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6846. movlps xmm3, [edi+(0*6+4)*4]
  6847. movhps xmm3, [edi+(1*6+4)*4]
  6848. mulps xmm3, xmm0
  6849. movhlps xmm4, xmm3
  6850. addps xmm3, xmm4
  6851. movlps xmm5, [edi+(2*6+4)*4]
  6852. mulps xmm5, xmm1
  6853. addps xmm3, xmm5
  6854. STORE2LO( 16, xmm3, xmm7 )
  6855. }
  6856. return;
  6857. }
  6858. default: {
  6859. for ( int i = 0; i < numColumns; i++ ) {
  6860. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
  6861. mPtr++;
  6862. }
  6863. return;
  6864. }
  6865. }
  6866. break;
  6867. case 4:
  6868. switch( numColumns ) {
  6869. case 6: { // 4x6 * 4x1
  6870. __asm {
  6871. mov esi, vPtr
  6872. mov edi, mPtr
  6873. mov eax, dstPtr
  6874. movlps xmm0, [esi+0*4]
  6875. movlps xmm1, [esi+2*4]
  6876. movaps xmm3, xmm0
  6877. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  6878. mulps xmm3, [edi+(0*6+0)*4]
  6879. movlps xmm5, [edi+(1*6+0)*4]
  6880. movhps xmm5, [edi+(1*6+2)*4]
  6881. movaps xmm6, xmm0
  6882. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6883. mulps xmm5, xmm6
  6884. addps xmm3, xmm5
  6885. movlps xmm4, [edi+(2*6+0)*4]
  6886. movhps xmm4, [edi+(2*6+2)*4]
  6887. movaps xmm6, xmm1
  6888. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6889. mulps xmm4, xmm6
  6890. addps xmm3, xmm4
  6891. movlps xmm5, [edi+(3*6+0)*4]
  6892. movhps xmm5, [edi+(3*6+2)*4]
  6893. movaps xmm6, xmm1
  6894. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6895. mulps xmm5, xmm6
  6896. addps xmm3, xmm5
  6897. STORE4( 0, xmm3, xmm7 )
  6898. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6899. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  6900. movlps xmm3, [edi+(0*6+4)*4]
  6901. movhps xmm3, [edi+(1*6+4)*4]
  6902. mulps xmm3, xmm0
  6903. movlps xmm4, [edi+(2*6+4)*4]
  6904. movhps xmm4, [edi+(3*6+4)*4]
  6905. mulps xmm4, xmm1
  6906. addps xmm3, xmm4
  6907. movhlps xmm4, xmm3
  6908. addps xmm3, xmm4
  6909. STORE2LO( 16, xmm3, xmm7 )
  6910. }
  6911. return;
  6912. }
  6913. default: {
  6914. for ( int i = 0; i < numColumns; i++ ) {
  6915. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  6916. *(mPtr+3*numColumns) * vPtr[3];
  6917. mPtr++;
  6918. }
  6919. return;
  6920. }
  6921. }
  6922. break;
  6923. case 5:
  6924. switch( numColumns ) {
  6925. case 6: { // 5x6 * 5x1
  6926. __asm {
  6927. mov esi, vPtr
  6928. mov edi, mPtr
  6929. mov eax, dstPtr
  6930. movlps xmm0, [esi+0*4]
  6931. movlps xmm1, [esi+2*4]
  6932. movss xmm2, [esi+4*4]
  6933. movaps xmm3, xmm0
  6934. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  6935. mulps xmm3, [edi+(0*6+0)*4]
  6936. movlps xmm5, [edi+(1*6+0)*4]
  6937. movhps xmm5, [edi+(1*6+2)*4]
  6938. movaps xmm6, xmm0
  6939. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6940. mulps xmm5, xmm6
  6941. addps xmm3, xmm5
  6942. movaps xmm6, xmm1
  6943. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  6944. mulps xmm6, [edi+(2*6+0)*4]
  6945. addps xmm3, xmm6
  6946. movlps xmm5, [edi+(3*6+0)*4]
  6947. movhps xmm5, [edi+(3*6+2)*4]
  6948. movaps xmm6, xmm1
  6949. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  6950. mulps xmm5, xmm6
  6951. addps xmm3, xmm5
  6952. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
  6953. movaps xmm4, xmm2
  6954. mulps xmm4, [edi+(4*6+0)*4]
  6955. addps xmm3, xmm4
  6956. STORE4( 0, xmm3, xmm7 )
  6957. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  6958. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  6959. movlps xmm3, [edi+(0*6+4)*4]
  6960. movhps xmm3, [edi+(1*6+4)*4]
  6961. mulps xmm3, xmm0
  6962. movlps xmm4, [edi+(2*6+4)*4]
  6963. movhps xmm4, [edi+(3*6+4)*4]
  6964. mulps xmm4, xmm1
  6965. addps xmm3, xmm4
  6966. movhlps xmm4, xmm3
  6967. addps xmm3, xmm4
  6968. movlps xmm5, [edi+(4*6+4)*4]
  6969. mulps xmm5, xmm2
  6970. addps xmm3, xmm5
  6971. STORE2LO( 16, xmm3, xmm7 )
  6972. }
  6973. return;
  6974. }
  6975. default: {
  6976. for ( int i = 0; i < numColumns; i++ ) {
  6977. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  6978. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
  6979. mPtr++;
  6980. }
  6981. return;
  6982. }
  6983. }
  6984. break;
  6985. case 6:
  6986. switch( numColumns ) {
  6987. case 1: { // 6x1 * 6x1
  6988. __asm {
  6989. mov esi, vPtr
  6990. mov edi, mPtr
  6991. mov eax, dstPtr
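// Straight six-element dot product: the shufps folds the two valid products
// from xmm1 in with lanes 3 and 2 of xmm0 (discarding xmm1's garbage upper
// half), then movhlps/addss/shufps reduce the four partial sums horizontally
// into lane 0.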
  6992. movlps xmm0, [esi]
  6993. movhps xmm0, [esi+8]
  6994. movlps xmm1, [esi+16]
  6995. mulps xmm0, [edi]
  6996. mulps xmm1, [edi+16]
  6997. shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
  6998. addps xmm0, xmm1
  6999. movhlps xmm2, xmm0
  7000. addss xmm2, xmm0
  7001. shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
  7002. addss xmm2, xmm0
  7003. STORE1( 0, xmm2, xmm3 )
  7004. }
  7005. return;
  7006. }
  7007. case 2: { // 6x2 * 6x1
  7008. __asm {
  7009. mov esi, vPtr
  7010. mov edi, mPtr
  7011. mov eax, dstPtr
  7012. movlps xmm0, [esi+0*4]
  7013. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7014. movaps xmm6, [edi+0*4]
  7015. mulps xmm6, xmm0
  7016. movlps xmm1, [esi+2*4]
  7017. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  7018. movaps xmm7, [edi+4*4]
  7019. mulps xmm7, xmm1
  7020. addps xmm6, xmm7
  7021. movlps xmm2, [esi+4*4]
  7022. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
  7023. movaps xmm7, [edi+8*4]
  7024. mulps xmm7, xmm2
  7025. addps xmm6, xmm7
  7026. movhlps xmm3, xmm6
  7027. addps xmm3, xmm6
  7028. STORE2LO( 0, xmm3, xmm7 )
  7029. }
  7030. return;
  7031. }
  7032. case 3: { // 6x3 * 6x1
  7033. __asm {
  7034. mov esi, vPtr
  7035. mov edi, mPtr
  7036. mov eax, dstPtr
  7037. movss xmm0, [edi+(0*3+2)*4]
  7038. movhps xmm0, [edi+(0*3+0)*4]
  7039. shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
  7040. movss xmm6, [esi+0*4]
  7041. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7042. mulps xmm6, xmm0
  7043. movss xmm1, [edi+(1*3+0)*4]
  7044. movhps xmm1, [edi+(1*3+1)*4]
  7045. movss xmm7, [esi+1*4]
  7046. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7047. mulps xmm7, xmm1
  7048. addps xmm6, xmm7
  7049. movss xmm2, [edi+(2*3+2)*4]
  7050. movhps xmm2, [edi+(2*3+0)*4]
  7051. shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
  7052. movss xmm7, [esi+2*4]
  7053. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7054. mulps xmm7, xmm2
  7055. addps xmm6, xmm7
  7056. movss xmm3, [edi+(3*3+0)*4]
  7057. movhps xmm3, [edi+(3*3+1)*4]
  7058. movss xmm7, [esi+3*4]
  7059. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7060. mulps xmm7, xmm3
  7061. addps xmm6, xmm7
  7062. movss xmm4, [edi+(4*3+2)*4]
  7063. movhps xmm4, [edi+(4*3+0)*4]
  7064. shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
  7065. movss xmm7, [esi+4*4]
  7066. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7067. mulps xmm7, xmm4
  7068. addps xmm6, xmm7
  7069. movss xmm5, [edi+(5*3+0)*4]
  7070. movhps xmm5, [edi+(5*3+1)*4]
  7071. movss xmm7, [esi+5*4]
  7072. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7073. mulps xmm7, xmm5
  7074. addps xmm6, xmm7
  7075. STORE1( 0, xmm6, xmm7 )
  7076. STORE2HI( 4, xmm6, xmm7 )
  7077. }
  7078. return;
  7079. }
  7080. case 4: { // 6x4 * 6x1
  7081. __asm {
  7082. mov esi, vPtr
  7083. mov edi, mPtr
  7084. mov eax, dstPtr
  7085. movlps xmm3, [edi+(0*4+0)*4]
  7086. movhps xmm3, [edi+(0*4+2)*4]
  7087. movss xmm4, [esi+0*4]
  7088. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7089. mulps xmm3, xmm4
  7090. movlps xmm5, [edi+(1*4+0)*4]
  7091. movhps xmm5, [edi+(1*4+2)*4]
  7092. movss xmm6, [esi+1*4]
  7093. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7094. mulps xmm5, xmm6
  7095. addps xmm3, xmm5
  7096. movlps xmm4, [edi+(2*4+0)*4]
  7097. movhps xmm4, [edi+(2*4+2)*4]
  7098. movss xmm6, [esi+2*4]
  7099. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7100. mulps xmm4, xmm6
  7101. addps xmm3, xmm4
  7102. movlps xmm5, [edi+(3*4+0)*4]
  7103. movhps xmm5, [edi+(3*4+2)*4]
  7104. movss xmm6, [esi+3*4]
  7105. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7106. mulps xmm5, xmm6
  7107. addps xmm3, xmm5
  7108. movlps xmm4, [edi+(4*4+0)*4]
  7109. movhps xmm4, [edi+(4*4+2)*4]
  7110. movss xmm6, [esi+4*4]
  7111. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7112. mulps xmm4, xmm6
  7113. addps xmm3, xmm4
  7114. movlps xmm5, [edi+(5*4+0)*4]
  7115. movhps xmm5, [edi+(5*4+2)*4]
  7116. movss xmm6, [esi+5*4]
  7117. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7118. mulps xmm5, xmm6
  7119. addps xmm3, xmm5
  7120. STORE4( 0, xmm3, xmm7 )
  7121. }
  7122. return;
  7123. }
  7124. case 5: { // 6x5 * 6x1
  7125. __asm {
  7126. mov esi, vPtr
  7127. mov edi, mPtr
  7128. mov eax, dstPtr
  7129. movlps xmm6, [edi+(0*5+0)*4]
  7130. movhps xmm6, [edi+(0*5+2)*4]
  7131. movss xmm0, [esi+0*4]
  7132. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  7133. mulps xmm6, xmm0
  7134. movlps xmm7, [edi+(1*5+0)*4]
  7135. movhps xmm7, [edi+(1*5+2)*4]
  7136. movss xmm1, [esi+1*4]
  7137. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  7138. mulps xmm7, xmm1
  7139. addps xmm6, xmm7
  7140. movlps xmm7, [edi+(2*5+0)*4]
  7141. movhps xmm7, [edi+(2*5+2)*4]
  7142. movss xmm2, [esi+2*4]
  7143. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
  7144. mulps xmm7, xmm2
  7145. addps xmm6, xmm7
  7146. movlps xmm7, [edi+(3*5+0)*4]
  7147. movhps xmm7, [edi+(3*5+2)*4]
  7148. movss xmm3, [esi+3*4]
  7149. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  7150. mulps xmm7, xmm3
  7151. addps xmm6, xmm7
  7152. movlps xmm7, [edi+(4*5+0)*4]
  7153. movhps xmm7, [edi+(4*5+2)*4]
  7154. movss xmm4, [esi+4*4]
  7155. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7156. mulps xmm7, xmm4
  7157. addps xmm6, xmm7
  7158. movlps xmm7, [edi+(5*5+0)*4]
  7159. movhps xmm7, [edi+(5*5+2)*4]
  7160. movss xmm5, [esi+5*4]
  7161. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
  7162. mulps xmm7, xmm5
  7163. addps xmm6, xmm7
  7164. STORE4( 0, xmm6, xmm7 )
  7165. movss xmm6, [edi+(0*5+4)*4]
  7166. mulss xmm6, xmm0
  7167. movss xmm7, [edi+(1*5+4)*4]
  7168. mulss xmm7, xmm1
  7169. addss xmm6, xmm7
  7170. movss xmm7, [edi+(2*5+4)*4]
  7171. mulss xmm7, xmm2
  7172. addss xmm6, xmm7
  7173. movss xmm7, [edi+(3*5+4)*4]
  7174. mulss xmm7, xmm3
  7175. addss xmm6, xmm7
  7176. movss xmm7, [edi+(4*5+4)*4]
  7177. mulss xmm7, xmm4
  7178. addss xmm6, xmm7
  7179. movss xmm7, [edi+(5*5+4)*4]
  7180. mulss xmm7, xmm5
  7181. addss xmm6, xmm7
  7182. STORE1( 16, xmm6, xmm7 )
  7183. }
  7184. return;
  7185. }
  7186. case 6: { // 6x6 * 6x1
  7187. __asm {
  7188. mov esi, vPtr
  7189. mov edi, mPtr
  7190. mov eax, dstPtr
  7191. movlps xmm0, [esi+0*4]
  7192. movlps xmm1, [esi+2*4]
  7193. movlps xmm2, [esi+4*4]
  7194. movaps xmm3, xmm0
  7195. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  7196. mulps xmm3, [edi+(0*6+0)*4]
  7197. movlps xmm5, [edi+(1*6+0)*4]
  7198. movhps xmm5, [edi+(1*6+2)*4]
  7199. movaps xmm6, xmm0
  7200. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7201. mulps xmm5, xmm6
  7202. addps xmm3, xmm5
  7203. movaps xmm6, xmm1
  7204. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7205. mulps xmm6, [edi+(2*6+0)*4]
  7206. addps xmm3, xmm6
  7207. movaps xmm6, xmm1
  7208. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7209. movlps xmm5, [edi+(3*6+0)*4]
  7210. movhps xmm5, [edi+(3*6+2)*4]
  7211. mulps xmm5, xmm6
  7212. addps xmm3, xmm5
  7213. movaps xmm6, xmm2
  7214. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7215. mulps xmm6, [edi+(4*6+0)*4]
  7216. addps xmm3, xmm6
  7217. movaps xmm6, xmm2
  7218. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7219. movlps xmm5, [edi+(5*6+0)*4]
  7220. movhps xmm5, [edi+(5*6+2)*4]
  7221. mulps xmm5, xmm6
  7222. addps xmm3, xmm5
  7223. STORE4( 0, xmm3, xmm7 )
  7224. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7225. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  7226. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
  7227. movlps xmm3, [edi+(0*6+4)*4]
  7228. movhps xmm3, [edi+(1*6+4)*4]
  7229. mulps xmm3, xmm0
  7230. movlps xmm4, [edi+(2*6+4)*4]
  7231. movhps xmm4, [edi+(3*6+4)*4]
  7232. mulps xmm4, xmm1
  7233. addps xmm3, xmm4
  7234. movlps xmm5, [edi+(4*6+4)*4]
  7235. movhps xmm5, [edi+(5*6+4)*4]
  7236. mulps xmm5, xmm2
  7237. addps xmm3, xmm5
  7238. movhlps xmm4, xmm3
  7239. addps xmm3, xmm4
  7240. STORE2LO( 16, xmm3, xmm7 )
  7241. }
  7242. return;
  7243. }
  7244. default: {
  7245. for ( int i = 0; i < numColumns; i++ ) {
  7246. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  7247. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
  7248. mPtr++;
  7249. }
  7250. return;
  7251. }
  7252. }
  7253. break;
  7254. default:
  7255. int numRows = mat.GetNumRows();
  7256. for ( int i = 0; i < numColumns; i++ ) {
  7257. mPtr = mat.ToFloatPtr() + i;
  7258. float sum = mPtr[0] * vPtr[0];
  7259. for ( int j = 1; j < numRows; j++ ) {
  7260. mPtr += numColumns;
  7261. sum += mPtr[0] * vPtr[j];
  7262. }
  7263. dstPtr[i] STOREC sum;
  7264. }
  7265. break;
  7266. }
  7267. #undef STOREC
  7268. #undef STORE4
  7269. #undef STORE2HI
  7270. #undef STORE2LO
  7271. #undef STORE1
  7272. }
  7273. /*
  7274. ============
7275. idSIMD_SSE::MatX_TransposeMultiplySubVecX
  7276. optimizes the following matrix multiplications:
  7277. Nx6 * Nx1
  7278. 6xN * 6x1
  7279. with N in the range [1-6]
  7280. ============
  7281. */
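// Identical in structure to MatX_TransposeMultiplyAddVecX above; only the
// store step changes: the STORE* macros below use subss/subps and STOREC
// becomes "-=", so every path computes dst -= mat^T * vec. The STOREC token
// trick keeps one expression shape across the =, += and -= variants, e.g. with
//
//   #define STOREC -=
//
// the fallback line "dstPtr[i] STOREC sum;" expands to "dstPtr[i] -= sum;".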
  7282. void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  7283. #define STORE1( offset, reg1, reg2 ) \
  7284. __asm movss reg2, [eax+offset] \
  7285. __asm subss reg2, reg1 \
  7286. __asm movss [eax+offset], reg2
  7287. #define STORE2LO( offset, reg1, reg2 ) \
  7288. __asm movlps reg2, [eax+offset] \
  7289. __asm subps reg2, reg1 \
  7290. __asm movlps [eax+offset], reg2
  7291. #define STORE2HI( offset, reg1, reg2 ) \
  7292. __asm movhps reg2, [eax+offset] \
  7293. __asm subps reg2, reg1 \
  7294. __asm movhps [eax+offset], reg2
  7295. #define STORE4( offset, reg1, reg2 ) \
  7296. __asm movlps reg2, [eax+offset] \
  7297. __asm movhps reg2, [eax+offset+8] \
  7298. __asm subps reg2, reg1 \
  7299. __asm movlps [eax+offset], reg2 \
  7300. __asm movhps [eax+offset+8], reg2
  7301. #define STOREC -=
  7302. int numColumns;
  7303. const float *mPtr, *vPtr;
  7304. float *dstPtr;
  7305. assert( vec.GetSize() >= mat.GetNumRows() );
  7306. assert( dst.GetSize() >= mat.GetNumColumns() );
  7307. mPtr = mat.ToFloatPtr();
  7308. vPtr = vec.ToFloatPtr();
  7309. dstPtr = dst.ToFloatPtr();
  7310. numColumns = mat.GetNumColumns();
  7311. switch( mat.GetNumRows() ) {
  7312. case 1:
  7313. switch( numColumns ) {
  7314. case 6: { // 1x6 * 1x1
  7315. __asm {
  7316. mov esi, vPtr
  7317. mov edi, mPtr
  7318. mov eax, dstPtr
  7319. movss xmm0, [esi]
  7320. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  7321. movaps xmm1, xmm0
  7322. mulps xmm0, [edi]
  7323. mulps xmm1, [edi+16]
  7324. STORE4( 0, xmm0, xmm2 )
  7325. STORE2LO( 16, xmm1, xmm3 )
  7326. }
  7327. return;
  7328. }
  7329. default: {
  7330. for ( int i = 0; i < numColumns; i++ ) {
  7331. dstPtr[i] STOREC *(mPtr) * vPtr[0];
  7332. mPtr++;
  7333. }
  7334. return;
  7335. }
  7336. }
  7337. break;
  7338. case 2:
  7339. switch( numColumns ) {
  7340. case 6: { // 2x6 * 2x1
  7341. __asm {
  7342. mov esi, vPtr
  7343. mov edi, mPtr
  7344. mov eax, dstPtr
  7345. movlps xmm0, [esi]
  7346. movaps xmm1, xmm0
  7347. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  7348. shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
  7349. movaps xmm2, [edi]
  7350. mulps xmm2, xmm0
  7351. movlps xmm3, [edi+24]
  7352. movhps xmm3, [edi+32]
  7353. mulps xmm3, xmm1
  7354. addps xmm2, xmm3
  7355. shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  7356. movlps xmm4, [edi+16]
  7357. movhps xmm4, [edi+40]
  7358. mulps xmm4, xmm0
  7359. movhlps xmm3, xmm4
  7360. addps xmm3, xmm4
  7361. STORE4( 0, xmm2, xmm5 )
  7362. STORE2LO( 16, xmm3, xmm6 )
  7363. }
  7364. return;
  7365. }
  7366. default: {
  7367. for ( int i = 0; i < numColumns; i++ ) {
  7368. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
  7369. mPtr++;
  7370. }
  7371. return;
  7372. }
  7373. }
  7374. break;
  7375. case 3:
  7376. switch( numColumns ) {
  7377. case 6: { // 3x6 * 3x1
  7378. __asm {
  7379. mov esi, vPtr
  7380. mov edi, mPtr
  7381. mov eax, dstPtr
  7382. movlps xmm0, [esi+0*4]
  7383. movss xmm1, [esi+2*4]
  7384. movlps xmm3, [edi+(0*6+0)*4]
  7385. movhps xmm3, [edi+(0*6+2)*4]
  7386. movaps xmm4, xmm0
  7387. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7388. mulps xmm3, xmm4
  7389. movlps xmm5, [edi+(1*6+0)*4]
  7390. movhps xmm5, [edi+(1*6+2)*4]
  7391. movaps xmm6, xmm0
  7392. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7393. mulps xmm5, xmm6
  7394. addps xmm3, xmm5
  7395. movlps xmm4, [edi+(2*6+0)*4]
  7396. movhps xmm4, [edi+(2*6+2)*4]
  7397. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  7398. mulps xmm4, xmm1
  7399. addps xmm3, xmm4
  7400. STORE4( 0, xmm3, xmm7 )
  7401. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7402. movlps xmm3, [edi+(0*6+4)*4]
  7403. movhps xmm3, [edi+(1*6+4)*4]
  7404. mulps xmm3, xmm0
  7405. movhlps xmm4, xmm3
  7406. addps xmm3, xmm4
  7407. movlps xmm5, [edi+(2*6+4)*4]
  7408. mulps xmm5, xmm1
  7409. addps xmm3, xmm5
  7410. STORE2LO( 16, xmm3, xmm7 )
  7411. }
  7412. return;
  7413. }
  7414. default: {
  7415. for ( int i = 0; i < numColumns; i++ ) {
  7416. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
  7417. mPtr++;
  7418. }
  7419. return;
  7420. }
  7421. }
  7422. break;
  7423. case 4:
  7424. switch( numColumns ) {
  7425. case 6: { // 4x6 * 4x1
  7426. __asm {
  7427. mov esi, vPtr
  7428. mov edi, mPtr
  7429. mov eax, dstPtr
  7430. movlps xmm0, [esi+0*4]
  7431. movlps xmm1, [esi+2*4]
  7432. movaps xmm3, xmm0
  7433. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  7434. mulps xmm3, [edi+(0*6+0)*4]
  7435. movlps xmm5, [edi+(1*6+0)*4]
  7436. movhps xmm5, [edi+(1*6+2)*4]
  7437. movaps xmm6, xmm0
  7438. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7439. mulps xmm5, xmm6
  7440. addps xmm3, xmm5
  7441. movlps xmm4, [edi+(2*6+0)*4]
  7442. movhps xmm4, [edi+(2*6+2)*4]
  7443. movaps xmm6, xmm1
  7444. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7445. mulps xmm4, xmm6
  7446. addps xmm3, xmm4
  7447. movlps xmm5, [edi+(3*6+0)*4]
  7448. movhps xmm5, [edi+(3*6+2)*4]
  7449. movaps xmm6, xmm1
  7450. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7451. mulps xmm5, xmm6
  7452. addps xmm3, xmm5
  7453. STORE4( 0, xmm3, xmm7 )
  7454. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7455. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  7456. movlps xmm3, [edi+(0*6+4)*4]
  7457. movhps xmm3, [edi+(1*6+4)*4]
  7458. mulps xmm3, xmm0
  7459. movlps xmm4, [edi+(2*6+4)*4]
  7460. movhps xmm4, [edi+(3*6+4)*4]
  7461. mulps xmm4, xmm1
  7462. addps xmm3, xmm4
  7463. movhlps xmm4, xmm3
  7464. addps xmm3, xmm4
  7465. STORE2LO( 16, xmm3, xmm7 )
  7466. }
  7467. return;
  7468. }
  7469. default: {
  7470. for ( int i = 0; i < numColumns; i++ ) {
  7471. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  7472. *(mPtr+3*numColumns) * vPtr[3];
  7473. mPtr++;
  7474. }
  7475. return;
  7476. }
  7477. }
  7478. break;
  7479. case 5:
  7480. switch( numColumns ) {
  7481. case 6: { // 5x6 * 5x1
  7482. __asm {
  7483. mov esi, vPtr
  7484. mov edi, mPtr
  7485. mov eax, dstPtr
  7486. movlps xmm0, [esi+0*4]
  7487. movlps xmm1, [esi+2*4]
  7488. movss xmm2, [esi+4*4]
  7489. movaps xmm3, xmm0
  7490. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  7491. mulps xmm3, [edi+(0*6+0)*4]
  7492. movlps xmm5, [edi+(1*6+0)*4]
  7493. movhps xmm5, [edi+(1*6+2)*4]
  7494. movaps xmm6, xmm0
  7495. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7496. mulps xmm5, xmm6
  7497. addps xmm3, xmm5
  7498. movaps xmm6, xmm1
  7499. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7500. mulps xmm6, [edi+(2*6+0)*4]
  7501. addps xmm3, xmm6
  7502. movlps xmm5, [edi+(3*6+0)*4]
  7503. movhps xmm5, [edi+(3*6+2)*4]
  7504. movaps xmm6, xmm1
  7505. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7506. mulps xmm5, xmm6
  7507. addps xmm3, xmm5
  7508. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
  7509. movaps xmm4, xmm2
  7510. mulps xmm4, [edi+(4*6+0)*4]
  7511. addps xmm3, xmm4
  7512. STORE4( 0, xmm3, xmm7 )
  7513. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7514. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  7515. movlps xmm3, [edi+(0*6+4)*4]
  7516. movhps xmm3, [edi+(1*6+4)*4]
  7517. mulps xmm3, xmm0
  7518. movlps xmm4, [edi+(2*6+4)*4]
  7519. movhps xmm4, [edi+(3*6+4)*4]
  7520. mulps xmm4, xmm1
  7521. addps xmm3, xmm4
  7522. movhlps xmm4, xmm3
  7523. addps xmm3, xmm4
  7524. movlps xmm5, [edi+(4*6+4)*4]
  7525. mulps xmm5, xmm2
  7526. addps xmm3, xmm5
  7527. STORE2LO( 16, xmm3, xmm7 )
  7528. }
  7529. return;
  7530. }
  7531. default: {
  7532. for ( int i = 0; i < numColumns; i++ ) {
  7533. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  7534. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
  7535. mPtr++;
  7536. }
  7537. return;
  7538. }
  7539. }
  7540. break;
  7541. case 6:
  7542. switch( numColumns ) {
  7543. case 1: { // 6x1 * 6x1
  7544. __asm {
  7545. mov esi, vPtr
  7546. mov edi, mPtr
  7547. mov eax, dstPtr
  7548. movlps xmm0, [esi]
  7549. movhps xmm0, [esi+8]
  7550. movlps xmm1, [esi+16]
  7551. mulps xmm0, [edi]
  7552. mulps xmm1, [edi+16]
  7553. shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
  7554. addps xmm0, xmm1
  7555. movhlps xmm2, xmm0
  7556. addss xmm2, xmm0
  7557. shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
  7558. addss xmm2, xmm0
  7559. STORE1( 0, xmm2, xmm3 )
  7560. }
  7561. return;
  7562. }
  7563. case 2: { // 6x2 * 6x1
  7564. __asm {
  7565. mov esi, vPtr
  7566. mov edi, mPtr
  7567. mov eax, dstPtr
  7568. movlps xmm0, [esi+0*4]
  7569. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7570. movaps xmm6, [edi+0*4]
  7571. mulps xmm6, xmm0
  7572. movlps xmm1, [esi+2*4]
  7573. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  7574. movaps xmm7, [edi+4*4]
  7575. mulps xmm7, xmm1
  7576. addps xmm6, xmm7
  7577. movlps xmm2, [esi+4*4]
  7578. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
  7579. movaps xmm7, [edi+8*4]
  7580. mulps xmm7, xmm2
  7581. addps xmm6, xmm7
  7582. movhlps xmm3, xmm6
  7583. addps xmm3, xmm6
  7584. STORE2LO( 0, xmm3, xmm7 )
  7585. }
  7586. return;
  7587. }
  7588. case 3: { // 6x3 * 6x1
  7589. __asm {
  7590. mov esi, vPtr
  7591. mov edi, mPtr
  7592. mov eax, dstPtr
  7593. movss xmm0, [edi+(0*3+2)*4]
  7594. movhps xmm0, [edi+(0*3+0)*4]
  7595. shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
  7596. movss xmm6, [esi+0*4]
  7597. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7598. mulps xmm6, xmm0
  7599. movss xmm1, [edi+(1*3+0)*4]
  7600. movhps xmm1, [edi+(1*3+1)*4]
  7601. movss xmm7, [esi+1*4]
  7602. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7603. mulps xmm7, xmm1
  7604. addps xmm6, xmm7
  7605. movss xmm2, [edi+(2*3+2)*4]
  7606. movhps xmm2, [edi+(2*3+0)*4]
  7607. shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
  7608. movss xmm7, [esi+2*4]
  7609. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7610. mulps xmm7, xmm2
  7611. addps xmm6, xmm7
  7612. movss xmm3, [edi+(3*3+0)*4]
  7613. movhps xmm3, [edi+(3*3+1)*4]
  7614. movss xmm7, [esi+3*4]
  7615. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7616. mulps xmm7, xmm3
  7617. addps xmm6, xmm7
  7618. movss xmm4, [edi+(4*3+2)*4]
  7619. movhps xmm4, [edi+(4*3+0)*4]
  7620. shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
  7621. movss xmm7, [esi+4*4]
  7622. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7623. mulps xmm7, xmm4
  7624. addps xmm6, xmm7
  7625. movss xmm5, [edi+(5*3+0)*4]
  7626. movhps xmm5, [edi+(5*3+1)*4]
  7627. movss xmm7, [esi+5*4]
  7628. shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
  7629. mulps xmm7, xmm5
  7630. addps xmm6, xmm7
  7631. STORE1( 0, xmm6, xmm7 )
  7632. STORE2HI( 4, xmm6, xmm7 )
  7633. }
  7634. return;
  7635. }
  7636. case 4: { // 6x4 * 6x1
  7637. __asm {
  7638. mov esi, vPtr
  7639. mov edi, mPtr
  7640. mov eax, dstPtr
  7641. movlps xmm3, [edi+(0*4+0)*4]
  7642. movhps xmm3, [edi+(0*4+2)*4]
  7643. movss xmm4, [esi+0*4]
  7644. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7645. mulps xmm3, xmm4
  7646. movlps xmm5, [edi+(1*4+0)*4]
  7647. movhps xmm5, [edi+(1*4+2)*4]
  7648. movss xmm6, [esi+1*4]
  7649. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7650. mulps xmm5, xmm6
  7651. addps xmm3, xmm5
  7652. movlps xmm4, [edi+(2*4+0)*4]
  7653. movhps xmm4, [edi+(2*4+2)*4]
  7654. movss xmm6, [esi+2*4]
  7655. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7656. mulps xmm4, xmm6
  7657. addps xmm3, xmm4
  7658. movlps xmm5, [edi+(3*4+0)*4]
  7659. movhps xmm5, [edi+(3*4+2)*4]
  7660. movss xmm6, [esi+3*4]
  7661. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7662. mulps xmm5, xmm6
  7663. addps xmm3, xmm5
  7664. movlps xmm4, [edi+(4*4+0)*4]
  7665. movhps xmm4, [edi+(4*4+2)*4]
  7666. movss xmm6, [esi+4*4]
  7667. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7668. mulps xmm4, xmm6
  7669. addps xmm3, xmm4
  7670. movlps xmm5, [edi+(5*4+0)*4]
  7671. movhps xmm5, [edi+(5*4+2)*4]
  7672. movss xmm6, [esi+5*4]
  7673. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7674. mulps xmm5, xmm6
  7675. addps xmm3, xmm5
  7676. STORE4( 0, xmm3, xmm7 )
  7677. }
  7678. return;
  7679. }
  7680. case 5: { // 6x5 * 6x1
  7681. __asm {
  7682. mov esi, vPtr
  7683. mov edi, mPtr
  7684. mov eax, dstPtr
  7685. movlps xmm6, [edi+(0*5+0)*4]
  7686. movhps xmm6, [edi+(0*5+2)*4]
  7687. movss xmm0, [esi+0*4]
  7688. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  7689. mulps xmm6, xmm0
  7690. movlps xmm7, [edi+(1*5+0)*4]
  7691. movhps xmm7, [edi+(1*5+2)*4]
  7692. movss xmm1, [esi+1*4]
  7693. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  7694. mulps xmm7, xmm1
  7695. addps xmm6, xmm7
  7696. movlps xmm7, [edi+(2*5+0)*4]
  7697. movhps xmm7, [edi+(2*5+2)*4]
  7698. movss xmm2, [esi+2*4]
  7699. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
  7700. mulps xmm7, xmm2
  7701. addps xmm6, xmm7
  7702. movlps xmm7, [edi+(3*5+0)*4]
  7703. movhps xmm7, [edi+(3*5+2)*4]
  7704. movss xmm3, [esi+3*4]
  7705. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  7706. mulps xmm7, xmm3
  7707. addps xmm6, xmm7
  7708. movlps xmm7, [edi+(4*5+0)*4]
  7709. movhps xmm7, [edi+(4*5+2)*4]
  7710. movss xmm4, [esi+4*4]
  7711. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7712. mulps xmm7, xmm4
  7713. addps xmm6, xmm7
  7714. movlps xmm7, [edi+(5*5+0)*4]
  7715. movhps xmm7, [edi+(5*5+2)*4]
  7716. movss xmm5, [esi+5*4]
  7717. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
  7718. mulps xmm7, xmm5
  7719. addps xmm6, xmm7
  7720. STORE4( 0, xmm6, xmm7 )
  7721. movss xmm6, [edi+(0*5+4)*4]
  7722. mulss xmm6, xmm0
  7723. movss xmm7, [edi+(1*5+4)*4]
  7724. mulss xmm7, xmm1
  7725. addss xmm6, xmm7
  7726. movss xmm7, [edi+(2*5+4)*4]
  7727. mulss xmm7, xmm2
  7728. addss xmm6, xmm7
  7729. movss xmm7, [edi+(3*5+4)*4]
  7730. mulss xmm7, xmm3
  7731. addss xmm6, xmm7
  7732. movss xmm7, [edi+(4*5+4)*4]
  7733. mulss xmm7, xmm4
  7734. addss xmm6, xmm7
  7735. movss xmm7, [edi+(5*5+4)*4]
  7736. mulss xmm7, xmm5
  7737. addss xmm6, xmm7
  7738. STORE1( 16, xmm6, xmm7 )
  7739. }
  7740. return;
  7741. }
  7742. case 6: { // 6x6 * 6x1
  7743. __asm {
  7744. mov esi, vPtr
  7745. mov edi, mPtr
  7746. mov eax, dstPtr
  7747. movlps xmm0, [esi+0*4]
  7748. movlps xmm1, [esi+2*4]
  7749. movlps xmm2, [esi+4*4]
  7750. movaps xmm3, xmm0
  7751. shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
  7752. mulps xmm3, [edi+(0*6+0)*4]
  7753. movlps xmm5, [edi+(1*6+0)*4]
  7754. movhps xmm5, [edi+(1*6+2)*4]
  7755. movaps xmm6, xmm0
  7756. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7757. mulps xmm5, xmm6
  7758. addps xmm3, xmm5
  7759. movaps xmm6, xmm1
  7760. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7761. mulps xmm6, [edi+(2*6+0)*4]
  7762. addps xmm3, xmm6
  7763. movaps xmm6, xmm1
  7764. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7765. movlps xmm5, [edi+(3*6+0)*4]
  7766. movhps xmm5, [edi+(3*6+2)*4]
  7767. mulps xmm5, xmm6
  7768. addps xmm3, xmm5
  7769. movaps xmm6, xmm2
  7770. shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
  7771. mulps xmm6, [edi+(4*6+0)*4]
  7772. addps xmm3, xmm6
  7773. movaps xmm6, xmm2
  7774. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7775. movlps xmm5, [edi+(5*6+0)*4]
  7776. movhps xmm5, [edi+(5*6+2)*4]
  7777. mulps xmm5, xmm6
  7778. addps xmm3, xmm5
  7779. STORE4( 0, xmm3, xmm7 )
  7780. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
  7781. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
  7782. shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
  7783. movlps xmm3, [edi+(0*6+4)*4]
  7784. movhps xmm3, [edi+(1*6+4)*4]
  7785. mulps xmm3, xmm0
  7786. movlps xmm4, [edi+(2*6+4)*4]
  7787. movhps xmm4, [edi+(3*6+4)*4]
  7788. mulps xmm4, xmm1
  7789. addps xmm3, xmm4
  7790. movlps xmm5, [edi+(4*6+4)*4]
  7791. movhps xmm5, [edi+(5*6+4)*4]
  7792. mulps xmm5, xmm2
  7793. addps xmm3, xmm5
  7794. movhlps xmm4, xmm3
  7795. addps xmm3, xmm4
  7796. STORE2LO( 16, xmm3, xmm7 )
  7797. }
  7798. return;
  7799. }
  7800. default: {
  7801. for ( int i = 0; i < numColumns; i++ ) {
  7802. dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  7803. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
  7804. mPtr++;
  7805. }
  7806. return;
  7807. }
  7808. }
  7809. break;
  7810. default:
  7811. int numRows = mat.GetNumRows();
  7812. for ( int i = 0; i < numColumns; i++ ) {
  7813. mPtr = mat.ToFloatPtr() + i;
  7814. float sum = mPtr[0] * vPtr[0];
  7815. for ( int j = 1; j < numRows; j++ ) {
  7816. mPtr += numColumns;
  7817. sum += mPtr[0] * vPtr[j];
  7818. }
  7819. dstPtr[i] STOREC sum;
  7820. }
  7821. break;
  7822. }
  7823. #undef STOREC
  7824. #undef STORE4
  7825. #undef STORE2HI
  7826. #undef STORE2LO
  7827. #undef STORE1
  7828. }
  7829. /*
  7830. ============
  7831. idSIMD_SSE::MatX_MultiplyMatX
  7832. optimizes the following matrix multiplications:
  7833. NxN * Nx6
  7834. 6xN * Nx6
  7835. Nx6 * 6xN
  7836. 6x6 * 6xN
  7837. with N in the range [1-6].
7838. The hot-cache clock cycle counts are generally better for the SIMD version than for
7839. the FPU version, at times up to 40% fewer clock cycles on a P3. In practice, however,
7840. the results are poor, probably due to memory access.
  7841. ============
  7842. */
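// A scalar reference (sketch only): dst = m1 * m2 with m1 being k x n and m2
// being n x l, all matrices stored row-major. The hand-unrolled paths below
// only fire for the shapes listed above; everything else runs the equivalent
// of:
//
//   for ( i = 0; i < k; i++ ) {
//       for ( j = 0; j < l; j++ ) {
//           float sum = 0.0f;
//           for ( int x = 0; x < n; x++ ) {
//               sum += m1Ptr[i * n + x] * m2Ptr[x * l + j];
//           }
//           *dstPtr++ = sum;
//       }
//   }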
  7843. void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
  7844. int i, j, k, l, n;
  7845. float *dstPtr;
  7846. const float *m1Ptr, *m2Ptr;
  7847. double sum;
  7848. assert( m1.GetNumColumns() == m2.GetNumRows() );
  7849. dstPtr = dst.ToFloatPtr();
  7850. m1Ptr = m1.ToFloatPtr();
  7851. m2Ptr = m2.ToFloatPtr();
  7852. k = m1.GetNumRows();
  7853. l = m2.GetNumColumns();
  7854. n = m1.GetNumColumns();
  7855. switch( n ) {
  7856. case 1: {
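// Note: !(l^6) below is a terse spelling of ( l == 6 ); l^6 is zero, and the
// logical NOT therefore true, exactly when l equals 6. The same idiom recurs
// in every case of this switch.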
  7857. if ( !(l^6) ) {
  7858. switch( k ) {
  7859. case 1: { // 1x1 * 1x6, no precision loss compared to FPU version
  7860. __asm {
  7861. mov esi, m2Ptr
  7862. mov edi, m1Ptr
  7863. mov eax, dstPtr
  7864. movss xmm0, [edi]
  7865. shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
  7866. movaps xmm1, [esi]
  7867. mulps xmm1, xmm0
  7868. movaps [eax], xmm1
  7869. movlps xmm2, [esi+16]
  7870. mulps xmm2, xmm0
  7871. movlps [eax+16], xmm2
  7872. }
  7873. return;
  7874. }
  7875. case 6: { // 6x1 * 1x6, no precision loss compared to FPU version
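// xmm0 holds m1[0..3] while xmm1/xmm2 hold rotated copies ( m1[4],m1[5],
// m1[0],m1[1] and m1[2..5] ), so each group of three stores below covers
// twelve consecutive output floats while m2 entries are broadcast a whole or
// half register at a time.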
  7876. __asm {
  7877. mov esi, m2Ptr
  7878. mov edi, m1Ptr
  7879. mov eax, dstPtr
  7880. xorps xmm1, xmm1
  7881. movaps xmm0, [edi]
  7882. movlps xmm1, [edi+16]
  7883. movlhps xmm1, xmm0
  7884. movhlps xmm2, xmm0
  7885. movlhps xmm2, xmm1
  7886. // row 0 and 1
  7887. movaps xmm3, [esi]
  7888. movaps xmm4, xmm3
  7889. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7890. movaps xmm5, xmm3
  7891. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
  7892. movaps xmm6, xmm3
  7893. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
  7894. mulps xmm4, xmm0
  7895. mulps xmm5, xmm1
  7896. mulps xmm6, xmm2
  7897. movaps [eax], xmm4
  7898. movaps [eax+16], xmm5
  7899. movaps [eax+32], xmm6
  7900. // row 2 and 3
  7901. movaps xmm4, xmm3
  7902. shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
  7903. movaps xmm5, xmm3
  7904. shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
  7905. shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
  7906. mulps xmm4, xmm0
  7907. mulps xmm5, xmm1
  7908. mulps xmm3, xmm2
  7909. movaps [eax+48], xmm4
  7910. movaps [eax+64], xmm5
  7911. movaps [eax+80], xmm3
  7912. // row 4 and 5
  7913. movlps xmm3, [esi+16]
  7914. movaps xmm4, xmm3
  7915. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  7916. movaps xmm5, xmm3
  7917. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
  7918. shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
  7919. mulps xmm4, xmm0
  7920. mulps xmm5, xmm1
  7921. mulps xmm3, xmm2
  7922. movaps [eax+96], xmm4
  7923. movaps [eax+112], xmm5
  7924. movaps [eax+128], xmm3
  7925. }
  7926. return;
  7927. }
  7928. }
  7929. }
  7930. for ( i = 0; i < k; i++ ) {
  7931. m2Ptr = m2.ToFloatPtr();
  7932. for ( j = 0; j < l; j++ ) {
  7933. *dstPtr++ = m1Ptr[0] * m2Ptr[0];
  7934. m2Ptr++;
  7935. }
  7936. m1Ptr++;
  7937. }
  7938. break;
  7939. }
  7940. case 2: {
  7941. if ( !(l^6) ) {
  7942. switch( k ) {
  7943. case 2: { // 2x2 * 2x6
  7944. #define MUL_Nx2_2x6_INIT \
  7945. __asm mov esi, m2Ptr \
  7946. __asm mov edi, m1Ptr \
  7947. __asm mov eax, dstPtr \
  7948. __asm movaps xmm0, [esi] \
  7949. __asm movlps xmm1, [esi+16] \
  7950. __asm movhps xmm1, [esi+40] \
  7951. __asm movlps xmm2, [esi+24] \
  7952. __asm movhps xmm2, [esi+32]
  7953. #define MUL_Nx2_2x6_ROW2( row ) \
  7954. __asm movaps xmm3, [edi+row*16] \
  7955. __asm movaps xmm5, xmm0 \
  7956. __asm movaps xmm4, xmm3 \
  7957. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  7958. __asm mulps xmm5, xmm4 \
  7959. __asm movaps xmm4, xmm3 \
  7960. __asm movaps xmm6, xmm2 \
  7961. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
  7962. __asm mulps xmm6, xmm4 \
  7963. __asm addps xmm5, xmm6 \
  7964. __asm movaps [eax+row*48], xmm5 \
  7965. __asm movaps xmm4, xmm3 \
  7966. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
  7967. __asm movaps xmm7, xmm1 \
  7968. __asm mulps xmm7, xmm4 \
  7969. __asm movaps xmm4, xmm3 \
  7970. __asm movaps xmm5, xmm0 \
  7971. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
  7972. __asm mulps xmm5, xmm4 \
  7973. __asm movaps xmm4, xmm3 \
  7974. __asm movaps xmm6, xmm2 \
  7975. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
  7976. __asm mulps xmm6, xmm4 \
  7977. __asm addps xmm5, xmm6 \
  7978. __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
  7979. __asm movaps xmm6, xmm1 \
  7980. __asm mulps xmm6, xmm3 \
  7981. __asm movaps xmm4, xmm7 \
  7982. __asm movlhps xmm7, xmm6 \
  7983. __asm movhlps xmm6, xmm4 \
  7984. __asm addps xmm6, xmm7 \
  7985. __asm movlps [eax+row*48+16], xmm6 \
  7986. __asm movlps [eax+row*48+24], xmm5 \
  7987. __asm movhps [eax+row*48+32], xmm5 \
  7988. __asm movhps [eax+row*48+40], xmm6
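// Each MUL_Nx2_2x6_ROW2 invocation consumes two rows of m1 (edi+row*16 reads
// four floats) and emits two six-float result rows (48 bytes at eax+row*48),
// so the 2x2 case takes one invocation and the 6x2 case three.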
  7989. MUL_Nx2_2x6_INIT
  7990. MUL_Nx2_2x6_ROW2( 0 )
  7991. return;
  7992. }
  7993. case 6: { // 6x2 * 2x6
  7994. MUL_Nx2_2x6_INIT
  7995. MUL_Nx2_2x6_ROW2( 0 )
  7996. MUL_Nx2_2x6_ROW2( 1 )
  7997. MUL_Nx2_2x6_ROW2( 2 )
  7998. return;
  7999. }
  8000. }
  8001. }
  8002. for ( i = 0; i < k; i++ ) {
  8003. m2Ptr = m2.ToFloatPtr();
  8004. for ( j = 0; j < l; j++ ) {
  8005. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
  8006. m2Ptr++;
  8007. }
  8008. m1Ptr += 2;
  8009. }
  8010. break;
  8011. }
  8012. case 3: {
  8013. if ( !(l^6) ) {
  8014. switch( k ) {
  8015. case 3: { // 3x3 * 3x6
  8016. __asm {
  8017. mov esi, m2Ptr
  8018. mov edi, m1Ptr
  8019. mov eax, dstPtr
  8020. movaps xmm5, xmmword ptr [esi]
  8021. movlps xmm6, qword ptr [esi+24]
  8022. movhps xmm6, qword ptr [esi+32]
  8023. movaps xmm7, xmmword ptr [esi+48]
  8024. movss xmm0, dword ptr [edi]
  8025. shufps xmm0, xmm0, 0
  8026. mulps xmm0, xmm5
  8027. movss xmm1, dword ptr [edi+4]
  8028. shufps xmm1, xmm1, 0
  8029. mulps xmm1, xmm6
  8030. movss xmm2, dword ptr [edi+8]
  8031. shufps xmm2, xmm2, 0
  8032. mulps xmm2, xmm7
  8033. addps xmm0, xmm1
  8034. addps xmm0, xmm2
  8035. movaps xmmword ptr [eax], xmm0
  8036. movss xmm3, dword ptr [edi+12]
  8037. shufps xmm3, xmm3, 0
  8038. mulps xmm3, xmm5
  8039. movss xmm4, dword ptr [edi+16]
  8040. shufps xmm4, xmm4, 0
  8041. mulps xmm4, xmm6
  8042. movss xmm0, dword ptr [edi+20]
  8043. shufps xmm0, xmm0, 0
  8044. mulps xmm0, xmm7
  8045. addps xmm3, xmm4
  8046. addps xmm0, xmm3
  8047. movlps qword ptr [eax+24], xmm0
  8048. movhps qword ptr [eax+32], xmm0
  8049. movss xmm1, dword ptr [edi+24]
  8050. shufps xmm1, xmm1, 0
  8051. mulps xmm1, xmm5
  8052. movss xmm2, dword ptr [edi+28]
  8053. shufps xmm2, xmm2, 0
  8054. mulps xmm2, xmm6
  8055. movss xmm3, dword ptr [edi+32]
  8056. shufps xmm3, xmm3, 0
  8057. mulps xmm3, xmm7
  8058. addps xmm1, xmm2
  8059. addps xmm1, xmm3
  8060. movaps xmmword ptr [eax+48], xmm1
  8061. movlps xmm5, qword ptr [esi+16]
  8062. movlps xmm6, qword ptr [esi+40]
  8063. movlps xmm7, qword ptr [esi+64]
  8064. shufps xmm5, xmm5, 0x44
  8065. shufps xmm6, xmm6, 0x44
  8066. shufps xmm7, xmm7, 0x44
  8067. movaps xmm3, xmmword ptr [edi]
  8068. movlps xmm4, qword ptr [edi+16]
  8069. movaps xmm0, xmm3
  8070. shufps xmm0, xmm0, 0xF0
  8071. mulps xmm0, xmm5
  8072. movaps xmm1, xmm3
  8073. shufps xmm1, xmm4, 0x05
  8074. mulps xmm1, xmm6
  8075. shufps xmm3, xmm4, 0x5A
  8076. mulps xmm3, xmm7
  8077. addps xmm1, xmm0
  8078. addps xmm1, xmm3
  8079. movlps qword ptr [eax+16], xmm1
  8080. movhps qword ptr [eax+40], xmm1
  8081. movss xmm0, dword ptr [edi+24]
  8082. shufps xmm0, xmm0, 0
  8083. mulps xmm0, xmm5
  8084. movss xmm2, dword ptr [edi+28]
  8085. shufps xmm2, xmm2, 0
  8086. mulps xmm2, xmm6
  8087. movss xmm4, dword ptr [edi+32]
  8088. shufps xmm4, xmm4, 0
  8089. mulps xmm4, xmm7
  8090. addps xmm0, xmm2
  8091. addps xmm0, xmm4
  8092. movlps qword ptr [eax+64], xmm0
  8093. }
  8094. return;
  8095. }
  8096. case 6: { // 6x3 * 3x6
  8097. #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
  8098. __asm mov esi, m2Ptr \
  8099. __asm mov edi, m1Ptr \
  8100. __asm mov eax, dstPtr \
  8101. __asm movlps xmm0, [esi+ 0*4] \
  8102. __asm movhps xmm0, [esi+ 2*4] \
  8103. __asm movlps xmm1, [esi+ 6*4] \
  8104. __asm movhps xmm1, [esi+ 8*4] \
  8105. __asm movlps xmm2, [esi+12*4] \
  8106. __asm movhps xmm2, [esi+14*4]
  8107. #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
  8108. __asm movss xmm3, [edi+(row*3+0)*4] \
  8109. __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8110. __asm mulps xmm3, xmm0 \
  8111. __asm movss xmm4, [edi+(row*3+1)*4] \
  8112. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8113. __asm mulps xmm4, xmm1 \
  8114. __asm addps xmm3, xmm4 \
  8115. __asm movss xmm5, [edi+(row*3+2)*4] \
  8116. __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8117. __asm mulps xmm5, xmm2 \
  8118. __asm addps xmm3, xmm5 \
  8119. __asm movlps [eax+(row*6+0)*4], xmm3 \
  8120. __asm movhps [eax+(row*6+2)*4], xmm3
  8121. #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
  8122. __asm movlps xmm0, [esi+ 4*4] \
  8123. __asm movlps xmm1, [esi+10*4] \
  8124. __asm movlps xmm2, [esi+16*4] \
  8125. __asm shufps xmm0, xmm0, 0x44 \
  8126. __asm shufps xmm1, xmm1, 0x44 \
  8127. __asm shufps xmm2, xmm2, 0x44 \
  8128. __asm movlps xmm3, [edi+0*4] \
  8129. __asm movhps xmm3, [edi+2*4] \
  8130. __asm movaps xmm4, xmm3 \
  8131. __asm movaps xmm5, xmm3 \
  8132. __asm shufps xmm3, xmm3, 0xF0 \
  8133. __asm mulps xmm3, xmm0 \
  8134. __asm movlps xmm6, [edi+4*4] \
  8135. __asm movhps xmm6, [edi+6*4] \
  8136. __asm shufps xmm4, xmm6, 0x05 \
  8137. __asm mulps xmm4, xmm1 \
  8138. __asm addps xmm3, xmm4 \
  8139. __asm shufps xmm5, xmm6, 0x5A \
  8140. __asm mulps xmm5, xmm2 \
  8141. __asm addps xmm3, xmm5 \
  8142. __asm movlps [eax+4*4], xmm3 \
  8143. __asm movhps [eax+10*4], xmm3 \
  8144. __asm movaps xmm5, xmm6 \
  8145. __asm movlps xmm3, [edi+8*4] \
  8146. __asm movhps xmm3, [edi+10*4] \
  8147. __asm movaps xmm4, xmm3 \
  8148. __asm shufps xmm5, xmm3, 0x5A \
  8149. __asm mulps xmm5, xmm0 \
  8150. __asm shufps xmm6, xmm3, 0xAF \
  8151. __asm mulps xmm6, xmm1 \
  8152. __asm addps xmm5, xmm6 \
  8153. __asm shufps xmm4, xmm4, 0xF0 \
  8154. __asm mulps xmm4, xmm2 \
  8155. __asm addps xmm4, xmm5 \
  8156. __asm movlps [eax+16*4], xmm4 \
  8157. __asm movhps [eax+22*4], xmm4 \
  8158. __asm movlps xmm6, [edi+12*4] \
  8159. __asm movhps xmm6, [edi+14*4] \
  8160. __asm movaps xmm5, xmm6 \
  8161. __asm movaps xmm4, xmm6 \
  8162. __asm shufps xmm6, xmm6, 0xF0 \
  8163. __asm mulps xmm6, xmm0 \
  8164. __asm movlps xmm3, [edi+16*4] \
  8165. __asm shufps xmm5, xmm3, 0x05 \
  8166. __asm mulps xmm5, xmm1 \
  8167. __asm addps xmm5, xmm6 \
  8168. __asm shufps xmm4, xmm3, 0x5A \
  8169. __asm mulps xmm4, xmm2 \
  8170. __asm addps xmm4, xmm5 \
  8171. __asm movlps [eax+28*4], xmm4 \
  8172. __asm movhps [eax+34*4], xmm4
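// The six result columns are split 4+2: the FIRST4COLUMNS macro produces
// columns 0-3 of one result row per call with packed 4-wide ops, and the
// LAST2COLUMNS macro fills columns 4-5 of all six rows, pairing two rows per
// register. The raw immediates (0x44, 0xF0, 0x05, 0x5A, 0xAF) are shufps
// masks written directly instead of through R_SHUFFLEPS.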
  8173. MUL_Nx3_3x6_FIRST4COLUMNS_INIT
  8174. MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
  8175. MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
  8176. MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
  8177. MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
  8178. MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
  8179. MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
  8180. MUL_Nx3_3x6_LAST2COLUMNS_ROW6
  8181. return;
  8182. }
  8183. }
  8184. }
  8185. for ( i = 0; i < k; i++ ) {
  8186. m2Ptr = m2.ToFloatPtr();
  8187. for ( j = 0; j < l; j++ ) {
  8188. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
  8189. m2Ptr++;
  8190. }
  8191. m1Ptr += 3;
  8192. }
  8193. break;
  8194. }
  8195. case 4: {
  8196. if ( !(l^6) ) {
  8197. switch( k ) {
  8198. case 4: { // 4x4 * 4x6
  8199. #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
  8200. __asm mov esi, m2Ptr \
  8201. __asm mov edi, m1Ptr \
  8202. __asm mov eax, dstPtr \
  8203. __asm movlps xmm0, [esi+ 0*4] \
  8204. __asm movhps xmm0, [esi+ 2*4] \
  8205. __asm movlps xmm1, [esi+ 6*4] \
  8206. __asm movhps xmm1, [esi+ 8*4] \
  8207. __asm movlps xmm2, [esi+12*4] \
  8208. __asm movhps xmm2, [esi+14*4] \
  8209. __asm movlps xmm3, [esi+18*4] \
  8210. __asm movhps xmm3, [esi+20*4]
  8211. #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
  8212. __asm movss xmm4, [edi+row*16+0*4] \
  8213. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8214. __asm mulps xmm4, xmm0 \
  8215. __asm movss xmm5, [edi+row*16+1*4] \
  8216. __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8217. __asm mulps xmm5, xmm1 \
  8218. __asm addps xmm4, xmm5 \
  8219. __asm movss xmm6, [edi+row*16+2*4] \
  8220. __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8221. __asm mulps xmm6, xmm2 \
  8222. __asm addps xmm4, xmm6 \
  8223. __asm movss xmm7, [edi+row*16+3*4] \
  8224. __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
  8225. __asm mulps xmm7, xmm3 \
  8226. __asm addps xmm4, xmm7 \
  8227. __asm movlps [eax+row*24+0], xmm4 \
  8228. __asm movhps [eax+row*24+8], xmm4
  8229. #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
  8230. __asm movlps xmm0, [esi+ 4*4] \
  8231. __asm movlps xmm1, [esi+10*4] \
  8232. __asm movlps xmm2, [esi+16*4] \
  8233. __asm movlps xmm3, [esi+22*4] \
  8234. __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
  8235. __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
  8236. __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
  8237. __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
  8238. #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
  8239. __asm movlps xmm7, [edi+row*32+ 0*4] \
  8240. __asm movhps xmm7, [edi+row*32+ 4*4] \
  8241. __asm movaps xmm6, xmm7 \
  8242. __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
  8243. __asm mulps xmm6, xmm0 \
  8244. __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
  8245. __asm mulps xmm7, xmm1 \
  8246. __asm addps xmm6, xmm7 \
  8247. __asm movlps xmm4, [edi+row*32+ 2*4] \
  8248. __asm movhps xmm4, [edi+row*32+ 6*4] \
  8249. __asm movaps xmm5, xmm4 \
  8250. __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
  8251. __asm mulps xmm5, xmm2 \
  8252. __asm addps xmm6, xmm5 \
  8253. __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
  8254. __asm mulps xmm4, xmm3 \
  8255. __asm addps xmm6, xmm4 \
  8256. __asm movlps [eax+row*48+ 4*4], xmm6 \
  8257. __asm movhps [eax+row*48+10*4], xmm6
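// Same 4+2 column split as the 3x6 macros above, now with four m1 columns per
// row; the LAST2COLUMNS_ROW2 variant handles two m1 rows per invocation.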
  8258. MUL_Nx4_4x6_FIRST4COLUMNS_INIT
  8259. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
  8260. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
  8261. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
  8262. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
  8263. MUL_Nx4_4x6_LAST2COLUMNS_INIT
  8264. MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
  8265. MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
  8266. return;
  8267. }
  8268. case 6: { // 6x4 * 4x6
  8269. MUL_Nx4_4x6_FIRST4COLUMNS_INIT
  8270. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
  8271. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
  8272. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
  8273. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
  8274. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
  8275. MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
  8276. MUL_Nx4_4x6_LAST2COLUMNS_INIT
  8277. MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
  8278. MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
  8279. MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
  8280. return;
  8281. }
  8282. }
  8283. }
  8284. for ( i = 0; i < k; i++ ) {
  8285. m2Ptr = m2.ToFloatPtr();
  8286. for ( j = 0; j < l; j++ ) {
  8287. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
  8288. m1Ptr[3] * m2Ptr[3*l];
  8289. m2Ptr++;
  8290. }
  8291. m1Ptr += 4;
  8292. }
  8293. break;
  8294. }
  8295. case 5: {
  8296. if ( !(l^6) ) {
				switch( k ) {
					case 5: {	// 5x5 * 5x6
						#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
							__asm mov esi, m2Ptr \
							__asm mov edi, m1Ptr \
							__asm mov eax, dstPtr \
							__asm movlps xmm0, [esi+ 0*4] \
							__asm movhps xmm0, [esi+ 2*4] \
							__asm movlps xmm1, [esi+ 6*4] \
							__asm movhps xmm1, [esi+ 8*4] \
							__asm movlps xmm2, [esi+12*4] \
							__asm movhps xmm2, [esi+14*4] \
							__asm movlps xmm3, [esi+18*4] \
							__asm movhps xmm3, [esi+20*4] \
							__asm movlps xmm4, [esi+24*4] \
							__asm movhps xmm4, [esi+26*4]
						#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
							__asm movss xmm6, [edi+row*20+0*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm0 \
							__asm movss xmm5, [edi+row*20+1*4] \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm5, xmm1 \
							__asm addps xmm6, xmm5 \
							__asm movss xmm5, [edi+row*20+2*4] \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm5, xmm2 \
							__asm addps xmm6, xmm5 \
							__asm movss xmm5, [edi+row*20+3*4] \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm5, xmm3 \
							__asm addps xmm6, xmm5 \
							__asm movss xmm5, [edi+row*20+4*4] \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm5, xmm4 \
							__asm addps xmm6, xmm5 \
							__asm movlps [eax+row*24+0], xmm6 \
							__asm movhps [eax+row*24+8], xmm6
						#define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
							__asm movlps xmm0, [esi+ 4*4] \
							__asm movlps xmm1, [esi+10*4] \
							__asm movlps xmm2, [esi+16*4] \
							__asm movlps xmm3, [esi+22*4] \
							__asm movlps xmm4, [esi+28*4] \
							__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
						#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
							__asm movlps xmm7, [edi+row*40+ 0*4] \
							__asm movhps xmm7, [edi+row*40+ 6*4] \
							__asm movaps xmm6, xmm7 \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
							__asm mulps xmm6, xmm0 \
							__asm movaps xmm5, xmm7 \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
							__asm mulps xmm5, xmm1 \
							__asm addps xmm6, xmm5 \
							__asm movlps xmm7, [edi+row*40+ 2*4] \
							__asm movhps xmm7, [edi+row*40+ 8*4] \
							__asm movaps xmm5, xmm7 \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
							__asm mulps xmm5, xmm2 \
							__asm addps xmm6, xmm5 \
							__asm movaps xmm5, xmm7 \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
							__asm mulps xmm5, xmm3 \
							__asm addps xmm6, xmm5 \
							__asm movlps xmm5, [edi+row*40+ 4*4] \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm5, xmm4 \
							__asm addps xmm6, xmm5 \
							__asm movlps [eax+row*48+ 4*4], xmm6 \
							__asm movhps [eax+row*48+10*4], xmm6
						#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \
							__asm movlps xmm6, [edi+20*4+0*4] \
							__asm unpcklps xmm6, xmm6 \
							__asm mulps xmm6, xmm0 \
							__asm movlps xmm5, [edi+20*4+2*4] \
							__asm unpcklps xmm5, xmm5 \
							__asm mulps xmm5, xmm2 \
							__asm addps xmm6, xmm5 \
							__asm movss xmm5, [edi+20*4+4*4] \
							__asm unpcklps xmm5, xmm5 \
							__asm mulps xmm5, xmm4 \
							__asm addps xmm6, xmm5 \
							__asm movhlps xmm7, xmm6 \
							__asm addps xmm6, xmm7 \
							__asm movlps [eax+row*24+4*4], xmm6
						MUL_Nx5_5x6_FIRST4COLUMNS_INIT
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
						MUL_Nx5_5x6_LAST2COLUMNS_INIT
						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
						MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
						return;
					}
					case 6: {	// 6x5 * 5x6
						MUL_Nx5_5x6_FIRST4COLUMNS_INIT
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
						MUL_Nx5_5x6_LAST2COLUMNS_INIT
						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
						return;
					}
				}
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
								m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
					m2Ptr++;
				}
				m1Ptr += 5;
			}
			break;
		}
		case 6: {
			switch( k ) {
				case 1: {
					if ( !(l^1) ) {		// 1x6 * 6x1
						dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
									m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
						return;
					}
					break;
				}
				case 2: {
					if ( !(l^2) ) {		// 2x6 * 6x2
						#define MUL_Nx6_6x2_INIT \
							__asm mov esi, m2Ptr \
							__asm mov edi, m1Ptr \
							__asm mov eax, dstPtr \
							__asm movaps xmm0, [esi] \
							__asm movaps xmm1, [esi+16] \
							__asm movaps xmm2, [esi+32]
						#define MUL_Nx6_6x2_ROW2( row ) \
							__asm movaps xmm7, [edi+row*48+0*4] \
							__asm movaps xmm6, xmm7 \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm7, xmm0 \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm movaps xmm6, [edi+row*48+4*4] \
							__asm movaps xmm5, xmm6 \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
							__asm mulps xmm5, xmm0 \
							__asm movaps xmm6, [edi+row*48+24+2*4] \
							__asm movaps xmm4, xmm6 \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm5, xmm6 \
							__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
							__asm mulps xmm4, xmm2 \
							__asm addps xmm5, xmm4 \
							__asm movaps xmm4, xmm5 \
							__asm movhlps xmm5, xmm7 \
							__asm movlhps xmm7, xmm4 \
							__asm addps xmm7, xmm5 \
							__asm movaps [eax+row*16], xmm7
						MUL_Nx6_6x2_INIT
						MUL_Nx6_6x2_ROW2( 0 )
						return;
					}
					break;
				}
				case 3: {
					if ( !(l^3) ) {		// 3x6 * 6x3
						#define MUL_Nx6_6x3_INIT \
							__asm mov esi, m2Ptr \
							__asm mov edi, m1Ptr \
							__asm mov eax, dstPtr \
							__asm movss xmm0, [esi+ 0*4] \
							__asm movhps xmm0, [esi+ 1*4] \
							__asm movss xmm1, [esi+ 3*4] \
							__asm movhps xmm1, [esi+ 4*4] \
							__asm movss xmm2, [esi+ 6*4] \
							__asm movhps xmm2, [esi+ 7*4] \
							__asm movss xmm3, [esi+ 9*4] \
							__asm movhps xmm3, [esi+10*4] \
							__asm movss xmm4, [esi+12*4] \
							__asm movhps xmm4, [esi+13*4] \
							__asm movss xmm5, [esi+15*4] \
							__asm movhps xmm5, [esi+16*4]
						#define MUL_Nx6_6x3_ROW( row ) \
							__asm movss xmm7, [edi+row*24+0] \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm7, xmm0 \
							__asm movss xmm6, [edi+row*24+4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+8] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+12] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm3 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+16] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm4 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+20] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm5 \
							__asm addps xmm7, xmm6 \
							__asm movss [eax+row*12+0], xmm7 \
							__asm movhps [eax+row*12+4], xmm7
						MUL_Nx6_6x3_INIT
						MUL_Nx6_6x3_ROW( 0 )
						MUL_Nx6_6x3_ROW( 1 )
						MUL_Nx6_6x3_ROW( 2 )
						return;
					}
					break;
				}
				case 4: {
					if ( !(l^4) ) {		// 4x6 * 6x4
						#define MUL_Nx6_6x4_INIT \
							__asm mov esi, m2Ptr \
							__asm mov edi, m1Ptr \
							__asm mov eax, dstPtr \
							__asm movaps xmm0, [esi] \
							__asm movaps xmm1, [esi+16] \
							__asm movaps xmm2, [esi+32] \
							__asm movaps xmm3, [esi+48] \
							__asm movaps xmm4, [esi+64] \
							__asm movaps xmm5, [esi+80]
						#define MUL_Nx6_6x4_ROW( row ) \
							__asm movss xmm7, [edi+row*24+0] \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm7, xmm0 \
							__asm movss xmm6, [edi+row*24+4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+8] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+12] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm3 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+16] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm4 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+row*24+20] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm5 \
							__asm addps xmm7, xmm6 \
							__asm movaps [eax+row*16], xmm7
						MUL_Nx6_6x4_INIT
						MUL_Nx6_6x4_ROW( 0 )
						MUL_Nx6_6x4_ROW( 1 )
						MUL_Nx6_6x4_ROW( 2 )
						MUL_Nx6_6x4_ROW( 3 )
						return;
					}
					break;
				}
				case 5: {
					if ( !(l^5) ) {		// 5x6 * 6x5
						#define MUL_Nx6_6x5_INIT \
							__asm mov esi, m2Ptr \
							__asm mov edi, m1Ptr \
							__asm mov eax, dstPtr \
							__asm movaps xmm0, [esi] \
							__asm movlps xmm1, [esi+20] \
							__asm movhps xmm1, [esi+28] \
							__asm movlps xmm2, [esi+40] \
							__asm movhps xmm2, [esi+48] \
							__asm movlps xmm3, [esi+60] \
							__asm movhps xmm3, [esi+68] \
							__asm movaps xmm4, [esi+80] \
							__asm movlps xmm5, [esi+100] \
							__asm movhps xmm5, [esi+108]
						#define MUL_Nx6_6x5_ROW( row ) \
							__asm movss xmm7, [edi+row*24+0] \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm7, xmm0 \
							__asm fld dword ptr [edi+(row*6+0)*4] \
							__asm fmul dword ptr [esi+(4+0*5)*4] \
							__asm movss xmm6, [edi+row*24+4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm fld dword ptr [edi+(row*6+1)*4] \
							__asm fmul dword ptr [esi+(4+1*5)*4] \
							__asm faddp st(1),st \
							__asm movss xmm6, [edi+row*24+8] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm fld dword ptr [edi+(row*6+2)*4] \
							__asm fmul dword ptr [esi+(4+2*5)*4] \
							__asm faddp st(1),st \
							__asm movss xmm6, [edi+row*24+12] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm3 \
							__asm addps xmm7, xmm6 \
							__asm fld dword ptr [edi+(row*6+3)*4] \
							__asm fmul dword ptr [esi+(4+3*5)*4] \
							__asm faddp st(1),st \
							__asm movss xmm6, [edi+row*24+16] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm4 \
							__asm addps xmm7, xmm6 \
							__asm fld dword ptr [edi+(row*6+4)*4] \
							__asm fmul dword ptr [esi+(4+4*5)*4] \
							__asm faddp st(1),st \
							__asm movss xmm6, [edi+row*24+20] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm5 \
							__asm addps xmm7, xmm6 \
							__asm fld dword ptr [edi+(row*6+5)*4] \
							__asm fmul dword ptr [esi+(4+5*5)*4] \
							__asm faddp st(1),st \
							__asm fstp dword ptr [eax+(row*5+4)*4] \
							__asm movlps [eax+row*20], xmm7 \
							__asm movhps [eax+row*20+8], xmm7
						MUL_Nx6_6x5_INIT
						MUL_Nx6_6x5_ROW( 0 )
						MUL_Nx6_6x5_ROW( 1 )
						MUL_Nx6_6x5_ROW( 2 )
						MUL_Nx6_6x5_ROW( 3 )
						MUL_Nx6_6x5_ROW( 4 )
						return;
					}
					break;
				}
				case 6: {
					switch( l ) {
						case 1: {	// 6x6 * 6x1
							__asm {
								mov esi, m2Ptr
								mov edi, m1Ptr
								mov eax, dstPtr
								movlps xmm7, qword ptr [esi]
								movlps xmm6, qword ptr [esi+8]
								shufps xmm7, xmm7, 0x44
								shufps xmm6, xmm6, 0x44
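								// 0x44 = R_SHUFFLEPS( 0, 1, 0, 1 ): replicate the two loaded
								// column entries into both register halves so each mulps
								// below covers two rows of m1 at once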
								movlps xmm0, qword ptr [edi ]
								movhps xmm0, qword ptr [edi+ 24]
								mulps xmm0, xmm7
								movlps xmm3, qword ptr [edi+ 8]
								movhps xmm3, qword ptr [edi+ 32]
								mulps xmm3, xmm6
								movlps xmm1, qword ptr [edi+ 48]
								movhps xmm1, qword ptr [edi+ 72]
								mulps xmm1, xmm7
								movlps xmm2, qword ptr [edi+ 96]
								movhps xmm2, qword ptr [edi+120]
								mulps xmm2, xmm7
								movlps xmm4, qword ptr [edi+ 56]
								movhps xmm4, qword ptr [edi+ 80]
								movlps xmm5, qword ptr [edi+104]
								movhps xmm5, qword ptr [edi+128]
								mulps xmm4, xmm6
								movlps xmm7, qword ptr [esi+16]
								addps xmm0, xmm3
								shufps xmm7, xmm7, 0x44
								mulps xmm5, xmm6
								addps xmm1, xmm4
								movlps xmm3, qword ptr [edi+ 16]
								movhps xmm3, qword ptr [edi+ 40]
								addps xmm2, xmm5
								movlps xmm4, qword ptr [edi+ 64]
								movhps xmm4, qword ptr [edi+ 88]
								mulps xmm3, xmm7
								movlps xmm5, qword ptr [edi+112]
								movhps xmm5, qword ptr [edi+136]
								addps xmm0, xmm3
								mulps xmm4, xmm7
								mulps xmm5, xmm7
								addps xmm1, xmm4
								addps xmm2, xmm5
								movaps xmm6, xmm0
								shufps xmm0, xmm1, 0x88
								shufps xmm6, xmm1, 0xDD
								movaps xmm7, xmm2
								shufps xmm7, xmm2, 0x88
								shufps xmm2, xmm2, 0xDD
								addps xmm0, xmm6
								addps xmm2, xmm7
								movlps [eax], xmm0
								movhps [eax+8], xmm0
								movlps [eax+16], xmm2
							}
							return;
						}
						case 2: {	// 6x6 * 6x2
							MUL_Nx6_6x2_INIT
							MUL_Nx6_6x2_ROW2( 0 )
							MUL_Nx6_6x2_ROW2( 1 )
							MUL_Nx6_6x2_ROW2( 2 )
							return;
						}
						case 3: {	// 6x6 * 6x3
							MUL_Nx6_6x3_INIT
							MUL_Nx6_6x3_ROW( 0 )
							MUL_Nx6_6x3_ROW( 1 )
							MUL_Nx6_6x3_ROW( 2 )
							MUL_Nx6_6x3_ROW( 3 )
							MUL_Nx6_6x3_ROW( 4 )
							MUL_Nx6_6x3_ROW( 5 )
							return;
						}
						case 4: {	// 6x6 * 6x4
							MUL_Nx6_6x4_INIT
							MUL_Nx6_6x4_ROW( 0 )
							MUL_Nx6_6x4_ROW( 1 )
							MUL_Nx6_6x4_ROW( 2 )
							MUL_Nx6_6x4_ROW( 3 )
							MUL_Nx6_6x4_ROW( 4 )
							MUL_Nx6_6x4_ROW( 5 )
							return;
						}
						case 5: {	// 6x6 * 6x5
							MUL_Nx6_6x5_INIT
							MUL_Nx6_6x5_ROW( 0 )
							MUL_Nx6_6x5_ROW( 1 )
							MUL_Nx6_6x5_ROW( 2 )
							MUL_Nx6_6x5_ROW( 3 )
							MUL_Nx6_6x5_ROW( 4 )
							MUL_Nx6_6x5_ROW( 5 )
							return;
						}
						case 6: {	// 6x6 * 6x6
							__asm {
								mov ecx, dword ptr m2Ptr
								movlps xmm3, qword ptr [ecx+72]
								mov edx, dword ptr m1Ptr
								// Loading first 4 columns (upper 4 rows) of m2Ptr.
								movaps xmm0, xmmword ptr [ecx]
								movlps xmm1, qword ptr [ecx+24]
								movhps xmm1, qword ptr [ecx+32]
								movaps xmm2, xmmword ptr [ecx+48]
								movhps xmm3, qword ptr [ecx+80]
								// Calculating first 4 elements in the first row of the destination matrix.
								movss xmm4, dword ptr [edx]
								movss xmm5, dword ptr [edx+4]
								mov eax, dword ptr dstPtr
								shufps xmm4, xmm4, 0
								movss xmm6, dword ptr [edx+8]
								shufps xmm5, xmm5, 0
								movss xmm7, dword ptr [edx+12]
								mulps xmm4, xmm0
								shufps xmm6, xmm6, 0
								shufps xmm7, xmm7, 0
								mulps xmm5, xmm1
								mulps xmm6, xmm2
								addps xmm5, xmm4
								mulps xmm7, xmm3
								addps xmm6, xmm5
								addps xmm7, xmm6
								movaps xmmword ptr [eax], xmm7
								// Calculating first 4 elements in the second row of the destination matrix.
								movss xmm4, dword ptr [edx+24]
								shufps xmm4, xmm4, 0
								mulps xmm4, xmm0
								movss xmm5, dword ptr [edx+28]
								shufps xmm5, xmm5, 0
								mulps xmm5, xmm1
								movss xmm6, dword ptr [edx+32]
								shufps xmm6, xmm6, 0
								movss xmm7, dword ptr [edx+36]
								shufps xmm7, xmm7, 0
								mulps xmm6, xmm2
								mulps xmm7, xmm3
								addps xmm7, xmm6
								addps xmm5, xmm4
								addps xmm7, xmm5
								// Calculating first 4 elements in the third row of the destination matrix.
								movss xmm4, dword ptr [edx+48]
								movss xmm5, dword ptr [edx+52]
								movlps qword ptr [eax+24], xmm7 ; save 2nd
								movhps qword ptr [eax+32], xmm7 ; row
								movss xmm6, dword ptr [edx+56]
								movss xmm7, dword ptr [edx+60]
								shufps xmm4, xmm4, 0
								shufps xmm5, xmm5, 0
								shufps xmm6, xmm6, 0
								shufps xmm7, xmm7, 0
								mulps xmm4, xmm0
								mulps xmm5, xmm1
								mulps xmm6, xmm2
								mulps xmm7, xmm3
								addps xmm5, xmm4
								addps xmm7, xmm6
								addps xmm7, xmm5
								movaps xmmword ptr [eax+48], xmm7
								// Calculating first 4 elements in the fourth row of the destination matrix.
								movss xmm4, dword ptr [edx+72]
								movss xmm5, dword ptr [edx+76]
								movss xmm6, dword ptr [edx+80]
								movss xmm7, dword ptr [edx+84]
								shufps xmm4, xmm4, 0
								shufps xmm5, xmm5, 0
								shufps xmm6, xmm6, 0
								shufps xmm7, xmm7, 0
								mulps xmm4, xmm0
								mulps xmm5, xmm1
								mulps xmm6, xmm2
								mulps xmm7, xmm3
								addps xmm4, xmm5
								addps xmm6, xmm4
								addps xmm7, xmm6
								movlps qword ptr [eax+72], xmm7
								movhps qword ptr [eax+80], xmm7
								// Calculating first 4 elements in the fifth row of the destination matrix.
								movss xmm4, dword ptr [edx+96]
								movss xmm5, dword ptr [edx+100]
								movss xmm6, dword ptr [edx+104]
								movss xmm7, dword ptr [edx+108]
								shufps xmm4, xmm4, 0
								shufps xmm5, xmm5, 0
								shufps xmm6, xmm6, 0
								shufps xmm7, xmm7, 0
								mulps xmm4, xmm0
								mulps xmm5, xmm1
								mulps xmm6, xmm2
								mulps xmm7, xmm3
								addps xmm5, xmm4
								addps xmm7, xmm6
								addps xmm7, xmm5
								movaps xmmword ptr [eax+96], xmm7
								// Calculating first 4 elements in the sixth row of the destination matrix.
								movss xmm4, dword ptr [edx+120]
								movss xmm5, dword ptr [edx+124]
								movss xmm6, dword ptr [edx+128]
								movss xmm7, dword ptr [edx+132]
								shufps xmm4, xmm4, 0
								shufps xmm5, xmm5, 0
								shufps xmm6, xmm6, 0
								shufps xmm7, xmm7, 0
								mulps xmm4, xmm0
								mulps xmm5, xmm1
								mulps xmm6, xmm2
								mulps xmm7, xmm3
								addps xmm4, xmm5
								addps xmm6, xmm4
								addps xmm7, xmm6
								movhps qword ptr [eax+128], xmm7
								movlps qword ptr [eax+120], xmm7
								// Loading first 4 columns (lower 2 rows) of m2Ptr.
								movlps xmm0, qword ptr [ecx+96]
								movhps xmm0, qword ptr [ecx+104]
								movlps xmm1, qword ptr [ecx+120]
								movhps xmm1, qword ptr [ecx+128]
								// Calculating first 4 elements in the first row of the destination matrix.
								movss xmm2, dword ptr [edx+16]
								shufps xmm2, xmm2, 0
								movss xmm4, dword ptr [edx+40]
								movss xmm3, dword ptr [edx+20]
								movss xmm5, dword ptr [edx+44]
								movaps xmm6, xmmword ptr [eax]
								movlps xmm7, qword ptr [eax+24]
								shufps xmm3, xmm3, 0
								shufps xmm5, xmm5, 0
								movhps xmm7, qword ptr [eax+32]
								shufps xmm4, xmm4, 0
								mulps xmm5, xmm1
								mulps xmm2, xmm0
								mulps xmm3, xmm1
								mulps xmm4, xmm0
								addps xmm6, xmm2
								addps xmm7, xmm4
								addps xmm7, xmm5
								addps xmm6, xmm3
								movlps qword ptr [eax+24], xmm7
								movaps xmmword ptr [eax], xmm6
								movhps qword ptr [eax+32], xmm7
								// Calculating first 4 elements in the third row of the destination matrix.
								movss xmm2, dword ptr [edx+64]
								movss xmm4, dword ptr [edx+88]
								movss xmm5, dword ptr [edx+92]
								movss xmm3, dword ptr [edx+68]
								movaps xmm6, xmmword ptr [eax+48]
								movlps xmm7, qword ptr [eax+72]
								movhps xmm7, qword ptr [eax+80]
								shufps xmm2, xmm2, 0
								shufps xmm4, xmm4, 0
								shufps xmm5, xmm5, 0
								shufps xmm3, xmm3, 0
								mulps xmm2, xmm0
								mulps xmm4, xmm0
								mulps xmm5, xmm1
								mulps xmm3, xmm1
								addps xmm6, xmm2
								addps xmm6, xmm3
								addps xmm7, xmm4
								addps xmm7, xmm5
								movlps qword ptr [eax+72], xmm7
								movaps xmmword ptr [eax+48], xmm6
								movhps qword ptr [eax+80], xmm7
								// Calculating first 4 elements in the fifth row of the destination matrix.
								movss xmm2, dword ptr [edx+112]
								movss xmm3, dword ptr [edx+116]
								movaps xmm6, xmmword ptr [eax+96]
								shufps xmm2, xmm2, 0
								shufps xmm3, xmm3, 0
								mulps xmm2, xmm0
								mulps xmm3, xmm1
								addps xmm6, xmm2
								addps xmm6, xmm3
								movaps xmmword ptr [eax+96], xmm6
								// Calculating first 4 elements in the sixth row of the destination matrix.
								movss xmm4, dword ptr [edx+136]
								movss xmm5, dword ptr [edx+140]
								movhps xmm7, qword ptr [eax+128]
								movlps xmm7, qword ptr [eax+120]
								shufps xmm4, xmm4, 0
								shufps xmm5, xmm5, 0
								mulps xmm4, xmm0
								mulps xmm5, xmm1
								addps xmm7, xmm4
								addps xmm7, xmm5
								// Calculating last 2 columns of the destination matrix.
								movlps xmm0, qword ptr [ecx+16]
								movhps xmm0, qword ptr [ecx+40]
								movhps qword ptr [eax+128], xmm7
								movlps qword ptr [eax+120], xmm7
								movlps xmm2, qword ptr [ecx+64]
								movhps xmm2, qword ptr [ecx+88]
								movaps xmm3, xmm2
								shufps xmm3, xmm3, 4Eh
								movlps xmm4, qword ptr [ecx+112]
								movhps xmm4, qword ptr [ecx+136]
								movaps xmm5, xmm4
								shufps xmm5, xmm5, 4Eh
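								// shuffle immediates used below: 4Eh = (2,3,0,1) swaps the two
								// register halves, while 0F0h = (0,0,3,3) and 0A5h = (1,1,2,2)
								// broadcast element pairs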
								movlps xmm6, qword ptr [edx]
								movhps xmm6, qword ptr [edx+24]
								movaps xmm7, xmm6
								shufps xmm7, xmm7, 0F0h
								mulps xmm7, xmm0
								shufps xmm6, xmm6, 0A5h
								movaps xmm1, xmm0
								shufps xmm1, xmm1, 4Eh
								mulps xmm1, xmm6
								addps xmm7, xmm1
								movlps xmm6, qword ptr [edx+8]
								movhps xmm6, qword ptr [edx+32]
								movaps xmm1, xmm6
								shufps xmm1, xmm1, 0F0h
								shufps xmm6, xmm6, 0A5h
								mulps xmm1, xmm2
								mulps xmm6, xmm3
								addps xmm7, xmm1
								addps xmm7, xmm6
								movhps xmm6, qword ptr [edx+40]
								movlps xmm6, qword ptr [edx+16]
								movaps xmm1, xmm6
								shufps xmm1, xmm1, 0F0h
								shufps xmm6, xmm6, 0A5h
								mulps xmm1, xmm4
								mulps xmm6, xmm5
								addps xmm7, xmm1
								addps xmm7, xmm6
								movlps qword ptr [eax+16], xmm7
								movhps qword ptr [eax+40], xmm7
								movlps xmm6, qword ptr [edx+48]
								movhps xmm6, qword ptr [edx+72]
								movaps xmm7, xmm6
								shufps xmm7, xmm7, 0F0h
								mulps xmm7, xmm0
								shufps xmm6, xmm6, 0A5h
								movaps xmm1, xmm0
								shufps xmm1, xmm1, 4Eh
								mulps xmm1, xmm6
								addps xmm7, xmm1
								movhps xmm6, qword ptr [edx+80]
								movlps xmm6, qword ptr [edx+56]
								movaps xmm1, xmm6
								shufps xmm1, xmm1, 0F0h
								shufps xmm6, xmm6, 0A5h
								mulps xmm1, xmm2
								mulps xmm6, xmm3
								addps xmm7, xmm1
								addps xmm7, xmm6
								movlps xmm6, qword ptr [edx+64]
								movhps xmm6, qword ptr [edx+88]
								movaps xmm1, xmm6
								shufps xmm1, xmm1, 0F0h
								shufps xmm6, xmm6, 0A5h
								mulps xmm1, xmm4
								mulps xmm6, xmm5
								addps xmm7, xmm1
								addps xmm7, xmm6
								movlps qword ptr [eax+64], xmm7
								movhps qword ptr [eax+88], xmm7
								movlps xmm6, qword ptr [edx+96]
								movhps xmm6, qword ptr [edx+120]
								movaps xmm7, xmm6
								shufps xmm7, xmm7, 0F0h
								mulps xmm7, xmm0
								shufps xmm6, xmm6, 0A5h
								movaps xmm1, xmm0
								shufps xmm1, xmm1, 4Eh
								mulps xmm1, xmm6
								addps xmm7, xmm1
								movlps xmm6, qword ptr [edx+104]
								movhps xmm6, qword ptr [edx+128]
								movaps xmm1, xmm6
								shufps xmm1, xmm1, 0F0h
								shufps xmm6, xmm6, 0A5h
								mulps xmm1, xmm2
								mulps xmm6, xmm3
								addps xmm7, xmm1
								addps xmm7, xmm6
								movlps xmm6, qword ptr [edx+112]
								movhps xmm6, qword ptr [edx+136]
								movaps xmm1, xmm6
								shufps xmm1, xmm1, 0F0h
								shufps xmm6, xmm6, 0A5h
								mulps xmm1, xmm4
								mulps xmm6, xmm5
								addps xmm7, xmm1
								addps xmm7, xmm6
								movlps qword ptr [eax+112], xmm7
								movhps qword ptr [eax+136], xmm7
							}
							return;
						}
					}
				}
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
								m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
					m2Ptr++;
				}
				m1Ptr += 6;
			}
			break;
		}
		default: {
			for ( i = 0; i < k; i++ ) {
				for ( j = 0; j < l; j++ ) {
					m2Ptr = m2.ToFloatPtr() + j;
					sum = m1Ptr[0] * m2Ptr[0];
					for ( n = 1; n < m1.GetNumColumns(); n++ ) {
						m2Ptr += l;
						sum += m1Ptr[n] * m2Ptr[0];
					}
					*dstPtr++ = sum;
				}
				m1Ptr += m1.GetNumColumns();
			}
			break;
		}
	}
}
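/*
	Illustrative reference sketch (not part of the original file, names are
	hypothetical): every specialization above computes the same row-major
	product the generic fallbacks do, i.e. dst[i][j] = sum_n m1[i][n] * m2[n][j]
	with k = m1 rows, inner = m1 columns = m2 rows, l = m2 columns.
*/
static inline void MatX_MultiplyGenericSketch( float *dst, const float *m1, const float *m2,
												int k, int inner, int l ) {
	for ( int i = 0; i < k; i++ ) {
		for ( int j = 0; j < l; j++ ) {
			float sum = 0.0f;
			for ( int n = 0; n < inner; n++ ) {
				sum += m1[i*inner+n] * m2[n*l+j];	// consecutive m2 rows are l floats apart
			}
			dst[i*l+j] = sum;
		}
	}
}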
/*
============
idSIMD_SSE::MatX_TransposeMultiplyMatX

	optimizes the following transpose matrix multiplications:

	Nx6 * NxN
	6xN * 6x6

	with N in the range [1-6].
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
	int i, j, k, l, n;
	float *dstPtr;
	const float *m1Ptr, *m2Ptr;
	double sum;

	assert( m1.GetNumRows() == m2.GetNumRows() );

	m1Ptr = m1.ToFloatPtr();
	m2Ptr = m2.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	k = m1.GetNumColumns();
	l = m2.GetNumColumns();

	switch( m1.GetNumRows() ) {
		case 1:
			if ( !((k^6)|(l^1)) ) {		// 1x6 * 1x1
				__asm {
					mov esi, m2Ptr
					mov edi, m1Ptr
					mov eax, dstPtr
					movss xmm0, [esi]
					shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
					movaps xmm1, xmm0
					mulps xmm0, [edi]
					mulps xmm1, [edi+16]
					movaps [eax], xmm0
					movlps [eax+16], xmm1
				}
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 2:
			if ( !((k^6)|(l^2)) ) {		// 2x6 * 2x2
				#define MUL_2xN_2x2_INIT \
					__asm mov esi, m2Ptr \
					__asm mov edi, m1Ptr \
					__asm mov eax, dstPtr \
					__asm movlps xmm0, [esi] \
					__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
					__asm movlps xmm1, [esi+8] \
					__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
				#define MUL_2xN_2x2_ROW2( N, row ) \
					__asm movlps xmm6, [edi+(row+0*N)*4] \
					__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
					__asm movlps xmm7, [edi+(row+1*N)*4] \
					__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
					__asm mulps xmm6, xmm0 \
					__asm mulps xmm7, xmm1 \
					__asm addps xmm6, xmm7 \
					__asm movaps [eax+(row*2)*4], xmm6
				MUL_2xN_2x2_INIT
				MUL_2xN_2x2_ROW2( 6, 0 )
				MUL_2xN_2x2_ROW2( 6, 2 )
				MUL_2xN_2x2_ROW2( 6, 4 )
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 3:
			if ( !((k^6)|(l^3)) ) {		// 3x6 * 3x3
				#define MUL_3xN_3x3_INIT \
					__asm mov esi, m2Ptr \
					__asm mov edi, m1Ptr \
					__asm mov eax, dstPtr \
					__asm movss xmm0, [esi+(0*3+0)*4] \
					__asm movhps xmm0, [esi+(0*3+1)*4] \
					__asm movss xmm1, [esi+(1*3+0)*4] \
					__asm movhps xmm1, [esi+(1*3+1)*4] \
					__asm movss xmm2, [esi+(2*3+0)*4] \
					__asm movhps xmm2, [esi+(2*3+1)*4]
				#define MUL_3xN_3x3_INIT_ROW4 \
					__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \
					__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \
					__asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
				#define MUL_3xN_3x3_ROW4( N, row ) \
					__asm movlps xmm3, [edi+(row+0*N+0)*4] \
					__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \
					__asm movlps xmm4, [edi+(row+1*N+0)*4] \
					__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \
					__asm movlps xmm5, [edi+(row+2*N+0)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \
					__asm mulps xmm3, xmm0 \
					__asm mulps xmm4, xmm1 \
					__asm mulps xmm5, xmm2 \
					__asm addps xmm3, xmm4 \
					__asm addps xmm3, xmm5 \
					__asm movaps [eax+(row*3+0)*4], xmm3 \
					__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
					__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
					__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
					__asm movlps xmm3, [edi+(row+0*N+1)*4] \
					__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \
					__asm movlps xmm4, [edi+(row+1*N+1)*4] \
					__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
					__asm movlps xmm5, [edi+(row+2*N+1)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
					__asm mulps xmm3, xmm0 \
					__asm mulps xmm4, xmm1 \
					__asm mulps xmm5, xmm2 \
					__asm addps xmm3, xmm4 \
					__asm addps xmm3, xmm5 \
					__asm movaps [eax+(row*3+4)*4], xmm3 \
					__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
					__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
					__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
					__asm movlps xmm3, [edi+(row+0*N+2)*4] \
					__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \
					__asm movlps xmm4, [edi+(row+1*N+2)*4] \
					__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \
					__asm movlps xmm5, [edi+(row+2*N+2)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \
					__asm mulps xmm3, xmm0 \
					__asm mulps xmm4, xmm1 \
					__asm mulps xmm5, xmm2 \
					__asm addps xmm3, xmm4 \
					__asm addps xmm3, xmm5 \
					__asm movaps [eax+(row*3+8)*4], xmm3
				#define MUL_3xN_3x3_INIT_ROW4_ROW4 \
					__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \
					__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \
					__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
				#define MUL_3xN_3x3_INIT_ROW4_ROW \
					__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \
					__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \
					__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
				#define MUL_3xN_3x3_ROW( N, row ) \
					__asm movss xmm3, [edi+(row+0*N)*4] \
					__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm movss xmm4, [edi+(row+1*N)*4] \
					__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm movss xmm5, [edi+(row+2*N)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm3, xmm0 \
					__asm mulps xmm4, xmm1 \
					__asm mulps xmm5, xmm2 \
					__asm addps xmm3, xmm4 \
					__asm addps xmm3, xmm5 \
					__asm movss [eax+(row*3+0)*4], xmm3 \
					__asm movhps [eax+(row*3+1)*4], xmm3
				MUL_3xN_3x3_INIT
				MUL_3xN_3x3_INIT_ROW4
				MUL_3xN_3x3_ROW4( 6, 0 )
				MUL_3xN_3x3_INIT_ROW4_ROW
				MUL_3xN_3x3_ROW( 6, 4 )
				MUL_3xN_3x3_ROW( 6, 5 )
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 4:
			if ( !((k^6)|(l^4)) ) {		// 4x6 * 4x4
				#define MUL_4xN_4x4_INIT \
					__asm mov esi, m2Ptr \
					__asm mov edi, m1Ptr \
					__asm mov eax, dstPtr \
					__asm movaps xmm0, [esi] \
					__asm movaps xmm1, [esi+16] \
					__asm movaps xmm2, [esi+32] \
					__asm movaps xmm3, [esi+48]
				#define MUL_4xN_4x4_ROW( N, row ) \
					__asm movss xmm7, [edi+(row+0*N)*4] \
					__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm7, xmm0 \
					__asm movss xmm6, [edi+(row+1*N)*4] \
					__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm6, xmm1 \
					__asm addps xmm7, xmm6 \
					__asm movss xmm6, [edi+(row+2*N)*4] \
					__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm6, xmm2 \
					__asm addps xmm7, xmm6 \
					__asm movss xmm6, [edi+(row+3*N)*4] \
					__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm6, xmm3 \
					__asm addps xmm7, xmm6 \
					__asm movaps [eax+row*16], xmm7
				MUL_4xN_4x4_INIT
				MUL_4xN_4x4_ROW( 6, 0 )
				MUL_4xN_4x4_ROW( 6, 1 )
				MUL_4xN_4x4_ROW( 6, 2 )
				MUL_4xN_4x4_ROW( 6, 3 )
				MUL_4xN_4x4_ROW( 6, 4 )
				MUL_4xN_4x4_ROW( 6, 5 )
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
								m1Ptr[3*k] * m2Ptr[3*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 5:
			if ( !((k^6)|(l^5)) ) {		// 5x6 * 5x5
				#define MUL_5xN_5x5_INIT \
					__asm mov esi, m2Ptr \
					__asm mov edi, m1Ptr \
					__asm mov eax, dstPtr \
					__asm movlps xmm0, [esi+ 0*4] \
					__asm movhps xmm0, [esi+ 2*4] \
					__asm movlps xmm1, [esi+ 5*4] \
					__asm movhps xmm1, [esi+ 7*4] \
					__asm movlps xmm2, [esi+10*4] \
					__asm movhps xmm2, [esi+12*4] \
					__asm movlps xmm3, [esi+15*4] \
					__asm movhps xmm3, [esi+17*4] \
					__asm movlps xmm4, [esi+20*4] \
					__asm movhps xmm4, [esi+22*4]
				#define MUL_5xN_5x5_ROW( N, row ) \
					__asm movss xmm6, [edi+(row+0*N)*4] \
					__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm6, xmm0 \
					__asm fld dword ptr [edi+(row+0*N)*4] \
					__asm fmul dword ptr [esi+ 4*4] \
					__asm movss xmm5, [edi+(row+1*N)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm5, xmm1 \
					__asm addps xmm6, xmm5 \
					__asm fld dword ptr [edi+(row+1*N)*4] \
					__asm fmul dword ptr [esi+ 9*4] \
					__asm faddp st(1),st \
					__asm movss xmm5, [edi+(row+2*N)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm5, xmm2 \
					__asm addps xmm6, xmm5 \
					__asm fld dword ptr [edi+(row+2*N)*4] \
					__asm fmul dword ptr [esi+14*4] \
					__asm faddp st(1),st \
					__asm movss xmm5, [edi+(row+3*N)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm5, xmm3 \
					__asm addps xmm6, xmm5 \
					__asm fld dword ptr [edi+(row+3*N)*4] \
					__asm fmul dword ptr [esi+19*4] \
					__asm faddp st(1),st \
					__asm movss xmm5, [edi+(row+4*N)*4] \
					__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
					__asm mulps xmm5, xmm4 \
					__asm addps xmm6, xmm5 \
					__asm fld dword ptr [edi+(row+4*N)*4] \
					__asm fmul dword ptr [esi+24*4] \
					__asm faddp st(1),st \
					__asm fstp dword ptr [eax+(row*5+4)*4] \
					__asm movlps [eax+(row*5+0)*4], xmm6 \
					__asm movhps [eax+(row*5+2)*4], xmm6
				MUL_5xN_5x5_INIT
				MUL_5xN_5x5_ROW( 6, 0 )
				MUL_5xN_5x5_ROW( 6, 1 )
				MUL_5xN_5x5_ROW( 6, 2 )
				MUL_5xN_5x5_ROW( 6, 3 )
				MUL_5xN_5x5_ROW( 6, 4 )
				MUL_5xN_5x5_ROW( 6, 5 )
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
								m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 6:
			if ( !(l^6) ) {
				switch( k ) {
					case 1: {	// 6x1 * 6x6
						#define MUL_6xN_6x6_FIRST4COLUMNS_INIT \
							__asm mov esi, m2Ptr \
							__asm mov edi, m1Ptr \
							__asm mov eax, dstPtr \
							__asm movlps xmm0, [esi+ 0*4] \
							__asm movhps xmm0, [esi+ 2*4] \
							__asm movlps xmm1, [esi+ 6*4] \
							__asm movhps xmm1, [esi+ 8*4] \
							__asm movlps xmm2, [esi+12*4] \
							__asm movhps xmm2, [esi+14*4] \
							__asm movlps xmm3, [esi+18*4] \
							__asm movhps xmm3, [esi+20*4] \
							__asm movlps xmm4, [esi+24*4] \
							__asm movhps xmm4, [esi+26*4] \
							__asm movlps xmm5, [esi+30*4] \
							__asm movhps xmm5, [esi+32*4]
						#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \
							__asm movss xmm7, [edi+(row+0*N)*4] \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm7, xmm0 \
							__asm movss xmm6, [edi+(row+1*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(row+2*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(row+3*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm3 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(row+4*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm4 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(row+5*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm5 \
							__asm addps xmm7, xmm6 \
							__asm movlps [eax+(row*6+0)*4], xmm7 \
							__asm movhps [eax+(row*6+2)*4], xmm7
						#define MUL_6xN_6x6_LAST2COLUMNS_INIT \
							__asm movlps xmm0, [esi+ 4*4] \
							__asm movlps xmm1, [esi+10*4] \
							__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm movlps xmm2, [esi+16*4] \
							__asm movlps xmm3, [esi+22*4] \
							__asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm movlps xmm4, [esi+28*4] \
							__asm movlps xmm5, [esi+34*4] \
							__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
							__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
						#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \
							__asm movlps xmm7, [edi+(row*2+0*N)*4] \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm7, xmm0 \
							__asm movlps xmm6, [edi+(row*2+1*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm movlps xmm6, [edi+(row*2+2*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm movlps xmm6, [edi+(row*2+3*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm3 \
							__asm addps xmm7, xmm6 \
							__asm movlps xmm6, [edi+(row*2+4*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm4 \
							__asm addps xmm7, xmm6 \
							__asm movlps xmm6, [edi+(row*2+5*N)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
							__asm mulps xmm6, xmm5 \
							__asm addps xmm7, xmm6 \
							__asm movlps [eax+(row*12+ 4)*4], xmm7 \
							__asm movhps [eax+(row*12+10)*4], xmm7
						#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \
							__asm movss xmm7, [edi+(1*N-1)*4] \
							__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm7, xmm0 \
							__asm movss xmm6, [edi+(2*N-1)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm1 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(3*N-1)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm2 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(4*N-1)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm3 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(5*N-1)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm4 \
							__asm addps xmm7, xmm6 \
							__asm movss xmm6, [edi+(6*N-1)*4] \
							__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
							__asm mulps xmm6, xmm5 \
							__asm addps xmm7, xmm6 \
							__asm movlps [eax+(row*6+4)*4], xmm7
						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
						return;
					}
					case 2: {	// 6x2 * 6x6
						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
						return;
					}
					case 3: {	// 6x3 * 6x6
						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
						return;
					}
					case 4: {	// 6x4 * 6x6
						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
						return;
					}
					case 5: {	// 6x5 * 6x6
						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
						return;
					}
					case 6: {	// 6x6 * 6x6
						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
						return;
					}
				}
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
								m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		default:
			for ( i = 0; i < k; i++ ) {
				for ( j = 0; j < l; j++ ) {
					m1Ptr = m1.ToFloatPtr() + i;
					m2Ptr = m2.ToFloatPtr() + j;
					sum = m1Ptr[0] * m2Ptr[0];
					for ( n = 1; n < m1.GetNumRows(); n++ ) {
						m1Ptr += k;
						m2Ptr += l;
						sum += m1Ptr[0] * m2Ptr[0];
					}
					*dstPtr++ = sum;
				}
			}
			break;
	}
}
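/*
	Illustrative reference sketch (not part of the original file, names are
	hypothetical): the specializations above compute dst = m1^T * m2 without
	forming the transpose, i.e. dst[i][j] = sum_r m1[r][i] * m2[r][j], which is
	why the fallbacks step m1Ptr by k and m2Ptr by l between terms.
*/
static inline void MatX_TransposeMultiplyGenericSketch( float *dst, const float *m1, const float *m2,
														int rows, int k, int l ) {
	for ( int i = 0; i < k; i++ ) {
		for ( int j = 0; j < l; j++ ) {
			float sum = 0.0f;
			for ( int r = 0; r < rows; r++ ) {
				sum += m1[r*k+i] * m2[r*l+j];	// column i of m1 times column j of m2
			}
			dst[i*l+j] = sum;
		}
	}
}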
/*
============
idSIMD_SSE::MatX_LowerTriangularSolve

	solves x in Lx = b for the n * n sub-matrix of L
	if skip > 0 the first skip elements of x are assumed to be valid already
	L has to be a lower triangular matrix with (implicit) ones on the diagonal
	x == b is allowed
============
*/
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	int nc;
	const float *lptr;

	if ( skip >= n ) {
		return;
	}

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		#define NSKIP( n, s ) ((n<<3)|(s&7))
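		// NSKIP packs ( n, skip ) into a single switch key: n in the upper bits,
		// skip in the low three bits, e.g. NSKIP( 5, 2 ) == (5<<3)|2 == 42; the
		// cases below intentionally fall through so rows skip..n-1 are solved in turn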
		switch( NSKIP( n, skip ) ) {
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				skip = 4;
	}
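	// the cases above fall through on purpose: entering at 'skip' solves rows
	// skip..3, after which the SSE loop below continues from row 4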
	lptr = L[skip];

	// this code assumes n > 4
	__asm {
		push ebx
		mov eax, skip				// eax = i
		shl eax, 2					// eax = i*4
		mov edx, n					// edx = n
		shl edx, 2					// edx = n*4
		mov esi, x					// esi = x
		mov edi, lptr				// edi = lptr
		add esi, eax
		add edi, eax
		mov ebx, b					// ebx = b
		// check for aligned memory
		mov ecx, nc
		shl ecx, 2
		or ecx, esi
		or ecx, edi
		and ecx, 15
		jnz loopurow
		// aligned
	looprow:
		mov ecx, eax
		neg ecx
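		// ecx = -i*4: esi and edi point just past row i, so [esi+ecx] walks
		// x[0..i-1] (and [edi+ecx] the matrix row) as ecx counts up to zero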
		movaps xmm0, [esi+ecx]
		mulps xmm0, [edi+ecx]
		add ecx, 12*4
		jg donedot8
	dot8:
		movaps xmm1, [esi+ecx-(8*4)]
		mulps xmm1, [edi+ecx-(8*4)]
		addps xmm0, xmm1
		movaps xmm3, [esi+ecx-(4*4)]
		mulps xmm3, [edi+ecx-(4*4)]
		addps xmm0, xmm3
		add ecx, 8*4
		jle dot8
	donedot8:
		sub ecx, 4*4
		jg donedot4
	//dot4:
		movaps xmm1, [esi+ecx-(4*4)]
		mulps xmm1, [edi+ecx-(4*4)]
		addps xmm0, xmm1
		add ecx, 4*4
	donedot4:
		movhlps xmm1, xmm0
		addps xmm0, xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss xmm0, xmm1
		sub ecx, 4*4
		jz dot0
		add ecx, 4
		jz dot1
		add ecx, 4
		jz dot2
	//dot3:
		movss xmm1, [esi-(3*4)]
		mulss xmm1, [edi-(3*4)]
		addss xmm0, xmm1
	dot2:
		movss xmm3, [esi-(2*4)]
		mulss xmm3, [edi-(2*4)]
		addss xmm0, xmm3
	dot1:
		movss xmm5, [esi-(1*4)]
		mulss xmm5, [edi-(1*4)]
		addss xmm0, xmm5
	dot0:
		movss xmm1, [ebx+eax]
		subss xmm1, xmm0
		movss [esi], xmm1
		add eax, 4
		cmp eax, edx
		jge done
		add esi, 4
		mov ecx, nc
		shl ecx, 2
		add edi, ecx
		add edi, 4
		jmp looprow
		// unaligned
	loopurow:
		mov ecx, eax
		neg ecx
		movups xmm0, [esi+ecx]
		movups xmm1, [edi+ecx]
		mulps xmm0, xmm1
		add ecx, 12*4
		jg doneudot8
	udot8:
		movups xmm1, [esi+ecx-(8*4)]
		movups xmm2, [edi+ecx-(8*4)]
		mulps xmm1, xmm2
		addps xmm0, xmm1
		movups xmm3, [esi+ecx-(4*4)]
		movups xmm4, [edi+ecx-(4*4)]
		mulps xmm3, xmm4
		addps xmm0, xmm3
		add ecx, 8*4
		jle udot8
	doneudot8:
		sub ecx, 4*4
		jg doneudot4
	//udot4:
		movups xmm1, [esi+ecx-(4*4)]
		movups xmm2, [edi+ecx-(4*4)]
		mulps xmm1, xmm2
		addps xmm0, xmm1
		add ecx, 4*4
	doneudot4:
		movhlps xmm1, xmm0
		addps xmm0, xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss xmm0, xmm1
		sub ecx, 4*4
		jz udot0
		add ecx, 4
		jz udot1
		add ecx, 4
		jz udot2
	//udot3:
		movss xmm1, [esi-(3*4)]
		movss xmm2, [edi-(3*4)]
		mulss xmm1, xmm2
		addss xmm0, xmm1
	udot2:
		movss xmm3, [esi-(2*4)]
		movss xmm4, [edi-(2*4)]
		mulss xmm3, xmm4
		addss xmm0, xmm3
	udot1:
		movss xmm5, [esi-(1*4)]
		movss xmm6, [edi-(1*4)]
		mulss xmm5, xmm6
		addss xmm0, xmm5
	udot0:
		movss xmm1, [ebx+eax]
		subss xmm1, xmm0
		movss [esi], xmm1
		add eax, 4
		cmp eax, edx
		jge done
		add esi, 4
		mov ecx, nc
		shl ecx, 2
		add edi, ecx
		add edi, 4
		jmp loopurow
	done:
		pop ebx
	}
}
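/*
	Illustrative reference sketch (not part of the original file, names are
	hypothetical): the routine above is forward substitution with an implicit
	unit diagonal, x[i] = b[i] - sum_{j<i} L[i][j] * x[j] for i >= skip,
	assuming lptr addresses a row-major matrix whose rows are nc floats apart.
*/
static inline void LowerTriangularSolveSketch( const float *lptr, int nc, float *x,
												const float *b, int n, int skip ) {
	for ( int i = skip; i < n; i++ ) {
		double s = b[i];
		for ( int j = 0; j < i; j++ ) {
			s -= lptr[i*nc+j] * x[j];
		}
		x[i] = (float) s;	// unit diagonal: no divide needed
	}
}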
/*
============
idSIMD_SSE::MatX_LowerTriangularSolveTranspose

	solves x in L'x = b for the n * n sub-matrix of L
	L has to be a lower triangular matrix with (implicit) ones on the diagonal
	x == b is allowed
============
*/
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - lptr[1*nc+0] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - lptr[6*nc+5] * x[6];
				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
		}
		return;
	}
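	// Illustrative note (not in the original file): what follows is backward
	// substitution on L', i.e. for i = n-1 down to 0:
	//   x[i] = b[i] - sum_{j>i} L[j][i] * x[j]
	// Four rows are solved per iteration, reading L column-wise in 4x4 blocks.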
#if 1

	int i, j, m;
	float *xptr;
	double s0;

	// if the number of columns is not a multiple of 2 we're screwed for alignment.
	// however, if the number of columns is a multiple of 2 but the number of rows
	// to be processed is not a multiple of 2 we can still run 8 byte aligned
	m = n;
	if ( m & 1 ) {
		m--;
		x[m] = b[m];
		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		__asm {
			push ebx
			mov eax, m					// eax = i
			mov esi, xptr				// esi = xptr
			mov edi, lptr				// edi = lptr
			mov ebx, b					// ebx = b
			mov edx, nc					// edx = nc*sizeof(float)
			shl edx, 2
		process4rows_1:
			movlps xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			movhps xmm0, [ebx+eax*4-8]	// load b[i-2], b[i-1]
			xor ecx, ecx
			sub eax, m
			neg eax
			jz done4x4_1
		process4x4_1:	// process 4x4 blocks
			movlps xmm2, [edi+0]
			movhps xmm2, [edi+8]
			add edi, edx
			movss xmm1, [esi+4*ecx+0]
			shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps xmm3, [edi+0]
			movhps xmm3, [edi+8]
			add edi, edx
			mulps xmm1, xmm2
			subps xmm0, xmm1
			movss xmm1, [esi+4*ecx+4]
			shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps xmm4, [edi+0]
			movhps xmm4, [edi+8]
			add edi, edx
			mulps xmm1, xmm3
			subps xmm0, xmm1
			movss xmm1, [esi+4*ecx+8]
			shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps xmm5, [edi+0]
			movhps xmm5, [edi+8]
			add edi, edx
			mulps xmm1, xmm4
			subps xmm0, xmm1
			movss xmm1, [esi+4*ecx+12]
			shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			add ecx, 4
			cmp ecx, eax
			mulps xmm1, xmm5
			subps xmm0, xmm1
			jl process4x4_1
		done4x4_1:		// process left over of the 4 rows
			movlps xmm2, [edi+0]
			movhps xmm2, [edi+8]
			movss xmm1, [esi+4*ecx]
			shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			mulps xmm1, xmm2
			subps xmm0, xmm1
			imul ecx, edx
			sub edi, ecx
			neg eax
			add eax, m
			sub eax, 4
			movaps xmm1, xmm0
			shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
			movaps xmm2, xmm0
			shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
			movaps xmm3, xmm0
			shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
			sub edi, edx
			movss [esi-4], xmm3			// xptr[-1] = s3
			movss xmm4, xmm3
			movss xmm5, xmm3
			mulss xmm3, [edi+8]			// lptr[-1*nc+2] * s3
			mulss xmm4, [edi+4]			// lptr[-1*nc+1] * s3
			mulss xmm5, [edi+0]			// lptr[-1*nc+0] * s3
			subss xmm2, xmm3
			movss [esi-8], xmm2			// xptr[-2] = s2
			movss xmm6, xmm2
			sub edi, edx
			subss xmm0, xmm5
			subss xmm1, xmm4
			mulss xmm2, [edi+4]			// lptr[-2*nc+1] * s2
			mulss xmm6, [edi+0]			// lptr[-2*nc+0] * s2
			subss xmm1, xmm2
			movss [esi-12], xmm1		// xptr[-3] = s1
			subss xmm0, xmm6
			sub edi, edx
			cmp eax, 4
			mulss xmm1, [edi+0]			// lptr[-3*nc+0] * s1
			subss xmm0, xmm1
			movss [esi-16], xmm0		// xptr[-4] = s0
			jl done4rows_1
			sub edi, edx
			sub edi, 16
			sub esi, 16
			jmp process4rows_1
		done4rows_1:
			pop ebx
		}
	} else {
		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		__asm {
			push ebx
			mov eax, m					// eax = i
			mov esi, xptr				// esi = xptr
			mov edi, lptr				// edi = lptr
			mov ebx, b					// ebx = b
			mov edx, nc					// edx = nc*sizeof(float)
			shl edx, 2
		process4rows:
			movlps xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			movhps xmm0, [ebx+eax*4-8]	// load b[i-2], b[i-1]
  9976. sub eax, m
  9977. jz done4x4
  9978. neg eax
  9979. xor ecx, ecx
  9980. process4x4: // process 4x4 blocks
  9981. movlps xmm2, [edi+0]
  9982. movhps xmm2, [edi+8]
  9983. add edi, edx
  9984. movss xmm1, [esi+4*ecx+0]
  9985. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  9986. movlps xmm3, [edi+0]
  9987. movhps xmm3, [edi+8]
  9988. add edi, edx
  9989. mulps xmm1, xmm2
  9990. subps xmm0, xmm1
  9991. movss xmm1, [esi+4*ecx+4]
  9992. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  9993. movlps xmm4, [edi+0]
  9994. movhps xmm4, [edi+8]
  9995. add edi, edx
  9996. mulps xmm1, xmm3
  9997. subps xmm0, xmm1
  9998. movss xmm1, [esi+4*ecx+8]
  9999. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  10000. movlps xmm5, [edi+0]
  10001. movhps xmm5, [edi+8]
  10002. add edi, edx
  10003. mulps xmm1, xmm4
  10004. subps xmm0, xmm1
  10005. movss xmm1, [esi+4*ecx+12]
  10006. shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  10007. add ecx, 4
  10008. cmp ecx, eax
  10009. mulps xmm1, xmm5
  10010. subps xmm0, xmm1
  10011. jl process4x4
  10012. imul ecx, edx
  10013. sub edi, ecx
  10014. neg eax
  10015. done4x4: // process left over of the 4 rows
  10016. add eax, m
  10017. sub eax, 4
  10018. movaps xmm1, xmm0
  10019. shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
  10020. movaps xmm2, xmm0
  10021. shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
  10022. movaps xmm3, xmm0
  10023. shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
  10024. sub edi, edx
  10025. movss [esi-4], xmm3 // xptr[-1] = s3
  10026. movss xmm4, xmm3
  10027. movss xmm5, xmm3
  10028. mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
  10029. mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
  10030. mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
  10031. subss xmm2, xmm3
  10032. movss [esi-8], xmm2 // xptr[-2] = s2
  10033. movss xmm6, xmm2
  10034. sub edi, edx
  10035. subss xmm0, xmm5
  10036. subss xmm1, xmm4
  10037. mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
  10038. mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
  10039. subss xmm1, xmm2
  10040. movss [esi-12], xmm1 // xptr[-3] = s1
  10041. subss xmm0, xmm6
  10042. sub edi, edx
  10043. cmp eax, 4
  10044. mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
  10045. subss xmm0, xmm1
  10046. movss [esi-16], xmm0 // xptr[-4] = s0
  10047. jl done4rows
  10048. sub edi, edx
  10049. sub edi, 16
  10050. sub esi, 16
  10051. jmp process4rows
  10052. done4rows:
  10053. pop ebx
  10054. }
  10055. }
  10056. // process left over rows
  10057. for ( i = (m&3)-1; i >= 0; i-- ) {
  10058. s0 = b[i];
  10059. lptr = L[0] + i;
  10060. for ( j = i + 1; j < n; j++ ) {
  10061. s0 -= lptr[j*nc] * x[j];
  10062. }
  10063. x[i] = s0;
  10064. }
  10065. #else
  10066. int i, j, m;
  10067. double s0, s1, s2, s3, t;
  10068. const float *lptr2;
  10069. float *xptr, *xptr2;
  10070. m = n;
  10071. if ( m & 1 ) {
  10072. m--;
  10073. x[m] = b[m];
  10074. lptr = L.ToFloatPtr() + m * nc + m - 4;
  10075. xptr = x + m;
  10076. // process 4 rows at a time
  10077. for ( i = m; i >= 4; i -= 4 ) {
  10078. s0 = b[i-4];
  10079. s1 = b[i-3];
  10080. s2 = b[i-2];
  10081. s3 = b[i-1];
  10082. // process 4x4 blocks
  10083. xptr2 = xptr; // x + i;
  10084. lptr2 = lptr; // ptr = L[i] + i - 4;
  10085. for ( j = 0; j < m-i; j += 4 ) {
  10086. t = xptr2[0];
  10087. s0 -= lptr2[0] * t;
  10088. s1 -= lptr2[1] * t;
  10089. s2 -= lptr2[2] * t;
  10090. s3 -= lptr2[3] * t;
  10091. lptr2 += nc;
  10092. xptr2++;
  10093. t = xptr2[0];
  10094. s0 -= lptr2[0] * t;
  10095. s1 -= lptr2[1] * t;
  10096. s2 -= lptr2[2] * t;
  10097. s3 -= lptr2[3] * t;
  10098. lptr2 += nc;
  10099. xptr2++;
  10100. t = xptr2[0];
  10101. s0 -= lptr2[0] * t;
  10102. s1 -= lptr2[1] * t;
  10103. s2 -= lptr2[2] * t;
  10104. s3 -= lptr2[3] * t;
  10105. lptr2 += nc;
  10106. xptr2++;
  10107. t = xptr2[0];
  10108. s0 -= lptr2[0] * t;
  10109. s1 -= lptr2[1] * t;
  10110. s2 -= lptr2[2] * t;
  10111. s3 -= lptr2[3] * t;
  10112. lptr2 += nc;
  10113. xptr2++;
  10114. }
  10115. t = xptr2[0];
  10116. s0 -= lptr2[0] * t;
  10117. s1 -= lptr2[1] * t;
  10118. s2 -= lptr2[2] * t;
  10119. s3 -= lptr2[3] * t;
  10120. // process left over of the 4 rows
  10121. lptr -= nc;
  10122. s0 -= lptr[0] * s3;
  10123. s1 -= lptr[1] * s3;
  10124. s2 -= lptr[2] * s3;
  10125. lptr -= nc;
  10126. s0 -= lptr[0] * s2;
  10127. s1 -= lptr[1] * s2;
  10128. lptr -= nc;
  10129. s0 -= lptr[0] * s1;
  10130. lptr -= nc;
  10131. // store result
  10132. xptr[-4] = s0;
  10133. xptr[-3] = s1;
  10134. xptr[-2] = s2;
  10135. xptr[-1] = s3;
  10136. // update pointers for next four rows
  10137. lptr -= 4;
  10138. xptr -= 4;
  10139. }
  10140. } else {
  10141. lptr = L.ToFloatPtr() + m * nc + m - 4;
  10142. xptr = x + m;
  10143. // process 4 rows at a time
  10144. for ( i = m; i >= 4; i -= 4 ) {
  10145. s0 = b[i-4];
  10146. s1 = b[i-3];
  10147. s2 = b[i-2];
  10148. s3 = b[i-1];
  10149. // process 4x4 blocks
  10150. xptr2 = xptr; // x + i;
  10151. lptr2 = lptr; // ptr = L[i] + i - 4;
  10152. for ( j = 0; j < m-i; j += 4 ) {
  10153. t = xptr2[0];
  10154. s0 -= lptr2[0] * t;
  10155. s1 -= lptr2[1] * t;
  10156. s2 -= lptr2[2] * t;
  10157. s3 -= lptr2[3] * t;
  10158. lptr2 += nc;
  10159. xptr2++;
  10160. t = xptr2[0];
  10161. s0 -= lptr2[0] * t;
  10162. s1 -= lptr2[1] * t;
  10163. s2 -= lptr2[2] * t;
  10164. s3 -= lptr2[3] * t;
  10165. lptr2 += nc;
  10166. xptr2++;
  10167. t = xptr2[0];
  10168. s0 -= lptr2[0] * t;
  10169. s1 -= lptr2[1] * t;
  10170. s2 -= lptr2[2] * t;
  10171. s3 -= lptr2[3] * t;
  10172. lptr2 += nc;
  10173. xptr2++;
  10174. t = xptr2[0];
  10175. s0 -= lptr2[0] * t;
  10176. s1 -= lptr2[1] * t;
  10177. s2 -= lptr2[2] * t;
  10178. s3 -= lptr2[3] * t;
  10179. lptr2 += nc;
  10180. xptr2++;
  10181. }
  10182. // process left over of the 4 rows
  10183. lptr -= nc;
  10184. s0 -= lptr[0] * s3;
  10185. s1 -= lptr[1] * s3;
  10186. s2 -= lptr[2] * s3;
  10187. lptr -= nc;
  10188. s0 -= lptr[0] * s2;
  10189. s1 -= lptr[1] * s2;
  10190. lptr -= nc;
  10191. s0 -= lptr[0] * s1;
  10192. lptr -= nc;
  10193. // store result
  10194. xptr[-4] = s0;
  10195. xptr[-3] = s1;
  10196. xptr[-2] = s2;
  10197. xptr[-1] = s3;
  10198. // update pointers for next four rows
  10199. lptr -= 4;
  10200. xptr -= 4;
  10201. }
  10202. }
    // process left over rows
    for ( i--; i >= 0; i-- ) {
        s0 = b[i];
        lptr = L[0] + i;
        for ( j = i + 1; j < n; j++ ) {
            s0 -= lptr[j*nc] * x[j];
        }
        x[i] = s0;
    }
#endif
}
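/*
    The unrolled paths above walk the rows from the bottom up and evaluate
    the back-substitution for L'x = b with an implicit unit diagonal:

        x[i] = b[i] - sum( j = i+1 ... n-1 ) of L[j][i] * x[j]

    A minimal scalar sketch of the same recurrence (for reference only):

        for ( int i = n - 1; i >= 0; i-- ) {
            double s = b[i];
            for ( int j = i + 1; j < n; j++ ) {
                s -= L[j][i] * x[j];
            }
            x[i] = s;
        }
*/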
/*
============
idSIMD_SSE::MatX_LDLTFactor

  in-place factorization LDL' of the n * n sub-matrix of mat
  the reciprocals of the diagonal elements are stored in invDiag
  currently assumes the number of columns of mat is a multiple of 4
============
*/
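/*
    Usage sketch (illustrative only, the names are hypothetical):

        idMatX A;               // symmetric, with columns padded to a multiple of 4
        idVecX invDiag;
        invDiag.SetSize( n );
        if ( !SIMDProcessor->MatX_LDLTFactor( A, invDiag, n ) ) {
            // a zero pivot was encountered, the factorization failed
        }

    On success the strict lower triangle of 'A' holds L (the unit diagonal is
    implicit), the diagonal holds D and invDiag[i] == 1.0f / D[i].
*/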
bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1

    int j, nc;
    float *v, *diag, *invDiagPtr, *mptr;
    double s0, s1, s2, sum, d;

    v = (float *) _alloca16( n * sizeof( float ) );
    diag = (float *) _alloca16( n * sizeof( float ) );
    invDiagPtr = invDiag.ToFloatPtr();

    nc = mat.GetNumColumns();

    assert( ( nc & 3 ) == 0 );

    if ( n <= 0 ) {
        return true;
    }

    mptr = mat[0];

    sum = mptr[0];
    if ( sum == 0.0f ) {
        return false;
    }

    diag[0] = sum;
    invDiagPtr[0] = d = 1.0f / sum;

    if ( n <= 1 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 1; j < n; j++ ) {
        mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
    }

    mptr = mat[1];

    v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    sum = mptr[1] - s0;
    if ( sum == 0.0f ) {
        return false;
    }

    mat[1][1] = sum;
    diag[1] = sum;
    invDiagPtr[1] = d = 1.0f / sum;

    if ( n <= 2 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 2; j < n; j++ ) {
        mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
    }

    mptr = mat[2];

    v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
    sum = mptr[2] - s0 - s1;
    if ( sum == 0.0f ) {
        return false;
    }

    mat[2][2] = sum;
    diag[2] = sum;
    invDiagPtr[2] = d = 1.0f / sum;

    if ( n <= 3 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 3; j < n; j++ ) {
        mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
    }

    mptr = mat[3];

    v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
    v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
    sum = mptr[3] - s0 - s1 - s2;
    if ( sum == 0.0f ) {
        return false;
    }

    mat[3][3] = sum;
    diag[3] = sum;
    invDiagPtr[3] = d = 1.0f / sum;

    if ( n <= 4 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 4; j < n; j++ ) {
        mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
    }

    int ncf = nc * sizeof( float );
    mptr = mat[0];

    __asm {
        xorps   xmm2, xmm2
        xorps   xmm3, xmm3
        xorps   xmm4, xmm4

        push    ebx
        mov     ebx, 4

    loopRow:
        cmp     ebx, n
        jge     done

        mov     ecx, ebx                // ecx = i
        shl     ecx, 2                  // ecx = i * 4
        mov     edx, diag               // edx = diag
        add     edx, ecx                // edx = &diag[i]
        mov     edi, ebx                // edi = i
        imul    edi, ncf                // edi = i * nc * sizeof( float )
        add     edi, mptr               // edi = mat[i]
        add     edi, ecx                // edi = &mat[i][i]
        mov     esi, v                  // esi = v
        add     esi, ecx                // esi = &v[i]
        mov     eax, invDiagPtr         // eax = invDiagPtr
        add     eax, ecx                // eax = &invDiagPtr[i]
        neg     ecx

        movaps  xmm0, [edx+ecx]
        mulps   xmm0, [edi+ecx]
        movaps  [esi+ecx], xmm0
        mulps   xmm0, [edi+ecx]

        add     ecx, 12*4
        jg      doneDot8
    dot8:
        movaps  xmm1, [edx+ecx-(8*4)]
        mulps   xmm1, [edi+ecx-(8*4)]
        movaps  [esi+ecx-(8*4)], xmm1
        mulps   xmm1, [edi+ecx-(8*4)]
        addps   xmm0, xmm1
        movaps  xmm2, [edx+ecx-(4*4)]
        mulps   xmm2, [edi+ecx-(4*4)]
        movaps  [esi+ecx-(4*4)], xmm2
        mulps   xmm2, [edi+ecx-(4*4)]
        addps   xmm0, xmm2
        add     ecx, 8*4
        jle     dot8
    doneDot8:
        sub     ecx, 4*4
        jg      doneDot4
        movaps  xmm1, [edx+ecx-(4*4)]
        mulps   xmm1, [edi+ecx-(4*4)]
        movaps  [esi+ecx-(4*4)], xmm1
        mulps   xmm1, [edi+ecx-(4*4)]
        addps   xmm0, xmm1
        add     ecx, 4*4
    doneDot4:
        sub     ecx, 2*4
        jg      doneDot2
        movlps  xmm3, [edx+ecx-(2*4)]
        movlps  xmm4, [edi+ecx-(2*4)]
        mulps   xmm3, xmm4
        movlps  [esi+ecx-(2*4)], xmm3
        mulps   xmm3, xmm4
        addps   xmm0, xmm3
        add     ecx, 2*4
    doneDot2:
        sub     ecx, 1*4
        jg      doneDot1
        movss   xmm3, [edx+ecx-(1*4)]
        movss   xmm4, [edi+ecx-(1*4)]
        mulss   xmm3, xmm4
        movss   [esi+ecx-(1*4)], xmm3
        mulss   xmm3, xmm4
        addss   xmm0, xmm3
    doneDot1:
        movhlps xmm2, xmm0
        addps   xmm0, xmm2
        movaps  xmm2, xmm0
        shufps  xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
        addss   xmm0, xmm2

        movss   xmm1, [edi]
        subss   xmm1, xmm0
        movss   [edi], xmm1             // mptr[i] = sum;
        movss   [edx], xmm1             // diag[i] = sum;

        // if ( sum == 0.0f ) return false;
        movaps  xmm2, xmm1
        cmpeqss xmm2, SIMD_SP_zero
        andps   xmm2, SIMD_SP_tiny
        orps    xmm1, xmm2

        // 1.0f / sum via rcpss estimate plus one Newton-Raphson iteration
        rcpss   xmm7, xmm1
        mulss   xmm1, xmm7
        mulss   xmm1, xmm7
        addss   xmm7, xmm7
        subss   xmm7, xmm1
        movss   [eax], xmm7             // invDiagPtr[i] = 1.0f / sum;

        mov     edx, n                  // edx = n
        sub     edx, ebx                // edx = n - i
        dec     edx                     // edx = n - i - 1
        jle     doneSubRow              // if ( i + 1 >= n ) return true;

        mov     eax, ebx                // eax = i
        shl     eax, 2                  // eax = i * 4
        neg     eax

    loopSubRow:
        add     edi, ncf
        mov     ecx, eax
        movaps  xmm0, [esi+ecx]
        mulps   xmm0, [edi+ecx]
        add     ecx, 12*4
        jg      doneSubDot8
    subDot8:
        movaps  xmm1, [esi+ecx-(8*4)]
        mulps   xmm1, [edi+ecx-(8*4)]
        addps   xmm0, xmm1
        movaps  xmm2, [esi+ecx-(4*4)]
        mulps   xmm2, [edi+ecx-(4*4)]
        addps   xmm0, xmm2
        add     ecx, 8*4
        jle     subDot8
    doneSubDot8:
        sub     ecx, 4*4
        jg      doneSubDot4
        movaps  xmm1, [esi+ecx-(4*4)]
        mulps   xmm1, [edi+ecx-(4*4)]
        addps   xmm0, xmm1
        add     ecx, 4*4
    doneSubDot4:
        sub     ecx, 2*4
        jg      doneSubDot2
        movlps  xmm3, [esi+ecx-(2*4)]
        movlps  xmm4, [edi+ecx-(2*4)]
        mulps   xmm3, xmm4
        addps   xmm0, xmm3
        add     ecx, 2*4
    doneSubDot2:
        sub     ecx, 1*4
        jg      doneSubDot1
        movss   xmm3, [esi+ecx-(1*4)]
        movss   xmm4, [edi+ecx-(1*4)]
        mulss   xmm3, xmm4
        addss   xmm0, xmm3
    doneSubDot1:
        movhlps xmm2, xmm0
        addps   xmm0, xmm2
        movaps  xmm2, xmm0
        shufps  xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
        addss   xmm0, xmm2

        movss   xmm1, [edi]
        subss   xmm1, xmm0
        mulss   xmm1, xmm7
        movss   [edi], xmm1

        dec     edx
        jg      loopSubRow

    doneSubRow:
        inc     ebx
        jmp     loopRow

    done:
        pop     ebx
    }

    return true;

#else

    int i, j, k, nc;
    float *v, *diag, *mptr;
    double s0, s1, s2, s3, sum, d;

    v = (float *) _alloca16( n * sizeof( float ) );
    diag = (float *) _alloca16( n * sizeof( float ) );

    nc = mat.GetNumColumns();

    if ( n <= 0 ) {
        return true;
    }

    mptr = mat[0];

    sum = mptr[0];
    if ( sum == 0.0f ) {
        return false;
    }

    diag[0] = sum;
    invDiag[0] = d = 1.0f / sum;

    if ( n <= 1 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 1; j < n; j++ ) {
        mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
    }

    mptr = mat[1];

    v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    sum = mptr[1] - s0;
    if ( sum == 0.0f ) {
        return false;
    }

    mat[1][1] = sum;
    diag[1] = sum;
    invDiag[1] = d = 1.0f / sum;

    if ( n <= 2 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 2; j < n; j++ ) {
        mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
    }

    mptr = mat[2];

    v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
    sum = mptr[2] - s0 - s1;
    if ( sum == 0.0f ) {
        return false;
    }

    mat[2][2] = sum;
    diag[2] = sum;
    invDiag[2] = d = 1.0f / sum;

    if ( n <= 3 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 3; j < n; j++ ) {
        mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
    }

    mptr = mat[3];

    v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
    v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
    sum = mptr[3] - s0 - s1 - s2;
    if ( sum == 0.0f ) {
        return false;
    }

    mat[3][3] = sum;
    diag[3] = sum;
    invDiag[3] = d = 1.0f / sum;

    if ( n <= 4 ) {
        return true;
    }

    mptr = mat[0];
    for ( j = 4; j < n; j++ ) {
        mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
    }

    for ( i = 4; i < n; i++ ) {

        mptr = mat[i];

        v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
        v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
        v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
        v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
        for ( k = 4; k < i-3; k += 4 ) {
            v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
            v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
            v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
            v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
        }
        switch( i - k ) {
            case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
            case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
            case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
        }
        sum = s3;
        sum += s2;
        sum += s1;
        sum += s0;
        sum = mptr[i] - sum;

        if ( sum == 0.0f ) {
            return false;
        }

        mat[i][i] = sum;
        diag[i] = sum;
        invDiag[i] = d = 1.0f / sum;

        if ( i + 1 >= n ) {
            return true;
        }

        mptr = mat[i+1];
        for ( j = i+1; j < n; j++ ) {
            s0 = mptr[0] * v[0];
            s1 = mptr[1] * v[1];
            s2 = mptr[2] * v[2];
            s3 = mptr[3] * v[3];
            for ( k = 4; k < i-7; k += 8 ) {
                s0 += mptr[k+0] * v[k+0];
                s1 += mptr[k+1] * v[k+1];
                s2 += mptr[k+2] * v[k+2];
                s3 += mptr[k+3] * v[k+3];
                s0 += mptr[k+4] * v[k+4];
                s1 += mptr[k+5] * v[k+5];
                s2 += mptr[k+6] * v[k+6];
                s3 += mptr[k+7] * v[k+7];
            }
            switch( i - k ) {
                case 7: s0 += mptr[k+6] * v[k+6];
                case 6: s1 += mptr[k+5] * v[k+5];
                case 5: s2 += mptr[k+4] * v[k+4];
                case 4: s3 += mptr[k+3] * v[k+3];
                case 3: s0 += mptr[k+2] * v[k+2];
                case 2: s1 += mptr[k+1] * v[k+1];
                case 1: s2 += mptr[k+0] * v[k+0];
            }
            sum = s3;
            sum += s2;
            sum += s1;
            sum += s0;
            mptr[i] = ( mptr[i] - sum ) * d;
            mptr += nc;
        }
    }

    return true;

#endif
}
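/*
    Row by row the factorization above evaluates the standard LDL'
    recurrences, with v used as scratch storage for D[k] * L[i][k]:

        v[k]    = D[k] * L[i][k]                                        ( k < i )
        D[i]    = A[i][i] - sum( k < i ) of v[k] * L[i][k]
        L[j][i] = ( A[j][i] - sum( k < i ) of v[k] * L[j][k] ) / D[i]   ( j > i )

    which is exactly what the generic #else path spells out with the
    unrolled k loops and the trailing switch statements.
*/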
/*
============
idSIMD_SSE::BlendJoints
============
*/

#define REFINE_BLENDJOINTS_RECIPROCAL
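/*
    Per joint the translations are lerped and the quaternions are slerped
    towards blendJoints by the fraction 'lerp'. The scalar math that the
    SSE block below vectorizes four joints at a time is the same as in the
    tail loop at the end of the function:

        cosom  = jointQuat * blendQuat;                 // 4D dot product
        scale0 = 1.0f - cosom * cosom;
        sinom  = 1.0f / sqrt( scale0 );
        omega  = atan2( scale0 * sinom, fabs( cosom ) );
        scale0 = sin( ( 1.0f - lerp ) * omega ) * sinom;
        scale1 = sin( lerp * omega ) * sinom;           // sign flipped if cosom was negative
        jointQuat = scale0 * jointQuat + scale1 * blendQuat;

    With REFINE_BLENDJOINTS_RECIPROCAL defined the rsqrtps/rcpps estimates
    are refined with one Newton-Raphson iteration for extra precision.
*/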
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
    int i;

    if ( lerp <= 0.0f ) {
        return;
    } else if ( lerp >= 1.0f ) {
        for ( i = 0; i < numJoints; i++ ) {
            int j = index[i];
            joints[j] = blendJoints[j];
        }
        return;
    }

    for ( i = 0; i <= numJoints - 4; i += 4 ) {
        ALIGN16( float jointVert0[4] );
        ALIGN16( float jointVert1[4] );
        ALIGN16( float jointVert2[4] );
        ALIGN16( float blendVert0[4] );
        ALIGN16( float blendVert1[4] );
        ALIGN16( float blendVert2[4] );
        ALIGN16( float jointQuat0[4] );
        ALIGN16( float jointQuat1[4] );
        ALIGN16( float jointQuat2[4] );
        ALIGN16( float jointQuat3[4] );
        ALIGN16( float blendQuat0[4] );
        ALIGN16( float blendQuat1[4] );
        ALIGN16( float blendQuat2[4] );
        ALIGN16( float blendQuat3[4] );

        for ( int j = 0; j < 4; j++ ) {
            int n = index[i+j];

            jointVert0[j] = joints[n].t[0];
            jointVert1[j] = joints[n].t[1];
            jointVert2[j] = joints[n].t[2];

            blendVert0[j] = blendJoints[n].t[0];
            blendVert1[j] = blendJoints[n].t[1];
            blendVert2[j] = blendJoints[n].t[2];

            jointQuat0[j] = joints[n].q[0];
            jointQuat1[j] = joints[n].q[1];
            jointQuat2[j] = joints[n].q[2];
            jointQuat3[j] = joints[n].q[3];

            blendQuat0[j] = blendJoints[n].q[0];
            blendQuat1[j] = blendJoints[n].q[1];
            blendQuat2[j] = blendJoints[n].q[2];
            blendQuat3[j] = blendJoints[n].q[3];
        }

#if 1
        __asm {
            // lerp translation
            movss   xmm7, lerp
            shufps  xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
            movaps  xmm0, blendVert0
            subps   xmm0, jointVert0
            mulps   xmm0, xmm7
            addps   xmm0, jointVert0
            movaps  jointVert0, xmm0
            movaps  xmm1, blendVert1
            subps   xmm1, jointVert1
            mulps   xmm1, xmm7
            addps   xmm1, jointVert1
            movaps  jointVert1, xmm1
            movaps  xmm2, blendVert2
            subps   xmm2, jointVert2
            mulps   xmm2, xmm7
            addps   xmm2, jointVert2
            movaps  jointVert2, xmm2

            // lerp quaternions
            movaps  xmm0, jointQuat0
            mulps   xmm0, blendQuat0
            movaps  xmm1, jointQuat1
            mulps   xmm1, blendQuat1
            addps   xmm0, xmm1
            movaps  xmm2, jointQuat2
            mulps   xmm2, blendQuat2
            addps   xmm0, xmm2
            movaps  xmm3, jointQuat3
            mulps   xmm3, blendQuat3
            addps   xmm0, xmm3              // xmm0 = cosom

            movaps  xmm1, xmm0
            movaps  xmm2, xmm0
            andps   xmm1, SIMD_SP_signBitMask   // xmm1 = signBit
            xorps   xmm0, xmm1
            mulps   xmm2, xmm2

            xorps   xmm4, xmm4
            movaps  xmm3, SIMD_SP_one
            subps   xmm3, xmm2              // xmm3 = scale0
            cmpeqps xmm4, xmm3
            andps   xmm4, SIMD_SP_tiny      // if values are zero replace them with a tiny number
            andps   xmm3, SIMD_SP_absMask   // make sure the values are positive
            orps    xmm3, xmm4

#ifdef REFINE_BLENDJOINTS_RECIPROCAL
            movaps  xmm2, xmm3
            rsqrtps xmm4, xmm2
            mulps   xmm2, xmm4
            mulps   xmm2, xmm4
            subps   xmm2, SIMD_SP_rsqrt_c0
            mulps   xmm4, SIMD_SP_rsqrt_c1
            mulps   xmm2, xmm4
#else
            rsqrtps xmm2, xmm3              // xmm2 = sinom
#endif
            mulps   xmm3, xmm2              // xmm3 = sqrt( scale0 )

            // omega0 = atan2( xmm3, xmm0 )
            movaps  xmm4, xmm0
            minps   xmm0, xmm3
            maxps   xmm3, xmm4
            cmpeqps xmm4, xmm0

#ifdef REFINE_BLENDJOINTS_RECIPROCAL
            rcpps   xmm5, xmm3
            mulps   xmm3, xmm5
            mulps   xmm3, xmm5
            addps   xmm5, xmm5
            subps   xmm5, xmm3              // xmm5 = 1 / y or 1 / x
            mulps   xmm0, xmm5              // xmm0 = x / y or y / x
#else
            rcpps   xmm3, xmm3              // xmm3 = 1 / y or 1 / x
            mulps   xmm0, xmm3              // xmm0 = x / y or y / x
#endif
            movaps  xmm3, xmm4
            andps   xmm3, SIMD_SP_signBitMask
            xorps   xmm0, xmm3              // xmm0 = -x / y or y / x
            andps   xmm4, SIMD_SP_halfPI    // xmm4 = HALF_PI or 0.0f
            movaps  xmm3, xmm0
            mulps   xmm3, xmm3              // xmm3 = s
            movaps  xmm5, SIMD_SP_atan_c0
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c1
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c2
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c3
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c4
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c5
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c6
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_atan_c7
            mulps   xmm5, xmm3
            addps   xmm5, SIMD_SP_one
            mulps   xmm5, xmm0
            addps   xmm5, xmm4              // xmm5 = omega0

            movaps  xmm6, xmm7              // xmm6 = lerp
            mulps   xmm6, xmm5              // xmm6 = omega1
            subps   xmm5, xmm6              // xmm5 = omega0

            // scale0 = sin( xmm5 ) * xmm2
            // scale1 = sin( xmm6 ) * xmm2
            movaps  xmm3, xmm5
            movaps  xmm7, xmm6
            mulps   xmm3, xmm3
            mulps   xmm7, xmm7
            movaps  xmm4, SIMD_SP_sin_c0
            movaps  xmm0, SIMD_SP_sin_c0
            mulps   xmm4, xmm3
            mulps   xmm0, xmm7
            addps   xmm4, SIMD_SP_sin_c1
            addps   xmm0, SIMD_SP_sin_c1
            mulps   xmm4, xmm3
            mulps   xmm0, xmm7
            addps   xmm4, SIMD_SP_sin_c2
            addps   xmm0, SIMD_SP_sin_c2
            mulps   xmm4, xmm3
            mulps   xmm0, xmm7
            addps   xmm4, SIMD_SP_sin_c3
            addps   xmm0, SIMD_SP_sin_c3
            mulps   xmm4, xmm3
            mulps   xmm0, xmm7
            addps   xmm4, SIMD_SP_sin_c4
            addps   xmm0, SIMD_SP_sin_c4
            mulps   xmm4, xmm3
            mulps   xmm0, xmm7
            addps   xmm4, SIMD_SP_one
            addps   xmm0, SIMD_SP_one
            mulps   xmm5, xmm4
            mulps   xmm6, xmm0
            mulps   xmm5, xmm2              // xmm5 = scale0
            mulps   xmm6, xmm2              // xmm6 = scale1

            xorps   xmm6, xmm1

            movaps  xmm0, jointQuat0
            mulps   xmm0, xmm5
            movaps  xmm1, blendQuat0
            mulps   xmm1, xmm6
            addps   xmm0, xmm1
            movaps  jointQuat0, xmm0
            movaps  xmm1, jointQuat1
            mulps   xmm1, xmm5
            movaps  xmm2, blendQuat1
            mulps   xmm2, xmm6
            addps   xmm1, xmm2
            movaps  jointQuat1, xmm1
            movaps  xmm2, jointQuat2
            mulps   xmm2, xmm5
            movaps  xmm3, blendQuat2
            mulps   xmm3, xmm6
            addps   xmm2, xmm3
            movaps  jointQuat2, xmm2
            movaps  xmm3, jointQuat3
            mulps   xmm3, xmm5
            movaps  xmm4, blendQuat3
            mulps   xmm4, xmm6
            addps   xmm3, xmm4
            movaps  jointQuat3, xmm3
        }
#else
        jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
        jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
        jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
        jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );

        jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
        jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
        jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
        jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );

        jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
        jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
        jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
        jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );

        ALIGN16( float cosom[4] );
        ALIGN16( float sinom[4] );
        ALIGN16( float omega0[4] );
        ALIGN16( float omega1[4] );
        ALIGN16( float scale0[4] );
        ALIGN16( float scale1[4] );
        ALIGN16( unsigned long signBit[4] );

        cosom[0] = jointQuat0[0] * blendQuat0[0];
        cosom[1] = jointQuat0[1] * blendQuat0[1];
        cosom[2] = jointQuat0[2] * blendQuat0[2];
        cosom[3] = jointQuat0[3] * blendQuat0[3];

        cosom[0] += jointQuat1[0] * blendQuat1[0];
        cosom[1] += jointQuat1[1] * blendQuat1[1];
        cosom[2] += jointQuat1[2] * blendQuat1[2];
        cosom[3] += jointQuat1[3] * blendQuat1[3];

        cosom[0] += jointQuat2[0] * blendQuat2[0];
        cosom[1] += jointQuat2[1] * blendQuat2[1];
        cosom[2] += jointQuat2[2] * blendQuat2[2];
        cosom[3] += jointQuat2[3] * blendQuat2[3];

        cosom[0] += jointQuat3[0] * blendQuat3[0];
        cosom[1] += jointQuat3[1] * blendQuat3[1];
        cosom[2] += jointQuat3[2] * blendQuat3[2];
        cosom[3] += jointQuat3[3] * blendQuat3[3];

        signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 );
        signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 );
        signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 );
        signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 );

        (*(unsigned long *)&cosom[0]) ^= signBit[0];
        (*(unsigned long *)&cosom[1]) ^= signBit[1];
        (*(unsigned long *)&cosom[2]) ^= signBit[2];
        (*(unsigned long *)&cosom[3]) ^= signBit[3];

        scale0[0] = 1.0f - cosom[0] * cosom[0];
        scale0[1] = 1.0f - cosom[1] * cosom[1];
        scale0[2] = 1.0f - cosom[2] * cosom[2];
        scale0[3] = 1.0f - cosom[3] * cosom[3];

        scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
        scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
        scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
        scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];

        sinom[0] = idMath::RSqrt( scale0[0] );
        sinom[1] = idMath::RSqrt( scale0[1] );
        sinom[2] = idMath::RSqrt( scale0[2] );
        sinom[3] = idMath::RSqrt( scale0[3] );

        scale0[0] *= sinom[0];
        scale0[1] *= sinom[1];
        scale0[2] *= sinom[2];
        scale0[3] *= sinom[3];

        omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
        omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
        omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
        omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );

        omega1[0] = lerp * omega0[0];
        omega1[1] = lerp * omega0[1];
        omega1[2] = lerp * omega0[2];
        omega1[3] = lerp * omega0[3];

        omega0[0] -= omega1[0];
        omega0[1] -= omega1[1];
        omega0[2] -= omega1[2];
        omega0[3] -= omega1[3];

        scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
        scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
        scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
        scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];

        scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
        scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
        scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
        scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];

        (*(unsigned long *)&scale1[0]) ^= signBit[0];
        (*(unsigned long *)&scale1[1]) ^= signBit[1];
        (*(unsigned long *)&scale1[2]) ^= signBit[2];
        (*(unsigned long *)&scale1[3]) ^= signBit[3];

        jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
        jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
        jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
        jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];

        jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
        jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
        jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
        jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];

        jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
        jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
        jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
        jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];

        jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
        jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
        jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
        jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
#endif
        for ( int j = 0; j < 4; j++ ) {
            int n = index[i+j];

            joints[n].t[0] = jointVert0[j];
            joints[n].t[1] = jointVert1[j];
            joints[n].t[2] = jointVert2[j];

            joints[n].q[0] = jointQuat0[j];
            joints[n].q[1] = jointQuat1[j];
            joints[n].q[2] = jointQuat2[j];
            joints[n].q[3] = jointQuat3[j];
        }
    }

    for ( ; i < numJoints; i++ ) {
        int n = index[i];

        idVec3 &jointVert = joints[n].t;
        const idVec3 &blendVert = blendJoints[n].t;

        jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
        jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
        jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );

        idQuat &jointQuat = joints[n].q;
        const idQuat &blendQuat = blendJoints[n].q;

        float cosom;
        float sinom;
        float omega;
        float scale0;
        float scale1;
        unsigned long signBit;

        cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

        signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );

        (*(unsigned long *)&cosom) ^= signBit;

        scale0 = 1.0f - cosom * cosom;
        scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
        sinom = idMath::InvSqrt( scale0 );
        omega = idMath::ATan16( scale0 * sinom, cosom );
        scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
        scale1 = idMath::Sin16( lerp * omega ) * sinom;

        (*(unsigned long *)&scale1) ^= signBit;

        jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
        jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
        jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
        jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
    }
}
/*
============
idSIMD_SSE::ConvertJointQuatsToJointMats
============
*/
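/*
    Writes the upper 3x4 of each joint matrix from the unit quaternion
    q = ( x, y, z, w ) and the translation t, using the standard expansion

        | 1-2(yy+zz)  2(xy+wz)    2(xz-wy)    tx |
        | 2(xy-wz)    1-2(xx+zz)  2(yz+wx)    ty |
        | 2(xz+wy)    2(yz-wx)    1-2(xx+yy)  tz |

    The x2/y2/z2 temporaries below fold the factor of two into the products.
*/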
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {

    assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
    assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

    for ( int i = 0; i < numJoints; i++ ) {

        const float *q = jointQuats[i].q.ToFloatPtr();
        float *m = jointMats[i].ToFloatPtr();

        m[0*4+3] = q[4];
        m[1*4+3] = q[5];
        m[2*4+3] = q[6];

        float x2 = q[0] + q[0];
        float y2 = q[1] + q[1];
        float z2 = q[2] + q[2];

        {
            float xx = q[0] * x2;
            float yy = q[1] * y2;
            float zz = q[2] * z2;

            m[0*4+0] = 1.0f - yy - zz;
            m[1*4+1] = 1.0f - xx - zz;
            m[2*4+2] = 1.0f - xx - yy;
        }
        {
            float yz = q[1] * z2;
            float wx = q[3] * x2;

            m[2*4+1] = yz - wx;
            m[1*4+2] = yz + wx;
        }
        {
            float xy = q[0] * y2;
            float wz = q[3] * z2;

            m[1*4+0] = xy - wz;
            m[0*4+1] = xy + wz;
        }
        {
            float xz = q[0] * z2;
            float wy = q[3] * y2;

            m[0*4+2] = xz - wy;
            m[2*4+0] = xz + wy;
        }
    }
}
/*
============
idSIMD_SSE::ConvertJointMatsToJointQuats
============
*/
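/*
    Inverse of ConvertJointQuatsToJointMats: recovers the quaternion from
    the rotation part of each matrix. The scalar reference (see the #elif
    branches below) picks the numerically safest of four pivots and computes

        t = s0 * m[0*4+0] + s1 * m[1*4+1] + s2 * m[2*4+2] + 1.0f;
        s = idMath::InvSqrt( t ) * 0.5f;

    with the output order k0..k3 and the signs s0..s2 depending on the
    branch taken. The SSE path replaces the branches with compare masks, a
    byte shuffle table ( SIMD_DW_mat2quatShuffle* ) selecting k0..k3, and
    sign masks applied with xorps.
*/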
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {

    assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
    assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

#if 1

    ALIGN16( byte shuffle[16] );

    __asm {
        mov     eax, numJoints
        mov     esi, jointMats
        mov     edi, jointQuats
        and     eax, ~3
        jz      done4
        imul    eax, JOINTMAT_SIZE
        add     esi, eax
        neg     eax

    loopMat4:
        movss   xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
        movss   xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
        movss   xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]

        shufps  xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps  xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps  xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

        movss   xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
        movss   xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
        movss   xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]

        movss   xmm5, xmm0
        movss   xmm6, xmm1
        movss   xmm7, xmm2

        shufps  xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps  xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps  xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

        movss   xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
        movss   xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
        movss   xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]

        movss   xmm5, xmm0
        movss   xmm6, xmm1
        movss   xmm7, xmm2

        shufps  xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps  xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps  xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

        movss   xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
        movss   xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
        movss   xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

        movss   xmm5, xmm0
        movss   xmm6, xmm1
        movss   xmm7, xmm2

        // -------------------

        movaps  xmm0, xmm5
        addps   xmm0, xmm6
        addps   xmm0, xmm7
        cmpnltps xmm0, SIMD_SP_zero     // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f

        movaps  xmm1, xmm5
        movaps  xmm2, xmm5
        cmpnltps xmm1, xmm6
        cmpnltps xmm2, xmm7
        andps   xmm2, xmm1              // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]

        movaps  xmm4, xmm6
        cmpnltps xmm4, xmm7             // xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]

        movaps  xmm1, xmm0
        andnps  xmm1, xmm2
        orps    xmm2, xmm0
        movaps  xmm3, xmm2
        andnps  xmm2, xmm4
        orps    xmm3, xmm2
        xorps   xmm3, SIMD_SP_not

        andps   xmm0, SIMD_DW_mat2quatShuffle0
        movaps  xmm4, xmm1
        andps   xmm4, SIMD_DW_mat2quatShuffle1
        orps    xmm0, xmm4
        movaps  xmm4, xmm2
        andps   xmm4, SIMD_DW_mat2quatShuffle2
        orps    xmm0, xmm4
        movaps  xmm4, xmm3
        andps   xmm4, SIMD_DW_mat2quatShuffle3
        orps    xmm4, xmm0
        movaps  shuffle, xmm4

        movaps  xmm0, xmm2
        orps    xmm0, xmm3              // xmm0 = xmm2 | xmm3 = s0
        orps    xmm2, xmm1              // xmm2 = xmm1 | xmm2 = s2
        orps    xmm1, xmm3              // xmm1 = xmm1 | xmm3 = s1

        andps   xmm0, SIMD_SP_signBitMask
        andps   xmm1, SIMD_SP_signBitMask
        andps   xmm2, SIMD_SP_signBitMask

        xorps   xmm5, xmm0
        xorps   xmm6, xmm1
        xorps   xmm7, xmm2
        addps   xmm5, xmm6
        addps   xmm7, SIMD_SP_one
        addps   xmm5, xmm7              // xmm5 = t

        movaps  xmm7, xmm5              // xmm7 = t
        rsqrtps xmm6, xmm5
        mulps   xmm5, xmm6
        mulps   xmm5, xmm6
        subps   xmm5, SIMD_SP_rsqrt_c0
        mulps   xmm6, SIMD_SP_mat2quat_rsqrt_c1
        mulps   xmm6, xmm5              // xmm6 = s
        mulps   xmm7, xmm6              // xmm7 = s * t
        xorps   xmm6, SIMD_SP_signBitMask   // xmm6 = -s

        // -------------------

        add     edi, 4*JOINTQUAT_SIZE

        movzx   ecx, byte ptr shuffle[0*4+0]        // ecx = k0
        movss   [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7  // q[k0] = s * t;

        movzx   edx, byte ptr shuffle[0*4+1]        // edx = k1
        movss   xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
        xorps   xmm4, xmm2
        subss   xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-4*JOINTQUAT_SIZE], xmm4  // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx   ecx, byte ptr shuffle[0*4+2]        // ecx = k2
        movss   xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
        xorps   xmm3, xmm1
        subss   xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
        mulss   xmm3, xmm6
        movss   [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3  // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx   edx, byte ptr shuffle[0*4+3]        // edx = k3
        movss   xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
        xorps   xmm4, xmm0
        subss   xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-4*JOINTQUAT_SIZE], xmm4  // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov     ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
        mov     [edi-4*JOINTQUAT_SIZE+16], ecx      // q[4] = m[0 * 4 + 3];
        mov     edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
        mov     [edi-4*JOINTQUAT_SIZE+20], edx      // q[5] = m[1 * 4 + 3];
        mov     ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
        mov     [edi-4*JOINTQUAT_SIZE+24], ecx      // q[6] = m[2 * 4 + 3];

        shufps  xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

        movzx   ecx, byte ptr shuffle[1*4+0]        // ecx = k0
        movss   [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7  // q[k0] = s * t;

        movzx   edx, byte ptr shuffle[1*4+1]        // edx = k1
        movss   xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
        xorps   xmm4, xmm2
        subss   xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-3*JOINTQUAT_SIZE], xmm4  // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx   ecx, byte ptr shuffle[1*4+2]        // ecx = k2
        movss   xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
        xorps   xmm3, xmm1
        subss   xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
        mulss   xmm3, xmm6
        movss   [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3  // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx   edx, byte ptr shuffle[1*4+3]        // edx = k3
        movss   xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
        xorps   xmm4, xmm0
        subss   xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-3*JOINTQUAT_SIZE], xmm4  // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov     ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
        mov     [edi-3*JOINTQUAT_SIZE+16], ecx      // q[4] = m[0 * 4 + 3];
        mov     edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
        mov     [edi-3*JOINTQUAT_SIZE+20], edx      // q[5] = m[1 * 4 + 3];
        mov     ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
        mov     [edi-3*JOINTQUAT_SIZE+24], ecx      // q[6] = m[2 * 4 + 3];

        shufps  xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

        movzx   ecx, byte ptr shuffle[2*4+0]        // ecx = k0
        movss   [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7  // q[k0] = s * t;

        movzx   edx, byte ptr shuffle[2*4+1]        // edx = k1
        movss   xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
        xorps   xmm4, xmm2
        subss   xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-2*JOINTQUAT_SIZE], xmm4  // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx   ecx, byte ptr shuffle[2*4+2]        // ecx = k2
        movss   xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
        xorps   xmm3, xmm1
        subss   xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
        mulss   xmm3, xmm6
        movss   [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3  // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx   edx, byte ptr shuffle[2*4+3]        // edx = k3
        movss   xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
        xorps   xmm4, xmm0
        subss   xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-2*JOINTQUAT_SIZE], xmm4  // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov     ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
        mov     [edi-2*JOINTQUAT_SIZE+16], ecx      // q[4] = m[0 * 4 + 3];
        mov     edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
        mov     [edi-2*JOINTQUAT_SIZE+20], edx      // q[5] = m[1 * 4 + 3];
        mov     ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
        mov     [edi-2*JOINTQUAT_SIZE+24], ecx      // q[6] = m[2 * 4 + 3];

        shufps  xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps  xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

        movzx   ecx, byte ptr shuffle[3*4+0]        // ecx = k0
        movss   [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7  // q[k0] = s * t;

        movzx   edx, byte ptr shuffle[3*4+1]        // edx = k1
        movss   xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
        xorps   xmm4, xmm2
        subss   xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-1*JOINTQUAT_SIZE], xmm4  // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx   ecx, byte ptr shuffle[3*4+2]        // ecx = k2
        movss   xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
        xorps   xmm3, xmm1
        subss   xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
        mulss   xmm3, xmm6
        movss   [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3  // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx   edx, byte ptr shuffle[3*4+3]        // edx = k3
        movss   xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
        xorps   xmm4, xmm0
        subss   xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-1*JOINTQUAT_SIZE], xmm4  // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov     ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
        mov     [edi-1*JOINTQUAT_SIZE+16], ecx      // q[4] = m[0 * 4 + 3];
        mov     edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
        mov     [edi-1*JOINTQUAT_SIZE+20], edx      // q[5] = m[1 * 4 + 3];
        mov     ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
        mov     [edi-1*JOINTQUAT_SIZE+24], ecx      // q[6] = m[2 * 4 + 3];

        add     eax, 4*JOINTMAT_SIZE
        jl      loopMat4

    done4:
        mov     eax, numJoints
        and     eax, 3
        jz      done1
        imul    eax, JOINTMAT_SIZE
        add     esi, eax
        neg     eax

    loopMat1:
        movss   xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
        movss   xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
        movss   xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

        // -------------------

        movaps  xmm0, xmm5
        addss   xmm0, xmm6
        addss   xmm0, xmm7
        cmpnltss xmm0, SIMD_SP_zero     // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f

        movaps  xmm1, xmm5
        movaps  xmm2, xmm5
        cmpnltss xmm1, xmm6
        cmpnltss xmm2, xmm7
        andps   xmm2, xmm1              // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]

        movaps  xmm4, xmm6
        cmpnltss xmm4, xmm7             // xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]

        movaps  xmm1, xmm0
        andnps  xmm1, xmm2
        orps    xmm2, xmm0
        movaps  xmm3, xmm2
        andnps  xmm2, xmm4
        orps    xmm3, xmm2
        xorps   xmm3, SIMD_SP_not

        andps   xmm0, SIMD_DW_mat2quatShuffle0
        movaps  xmm4, xmm1
        andps   xmm4, SIMD_DW_mat2quatShuffle1
        orps    xmm0, xmm4
        movaps  xmm4, xmm2
        andps   xmm4, SIMD_DW_mat2quatShuffle2
        orps    xmm0, xmm4
        movaps  xmm4, xmm3
        andps   xmm4, SIMD_DW_mat2quatShuffle3
        orps    xmm4, xmm0
        movss   shuffle, xmm4

        movaps  xmm0, xmm2
        orps    xmm0, xmm3              // xmm0 = xmm2 | xmm3 = s0
        orps    xmm2, xmm1              // xmm2 = xmm1 | xmm2 = s2
        orps    xmm1, xmm3              // xmm1 = xmm1 | xmm3 = s1

        andps   xmm0, SIMD_SP_signBitMask
        andps   xmm1, SIMD_SP_signBitMask
        andps   xmm2, SIMD_SP_signBitMask

        xorps   xmm5, xmm0
        xorps   xmm6, xmm1
        xorps   xmm7, xmm2
        addss   xmm5, xmm6
        addss   xmm7, SIMD_SP_one
        addss   xmm5, xmm7              // xmm5 = t

        movss   xmm7, xmm5              // xmm7 = t
        rsqrtss xmm6, xmm5
        mulss   xmm5, xmm6
        mulss   xmm5, xmm6
        subss   xmm5, SIMD_SP_rsqrt_c0
        mulss   xmm6, SIMD_SP_mat2quat_rsqrt_c1
        mulss   xmm6, xmm5              // xmm6 = s
        mulss   xmm7, xmm6              // xmm7 = s * t
        xorps   xmm6, SIMD_SP_signBitMask   // xmm6 = -s

        // -------------------

        movzx   ecx, byte ptr shuffle[0]            // ecx = k0
        add     edi, JOINTQUAT_SIZE
        movss   [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7  // q[k0] = s * t;

        movzx   edx, byte ptr shuffle[1]            // edx = k1
        movss   xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
        xorps   xmm4, xmm2
        subss   xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-1*JOINTQUAT_SIZE], xmm4  // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx   ecx, byte ptr shuffle[2]            // ecx = k2
        movss   xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
        xorps   xmm3, xmm1
        subss   xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
        mulss   xmm3, xmm6
        movss   [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3  // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx   edx, byte ptr shuffle[3]            // edx = k3
        movss   xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
        xorps   xmm4, xmm0
        subss   xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
        mulss   xmm4, xmm6
        movss   [edi+edx*4-1*JOINTQUAT_SIZE], xmm4  // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov     ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
        mov     [edi-1*JOINTQUAT_SIZE+16], ecx      // q[4] = m[0 * 4 + 3];
        mov     edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
        mov     [edi-1*JOINTQUAT_SIZE+20], edx      // q[5] = m[1 * 4 + 3];
        mov     ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
        mov     [edi-1*JOINTQUAT_SIZE+24], ecx      // q[6] = m[2 * 4 + 3];

        add     eax, JOINTMAT_SIZE
        jl      loopMat1

    done1:
    }

#elif 0

    for ( int i = 0; i < numJoints; i++ ) {
        float s0, s1, s2;
        int k0, k1, k2, k3;

        float *q = jointQuats[i].q.ToFloatPtr();
        const float *m = jointMats[i].ToFloatPtr();

        if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
            k0 = 3;
            k1 = 2;
            k2 = 1;
            k3 = 0;
            s0 = 1.0f;
            s1 = 1.0f;
            s2 = 1.0f;
        } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
            k0 = 0;
            k1 = 1;
            k2 = 2;
            k3 = 3;
            s0 = 1.0f;
            s1 = -1.0f;
            s2 = -1.0f;
        } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
            k0 = 1;
            k1 = 0;
            k2 = 3;
            k3 = 2;
            s0 = -1.0f;
            s1 = 1.0f;
            s2 = -1.0f;
        } else {
            k0 = 2;
            k1 = 3;
            k2 = 0;
            k3 = 1;
            s0 = -1.0f;
            s1 = -1.0f;
            s2 = 1.0f;
        }

        float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
        float s = idMath::InvSqrt( t ) * 0.5f;

        q[k0] = s * t;
        q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
        q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
        q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        q[4] = m[0 * 4 + 3];
        q[5] = m[1 * 4 + 3];
        q[6] = m[2 * 4 + 3];
    }

#elif 1

    for ( int i = 0; i < numJoints; i++ ) {

        float *q = jointQuats[i].q.ToFloatPtr();
        const float *m = jointMats[i].ToFloatPtr();

        if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

            float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[3] = s * t;
            q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
            q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
            q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

        } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

            float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[0] = s * t;
            q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
            q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
            q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

        } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

            float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[1] = s * t;
            q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
            q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
            q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

        } else {

            float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[2] = s * t;
            q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
            q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
            q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
        }

        q[4] = m[0 * 4 + 3];
        q[5] = m[1 * 4 + 3];
        q[6] = m[2 * 4 + 3];
    }

#endif
}
/*
============
idSIMD_SSE::TransformJoints
============
*/
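/*
    Concatenates each joint matrix with that of its parent, equivalent to
    the generic #else path below:

        jointMats[i] *= jointMats[parents[i]];

    parents[i] < i must hold so a parent is always transformed before its
    children. The loop keeps the three rows of the child matrix in
    xmm0-xmm2 and multiplies them by broadcast elements of the parent.
*/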
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1

    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

    __asm {
        mov     ecx, firstJoint
        mov     eax, lastJoint
        sub     eax, ecx
        jl      done
        imul    ecx, 4
        mov     edi, parents
        add     edi, ecx
        imul    ecx, 12
        mov     esi, jointMats
        imul    eax, 4
        add     edi, eax
        neg     eax

    loopJoint:

        movaps  xmm0, [esi+ecx+ 0]      // xmm0 = m0, m1, m2, t0
        mov     edx, [edi+eax]
        movaps  xmm1, [esi+ecx+16]      // xmm1 = m3, m4, m5, t1
        imul    edx, JOINTMAT_SIZE
        movaps  xmm2, [esi+ecx+32]      // xmm2 = m6, m7, m8, t2

        movss   xmm4, [esi+edx+ 0]
        shufps  xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm4, xmm0
        movss   xmm5, [esi+edx+ 4]
        shufps  xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm5, xmm1
        addps   xmm4, xmm5
        movss   xmm6, [esi+edx+ 8]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm6, xmm2
        addps   xmm4, xmm6

        movss   xmm5, [esi+edx+16]
        shufps  xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm5, xmm0

        movss   xmm7, [esi+edx+12]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        addps   xmm4, xmm7

        movaps  [esi+ecx+ 0], xmm4

        movss   xmm6, [esi+edx+20]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm6, xmm1
        addps   xmm5, xmm6
        movss   xmm7, [esi+edx+24]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm7, xmm2
        addps   xmm5, xmm7

        movss   xmm6, [esi+edx+32]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm6, xmm0

        movss   xmm3, [esi+edx+28]
        shufps  xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
        addps   xmm5, xmm3

        movaps  [esi+ecx+16], xmm5

        movss   xmm7, [esi+edx+36]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm7, xmm1
        addps   xmm6, xmm7
        movss   xmm3, [esi+edx+40]
        shufps  xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm3, xmm2
        addps   xmm6, xmm3

        movss   xmm7, [esi+edx+44]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        addps   xmm6, xmm7

        movaps  [esi+ecx+32], xmm6

        add     ecx, JOINTMAT_SIZE
        add     eax, 4

        jle     loopJoint
    done:
    }

#else

    int i;

    for( i = firstJoint; i <= lastJoint; i++ ) {
        assert( parents[i] < i );
        jointMats[i] *= jointMats[parents[i]];
    }

#endif
}
/*
============
idSIMD_SSE::UntransformJoints
============
*/
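/*
    Exact inverse of TransformJoints, equivalent to the generic #else path:

        jointMats[i] /= jointMats[parents[i]];

    walking from lastJoint down to firstJoint so children are untransformed
    before their parents. The loop first subtracts the parent translation
    and then multiplies by the transpose of the parent rotation.
*/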
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1

    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

    __asm {
        mov     edx, firstJoint
        mov     eax, lastJoint
        mov     ecx, eax
        sub     eax, edx
        jl      done
        mov     esi, jointMats
        imul    ecx, JOINTMAT_SIZE
        imul    edx, 4
        mov     edi, parents
        add     edi, edx
        imul    eax, 4

    loopJoint:

        movaps  xmm0, [esi+ecx+ 0]      // xmm0 = m0, m1, m2, t0
        mov     edx, [edi+eax]
        movaps  xmm1, [esi+ecx+16]      // xmm1 = m3, m4, m5, t1
        imul    edx, JOINTMAT_SIZE
        movaps  xmm2, [esi+ecx+32]      // xmm2 = m6, m7, m8, t2

        movss   xmm6, [esi+edx+12]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        subps   xmm0, xmm6
        movss   xmm7, [esi+edx+28]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        subps   xmm1, xmm7
        movss   xmm3, [esi+edx+44]
        shufps  xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
        subps   xmm2, xmm3

        movss   xmm4, [esi+edx+ 0]
        shufps  xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm4, xmm0
        movss   xmm5, [esi+edx+16]
        shufps  xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm5, xmm1
        addps   xmm4, xmm5
        movss   xmm6, [esi+edx+32]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm6, xmm2
        addps   xmm4, xmm6

        movaps  [esi+ecx+ 0], xmm4

        movss   xmm5, [esi+edx+ 4]
        shufps  xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm5, xmm0
        movss   xmm6, [esi+edx+20]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm6, xmm1
        addps   xmm5, xmm6
        movss   xmm7, [esi+edx+36]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm7, xmm2
        addps   xmm5, xmm7

        movaps  [esi+ecx+16], xmm5

        movss   xmm6, [esi+edx+ 8]
        shufps  xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm6, xmm0
        movss   xmm7, [esi+edx+24]
        shufps  xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm7, xmm1
        addps   xmm6, xmm7
        movss   xmm3, [esi+edx+40]
        shufps  xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps   xmm3, xmm2
        addps   xmm6, xmm3

        movaps  [esi+ecx+32], xmm6

        sub     ecx, JOINTMAT_SIZE
        sub     eax, 4

        jge     loopJoint
    done:
    }

#else

    int i;

    for( i = lastJoint; i >= firstJoint; i-- ) {
        assert( parents[i] < i );
        jointMats[i] /= jointMats[parents[i]];
    }

#endif
}
  11572. /*
  11573. ============
  11574. idSIMD_SSE::TransformVerts
  11575. ============
  11576. */
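// TransformVerts skins vertex positions: the index stream holds two ints per
// weight, a byte offset into the joint matrix array and a flag that is
// non-zero for the last weight of a vertex. Each idVec4 weight is multiplied
// component-wise against the three rows of its joint matrix and accumulated;
// the unpck/movlhps shuffle sequence at doneWeight is a horizontal add that
// produces the final x, y and z written to verts[i].xyz.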
void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	__asm
	{
		mov eax, numVerts
		test eax, eax
		jz done
		imul eax, DRAWVERT_SIZE
		mov ecx, verts
		mov edx, index
		mov esi, weights
		mov edi, joints
		add ecx, eax
		neg eax
	loopVert:
		mov ebx, [edx]
		movaps xmm2, [esi]
		add edx, 8
		movaps xmm0, xmm2
		add esi, JOINTWEIGHT_SIZE
		movaps xmm1, xmm2
		mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
		mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
		mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
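		// the second int of each index pair is non-zero for the last weight of a vertex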
		cmp dword ptr [edx-4], 0
		jne doneWeight
	loopWeight:
		mov ebx, [edx]
		movaps xmm5, [esi]
		add edx, 8
		movaps xmm3, xmm5
		add esi, JOINTWEIGHT_SIZE
		movaps xmm4, xmm5
		mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
		mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
		mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
		cmp dword ptr [edx-4], 0
		addps xmm0, xmm3
		addps xmm1, xmm4
		addps xmm2, xmm5
		je loopWeight
	doneWeight:
		add eax, DRAWVERT_SIZE
		movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
		unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
		unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
		addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
		movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
		movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
		movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
		addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
		movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6
		movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8
		addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2
		movss [ecx+eax-DRAWVERT_SIZE+8], xmm5
		jl loopVert
	done:
	}
#else
	int i, j;
	const byte *jointsPtr = (byte *)joints;
	for( j = i = 0; i < numVerts; i++ ) {
		idVec3 v;
		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		while( index[j*2+1] == 0 ) {
			j++;
			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		}
		j++;
		verts[i].xyz = v;
	}
#endif
}

/*
============
idSIMD_SSE::TracePointCull
============
*/
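// TracePointCull classifies each vertex against four planes expanded by
// +/- radius. The planes are first transposed into SoA form so xmm0..xmm3
// hold the x, y, z and d components of all four planes at once; each
// iteration then computes all four signed distances for one vertex.
// Bits 0-3 of the output byte are set while dist > -radius and bits 4-7
// while dist < radius; dl accumulates the OR of all bytes for totalOr.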
void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	__asm {
		push ebx
		mov eax, numVerts
		test eax, eax
		jz done
		mov edi, planes
		movlps xmm1, [edi] // xmm1 = 0, 1, X, X
		movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5
		movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X
		movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7
		movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X
		movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13
		movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X
		movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15
		movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
		shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
		movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
		shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
		shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
		movss xmm7, radius
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		xor edx, edx
		mov esi, verts
		mov edi, cullBits
		imul eax, DRAWVERT_SIZE
		add esi, eax
		neg eax
	loopVert:
		movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		mulps xmm4, xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulps xmm5, xmm1
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		addps xmm4, xmm5
		mulps xmm6, xmm2
		addps xmm4, xmm3
		addps xmm4, xmm6
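		// xmm4 now holds the four plane distances; negate into xmm5 so both
		// the dist < radius and dist > -radius tests can use cmpltps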
		movaps xmm5, xmm4
		xorps xmm5, SIMD_SP_signBitMask
		cmpltps xmm4, xmm7
		movmskps ecx, xmm4
		cmpltps xmm5, xmm7
		movmskps ebx, xmm5
		shl cx, 4
		or cl, bl
		inc edi
		or dl, cl
		add eax, DRAWVERT_SIZE
		mov byte ptr [edi-1], cl
		jl loopVert
	done:
		mov esi, totalOr
		mov byte ptr [esi], dl
		pop ebx
	}
#else
	int i;
	byte tOr;
	tOr = 0;
	for ( i = 0; i < numVerts; i++ ) {
		byte bits;
		float d0, d1, d2, d3, t;
		const idVec3 &v = verts[i].xyz;
		d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
		d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
		d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
		d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
		t = d0 + radius;
		bits = FLOATSIGNBITSET( t ) << 0;
		t = d1 + radius;
		bits |= FLOATSIGNBITSET( t ) << 1;
		t = d2 + radius;
		bits |= FLOATSIGNBITSET( t ) << 2;
		t = d3 + radius;
		bits |= FLOATSIGNBITSET( t ) << 3;
		t = d0 - radius;
		bits |= FLOATSIGNBITSET( t ) << 4;
		t = d1 - radius;
		bits |= FLOATSIGNBITSET( t ) << 5;
		t = d2 - radius;
		bits |= FLOATSIGNBITSET( t ) << 6;
		t = d3 - radius;
		bits |= FLOATSIGNBITSET( t ) << 7;
		bits ^= 0x0F; // flip lower four bits
		tOr |= bits;
		cullBits[i] = bits;
	}
	totalOr = tOr;
#endif
}

/*
============
idSIMD_SSE::DecalPointCull
============
*/
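// DecalPointCull tests each vertex against six planes and packs one bit per
// plane into the output byte. The first four planes are transposed into the
// aligned locals p0..p3 (SoA: all x, all y, all z, all d), and planes four
// and five into p4..p7, laid out so two vertices can be tested per iteration.
// cmpnltps sets a lane when dist >= 0, so a set bit means "not culled",
// matching the ^ 0x3F flip in the C fallback below.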
void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
	ALIGN16( float p0[4] );
	ALIGN16( float p1[4] );
	ALIGN16( float p2[4] );
	ALIGN16( float p3[4] );
	ALIGN16( float p4[4] );
	ALIGN16( float p5[4] );
	ALIGN16( float p6[4] );
	ALIGN16( float p7[4] );
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	__asm {
		mov ecx, planes
		movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
		movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
		movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
		movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
		movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
		movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
		movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
		movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
		movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
		shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
		movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
		shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
		shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
		movaps p0, xmm0
		movaps p1, xmm1
		movaps p2, xmm2
		movaps p3, xmm3
		movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X
		movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51
		movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51
		shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51
		movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X
		movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53
		movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52
		shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53
		movaps p4, xmm4
		movaps p5, xmm5
		movaps p6, xmm6
		movaps p7, xmm7
		mov esi, verts
		mov edi, cullBits
		mov eax, numVerts
		and eax, ~1
		jz done2
		imul eax, DRAWVERT_SIZE
		add esi, eax
		neg eax
	loopVert2:
		movaps xmm6, p0
		movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm6, xmm0
		movaps xmm7, p1
		movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm7, xmm1
		addps xmm6, xmm7
		movaps xmm7, p2
		movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm7, xmm2
		addps xmm6, xmm7
		addps xmm6, p3
		cmpnltps xmm6, SIMD_SP_zero
		movmskps ecx, xmm6
		movaps xmm6, p0
		movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm6, xmm3
		movaps xmm7, p1
		movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm7, xmm4
		addps xmm6, xmm7
		movaps xmm7, p2
		movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm7, xmm5
		addps xmm6, xmm7
		addps xmm6, p3
		cmpnltps xmm6, SIMD_SP_zero
		movmskps edx, xmm6
		mov ch, dl
		shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm0, p4
		shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm1, p5
		addps xmm0, xmm1
		shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm2, p6
		addps xmm0, xmm2
		addps xmm0, p7
		cmpnltps xmm0, SIMD_SP_zero
		movmskps edx, xmm0
		add edi, 2
		mov dh, dl
		shl dl, 4
		shl dh, 2
		and edx, (3<<4)|(3<<12)
		or ecx, edx
		add eax, 2*DRAWVERT_SIZE
		mov word ptr [edi-2], cx
		jl loopVert2
	done2:
		mov eax, numVerts
		and eax, 1
		jz done
		movaps xmm6, p0
		movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm6, xmm0
		movaps xmm7, p1
		movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm7, xmm1
		addps xmm6, xmm7
		movaps xmm7, p2
		movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
		shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm7, xmm2
		addps xmm6, xmm7
		addps xmm6, p3
		cmpnltps xmm6, SIMD_SP_zero
		movmskps ecx, xmm6
		mulps xmm0, p4
		mulps xmm1, p5
		addps xmm0, xmm1
		mulps xmm2, p6
		addps xmm0, xmm2
		addps xmm0, p7
		cmpnltps xmm0, SIMD_SP_zero
		movmskps edx, xmm0
		and edx, 3
		shl edx, 4
		or ecx, edx
		mov byte ptr [edi], cl
	done:
	}
#else
	int i;
	for ( i = 0; i < numVerts - 1; i += 2 ) { // stop before a trailing odd vertex; it is handled below
		unsigned short bits0, bits1;
		float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
		const idVec3 &v0 = verts[i+0].xyz;
		const idVec3 &v1 = verts[i+1].xyz;
		d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
		d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
		d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
		d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
		d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
		d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
		d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
		d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
		d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
		d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
		d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
		d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
		bits0 = FLOATSIGNBITSET( d0 ) << (0+0);
		bits0 |= FLOATSIGNBITSET( d1 ) << (0+1);
		bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
		bits0 |= FLOATSIGNBITSET( d3 ) << (0+3);
		bits0 |= FLOATSIGNBITSET( d4 ) << (0+4);
		bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
		bits1 = FLOATSIGNBITSET( d6 ) << (8+0);
		bits1 |= FLOATSIGNBITSET( d7 ) << (8+1);
		bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
		bits1 |= FLOATSIGNBITSET( d9 ) << (8+3);
		bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
		bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
		*(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
	}
	if ( numVerts & 1 ) {
		byte bits;
		float d0, d1, d2, d3, d4, d5;
		const idVec3 &v = verts[numVerts - 1].xyz;
		d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
		d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
		d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
		d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
		d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
		d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
		bits = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		bits |= FLOATSIGNBITSET( d2 ) << 2;
		bits |= FLOATSIGNBITSET( d3 ) << 3;
		bits |= FLOATSIGNBITSET( d4 ) << 4;
		bits |= FLOATSIGNBITSET( d5 ) << 5;
		cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
	}
#endif
}

/*
============
idSIMD_SSE::OverlayPointCull
============
*/
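// OverlayPointCull projects each vertex onto the two overlay planes to get
// its (s, t) texture coordinates, then derives four cull bits per vertex:
// bits 0 and 1 are the sign bits of s and t, bits 2 and 3 the sign bits of
// 1 - s and 1 - t, so a zero byte means the vertex lies inside the overlay.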
void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	__asm {
		mov eax, numVerts
		mov edx, verts
		mov esi, texCoords
		mov edi, cullBits
		mov ecx, planes
		movss xmm4, [ecx+ 0]
		movss xmm5, [ecx+16]
		shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
		movss xmm5, [ecx+ 4]
		movss xmm6, [ecx+20]
		shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
		movss xmm6, [ecx+ 8]
		movss xmm7, [ecx+24]
		shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
		movss xmm7, [ecx+12]
		movss xmm0, [ecx+28]
		shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
		and eax, ~1
		jz done2
		add edi, eax
		neg eax
	loopVert2:
		movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm0, xmm4
		movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm1, xmm5
		movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm2, xmm6
		addps xmm0, xmm1
		addps xmm0, xmm2
		addps xmm0, xmm7
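		// xmm0 = s0, t0, s1, t1 for the two verts; store as texcoords, then test s, t, 1-s, 1-t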
		movaps [esi], xmm0
		movaps xmm1, xmm0
		movaps xmm2, SIMD_SP_one
		subps xmm2, xmm0
		shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
		add edx, 2*DRAWVERT_SIZE
		movmskps ecx, xmm0
		mov byte ptr [edi+eax+0], cl
		add esi, 4*4
		movmskps ecx, xmm1
		mov byte ptr [edi+eax+1], cl
		add eax, 2
		jl loopVert2
	done2:
		mov eax, numVerts
		and eax, 1
		jz done
		movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm0, xmm4
		movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm1, xmm5
		movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm2, xmm6
		addps xmm0, xmm1
		addps xmm0, xmm2
		addps xmm0, xmm7
		movlps [esi], xmm0
		movaps xmm1, xmm0
		movaps xmm2, SIMD_SP_one
		subps xmm2, xmm0
		shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		movmskps ecx, xmm0
		mov byte ptr [edi], cl
	done:
	}
#else
	const idPlane &p0 = planes[0];
	const idPlane &p1 = planes[1];
	for ( int i = 0; i < numVerts - 1; i += 2 ) {
		unsigned short bits;
		float d0, d1, d2, d3;
		const idVec3 &v0 = verts[i+0].xyz;
		const idVec3 &v1 = verts[i+1].xyz;
		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
		d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
		d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
		texCoords[i+0][0] = d0;
		texCoords[i+0][1] = d1;
		texCoords[i+1][0] = d2;
		texCoords[i+1][1] = d3;
		bits = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		bits |= FLOATSIGNBITSET( d2 ) << 8;
		bits |= FLOATSIGNBITSET( d3 ) << 9;
		d0 = 1.0f - d0;
		d1 = 1.0f - d1;
		d2 = 1.0f - d2;
		d3 = 1.0f - d3;
		bits |= FLOATSIGNBITSET( d0 ) << 2;
		bits |= FLOATSIGNBITSET( d1 ) << 3;
		bits |= FLOATSIGNBITSET( d2 ) << 10;
		bits |= FLOATSIGNBITSET( d3 ) << 11;
		*(unsigned short *)(cullBits + i) = bits;
	}
	if ( numVerts & 1 ) {
		byte bits;
		float d0, d1;
		const idVec3 &v0 = verts[numVerts - 1].xyz;
		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
		texCoords[numVerts - 1][0] = d0;
		texCoords[numVerts - 1][1] = d1;
		bits = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		d0 = 1.0f - d0;
		d1 = 1.0f - d1;
		bits |= FLOATSIGNBITSET( d0 ) << 2;
		bits |= FLOATSIGNBITSET( d1 ) << 3;
		cullBits[numVerts - 1] = bits;
	}
#endif
}

/*
============
idSIMD_SSE::DeriveTriPlanes
============
*/
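// DeriveTriPlanes computes one plane per triangle: the normal is the cross
// product of the edge vectors (b - a) and (c - a), normalized with the
// approximate rsqrtps (~12 bits of precision), and the plane distance is
// fit through vertex a. The main loop is unrolled to handle four triangles
// at a time in SoA form, rotating each scalar difference into place with
// movss + shufps; the tail loop handles the remaining triangles one by one.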
void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
#if 1
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	__asm {
		mov eax, numIndexes
		shl eax, 2
		mov esi, verts
		mov edi, indexes
		mov edx, planes
		add edi, eax
		neg eax
		add eax, 4*12
		jge done4
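		// eax counts index bytes relative to the end of the list, biased by
		// 4*12 so each pass of loopPlane4 consumes twelve indexes (four triangles)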
	loopPlane4:
		mov ebx, [edi+eax-4*12+4]
		imul ebx, DRAWVERT_SIZE
		mov ecx, [edi+eax-4*12+0]
		imul ecx, DRAWVERT_SIZE
		movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		mov ebx, [edi+eax-4*12+8]
		imul ebx, DRAWVERT_SIZE
		shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
		movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		mov ebx, [edi+eax-3*12+4]
		imul ebx, DRAWVERT_SIZE
		mov ecx, [edi+eax-3*12+0]
		imul ecx, DRAWVERT_SIZE
		shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm0, xmm6
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm1, xmm7
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm2, xmm6
		mov ebx, [edi+eax-3*12+8]
		imul ebx, DRAWVERT_SIZE
		shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm3, xmm7
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm4, xmm6
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm5, xmm7
		mov ebx, [edi+eax-2*12+4]
		imul ebx, DRAWVERT_SIZE
		mov ecx, [edi+eax-2*12+0]
		imul ecx, DRAWVERT_SIZE
		shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm0, xmm6
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm1, xmm7
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm2, xmm6
		mov ebx, [edi+eax-2*12+8]
		imul ebx, DRAWVERT_SIZE
		shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm3, xmm7
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm4, xmm6
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm5, xmm7
		mov ebx, [edi+eax-1*12+4]
		imul ebx, DRAWVERT_SIZE
		mov ecx, [edi+eax-1*12+0]
		imul ecx, DRAWVERT_SIZE
		shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm0, xmm6
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm1, xmm7
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm2, xmm6
		mov ebx, [edi+eax-1*12+8]
		imul ebx, DRAWVERT_SIZE
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm3, xmm7
		movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm4, xmm6
		movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm5, xmm7
		movaps xmm6, xmm4
		mulps xmm6, xmm2
		movaps xmm7, xmm5
		mulps xmm7, xmm1
		subps xmm6, xmm7
		mulps xmm5, xmm0
		mulps xmm2, xmm3
		subps xmm5, xmm2
		mulps xmm3, xmm1
		mulps xmm4, xmm0
		subps xmm3, xmm4
		movaps xmm0, xmm6
		mulps xmm6, xmm6
		movaps xmm1, xmm5
		mulps xmm5, xmm5
		movaps xmm2, xmm3
		mulps xmm3, xmm3
		addps xmm3, xmm5
		addps xmm3, xmm6
		rsqrtps xmm3, xmm3
		add edx, 4*16
		mov ecx, [edi+eax-1*12+0]
		imul ecx, DRAWVERT_SIZE
		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3
		movss [edx-1*16+0], xmm0
		movss [edx-1*16+4], xmm1
		movss [edx-1*16+8], xmm2
		mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		xorps xmm0, SIMD_SP_singleSignBitMask
		subss xmm0, xmm1
		subss xmm0, xmm2
		movss [edx-1*16+12], xmm0
		mov ecx, [edi+eax-2*12+0]
		imul ecx, DRAWVERT_SIZE
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [edx-2*16+0], xmm0
		movss [edx-2*16+4], xmm1
		movss [edx-2*16+8], xmm2
		mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		xorps xmm0, SIMD_SP_singleSignBitMask
		subss xmm0, xmm1
		subss xmm0, xmm2
		movss [edx-2*16+12], xmm0
		mov ecx, [edi+eax-3*12+0]
		imul ecx, DRAWVERT_SIZE
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [edx-3*16+0], xmm0
		movss [edx-3*16+4], xmm1
		movss [edx-3*16+8], xmm2
		mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		xorps xmm0, SIMD_SP_singleSignBitMask
		subss xmm0, xmm1
		subss xmm0, xmm2
		movss [edx-3*16+12], xmm0
		mov ecx, [edi+eax-4*12+0]
		imul ecx, DRAWVERT_SIZE
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [edx-4*16+0], xmm0
		movss [edx-4*16+4], xmm1
		movss [edx-4*16+8], xmm2
		mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		xorps xmm0, SIMD_SP_singleSignBitMask
		subss xmm0, xmm1
		subss xmm0, xmm2
		movss [edx-4*16+12], xmm0
		add eax, 4*12
		jle loopPlane4
	done4:
		sub eax, 4*12
		jge done
	loopPlane1:
		mov ebx, [edi+eax+4]
		imul ebx, DRAWVERT_SIZE
		mov ecx, [edi+eax+0]
		imul ecx, DRAWVERT_SIZE
		movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		mov ebx, [edi+eax+8]
		imul ebx, DRAWVERT_SIZE
		movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss xmm6, xmm4
		mulss xmm6, xmm2
		movss xmm7, xmm5
		mulss xmm7, xmm1
		subss xmm6, xmm7
		mulss xmm5, xmm0
		mulss xmm2, xmm3
		subss xmm5, xmm2
		mulss xmm3, xmm1
		mulss xmm4, xmm0
		subss xmm3, xmm4
		movss xmm0, xmm6
		mulss xmm6, xmm6
		movss xmm1, xmm5
		mulss xmm5, xmm5
		movss xmm2, xmm3
		mulss xmm3, xmm3
		addss xmm3, xmm5
		addss xmm3, xmm6
		rsqrtss xmm3, xmm3
		add edx, 1*16
		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3
		movss [edx-1*16+0], xmm0
		movss [edx-1*16+4], xmm1
		movss [edx-1*16+8], xmm2
		mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		xorps xmm0, SIMD_SP_singleSignBitMask
		subss xmm0, xmm1
		subss xmm0, xmm2
		movss [edx-1*16+12], xmm0
		add eax, 1*12
		jl loopPlane1
	done:
	}
#else
	int i, j;
	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
		ALIGN16( float d0[4] );
		ALIGN16( float d1[4] );
		ALIGN16( float d2[4] );
		ALIGN16( float d3[4] );
		ALIGN16( float d4[4] );
		ALIGN16( float d5[4] );
		ALIGN16( float n0[4] );
		ALIGN16( float n1[4] );
		ALIGN16( float n2[4] );
		for ( j = 0; j < 4; j++ ) {
			const idDrawVert *a, *b, *c;
			a = verts + indexes[i + j * 3 + 0];
			b = verts + indexes[i + j * 3 + 1];
			c = verts + indexes[i + j * 3 + 2];
			d0[j] = b->xyz[0] - a->xyz[0];
			d1[j] = b->xyz[1] - a->xyz[1];
			d2[j] = b->xyz[2] - a->xyz[2];
			d3[j] = c->xyz[0] - a->xyz[0];
			d4[j] = c->xyz[1] - a->xyz[1];
			d5[j] = c->xyz[2] - a->xyz[2];
		}
		ALIGN16( float tmp[4] );
		n0[0] = d4[0] * d2[0];
		n0[1] = d4[1] * d2[1];
		n0[2] = d4[2] * d2[2];
		n0[3] = d4[3] * d2[3];
		n0[0] -= d5[0] * d1[0];
		n0[1] -= d5[1] * d1[1];
		n0[2] -= d5[2] * d1[2];
		n0[3] -= d5[3] * d1[3];
		n1[0] = d5[0] * d0[0];
		n1[1] = d5[1] * d0[1];
		n1[2] = d5[2] * d0[2];
		n1[3] = d5[3] * d0[3];
		n1[0] -= d3[0] * d2[0];
		n1[1] -= d3[1] * d2[1];
		n1[2] -= d3[2] * d2[2];
		n1[3] -= d3[3] * d2[3];
		n2[0] = d3[0] * d1[0];
		n2[1] = d3[1] * d1[1];
		n2[2] = d3[2] * d1[2];
		n2[3] = d3[3] * d1[3];
		n2[0] -= d4[0] * d0[0];
		n2[1] -= d4[1] * d0[1];
		n2[2] -= d4[2] * d0[2];
		n2[3] -= d4[3] * d0[3];
		tmp[0] = n0[0] * n0[0];
		tmp[1] = n0[1] * n0[1];
		tmp[2] = n0[2] * n0[2];
		tmp[3] = n0[3] * n0[3];
		tmp[0] += n1[0] * n1[0];
		tmp[1] += n1[1] * n1[1];
		tmp[2] += n1[2] * n1[2];
		tmp[3] += n1[3] * n1[3];
		tmp[0] += n2[0] * n2[0];
		tmp[1] += n2[1] * n2[1];
		tmp[2] += n2[2] * n2[2];
		tmp[3] += n2[3] * n2[3];
		tmp[0] = idMath::RSqrt( tmp[0] );
		tmp[1] = idMath::RSqrt( tmp[1] );
		tmp[2] = idMath::RSqrt( tmp[2] );
		tmp[3] = idMath::RSqrt( tmp[3] );
		n0[0] *= tmp[0];
		n0[1] *= tmp[1];
		n0[2] *= tmp[2];
		n0[3] *= tmp[3];
		n1[0] *= tmp[0];
		n1[1] *= tmp[1];
		n1[2] *= tmp[2];
		n1[3] *= tmp[3];
		n2[0] *= tmp[0];
		n2[1] *= tmp[1];
		n2[2] *= tmp[2];
		n2[3] *= tmp[3];
		for ( j = 0; j < 4; j++ ) {
			const idDrawVert *a;
			a = verts + indexes[i + j * 3];
			planes->Normal()[0] = n0[j];
			planes->Normal()[1] = n1[j];
			planes->Normal()[2] = n2[j];
			planes->FitThroughPoint( a->xyz );
			planes++;
		}
	}
	for ( ; i < numIndexes; i += 3 ) {
		const idDrawVert *a, *b, *c;
		float d0, d1, d2, d3, d4, d5;
		float n0, n1, n2;
		a = verts + indexes[i + 0];
		b = verts + indexes[i + 1];
		c = verts + indexes[i + 2];
		d0 = b->xyz[0] - a->xyz[0];
		d1 = b->xyz[1] - a->xyz[1];
		d2 = b->xyz[2] - a->xyz[2];
		d3 = c->xyz[0] - a->xyz[0];
		d4 = c->xyz[1] - a->xyz[1];
		d5 = c->xyz[2] - a->xyz[2];
		float tmp;
		n0 = d4 * d2 - d5 * d1;
		n1 = d5 * d0 - d3 * d2;
		n2 = d3 * d1 - d4 * d0;
		tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
		n0 *= tmp;
		n1 *= tmp;
		n2 *= tmp;
		planes->Normal()[0] = n0;
		planes->Normal()[1] = n1;
		planes->Normal()[2] = n2;
		planes->FitThroughPoint( a->xyz );
		planes++;
	}
#endif
}

/*
============
idSIMD_SSE::DeriveTangents
============
*/
//#define REFINE_TANGENT_SQUAREROOT
#define FIX_DEGENERATE_TANGENT
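// DeriveTangents computes a per-triangle plane and per-vertex normal and
// tangent vectors from the xyz and texture-space edge gradients. The sign of
// the texture-space area (d3*d9 - d4*d8) flips the tangent normalization for
// mirrored UVs. FIX_DEGENERATE_TANGENT keeps rsqrt from operating on zero or
// negative input by substituting a tiny positive value.
// REFINE_TANGENT_SQUAREROOT adds one Newton-Raphson iteration to the rsqrt
// estimate; judging from its use below, SIMD_SP_rsqrt_c0 and SIMD_SP_rsqrt_c1
// hold the constants 3.0f and -0.5f of the standard refinement
// y' = -0.5f * y * ( x * y * y - 3.0f ).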
void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	int i;
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
	assert( planes != NULL );
	assert( verts != NULL );
	assert( numVerts >= 0 );
#ifdef REFINE_TANGENT_SQUAREROOT
	__asm {
		movaps xmm6, SIMD_SP_rsqrt_c0
		movaps xmm7, SIMD_SP_rsqrt_c1
	}
#endif
	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
	memset( used, 0, numVerts * sizeof( used[0] ) );
	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
		idDrawVert *a, *b, *c;
		ALIGN16( unsigned long signBit[4] );
		ALIGN16( float d0[4] );
		ALIGN16( float d1[4] );
		ALIGN16( float d2[4] );
		ALIGN16( float d3[4] );
		ALIGN16( float d4[4] );
		ALIGN16( float d5[4] );
		ALIGN16( float d6[4] );
		ALIGN16( float d7[4] );
		ALIGN16( float d8[4] );
		ALIGN16( float d9[4] );
		ALIGN16( float n0[4] );
		ALIGN16( float n1[4] );
		ALIGN16( float n2[4] );
		ALIGN16( float t0[4] );
		ALIGN16( float t1[4] );
		ALIGN16( float t2[4] );
		ALIGN16( float t3[4] );
		ALIGN16( float t4[4] );
		ALIGN16( float t5[4] );
		for ( int j = 0; j < 4; j++ ) {
			a = verts + indexes[i + j * 3 + 0];
			b = verts + indexes[i + j * 3 + 1];
			c = verts + indexes[i + j * 3 + 2];
			d0[j] = b->xyz[0] - a->xyz[0];
			d1[j] = b->xyz[1] - a->xyz[1];
			d2[j] = b->xyz[2] - a->xyz[2];
			d3[j] = b->st[0] - a->st[0];
			d4[j] = b->st[1] - a->st[1];
			d5[j] = c->xyz[0] - a->xyz[0];
			d6[j] = c->xyz[1] - a->xyz[1];
			d7[j] = c->xyz[2] - a->xyz[2];
			d8[j] = c->st[0] - a->st[0];
			d9[j] = c->st[1] - a->st[1];
		}
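		// d0-d2 / d5-d7 are the xyz edge vectors (b - a) and (c - a) for the
		// four triangles, d3-d4 / d8-d9 the matching st deltas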
#if 1
		__asm {
			// normal
			movaps xmm0, d6
			mulps xmm0, d2
			movaps xmm1, d7
			mulps xmm1, d1
			subps xmm0, xmm1
			movaps xmm1, d7
			mulps xmm1, d0
			movaps xmm2, d5
			mulps xmm2, d2
			subps xmm1, xmm2
			movaps xmm2, d5
			mulps xmm2, d1
			movaps xmm3, d6
			mulps xmm3, d0
			subps xmm2, xmm3
			movaps xmm3, xmm0
			movaps xmm4, xmm1
			movaps xmm5, xmm2
			mulps xmm3, xmm3
			mulps xmm4, xmm4
			mulps xmm5, xmm5
			addps xmm3, xmm4
			addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask // make sure the values are positive
			orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtps xmm4, xmm3
			mulps xmm3, xmm4
			mulps xmm3, xmm4
			subps xmm3, xmm6
			mulps xmm4, xmm7
			mulps xmm3, xmm4
#else
			rsqrtps xmm3, xmm3
#endif
			mulps xmm0, xmm3
			movaps n0, xmm0
			mulps xmm1, xmm3
			movaps n1, xmm1
			mulps xmm2, xmm3
			movaps n2, xmm2
			// area sign bit
			movaps xmm0, d3
			mulps xmm0, d9
			movaps xmm1, d4
			mulps xmm1, d8
			subps xmm0, xmm1
			andps xmm0, SIMD_SP_signBitMask
			movaps signBit, xmm0
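			// signBit now holds the sign of the texture-space area for each
			// triangle; xoring it into the rsqrt result flips both tangents
			// where the UV mapping is mirrored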
  12606. // first tangent
  12607. movaps xmm0, d0
  12608. mulps xmm0, d9
  12609. movaps xmm1, d4
  12610. mulps xmm1, d5
  12611. subps xmm0, xmm1
  12612. movaps xmm1, d1
  12613. mulps xmm1, d9
  12614. movaps xmm2, d4
  12615. mulps xmm2, d6
  12616. subps xmm1, xmm2
  12617. movaps xmm2, d2
  12618. mulps xmm2, d9
  12619. movaps xmm3, d4
  12620. mulps xmm3, d7
  12621. subps xmm2, xmm3
  12622. movaps xmm3, xmm0
  12623. movaps xmm4, xmm1
  12624. movaps xmm5, xmm2
  12625. mulps xmm3, xmm3
  12626. mulps xmm4, xmm4
  12627. mulps xmm5, xmm5
  12628. addps xmm3, xmm4
  12629. addps xmm3, xmm5
  12630. #ifdef FIX_DEGENERATE_TANGENT
  12631. xorps xmm4, xmm4
  12632. cmpeqps xmm4, xmm3
  12633. andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
  12634. andps xmm3, SIMD_SP_absMask // make sure the values are positive
  12635. orps xmm3, xmm4
  12636. #endif
  12637. #ifdef REFINE_TANGENT_SQUAREROOT
  12638. rsqrtps xmm4, xmm3
  12639. mulps xmm3, xmm4
  12640. mulps xmm3, xmm4
  12641. subps xmm3, xmm6
  12642. mulps xmm4, xmm7
  12643. mulps xmm3, xmm4
  12644. #else
  12645. rsqrtps xmm3, xmm3
  12646. #endif
  12647. xorps xmm3, signBit
  12648. mulps xmm0, xmm3
  12649. movaps t0, xmm0
  12650. mulps xmm1, xmm3
  12651. movaps t1, xmm1
  12652. mulps xmm2, xmm3
  12653. movaps t2, xmm2
  12654. // second tangent
  12655. movaps xmm0, d3
  12656. mulps xmm0, d5
  12657. movaps xmm1, d0
  12658. mulps xmm1, d8
  12659. subps xmm0, xmm1
  12660. movaps xmm1, d3
  12661. mulps xmm1, d6
  12662. movaps xmm2, d1
  12663. mulps xmm2, d8
  12664. subps xmm1, xmm2
  12665. movaps xmm2, d3
  12666. mulps xmm2, d7
  12667. movaps xmm3, d2
  12668. mulps xmm3, d8
  12669. subps xmm2, xmm3
  12670. movaps xmm3, xmm0
  12671. movaps xmm4, xmm1
  12672. movaps xmm5, xmm2
  12673. mulps xmm3, xmm3
  12674. mulps xmm4, xmm4
  12675. mulps xmm5, xmm5
  12676. addps xmm3, xmm4
  12677. addps xmm3, xmm5
  12678. #ifdef FIX_DEGENERATE_TANGENT
  12679. xorps xmm4, xmm4
  12680. cmpeqps xmm4, xmm3
  12681. andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
  12682. andps xmm3, SIMD_SP_absMask // make sure the values are positive
  12683. orps xmm3, xmm4
  12684. #endif
  12685. #ifdef REFINE_TANGENT_SQUAREROOT
  12686. rsqrtps xmm4, xmm3
  12687. mulps xmm3, xmm4
  12688. mulps xmm3, xmm4
  12689. subps xmm3, xmm6
  12690. mulps xmm4, xmm7
  12691. mulps xmm3, xmm4
  12692. #else
  12693. rsqrtps xmm3, xmm3
  12694. #endif
  12695. xorps xmm3, signBit
  12696. mulps xmm0, xmm3
  12697. movaps t3, xmm0
  12698. mulps xmm1, xmm3
  12699. movaps t4, xmm1
  12700. mulps xmm2, xmm3
  12701. movaps t5, xmm2
  12702. }
  12703. #else
  12704. ALIGN16( float tmp[4] );
  12705. // normal
  12706. n0[0] = d6[0] * d2[0];
  12707. n0[1] = d6[1] * d2[1];
  12708. n0[2] = d6[2] * d2[2];
  12709. n0[3] = d6[3] * d2[3];
  12710. n0[0] -= d7[0] * d1[0];
  12711. n0[1] -= d7[1] * d1[1];
  12712. n0[2] -= d7[2] * d1[2];
  12713. n0[3] -= d7[3] * d1[3];
  12714. n1[0] = d7[0] * d0[0];
  12715. n1[1] = d7[1] * d0[1];
  12716. n1[2] = d7[2] * d0[2];
  12717. n1[3] = d7[3] * d0[3];
  12718. n1[0] -= d5[0] * d2[0];
  12719. n1[1] -= d5[1] * d2[1];
  12720. n1[2] -= d5[2] * d2[2];
  12721. n1[3] -= d5[3] * d2[3];
  12722. n2[0] = d5[0] * d1[0];
  12723. n2[1] = d5[1] * d1[1];
  12724. n2[2] = d5[2] * d1[2];
  12725. n2[3] = d5[3] * d1[3];
  12726. n2[0] -= d6[0] * d0[0];
  12727. n2[1] -= d6[1] * d0[1];
  12728. n2[2] -= d6[2] * d0[2];
  12729. n2[3] -= d6[3] * d0[3];
  12730. tmp[0] = n0[0] * n0[0];
  12731. tmp[1] = n0[1] * n0[1];
  12732. tmp[2] = n0[2] * n0[2];
  12733. tmp[3] = n0[3] * n0[3];
  12734. tmp[0] += n1[0] * n1[0];
  12735. tmp[1] += n1[1] * n1[1];
  12736. tmp[2] += n1[2] * n1[2];
  12737. tmp[3] += n1[3] * n1[3];
  12738. tmp[0] += n2[0] * n2[0];
  12739. tmp[1] += n2[1] * n2[1];
  12740. tmp[2] += n2[2] * n2[2];
  12741. tmp[3] += n2[3] * n2[3];
  12742. tmp[0] = idMath::RSqrt( tmp[0] );
  12743. tmp[1] = idMath::RSqrt( tmp[1] );
  12744. tmp[2] = idMath::RSqrt( tmp[2] );
  12745. tmp[3] = idMath::RSqrt( tmp[3] );
  12746. n0[0] *= tmp[0];
  12747. n0[1] *= tmp[1];
  12748. n0[2] *= tmp[2];
  12749. n0[3] *= tmp[3];
  12750. n1[0] *= tmp[0];
  12751. n1[1] *= tmp[1];
  12752. n1[2] *= tmp[2];
  12753. n1[3] *= tmp[3];
  12754. n2[0] *= tmp[0];
  12755. n2[1] *= tmp[1];
  12756. n2[2] *= tmp[2];
  12757. n2[3] *= tmp[3];
  12758. // area sign bit
  12759. tmp[0] = d3[0] * d9[0];
  12760. tmp[1] = d3[1] * d9[1];
  12761. tmp[2] = d3[2] * d9[2];
  12762. tmp[3] = d3[3] * d9[3];
  12763. tmp[0] -= d4[0] * d8[0];
  12764. tmp[1] -= d4[1] * d8[1];
  12765. tmp[2] -= d4[2] * d8[2];
  12766. tmp[3] -= d4[3] * d8[3];
  12767. signBit[0] = ( *(unsigned long *)&tmp[0] ) & ( 1 << 31 );
  12768. signBit[1] = ( *(unsigned long *)&tmp[1] ) & ( 1 << 31 );
  12769. signBit[2] = ( *(unsigned long *)&tmp[2] ) & ( 1 << 31 );
  12770. signBit[3] = ( *(unsigned long *)&tmp[3] ) & ( 1 << 31 );
  12771. // first tangent
  12772. t0[0] = d0[0] * d9[0];
  12773. t0[1] = d0[1] * d9[1];
  12774. t0[2] = d0[2] * d9[2];
  12775. t0[3] = d0[3] * d9[3];
  12776. t0[0] -= d4[0] * d5[0];
  12777. t0[1] -= d4[1] * d5[1];
  12778. t0[2] -= d4[2] * d5[2];
  12779. t0[3] -= d4[3] * d5[3];
  12780. t1[0] = d1[0] * d9[0];
  12781. t1[1] = d1[1] * d9[1];
  12782. t1[2] = d1[2] * d9[2];
  12783. t1[3] = d1[3] * d9[3];
  12784. t1[0] -= d4[0] * d6[0];
  12785. t1[1] -= d4[1] * d6[1];
  12786. t1[2] -= d4[2] * d6[2];
  12787. t1[3] -= d4[3] * d6[3];
  12788. t2[0] = d2[0] * d9[0];
  12789. t2[1] = d2[1] * d9[1];
  12790. t2[2] = d2[2] * d9[2];
  12791. t2[3] = d2[3] * d9[3];
  12792. t2[0] -= d4[0] * d7[0];
  12793. t2[1] -= d4[1] * d7[1];
  12794. t2[2] -= d4[2] * d7[2];
  12795. t2[3] -= d4[3] * d7[3];
  12796. tmp[0] = t0[0] * t0[0];
  12797. tmp[1] = t0[1] * t0[1];
  12798. tmp[2] = t0[2] * t0[2];
  12799. tmp[3] = t0[3] * t0[3];
  12800. tmp[0] += t1[0] * t1[0];
  12801. tmp[1] += t1[1] * t1[1];
  12802. tmp[2] += t1[2] * t1[2];
  12803. tmp[3] += t1[3] * t1[3];
  12804. tmp[0] += t2[0] * t2[0];
  12805. tmp[1] += t2[1] * t2[1];
  12806. tmp[2] += t2[2] * t2[2];
  12807. tmp[3] += t2[3] * t2[3];
  12808. tmp[0] = idMath::RSqrt( tmp[0] );
  12809. tmp[1] = idMath::RSqrt( tmp[1] );
  12810. tmp[2] = idMath::RSqrt( tmp[2] );
  12811. tmp[3] = idMath::RSqrt( tmp[3] );
  12812. *(unsigned long *)&tmp[0] ^= signBit[0];
  12813. *(unsigned long *)&tmp[1] ^= signBit[1];
  12814. *(unsigned long *)&tmp[2] ^= signBit[2];
  12815. *(unsigned long *)&tmp[3] ^= signBit[3];
  12816. t0[0] *= tmp[0];
  12817. t0[1] *= tmp[1];
  12818. t0[2] *= tmp[2];
  12819. t0[3] *= tmp[3];
  12820. t1[0] *= tmp[0];
  12821. t1[1] *= tmp[1];
  12822. t1[2] *= tmp[2];
  12823. t1[3] *= tmp[3];
  12824. t2[0] *= tmp[0];
  12825. t2[1] *= tmp[1];
  12826. t2[2] *= tmp[2];
  12827. t2[3] *= tmp[3];
  12828. // second tangent
  12829. t3[0] = d3[0] * d5[0];
  12830. t3[1] = d3[1] * d5[1];
  12831. t3[2] = d3[2] * d5[2];
  12832. t3[3] = d3[3] * d5[3];
  12833. t3[0] -= d0[0] * d8[0];
  12834. t3[1] -= d0[1] * d8[1];
  12835. t3[2] -= d0[2] * d8[2];
  12836. t3[3] -= d0[3] * d8[3];
  12837. t4[0] = d3[0] * d6[0];
  12838. t4[1] = d3[1] * d6[1];
  12839. t4[2] = d3[2] * d6[2];
  12840. t4[3] = d3[3] * d6[3];
  12841. t4[0] -= d1[0] * d8[0];
  12842. t4[1] -= d1[1] * d8[1];
  12843. t4[2] -= d1[2] * d8[2];
  12844. t4[3] -= d1[3] * d8[3];
  12845. t5[0] = d3[0] * d7[0];
  12846. t5[1] = d3[1] * d7[1];
  12847. t5[2] = d3[2] * d7[2];
  12848. t5[3] = d3[3] * d7[3];
  12849. t5[0] -= d2[0] * d8[0];
  12850. t5[1] -= d2[1] * d8[1];
  12851. t5[2] -= d2[2] * d8[2];
  12852. t5[3] -= d2[3] * d8[3];
  12853. tmp[0] = t3[0] * t3[0];
  12854. tmp[1] = t3[1] * t3[1];
  12855. tmp[2] = t3[2] * t3[2];
  12856. tmp[3] = t3[3] * t3[3];
  12857. tmp[0] += t4[0] * t4[0];
  12858. tmp[1] += t4[1] * t4[1];
  12859. tmp[2] += t4[2] * t4[2];
  12860. tmp[3] += t4[3] * t4[3];
  12861. tmp[0] += t5[0] * t5[0];
  12862. tmp[1] += t5[1] * t5[1];
  12863. tmp[2] += t5[2] * t5[2];
  12864. tmp[3] += t5[3] * t5[3];
  12865. tmp[0] = idMath::RSqrt( tmp[0] );
  12866. tmp[1] = idMath::RSqrt( tmp[1] );
  12867. tmp[2] = idMath::RSqrt( tmp[2] );
  12868. tmp[3] = idMath::RSqrt( tmp[3] );
  12869. *(unsigned long *)&tmp[0] ^= signBit[0];
  12870. *(unsigned long *)&tmp[1] ^= signBit[1];
  12871. *(unsigned long *)&tmp[2] ^= signBit[2];
  12872. *(unsigned long *)&tmp[3] ^= signBit[3];
  12873. t3[0] *= tmp[0];
  12874. t3[1] *= tmp[1];
  12875. t3[2] *= tmp[2];
  12876. t3[3] *= tmp[3];
  12877. t4[0] *= tmp[0];
  12878. t4[1] *= tmp[1];
  12879. t4[2] *= tmp[2];
  12880. t4[3] *= tmp[3];
  12881. t5[0] *= tmp[0];
  12882. t5[1] *= tmp[1];
  12883. t5[2] *= tmp[2];
  12884. t5[3] *= tmp[3];
  12885. #endif
  12886. for ( int j = 0; j < 4; j++ ) {
  12887. const int v0 = indexes[i + j * 3 + 0];
  12888. const int v1 = indexes[i + j * 3 + 1];
  12889. const int v2 = indexes[i + j * 3 + 2];
  12890. a = verts + v0;
  12891. b = verts + v1;
  12892. c = verts + v2;
  12893. planes->Normal()[0] = n0[j];
  12894. planes->Normal()[1] = n1[j];
  12895. planes->Normal()[2] = n2[j];
  12896. planes->FitThroughPoint( a->xyz );
  12897. planes++;
  12898. if ( used[v0] ) {
  12899. a->normal[0] += n0[j];
  12900. a->normal[1] += n1[j];
  12901. a->normal[2] += n2[j];
  12902. a->tangents[0][0] += t0[j];
  12903. a->tangents[0][1] += t1[j];
  12904. a->tangents[0][2] += t2[j];
  12905. a->tangents[1][0] += t3[j];
  12906. a->tangents[1][1] += t4[j];
  12907. a->tangents[1][2] += t5[j];
  12908. } else {
  12909. a->normal[0] = n0[j];
  12910. a->normal[1] = n1[j];
  12911. a->normal[2] = n2[j];
  12912. a->tangents[0][0] = t0[j];
  12913. a->tangents[0][1] = t1[j];
  12914. a->tangents[0][2] = t2[j];
  12915. a->tangents[1][0] = t3[j];
  12916. a->tangents[1][1] = t4[j];
  12917. a->tangents[1][2] = t5[j];
  12918. used[v0] = true;
  12919. }
  12920. if ( used[v1] ) {
  12921. b->normal[0] += n0[j];
  12922. b->normal[1] += n1[j];
  12923. b->normal[2] += n2[j];
  12924. b->tangents[0][0] += t0[j];
  12925. b->tangents[0][1] += t1[j];
  12926. b->tangents[0][2] += t2[j];
  12927. b->tangents[1][0] += t3[j];
  12928. b->tangents[1][1] += t4[j];
  12929. b->tangents[1][2] += t5[j];
  12930. } else {
  12931. b->normal[0] = n0[j];
  12932. b->normal[1] = n1[j];
  12933. b->normal[2] = n2[j];
  12934. b->tangents[0][0] = t0[j];
  12935. b->tangents[0][1] = t1[j];
  12936. b->tangents[0][2] = t2[j];
  12937. b->tangents[1][0] = t3[j];
  12938. b->tangents[1][1] = t4[j];
  12939. b->tangents[1][2] = t5[j];
  12940. used[v1] = true;
  12941. }
  12942. if ( used[v2] ) {
  12943. c->normal[0] += n0[j];
  12944. c->normal[1] += n1[j];
  12945. c->normal[2] += n2[j];
  12946. c->tangents[0][0] += t0[j];
  12947. c->tangents[0][1] += t1[j];
  12948. c->tangents[0][2] += t2[j];
  12949. c->tangents[1][0] += t3[j];
  12950. c->tangents[1][1] += t4[j];
  12951. c->tangents[1][2] += t5[j];
  12952. } else {
  12953. c->normal[0] = n0[j];
  12954. c->normal[1] = n1[j];
  12955. c->normal[2] = n2[j];
  12956. c->tangents[0][0] = t0[j];
  12957. c->tangents[0][1] = t1[j];
  12958. c->tangents[0][2] = t2[j];
  12959. c->tangents[1][0] = t3[j];
  12960. c->tangents[1][1] = t4[j];
  12961. c->tangents[1][2] = t5[j];
  12962. used[v2] = true;
  12963. }
  12964. }
  12965. }
  12966. for ( ; i < numIndexes; i += 3 ) {
  12967. idDrawVert *a, *b, *c;
  12968. ALIGN16( unsigned long signBit[4] );
  12969. float d0, d1, d2, d3, d4;
  12970. float d5, d6, d7, d8, d9;
  12971. float n0, n1, n2;
  12972. float t0, t1, t2;
  12973. float t3, t4, t5;
  12974. const int v0 = indexes[i + 0];
  12975. const int v1 = indexes[i + 1];
  12976. const int v2 = indexes[i + 2];
  12977. a = verts + v0;
  12978. b = verts + v1;
  12979. c = verts + v2;
  12980. d0 = b->xyz[0] - a->xyz[0];
  12981. d1 = b->xyz[1] - a->xyz[1];
  12982. d2 = b->xyz[2] - a->xyz[2];
  12983. d3 = b->st[0] - a->st[0];
  12984. d4 = b->st[1] - a->st[1];
  12985. d5 = c->xyz[0] - a->xyz[0];
  12986. d6 = c->xyz[1] - a->xyz[1];
  12987. d7 = c->xyz[2] - a->xyz[2];
  12988. d8 = c->st[0] - a->st[0];
  12989. d9 = c->st[1] - a->st[1];
  12990. #if 1
  12991. __asm {
  12992. // normal
  12993. movss xmm0, d6
  12994. mulss xmm0, d2
  12995. movss xmm1, d7
  12996. mulss xmm1, d1
  12997. subss xmm0, xmm1
  12998. movss xmm1, d7
  12999. mulss xmm1, d0
  13000. movss xmm2, d5
  13001. mulss xmm2, d2
  13002. subss xmm1, xmm2
  13003. movss xmm2, d5
  13004. mulss xmm2, d1
  13005. movss xmm3, d6
  13006. mulss xmm3, d0
  13007. subss xmm2, xmm3
  13008. movss xmm3, xmm0
  13009. movss xmm4, xmm1
  13010. movss xmm5, xmm2
  13011. mulss xmm3, xmm3
  13012. mulss xmm4, xmm4
  13013. mulss xmm5, xmm5
  13014. addss xmm3, xmm4
  13015. addss xmm3, xmm5
  13016. #ifdef FIX_DEGENERATE_TANGENT
  13017. xorps xmm4, xmm4
  13018. cmpeqps xmm4, xmm3
  13019. andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
  13020. andps xmm3, SIMD_SP_absMask // make sure the values are positive
  13021. orps xmm3, xmm4
  13022. #endif
  13023. #ifdef REFINE_TANGENT_SQUAREROOT
  13024. rsqrtss xmm4, xmm3
  13025. mulss xmm3, xmm4
  13026. mulss xmm3, xmm4
  13027. subss xmm3, xmm6
  13028. mulss xmm4, xmm7
  13029. mulss xmm3, xmm4
  13030. #else
  13031. rsqrtss xmm3, xmm3
  13032. #endif
  13033. mulss xmm0, xmm3
  13034. movss n0, xmm0
  13035. mulss xmm1, xmm3
  13036. movss n1, xmm1
  13037. mulss xmm2, xmm3
  13038. movss n2, xmm2
  13039. // area sign bit
  13040. movss xmm0, d3
  13041. mulss xmm0, d9
  13042. movss xmm1, d4
  13043. mulss xmm1, d8
  13044. subss xmm0, xmm1
  13045. andps xmm0, SIMD_SP_signBitMask
  13046. movaps signBit, xmm0
  13047. // first tangent
  13048. movss xmm0, d0
  13049. mulss xmm0, d9
  13050. movss xmm1, d4
  13051. mulss xmm1, d5
  13052. subss xmm0, xmm1
  13053. movss xmm1, d1
  13054. mulss xmm1, d9
  13055. movss xmm2, d4
  13056. mulss xmm2, d6
  13057. subss xmm1, xmm2
  13058. movss xmm2, d2
  13059. mulss xmm2, d9
  13060. movss xmm3, d4
  13061. mulss xmm3, d7
  13062. subss xmm2, xmm3
  13063. movss xmm3, xmm0
  13064. movss xmm4, xmm1
  13065. movss xmm5, xmm2
  13066. mulss xmm3, xmm3
  13067. mulss xmm4, xmm4
  13068. mulss xmm5, xmm5
  13069. addss xmm3, xmm4
  13070. addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask			// make sure the values are positive
			orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtss xmm4, xmm3
			mulss xmm3, xmm4
			mulss xmm3, xmm4
			subss xmm3, xmm6
			mulss xmm4, xmm7
			mulss xmm3, xmm4
#else
			rsqrtss xmm3, xmm3
#endif
			xorps xmm3, signBit
			mulss xmm0, xmm3
			movss t0, xmm0
			mulss xmm1, xmm3
			movss t1, xmm1
			mulss xmm2, xmm3
			movss t2, xmm2
			// second tangent
			movss xmm0, d3
			mulss xmm0, d5
			movss xmm1, d0
			mulss xmm1, d8
			subss xmm0, xmm1
			movss xmm1, d3
			mulss xmm1, d6
			movss xmm2, d1
			mulss xmm2, d8
			subss xmm1, xmm2
			movss xmm2, d3
			mulss xmm2, d7
			movss xmm3, d2
			mulss xmm3, d8
			subss xmm2, xmm3
			movss xmm3, xmm0
			movss xmm4, xmm1
			movss xmm5, xmm2
			mulss xmm3, xmm3
			mulss xmm4, xmm4
			mulss xmm5, xmm5
			addss xmm3, xmm4
			addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
			xorps xmm4, xmm4
			cmpeqps xmm4, xmm3
			andps xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
			andps xmm3, SIMD_SP_absMask			// make sure the values are positive
			orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
			rsqrtss xmm4, xmm3
			mulss xmm3, xmm4
			mulss xmm3, xmm4
			subss xmm3, xmm6
			mulss xmm4, xmm7
			mulss xmm3, xmm4
#else
			rsqrtss xmm3, xmm3
#endif
			xorps xmm3, signBit
			mulss xmm0, xmm3
			movss t3, xmm0
			mulss xmm1, xmm3
			movss t4, xmm1
			mulss xmm2, xmm3
			movss t5, xmm2
		}
#else
		float tmp;
		// normal
		n0 = d6 * d2 - d7 * d1;
		n1 = d7 * d0 - d5 * d2;
		n2 = d5 * d1 - d6 * d0;
		tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
		n0 *= tmp;
		n1 *= tmp;
		n2 *= tmp;
		// area sign bit
		tmp = d3 * d9 - d4 * d8;
		signBit[0] = ( *(unsigned long *)&tmp ) & ( 1 << 31 );
		// first tangent
		t0 = d0 * d9 - d4 * d5;
		t1 = d1 * d9 - d4 * d6;
		t2 = d2 * d9 - d4 * d7;
		tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
		*(unsigned long *)&tmp ^= signBit[0];
		t0 *= tmp;
		t1 *= tmp;
		t2 *= tmp;
		// second tangent
		t3 = d3 * d5 - d0 * d8;
		t4 = d3 * d6 - d1 * d8;
		t5 = d3 * d7 - d2 * d8;
		tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
		*(unsigned long *)&tmp ^= signBit[0];
		t3 *= tmp;
		t4 *= tmp;
		t5 *= tmp;
#endif
		planes->Normal()[0] = n0;
		planes->Normal()[1] = n1;
		planes->Normal()[2] = n2;
		planes->FitThroughPoint( a->xyz );
		planes++;
		if ( used[v0] ) {
			a->normal[0] += n0;
			a->normal[1] += n1;
			a->normal[2] += n2;
			a->tangents[0][0] += t0;
			a->tangents[0][1] += t1;
			a->tangents[0][2] += t2;
			a->tangents[1][0] += t3;
			a->tangents[1][1] += t4;
			a->tangents[1][2] += t5;
		} else {
			a->normal[0] = n0;
			a->normal[1] = n1;
			a->normal[2] = n2;
			a->tangents[0][0] = t0;
			a->tangents[0][1] = t1;
			a->tangents[0][2] = t2;
			a->tangents[1][0] = t3;
			a->tangents[1][1] = t4;
			a->tangents[1][2] = t5;
			used[v0] = true;
		}
		if ( used[v1] ) {
			b->normal[0] += n0;
			b->normal[1] += n1;
			b->normal[2] += n2;
			b->tangents[0][0] += t0;
			b->tangents[0][1] += t1;
			b->tangents[0][2] += t2;
			b->tangents[1][0] += t3;
			b->tangents[1][1] += t4;
			b->tangents[1][2] += t5;
		} else {
			b->normal[0] = n0;
			b->normal[1] = n1;
			b->normal[2] = n2;
			b->tangents[0][0] = t0;
			b->tangents[0][1] = t1;
			b->tangents[0][2] = t2;
			b->tangents[1][0] = t3;
			b->tangents[1][1] = t4;
			b->tangents[1][2] = t5;
			used[v1] = true;
		}
		if ( used[v2] ) {
			c->normal[0] += n0;
			c->normal[1] += n1;
			c->normal[2] += n2;
			c->tangents[0][0] += t0;
			c->tangents[0][1] += t1;
			c->tangents[0][2] += t2;
			c->tangents[1][0] += t3;
			c->tangents[1][1] += t4;
			c->tangents[1][2] += t5;
		} else {
			c->normal[0] = n0;
			c->normal[1] = n1;
			c->normal[2] = n2;
			c->tangents[0][0] = t0;
			c->tangents[0][1] = t1;
			c->tangents[0][2] = t2;
			c->tangents[1][0] = t3;
			c->tangents[1][1] = t4;
			c->tangents[1][2] = t5;
			used[v2] = true;
		}
	}
}
/*
============
idSIMD_SSE::DeriveUnsmoothedTangents
============
*/
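// Derives a normal and two tangents per vertex from the vertex's dominant
// triangle, using the precomputed dt.normalizationScale factors instead of
// normalizing explicitly. With edges e0 = b - a and e1 = c - a and texture
// coordinate deltas ( d3, d4 ) for e0 and ( d8, d9 ) for e1, the scalar
// fallback branches below compute:
//   normal      = s2 * ( e1.Cross( e0 ) )
//   tangents[0] = s0 * ( d9 * e0 - d4 * e1 )
//   tangents[1] = s1 * ( d3 * e1 - d8 * e0 ), or, when
//                 DERIVE_UNSMOOTHED_BITANGENT is defined, the cross product
//                 s1 * ( tangents[0].Cross( normal ) )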
#define DERIVE_UNSMOOTHED_BITANGENT
void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
	int i, j;
	for ( i = 0; i <= numVerts - 4; i += 4 ) {
		ALIGN16( float s0[4] );
		ALIGN16( float s1[4] );
		ALIGN16( float s2[4] );
		ALIGN16( float d0[4] );
		ALIGN16( float d1[4] );
		ALIGN16( float d2[4] );
		ALIGN16( float d3[4] );
		ALIGN16( float d4[4] );
		ALIGN16( float d5[4] );
		ALIGN16( float d6[4] );
		ALIGN16( float d7[4] );
		ALIGN16( float d8[4] );
		ALIGN16( float d9[4] );
		ALIGN16( float n0[4] );
		ALIGN16( float n1[4] );
		ALIGN16( float n2[4] );
		ALIGN16( float t0[4] );
		ALIGN16( float t1[4] );
		ALIGN16( float t2[4] );
		ALIGN16( float t3[4] );
		ALIGN16( float t4[4] );
		ALIGN16( float t5[4] );
		for ( j = 0; j < 4; j++ ) {
			const idDrawVert *a, *b, *c;
			const dominantTri_s &dt = dominantTris[i+j];
			s0[j] = dt.normalizationScale[0];
			s1[j] = dt.normalizationScale[1];
			s2[j] = dt.normalizationScale[2];
			a = verts + i + j;
			b = verts + dt.v2;
			c = verts + dt.v3;
			d0[j] = b->xyz[0] - a->xyz[0];
			d1[j] = b->xyz[1] - a->xyz[1];
			d2[j] = b->xyz[2] - a->xyz[2];
			d3[j] = b->st[0] - a->st[0];
			d4[j] = b->st[1] - a->st[1];
			d5[j] = c->xyz[0] - a->xyz[0];
			d6[j] = c->xyz[1] - a->xyz[1];
			d7[j] = c->xyz[2] - a->xyz[2];
			d8[j] = c->st[0] - a->st[0];
			d9[j] = c->st[1] - a->st[1];
		}
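		// d0-d9 now hold the edge deltas of four dominant triangles in
		// structure-of-arrays form, so the expressions below evaluate all
		// four vertices at once with packed instructions.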
#if 1
		__asm {
			movaps xmm0, d6
			mulps xmm0, d2
			movaps xmm1, d7
			mulps xmm1, d1
			movaps xmm2, d7
			mulps xmm2, d0
			movaps xmm3, d5
			mulps xmm3, d2
			movaps xmm4, d5
			mulps xmm4, d1
			movaps xmm5, d6
			mulps xmm5, d0
			subps xmm0, xmm1
			subps xmm2, xmm3
			movaps xmm7, s2
			subps xmm4, xmm5
			mulps xmm0, xmm7
			movaps n0, xmm0
			mulps xmm2, xmm7
			movaps n1, xmm2
			mulps xmm4, xmm7
			movaps n2, xmm4
			movaps xmm0, d0
			mulps xmm0, d9
			movaps xmm1, d4
			mulps xmm1, d5
			movaps xmm2, d1
			mulps xmm2, d9
			movaps xmm3, d4
			mulps xmm3, d6
			movaps xmm4, d2
			mulps xmm4, d9
			movaps xmm5, d4
			mulps xmm5, d7
			subps xmm0, xmm1
			subps xmm2, xmm3
			movaps xmm7, s0
			subps xmm4, xmm5
			mulps xmm0, xmm7
			movaps t0, xmm0
			mulps xmm2, xmm7
			movaps t1, xmm2
			mulps xmm4, xmm7
			movaps t2, xmm4
#ifndef DERIVE_UNSMOOTHED_BITANGENT
			movaps xmm0, d3
			mulps xmm0, d5
			movaps xmm1, d0
			mulps xmm1, d8
			movaps xmm2, d3
			mulps xmm2, d6
			movaps xmm3, d1
			mulps xmm3, d8
			movaps xmm4, d3
			mulps xmm4, d7
			movaps xmm5, d2
			mulps xmm5, d8
#else
			movaps xmm0, n2
			mulps xmm0, t1
			movaps xmm1, n1
			mulps xmm1, t2
			movaps xmm2, n0
			mulps xmm2, t2
			movaps xmm3, n2
			mulps xmm3, t0
			movaps xmm4, n1
			mulps xmm4, t0
			movaps xmm5, n0
			mulps xmm5, t1
#endif
			subps xmm0, xmm1
			subps xmm2, xmm3
			movaps xmm7, s1
			subps xmm4, xmm5
			mulps xmm0, xmm7
			movaps t3, xmm0
			mulps xmm2, xmm7
			movaps t4, xmm2
			mulps xmm4, xmm7
			movaps t5, xmm4
		}
#else
		n0[0] = d6[0] * d2[0];
		n0[1] = d6[1] * d2[1];
		n0[2] = d6[2] * d2[2];
		n0[3] = d6[3] * d2[3];
		n1[0] = d7[0] * d0[0];
		n1[1] = d7[1] * d0[1];
		n1[2] = d7[2] * d0[2];
		n1[3] = d7[3] * d0[3];
		n2[0] = d5[0] * d1[0];
		n2[1] = d5[1] * d1[1];
		n2[2] = d5[2] * d1[2];
		n2[3] = d5[3] * d1[3];
		n0[0] -= d7[0] * d1[0];
		n0[1] -= d7[1] * d1[1];
		n0[2] -= d7[2] * d1[2];
		n0[3] -= d7[3] * d1[3];
		n1[0] -= d5[0] * d2[0];
		n1[1] -= d5[1] * d2[1];
		n1[2] -= d5[2] * d2[2];
		n1[3] -= d5[3] * d2[3];
		n2[0] -= d6[0] * d0[0];
		n2[1] -= d6[1] * d0[1];
		n2[2] -= d6[2] * d0[2];
		n2[3] -= d6[3] * d0[3];
		n0[0] *= s2[0];
		n0[1] *= s2[1];
		n0[2] *= s2[2];
		n0[3] *= s2[3];
		n1[0] *= s2[0];
		n1[1] *= s2[1];
		n1[2] *= s2[2];
		n1[3] *= s2[3];
		n2[0] *= s2[0];
		n2[1] *= s2[1];
		n2[2] *= s2[2];
		n2[3] *= s2[3];
		t0[0] = d0[0] * d9[0];
		t0[1] = d0[1] * d9[1];
		t0[2] = d0[2] * d9[2];
		t0[3] = d0[3] * d9[3];
		t1[0] = d1[0] * d9[0];
		t1[1] = d1[1] * d9[1];
		t1[2] = d1[2] * d9[2];
		t1[3] = d1[3] * d9[3];
		t2[0] = d2[0] * d9[0];
		t2[1] = d2[1] * d9[1];
		t2[2] = d2[2] * d9[2];
		t2[3] = d2[3] * d9[3];
		t0[0] -= d4[0] * d5[0];
		t0[1] -= d4[1] * d5[1];
		t0[2] -= d4[2] * d5[2];
		t0[3] -= d4[3] * d5[3];
		t1[0] -= d4[0] * d6[0];
		t1[1] -= d4[1] * d6[1];
		t1[2] -= d4[2] * d6[2];
		t1[3] -= d4[3] * d6[3];
		t2[0] -= d4[0] * d7[0];
		t2[1] -= d4[1] * d7[1];
		t2[2] -= d4[2] * d7[2];
		t2[3] -= d4[3] * d7[3];
		t0[0] *= s0[0];
		t0[1] *= s0[1];
		t0[2] *= s0[2];
		t0[3] *= s0[3];
		t1[0] *= s0[0];
		t1[1] *= s0[1];
		t1[2] *= s0[2];
		t1[3] *= s0[3];
		t2[0] *= s0[0];
		t2[1] *= s0[1];
		t2[2] *= s0[2];
		t2[3] *= s0[3];
#ifndef DERIVE_UNSMOOTHED_BITANGENT
		t3[0] = d3[0] * d5[0];
		t3[1] = d3[1] * d5[1];
		t3[2] = d3[2] * d5[2];
		t3[3] = d3[3] * d5[3];
		t4[0] = d3[0] * d6[0];
		t4[1] = d3[1] * d6[1];
		t4[2] = d3[2] * d6[2];
		t4[3] = d3[3] * d6[3];
		t5[0] = d3[0] * d7[0];
		t5[1] = d3[1] * d7[1];
		t5[2] = d3[2] * d7[2];
		t5[3] = d3[3] * d7[3];
		t3[0] -= d0[0] * d8[0];
		t3[1] -= d0[1] * d8[1];
		t3[2] -= d0[2] * d8[2];
		t3[3] -= d0[3] * d8[3];
		t4[0] -= d1[0] * d8[0];
		t4[1] -= d1[1] * d8[1];
		t4[2] -= d1[2] * d8[2];
		t4[3] -= d1[3] * d8[3];
		t5[0] -= d2[0] * d8[0];
		t5[1] -= d2[1] * d8[1];
		t5[2] -= d2[2] * d8[2];
		t5[3] -= d2[3] * d8[3];
#else
		t3[0] = n2[0] * t1[0];
		t3[1] = n2[1] * t1[1];
		t3[2] = n2[2] * t1[2];
		t3[3] = n2[3] * t1[3];
		t4[0] = n0[0] * t2[0];
		t4[1] = n0[1] * t2[1];
		t4[2] = n0[2] * t2[2];
		t4[3] = n0[3] * t2[3];
		t5[0] = n1[0] * t0[0];
		t5[1] = n1[1] * t0[1];
		t5[2] = n1[2] * t0[2];
		t5[3] = n1[3] * t0[3];
		t3[0] -= n1[0] * t2[0];
		t3[1] -= n1[1] * t2[1];
		t3[2] -= n1[2] * t2[2];
		t3[3] -= n1[3] * t2[3];
		t4[0] -= n2[0] * t0[0];
		t4[1] -= n2[1] * t0[1];
		t4[2] -= n2[2] * t0[2];
		t4[3] -= n2[3] * t0[3];
		t5[0] -= n0[0] * t1[0];
		t5[1] -= n0[1] * t1[1];
		t5[2] -= n0[2] * t1[2];
		t5[3] -= n0[3] * t1[3];
#endif
		t3[0] *= s1[0];
		t3[1] *= s1[1];
		t3[2] *= s1[2];
		t3[3] *= s1[3];
		t4[0] *= s1[0];
		t4[1] *= s1[1];
		t4[2] *= s1[2];
		t4[3] *= s1[3];
		t5[0] *= s1[0];
		t5[1] *= s1[1];
		t5[2] *= s1[2];
		t5[3] *= s1[3];
#endif
		for ( j = 0; j < 4; j++ ) {
			idDrawVert *a;
			a = verts + i + j;
			a->normal[0] = n0[j];
			a->normal[1] = n1[j];
			a->normal[2] = n2[j];
			a->tangents[0][0] = t0[j];
			a->tangents[0][1] = t1[j];
			a->tangents[0][2] = t2[j];
			a->tangents[1][0] = t3[j];
			a->tangents[1][1] = t4[j];
			a->tangents[1][2] = t5[j];
		}
	}
	for ( ; i < numVerts; i++ ) {
		idDrawVert *a, *b, *c;
		float d0, d1, d2, d3, d4;
		float d5, d6, d7, d8, d9;
		float s0, s1, s2;
		float n0, n1, n2;
		float t0, t1, t2;
		float t3, t4, t5;
		const dominantTri_s &dt = dominantTris[i];
		s0 = dt.normalizationScale[0];
		s1 = dt.normalizationScale[1];
		s2 = dt.normalizationScale[2];
		a = verts + i;
		b = verts + dt.v2;
		c = verts + dt.v3;
		d0 = b->xyz[0] - a->xyz[0];
		d1 = b->xyz[1] - a->xyz[1];
		d2 = b->xyz[2] - a->xyz[2];
		d3 = b->st[0] - a->st[0];
		d4 = b->st[1] - a->st[1];
		d5 = c->xyz[0] - a->xyz[0];
		d6 = c->xyz[1] - a->xyz[1];
		d7 = c->xyz[2] - a->xyz[2];
		d8 = c->st[0] - a->st[0];
		d9 = c->st[1] - a->st[1];
#if 1
		__asm {
			movss xmm0, d6
			mulss xmm0, d2
			movss xmm1, d7
			mulss xmm1, d1
			movss xmm2, d7
			mulss xmm2, d0
			movss xmm3, d5
			mulss xmm3, d2
			movss xmm4, d5
			mulss xmm4, d1
			movss xmm5, d6
			mulss xmm5, d0
			subss xmm0, xmm1
			subss xmm2, xmm3
			movss xmm7, s2
			subss xmm4, xmm5
			mulss xmm0, xmm7
			movss n0, xmm0
			mulss xmm2, xmm7
			movss n1, xmm2
			mulss xmm4, xmm7
			movss n2, xmm4
			movss xmm0, d0
			mulss xmm0, d9
			movss xmm1, d4
			mulss xmm1, d5
			movss xmm2, d1
			mulss xmm2, d9
			movss xmm3, d4
			mulss xmm3, d6
			movss xmm4, d2
			mulss xmm4, d9
			movss xmm5, d4
			mulss xmm5, d7
			subss xmm0, xmm1
			subss xmm2, xmm3
			movss xmm7, s0
			subss xmm4, xmm5
			mulss xmm0, xmm7
			movss t0, xmm0
			mulss xmm2, xmm7
			movss t1, xmm2
			mulss xmm4, xmm7
			movss t2, xmm4
#ifndef DERIVE_UNSMOOTHED_BITANGENT
			movss xmm0, d3
			mulss xmm0, d5
			movss xmm1, d0
			mulss xmm1, d8
			movss xmm2, d3
			mulss xmm2, d6
			movss xmm3, d1
			mulss xmm3, d8
			movss xmm4, d3
			mulss xmm4, d7
			movss xmm5, d2
			mulss xmm5, d8
#else
			movss xmm0, n2
			mulss xmm0, t1
			movss xmm1, n1
			mulss xmm1, t2
			movss xmm2, n0
			mulss xmm2, t2
			movss xmm3, n2
			mulss xmm3, t0
			movss xmm4, n1
			mulss xmm4, t0
			movss xmm5, n0
			mulss xmm5, t1
#endif
			subss xmm0, xmm1
			subss xmm2, xmm3
			movss xmm7, s1
			subss xmm4, xmm5
			mulss xmm0, xmm7
			movss t3, xmm0
			mulss xmm2, xmm7
			movss t4, xmm2
			mulss xmm4, xmm7
			movss t5, xmm4
		}
#else
		n0 = s2 * ( d6 * d2 - d7 * d1 );
		n1 = s2 * ( d7 * d0 - d5 * d2 );
		n2 = s2 * ( d5 * d1 - d6 * d0 );
		t0 = s0 * ( d0 * d9 - d4 * d5 );
		t1 = s0 * ( d1 * d9 - d4 * d6 );
		t2 = s0 * ( d2 * d9 - d4 * d7 );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
		t3 = s1 * ( d3 * d5 - d0 * d8 );
		t4 = s1 * ( d3 * d6 - d1 * d8 );
		t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
		t3 = s1 * ( n2 * t1 - n1 * t2 );
		t4 = s1 * ( n0 * t2 - n2 * t0 );
		t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
#endif
		a->normal[0] = n0;
		a->normal[1] = n1;
		a->normal[2] = n2;
		a->tangents[0][0] = t0;
		a->tangents[0][1] = t1;
		a->tangents[0][2] = t2;
		a->tangents[1][0] = t3;
		a->tangents[1][1] = t4;
		a->tangents[1][2] = t5;
	}
}
/*
============
idSIMD_SSE::NormalizeTangents
============
*/
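// Renormalizes each vertex normal and makes both tangents orthonormal to it
// with one Gram-Schmidt step:
//   n = n / |n|
//   t = t - ( t . n ) * n, then t = t / |t|
// The main loop handles four vertices at a time by transposing their vectors
// into SoA registers with movss/movhps/shufps so a single packed rsqrtps
// serves all four lanes; a scalar loop finishes the remainder.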
void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
	ALIGN16( float normal[12] );
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
	assert( verts != NULL );
	assert( numVerts >= 0 );
	__asm {
		mov eax, numVerts
		test eax, eax
		jz done
#ifdef REFINE_TANGENT_SQUAREROOT
		movaps xmm6, SIMD_SP_rsqrt_c0
		movaps xmm7, SIMD_SP_rsqrt_c1
#endif
		mov esi, verts
		imul eax, DRAWVERT_SIZE
		add esi, eax
		neg eax
		add eax, DRAWVERT_SIZE*4
		jle loopVert4
		sub eax, DRAWVERT_SIZE*4
		jl loopVert1
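		// eax is a negative byte offset that counts up toward zero while esi
		// points at the end of the vertex array, so [esi+eax] addresses the
		// current vertex and the sign of eax doubles as the loop condition.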
	loopVert4:
		sub eax, DRAWVERT_SIZE*4
		// normalize 4 idDrawVert::normal
		movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0]	// 0, X, X, X
		movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0]	// 0, X, 3, 4
		movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8]	// 5, X, X, X
		movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4]	// 5, X, 1, 2
		movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0]	// 6, X, X, X
		movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0]	// 6, X, 9, 10
		movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8]	// 11, X, X, X
		movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4]	// 11, X, 7, 8
		movaps xmm1, xmm0
		movaps xmm5, xmm2
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// 0, 3, 6, 9
		shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )	// 2, 5, 8, 11
		shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )	// 4, 4, 1, 1
		shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )	// 10, 10, 7, 7
		shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )	// 1, 4, 7, 10
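		// xmm0, xmm1 and xmm2 now hold the x, y and z components of four
		// normals in SoA form, ready for one packed length computation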
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2
		mulps xmm3, xmm3
		mulps xmm4, xmm4
		mulps xmm5, xmm5
		addps xmm3, xmm4
		addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtps xmm4, xmm3
		mulps xmm3, xmm4
		mulps xmm3, xmm4
		subps xmm3, xmm6
		mulps xmm4, xmm7
		mulps xmm3, xmm4
#else
		rsqrtps xmm3, xmm3
#endif
		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3
		// save the 4 idDrawVert::normal to project the tangents
		movaps [normal+ 0], xmm0
		movaps [normal+16], xmm1
		movaps [normal+32], xmm2
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
		// project and normalize 4 idDrawVert::tangent[0]
		movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0]	// 0, X, X, X
		movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0]	// 0, X, 3, 4
		movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8]	// 5, X, X, X
		movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4]	// 5, X, 1, 2
		movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0]	// 6, X, X, X
		movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0]	// 6, X, 9, 10
		movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8]	// 11, X, X, X
		movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4]	// 11, X, 7, 8
		movaps xmm1, xmm0
		movaps xmm5, xmm2
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// 0, 3, 6, 9
		shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )	// 2, 5, 8, 11
		shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )	// 4, 4, 1, 1
		shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )	// 10, 10, 7, 7
		shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )	// 1, 4, 7, 10
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2
		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		addps xmm3, xmm4
		addps xmm3, xmm5
		movaps xmm4, xmm3
		movaps xmm5, xmm3
		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		subps xmm0, xmm3
		subps xmm1, xmm4
		subps xmm2, xmm5
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2
		mulps xmm3, xmm3
		mulps xmm4, xmm4
		mulps xmm5, xmm5
		addps xmm3, xmm4
		addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtps xmm4, xmm3
		mulps xmm3, xmm4
		mulps xmm3, xmm4
		subps xmm3, xmm6
		mulps xmm4, xmm7
		mulps xmm3, xmm4
#else
		rsqrtps xmm3, xmm3
#endif
		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
		// project and normalize 4 idDrawVert::tangent[1]
		movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0]	// 0, X, X, X
		movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0]	// 0, X, 3, 4
		movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8]	// 5, X, X, X
		movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4]	// 5, X, 1, 2
		movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0]	// 6, X, X, X
		movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0]	// 6, X, 9, 10
		movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8]	// 11, X, X, X
		movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4]	// 11, X, 7, 8
		movaps xmm1, xmm0
		movaps xmm5, xmm2
		shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// 0, 3, 6, 9
		shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )	// 2, 5, 8, 11
		shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )	// 4, 4, 1, 1
		shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )	// 10, 10, 7, 7
		shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )	// 1, 4, 7, 10
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2
		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		addps xmm3, xmm4
		addps xmm3, xmm5
		movaps xmm4, xmm3
		movaps xmm5, xmm3
		mulps xmm3, [normal+ 0]
		mulps xmm4, [normal+16]
		mulps xmm5, [normal+32]
		subps xmm0, xmm3
		subps xmm1, xmm4
		subps xmm2, xmm5
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		movaps xmm5, xmm2
		mulps xmm3, xmm3
		mulps xmm4, xmm4
		mulps xmm5, xmm5
		addps xmm3, xmm4
		addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtps xmm4, xmm3
		mulps xmm3, xmm4
		mulps xmm3, xmm4
		subps xmm3, xmm6
		mulps xmm4, xmm7
		mulps xmm3, xmm4
#else
		rsqrtps xmm3, xmm3
#endif
		mulps xmm0, xmm3
		mulps xmm1, xmm3
		mulps xmm2, xmm3
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
		add eax, DRAWVERT_SIZE*8
		jle loopVert4
		sub eax, DRAWVERT_SIZE*4
		jge done
	loopVert1:
		// normalize one idDrawVert::normal
		movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2
		mulss xmm3, xmm3
		mulss xmm4, xmm4
		mulss xmm5, xmm5
		addss xmm3, xmm4
		addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss xmm4, xmm3
		mulss xmm3, xmm4
		mulss xmm3, xmm4
		subss xmm3, xmm6
		mulss xmm4, xmm7
		mulss xmm3, xmm4
#else
		rsqrtss xmm3, xmm3
#endif
		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3
		movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
		// project and normalize one idDrawVert::tangent[0]
		movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2
		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		addss xmm3, xmm4
		addss xmm3, xmm5
		movss xmm4, xmm3
		movss xmm5, xmm3
		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		subss xmm0, xmm3
		subss xmm1, xmm4
		subss xmm2, xmm5
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2
		mulss xmm3, xmm3
		mulss xmm4, xmm4
		mulss xmm5, xmm5
		addss xmm3, xmm4
		addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss xmm4, xmm3
		mulss xmm3, xmm4
		mulss xmm3, xmm4
		subss xmm3, xmm6
		mulss xmm4, xmm7
		mulss xmm3, xmm4
#else
		rsqrtss xmm3, xmm3
#endif
		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3
		movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
		// project and normalize one idDrawVert::tangent[1]
		movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
		movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
		movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2
		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		addss xmm3, xmm4
		addss xmm3, xmm5
		movss xmm4, xmm3
		movss xmm5, xmm3
		mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		subss xmm0, xmm3
		subss xmm1, xmm4
		subss xmm2, xmm5
		movss xmm3, xmm0
		movss xmm4, xmm1
		movss xmm5, xmm2
		mulss xmm3, xmm3
		mulss xmm4, xmm4
		mulss xmm5, xmm5
		addss xmm3, xmm4
		addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss xmm4, xmm3
		mulss xmm3, xmm4
		mulss xmm3, xmm4
		subss xmm3, xmm6
		mulss xmm4, xmm7
		mulss xmm3, xmm4
#else
		rsqrtss xmm3, xmm3
#endif
		mulss xmm0, xmm3
		mulss xmm1, xmm3
		mulss xmm2, xmm3
		movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
		add eax, DRAWVERT_SIZE
		jl loopVert1
	done:
	}
}
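// For reference, the scalar equivalent of NormalizeTangents above, written
// against the idVec3/idMath interfaces (a sketch, assuming idVec3::operator*
// between vectors is the dot product as elsewhere in idLib; the SSE path is
// what actually runs):
//
//	for ( int i = 0; i < numVerts; i++ ) {
//		idVec3 &n = verts[i].normal;
//		n *= idMath::RSqrt( n * n );			// n = n / |n|
//		for ( int k = 0; k < 2; k++ ) {
//			idVec3 &t = verts[i].tangents[k];
//			t -= ( t * n ) * n;					// project out the normal component
//			t *= idMath::RSqrt( t * t );		// renormalize
//		}
//	}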
/*
============
idSIMD_SSE::CreateTextureSpaceLightVectors
============
*/
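// For every vertex referenced by the index list, computes the vector from the
// vertex to the light and expresses it in the vertex's tangent space:
//   L = lightOrigin - xyz
//   lightVectors[i] = ( L . tangents[0], L . tangents[1], L . normal )
// Only the plain scalar branch below is currently compiled in (#elif 1); the
// two assembler variants are kept for reference.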
void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
	memset( used, 0, numVerts * sizeof( used[0] ) );
	for ( int i = numIndexes - 1; i >= 0; i-- ) {
		used[indexes[i]] = true;
	}
#if 0
	__asm {
		mov eax, numVerts
		mov esi, used
		add esi, eax
		mov edi, verts
		sub edi, DRAWVERT_SIZE
		neg eax
		dec eax
		mov ecx, lightOrigin
		movss xmm7, [ecx+0]
		movhps xmm7, [ecx+4]
		mov ecx, lightVectors
		sub ecx, 3*4
	loopVert:
		inc eax
		jge done
		add edi, DRAWVERT_SIZE
		add ecx, 3*4
		cmp byte ptr [esi+eax], 0
		je loopVert
		movaps xmm0, xmm7
		movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
		subps xmm0, xmm1
		// 0, X, 1, 2
		// 3, X, 4, 5
		// 6, X, 7, 8
		movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
		movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
		mulps xmm2, xmm0
		movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
		movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
		mulps xmm3, xmm0
		movaps xmm5, xmm2	// xmm5 = 0, X, 1, 2
		unpcklps xmm5, xmm3	// xmm5 = 0, 3, X, X
		unpckhps xmm2, xmm3	// xmm2 = 1, 4, 2, 5
		movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
		movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
		mulps xmm4, xmm0
		movlhps xmm5, xmm4	// xmm5 = 0, 3, 6, X
		movhlps xmm4, xmm2	// xmm4 = 2, 5, 7, 8
		shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )	// xmm2 = 2, 5, 8, 7
		addps xmm5, xmm4
		addps xmm5, xmm2
		movlps [ecx+0], xmm5
		shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		movss [ecx+8], xmm5
		jmp loopVert
	done:
	}
#elif 1
	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}
		const idDrawVert *v = &verts[i];
		idVec3 lightDir;
		lightDir[0] = lightOrigin[0] - v->xyz[0];
		lightDir[1] = lightOrigin[1] - v->xyz[1];
		lightDir[2] = lightOrigin[2] - v->xyz[2];
		lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
		lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
		lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
	}
#elif 1
	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	idVec3 localLightOrigin = lightOrigin;
	__asm {
		xor ecx, ecx
		mov eax, numVerts
		mov esi, used
		add esi, eax
		mov edi, verts
		sub edi, DRAWVERT_SIZE
		neg eax
		dec eax
	loopVert4:
		inc eax
		jge done4
		add edi, DRAWVERT_SIZE
		cmp byte ptr [esi+eax], 0
		je loopVert4
		mov usedVertNums[ecx*4], eax
		inc ecx
		cmp ecx, 4
		movss xmm0, localLightOrigin[0]
		movss xmm1, localLightOrigin[4]
		movss xmm2, localLightOrigin[8]
		subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
		subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
		subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
		movss lightDir0[ecx*4-4], xmm0
		movss lightDir1[ecx*4-4], xmm1
		movss lightDir2[ecx*4-4], xmm2
		movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
		movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
		movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
		movss normal0[ecx*4-4], xmm3
		movss normal1[ecx*4-4], xmm4
		movss normal2[ecx*4-4], xmm5
		movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
		movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
		movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
		movss tangent0[ecx*4-4], xmm0
		movss tangent1[ecx*4-4], xmm1
		movss tangent2[ecx*4-4], xmm2
		movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
		movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
		movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
		movss tangent3[ecx*4-4], xmm3
		movss tangent4[ecx*4-4], xmm4
		movss tangent5[ecx*4-4], xmm5
		jl loopVert4
		movaps xmm0, lightDir0
		movaps xmm1, lightDir1
		movaps xmm2, lightDir2
		movaps xmm3, tangent0
		mulps xmm3, xmm0
		movaps xmm4, tangent1
		mulps xmm4, xmm1
		movaps xmm5, tangent2
		mulps xmm5, xmm2
		addps xmm3, xmm4
		addps xmm5, xmm3
		movaps xmm3, tangent3
		mulps xmm3, xmm0
		movaps xmm4, tangent4
		mulps xmm4, xmm1
		movaps xmm6, tangent5
		mulps xmm6, xmm2
		addps xmm3, xmm4
		addps xmm6, xmm3
		mulps xmm0, normal0
		mulps xmm1, normal1
		mulps xmm2, normal2
		addps xmm0, xmm1
		addps xmm0, xmm2
		mov ecx, numVerts
		imul ecx, 12
		mov edx, usedVertNums[0]
		add ecx, lightVectors
		imul edx, 12
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[4]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		imul edx, 12
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[8]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		imul edx, 12
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[12]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		imul edx, 12
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		xor ecx, ecx
		jmp loopVert4
	done4:
		test ecx, ecx
		jz done
		xor eax, eax
		mov edi, numVerts
		imul edi, 12
		add edi, lightVectors
	loopVert1:
		movss xmm0, lightDir0[eax*4]
		movss xmm1, lightDir1[eax*4]
		movss xmm2, lightDir2[eax*4]
		mov edx, usedVertNums[eax*4]
		imul edx, 12
		movss xmm3, tangent0[eax*4]
		mulss xmm3, xmm0
		movss xmm4, tangent1[eax*4]
		mulss xmm4, xmm1
		movss xmm5, tangent2[eax*4]
		mulss xmm5, xmm2
		addss xmm3, xmm4
		addss xmm5, xmm3
		movss [edi+edx+0], xmm5
		movss xmm3, tangent3[eax*4]
		mulss xmm3, xmm0
		movss xmm4, tangent4[eax*4]
		mulss xmm4, xmm1
		movss xmm6, tangent5[eax*4]
		mulss xmm6, xmm2
		addss xmm3, xmm4
		addss xmm6, xmm3
		movss [edi+edx+4], xmm6
		mulss xmm0, normal0[eax*4]
		mulss xmm1, normal1[eax*4]
		mulss xmm2, normal2[eax*4]
		addss xmm0, xmm1
		addss xmm0, xmm2
		movss [edi+edx+8], xmm0
		inc eax
		dec ecx
		jg loopVert1
	done:
	}
#else
	// NOTE: this fallback branch also needs the gather arrays that are only
	// declared in the branch above; they are repeated here so this branch
	// would compile if it were ever enabled.
	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	ALIGN16( float lightVectors0[4] );
	ALIGN16( float lightVectors1[4] );
	ALIGN16( float lightVectors2[4] );
	int numUsedVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}
		const idDrawVert *v = &verts[i];
		lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
		lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
		lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
		normal0[numUsedVerts] = v->normal[0];
		normal1[numUsedVerts] = v->normal[1];
		normal2[numUsedVerts] = v->normal[2];
		tangent0[numUsedVerts] = v->tangents[0][0];
		tangent1[numUsedVerts] = v->tangents[0][1];
		tangent2[numUsedVerts] = v->tangents[0][2];
		tangent3[numUsedVerts] = v->tangents[1][0];
		tangent4[numUsedVerts] = v->tangents[1][1];
		tangent5[numUsedVerts] = v->tangents[1][2];
		usedVertNums[numUsedVerts++] = i;
		if ( numUsedVerts < 4 ) {
			continue;
		}
		lightVectors0[0] = lightDir0[0] * tangent0[0];
		lightVectors0[1] = lightDir0[1] * tangent0[1];
		lightVectors0[2] = lightDir0[2] * tangent0[2];
		lightVectors0[3] = lightDir0[3] * tangent0[3];
		lightVectors0[0] += lightDir1[0] * tangent1[0];
		lightVectors0[1] += lightDir1[1] * tangent1[1];
		lightVectors0[2] += lightDir1[2] * tangent1[2];
		lightVectors0[3] += lightDir1[3] * tangent1[3];
		lightVectors0[0] += lightDir2[0] * tangent2[0];
		lightVectors0[1] += lightDir2[1] * tangent2[1];
		lightVectors0[2] += lightDir2[2] * tangent2[2];
		lightVectors0[3] += lightDir2[3] * tangent2[3];
		lightVectors1[0] = lightDir0[0] * tangent3[0];
		lightVectors1[1] = lightDir0[1] * tangent3[1];
		lightVectors1[2] = lightDir0[2] * tangent3[2];
		lightVectors1[3] = lightDir0[3] * tangent3[3];
		lightVectors1[0] += lightDir1[0] * tangent4[0];
		lightVectors1[1] += lightDir1[1] * tangent4[1];
		lightVectors1[2] += lightDir1[2] * tangent4[2];
		lightVectors1[3] += lightDir1[3] * tangent4[3];
		lightVectors1[0] += lightDir2[0] * tangent5[0];
		lightVectors1[1] += lightDir2[1] * tangent5[1];
		lightVectors1[2] += lightDir2[2] * tangent5[2];
		lightVectors1[3] += lightDir2[3] * tangent5[3];
		lightVectors2[0] = lightDir0[0] * normal0[0];
		lightVectors2[1] = lightDir0[1] * normal0[1];
		lightVectors2[2] = lightDir0[2] * normal0[2];
		lightVectors2[3] = lightDir0[3] * normal0[3];
		lightVectors2[0] += lightDir1[0] * normal1[0];
		lightVectors2[1] += lightDir1[1] * normal1[1];
		lightVectors2[2] += lightDir1[2] * normal1[2];
		lightVectors2[3] += lightDir1[3] * normal1[3];
		lightVectors2[0] += lightDir2[0] * normal2[0];
		lightVectors2[1] += lightDir2[1] * normal2[1];
		lightVectors2[2] += lightDir2[2] * normal2[2];
		lightVectors2[3] += lightDir2[3] * normal2[3];
		for ( int j = 0; j < 4; j++ ) {
			int n = usedVertNums[j];
			lightVectors[n][0] = lightVectors0[j];
			lightVectors[n][1] = lightVectors1[j];
			lightVectors[n][2] = lightVectors2[j];
		}
		numUsedVerts = 0;
	}
	for ( int i = 0; i < numUsedVerts; i++ ) {
		lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
		lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
		lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
		int n = usedVertNums[i];
		lightVectors[n][0] = lightVectors0[i];
		lightVectors[n][1] = lightVectors1[i];
		lightVectors[n][2] = lightVectors2[i];
	}
#endif
}
/*
============
idSIMD_SSE::CreateSpecularTextureCoords
============
*/
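// Builds a per-vertex specular texture coordinate from the normalized light
// and view directions: the half-angle style vector H = L/|L| + V/|V| is
// transformed into tangent space and stored with w set to 1.0f:
//   texCoords[i] = ( H . tangents[0], H . tangents[1], H . normal, 1.0f )
// The enabled branch (#elif 1) gathers four used vertices at a time into SoA
// arrays, processes them with packed instructions, and finishes any leftover
// vertices with a scalar loop.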
  14367. void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  14368. assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
  14369. assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
  14370. assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
  14371. assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
  14372. assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
  14373. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  14374. memset( used, 0, numVerts * sizeof( used[0] ) );
  14375. for ( int i = numIndexes - 1; i >= 0; i-- ) {
  14376. used[indexes[i]] = true;
  14377. }
  14378. #if 0
  14379. __asm {
  14380. mov eax, numVerts
  14381. mov esi, used
  14382. add esi, eax
  14383. mov edi, verts
  14384. sub edi, DRAWVERT_SIZE
  14385. neg eax
  14386. dec eax
  14387. mov ecx, viewOrigin
  14388. movss xmm6, [ecx+0]
  14389. movhps xmm6, [ecx+4]
  14390. mov ecx, lightOrigin
  14391. movss xmm7, [ecx+0]
  14392. movhps xmm7, [ecx+4]
  14393. mov ecx, texCoords
  14394. sub ecx, 4*4
  14395. loopVert:
  14396. inc eax
  14397. jge done
  14398. add edi, DRAWVERT_SIZE
  14399. add ecx, 4*4
  14400. cmp byte ptr [esi+eax], 0
  14401. je loopVert
  14402. movaps xmm0, xmm7
  14403. movaps xmm1, xmm6
  14404. movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
  14405. movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
  14406. subps xmm0, xmm2
  14407. subps xmm1, xmm2
  14408. movaps xmm3, xmm0
  14409. movaps xmm4, xmm1
  14410. mulps xmm3, xmm3
  14411. mulps xmm4, xmm4
  14412. // 0, X, 1, 2
  14413. // 3, X, 4, 5
  14414. movaps xmm5, xmm3 // xmm5 = 0, X, 1, 2
  14415. unpcklps xmm5, xmm4 // xmm5 = 0, 3, X, X
  14416. unpckhps xmm3, xmm4 // xmm3 = 1, 4, 2, 5
  14417. movhlps xmm4, xmm3 // xmm4 = 2, 5, 4, 5
  14418. addps xmm5, xmm3
  14419. addps xmm5, xmm4
  14420. shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
  14421. rsqrtps xmm5, xmm5
  14422. movaps xmm4, xmm5
  14423. shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
  14424. shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
  14425. mulps xmm0, xmm4
  14426. mulps xmm1, xmm5
  14427. addps xmm0, xmm1
  14428. movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
  14429. movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
  14430. mulps xmm2, xmm0
  14431. movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
  14432. movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
  14433. mulps xmm3, xmm0
  14434. movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
  14435. movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
  14436. mulps xmm4, xmm0
  14437. movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
  14438. unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
  14439. unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
  14440. movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
  14441. movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
  14442. shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
  14443. movaps xmm3, SIMD_SP_one
  14444. addps xmm5, xmm4
  14445. addps xmm5, xmm2
  14446. movaps [ecx+0], xmm5
  14447. movss [ecx+12], xmm3
  14448. jmp loopVert
  14449. done:
  14450. }
  14451. #elif 0
  14452. for ( int i = 0; i < numVerts; i++ ) {
  14453. if ( !used[i] ) {
  14454. continue;
  14455. }
  14456. const idDrawVert *v = &verts[i];
  14457. idVec3 lightDir = lightOrigin - v->xyz;
  14458. idVec3 viewDir = viewOrigin - v->xyz;
  14459. float ilength;
  14460. ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
  14461. lightDir[0] *= ilength;
  14462. lightDir[1] *= ilength;
  14463. lightDir[2] *= ilength;
  14464. ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
  14465. viewDir[0] *= ilength;
  14466. viewDir[1] *= ilength;
  14467. viewDir[2] *= ilength;
  14468. lightDir += viewDir;
  14469. texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
  14470. texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
  14471. texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
  14472. texCoords[i][3] = 1.0f;
  14473. }
  14474. #elif 1
  14475. ALIGN16( int usedVertNums[4] );
  14476. ALIGN16( float lightDir0[4] );
  14477. ALIGN16( float lightDir1[4] );
  14478. ALIGN16( float lightDir2[4] );
  14479. ALIGN16( float viewDir0[4] );
  14480. ALIGN16( float viewDir1[4] );
  14481. ALIGN16( float viewDir2[4] );
  14482. ALIGN16( float normal0[4] );
  14483. ALIGN16( float normal1[4] );
  14484. ALIGN16( float normal2[4] );
  14485. ALIGN16( float tangent0[4] );
  14486. ALIGN16( float tangent1[4] );
  14487. ALIGN16( float tangent2[4] );
  14488. ALIGN16( float tangent3[4] );
  14489. ALIGN16( float tangent4[4] );
  14490. ALIGN16( float tangent5[4] );
  14491. idVec3 localLightOrigin = lightOrigin;
  14492. idVec3 localViewOrigin = viewOrigin;
  14493. __asm {
  14494. xor ecx, ecx
  14495. mov eax, numVerts
  14496. mov esi, used
  14497. add esi, eax
  14498. mov edi, verts
  14499. sub edi, DRAWVERT_SIZE
  14500. neg eax
  14501. dec eax
  14502. loopVert4:
  14503. inc eax
  14504. jge done4
  14505. add edi, DRAWVERT_SIZE
  14506. cmp byte ptr [esi+eax], 0
  14507. je loopVert4
  14508. mov usedVertNums[ecx*4], eax
  14509. inc ecx
  14510. cmp ecx, 4
  14511. movss xmm3, localLightOrigin[0]
  14512. movss xmm4, localLightOrigin[4]
  14513. movss xmm5, localLightOrigin[8]
  14514. subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
  14515. subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
  14516. subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
  14517. movss lightDir0[ecx*4-4], xmm3
  14518. movss lightDir1[ecx*4-4], xmm4
  14519. movss lightDir2[ecx*4-4], xmm5
  14520. movss xmm0, localViewOrigin[0]
  14521. movss xmm1, localViewOrigin[4]
  14522. movss xmm2, localViewOrigin[8]
  14523. subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
  14524. subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
  14525. subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
  14526. movss viewDir0[ecx*4-4], xmm0
  14527. movss viewDir1[ecx*4-4], xmm1
  14528. movss viewDir2[ecx*4-4], xmm2
  14529. movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
  14530. movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
  14531. movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
  14532. movss normal0[ecx*4-4], xmm3
  14533. movss normal1[ecx*4-4], xmm4
  14534. movss normal2[ecx*4-4], xmm5
  14535. movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
  14536. movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
  14537. movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
  14538. movss tangent0[ecx*4-4], xmm0
  14539. movss tangent1[ecx*4-4], xmm1
  14540. movss tangent2[ecx*4-4], xmm2
  14541. movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
  14542. movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
  14543. movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
  14544. movss tangent3[ecx*4-4], xmm3
  14545. movss tangent4[ecx*4-4], xmm4
  14546. movss tangent5[ecx*4-4], xmm5
  14547. jl loopVert4
  14548. movaps xmm6, lightDir0
  14549. movaps xmm0, xmm6
  14550. mulps xmm6, xmm6
  14551. movaps xmm7, lightDir1
  14552. movaps xmm1, xmm7
  14553. mulps xmm7, xmm7
  14554. addps xmm6, xmm7
  14555. movaps xmm5, lightDir2
  14556. movaps xmm2, xmm5
  14557. mulps xmm5, xmm5
  14558. addps xmm6, xmm5
  14559. rsqrtps xmm6, xmm6
  14560. mulps xmm0, xmm6
  14561. mulps xmm1, xmm6
  14562. mulps xmm2, xmm6
  14563. movaps xmm3, viewDir0
  14564. movaps xmm7, xmm3
  14565. mulps xmm7, xmm7
  14566. movaps xmm4, viewDir1
  14567. movaps xmm6, xmm4
  14568. mulps xmm6, xmm6
  14569. addps xmm7, xmm6
  14570. movaps xmm5, viewDir2
  14571. movaps xmm6, xmm5
  14572. mulps xmm6, xmm6
  14573. addps xmm7, xmm6
  14574. rsqrtps xmm7, xmm7
  14575. mulps xmm3, xmm7
  14576. addps xmm0, xmm3
  14577. mulps xmm4, xmm7
  14578. addps xmm1, xmm4
  14579. mulps xmm5, xmm7
  14580. addps xmm2, xmm5
  14581. movaps xmm3, tangent0
  14582. mulps xmm3, xmm0
  14583. movaps xmm4, tangent1
  14584. mulps xmm4, xmm1
  14585. addps xmm3, xmm4
  14586. movaps xmm5, tangent2
  14587. mulps xmm5, xmm2
  14588. addps xmm5, xmm3
  14589. movaps xmm3, tangent3
  14590. mulps xmm3, xmm0
  14591. movaps xmm4, tangent4
  14592. mulps xmm4, xmm1
  14593. addps xmm3, xmm4
  14594. movaps xmm6, tangent5
  14595. mulps xmm6, xmm2
  14596. addps xmm6, xmm3
  14597. mulps xmm0, normal0
  14598. mulps xmm1, normal1
  14599. addps xmm0, xmm1
  14600. mulps xmm2, normal2
  14601. addps xmm0, xmm2
  14602. mov ecx, numVerts
  14603. shl ecx, 4
  14604. mov edx, usedVertNums[0]
  14605. add ecx, texCoords
  14606. shl edx, 4
  14607. movss xmm3, SIMD_SP_one
  14608. movss [ecx+edx+0], xmm5
  14609. movss [ecx+edx+4], xmm6
  14610. movss [ecx+edx+8], xmm0
  14611. movss [ecx+edx+12], xmm3
  14612. shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
  14613. mov edx, usedVertNums[4]
  14614. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
  14615. shl edx, 4
  14616. shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
  14617. movss [ecx+edx+0], xmm5
  14618. movss [ecx+edx+4], xmm6
  14619. movss [ecx+edx+8], xmm0
  14620. movss [ecx+edx+12], xmm3
  14621. shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
  14622. mov edx, usedVertNums[8]
  14623. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
  14624. shl edx, 4
  14625. shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
  14626. movss [ecx+edx+0], xmm5
  14627. movss [ecx+edx+4], xmm6
  14628. movss [ecx+edx+8], xmm0
  14629. movss [ecx+edx+12], xmm3
  14630. shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
  14631. mov edx, usedVertNums[12]
  14632. shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
  14633. shl edx, 4
  14634. shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
  14635. movss [ecx+edx+0], xmm5
  14636. movss [ecx+edx+4], xmm6
  14637. movss [ecx+edx+8], xmm0
  14638. movss [ecx+edx+12], xmm3
  14639. xor ecx, ecx
  14640. jmp loopVert4
  14641. done4:
  14642. test ecx, ecx
  14643. jz done
  14644. xor eax, eax
  14645. mov edi, numVerts
  14646. shl edi, 4
  14647. add edi, texCoords
  14648. loopVert1:
  14649. movss xmm6, lightDir0[eax*4]
  14650. movss xmm0, xmm6
  14651. mulss xmm6, xmm6
  14652. movss xmm7, lightDir1[eax*4]
  14653. movss xmm1, xmm7
  14654. mulss xmm7, xmm7
  14655. addss xmm6, xmm7
  14656. movss xmm5, lightDir2[eax*4]
  14657. movss xmm2, xmm5
  14658. mulss xmm5, xmm5
  14659. addss xmm6, xmm5
  14660. rsqrtss xmm6, xmm6
  14661. mulss xmm0, xmm6
  14662. mulss xmm1, xmm6
  14663. mulss xmm2, xmm6
  14664. movss xmm3, viewDir0[eax*4]
  14665. movss xmm7, xmm3
  14666. mulss xmm7, xmm7
  14667. movss xmm4, viewDir1[eax*4]
  14668. movss xmm6, xmm4
  14669. mulss xmm6, xmm6
  14670. addss xmm7, xmm6
  14671. movss xmm5, viewDir2[eax*4]
  14672. movss xmm6, xmm5
  14673. mulss xmm6, xmm6
  14674. addss xmm7, xmm6
  14675. rsqrtss xmm7, xmm7
  14676. mulss xmm3, xmm7
  14677. addss xmm0, xmm3
  14678. mulss xmm4, xmm7
  14679. addss xmm1, xmm4
  14680. mulss xmm5, xmm7
  14681. addss xmm2, xmm5
  14682. mov edx, usedVertNums[eax*4]
  14683. shl edx, 4
  14684. movss xmm3, tangent0[eax*4]
  14685. mulss xmm3, xmm0
  14686. movss xmm4, tangent1[eax*4]
  14687. mulss xmm4, xmm1
  14688. addss xmm3, xmm4
  14689. movss xmm5, tangent2[eax*4]
  14690. mulss xmm5, xmm2
  14691. addss xmm5, xmm3
  14692. movss [edi+edx+0], xmm5
  14693. movss xmm3, tangent3[eax*4]
  14694. mulss xmm3, xmm0
  14695. movss xmm4, tangent4[eax*4]
  14696. mulss xmm4, xmm1
  14697. addss xmm3, xmm4
  14698. movss xmm6, tangent5[eax*4]
  14699. mulss xmm6, xmm2
  14700. addss xmm6, xmm3
  14701. movss [edi+edx+4], xmm6
  14702. mulss xmm0, normal0[eax*4]
  14703. mulss xmm1, normal1[eax*4]
  14704. addss xmm0, xmm1
  14705. mulss xmm2, normal2[eax*4]
  14706. addss xmm0, xmm2
  14707. movss [edi+edx+8], xmm0
  14708. movss xmm3, SIMD_SP_one
  14709. movss [edi+edx+12], xmm3
  14710. inc eax
  14711. dec ecx
  14712. jg loopVert1
  14713. done:
  14714. }
#else
	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float viewDir0[4] );
	ALIGN16( float viewDir1[4] );
	ALIGN16( float viewDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	ALIGN16( float texCoords0[4] );
	ALIGN16( float texCoords1[4] );
	ALIGN16( float texCoords2[4] );

	idVec3 localLightOrigin = lightOrigin;
	idVec3 localViewOrigin = viewOrigin;

	int numUsedVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}
		const idDrawVert *v = &verts[i];

		lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
		lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
		lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];

		viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
		viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
		viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];

		normal0[numUsedVerts] = v->normal[0];
		normal1[numUsedVerts] = v->normal[1];
		normal2[numUsedVerts] = v->normal[2];

		tangent0[numUsedVerts] = v->tangents[0][0];
		tangent1[numUsedVerts] = v->tangents[0][1];
		tangent2[numUsedVerts] = v->tangents[0][2];

		tangent3[numUsedVerts] = v->tangents[1][0];
		tangent4[numUsedVerts] = v->tangents[1][1];
		tangent5[numUsedVerts] = v->tangents[1][2];

		usedVertNums[numUsedVerts++] = i;
		if ( numUsedVerts < 4 ) {
			continue;
		}
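		// four used vertices have been gathered in SoA form;
		// process all four lanes at once below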
		ALIGN16( float temp[4] );

		temp[0] = lightDir0[0] * lightDir0[0];
		temp[1] = lightDir0[1] * lightDir0[1];
		temp[2] = lightDir0[2] * lightDir0[2];
		temp[3] = lightDir0[3] * lightDir0[3];

		temp[0] += lightDir1[0] * lightDir1[0];
		temp[1] += lightDir1[1] * lightDir1[1];
		temp[2] += lightDir1[2] * lightDir1[2];
		temp[3] += lightDir1[3] * lightDir1[3];

		temp[0] += lightDir2[0] * lightDir2[0];
		temp[1] += lightDir2[1] * lightDir2[1];
		temp[2] += lightDir2[2] * lightDir2[2];
		temp[3] += lightDir2[3] * lightDir2[3];

		temp[0] = idMath::RSqrt( temp[0] );
		temp[1] = idMath::RSqrt( temp[1] );
		temp[2] = idMath::RSqrt( temp[2] );
		temp[3] = idMath::RSqrt( temp[3] );

		lightDir0[0] *= temp[0];
		lightDir0[1] *= temp[1];
		lightDir0[2] *= temp[2];
		lightDir0[3] *= temp[3];

		lightDir1[0] *= temp[0];
		lightDir1[1] *= temp[1];
		lightDir1[2] *= temp[2];
		lightDir1[3] *= temp[3];

		lightDir2[0] *= temp[0];
		lightDir2[1] *= temp[1];
		lightDir2[2] *= temp[2];
		lightDir2[3] *= temp[3];

		temp[0] = viewDir0[0] * viewDir0[0];
		temp[1] = viewDir0[1] * viewDir0[1];
		temp[2] = viewDir0[2] * viewDir0[2];
		temp[3] = viewDir0[3] * viewDir0[3];

		temp[0] += viewDir1[0] * viewDir1[0];
		temp[1] += viewDir1[1] * viewDir1[1];
		temp[2] += viewDir1[2] * viewDir1[2];
		temp[3] += viewDir1[3] * viewDir1[3];

		temp[0] += viewDir2[0] * viewDir2[0];
		temp[1] += viewDir2[1] * viewDir2[1];
		temp[2] += viewDir2[2] * viewDir2[2];
		temp[3] += viewDir2[3] * viewDir2[3];

		temp[0] = idMath::RSqrt( temp[0] );
		temp[1] = idMath::RSqrt( temp[1] );
		temp[2] = idMath::RSqrt( temp[2] );
		temp[3] = idMath::RSqrt( temp[3] );

		viewDir0[0] *= temp[0];
		viewDir0[1] *= temp[1];
		viewDir0[2] *= temp[2];
		viewDir0[3] *= temp[3];

		viewDir1[0] *= temp[0];
		viewDir1[1] *= temp[1];
		viewDir1[2] *= temp[2];
		viewDir1[3] *= temp[3];

		viewDir2[0] *= temp[0];
		viewDir2[1] *= temp[1];
		viewDir2[2] *= temp[2];
		viewDir2[3] *= temp[3];

		lightDir0[0] += viewDir0[0];
		lightDir0[1] += viewDir0[1];
		lightDir0[2] += viewDir0[2];
		lightDir0[3] += viewDir0[3];

		lightDir1[0] += viewDir1[0];
		lightDir1[1] += viewDir1[1];
		lightDir1[2] += viewDir1[2];
		lightDir1[3] += viewDir1[3];

		lightDir2[0] += viewDir2[0];
		lightDir2[1] += viewDir2[1];
		lightDir2[2] += viewDir2[2];
		lightDir2[3] += viewDir2[3];

		texCoords0[0] = lightDir0[0] * tangent0[0];
		texCoords0[1] = lightDir0[1] * tangent0[1];
		texCoords0[2] = lightDir0[2] * tangent0[2];
		texCoords0[3] = lightDir0[3] * tangent0[3];

		texCoords0[0] += lightDir1[0] * tangent1[0];
		texCoords0[1] += lightDir1[1] * tangent1[1];
		texCoords0[2] += lightDir1[2] * tangent1[2];
		texCoords0[3] += lightDir1[3] * tangent1[3];

		texCoords0[0] += lightDir2[0] * tangent2[0];
		texCoords0[1] += lightDir2[1] * tangent2[1];
		texCoords0[2] += lightDir2[2] * tangent2[2];
		texCoords0[3] += lightDir2[3] * tangent2[3];

		texCoords1[0] = lightDir0[0] * tangent3[0];
		texCoords1[1] = lightDir0[1] * tangent3[1];
		texCoords1[2] = lightDir0[2] * tangent3[2];
		texCoords1[3] = lightDir0[3] * tangent3[3];

		texCoords1[0] += lightDir1[0] * tangent4[0];
		texCoords1[1] += lightDir1[1] * tangent4[1];
		texCoords1[2] += lightDir1[2] * tangent4[2];
		texCoords1[3] += lightDir1[3] * tangent4[3];

		texCoords1[0] += lightDir2[0] * tangent5[0];
		texCoords1[1] += lightDir2[1] * tangent5[1];
		texCoords1[2] += lightDir2[2] * tangent5[2];
		texCoords1[3] += lightDir2[3] * tangent5[3];

		texCoords2[0] = lightDir0[0] * normal0[0];
		texCoords2[1] = lightDir0[1] * normal0[1];
		texCoords2[2] = lightDir0[2] * normal0[2];
		texCoords2[3] = lightDir0[3] * normal0[3];

		texCoords2[0] += lightDir1[0] * normal1[0];
		texCoords2[1] += lightDir1[1] * normal1[1];
		texCoords2[2] += lightDir1[2] * normal1[2];
		texCoords2[3] += lightDir1[3] * normal1[3];

		texCoords2[0] += lightDir2[0] * normal2[0];
		texCoords2[1] += lightDir2[1] * normal2[1];
		texCoords2[2] += lightDir2[2] * normal2[2];
		texCoords2[3] += lightDir2[3] * normal2[3];

		for ( int j = 0; j < 4; j++ ) {
			int n = usedVertNums[j];
			texCoords[n][0] = texCoords0[j];
			texCoords[n][1] = texCoords1[j];
			texCoords[n][2] = texCoords2[j];
			texCoords[n][3] = 1.0f;
		}

		numUsedVerts = 0;
	}
	for ( int i = 0; i < numUsedVerts; i++ ) {
		float temp;

		temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
		temp = idMath::RSqrt( temp );
		lightDir0[i] *= temp;
		lightDir1[i] *= temp;
		lightDir2[i] *= temp;

		temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
		temp = idMath::RSqrt( temp );
		viewDir0[i] *= temp;
		viewDir1[i] *= temp;
		viewDir2[i] *= temp;

		lightDir0[i] += viewDir0[i];
		lightDir1[i] += viewDir1[i];
		lightDir2[i] += viewDir2[i];

		texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
		texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
		texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];

		int n = usedVertNums[i];
		texCoords[n][0] = texCoords0[i];
		texCoords[n][1] = texCoords1[i];
		texCoords[n][2] = texCoords2[i];
		texCoords[n][3] = 1.0f;
	}
#endif
}

/*
============
idSIMD_SSE::CreateShadowCache
============
*/
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
#if 1
	int outVerts;

	__asm {
		push ebx

		mov esi, lightOrigin
		movaps xmm5, SIMD_SP_lastOne
		movss xmm6, [esi+0]
		movhps xmm6, [esi+4]
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps xmm6, SIMD_SP_lastOne
		movaps xmm7, xmm6

		xor ebx, ebx
		xor ecx, ecx

		mov edx, vertRemap
		mov esi, verts
		mov edi, vertexCache
		mov eax, numVerts
		and eax, ~3
		jz done4
		shl eax, 2
		add edx, eax
		neg eax
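		// four vertices per iteration; a vertex whose remap entry is still zero
		// gets two cache entries: its position with w = 1, and the position minus
		// the light origin for the copy projected to infinity. xmm6/xmm7 hold the
		// light origin with w forced to 1, so the subtraction leaves w = 0.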
	loop4:
		prefetchnta [edx+128]
		prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		cmp dword ptr [edx+eax+0], ebx
		jne skip1
		mov dword ptr [edx+eax+0], ecx
		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add ecx, 2
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		orps xmm0, xmm5
		movaps [edi+0*16], xmm0
		subps xmm0, xmm6
		movaps [edi+1*16], xmm0
		add edi, 2*16
	skip1:
		cmp dword ptr [edx+eax+4], ebx
		jne skip2
		mov dword ptr [edx+eax+4], ecx
		movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add ecx, 2
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps xmm1, xmm5
		movaps [edi+0*16], xmm1
		subps xmm1, xmm7
		movaps [edi+1*16], xmm1
		add edi, 2*16
	skip2:
		cmp dword ptr [edx+eax+8], ebx
		jne skip3
		mov dword ptr [edx+eax+8], ecx
		movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add ecx, 2
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
		orps xmm2, xmm5
		movaps [edi+0*16], xmm2
		subps xmm2, xmm6
		movaps [edi+1*16], xmm2
		add edi, 2*16
	skip3:
		cmp dword ptr [edx+eax+12], ebx
		jne skip4
		mov dword ptr [edx+eax+12], ecx
		movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add ecx, 2
		shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps xmm3, xmm5
		movaps [edi+0*16], xmm3
		subps xmm3, xmm7
		movaps [edi+1*16], xmm3
		add edi, 2*16
	skip4:
		add esi, 4*DRAWVERT_SIZE
		add eax, 4*4
		jl loop4

	done4:
		mov eax, numVerts
		and eax, 3
		jz done1
		shl eax, 2
		add edx, eax
		neg eax
	loop1:
		cmp dword ptr [edx+eax+0], ebx
		jne skip0
		mov dword ptr [edx+eax+0], ecx
		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add ecx, 2
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		orps xmm0, xmm5
		movaps [edi+0*16], xmm0
		subps xmm0, xmm6
		movaps [edi+1*16], xmm0
		add edi, 2*16
	skip0:
		add esi, DRAWVERT_SIZE
		add eax, 4
		jl loop1

	done1:
		pop ebx
		mov outVerts, ecx
	}
	return outVerts;
#else
	int outVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( vertRemap[i] ) {
			continue;
		}
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[outVerts+0][0] = v[0];
		vertexCache[outVerts+0][1] = v[1];
		vertexCache[outVerts+0][2] = v[2];
		vertexCache[outVerts+0][3] = 1.0f;

		// R_SetupProjection() builds the projection matrix with a slight crunch
		// for depth, which keeps this w=0 division from rasterizing right at the
		// wrap around point and causing depth fighting with the rear caps
		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
		vertexCache[outVerts+1][3] = 0.0f;
		vertRemap[i] = outVerts;
		outVerts += 2;
	}
	return outVerts;
#endif
}
/*
============
idSIMD_SSE::CreateVertexProgramShadowCache
============
*/
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
#if 1
	__asm {
		movaps xmm4, SIMD_SP_lastOne
		movaps xmm5, xmm4
		movaps xmm6, xmm4
		movaps xmm7, xmm4

		mov esi, verts
		mov edi, vertexCache
		mov eax, numVerts
		and eax, ~3
		jz done4
		shl eax, 5
		add edi, eax
		neg eax
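		// each vertex is emitted twice: the movss/movhps/shufps sequence leaves
		// (x, y, z, 0) for the projected copy, and OR-ing in SIMD_SP_lastOne
		// sets w = 1 for the untranslated copy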
	loop4:
		prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps [edi+eax+1*16], xmm0
		orps xmm0, xmm4
		movaps [edi+eax+0*16], xmm0

		movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps [edi+eax+3*16], xmm1
		orps xmm1, xmm5
		movaps [edi+eax+2*16], xmm1

		movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps [edi+eax+5*16], xmm2
		orps xmm2, xmm6
		movaps [edi+eax+4*16], xmm2

		movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps [edi+eax+7*16], xmm3
		orps xmm3, xmm7
		movaps [edi+eax+6*16], xmm3

		add esi, 4*DRAWVERT_SIZE
		add eax, 4*8*4
		jl loop4

	done4:
		mov eax, numVerts
		and eax, 3
		jz done1
		shl eax, 5
		add edi, eax
		neg eax
	loop1:
		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps [edi+eax+1*16], xmm0
		orps xmm0, xmm4
		movaps [edi+eax+0*16], xmm0
		add esi, DRAWVERT_SIZE
		add eax, 8*4
		jl loop1
	done1:
	}
	return numVerts * 2;
#else
	for ( int i = 0; i < numVerts; i++ ) {
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[i*2+0][0] = v[0];
		vertexCache[i*2+0][1] = v[1];
		vertexCache[i*2+0][2] = v[2];
		vertexCache[i*2+0][3] = 1.0f;
		vertexCache[i*2+1][0] = v[0];
		vertexCache[i*2+1][1] = v[1];
		vertexCache[i*2+1][2] = v[2];
		vertexCache[i*2+1][3] = 0.0f;
	}
	return numVerts * 2;
#endif
}
/*
============
SSE_UpSample11kHzMonoPCMTo44kHz
============
*/
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*4*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-2*4*4+0], xmm0
		movhps [edi-2*4*4+8], xmm0
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-1*4*4+0], xmm1
		movhps [edi-1*4*4+8], xmm1
		add eax, 2*2
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movsx ecx, word ptr [esi]
		cvtsi2ss xmm0, ecx
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi+0], xmm0
		movhps [edi+8], xmm0
	done:
	}
}

/*
============
SSE_UpSample11kHzStereoPCMTo44kHz
============
*/
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		test eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 8*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		unpcklps xmm0, xmm1
		movlps [edi-8*4+0], xmm0
		movlps [edi-8*4+8], xmm0
		movlps [edi-4*4+0], xmm0
		movlps [edi-4*4+8], xmm0
		add eax, 2*2
		jl loop2
	done2:
	}
}

/*
============
SSE_UpSample22kHzMonoPCMTo44kHz
============
*/
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 4*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-4*4+0], xmm0
		movhps [edi-4*4+8], xmm0
		add eax, 2*2
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movsx ecx, word ptr [esi]
		cvtsi2ss xmm0, ecx
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi], xmm0
	done:
	}
}

/*
============
SSE_UpSample22kHzStereoPCMTo44kHz
============
*/
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		test eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 4*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movss [edi-4*4], xmm0
		movss [edi-2*4], xmm0
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		movss [edi-3*4], xmm1
		movss [edi-1*4], xmm1
		add eax, 2*2
		jl loop2
	done2:
	}
}

/*
============
SSE_UpSample44kHzMonoPCMTo44kHz
============
*/
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movss [edi-2*4], xmm0
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		movss [edi-1*4], xmm1
		add eax, 2*2
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movsx ecx, word ptr [esi]
		cvtsi2ss xmm0, ecx
		movss [edi], xmm0
	done:
	}
}

/*
============
idSIMD_SSE::UpSamplePCMTo44kHz

Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
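	// 44100 Hz input is already at the output rate; the mono routine is a plain
	// short-to-float copy, which presumably covers interleaved stereo as well
	// since numSamples counts individual samples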
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
		} else {
			SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
		} else {
			SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
		}
	} else if ( kHz == 44100 ) {
		SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
	} else {
		assert( 0 );
	}
}
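
/*
	For reference, a minimal scalar sketch of what SSE_UpSample11kHzMonoPCMTo44kHz
	produces: each 11025 Hz sample is converted to float and written four times to
	reach 44100 Hz. Not part of the build; dest is assumed to hold numSamples * 4
	floats, matching the 16 bytes the asm version stores per source sample.

	static void UpSample11kHzMonoPCMTo44kHz_Ref( float *dest, const short *src, const int numSamples ) {
		for ( int i = 0; i < numSamples; i++ ) {
			float s = (float) src[i];	// cvtsi2ss equivalent; PCM is not rescaled
			dest[i*4+0] = s;
			dest[i*4+1] = s;
			dest[i*4+2] = s;
			dest[i*4+3] = s;
		}
	}
*/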
/*
============
SSE_UpSample11kHzMonoOGGTo44kHz
============
*/
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 2
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*16
		movss xmm0, [esi+eax+0]
		mulss xmm0, xmm7
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-32], xmm0
		movlps [edi-24], xmm0
		movss xmm1, [esi+eax+4]
		mulss xmm1, xmm7
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-16], xmm1
		movlps [edi- 8], xmm1
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [esi]
		mulss xmm0, xmm7
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi+0], xmm0
		movlps [edi+8], xmm0
	done:
	}
}

/*
============
SSE_UpSample11kHzStereoOGGTo44kHz
============
*/
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov ecx, [esi+0]
		mov edx, [esi+4]
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add ecx, eax
		add edx, eax
		neg eax
		align 16
	loop2:
		add edi, 4*16
		movlps xmm0, [ecx+eax]
		movlps xmm1, [edx+eax]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi-8*8], xmm0
		movlps [edi-7*8], xmm0
		movlps [edi-6*8], xmm0
		movlps [edi-5*8], xmm0
		movhps [edi-4*8], xmm0
		movhps [edi-3*8], xmm0
		movhps [edi-2*8], xmm0
		movhps [edi-1*8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [ecx]
		movss xmm1, [edx]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi+0*8], xmm0
		movlps [edi+1*8], xmm0
		movlps [edi+2*8], xmm0
		movlps [edi+3*8], xmm0
	done:
	}
}

/*
============
SSE_UpSample22kHzMonoOGGTo44kHz
============
*/
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 2
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*8
		movss xmm0, [esi+eax+0]
		movss xmm1, [esi+eax+4]
		shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm0, xmm7
		movlps [edi-16], xmm0
		movhps [edi- 8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [esi]
		mulss xmm0, xmm7
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi+0], xmm0
	done:
	}
}

/*
============
SSE_UpSample22kHzStereoOGGTo44kHz
============
*/
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov ecx, [esi+0]
		mov edx, [esi+4]
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add ecx, eax
		add edx, eax
		neg eax
		align 16
	loop2:
		add edi, 2*16
		movlps xmm0, [ecx+eax]
		movlps xmm1, [edx+eax]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi-4*8], xmm0
		movlps [edi-3*8], xmm0
		movhps [edi-2*8], xmm0
		movhps [edi-1*8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [ecx]
		movss xmm1, [edx]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi+0*8], xmm0
		movlps [edi+1*8], xmm0
	done:
	}
}

/*
============
SSE_UpSample44kHzMonoOGGTo44kHz
============
*/
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	float constant = 32768.0f;
	KFLOAT_CA( mul, dest, src, constant, numSamples )
}

/*
============
SSE_UpSample44kHzStereoOGGTo44kHz
============
*/
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov ecx, [esi+0]
		mov edx, [esi+4]
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add ecx, eax
		add edx, eax
		neg eax
		align 16
	loop2:
		add edi, 16
		movlps xmm0, [ecx+eax]
		movlps xmm1, [edx+eax]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi-2*8], xmm0
		movhps [edi-1*8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [ecx]
		movss xmm1, [edx]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi+0*8], xmm0
	done:
	}
}

/*
============
idSIMD_SSE::UpSampleOGGTo44kHz

Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else {
		assert( 0 );
	}
}
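
/*
	For reference, a minimal scalar sketch of the stereo OGG upsampling, using the
	22 kHz case: the decoded floats (nominally in [-1, 1]) are scaled by 32768 into
	16-bit range and each left/right frame is written twice. Not part of the build;
	numSamples is assumed to count samples summed over both channels, which matches
	the pointer arithmetic of the asm version.

	static void UpSample22kHzStereoOGGTo44kHz_Ref( float *dest, const float * const *src, const int numSamples ) {
		for ( int i = 0; i < numSamples / 2; i++ ) {
			float l = src[0][i] * 32768.0f;	// src[0] = left channel, src[1] = right
			float r = src[1][i] * 32768.0f;
			dest[i*4+0] = l;	// each 22 kHz stereo frame is emitted twice
			dest[i*4+1] = r;
			dest[i*4+2] = l;
			dest[i*4+3] = r;
		}
	}
*/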
/*
============
idSIMD_SSE::MixSoundTwoSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 2
		add esi, eax
		neg eax

		mov ecx, lastV
		movlps xmm6, [ecx]
		xorps xmm7, xmm7
		movhps xmm7, incs
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps xmm6, xmm7
		shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps xmm7, xmm7
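		// xmm6 holds the left/right gains for two consecutive sample pairs
		// ( sL0, sR0, sL1, sR1 ); xmm7 holds the doubled per-pair increments
		// (compare the scalar path below)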
	loop16:
		add edi, 4*4*4

		movaps xmm0, [esi+eax+0*4*4]
		movaps xmm1, xmm0
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps xmm0, xmm6
		addps xmm0, [edi-4*4*4]
		addps xmm6, xmm7
		movaps [edi-4*4*4], xmm0

		shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps xmm1, xmm6
		addps xmm1, [edi-3*4*4]
		addps xmm6, xmm7
		movaps [edi-3*4*4], xmm1

		movaps xmm2, [esi+eax+1*4*4]
		movaps xmm3, xmm2
		shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps xmm2, xmm6
		addps xmm2, [edi-2*4*4]
		addps xmm6, xmm7
		movaps [edi-2*4*4], xmm2

		shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps xmm3, xmm6
		addps xmm3, [edi-1*4*4]
		addps xmm6, xmm7
		movaps [edi-1*4*4], xmm3

		add eax, 2*4*4
		jl loop16
	}
#else
	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i+0] * sL0;
		mixBuffer[i*2+1] += samples[i+0] * sR0;
		mixBuffer[i*2+2] += samples[i+1] * sL1;
		mixBuffer[i*2+3] += samples[i+1] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}
#endif
}

/*
============
idSIMD_SSE::MixSoundTwoSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 3
		add esi, eax
		neg eax

		mov ecx, lastV
		movlps xmm6, [ecx]
		xorps xmm7, xmm7
		movhps xmm7, incs
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps xmm6, xmm7
		shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps xmm7, xmm7
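		// same gain setup as the mono version; the input is already interleaved
		// stereo, so the samples are used without shuffling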
	loop16:
		add edi, 4*4*4

		movaps xmm0, [esi+eax+0*4*4]
		mulps xmm0, xmm6
		addps xmm0, [edi-4*4*4]
		addps xmm6, xmm7
		movaps [edi-4*4*4], xmm0

		movaps xmm2, [esi+eax+1*4*4]
		mulps xmm2, xmm6
		addps xmm2, [edi-3*4*4]
		addps xmm6, xmm7
		movaps [edi-3*4*4], xmm2

		movaps xmm3, [esi+eax+2*4*4]
		mulps xmm3, xmm6
		addps xmm3, [edi-2*4*4]
		addps xmm6, xmm7
		movaps [edi-2*4*4], xmm3

		movaps xmm4, [esi+eax+3*4*4]
		mulps xmm4, xmm6
		addps xmm4, [edi-1*4*4]
		addps xmm6, xmm7
		movaps [edi-1*4*4], xmm4

		add eax, 4*4*4
		jl loop16
	}
#else
	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
		mixBuffer[i*2+1] += samples[i*2+1] * sR0;
		mixBuffer[i*2+2] += samples[i*2+2] * sL1;
		mixBuffer[i*2+3] += samples[i*2+3] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}
#endif
}

/*
============
idSIMD_SSE::MixSoundSixSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 2
		add esi, eax
		neg eax

		mov ecx, lastV
		movlps xmm2, [ecx+ 0]
		movhps xmm2, [ecx+ 8]
		movlps xmm3, [ecx+16]
		movaps xmm4, xmm2
		shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
		xorps xmm5, xmm5
		movhps xmm5, incs
		movlps xmm7, incs+8
		movhps xmm7, incs+16
		addps xmm3, xmm5
		addps xmm4, xmm7
		shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps xmm6, xmm7
		shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps xmm5, xmm5
		addps xmm6, xmm6
		addps xmm7, xmm7
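		// xmm2/xmm3/xmm4 hold the 12 speaker gains covering two consecutive
		// 6-channel frames; xmm5/xmm6/xmm7 are the matching doubled increments;
		// each iteration mixes four mono samples into 24 output floats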
	loop24:
		add edi, 6*16

		movaps xmm0, [esi+eax]

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm1, xmm2
		addps xmm1, [edi-6*16]
		addps xmm2, xmm5
		movaps [edi-6*16], xmm1

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps xmm1, xmm3
		addps xmm1, [edi-5*16]
		addps xmm3, xmm6
		movaps [edi-5*16], xmm1

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
		mulps xmm1, xmm4
		addps xmm1, [edi-4*16]
		addps xmm4, xmm7
		movaps [edi-4*16], xmm1

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
		mulps xmm1, xmm2
		addps xmm1, [edi-3*16]
		addps xmm2, xmm5
		movaps [edi-3*16], xmm1

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps xmm1, xmm3
		addps xmm1, [edi-2*16]
		addps xmm3, xmm6
		movaps [edi-2*16], xmm1

		shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
		mulps xmm0, xmm4
		addps xmm0, [edi-1*16]
		addps xmm4, xmm7
		movaps [edi-1*16], xmm0

		add eax, 4*4
		jl loop24
	}
#else
	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];

	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i+0] * sL1;
		mixBuffer[i*6+ 2] += samples[i+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i+0] * sL3;
		mixBuffer[i*6+ 4] += samples[i+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i+0] * sL5;
		mixBuffer[i*6+ 6] += samples[i+1] * sL6;
		mixBuffer[i*6+ 7] += samples[i+1] * sL7;
		mixBuffer[i*6+ 8] += samples[i+1] * sL8;
		mixBuffer[i*6+ 9] += samples[i+1] * sL9;
		mixBuffer[i*6+10] += samples[i+1] * sL10;
		mixBuffer[i*6+11] += samples[i+1] * sL11;
		sL0 += incL0;
		sL1 += incL1;
		sL2 += incL2;
		sL3 += incL3;
		sL4 += incL4;
		sL5 += incL5;
		sL6 += incL0;
		sL7 += incL1;
		sL8 += incL2;
		sL9 += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}
#endif
}

/*
============
idSIMD_SSE::MixSoundSixSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 3
		add esi, eax
		neg eax

		mov ecx, lastV
		movlps xmm2, [ecx+ 0]
		movhps xmm2, [ecx+ 8]
		movlps xmm3, [ecx+16]
		movaps xmm4, xmm2
		shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
		xorps xmm5, xmm5
		movhps xmm5, incs
		movlps xmm7, incs+ 8
		movhps xmm7, incs+16
		addps xmm3, xmm5
		addps xmm4, xmm7
		shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps xmm6, xmm7
		shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps xmm5, xmm5
		addps xmm6, xmm6
		addps xmm7, xmm7
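		// each iteration mixes two stereo input frames into two 6-channel output
		// frames (12 floats); the left sample feeds four of the six channels and
		// the right sample feeds channels 1 and 5, per the asserts above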
	loop12:
		add edi, 3*16

		movaps xmm0, [esi+eax+0]

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
		mulps xmm1, xmm2
		addps xmm1, [edi-3*16]
		addps xmm2, xmm5
		movaps [edi-3*16], xmm1

		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
		mulps xmm1, xmm3
		addps xmm1, [edi-2*16]
		addps xmm3, xmm6
		movaps [edi-2*16], xmm1

		add eax, 4*4

		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
		mulps xmm0, xmm4
		addps xmm0, [edi-1*16]
		addps xmm4, xmm7
		movaps [edi-1*16], xmm0

		jl loop12
		emms
	}
#else
	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];

	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
		mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
		mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
		mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
		mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
		mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
		mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
		mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
		sL0 += incL0;
		sL1 += incL1;
		sL2 += incL2;
		sL3 += incL3;
		sL4 += incL4;
		sL5 += incL5;
		sL6 += incL0;
		sL7 += incL1;
		sL8 += incL2;
		sL9 += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}
#endif
}

/*
============
idSIMD_SSE::MixedSoundToSamples
============
*/
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1
	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {
		mov eax, numSamples
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 2
		add edi, eax
		neg eax
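		// convert 16 floats per iteration; packssdw saturates to the
		// [-32768, 32767] range, replacing the explicit clamp of the C path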
	loop16:
		movaps xmm0, [edi+eax+0*16]
		movaps xmm2, [edi+eax+1*16]
		movaps xmm4, [edi+eax+2*16]
		movaps xmm6, [edi+eax+3*16]

		add esi, 4*4*2

		movhlps xmm1, xmm0
		movhlps xmm3, xmm2
		movhlps xmm5, xmm4
		movhlps xmm7, xmm6

		prefetchnta [edi+eax+64]

		cvtps2pi mm0, xmm0
		cvtps2pi mm2, xmm2
		cvtps2pi mm4, xmm4
		cvtps2pi mm6, xmm6

		prefetchnta [edi+eax+128]

		cvtps2pi mm1, xmm1
		cvtps2pi mm3, xmm3
		cvtps2pi mm5, xmm5
		cvtps2pi mm7, xmm7

		add eax, 4*16

		packssdw mm0, mm1
		packssdw mm2, mm3
		packssdw mm4, mm5
		packssdw mm6, mm7

		movq [esi-4*4*2], mm0
		movq [esi-3*4*2], mm2
		movq [esi-2*4*2], mm4
		movq [esi-1*4*2], mm6

		jl loop16

		emms
	}
#else
	for ( int i = 0; i < numSamples; i++ ) {
		if ( mixBuffer[i] <= -32768.0f ) {
			samples[i] = -32768;
		} else if ( mixBuffer[i] >= 32767.0f ) {
			samples[i] = 32767;
		} else {
			samples[i] = (short) mixBuffer[i];
		}
	}
#endif
}

#endif /* _WIN32 */