/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/

#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_AltiVec.h"
#include <math.h>
#include <float.h>

#ifdef PPC_INTRINSICS
#include <ppc_intrinsics.h>
#endif

// Doom3 SIMD Library version 0.5
// Patrick Flanagan (pflanagan@apple.com)
// Sanjay Patel (spatel@apple.com)
// Architecture & Performance Group, Apple Computer

//===============================================================
//
// AltiVec implementation of idSIMDProcessor
//
//===============================================================

#if defined(MACOS_X) && defined(__ppc__)

// Data struct sizes
#ifndef DRAWVERT_PADDED
// 60 bytes, 15 floats at 4 bytes each
#define DRAWVERT_OFFSET 15
#else
// 64 bytes, 16 floats
#define DRAWVERT_OFFSET 16
#endif
// 16 bytes each, 4 floats
#define PLANE_OFFSET 4
// 16 bytes each, 4 floats
#define IDVEC4_OFFSET 4

// Alignment tests
#define IS_16BYTE_ALIGNED( x )  ( ( (unsigned long)&x & 0x0F ) == 0 )
#define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) != 0 )
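
// Usage sketch (added commentary, not from the original source): the alignment
// tests take an lvalue and check the low four bits of its address, e.g.
//
//     float buf[16];
//     if ( IS_16BYTE_ALIGNED( buf[0] ) ) {
//         // safe to use vec_ld / vec_st on buf directly
//     } else {
//         // fall back to the unaligned load/store paths below
//     }
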
// Aligned storing floats
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR )

#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR )

#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR )

#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR ); \
    vec_st( V4, 64, ADDR ); \
    vec_st( V5, 80, ADDR )

#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
    vec_st( V0, 0, ADDR ); \
    vec_st( V1, 16, ADDR ); \
    vec_st( V2, 32, ADDR ); \
    vec_st( V3, 48, ADDR ); \
    vec_st( V4, 64, ADDR ); \
    vec_st( V5, 80, ADDR ); \
    vec_st( V6, 96, ADDR ); \
    vec_st( V7, 112, ADDR )

// Unaligned storing floats. These assume that we can trash the input
#define UNALIGNED_STORE1( ADDR, V0 ) { \
    /* use store element */ \
    vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
    V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
    vec_ste( V0, 0, ADDR ); \
    vec_ste( V0, 4, ADDR ); \
    vec_ste( V0, 8, ADDR ); \
    vec_ste( V0, 12, ADDR ); \
}
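
// Note (added commentary): vec_lvsr( 0, ADDR ) builds a permute map that rotates
// the four floats right by the destination's misalignment, and each vec_ste()
// then stores only the element whose lane matches the low bits of its effective
// address, so the four element-stores above land the data without touching any
// neighboring bytes.
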
  93. #define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
  94. /* load up the values that are there now */ \
  95. vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
  96. vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
  97. /* generate permute vector and mask */ \
  98. vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
  99. vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
  100. /* right rotate input data */ \
  101. V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
  102. V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
  103. /* setup the output vectors */ \
  104. vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
  105. ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
  106. ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
  107. ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
  108. /* store results */ \
  109. vec_st( ULStoreVal1, 0, ADDR ); \
  110. vec_st( ULStoreVal2, 15, ADDR ); \
  111. vec_st( ULStoreVal3, 31, ADDR ); }
  112. #define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
  113. /* load up the values that are there now */ \
  114. vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
  115. vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
  116. /* generate permute vector and mask */ \
  117. vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
  118. vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
  119. /* right rotate input data */ \
  120. V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
  121. V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
  122. V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
  123. /* setup the output vectors */ \
  124. vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
  125. ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
  126. ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
  127. ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
  128. ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
  129. /* store results */ \
  130. vec_st( ULStoreVal1, 0, ADDR ); \
  131. vec_st( ULStoreVal2, 15, ADDR ); \
  132. vec_st( ULStoreVal3, 31, ADDR ); \
  133. vec_st( ULStoreVal4, 47, ADDR ); }
  134. #define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
  135. /* load up the values that are there now */ \
  136. vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
  137. vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
  138. /* generate permute vector and mask */ \
  139. vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
  140. vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
  141. /* right rotate input data */ \
  142. V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
  143. V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
  144. V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
  145. V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
  146. /* setup the output vectors */ \
  147. vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
  148. ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
  149. ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
  150. ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
  151. ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
  152. ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
  153. /* store results */ \
  154. vec_st( ULStoreVal1, 0, ADDR ); \
  155. vec_st( ULStoreVal2, 15, ADDR ); \
  156. vec_st( ULStoreVal3, 31, ADDR ); \
  157. vec_st( ULStoreVal4, 47, ADDR ); \
  158. vec_st( ULStoreVal5, 63, ADDR ); }
  159. #define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
  160. /* load up the values that are there now */ \
  161. vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
  162. vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
  163. /* generate permute vector and mask */ \
  164. vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
  165. vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
  166. /* right rotate input data */ \
  167. V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
  168. V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
  169. V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
  170. V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
  171. V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
  172. V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
  173. /* setup the output vectors */ \
  174. vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
  175. ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
  176. ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
  177. ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
  178. ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
  179. ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
  180. ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
  181. ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
  182. /* store results */ \
  183. vec_st( ULStoreVal1, 0, ADDR ); \
  184. vec_st( ULStoreVal2, 15, ADDR ); \
  185. vec_st( ULStoreVal3, 31, ADDR ); \
  186. vec_st( ULStoreVal4, 47, ADDR ); \
  187. vec_st( ULStoreVal5, 63, ADDR ); \
  188. vec_st( ULStoreVal6, 79, ADDR ); \
  189. vec_st( ULStoreVal7, 95, ADDR ); }
  190. #define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
  191. /* load up the values that are there now */ \
  192. vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
  193. vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
  194. /* generate permute vector and mask */ \
  195. vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
  196. vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
  197. /* right rotate input data */ \
  198. V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
  199. V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
  200. V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
  201. V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
  202. V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
  203. V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
  204. V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
  205. V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
  206. V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
  207. /* setup the output vectors */ \
  208. vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
  209. vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
  210. ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
  211. ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
  212. ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
  213. ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
  214. ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
  215. ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
  216. ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
  217. ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
  218. ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
  219. ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
  220. /* store results */ \
  221. vec_st( ULStoreVal1, 0, ADDR ); \
  222. vec_st( ULStoreVal2, 15, ADDR ); \
  223. vec_st( ULStoreVal3, 31, ADDR ); \
  224. vec_st( ULStoreVal4, 47, ADDR ); \
  225. vec_st( ULStoreVal5, 63, ADDR ); \
  226. vec_st( ULStoreVal6, 79, ADDR ); \
  227. vec_st( ULStoreVal7, 95, ADDR ); \
  228. vec_st( ULStoreVal8, 111, ADDR ); \
  229. vec_st( ULStoreVal9, 127, ADDR ); \
  230. vec_st( ULStoreVal10, 143, ADDR ); }
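// Note on the UNALIGNED_STORE* macros above: vec_st always writes to a
// 16-byte aligned address (the low four bits of the effective address are
// ignored), so an unaligned store has to be synthesized. The macros first
// load the aligned blocks that bracket the destination, build a selection
// mask from vec_lvsr, rotate the input vectors into store position with
// vec_perm, blend old and new bytes with vec_sel, and write the whole run
// back with aligned vec_st calls. Only the first and last blocks mix old
// and new data; the interior blocks come entirely from the rotated inputs.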
  231. /*
  232. ============
  233. idSIMD_AltiVec::GetName
  234. ============
  235. */
  236. const char *idSIMD_AltiVec::GetName( void ) const {
  237. return "AltiVec";
  238. }
  239. /*
  240. Helper Functions
  241. */
  242. #if 0
  243. // Prints the values of a vector, useful for debugging but
  244. // should never be called in real code
  245. inline void debugPrintVector( vector float v, char *msg ) {
  246. printf("%s -- %vf\n", msg, v );
  247. }
  248. inline void debugPrintVector( vector unsigned int v, char *msg ) {
  249. printf("%s -- %vd\n", msg, v );
  250. }
  251. inline void debugPrintVector( vector bool int v, char *msg ) {
  252. printf("%s -- %vi\n", msg, v );
  253. }
  254. inline void debugPrintVector( vector unsigned char v, char *msg ) {
  255. printf("%s -- %vuc\n", msg, v );
  256. }
  257. inline void debugPrintVector( vector unsigned short v, char *msg ) {
  258. printf("%s -- %vs\n", msg, v );
  259. }
  260. #endif
  261. /*
  262. ===============
  263. Reciprocal
  264. For each element in vector:
  265. n = 1 / n
  266. ===============
  267. */
  268. // Use Newton-Raphson to calculate reciprocal of a vector
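// vec_re gives roughly a 12-bit estimate r0; one Newton-Raphson step
// r1 = r0 + r0 * ( 1 - v * r0 ) brings it close to full single precision.
// The code below folds that step into vec_nmsub (computes 1 - v * r0) and
// vec_madd (multiplies by r0 and adds r0).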
  269. inline vector float Reciprocal( vector float v ) {
  270. //Get the reciprocal estimate
  271. vector float estimate = vec_re( v );
  272. //One round of Newton-Raphson refinement
  273. return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
  274. }
  275. /*
  276. ===============
  277. ReciprocalSquareRoot
  278. For each element in vector:
  279. n = 1 / sqrt(n)
  280. ===============
  281. */
  282. // Reciprocal square root estimate of a vector
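// vec_rsqrte also gives roughly a 12-bit estimate e0; the refinement below
// is e1 = e0 + 0.5 * e0 * ( 1 - v * e0 * e0 ). Clamping the input to at
// least FLT_MIN keeps the estimate finite when an element is zero.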
  283. inline vector float ReciprocalSquareRoot( vector float v ) {
  284. //Get the square root reciprocal estimate
  285. vector float zero = (vector float)(0);
  286. vector float oneHalf = (vector float)(0.5);
  287. vector float one = (vector float)(1.0);
  288. vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
  289. //One round of Newton-Raphson refinement
  290. vector float estimateSquared = vec_madd( estimate, estimate, zero );
  291. vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
  292. return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
  293. }
  294. /*
  295. ===============
  296. Divide
  297. For each element in vectors:
  298. n = a / b
  299. ===============
  300. */
  301. // Use reciprocal estimate and multiply to divide a vector
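// AltiVec has no vector divide instruction, so a / b is computed as
// a * Reciprocal( b ). With one Newton-Raphson round this is accurate to
// roughly single precision, but it is not an exactly rounded IEEE divide.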
  302. inline vector float Divide( vector float a, vector float b ) {
  303. return vec_madd( a, Reciprocal( b ), (vector float)(0) );
  304. }
  305. /*
  306. ===============
  307. loadSplatUnalignedScalar
  308. For each element in vector:
  309. n = s
  310. ===============
  311. */
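// Loads one float from a possibly unaligned address and replicates it into
// all four lanes: vec_ld fetches the aligned block containing the scalar,
// vec_lvsl yields its byte offset, and splatting the first four permute
// bytes builds a perm vector that copies those four bytes into every slot.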
  312. inline vector float loadSplatUnalignedScalar( const float *s ) {
  313. vector unsigned char splatMap = vec_lvsl( 0, s );
  314. vector float v = vec_ld( 0, s );
  315. splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
  316. return vec_perm( v, v, splatMap );
  317. }
  318. /*
  319. ===============
  320. VectorATan16
  321. For each element in vector:
  322. n = idMath::ATan16( x, y )
  323. ===============
  324. */
325. // calculates the arc tangent of a vector with 16 bits of precision, based on idMath::ATan16
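// Vector version of the scalar ATan16 approach: choose the ratio whose
// magnitude is <= 1 (x/y when |y| > |x|, otherwise y/x), evaluate the odd
// polynomial approximation of atan via Horner's rule in a*a, and then
// select per lane either that value or its +/- HALF_PI adjusted
// counterpart, mirroring the branches of the scalar routine.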
  326. inline vector float VectorATan16( vector float x, vector float y ) {
  327. vector float xDivY = Divide( x, y );
  328. vector float yDivX = Divide( y, x );
  329. vector float zeroVector = (vector float)(0);
  330. vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
  331. vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
  332. vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
  333. vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
  334. // do calculation for S
  335. vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
  336. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
  337. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
  338. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
  339. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
  340. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
  341. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
  342. vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
  343. // get the regular S value
  344. vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
  345. // calculate what to return if y > x
  346. vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
  347. vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
  348. vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
  349. return vec_sel( modRet, vecS, vecCmp );
  350. }
  351. /*
  352. ===============
  353. VectorSin16
  354. For each element in vector:
  355. n = idMath::Sin16( v )
  356. ===============
  357. */
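// Reduces each element into [0, 2*PI) with a floored divide by 2*PI, then
// folds it into [-PI/2, PI/2] (a = PI - a for the middle of the range,
// a = a - 2*PI for the top quarter) before evaluating the sine polynomial
// in a*a, which is only accurate on that reduced interval.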
  358. inline vector float VectorSin16( vector float v ) {
  359. vector float zero = (vector float)(0);
  360. #if 0
  361. // load up half PI and use it to calculate the rest of the values. This is
  362. // sometimes cheaper than loading them from memory
  363. vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
  364. vector float PI = vec_add( halfPI, halfPI );
  365. vector float oneandhalfPI = vec_add( PI, halfPI );
  366. vector float twoPI = vec_add( oneandhalfPI, halfPI );
  367. #else
  368. vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
  369. vector float PI = (vector float)(3.14159265358979323846f);
  370. vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
  371. vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
  372. #endif
  373. vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
  374. vector float vecMod;
  375. vector float vecResult;
376. // fix the range if need be
  377. vecMod = vec_floor( Divide( v, twoPI ) );
  378. vecResult = vec_nmsub( vecMod, twoPI, v );
  379. vector float vecPIminusA = vec_sub( PI, vecResult );
  380. vector float vecAminus2PI = vec_sub( vecResult, twoPI );
  381. vecCmp1 = vec_cmplt( vecResult, PI );
  382. vecCmp2 = vec_cmpgt( vecResult, halfPI );
  383. // these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
  384. vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
  385. // we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
  386. vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
  387. // these are ones where a < PI and a > HALF_PI so we set a = PI - a
  388. vecCmp1 = vec_and( vecCmp1, vecCmp2 );
  389. vecCmp1 = vec_or( vecCmp1, vecCmp4 );
  390. // put the correct values into place
  391. vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
  392. vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
  393. // calculate answer
  394. vector float vecASquared = vec_madd( vecResult, vecResult, zero );
  395. vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
  396. vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
  397. vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
  398. vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
  399. vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
  400. return vec_madd( vecResult, vecEst, zero );
  401. }
  402. /*
  403. ===============
  404. vecSplatWithRunTime
  405. For each element in vector:
  406. n = v(i)
  407. ===============
  408. */
  409. // splats an element across a vector using a runtime variable
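// vec_splat needs a compile-time constant lane, so for a runtime index we
// first rotate the wanted element into lane 0: vec_lvsl with a byte offset
// of i * sizeof( float ) produces exactly that left-rotation permute.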
  410. inline vector float vecSplatWithRunTime( vector float v, int i ) {
  411. vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
  412. v = vec_perm( v, v, rotate );
  413. return vec_splat( v, 0 );
  414. }
  415. /*
  416. ===============
  417. FastScalarInvSqrt
  418. n = 1 / sqrt( f )
  419. ===============
  420. */
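// __frsqrte provides only about 5 bits of accuracy, so two Newton-Raphson
// rounds of e = e + 0.5 * e * ( 1 - f * e * e ) are applied to reach
// roughly full single precision. Adding FLT_MIN keeps the estimate finite
// when f is zero.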
  421. inline float FastScalarInvSqrt( float f ) {
  422. #ifdef PPC_INTRINSICS
  423. float estimate;
  424. const float kSmallestFloat = FLT_MIN;
  425. //Calculate a 5 bit starting estimate for the reciprocal sqrt
  426. estimate = __frsqrte ( f + kSmallestFloat );
427. // if you require less precision, you may remove one of the refinement steps below.
428. // This does 2 rounds of Newton-Raphson refinement
  429. estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
  430. estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
  431. return estimate;
  432. #else
  433. return idMath::InvSqrt( f );
  434. #endif
  435. }
  436. /*
  437. ===============
  438. FastScalarInvSqrt_x3
  439. arg1 = 1 / sqrt( arg1 )
  440. arg2 = 1 / sqrt( arg2 )
  441. arg3 = 1 / sqrt( arg3 )
  442. ===============
  443. */
  444. inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
  445. #ifdef PPC_INTRINSICS
  446. register float estimate1, estimate2, estimate3;
  447. const float kSmallestFloat = FLT_MIN;
  448. //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
  449. estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
  450. estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
  451. estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
452. // two rounds of Newton-Raphson refinement
  453. estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
  454. estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
  455. estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
  456. estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
  457. estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
  458. estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
  459. *arg1 = estimate1;
  460. *arg2 = estimate2;
  461. *arg3 = estimate3;
  462. #else
  463. *arg1 = idMath::InvSqrt( *arg1 );
  464. *arg2 = idMath::InvSqrt( *arg2 );
  465. *arg3 = idMath::InvSqrt( *arg3 );
  466. #endif
  467. }
  468. /*
  469. ===============
  470. FastScalarInvSqrt_x6
  471. arg1 = 1 / sqrt( arg1 )
  472. arg2 = 1 / sqrt( arg2 )
  473. arg3 = 1 / sqrt( arg3 )
  474. arg4 = 1 / sqrt( arg4 )
  475. arg5 = 1 / sqrt( arg5 )
  476. arg6 = 1 / sqrt( arg6 )
477. On a G5 there are two FPU pipelines to fill (2 FPUs with 6 stages each).
  478. ===============
  479. */
  480. inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
  481. #ifdef PPC_INTRINSICS
  482. register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
  483. const float kSmallestFloat = FLT_MIN;
  484. //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
  485. estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
  486. estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
  487. estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
  488. estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
  489. estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
  490. estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
491. // two rounds of Newton-Raphson refinement
  492. estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
  493. estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
  494. estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
  495. estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
  496. estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
  497. estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
  498. estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
  499. estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
  500. estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
  501. estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
  502. estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
  503. estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
  504. *arg1 = estimate1;
  505. *arg2 = estimate2;
  506. *arg3 = estimate3;
  507. *arg4 = estimate4;
  508. *arg5 = estimate5;
  509. *arg6 = estimate6;
  510. #else
  511. *arg1 = idMath::InvSqrt( *arg1 );
  512. *arg2 = idMath::InvSqrt( *arg2 );
  513. *arg3 = idMath::InvSqrt( *arg3 );
  514. *arg4 = idMath::InvSqrt( *arg4 );
  515. *arg5 = idMath::InvSqrt( *arg5 );
  516. *arg6 = idMath::InvSqrt( *arg6 );
  517. #endif
  518. }
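#if 0
// Illustrative sketch only (never called): interleaving independent
// estimates is what lets the refinement steps above fill both FPU
// pipelines. A typical caller normalizes several vectors at once; the
// function and local names here are hypothetical, not part of the interface.
static void NormalizeThree( idVec3 &a, idVec3 &b, idVec3 &c ) {
	float lenA = a.x * a.x + a.y * a.y + a.z * a.z;
	float lenB = b.x * b.x + b.y * b.y + b.z * b.z;
	float lenC = c.x * c.x + c.y * c.y + c.z * c.z;
	// turn the three squared lengths into reciprocal square roots in one call
	FastScalarInvSqrt_x3( &lenA, &lenB, &lenC );
	a *= lenA;
	b *= lenB;
	c *= lenC;
}
#endif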
  519. // End Helper Functions
  520. #ifdef ENABLE_SIMPLE_MATH
  521. /*
  522. ============
  523. idSIMD_AltiVec::Add
  524. dst[i] = constant + src[i];
  525. ============
  526. */
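// Add and the other simple arithmetic routines below follow the same
// three-phase pattern: a scalar loop runs until &dst[i] is 16-byte aligned,
// a vector loop then processes eight floats per iteration (sources are read
// through a vec_lvsl-derived permute, so they may be misaligned), and a
// final scalar loop finishes whatever elements are left over.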
  527. void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
  528. vector float v0, v1, v2, v3;
  529. vector float v0_low, v0_hi, v1_hi;
  530. vector unsigned char permVec;
  531. vector float constVec;
  532. int i;
  533. // handle unaligned cases at beginning
  534. for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  535. dst[i] = constant + src[i];
  536. }
  537. //splat constant into a vector
  538. constVec = loadSplatUnalignedScalar( &constant );
  539. //calculate permute and do first load
  540. permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
  541. v1_hi = vec_ld( 0, &src[i] );
  542. //vectorize!
  543. for ( ; i+7 < count; i += 8 ) {
  544. //load source
  545. v0_low = v1_hi;
  546. v0_hi = vec_ld( 15, &src[i] );
  547. v1_hi = vec_ld( 31, &src[i] );
  548. v0 = vec_perm( v0_low, v0_hi, permVec );
  549. v1 = vec_perm( v0_hi, v1_hi, permVec );
  550. v2 = vec_add( v0, constVec );
  551. v3 = vec_add( v1, constVec );
  552. // store results
  553. ALIGNED_STORE2( &dst[i], v2, v3 );
  554. }
  555. //handle cleanup
  556. for ( ; i < count ; i++ ) {
  557. dst[i] = constant + src[i];
  558. }
  559. }
  560. /*
  561. ============
  562. idSIMD_AltiVec::Add
  563. dst[i] = src0[i] + src1[i];
  564. ============
  565. */
  566. void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
  567. register vector float v0, v1, v2, v3, v4, v5;
  568. //src0
  569. register vector float v0_low, v0_hi, v2_low, v2_hi;
  570. //src1
  571. register vector float v1_low, v1_hi, v3_low, v3_hi;
  572. //permute vectors
  573. register vector unsigned char permVec1, permVec2;
  574. vector unsigned char oneCharVector = (vector unsigned char)(1);
  575. int i;
  576. //unaligned at start
  577. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  578. dst[i] = src0[i] + src1[i];
  579. }
  580. //calculate permute and do loads
  581. permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
  582. permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  583. v2_hi = vec_ld( 0, &src0[i] );
  584. v3_hi = vec_ld( 0, &src1[i] );
  585. //vectorize!
  586. for ( ; i+7 < count; i += 8 ) {
  587. //load source
  588. v0_low = v2_hi;
  589. v0_hi = vec_ld( 15, &src0[i] );
  590. v2_low = v0_hi;
  591. v2_hi = vec_ld( 31, &src0[i] );
  592. v1_low = v3_hi;
  593. v1_hi = vec_ld( 15, &src1[i] );
  594. v3_low = v1_hi;
  595. v3_hi = vec_ld( 31, &src1[i] );
  596. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  597. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  598. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  599. v3 = vec_perm( v3_low, v3_hi, permVec2 );
  600. v4 = vec_add( v0, v1 );
  601. v5 = vec_add( v2, v3 );
  602. ALIGNED_STORE2( &dst[i], v4, v5 );
  603. }
  604. //handle cleanup
  605. for ( ; i < count ; i++ ) {
  606. dst[i] = src0[i] + src1[i];
  607. }
  608. }
  609. /*
  610. ============
  611. idSIMD_AltiVec::Sub
  612. dst[i] = constant - src[i];
  613. ============
  614. */
  615. void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
  616. register vector float v0, v1, v2, v3;
  617. register vector float v0_low, v0_hi, v1_low, v1_hi;
  618. register vector unsigned char permVec;
  619. register vector float constVec;
  620. vector unsigned char oneCharVector = (vector unsigned char)(1);
  621. int i;
  622. //handle unaligned at start
  623. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  624. dst[i] = constant - src[i];
  625. }
  626. //splat constant into a vector
  627. constVec = loadSplatUnalignedScalar( &constant );
  628. //calculate permute vector and do first load
  629. permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
  630. v1_hi = vec_ld( 0, &src[i] );
  631. //vectorize!
  632. for ( ; i+7 < count; i += 8 ) {
  633. //load source
  634. v0_low = v1_hi;
  635. v0_hi = vec_ld( 15, &src[i] );
  636. v1_low = v0_hi;
  637. v1_hi = vec_ld( 31, &src[i] );
  638. v0 = vec_perm( v0_low, v0_hi, permVec );
  639. v1 = vec_perm( v1_low, v1_hi, permVec );
  640. v2 = vec_sub( constVec, v0 );
  641. v3 = vec_sub( constVec, v1 );
  642. ALIGNED_STORE2( &dst[i], v2, v3 );
  643. }
  644. //handle cleanup
  645. for ( ; i < count ; i++ ) {
  646. dst[i] = constant - src[i];
  647. }
  648. }
  649. /*
  650. ============
  651. idSIMD_AltiVec::Sub
  652. dst[i] = src0[i] - src1[i];
  653. ============
  654. */
  655. void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
  656. register vector float v0, v1, v2, v3, v4, v5;
  657. //src0
  658. register vector float v0_low, v0_hi, v2_low, v2_hi;
  659. //src1
  660. register vector float v1_low, v1_hi, v3_low, v3_hi;
  661. register vector unsigned char permVec1, permVec2;
  662. vector unsigned char oneCharVector = (vector unsigned char)(1);
  663. int i;
  664. //handle unaligned at start
  665. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  666. dst[i] = src0[i] - src1[i];
  667. }
  668. //calculate permute and do first loads
  669. permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
  670. permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  671. v2_hi = vec_ld( 0, &src0[i] );
  672. v3_hi = vec_ld( 0, &src1[i] );
  673. //vectorize!
  674. for ( ; i+7 < count; i += 8 ) {
  675. //load source
  676. v0_low = v2_hi;
  677. v0_hi = vec_ld( 15, &src0[i] );
  678. v2_low = v0_hi;
  679. v2_hi = vec_ld( 31, &src0[i] );
  680. v1_low = v3_hi;
  681. v1_hi = vec_ld( 15, &src1[i] );
  682. v3_low = v1_hi;
  683. v3_hi = vec_ld( 31, &src1[i] );
  684. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  685. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  686. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  687. v3 = vec_perm( v3_low, v3_hi, permVec2 );
  688. v4 = vec_sub( v0, v1 );
  689. v5 = vec_sub( v2, v3 );
  690. ALIGNED_STORE2( &dst[i], v4, v5 );
  691. }
  692. //handle cleanup
  693. for ( ; i < count ; i++ ) {
  694. dst[i] = src0[i] - src1[i];
  695. }
  696. }
  697. /*
  698. ============
  699. idSIMD_AltiVec::Mul
  700. dst[i] = constant * src[i];
  701. ============
  702. */
  703. void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
  704. register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
  705. register vector float constVec;
  706. register vector unsigned char permVec;
  707. vector unsigned char oneCharVector = (vector unsigned char)(1);
  708. register vector float zeroVector = (vector float)(0.0);
  709. int i;
  710. // handle unaligned data at start
  711. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  712. dst[i] = constant * src[i];
  713. }
  714. //splat constant into a vector
  715. constVec = loadSplatUnalignedScalar( &constant );
  716. permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
  717. v1_hi = vec_ld( 0, &src[i] );
  718. //vectorize!
  719. for ( ; i+7 < count; i += 8 ) {
  720. //load source
  721. v0_low = v1_hi;
  722. v0_hi = vec_ld( 15, &src[i] );
  723. v1_low = v0_hi;
  724. v1_hi = vec_ld( 31, &src[i] );
  725. v0 = vec_perm( v0_low, v0_hi, permVec );
  726. v1 = vec_perm( v1_low, v1_hi, permVec );
  727. v2 = vec_madd( constVec, v0, zeroVector );
  728. v3 = vec_madd( constVec, v1, zeroVector );
  729. ALIGNED_STORE2( &dst[i], v2, v3 );
  730. }
  731. //handle cleanup
  732. for ( ; i < count ; i++ ) {
  733. dst[i] = constant * src[i];
  734. }
  735. }
  736. /*
  737. ============
  738. idSIMD_AltiVec::Mul
  739. dst[i] = src0[i] * src1[i];
  740. ============
  741. */
  742. void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
  743. register vector float v0, v1, v2, v3, v4, v5;
  744. //src0
  745. register vector float v0_low, v0_hi, v2_low, v2_hi;
  746. //src1
  747. register vector float v1_low, v1_hi, v3_low, v3_hi;
  748. //permute vectors
  749. register vector unsigned char permVec1, permVec2;
  750. register vector float constVec = (vector float)(0.0);
  751. vector unsigned char oneCharVector = (vector unsigned char)(1);
  752. int i;
  753. //handle unaligned at start
  754. for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  755. dst[i] = src0[i] * src1[i];
  756. }
  757. //calculate permute and do loads
  758. permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
  759. permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  760. v2_hi = vec_ld( 0, &src0[i] );
  761. v3_hi = vec_ld( 0, &src1[i] );
  762. //vectorize!
  763. for ( ; i+7 < count; i += 8 ) {
  764. //load source
  765. v0_low = v2_hi;
  766. v0_hi = vec_ld( 15, &src0[i] );
  767. v2_low = v0_hi;
  768. v2_hi = vec_ld( 31, &src0[i] );
  769. v1_low = v3_hi;
  770. v1_hi = vec_ld( 15, &src1[i] );
  771. v3_low = v1_hi;
  772. v3_hi = vec_ld( 31, &src1[i] );
  773. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  774. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  775. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  776. v3 = vec_perm( v3_low, v3_hi, permVec2 );
777. // AltiVec has no plain vector multiply, so we use
778. // a multiply-add with a zero addend
  779. v4 = vec_madd( v0, v1, constVec );
  780. v5 = vec_madd( v2, v3, constVec );
  781. ALIGNED_STORE2( &dst[i], v4, v5 );
  782. }
  783. //handle cleanup
  784. for ( ; i < count ; i++ ) {
  785. dst[i] = src0[i] * src1[i];
  786. }
  787. }
  788. /*
  789. ============
  790. idSIMD_AltiVec::Div
  791. dst[i] = constant / divisor[i];
  792. ============
  793. */
  794. void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
  795. register vector float v0, v1, v2, v3;
  796. register vector float v0_low, v0_hi, v1_low, v1_hi;
  797. register vector unsigned char permVec;
  798. register vector float constVec;
  799. vector unsigned char oneCharVector = (vector unsigned char)(1);
  800. int i;
  801. //handle unaligned at start
  802. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  803. dst[i] = constant / divisor[i];
  804. }
  805. //splat constant into a vector
  806. constVec = loadSplatUnalignedScalar( &constant );
  807. //calculate permute and do first loads
  808. permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
  809. v1_hi = vec_ld( 0, &divisor[i] );
  810. //vectorize!
  811. for ( ; i+7 < count; i += 8 ) {
  812. //load source
  813. v0_low = v1_hi;
  814. v0_hi = vec_ld( 15, &divisor[i] );
  815. v1_low = v0_hi;
  816. v1_hi = vec_ld( 31, &divisor[i] );
  817. v0 = vec_perm( v0_low, v0_hi, permVec );
  818. v1 = vec_perm( v1_low, v1_hi, permVec );
  819. v2 = Divide( constVec, v0 );
  820. v3 = Divide( constVec, v1 );
  821. ALIGNED_STORE2( &dst[i], v2, v3 );
  822. }
  823. //handle cleanup
  824. for ( ; i < count ; i++ ) {
  825. dst[i] = constant / divisor[i];
  826. }
  827. }
  828. /*
  829. ============
  830. idSIMD_AltiVec::Div
  831. dst[i] = src0[i] / src1[i];
  832. ============
  833. */
  834. void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
  835. register vector float v0, v1, v2, v3, v4, v5;
  836. //src0
  837. register vector float v0_low, v0_hi, v2_low, v2_hi;
  838. //src1
  839. register vector float v1_low, v1_hi, v3_low, v3_hi;
  840. //permute vectors
  841. register vector unsigned char permVec1, permVec2;
  842. vector unsigned char oneCharVector = (vector unsigned char)(1);
  843. int i;
  844. //handle unaligned at start
  845. for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  846. dst[i] = src0[i] / src1[i];
  847. }
  848. //calculate permute and do loads
  849. permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
  850. permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  851. v2_hi = vec_ld( 0, &src0[i] );
  852. v3_hi = vec_ld( 0, &src1[i] );
  853. //vectorize!
  854. for ( ; i+7 < count; i += 8 ) {
  855. //load source
  856. v0_low = v2_hi;
  857. v0_hi = vec_ld( 15, &src0[i] );
  858. v2_low = v0_hi;
  859. v2_hi = vec_ld( 31, &src0[i] );
  860. v1_low = v3_hi;
  861. v1_hi = vec_ld( 15, &src1[i] );
  862. v3_low = v1_hi;
  863. v3_hi = vec_ld( 31, &src1[i] );
  864. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  865. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  866. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  867. v3 = vec_perm( v3_low, v3_hi, permVec2 );
  868. v4 = Divide( v0, v1 );
  869. v5 = Divide( v2, v3 );
  870. ALIGNED_STORE2( &dst[i], v4, v5 );
  871. }
  872. //handle cleanup
  873. for ( ; i < count ; i++ ) {
  874. dst[i] = src0[i] / src1[i];
  875. }
  876. }
  877. /*
  878. ============
  879. idSIMD_AltiVec::MulAdd
  880. dst[i] += constant * src[i];
  881. ============
  882. */
  883. void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
  884. register vector float v0, v1, v2, v3, v4, v5;
  885. register vector float constVec;
  886. //src
  887. register vector float v0_low, v0_hi, v2_low, v2_hi;
  888. //permute vectors
  889. register vector unsigned char permVec1;
  890. vector unsigned char oneCharVector = (vector unsigned char)(1);
  891. int i;
  892. //handle unaligned at start
  893. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  894. dst[i] += constant * src[i];
  895. }
  896. //splat constant into a vector
  897. constVec = loadSplatUnalignedScalar( &constant );
  898. //calculate permute and do loads
  899. permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
  900. v2_hi = vec_ld( 0, &src[i] );
  901. //vectorize!
  902. for ( ; i+7 < count; i += 8 ) {
  903. v0_low = v2_hi;
  904. v0_hi = vec_ld( 15, &src[i] );
  905. v2_low = v0_hi;
  906. v2_hi = vec_ld( 31, &src[i] );
  907. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  908. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  909. // at this point, dst is known to be aligned
  910. v1 = vec_ld( 0, &dst[i] );
  911. v3 = vec_ld( 16, &dst[i] );
  912. v4 = vec_madd( constVec, v0, v1 );
  913. v5 = vec_madd( constVec, v2, v3 );
  914. ALIGNED_STORE2( &dst[i], v4, v5 );
  915. }
  916. //handle cleanup
  917. for ( ; i < count ; i++ ) {
  918. dst[i] += constant * src[i];
  919. }
  920. }
  921. /*
  922. ============
  923. idSIMD_AltiVec::MulAdd
  924. dst[i] += src0[i] * src1[i];
  925. ============
  926. */
  927. void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
  928. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  929. //src0
  930. register vector float v0_low, v0_hi, v2_low, v2_hi;
  931. //src1
  932. register vector float v1_low, v1_hi, v3_low, v3_hi;
  933. //permute vectors
  934. register vector unsigned char permVec1, permVec2;
  935. vector unsigned char oneCharVector = (vector unsigned char)(1);
  936. int i;
  937. //unaligned at start
  938. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  939. dst[i] += src0[i] * src1[i];
  940. }
  941. //calculate permute and do loads
  942. permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
  943. permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  944. v2_hi = vec_ld( 0, &src0[i] );
  945. v3_hi = vec_ld( 0, &src1[i] );
  946. //vectorize!
  947. for ( ; i+7 < count; i += 8 ) {
  948. // load sources
  949. v0_low = v2_hi;
  950. v0_hi = vec_ld( 15, &src0[i] );
  951. v2_low = v0_hi;
  952. v2_hi = vec_ld( 31, &src0[i] );
  953. v1_low = v3_hi;
  954. v1_hi = vec_ld( 15, &src1[i] );
  955. v3_low = v1_hi;
  956. v3_hi = vec_ld( 31, &src1[i] );
  957. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  958. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  959. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  960. v3 = vec_perm( v3_low, v3_hi, permVec2 );
  961. //we know dst is aligned because we handled unaligned cases
  962. //up front
  963. v4 = vec_ld( 0, &dst[i] );
  964. v5 = vec_ld( 16, &dst[i] );
  965. v6 = vec_madd( v0, v1, v4 );
  966. v7 = vec_madd( v2, v3, v5 );
  967. ALIGNED_STORE2( &dst[i], v6, v7 );
  968. }
  969. //handle cleanup
  970. for ( ; i < count ; i++ ) {
  971. dst[i] += src0[i] * src1[i];
  972. }
  973. }
  974. /*
  975. ============
  976. idSIMD_AltiVec::MulSub
  977. dst[i] -= constant * src[i];
  978. ============
  979. */
  980. void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
  981. register vector float v0, v1, v2, v3, v4, v5;
  982. register vector float constVec;
  983. //src
  984. register vector float v0_low, v0_hi, v2_low, v2_hi;
  985. //permute vectors
  986. register vector unsigned char permVec1;
  987. vector unsigned char oneCharVector = (vector unsigned char)(1);
  988. int i;
  989. //handle unaligned at start
  990. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  991. dst[i] -= constant * src[i];
  992. }
  993. //splat constant into a vector
  994. constVec = loadSplatUnalignedScalar( &constant );
  995. //calculate permute and do loads
  996. permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
  997. v2_hi = vec_ld( 0, &src[i] );
  998. //vectorize!
  999. for ( ; i+7 < count; i += 8 ) {
  1000. v0_low = v2_hi;
  1001. v0_hi = vec_ld( 15, &src[i] );
  1002. v2_low = v0_hi;
  1003. v2_hi = vec_ld( 31, &src[i] );
  1004. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  1005. v2 = vec_perm( v2_low, v2_hi, permVec1 );
1006. // we know dst will be aligned here because we already handled the preceding
1007. // unaligned cases
  1008. v1 = vec_ld( 0, &dst[i] );
  1009. v3 = vec_ld( 16, &dst[i] );
  1010. v4 = vec_nmsub( v0, constVec, v1 );
  1011. v5 = vec_nmsub( v2, constVec, v3 );
  1012. ALIGNED_STORE2( &dst[i], v4, v5 );
  1013. }
  1014. //handle cleanup
  1015. for ( ; i < count ; i++ ) {
  1016. dst[i] -= constant * src[i];
  1017. }
  1018. }
  1019. /*
  1020. ============
  1021. idSIMD_AltiVec::MulSub
  1022. dst[i] -= src0[i] * src1[i];
  1023. ============
  1024. */
  1025. void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
  1026. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1027. //src0
  1028. register vector float v0_low, v0_hi, v2_low, v2_hi;
  1029. //src1
  1030. register vector float v1_low, v1_hi, v3_low, v3_hi;
  1031. //permute vectors
  1032. register vector unsigned char permVec1, permVec2;
  1033. vector unsigned char oneCharVector = (vector unsigned char)(1);
  1034. int i;
  1035. //unaligned at start
  1036. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  1037. dst[i] -= src0[i] * src1[i];
  1038. }
  1039. //calculate permute and do loads
  1040. permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
  1041. permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  1042. v2_hi = vec_ld( 0, &src0[i] );
  1043. v3_hi = vec_ld( 0, &src1[i] );
  1044. //vectorize!
  1045. for ( ; i+7 < count; i += 8 ) {
  1046. // load sources
  1047. v0_low = v2_hi;
  1048. v0_hi = vec_ld( 15, &src0[i] );
  1049. v2_low = v0_hi;
  1050. v2_hi = vec_ld( 31, &src0[i] );
  1051. v1_low = v3_hi;
  1052. v1_hi = vec_ld( 15, &src1[i] );
  1053. v3_low = v1_hi;
  1054. v3_hi = vec_ld( 31, &src1[i] );
  1055. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  1056. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  1057. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  1058. v3 = vec_perm( v3_low, v3_hi, permVec2 );
  1059. //we know dst is aligned because we handled unaligned cases
  1060. //up front
  1061. v4 = vec_ld( 0, &dst[i] );
  1062. v5 = vec_ld( 16, &dst[i] );
  1063. v6 = vec_nmsub( v0, v1, v4 );
  1064. v7 = vec_nmsub( v2, v3, v5 );
  1065. ALIGNED_STORE2( &dst[i], v6, v7 );
  1066. }
  1067. //handle cleanup
  1068. for ( ; i < count ; i++ ) {
  1069. dst[i] -= src0[i] * src1[i];
  1070. }
  1071. }
  1072. #endif /* ENABLE_SIMPLE_MATH */
  1073. #ifdef ENABLE_DOT
  1074. /*
  1075. ============
  1076. idSIMD_AltiVec::Dot
  1077. dst[i] = constant * src[i];
  1078. ============
  1079. */
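// Eight idVec3 elements are 24 packed floats, i.e. six vector registers per
// iteration. The permute tables below gather the x, y and z components into
// separate X, Y and Z vectors (an AoS-to-SoA shuffle); the dot product is
// then three fused multiply-adds per group of four results.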
  1080. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
  1081. register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
  1082. register vector float vecX, vecY, vecZ;
  1083. vector float vecX2, vecY2, vecZ2;
  1084. const float *addr = src[0].ToFloatPtr();
  1085. float tempVal[4];
  1086. float constVal[4];
  1087. register vector float zeroVector = (vector float)(0.0);
  1088. register vector float vecConstX, vecConstY, vecConstZ;
  1089. // permute vectors
  1090. register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
  1091. register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
  1092. register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
  1093. register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
  1094. register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
  1095. register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
  1096. int i;
  1097. // for scalar cleanup, if necessary
  1098. constVal[0] = constant[0];
  1099. constVal[1] = constant[1];
  1100. constVal[2] = constant[2];
  1101. constVal[3] = 0;
  1102. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1103. vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
  1104. vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
  1105. vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
  1106. // populate const vectors
  1107. vecConstX = vec_splat( vecLd1, 0 );
  1108. vecConstY = vec_splat( vecLd1, 1 );
  1109. vecConstZ = vec_splat( vecLd1, 2 );
  1110. vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
  1111. vector float vecOld = vec_ld( 0, addr );
  1112. // handle unaligned case at beginning
  1113. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1114. dst[i] = constant * src[i];
  1115. }
  1116. for ( ; i + 7 < count; i += 8 ) {
  1117. float *vecPtr = (float*)( addr + (i*3) );
  1118. vector float v0, v1, v2, v3, v4, v5;
  1119. v0 = vecOld; //vec_ld( 0, vecPtr );
  1120. v1 = vec_ld( 15, vecPtr );
  1121. v2 = vec_ld( 31, vecPtr );
  1122. v3 = vec_ld( 47, vecPtr );
  1123. v4 = vec_ld( 63, vecPtr );
  1124. v5 = vec_ld( 79, vecPtr );
  1125. vecOld = vec_ld( 95, vecPtr );
  1126. vecLd1 = vec_perm( v0, v1, permVec );
  1127. vecLd2 = vec_perm( v1, v2, permVec );
  1128. vecLd3 = vec_perm( v2, v3, permVec );
  1129. vecLd4 = vec_perm( v3, v4, permVec );
  1130. vecLd5 = vec_perm( v4, v5, permVec );
  1131. vecLd6 = vec_perm( v5, vecOld, permVec );
  1132. // permute into X Y Z vectors
  1133. vecX = vec_perm( vecLd1, vecLd2, permX1 );
  1134. vecY = vec_perm( vecLd1, vecLd2, permY1 );
  1135. vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
  1136. vecX = vec_perm( vecX, vecLd3, permX2 );
  1137. vecY = vec_perm( vecY, vecLd3, permY2 );
  1138. vecZ = vec_perm( vecZ, vecLd3, permZ2 );
  1139. vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
  1140. vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
  1141. vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
  1142. vecX2 = vec_perm( vecX2, vecLd6, permX2 );
  1143. vecY2 = vec_perm( vecY2, vecLd6, permY2 );
  1144. vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
  1145. // do multiply
  1146. vecX = vec_madd( vecX, vecConstX, zeroVector );
  1147. vecY = vec_madd( vecY, vecConstY, vecX );
  1148. vecZ = vec_madd( vecZ, vecConstZ, vecY );
  1149. vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
  1150. vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
  1151. vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
  1152. // store out results
  1153. ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
  1154. }
  1155. //cleanup
  1156. for ( ; i < count; i++ ) {
1157. // look up what's at the address we want, cast it as a float pointer, then
1158. // dereference that pointer
  1159. tempVal[0] = *( addr + (i*3) + 0 );
  1160. tempVal[1] = *( addr + (i*3) + 1 );
  1161. tempVal[2] = *( addr + (i*3) + 2 );
  1162. dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
  1163. }
  1164. }
  1165. /*
  1166. ============
  1167. idSIMD_AltiVec::Dot
  1168. dst[i] = constant * src[i].Normal() + src[i][3];
  1169. ============
  1170. */
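// idPlane is four floats, so eight planes load as eight vectors and the
// mergeh/mergel pairs below amount to two 4x4 transposes, yielding X, Y, Z
// and D (the plane constant) lanes. Each group of four results is then
// D + x*cx + y*cy + z*cz via chained fused multiply-adds.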
  1171. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
  1172. //#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
  1173. assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
  1174. int i;
  1175. float constVal[4];
  1176. float srcVal[3];
  1177. float srcI3;
  1178. float tempVal;
  1179. vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
  1180. vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
  1181. vector float vecX, vecY, vecZ, vecI3;
  1182. vector float vecX2, vecY2, vecZ2, vecI32;
  1183. vector float vecConstX, vecConstY, vecConstZ;
  1184. constVal[0] = constant[0];
  1185. constVal[1] = constant[1];
  1186. constVal[2] = constant[2];
  1187. constVal[3] = 1;
  1188. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1189. vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
  1190. vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
  1191. vector float vecConst = vec_perm( v0, v1, constPerm );
  1192. vecConstX = vec_splat( vecConst, 0 );
  1193. vecConstY = vec_splat( vecConst, 1 );
  1194. vecConstZ = vec_splat( vecConst, 2 );
  1195. // handle unaligned case at beginning
  1196. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1197. dst[i] = constant * src[i].Normal() + src[i][3];
  1198. }
  1199. const float *addr = src[i].ToFloatPtr();
  1200. vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
  1201. vector float vecOld = vec_ld( 0, addr );
  1202. for ( ; i + 7 < count; i += 8 ) {
  1203. float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
  1204. vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1205. v0 = vecOld; //vec_ld( 0, planePtr );
  1206. v1 = vec_ld( 15, planePtr );
  1207. v2 = vec_ld( 31, planePtr );
  1208. v3 = vec_ld( 47, planePtr );
  1209. v4 = vec_ld( 63, planePtr );
  1210. v5 = vec_ld( 79, planePtr );
  1211. v6 = vec_ld( 95, planePtr );
  1212. v7 = vec_ld( 111, planePtr );
  1213. vecOld = vec_ld( 127, planePtr );
  1214. vecPlaneLd1 = vec_perm( v0, v1, permVec );
  1215. vecPlaneLd2 = vec_perm( v1, v2, permVec );
  1216. vecPlaneLd3 = vec_perm( v2, v3, permVec );
  1217. vecPlaneLd4 = vec_perm( v3, v4, permVec );
  1218. vecPlaneLd5 = vec_perm( v4, v5, permVec );
  1219. vecPlaneLd6 = vec_perm( v5, v6, permVec );
  1220. vecPlaneLd7 = vec_perm( v6, v7, permVec );
  1221. vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1222. // permute into X Y Z vectors; since this is square it's basically
1223. // a matrix transpose
  1224. v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
  1225. v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
  1226. v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
  1227. v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
  1228. vecX = vec_mergeh( v0, v1 );
  1229. vecY = vec_mergel( v0, v1 );
  1230. vecZ = vec_mergeh( v2, v3 );
  1231. vecI3 = vec_mergel( v2, v3 );
  1232. v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
  1233. v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
  1234. v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
  1235. v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
  1236. vecX2 = vec_mergeh( v4, v5 );
  1237. vecY2 = vec_mergel( v4, v5 );
  1238. vecZ2 = vec_mergeh( v6, v7 );
  1239. vecI32 = vec_mergel( v6, v7 );
  1240. // do calculation
  1241. v6 = vec_madd( vecZ, vecConstZ, vecI3 );
  1242. v5 = vec_madd( vecY, vecConstY, v6 );
  1243. v4 = vec_madd( vecX, vecConstX, v5 );
  1244. v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
  1245. v1 = vec_madd( vecY2, vecConstY, v0 );
  1246. v2 = vec_madd( vecX2, vecConstX, v1 );
  1247. // store results
  1248. ALIGNED_STORE2( &dst[i], v4, v2 );
  1249. }
  1250. // cleanup
  1251. for ( ; i < count; i++ ) {
  1252. // populate srcVal with src X Y Z
  1253. srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
  1254. srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
  1255. srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
  1256. // put src[i][3] into srcI3
  1257. srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
  1258. tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
  1259. dst[i] = tempVal + srcI3;
  1260. }
  1261. }
  1262. #ifndef DRAWVERT_PADDED
  1263. /*
  1264. ============
  1265. idSIMD_AltiVec::Dot
  1266. dst[i] = constant * src[i].xyz;
  1267. ============
  1268. */
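// With the unpadded 60-byte idDrawVert the xyz member is not 16-byte
// aligned, but its alignment repeats with a period of four vertices, so the
// four permute vectors computed below can be reused for every group of
// four. Each vertex is loaded with two vec_ld calls and merged with
// vec_perm before the usual transpose into X, Y and Z lanes.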
  1269. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
  1270. //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
  1271. // idDrawVert size is 60 bytes
  1272. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
  1273. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1274. int i;
  1275. register vector float vecConstX, vecConstY, vecConstZ;
  1276. register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
  1277. register vector float zeroVector = (vector float)(0.0);
  1278. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  1279. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1280. v0 = vec_ld( 0, constant.ToFloatPtr() );
  1281. v1 = vec_ld( 11, constant.ToFloatPtr() );
  1282. v0 = vec_perm( v0, v1, constPerm );
  1283. // permute into constant vectors
  1284. vecConstX = vec_splat( v0, 0 );
  1285. vecConstY = vec_splat( v0, 1 );
  1286. vecConstZ = vec_splat( v0, 2 );
  1287. // handle unaligned case at beginning
  1288. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1289. dst[i] = constant * src[i].xyz;
  1290. }
1291. // every fourth vertex has the same alignment, so the permute vectors only need to be computed once. Make sure there are enough vertices left here
  1292. if ( i+3 < count ) {
  1293. vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1294. vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1295. vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1296. vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1297. }
  1298. for ( ; i+3 < count; i += 4 ) {
  1299. const float *vertPtr = src[i].xyz.ToFloatPtr();
  1300. const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
  1301. const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
  1302. const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
  1303. v0 = vec_ld( 0, vertPtr );
  1304. v1 = vec_ld( 11, vertPtr );
  1305. v2 = vec_ld( 0, vertPtr2 );
  1306. v3 = vec_ld( 11, vertPtr2 );
  1307. v4 = vec_ld( 0, vertPtr3 );
  1308. v5 = vec_ld( 11, vertPtr3 );
  1309. v6 = vec_ld( 0, vertPtr4 );
  1310. v7 = vec_ld( 11, vertPtr4 );
  1311. v0 = vec_perm( v0, v1, vertPerm1 );
  1312. v2 = vec_perm( v2, v3, vertPerm2 );
  1313. v4 = vec_perm( v4, v5, vertPerm3 );
  1314. v6 = vec_perm( v6, v7, vertPerm4 );
  1315. // transpose into X Y Z vectors
  1316. v1 = vec_mergeh( v0, v4 );
  1317. v3 = vec_mergeh( v2, v6 );
  1318. v5 = vec_mergel( v0, v4 );
  1319. v7 = vec_mergel( v2, v6 );
  1320. vecSrcX1 = vec_mergeh( v1, v3 );
  1321. vecSrcY1 = vec_mergel( v1, v3 );
  1322. vecSrcZ1 = vec_mergeh( v5, v7 );
  1323. // now calculate dot product
  1324. vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
  1325. vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
  1326. vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
  1327. // store results
  1328. vec_st( vecSrcZ1, 0, &dst[i] );
  1329. }
  1330. for ( ; i < count; i++ ) {
  1331. dst[i] = constant * src[i].xyz;
  1332. }
  1333. }
  1334. #else
  1335. /*
  1336. ============
  1337. idSIMD_AltiVec::Dot
  1338. dst[i] = constant * src[i].xyz;
  1339. ============
  1340. */
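// With the padded 64-byte idDrawVert, each vertex starts on a 16-byte
// boundary (assuming the array itself is aligned), so xyz can be fetched
// with a single aligned vec_ld and no permute, which is why this variant
// skips the vertPerm setup used by the unpadded path.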
  1341. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
  1342. //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
  1343. // idDrawVert size is 64 bytes
  1344. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
  1345. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1346. int i;
  1347. register vector float vecConstX, vecConstY, vecConstZ;
  1348. register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
  1349. register vector float zeroVector = (vector float)(0.0);
  1350. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  1351. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1352. v0 = vec_ld( 0, constant.ToFloatPtr() );
  1353. v1 = vec_ld( 11, constant.ToFloatPtr() );
  1354. v0 = vec_perm( v0, v1, constPerm );
  1355. // permute into constant vectors
  1356. vecConstX = vec_splat( v0, 0 );
  1357. vecConstY = vec_splat( v0, 1 );
  1358. vecConstZ = vec_splat( v0, 2 );
  1359. // handle unaligned case at beginning
  1360. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1361. dst[i] = constant * src[i].xyz;
  1362. }
  1363. for ( ; i+3 < count; i += 4 ) {
  1364. const float *vertPtr = src[i].xyz.ToFloatPtr();
  1365. const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
  1366. const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
  1367. const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
  1368. v0 = vec_ld( 0, vertPtr );
  1369. v2 = vec_ld( 0, vertPtr2 );
  1370. v4 = vec_ld( 0, vertPtr3 );
  1371. v6 = vec_ld( 0, vertPtr4 );
  1372. // transpose into X Y Z vectors
  1373. v1 = vec_mergeh( v0, v4 );
  1374. v3 = vec_mergeh( v2, v6 );
  1375. v5 = vec_mergel( v0, v4 );
  1376. v7 = vec_mergel( v2, v6 );
  1377. vecSrcX1 = vec_mergeh( v1, v3 );
  1378. vecSrcY1 = vec_mergel( v1, v3 );
  1379. vecSrcZ1 = vec_mergeh( v5, v7 );
  1380. // now calculate dot product
  1381. vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
  1382. vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
  1383. vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
  1384. // store results
  1385. vec_st( vecSrcZ1, 0, &dst[i] );
  1386. }
  1387. for ( ; i < count; i++ ) {
  1388. dst[i] = constant * src[i].xyz;
  1389. }
  1390. }
  1391. #endif /* DRAWVERT_PADDED */
  1392. /*
  1393. ============
  1394. idSIMD_AltiVec::Dot
  1395. dst[i] = constant.Normal() * src[i] + constant[3];
  1396. ============
  1397. */
  1398. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
  1399. //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
  1400. register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
  1401. register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
  1402. register vector float zeroVector = (vector float)(0.0);
  1403. register vector float vecConstX, vecConstY, vecConstZ;
  1404. register vector float vecConst3;
  1405. idVec3 constNormal = constant.Normal();
  1406. float const3 = constant[3];
  1407. // permute vectors
  1408. register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
  1409. register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
  1410. register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
  1411. register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
  1412. register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
  1413. register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
  1414. int i;
  1415. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1416. vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
  1417. vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
  1418. vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
  1419. // populate const vec
  1420. vecConstX = vec_splat( vecLd1, 0 );
  1421. vecConstY = vec_splat( vecLd1, 1 );
  1422. vecConstZ = vec_splat( vecLd1, 2 );
  1423. // put constant to add in vector
  1424. vecConst3 = loadSplatUnalignedScalar( &const3 );
  1425. // handle unaligned case at beginning
  1426. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1427. dst[i] = constant.Normal() * src[i] + constant[3];
  1428. }
  1429. const float *addr = src[i].ToFloatPtr();
  1430. vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
  1431. vector float vecOld = vec_ld( 0, addr );
  1432. for ( ; i+7 < count; i += 8 ) {
  1433. float *vecPtr = (float*)( addr + (i*3) );
  1434. vector float v0, v1, v2, v3, v4, v5;
  1435. v0 = vecOld; //vec_ld( 0, vecPtr );
  1436. v1 = vec_ld( 15, vecPtr );
  1437. v2 = vec_ld( 31, vecPtr );
  1438. v3 = vec_ld( 47, vecPtr );
  1439. v4 = vec_ld( 63, vecPtr );
  1440. v5 = vec_ld( 79, vecPtr );
  1441. vecOld = vec_ld( 95, vecPtr );
  1442. vecLd1 = vec_perm( v0, v1, permVec );
  1443. vecLd2 = vec_perm( v1, v2, permVec );
  1444. vecLd3 = vec_perm( v2, v3, permVec );
  1445. vecLd4 = vec_perm( v3, v4, permVec );
  1446. vecLd5 = vec_perm( v4, v5, permVec );
  1447. vecLd6 = vec_perm( v5, vecOld, permVec );
  1448. // permute into X Y Z vectors
  1449. vecX = vec_perm( vecLd1, vecLd2, permX1 );
  1450. vecY = vec_perm( vecLd1, vecLd2, permY1 );
  1451. vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
  1452. vecX = vec_perm( vecX, vecLd3, permX2 );
  1453. vecY = vec_perm( vecY, vecLd3, permY2 );
  1454. vecZ = vec_perm( vecZ, vecLd3, permZ2 );
  1455. vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
  1456. vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
  1457. vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
  1458. vecX2 = vec_perm( vecX2, vecLd6, permX2 );
  1459. vecY2 = vec_perm( vecY2, vecLd6, permY2 );
  1460. vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
  1461. // calculate dot product
  1462. vecX = vec_madd( vecX, vecConstX, zeroVector );
  1463. vecY = vec_madd( vecY, vecConstY, vecX );
  1464. vecZ = vec_madd( vecZ, vecConstZ, vecY );
  1465. vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
  1466. vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
  1467. vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
  1468. // add in constant[3]
  1469. vecZ = vec_add( vecZ, vecConst3 );
  1470. vecZ2 = vec_add( vecZ2, vecConst3 );
  1471. // store out results
  1472. ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
  1473. }
  1474. //cleanup
  1475. for ( ; i < count; i++ ) {
  1476. dst[i] = constNormal * src[i] + const3;
  1477. }
  1478. }
  1479. /*
  1480. ============
  1481. idSIMD_AltiVec::Dot
  1482. dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
  1483. ============
  1484. */
  1485. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
  1486. //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
  1487. // check plane size
  1488. assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
  1489. float constVal[4];
  1490. float srcVal[4];
  1491. int i;
  1492. const float *constPtr = constant.ToFloatPtr();
  1493. register vector float vecX, vecY, vecZ, vecI3;
  1494. register vector float vecX2, vecY2, vecZ2, vecI32;
  1495. vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
  1496. vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
  1497. register vector float zeroVector = (vector float)(0.0);
  1498. register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
  1499. constVal[0] = *(constPtr);
  1500. constVal[1] = *(constPtr+1);
  1501. constVal[2] = *(constPtr+2);
  1502. constVal[3] = *(constPtr+3);
  1503. // populate const vector
  1504. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1505. vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
  1506. vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
  1507. vector float vecConst = vec_perm( v0, v1, constPerm );
  1508. vecConstX = vec_splat( vecConst, 0 );
  1509. vecConstY = vec_splat( vecConst, 1 );
  1510. vecConstZ = vec_splat( vecConst, 2 );
  1511. vecConstI3 = vec_splat( vecConst, 3 );
  1512. // handle unaligned case at beginning
  1513. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1514. dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
  1515. }
  1516. const float *srcPtr = src[i].ToFloatPtr();
  1517. vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
  1518. vector float vecOld = vec_ld( 0, srcPtr );
  1519. for ( ; i+7 < count; i += 8 ) {
  1520. float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
  1521. vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1522. v0 = vecOld; // vec_ld( 0, planePtr );
  1523. v1 = vec_ld( 15, planePtr );
  1524. v2 = vec_ld( 31, planePtr );
  1525. v3 = vec_ld( 47, planePtr );
  1526. v4 = vec_ld( 63, planePtr );
  1527. v5 = vec_ld( 79, planePtr );
  1528. v6 = vec_ld( 95, planePtr );
  1529. v7 = vec_ld( 111, planePtr );
  1530. vecOld = vec_ld( 127, planePtr );
  1531. vecPlaneLd1 = vec_perm( v0, v1, permVec );
  1532. vecPlaneLd2 = vec_perm( v1, v2, permVec );
  1533. vecPlaneLd3 = vec_perm( v2, v3, permVec );
  1534. vecPlaneLd4 = vec_perm( v3, v4, permVec );
  1535. vecPlaneLd5 = vec_perm( v4, v5, permVec );
  1536. vecPlaneLd6 = vec_perm( v5, v6, permVec );
  1537. vecPlaneLd7 = vec_perm( v6, v7, permVec );
  1538. vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1539. // permute into X Y Z vectors; since this is square it's basically
1540. // a matrix transpose
  1541. v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
  1542. v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
  1543. v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
  1544. v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
  1545. vecX = vec_mergeh( v0, v1 );
  1546. vecY = vec_mergel( v0, v1 );
  1547. vecZ = vec_mergeh( v2, v3 );
  1548. vecI3 = vec_mergel( v2, v3 );
  1549. v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
  1550. v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
  1551. v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
  1552. v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
  1553. vecX2 = vec_mergeh( v4, v5 );
  1554. vecY2 = vec_mergel( v4, v5 );
  1555. vecZ2 = vec_mergeh( v6, v7 );
  1556. vecI32 = vec_mergel( v6, v7 );
  1557. // do calculation
  1558. v4 = vec_madd( vecConstX, vecX, zeroVector );
  1559. v5 = vec_madd( vecConstY, vecY, v4 );
  1560. v6 = vec_madd( vecConstZ, vecZ, v5 );
  1561. v7 = vec_madd( vecConstI3, vecI3, v6 );
  1562. v0 = vec_madd( vecConstX, vecX2, zeroVector );
  1563. v1 = vec_madd( vecConstY, vecY2, v0 );
  1564. v2 = vec_madd( vecConstZ, vecZ2, v1 );
  1565. v3 = vec_madd( vecConstI3, vecI32, v2 );
  1566. //store result
  1567. ALIGNED_STORE2( &dst[i], v7, v3 );
  1568. }
  1569. // cleanup
  1570. for ( ; i < count; i++ ) {
  1571. //dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
  1572. srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
  1573. srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
  1574. srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
  1575. srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
  1576. dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
  1577. }
  1578. }
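// Illustrative sketch (disabled, not part of the build) of the merge-based 4x4 transpose
// used in the loop above; r0..r3 are placeholder rows, each holding one plane as x y z d:
#if 0
vector float t0 = vec_mergeh( r0, r2 );     // x0 x2 y0 y2
vector float t1 = vec_mergeh( r1, r3 );     // x1 x3 y1 y3
vector float t2 = vec_mergel( r0, r2 );     // z0 z2 d0 d2
vector float t3 = vec_mergel( r1, r3 );     // z1 z3 d1 d3
vector float xxxx = vec_mergeh( t0, t1 );   // x0 x1 x2 x3
vector float yyyy = vec_mergel( t0, t1 );   // y0 y1 y2 y3
vector float zzzz = vec_mergeh( t2, t3 );   // z0 z1 z2 z3
vector float dddd = vec_mergel( t2, t3 );   // d0 d1 d2 d3
#endif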
  1579. #ifndef DRAWVERT_PADDED
  1580. /*
  1581. ============
  1582. idSIMD_AltiVec::Dot
  1583. dst[i] = constant.Normal() * src[i].xyz + constant[3];
  1584. ============
  1585. */
  1586. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
  1587. //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
  1588. // idDrawVert size is 60 bytes
  1589. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
  1590. int i;
  1591. const float *constPtr = constant.ToFloatPtr();
  1592. const float *srcPtr = src[0].xyz.ToFloatPtr();
  1593. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1594. register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
  1595. register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
  1596. register vector float vecDest1;
  1597. register vector float zeroVector = (vector float)(0.0);
  1598. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  1599. float constVal[4];
  1600. float srcVal[3];
  1601. constVal[0] = *(constPtr+0);
  1602. constVal[1] = *(constPtr+1);
  1603. constVal[2] = *(constPtr+2);
  1604. constVal[3] = *(constPtr+3);
  1605. // populate const vec
  1606. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1607. v0 = vec_ld( 0, constant.ToFloatPtr() );
  1608. v1 = vec_ld( 15, constant.ToFloatPtr() );
  1609. v0 = vec_perm( v0, v1, constPerm );
  1610. vecConstX = vec_splat( v0, 0 );
  1611. vecConstY = vec_splat( v0, 1 );
  1612. vecConstZ = vec_splat( v0, 2 );
  1613. vecConstI3 = vec_splat( v0, 3 );
  1614. // handle unaligned case at beginning
  1615. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1616. dst[i] = constant.Normal() * src[i].xyz + constant[3];
  1617. }
1618. // every fourth vertex has the same 16-byte alignment (idDrawVert is 60 bytes), so the permute
1619. // vectors can be computed once here. Make sure at least four vertices remain so we don't run off the end of the array
  1620. if ( i+3 < count ) {
  1621. vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1622. vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1623. vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1624. vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  1625. }
  1626. for ( ; i+3 < count; i+=4 ) {
  1627. const float *vertPtr = src[i].xyz.ToFloatPtr();
  1628. const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
  1629. const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
  1630. const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
  1631. v0 = vec_ld( 0, vertPtr );
  1632. v1 = vec_ld( 11, vertPtr );
  1633. v2 = vec_ld( 0, vertPtr2 );
  1634. v3 = vec_ld( 11, vertPtr2 );
  1635. v4 = vec_ld( 0, vertPtr3 );
  1636. v5 = vec_ld( 11, vertPtr3 );
  1637. v6 = vec_ld( 0, vertPtr4 );
  1638. v7 = vec_ld( 11, vertPtr4 );
  1639. v0 = vec_perm( v0, v1, vertPerm1 );
  1640. v2 = vec_perm( v2, v3, vertPerm2 );
  1641. v4 = vec_perm( v4, v5, vertPerm3 );
  1642. v6 = vec_perm( v6, v7, vertPerm4 );
  1643. // transpose into X Y Z vectors
  1644. v1 = vec_mergeh( v0, v4 );
  1645. v3 = vec_mergeh( v2, v6 );
  1646. v5 = vec_mergel( v0, v4 );
  1647. v7 = vec_mergel( v2, v6 );
  1648. vecSrcX1 = vec_mergeh( v1, v3 );
  1649. vecSrcY1 = vec_mergel( v1, v3 );
  1650. vecSrcZ1 = vec_mergeh( v5, v7 );
  1651. // now calculate dot product
  1652. vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
  1653. vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
  1654. vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
  1655. vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
  1656. // store results
  1657. vec_st( vecDest1, 0, &dst[i] );
  1658. }
  1659. // cleanup
  1660. for ( ; i < count; i++ ) {
  1661. srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
  1662. srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
  1663. srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
  1664. // dst[i] = constant.Normal() * src[i].xyz + constant[3];
  1665. dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
  1666. dst[i] += constVal[3];
  1667. }
  1668. }
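// Illustrative sketch (disabled) of the idDrawVert load above. Because idDrawVert is 60 bytes,
// src[i].xyz and src[i+4].xyz are 240 bytes apart -- a multiple of 16 -- which is why one set of
// lvsl permute vectors can be reused for every group of four. xyz itself is only 12 bytes and may
// straddle a 16-byte line, hence the two loads; xyzPtr is a placeholder name:
#if 0
const float *xyzPtr = src[i].xyz.ToFloatPtr();
vector unsigned char p = vec_add( vec_lvsl( -1, xyzPtr ), (vector unsigned char)(1) );
vector float lo  = vec_ld( 0, xyzPtr );     // line holding the first byte of xyz
vector float hi  = vec_ld( 11, xyzPtr );    // line holding the last (12th) byte of xyz
vector float xyz = vec_perm( lo, hi, p );   // x y z ?  (fourth lane is ignored)
#endif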
  1669. #else
  1670. /*
  1671. ============
  1672. idSIMD_AltiVec::Dot
  1673. dst[i] = constant.Normal() * src[i].xyz + constant[3];
  1674. ============
  1675. */
  1676. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
  1677. //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
  1678. // idDrawVert size is 60 bytes
  1679. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
  1680. int i;
  1681. const float *constPtr = constant.ToFloatPtr();
  1682. const float *srcPtr = src[0].xyz.ToFloatPtr();
  1683. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  1684. register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
  1685. register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
  1686. register vector float vecDest1;
  1687. register vector float zeroVector = (vector float)(0.0);
  1688. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  1689. float constVal[4];
  1690. float srcVal[3];
  1691. constVal[0] = *(constPtr+0);
  1692. constVal[1] = *(constPtr+1);
  1693. constVal[2] = *(constPtr+2);
  1694. constVal[3] = *(constPtr+3);
  1695. // populate const vec
  1696. vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
  1697. v0 = vec_ld( 0, constant.ToFloatPtr() );
  1698. v1 = vec_ld( 15, constant.ToFloatPtr() );
  1699. v0 = vec_perm( v0, v1, constPerm );
  1700. vecConstX = vec_splat( v0, 0 );
  1701. vecConstY = vec_splat( v0, 1 );
  1702. vecConstZ = vec_splat( v0, 2 );
  1703. vecConstI3 = vec_splat( v0, 3 );
  1704. // handle unaligned case at beginning
  1705. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1706. dst[i] = constant.Normal() * src[i].xyz + constant[3];
  1707. }
  1708. for ( ; i+3 < count; i+=4 ) {
  1709. const float *vertPtr = src[i].xyz.ToFloatPtr();
  1710. const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
  1711. const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
  1712. const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
  1713. v0 = vec_ld( 0, vertPtr );
  1714. v2 = vec_ld( 0, vertPtr2 );
  1715. v4 = vec_ld( 0, vertPtr3 );
  1716. v6 = vec_ld( 0, vertPtr4 );
  1717. // transpose into X Y Z vectors
  1718. v1 = vec_mergeh( v0, v4 );
  1719. v3 = vec_mergeh( v2, v6 );
  1720. v5 = vec_mergel( v0, v4 );
  1721. v7 = vec_mergel( v2, v6 );
  1722. vecSrcX1 = vec_mergeh( v1, v3 );
  1723. vecSrcY1 = vec_mergel( v1, v3 );
  1724. vecSrcZ1 = vec_mergeh( v5, v7 );
  1725. // now calculate dot product
  1726. vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
  1727. vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
  1728. vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
  1729. vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
  1730. // store results
  1731. vec_st( vecDest1, 0, &dst[i] );
  1732. }
  1733. // cleanup
  1734. for ( ; i < count; i++ ) {
  1735. srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
  1736. srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
  1737. srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
  1738. // dst[i] = constant.Normal() * src[i].xyz + constant[3];
  1739. dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
  1740. dst[i] += constVal[3];
  1741. }
  1742. }
  1743. #endif /* DRAWVERT_PADDED */
  1744. /*
  1745. ============
  1746. idSIMD_AltiVec::Dot
  1747. dst[i] = src0[i] * src1[i];
  1748. ============
  1749. */
  1750. void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
  1751. //#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
  1752. int i;
  1753. float src0Val[3];
  1754. float src1Val[3];
  1755. register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
  1756. vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
  1757. register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
  1758. register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
  1759. register vector float zeroVector = (vector float)(0.0);
  1760. // permute vectors
  1761. register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
  1762. register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
  1763. register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
  1764. register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
  1765. register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
  1766. register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
  1767. // handle unaligned case at beginning
  1768. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  1769. dst[i] = src0[i] * src1[i];
  1770. }
  1771. const float *src0Ptr = src0[i].ToFloatPtr();
  1772. const float *src1Ptr = src1[i].ToFloatPtr();
  1773. vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
  1774. vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
  1775. vector float vecOld0 = vec_ld( 0, src0Ptr );
  1776. vector float vecOld1 = vec_ld( 0, src1Ptr );
  1777. for ( i = 0; i+7 < count; i += 8 ) {
  1778. float *s0Ptr = (float*)( src0Ptr + (i*3) );
  1779. float *s1Ptr = (float*)( src1Ptr + (i*3) );
  1780. vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
  1781. v0 = vecOld0;
  1782. v1 = vec_ld( 15, s0Ptr );
  1783. v2 = vec_ld( 31, s0Ptr );
  1784. v3 = vec_ld( 47, s0Ptr );
  1785. v4 = vec_ld( 63, s0Ptr );
  1786. v5 = vec_ld( 79, s0Ptr );
  1787. vecOld0 = vec_ld( 95, s0Ptr );
  1788. v6 = vecOld1;
  1789. v7 = vec_ld( 15, s1Ptr );
  1790. v8 = vec_ld( 31, s1Ptr );
  1791. v9 = vec_ld( 47, s1Ptr );
  1792. v10 = vec_ld( 63, s1Ptr );
  1793. v11 = vec_ld( 79, s1Ptr );
  1794. vecOld1 = vec_ld( 95, s1Ptr );
  1795. vecLd1 = vec_perm( v0, v1, permVec1 );
  1796. vecLd2 = vec_perm( v1, v2, permVec1 );
  1797. vecLd3 = vec_perm( v2, v3, permVec1 );
  1798. vecLd4 = vec_perm( v3, v4, permVec1 );
  1799. vecLd5 = vec_perm( v4, v5, permVec1 );
  1800. vecLd6 = vec_perm( v5, vecOld0, permVec1 );
  1801. vecLd7 = vec_perm( v6, v7, permVec2 );
  1802. vecLd8 = vec_perm( v7, v8, permVec2 );
  1803. vecLd9 = vec_perm( v8, v9, permVec2 );
  1804. vecLd10 = vec_perm( v9, v10, permVec2 );
  1805. vecLd11 = vec_perm( v10, v11, permVec2 );
  1806. vecLd12 = vec_perm( v11, vecOld1, permVec2 );
  1807. // permute into X Y Z vectors
  1808. vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
  1809. vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
  1810. vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
  1811. vecX0 = vec_perm( vecX0, vecLd3, permX2 );
  1812. vecY0 = vec_perm( vecY0, vecLd3, permY2 );
  1813. vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
  1814. vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
  1815. vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
  1816. vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
  1817. vecX02 = vec_perm( vecX02, vecLd6, permX2 );
  1818. vecY02 = vec_perm( vecY02, vecLd6, permY2 );
  1819. vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
  1820. vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
  1821. vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
  1822. vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
  1823. vecX1 = vec_perm( vecX1, vecLd9, permX2 );
  1824. vecY1 = vec_perm( vecY1, vecLd9, permY2 );
  1825. vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
  1826. vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
  1827. vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
  1828. vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
  1829. vecX12 = vec_perm( vecX12, vecLd12, permX2 );
  1830. vecY12 = vec_perm( vecY12, vecLd12, permY2 );
  1831. vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
  1832. // do multiply
  1833. vecX0 = vec_madd( vecX0, vecX1, zeroVector );
  1834. vecY0 = vec_madd( vecY0, vecY1, vecX0 );
  1835. vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
  1836. vecX02 = vec_madd( vecX02, vecX12, zeroVector );
  1837. vecY02 = vec_madd( vecY02, vecY12, vecX02 );
  1838. vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
  1839. // store out results
  1840. ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
  1841. }
  1842. // cleanup
  1843. for ( ; i < count; i++ ) {
  1844. // dst[i] = src0[i] * src1[i];
  1845. src0Val[0] = *( src0Ptr + (i*3) + 0 );
  1846. src0Val[1] = *( src0Ptr + (i*3) + 1 );
  1847. src0Val[2] = *( src0Ptr + (i*3) + 2 );
  1848. src1Val[0] = *( src1Ptr + (i*3) + 0 );
  1849. src1Val[1] = *( src1Ptr + (i*3) + 1 );
  1850. src1Val[2] = *( src1Ptr + (i*3) + 2 );
  1851. dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
  1852. }
  1853. }
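// Illustrative sketch (disabled) of the misaligned streaming-load pattern used above; ptr and n
// are placeholders. lvsl(-1, ptr) + 1 builds the realigning permute, and carrying the previous
// 16-byte line forward means each quadword of the stream is loaded exactly once per pass:
#if 0
vector unsigned char align = vec_add( vec_lvsl( -1, ptr ), (vector unsigned char)(1) );
vector float prev = vec_ld( 0, ptr );
for ( int j = 0; j + 4 <= n; j += 4 ) {
	vector float next = vec_ld( 15, ptr + j );          // line holding the last byte of ptr[j..j+3]
	vector float vals = vec_perm( prev, next, align );  // realigned view of ptr[j..j+3]
	prev = next;
	// ... use vals ...
}
#endif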
  1854. /*
  1855. ============
  1856. idSIMD_AltiVec::Dot
  1857. dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
  1858. ============
  1859. */
  1860. void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
  1861. dot = 0.0f;
  1862. register vector float v0, v1, v2, v3;
  1863. register vector float zeroVector;
  1864. register vector float runningTotal1, runningTotal2;
  1865. //src0
  1866. register vector float v0_low, v0_hi, v2_low, v2_hi;
  1867. //src1
  1868. register vector float v1_low, v1_hi, v3_low, v3_hi;
  1869. //permute vectors
  1870. register vector unsigned char permVec1, permVec2;
  1871. vector unsigned char oneCharVector = (vector unsigned char)(1);
  1872. int i = 0;
  1873. runningTotal1 = (vector float)(0.0);
  1874. runningTotal2 = (vector float)(0.0);
  1875. zeroVector = (vector float)(0.0);
  1876. if ( count >= 8 ) {
  1877. //calculate permute and do loads
  1878. permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
  1879. permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
  1880. v2_hi = vec_ld( 0, &src1[i] );
  1881. v3_hi = vec_ld( 0, &src2[i] );
  1882. //vectorize!
  1883. for ( ; i+7 < count; i += 8 ) {
  1884. //load sources
  1885. v0_low = v2_hi;
  1886. v0_hi = vec_ld( 15, &src1[i] );
  1887. v2_low = v0_hi;
  1888. v2_hi = vec_ld( 31, &src1[i] );
  1889. v1_low = v3_hi;
  1890. v1_hi = vec_ld( 15, &src2[i] );
  1891. v3_low = v1_hi;
  1892. v3_hi = vec_ld( 31, &src2[i] );
  1893. v0 = vec_perm( v0_low, v0_hi, permVec1 );
  1894. v1 = vec_perm( v1_low, v1_hi, permVec2 );
  1895. v2 = vec_perm( v2_low, v2_hi, permVec1 );
  1896. v3 = vec_perm( v3_low, v3_hi, permVec2 );
  1897. //multiply together and keep running sum
  1898. runningTotal1 = vec_madd( v0, v1, runningTotal1 );
  1899. runningTotal2 = vec_madd( v2, v3, runningTotal2 );
  1900. }
  1901. runningTotal1 = vec_add( runningTotal1, runningTotal2 );
1902. // sum across vector
  1903. v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
  1904. v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
  1905. runningTotal1 = vec_splat( v1, 0 );
  1906. vec_ste( runningTotal1, 0, &dot );
  1907. }
1908. // handle cleanup. When profiling the game we found that most calls to this function have small counts, so it
1909. // spends a lot of time in this scalar code. The scalar loop is already very fast (on the order of one timebase tick) for
1910. // counts below 50, so there is little point in adding more vector code here
  1911. for ( ; i < count ; i++ ) {
  1912. dot += src1[i] * src2[i];
  1913. }
  1914. }
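// Illustrative sketch (disabled) of the reduction above; sum and result are placeholders.
// Two rotate-and-add steps leave the total of all four lanes in every lane, and vec_ste
// then writes a single lane back to the scalar destination:
#if 0
vector float s  = sum;                                  // s0 s1 s2 s3
vector float s2 = vec_add( s, vec_sld( s, s, 8 ) );     // s0+s2  s1+s3  s2+s0  s3+s1
vector float s4 = vec_add( s2, vec_sld( s2, s2, 4 ) );  // every lane = s0+s1+s2+s3
vec_ste( vec_splat( s4, 0 ), 0, &result );
#endif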
  1915. #endif /* ENABLE_DOT */
  1916. #ifdef ENABLE_COMPARES
  1917. /*
  1918. ============
  1919. idSIMD_AltiVec::CmpGT
  1920. dst[i] = src0[i] > constant;
  1921. ============
  1922. */
  1923. void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
  1924. //#define OPER(X) dst[(X)] = src0[(X)] > constant;
  1925. register vector float v0, v1, v2, v3;
  1926. register vector bool int vr1, vr2, vr3, vr4;
  1927. register vector bool short vs1, vs2;
  1928. register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
  1929. register vector unsigned char vc1;
  1930. register vector bool char vbc1;
  1931. register vector float constVec;
  1932. register vector unsigned char oneVector = (vector unsigned char)(1);
  1933. register vector unsigned char permVec;
  1934. int i;
  1935. //handle unaligned at start
  1936. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  1937. dst[i] = src0[i] > constant;
  1938. }
  1939. //splat constant into a vector
  1940. constVec = loadSplatUnalignedScalar( &constant );
  1941. //calculate permute and do loads
  1942. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  1943. v3_hi = vec_ld( 0, &src0[i] );
  1944. //vectorize!
  1945. for ( ; i+15 < count; i += 16 ) {
  1946. // load values
  1947. v0_low = v3_hi;
  1948. v0_hi = vec_ld( 15, &src0[i] );
  1949. v1_low = v0_hi;
  1950. v1_hi = vec_ld( 31, &src0[i] );
  1951. v2_low = v1_hi;
  1952. v2_hi = vec_ld( 47, &src0[i] );
  1953. v3_low = v2_hi;
  1954. v3_hi = vec_ld( 63, &src0[i] );
  1955. //permute into the vectors we want
  1956. v0 = vec_perm( v0_low, v0_hi, permVec );
  1957. v1 = vec_perm( v1_low, v1_hi, permVec );
  1958. v2 = vec_perm( v2_low, v2_hi, permVec );
  1959. v3 = vec_perm( v3_low, v3_hi, permVec );
  1960. //do comparison
  1961. vr1 = vec_cmpgt( v0, constVec );
  1962. vr2 = vec_cmpgt( v1, constVec );
  1963. vr3 = vec_cmpgt( v2, constVec );
  1964. vr4 = vec_cmpgt( v3, constVec );
  1965. // pack results into shorts
  1966. vs1 = vec_pack(vr1, vr2);
  1967. vs2 = vec_pack(vr3, vr4);
  1968. // pack results into byte
  1969. vbc1 = vec_pack(vs1, vs2);
  1970. //AND with 1 to get true=1 not true=255
  1971. vc1 = vec_and( vbc1, oneVector );
  1972. //store results
  1973. vec_st( vc1, 0, &dst[i] );
  1974. }
  1975. //handle cleanup
  1976. for ( ; i < count ; i++ ) {
  1977. dst[i] = src0[i] > constant;
  1978. }
  1979. }
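// Illustrative sketch (disabled) of the compare/narrow step above; f0..f3 and constVec stand for
// the four source vectors and the splatted constant. Float compares yield 32-bit 0/-1 masks, two
// rounds of vec_pack narrow them to sixteen byte masks, and ANDing with 1 gives the 0/1 bytes
// that dst expects:
#if 0
vector bool int   m0 = vec_cmpgt( f0, constVec );
vector bool int   m1 = vec_cmpgt( f1, constVec );
vector bool int   m2 = vec_cmpgt( f2, constVec );
vector bool int   m3 = vec_cmpgt( f3, constVec );
vector bool short h0 = vec_pack( m0, m1 );
vector bool short h1 = vec_pack( m2, m3 );
vector bool char  b8 = vec_pack( h0, h1 );
vector unsigned char bytes = vec_and( b8, (vector unsigned char)(1) );  // 0xFF -> 1
#endif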
  1980. /*
  1981. ============
  1982. idSIMD_AltiVec::CmpGT
  1983. dst[i] |= ( src0[i] > constant ) << bitNum;
  1984. ============
  1985. */
  1986. void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  1987. //#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
  1988. // Temp vector registers
  1989. register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
  1990. register vector bool short vtbs0, vtbs1;
  1991. register vector bool char vtbc0;
  1992. register vector unsigned char vtuc0;
  1993. register vector unsigned char permVec, permVec2;
  1994. // dest vectors
  1995. register vector unsigned char vd;
  1996. // bitNum vectors
  1997. register vector unsigned char bitNumVec;
  1998. // src0 vectors
  1999. register vector float vs0, vs1, vs2, vs3;
  2000. register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
  2001. // constant vector
  2002. register vector float constVec;
2004. // vector with 1 in every byte (used for the lvsl adjust and to mask compare results to 0/1)
  2004. register vector unsigned char oneVector = (vector unsigned char)(1);
  2005. int i = 0;
  2006. //handle unaligned at start
  2007. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2008. dst[i] |= ( src0[i] > constant ) << bitNum;
  2009. }
  2010. //splat constant into a vector
  2011. constVec = loadSplatUnalignedScalar( &constant );
  2012. //bitNum is unaligned.
  2013. permVec2 = vec_lvsl( 0, &bitNum );
  2014. vtuc0 = vec_ld( 0, &bitNum );
  2015. bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
  2016. bitNumVec = vec_splat( bitNumVec, 0 );
  2017. //calculate permute and do loads
  2018. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2019. vs3_hi = vec_ld( 0, &src0[i] );
  2020. //vectorize!
  2021. for ( ; i+15 < count; i += 16 ) {
  2022. //load sources (floats)
  2023. vs0_low = vs3_hi;
  2024. vs0_hi = vec_ld( 15, &src0[i] );
  2025. vs1_low = vs0_hi;
  2026. vs1_hi = vec_ld( 31, &src0[i] );
  2027. vs2_low = vs1_hi;
  2028. vs2_hi = vec_ld( 47, &src0[i] );
  2029. vs3_low = vs2_hi;
  2030. vs3_hi = vec_ld( 63, &src0[i] );
  2031. //permute into the vectors we want
  2032. vs0 = vec_perm( vs0_low, vs0_hi, permVec );
  2033. vs1 = vec_perm( vs1_low, vs1_hi, permVec );
  2034. vs2 = vec_perm( vs2_low, vs2_hi, permVec );
  2035. vs3 = vec_perm( vs3_low, vs3_hi, permVec );
  2036. //load dest (bytes) as unsigned char
  2037. vd = vec_ld( 0, &dst[i] );
  2038. // do comparison and get bool int result
  2039. vtbi0 = vec_cmpgt( vs0, constVec );
  2040. vtbi1 = vec_cmpgt( vs1, constVec );
  2041. vtbi2 = vec_cmpgt( vs2, constVec );
  2042. vtbi3 = vec_cmpgt( vs3, constVec );
  2043. // pack results into shorts
  2044. vtbs0 = vec_pack(vtbi0, vtbi1);
  2045. vtbs1 = vec_pack(vtbi2, vtbi3);
  2046. // pack results into byte
  2047. vtbc0 = vec_pack(vtbs0, vtbs1);
  2048. //and with 1 to get true=1 instead of true=255
  2049. vtuc0 = vec_and(vtbc0, oneVector);
  2050. vtuc0 = vec_sl(vtuc0, bitNumVec );
  2051. //or with original
  2052. vd = vec_or( vd, vtuc0 );
  2053. vec_st( vd, 0, &dst[i] );
  2054. }
  2055. //handle cleanup
  2056. for ( ; i < count ; i++ ) {
  2057. dst[i] |= ( src0[i] > constant ) << bitNum;
  2058. }
  2059. }
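// Illustrative sketch (disabled) of the bitNum handling above; resultBytes stands for the 0/1
// compare bytes produced earlier in the loop. The byte is loaded from the stack, rotated into
// lane 0 with lvsl, splatted to all lanes, then used as a per-byte shift count so each result
// lands on bit bitNum before being ORed into dst:
#if 0
vector unsigned char raw   = vec_ld( 0, &bitNum );
vector unsigned char rot   = vec_perm( raw, raw, vec_lvsl( 0, &bitNum ) );
vector unsigned char shift = vec_splat( rot, 0 );                  // bitNum in every byte
vector unsigned char bits  = vec_sl( resultBytes, shift );         // 0/1 -> 0/(1<<bitNum)
vector unsigned char out   = vec_or( vec_ld( 0, &dst[i] ), bits ); // merge with existing bits
vec_st( out, 0, &dst[i] );
#endif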
  2060. /*
  2061. ============
  2062. idSIMD_AltiVec::CmpGE
  2063. dst[i] = src0[i] >= constant;
  2064. ============
  2065. */
  2066. void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
  2067. register vector float v0, v1, v2, v3;
  2068. register vector bool int vr1, vr2, vr3, vr4;
  2069. register vector bool short vs1, vs2;
  2070. register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
  2071. register vector unsigned char vc1;
  2072. register vector bool char vbc1;
  2073. register vector float constVec;
  2074. register vector unsigned char oneVector = (vector unsigned char)(1);
  2075. register vector unsigned char permVec;
  2076. int i = 0;
  2077. //handle unaligned at start
  2078. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2079. dst[i] = src0[i] >= constant;
  2080. }
  2081. //splat constant into a vector
  2082. constVec = loadSplatUnalignedScalar( &constant );
  2083. //calculate permute and do loads
  2084. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2085. v3_hi = vec_ld( 0, &src0[i] );
  2086. //vectorize!
  2087. for ( ; i+15 < count; i += 16 ) {
  2088. // load values
  2089. v0_low = v3_hi;
  2090. v0_hi = vec_ld( 15, &src0[i] );
  2091. v1_low = v0_hi;
  2092. v1_hi = vec_ld( 31, &src0[i] );
  2093. v2_low = v1_hi;
  2094. v2_hi = vec_ld( 47, &src0[i] );
  2095. v3_low = v2_hi;
  2096. v3_hi = vec_ld( 63, &src0[i] );
  2097. //permute into the vectors we want
  2098. v0 = vec_perm( v0_low, v0_hi, permVec );
  2099. v1 = vec_perm( v1_low, v1_hi, permVec );
  2100. v2 = vec_perm( v2_low, v2_hi, permVec );
  2101. v3 = vec_perm( v3_low, v3_hi, permVec );
  2102. //do comparison
  2103. vr1 = vec_cmpge( v0, constVec );
  2104. vr2 = vec_cmpge( v1, constVec );
  2105. vr3 = vec_cmpge( v2, constVec );
  2106. vr4 = vec_cmpge( v3, constVec );
  2107. // pack results into shorts
  2108. vs1 = vec_pack(vr1, vr2);
  2109. vs2 = vec_pack(vr3, vr4);
  2110. // pack results into byte
  2111. vbc1 = vec_pack(vs1, vs2);
  2112. //AND with 1 to get true=1 not true=255
  2113. vc1 = vec_and( vbc1, oneVector );
  2114. //store results
  2115. vec_st( vc1, 0, &dst[i] );
  2116. }
  2117. //handle cleanup
  2118. for ( ; i < count ; i++ ) {
  2119. dst[i] = src0[i] >= constant;
  2120. }
  2121. }
  2122. /*
  2123. ============
  2124. idSIMD_AltiVec::CmpGE
  2125. dst[i] |= ( src0[i] >= constant ) << bitNum;
  2126. ============
  2127. */
  2128. void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  2129. register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
  2130. register vector bool short vtbs0, vtbs1;
  2131. register vector bool char vtbc0;
  2132. register vector unsigned char vtuc0;
  2133. register vector unsigned char permVec, permVec2;
  2134. // dest vectors
  2135. register vector unsigned char vd;
  2136. // bitNum vectors
  2137. register vector unsigned char bitNumVec;
  2138. // src0 vectors
  2139. register vector float vs0, vs1, vs2, vs3;
  2140. register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
  2141. // constant vector
  2142. register vector float constVec;
2144. // vector with 1 in every byte (used for the lvsl adjust and to mask compare results to 0/1)
  2144. register vector unsigned char oneVector = (vector unsigned char)(1);
  2145. int i = 0;
  2146. //handle unaligned at start
  2147. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2148. dst[i] |= ( src0[i] >= constant ) << bitNum;
  2149. }
  2150. //splat constant into a vector
  2151. constVec = loadSplatUnalignedScalar( &constant );
  2152. //bitNum is unaligned.
  2153. permVec2 = vec_lvsl( 0, &bitNum );
  2154. vtuc0 = vec_ld( 0, &bitNum );
  2155. bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
  2156. bitNumVec = vec_splat( bitNumVec, 0 );
  2157. //calculate permute and do loads
  2158. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2159. vs3_hi = vec_ld( 0, &src0[i] );
  2160. //vectorize!
  2161. for ( ; i+15 < count; i += 16 ) {
  2162. //load sources (floats)
  2163. vs0_low = vs3_hi;
  2164. vs0_hi = vec_ld( 15, &src0[i] );
  2165. vs1_low = vs0_hi;
  2166. vs1_hi = vec_ld( 31, &src0[i] );
  2167. vs2_low = vs1_hi;
  2168. vs2_hi = vec_ld( 47, &src0[i] );
  2169. vs3_low = vs2_hi;
  2170. vs3_hi = vec_ld( 63, &src0[i] );
  2171. //permute into the vectors we want
  2172. vs0 = vec_perm( vs0_low, vs0_hi, permVec );
  2173. vs1 = vec_perm( vs1_low, vs1_hi, permVec );
  2174. vs2 = vec_perm( vs2_low, vs2_hi, permVec );
  2175. vs3 = vec_perm( vs3_low, vs3_hi, permVec );
  2176. //load dest (bytes) as unsigned char
  2177. vd = vec_ld( 0, &dst[i] );
  2178. // do comparison and get bool int result
  2179. vtbi0 = vec_cmpge( vs0, constVec );
  2180. vtbi1 = vec_cmpge( vs1, constVec );
  2181. vtbi2 = vec_cmpge( vs2, constVec );
  2182. vtbi3 = vec_cmpge( vs3, constVec );
  2183. // pack results into shorts
  2184. vtbs0 = vec_pack(vtbi0, vtbi1);
  2185. vtbs1 = vec_pack(vtbi2, vtbi3);
  2186. // pack results into byte
  2187. vtbc0 = vec_pack(vtbs0, vtbs1);
2188. //AND with 1 to get true=1 instead of true=255
  2189. vtuc0 = vec_and(vtbc0, oneVector);
  2190. vtuc0 = vec_sl(vtuc0, bitNumVec );
  2191. //or with original
  2192. vd = vec_or( vd, vtuc0 );
  2193. vec_st( vd, 0, &dst[i] );
  2194. }
  2195. //handle cleanup
  2196. for ( ; i < count ; i++ ) {
  2197. dst[i] |= ( src0[i] >= constant ) << bitNum;
  2198. }
  2199. }
  2200. /*
  2201. ============
  2202. idSIMD_AltiVec::CmpLT
  2203. dst[i] = src0[i] < constant;
  2204. ============
  2205. */
  2206. void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
  2207. //#define OPER(X) dst[(X)] = src0[(X)] < constant;
  2208. register vector float v0, v1, v2, v3;
  2209. register vector bool int vr1, vr2, vr3, vr4;
  2210. register vector bool short vs1, vs2;
  2211. register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
  2212. register vector unsigned char vc1;
  2213. register vector bool char vbc1;
  2214. register vector float constVec;
  2215. register vector unsigned char oneVector = (vector unsigned char)(1);
  2216. register vector unsigned char permVec;
  2217. int i = 0;
  2218. //handle unaligned at start
  2219. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2220. dst[i] = src0[i] < constant;
  2221. }
  2222. //splat constant into a vector
  2223. constVec = loadSplatUnalignedScalar( &constant );
  2224. //calculate permute and do loads
  2225. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2226. v3_hi = vec_ld( 0, &src0[i] );
  2227. //vectorize!
  2228. for ( ; i+15 < count; i += 16 ) {
  2229. // load values
  2230. v0_low = v3_hi;
  2231. v0_hi = vec_ld( 15, &src0[i] );
  2232. v1_low = v0_hi;
  2233. v1_hi = vec_ld( 31, &src0[i] );
  2234. v2_low = v1_hi;
  2235. v2_hi = vec_ld( 47, &src0[i] );
  2236. v3_low = v2_hi;
  2237. v3_hi = vec_ld( 63, &src0[i] );
  2238. //permute into the vectors we want
  2239. v0 = vec_perm( v0_low, v0_hi, permVec );
  2240. v1 = vec_perm( v1_low, v1_hi, permVec );
  2241. v2 = vec_perm( v2_low, v2_hi, permVec );
  2242. v3 = vec_perm( v3_low, v3_hi, permVec );
  2243. //do comparison
  2244. vr1 = vec_cmplt( v0, constVec );
  2245. vr2 = vec_cmplt( v1, constVec );
  2246. vr3 = vec_cmplt( v2, constVec );
  2247. vr4 = vec_cmplt( v3, constVec );
  2248. // pack results into shorts
  2249. vs1 = vec_pack(vr1, vr2);
  2250. vs2 = vec_pack(vr3, vr4);
  2251. // pack results into byte
  2252. vbc1 = vec_pack(vs1, vs2);
  2253. //AND with 1 to get true=1 not true=255
  2254. vc1 = vec_and( vbc1, oneVector );
  2255. //store results
  2256. vec_st( vc1, 0, &dst[i] );
  2257. }
  2258. //handle cleanup
  2259. for ( ; i < count ; i++ ) {
  2260. dst[i] = src0[i] < constant;
  2261. }
  2262. }
  2263. /*
  2264. ============
  2265. idSIMD_AltiVec::CmpLT
  2266. dst[i] |= ( src0[i] < constant ) << bitNum;
  2267. ============
  2268. */
  2269. void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  2270. //#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
  2271. register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
  2272. register vector bool short vtbs0, vtbs1;
  2273. register vector bool char vtbc0;
  2274. register vector unsigned char vtuc0;
  2275. register vector unsigned char permVec, permVec2;
  2276. // dest vectors
  2277. register vector unsigned char vd;
  2278. // bitNum vectors
  2279. register vector unsigned char bitNumVec;
  2280. // src0 vectors
  2281. register vector float vs0, vs1, vs2, vs3;
  2282. register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
  2283. // constant vector
  2284. register vector float constVec;
2286. // vector with 1 in every byte (used for the lvsl adjust and to mask compare results to 0/1)
  2286. register vector unsigned char oneVector = (vector unsigned char)(1);
  2287. int i = 0;
  2288. //handle unaligned at start
  2289. for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2290. dst[i] |= ( src0[i] < constant ) << bitNum;
  2291. }
  2292. //splat constant into a vector
  2293. constVec = loadSplatUnalignedScalar( &constant );
  2294. //bitNum is unaligned.
  2295. permVec2 = vec_lvsl( 0, &bitNum );
  2296. vtuc0 = vec_ld( 0, &bitNum );
  2297. bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
  2298. bitNumVec = vec_splat( bitNumVec, 0 );
  2299. //calculate permute and do loads
  2300. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2301. vs3_hi = vec_ld( 0, &src0[i] );
  2302. //vectorize!
  2303. for ( ; i+15 < count; i += 16 ) {
  2304. //load sources (floats)
  2305. vs0_low = vs3_hi;
  2306. vs0_hi = vec_ld( 15, &src0[i] );
  2307. vs1_low = vs0_hi;
  2308. vs1_hi = vec_ld( 31, &src0[i] );
  2309. vs2_low = vs1_hi;
  2310. vs2_hi = vec_ld( 47, &src0[i] );
  2311. vs3_low = vs2_hi;
  2312. vs3_hi = vec_ld( 63, &src0[i] );
  2313. //permute into the vectors we want
  2314. vs0 = vec_perm( vs0_low, vs0_hi, permVec );
  2315. vs1 = vec_perm( vs1_low, vs1_hi, permVec );
  2316. vs2 = vec_perm( vs2_low, vs2_hi, permVec );
  2317. vs3 = vec_perm( vs3_low, vs3_hi, permVec );
  2318. //load dest (bytes) as unsigned char
  2319. vd = vec_ld( 0, &dst[i] );
  2320. // do comparison and get bool int result
  2321. vtbi0 = vec_cmplt( vs0, constVec );
  2322. vtbi1 = vec_cmplt( vs1, constVec );
  2323. vtbi2 = vec_cmplt( vs2, constVec );
  2324. vtbi3 = vec_cmplt( vs3, constVec );
  2325. // pack results into shorts
  2326. vtbs0 = vec_pack(vtbi0, vtbi1);
  2327. vtbs1 = vec_pack(vtbi2, vtbi3);
  2328. // pack results into byte
  2329. vtbc0 = vec_pack(vtbs0, vtbs1);
2330. //AND with 1 to get true=1 instead of true=255
  2331. vtuc0 = vec_and(vtbc0, oneVector);
  2332. vtuc0 = vec_sl(vtuc0, bitNumVec );
  2333. //or with original
  2334. vd = vec_or( vd, vtuc0 );
  2335. vec_st( vd, 0, &dst[i] );
  2336. }
  2337. //handle cleanup
  2338. for ( ; i < count ; i++ ) {
  2339. dst[i] |= ( src0[i] < constant ) << bitNum;
  2340. }
  2341. }
  2342. //#endif
  2343. /*
  2344. ============
  2345. idSIMD_AltiVec::CmpLE
  2346. dst[i] = src0[i] <= constant;
  2347. ============
  2348. */
  2349. void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
  2350. //#define OPER(X) dst[(X)] = src0[(X)] <= constant;
  2351. register vector float v0, v1, v2, v3;
  2352. register vector bool int vr1, vr2, vr3, vr4;
  2353. register vector bool short vs1, vs2;
  2354. register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
  2355. register vector unsigned char vc1;
  2356. register vector bool char vbc1;
  2357. register vector float constVec;
  2358. register vector unsigned char oneVector = (vector unsigned char)(1);
  2359. register vector unsigned char permVec;
  2360. int i = 0;
  2361. //handle unaligned at start
  2362. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2363. dst[i] = src0[i] <= constant;
  2364. }
  2365. //splat constant into a vector
  2366. constVec = loadSplatUnalignedScalar( &constant );
  2367. //calculate permute and do loads
  2368. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2369. v3_hi = vec_ld( 0, &src0[i] );
  2370. //vectorize!
  2371. for ( ; i+15 < count; i += 16 ) {
  2372. // load values
  2373. v0_low = v3_hi;
  2374. v0_hi = vec_ld( 15, &src0[i] );
  2375. v1_low = v0_hi;
  2376. v1_hi = vec_ld( 31, &src0[i] );
  2377. v2_low = v1_hi;
  2378. v2_hi = vec_ld( 47, &src0[i] );
  2379. v3_low = v2_hi;
  2380. v3_hi = vec_ld( 63, &src0[i] );
  2381. //permute into the vectors we want
  2382. v0 = vec_perm( v0_low, v0_hi, permVec );
  2383. v1 = vec_perm( v1_low, v1_hi, permVec );
  2384. v2 = vec_perm( v2_low, v2_hi, permVec );
  2385. v3 = vec_perm( v3_low, v3_hi, permVec );
  2386. //do comparison
  2387. vr1 = vec_cmple( v0, constVec );
  2388. vr2 = vec_cmple( v1, constVec );
  2389. vr3 = vec_cmple( v2, constVec );
  2390. vr4 = vec_cmple( v3, constVec );
  2391. // pack results into shorts
  2392. vs1 = vec_pack(vr1, vr2);
  2393. vs2 = vec_pack(vr3, vr4);
  2394. // pack results into byte
  2395. vbc1 = vec_pack(vs1, vs2);
  2396. //AND with 1 to get true=1 not true=255
  2397. vc1 = vec_and( vbc1, oneVector );
  2398. //store results
  2399. vec_st( vc1, 0, &dst[i] );
  2400. }
  2401. //handle cleanup
  2402. for ( ; i < count ; i++ ) {
  2403. dst[i] = src0[i] <= constant;
  2404. }
  2405. }
  2406. /*
  2407. ============
  2408. idSIMD_AltiVec::CmpLE
  2409. dst[i] |= ( src0[i] <= constant ) << bitNum;
  2410. ============
  2411. */
  2412. void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  2413. //#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
  2414. register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
  2415. register vector bool short vtbs0, vtbs1;
  2416. register vector bool char vtbc0;
  2417. register vector unsigned char vtuc0;
  2418. register vector unsigned char permVec, permVec2;
  2419. // dest vectors
  2420. register vector unsigned char vd;
  2421. // bitNum vectors
  2422. register vector unsigned char bitNumVec;
  2423. // src0 vectors
  2424. register vector float vs0, vs1, vs2, vs3;
  2425. register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
  2426. // constant vector
  2427. register vector float constVec;
2429. // vector with 1 in every byte (used for the lvsl adjust and to mask compare results to 0/1)
  2429. register vector unsigned char oneVector = (vector unsigned char)(1);
  2430. int i = 0;
  2431. //handle unaligned at start
  2432. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
  2433. dst[i] |= ( src0[i] <= constant ) << bitNum;
  2434. }
  2435. //splat constant into a vector
  2436. constVec = loadSplatUnalignedScalar( &constant );
  2437. //bitNum is unaligned.
  2438. permVec2 = vec_lvsl( 0, &bitNum );
  2439. vtuc0 = vec_ld( 0, &bitNum );
  2440. bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
  2441. bitNumVec = vec_splat( bitNumVec, 0 );
  2442. //calculate permute and do loads
  2443. permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
  2444. vs3_hi = vec_ld( 0, &src0[i] );
  2445. //vectorize!
  2446. for ( ; i+15 < count; i += 16 ) {
  2447. //load sources (floats)
  2448. vs0_low = vs3_hi;
  2449. vs0_hi = vec_ld( 15, &src0[i] );
  2450. vs1_low = vs0_hi;
  2451. vs1_hi = vec_ld( 31, &src0[i] );
  2452. vs2_low = vs1_hi;
  2453. vs2_hi = vec_ld( 47, &src0[i] );
  2454. vs3_low = vs2_hi;
  2455. vs3_hi = vec_ld( 63, &src0[i] );
  2456. //permute into the vectors we want
  2457. vs0 = vec_perm( vs0_low, vs0_hi, permVec );
  2458. vs1 = vec_perm( vs1_low, vs1_hi, permVec );
  2459. vs2 = vec_perm( vs2_low, vs2_hi, permVec );
  2460. vs3 = vec_perm( vs3_low, vs3_hi, permVec );
  2461. //load dest (bytes) as unsigned char
  2462. vd = vec_ld( 0, &dst[i] );
  2463. // do comparison and get bool int result
  2464. vtbi0 = vec_cmple( vs0, constVec );
  2465. vtbi1 = vec_cmple( vs1, constVec );
  2466. vtbi2 = vec_cmple( vs2, constVec );
  2467. vtbi3 = vec_cmple( vs3, constVec );
  2468. // pack results into shorts
  2469. vtbs0 = vec_pack(vtbi0, vtbi1);
  2470. vtbs1 = vec_pack(vtbi2, vtbi3);
  2471. // pack results into byte
  2472. vtbc0 = vec_pack(vtbs0, vtbs1);
2473. //AND with 1 to get true=1 instead of true=255
  2474. vtuc0 = vec_and(vtbc0, oneVector);
  2475. vtuc0 = vec_sl(vtuc0, bitNumVec );
  2476. //or with original
  2477. vd = vec_or( vd, vtuc0 );
  2478. vec_st( vd, 0, &dst[i] );
  2479. }
  2480. //handle cleanup
  2481. for ( ; i < count ; i++ ) {
  2482. dst[i] |= ( src0[i] <= constant ) << bitNum;
  2483. }
  2484. }
  2485. #endif /* ENABLE_COMPARES */
  2486. #ifdef ENABLE_MINMAX
  2487. /*
  2488. ============
  2489. idSIMD_AltiVec::MinMax
  2490. ============
  2491. */
  2492. void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
  2493. min = idMath::INFINITY; max = -idMath::INFINITY;
  2494. //#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
  2495. register vector float v0, v1, v2, v3;
  2496. register vector float maxVec, minVec, tempMin, tempMax;
  2497. register vector unsigned char permVec;
  2498. register vector float v0_low, v0_hi, v1_low, v1_hi;
  2499. vector unsigned char oneCharVector = (vector unsigned char)(1);
  2500. int i = 0;
  2501. if ( count >= 4 ) {
  2502. //calculate permute and do first load to
  2503. //get a starting point for min and max
  2504. permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
  2505. v1_hi = vec_ld( 0, &src[0] );
  2506. maxVec = loadSplatUnalignedScalar( &max );
  2507. minVec = loadSplatUnalignedScalar( &min );
  2508. //vectorize!
  2509. for ( ; i+7 < count; i += 8 ) {
  2510. //load sources
  2511. v0_low = v1_hi;
  2512. v0_hi = vec_ld( 15, &src[i] );
  2513. v1_low = v0_hi;
  2514. v1_hi = vec_ld( 31, &src[i] );
  2515. v0 = vec_perm( v0_low, v0_hi, permVec );
  2516. v1 = vec_perm( v1_low, v1_hi, permVec );
  2517. // minimum
  2518. v2 = vec_min( v0, v1 );
  2519. minVec = vec_min( minVec, v2 );
  2520. // maximum
  2521. v3 = vec_max( v0, v1 );
  2522. maxVec = vec_max( maxVec, v3 );
  2523. }
2524. // minVec and maxVec hold per-lane minima and maxima from the array; now
2525. // reduce across the lanes to find the single overall min and max
  2526. tempMin = minVec;
  2527. tempMax = maxVec;
  2528. // rotate vector around and compare to itself to find the real min/max
  2529. tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
  2530. tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
  2531. tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
  2532. tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
  2533. minVec = vec_splat( tempMin, 0 );
  2534. maxVec = vec_splat( tempMax, 0 );
  2535. vec_ste( minVec, 0, &min );
  2536. vec_ste( maxVec, 0, &max );
  2537. }
  2538. //cleanup
  2539. for ( ; i < count; i++ ) {
  2540. if ( src[i] < min ) {
  2541. min = src[i];
  2542. }
  2543. if ( src[i] > max ) {
  2544. max = src[i];
  2545. }
  2546. }
  2547. }
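// Illustrative sketch (disabled) of the lane folding above; partial stands for the per-lane
// partial results. It is the same rotate trick as the dot-product reduction: after folding by
// 8 and then 4 bytes every lane holds the overall minimum (vec_max works the same way for max):
#if 0
vector float m = vec_min( partial, vec_sld( partial, partial, 8 ) );  // pairwise fold
m = vec_min( m, vec_sld( m, m, 4 ) );                                 // all lanes = overall min
vec_ste( vec_splat( m, 0 ), 0, &min );
#endif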
  2548. /*
  2549. ============
  2550. idSIMD_AltiVec::MinMax
  2551. ============
  2552. */
  2553. void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
  2554. min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
  2555. //#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
  2556. idVec2 v;
  2557. int i = 0;
  2558. int j;
  2559. const float *srcPtr = src[0].ToFloatPtr();
  2560. register vector float vecLd1, vecLd2, vecLd3, vecLd4;
  2561. register vector float vecMin, vecMax;
  2562. register vector float v0, v1, v2, v3;
  2563. if ( count > 4 ) {
  2564. vecMin = (vector float)(FLT_MAX);
  2565. vecMax = (vector float)(FLT_MIN);
  2566. vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
  2567. vector float vecOld = vec_ld( 0, srcPtr );
  2568. for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
  2569. // load data
  2570. float *vecPtr = (float*)( srcPtr + (j*4) );
  2571. vector float v0, v1, v2, v3;
  2572. v0 = vecOld;
  2573. v1 = vec_ld( 15, vecPtr );
  2574. v2 = vec_ld( 31, vecPtr );
  2575. v3 = vec_ld( 47, vecPtr );
  2576. vecOld = vec_ld( 63, vecPtr );
  2577. vecLd1 = vec_perm( v0, v1, permVec );
  2578. vecLd2 = vec_perm( v1, v2, permVec );
  2579. vecLd3 = vec_perm( v2, v3, permVec );
  2580. vecLd4 = vec_perm( v3, vecOld, permVec );
2581. // each of these vectors holds two idVec2 elements, laid out as | X Y X Y |,
2582. // so X components stay in lanes 0/2 and Y components in lanes 1/3
  2583. v0 = vec_min( vecLd1, vecLd2 );
  2584. v1 = vec_min( vecLd3, vecLd4 );
  2585. v0 = vec_min( v0, v1 );
  2586. v2 = vec_max( vecLd1, vecLd2 );
  2587. v3 = vec_max( vecLd3, vecLd4 );
  2588. v2 = vec_max( v2, v3 );
2589. // since it's always X Y X Y we don't have to re-merge each time; we can wait
2590. // until the end
  2591. vecMin = vec_min( v0, vecMin );
  2592. vecMax = vec_max( v2, vecMax );
  2593. }
  2594. vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
  2595. vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
  2596. v0 = vec_splat( vecMin, 0 );
  2597. v1 = vec_splat( vecMin, 1 );
  2598. v2 = vec_splat( vecMax, 0 );
  2599. v3 = vec_splat( vecMax, 1 );
  2600. vec_ste( v0, 0, &min[0] );
  2601. vec_ste( v1, 0, &min[1] );
  2602. vec_ste( v2, 0, &max[0] );
  2603. vec_ste( v3, 0, &max[1] );
  2604. }
  2605. // cleanup
  2606. for ( ; i < count; i++ ) {
  2607. v = src[i];
  2608. if ( v[0] < min[0] ) {
  2609. min[0] = v[0];
  2610. }
  2611. if ( v[0] > max[0] ) {
  2612. max[0] = v[0];
  2613. }
  2614. if ( v[1] < min[1] ) {
  2615. min[1] = v[1];
  2616. }
  2617. if ( v[1] > max[1] ) {
  2618. max[1] = v[1];
  2619. }
  2620. }
  2621. }
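// Illustrative sketch (disabled) of why no per-component shuffle is needed above. With the
// | X Y X Y | layout, vec_min/vec_max already compare X lanes against X lanes and Y lanes
// against Y lanes, so one 8-byte rotate folds the two pairs together at the end:
#if 0
vector float folded = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) ); // minX minY minX minY
vec_ste( vec_splat( folded, 0 ), 0, &min[0] );   // overall min X
vec_ste( vec_splat( folded, 1 ), 0, &min[1] );   // overall min Y
#endif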
  2622. /*
  2623. ============
  2624. idSIMD_AltiVec::MinMax
  2625. ============
  2626. */
  2627. void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
  2628. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  2629. //#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
  2630. int i = 0;
  2631. const float *srcPtr = src[0].ToFloatPtr();
  2632. idVec3 v;
  2633. register vector float vecLd1, vecLd2, vecLd3;
  2634. register vector float vecMin, vecMax;
  2635. register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
  2636. register vector float vecMin1, vecMin2, vecMax1, vecMax2;
  2637. if ( count >= 4 ) {
  2638. vecMin = (vector float)(FLT_MAX);
  2639. vecMax = (vector float)(FLT_MIN);
  2640. vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
  2641. vector float vecOld = vec_ld( 0, srcPtr );
  2642. // 4 elements at a time
  2643. for ( ; i+3 < count; i += 4 ) {
  2644. float *vecPtr = (float*)( srcPtr + (i*3) );
  2645. vector float v0, v1, v2;
  2646. v0 = vecOld;
  2647. v1 = vec_ld( 15, vecPtr );
  2648. v2 = vec_ld( 31, vecPtr );
  2649. vecOld = vec_ld( 47, vecPtr );
  2650. vecLd1 = vec_perm( v0, v1, permVec );
  2651. vecLd2 = vec_perm( v1, v2, permVec );
  2652. vecLd3 = vec_perm( v2, vecOld, permVec );
2653. // put each idVec3 into its own vector as X Y Z (fourth lane is junk and is never stored out)
  2654. vecSrc1 = vecLd1;
  2655. vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
  2656. vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
  2657. vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
  2658. // do min and max
  2659. vecMin1 = vec_min( vecSrc1, vecSrc2 );
  2660. vecMin2 = vec_min( vecSrc3, vecSrc4 );
  2661. vecMin1 = vec_min( vecMin1, vecMin2 );
  2662. vecMin = vec_min( vecMin, vecMin1 );
  2663. vecMax1 = vec_max( vecSrc1, vecSrc2 );
  2664. vecMax2 = vec_max( vecSrc3, vecSrc4 );
  2665. vecMax1 = vec_max( vecMax1, vecMax2 );
  2666. vecMax = vec_max( vecMax1, vecMax );
  2667. }
  2668. // store results
  2669. vector float v0, v1, v2, v3, v4, v5;
  2670. v0 = vec_splat( vecMin, 0 );
  2671. v1 = vec_splat( vecMin, 1 );
  2672. v2 = vec_splat( vecMin, 2 );
  2673. v3 = vec_splat( vecMax, 0 );
  2674. v4 = vec_splat( vecMax, 1 );
  2675. v5 = vec_splat( vecMax, 2 );
  2676. vec_ste( v0, 0, &min[0] );
  2677. vec_ste( v1, 0, &min[1] );
  2678. vec_ste( v2, 0, &min[2] );
  2679. vec_ste( v3, 0, &max[0] );
  2680. vec_ste( v4, 0, &max[1] );
  2681. vec_ste( v5, 0, &max[2] );
  2682. }
  2683. // cleanup
  2684. for ( ; i < count; i ++ ) {
  2685. v = src[i];
  2686. if ( v[0] < min[0] ) {
  2687. min[0] = v[0];
  2688. }
  2689. if ( v[0] > max[0] ) {
  2690. max[0] = v[0];
  2691. }
  2692. if ( v[1] < min[1] ) {
  2693. min[1] = v[1];
  2694. }
  2695. if ( v[1] > max[1] ) {
  2696. max[1] = v[1];
  2697. }
  2698. if ( v[2] < min[2] ) {
  2699. min[2] = v[2];
  2700. }
  2701. if ( v[2] > max[2] ) {
  2702. max[2] = v[2];
  2703. }
  2704. }
  2705. }
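// Illustrative sketch (disabled) of the vec_sld extraction above; ld1..ld3 stand for the three
// realigned 16-byte loads holding four packed idVec3s. Each vertex ends up as X Y Z in lanes
// 0-2; lane 3 is junk, but only lanes 0-2 are ever stored out:
#if 0
vector float e0 = ld1;                      // x0 y0 z0 (x1)
vector float e1 = vec_sld( ld1, ld2, 12 );  // x1 y1 z1 (x2)
vector float e2 = vec_sld( ld2, ld3, 8 );   // x2 y2 z2 (x3)
vector float e3 = vec_sld( ld3, ld3, 4 );   // x3 y3 z3 (z2)
#endif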
  2706. #ifndef DRAWVERT_PADDED
  2707. /*
  2708. ============
  2709. idSIMD_AltiVec::MinMax
  2710. ============
  2711. */
  2712. void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
  2713. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  2714. idVec3 v;
  2715. int i = 0;
  2716. register vector float vecMin, vecMax;
  2717. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  2718. register vector float vecMin1, vecMin2, vecMax1, vecMax2;
  2719. if ( count >= 4 ) {
  2720. vecMin = (vector float)(FLT_MAX);
  2721. vecMax = (vector float)(FLT_MIN);
  2722. vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  2723. vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  2724. vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  2725. vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  2726. for ( ; i+3 < count; i += 4) {
  2727. const float *vertPtr = src[i].xyz.ToFloatPtr();
  2728. const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
  2729. const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
  2730. const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
  2731. v0 = vec_ld( 0, vertPtr );
  2732. v1 = vec_ld( 11, vertPtr );
  2733. v2 = vec_ld( 0, vertPtr2 );
  2734. v3 = vec_ld( 11, vertPtr2 );
  2735. v4 = vec_ld( 0, vertPtr3 );
  2736. v5 = vec_ld( 11, vertPtr3 );
  2737. v6 = vec_ld( 0, vertPtr4 );
  2738. v7 = vec_ld( 11, vertPtr4 );
  2739. v0 = vec_perm( v0, v1, vertPerm1 );
  2740. v2 = vec_perm( v2, v3, vertPerm2 );
  2741. v4 = vec_perm( v4, v5, vertPerm3 );
  2742. v6 = vec_perm( v6, v7, vertPerm4 );
  2743. vecMin1 = vec_min( v0, v2 );
  2744. vecMin2 = vec_min( v4, v6 );
  2745. vecMin1 = vec_min( vecMin1, vecMin2 );
  2746. vecMin = vec_min( vecMin, vecMin1 );
  2747. vecMax1 = vec_max( v0, v2 );
  2748. vecMax2 = vec_max( v4, v6 );
  2749. vecMax1 = vec_max( vecMax1, vecMax2 );
  2750. vecMax = vec_max( vecMax, vecMax1 );
  2751. }
  2752. // now we have min/max vectors in X Y Z form, store out
  2753. v0 = vec_splat( vecMin, 0 );
  2754. v1 = vec_splat( vecMin, 1 );
  2755. v2 = vec_splat( vecMin, 2 );
  2756. v3 = vec_splat( vecMax, 0 );
  2757. v4 = vec_splat( vecMax, 1 );
  2758. v5 = vec_splat( vecMax, 2 );
  2759. vec_ste( v0, 0, &min[0] );
  2760. vec_ste( v1, 0, &min[1] );
  2761. vec_ste( v2, 0, &min[2] );
  2762. vec_ste( v3, 0, &max[0] );
  2763. vec_ste( v4, 0, &max[1] );
  2764. vec_ste( v5, 0, &max[2] );
  2765. }
  2766. // cleanup
  2767. for ( ; i < count; i++ ) {
  2768. v = src[i].xyz;
  2769. if ( v[0] < min[0] ) {
  2770. min[0] = v[0];
  2771. }
  2772. if ( v[0] > max[0] ) {
  2773. max[0] = v[0];
  2774. }
  2775. if ( v[1] < min[1] ) {
  2776. min[1] = v[1];
  2777. }
  2778. if ( v[1] > max[1] ) {
  2779. max[1] = v[1];
  2780. }
  2781. if ( v[2] > max[2] ) {
  2782. max[2] = v[2];
  2783. }
  2784. if ( v[2] < min[2] ) {
  2785. min[2] = v[2];
  2786. }
  2787. }
  2788. }
  2789. #else
  2790. /*
  2791. ============
  2792. idSIMD_AltiVec::MinMax
  2793. ============
  2794. */
  2795. void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
  2796. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  2797. idVec3 v;
  2798. int i = 0;
  2799. register vector float vecMin, vecMax;
  2800. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  2801. register vector float vecMin1, vecMin2, vecMax1, vecMax2;
  2802. if ( count >= 4 ) {
  2803. vecMin = (vector float)(FLT_MAX);
  2804. vecMax = (vector float)(FLT_MIN);
  2805. for ( ; i+3 < count; i += 4) {
  2806. const float *vertPtr = src[i].xyz.ToFloatPtr();
  2807. const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
  2808. const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
  2809. const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
  2810. v0 = vec_ld( 0, vertPtr );
  2811. v2 = vec_ld( 0, vertPtr2 );
  2812. v4 = vec_ld( 0, vertPtr3 );
  2813. v6 = vec_ld( 0, vertPtr4 );
  2814. vecMin1 = vec_min( v0, v2 );
  2815. vecMin2 = vec_min( v4, v6 );
  2816. vecMin1 = vec_min( vecMin1, vecMin2 );
  2817. vecMin = vec_min( vecMin, vecMin1 );
  2818. vecMax1 = vec_max( v0, v2 );
  2819. vecMax2 = vec_max( v4, v6 );
  2820. vecMax1 = vec_max( vecMax1, vecMax2 );
  2821. vecMax = vec_max( vecMax, vecMax1 );
  2822. }
  2823. // now we have min/max vectors in X Y Z form, store out
  2824. v0 = vec_splat( vecMin, 0 );
  2825. v1 = vec_splat( vecMin, 1 );
  2826. v2 = vec_splat( vecMin, 2 );
  2827. v3 = vec_splat( vecMax, 0 );
  2828. v4 = vec_splat( vecMax, 1 );
  2829. v5 = vec_splat( vecMax, 2 );
  2830. vec_ste( v0, 0, &min[0] );
  2831. vec_ste( v1, 0, &min[1] );
  2832. vec_ste( v2, 0, &min[2] );
  2833. vec_ste( v3, 0, &max[0] );
  2834. vec_ste( v4, 0, &max[1] );
  2835. vec_ste( v5, 0, &max[2] );
  2836. }
  2837. // cleanup
  2838. for ( ; i < count; i++ ) {
  2839. v = src[i].xyz;
  2840. if ( v[0] < min[0] ) {
  2841. min[0] = v[0];
  2842. }
  2843. if ( v[0] > max[0] ) {
  2844. max[0] = v[0];
  2845. }
  2846. if ( v[1] < min[1] ) {
  2847. min[1] = v[1];
  2848. }
  2849. if ( v[1] > max[1] ) {
  2850. max[1] = v[1];
  2851. }
  2852. if ( v[2] > max[2] ) {
  2853. max[2] = v[2];
  2854. }
  2855. if ( v[2] < min[2] ) {
  2856. min[2] = v[2];
  2857. }
  2858. }
  2859. }
  2860. #endif /* DRAWVERT_PADDED */
  2861. #ifndef DRAWVERT_PADDED
  2862. /*
  2863. ============
  2864. idSIMD_AltiVec::MinMax
  2865. ============
  2866. */
  2867. void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
  2868. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  2869. idVec3 v;
  2870. int i = 0;
  2871. register vector float vecMin, vecMax;
  2872. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  2873. register vector float vecMin1, vecMin2, vecMax1, vecMax2;
  2874. if ( count >= 4 ) {
  2875. vecMin = (vector float)(FLT_MAX);
  2876. vecMax = (vector float)(FLT_MIN);
  2877. vector unsigned char vertPerm1;
  2878. vector unsigned char vertPerm2;
  2879. vector unsigned char vertPerm3;
  2880. vector unsigned char vertPerm4;
  2881. for ( ; i+3 < count; i += 4) {
  2882. const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
  2883. const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
  2884. const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
  2885. const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
  2886. vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
  2887. vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
  2888. vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
  2889. vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
  2890. v0 = vec_ld( 0, vertPtr );
  2891. v1 = vec_ld( 15, vertPtr );
  2892. v2 = vec_ld( 0, vertPtr2 );
  2893. v3 = vec_ld( 15, vertPtr2 );
  2894. v4 = vec_ld( 0, vertPtr3 );
  2895. v5 = vec_ld( 15, vertPtr3 );
  2896. v6 = vec_ld( 0, vertPtr4 );
  2897. v7 = vec_ld( 15, vertPtr4 );
  2898. v0 = vec_perm( v0, v1, vertPerm1 );
  2899. v2 = vec_perm( v2, v3, vertPerm2 );
  2900. v4 = vec_perm( v4, v5, vertPerm3 );
  2901. v6 = vec_perm( v6, v7, vertPerm4 );
  2902. vecMin1 = vec_min( v0, v2 );
  2903. vecMin2 = vec_min( v4, v6 );
  2904. vecMin1 = vec_min( vecMin1, vecMin2 );
  2905. vecMin = vec_min( vecMin, vecMin1 );
  2906. vecMax1 = vec_max( v0, v2 );
  2907. vecMax2 = vec_max( v4, v6 );
  2908. vecMax1 = vec_max( vecMax1, vecMax2 );
  2909. vecMax = vec_max( vecMax, vecMax1 );
  2910. }
  2911. // now we have min/max vectors in X Y Z form, store out
  2912. v0 = vec_splat( vecMin, 0 );
  2913. v1 = vec_splat( vecMin, 1 );
  2914. v2 = vec_splat( vecMin, 2 );
  2915. v3 = vec_splat( vecMax, 0 );
  2916. v4 = vec_splat( vecMax, 1 );
  2917. v5 = vec_splat( vecMax, 2 );
  2918. vec_ste( v0, 0, &min[0] );
  2919. vec_ste( v1, 0, &min[1] );
  2920. vec_ste( v2, 0, &min[2] );
  2921. vec_ste( v3, 0, &max[0] );
  2922. vec_ste( v4, 0, &max[1] );
  2923. vec_ste( v5, 0, &max[2] );
  2924. }
  2925. // cleanup
  2926. for ( ; i < count; i++ ) {
  2927. v = src[indexes[i]].xyz;
  2928. if ( v[0] < min[0] ) {
  2929. min[0] = v[0];
  2930. }
  2931. if ( v[0] > max[0] ) {
  2932. max[0] = v[0];
  2933. }
  2934. if ( v[1] < min[1] ) {
  2935. min[1] = v[1];
  2936. }
  2937. if ( v[1] > max[1] ) {
  2938. max[1] = v[1];
  2939. }
  2940. if ( v[2] > max[2] ) {
  2941. max[2] = v[2];
  2942. }
  2943. if ( v[2] < min[2] ) {
  2944. min[2] = v[2];
  2945. }
  2946. }
  2947. }
  2948. #else
  2949. /*
  2950. ============
  2951. idSIMD_AltiVec::MinMax
  2952. ============
  2953. */
  2954. void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
  2955. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  2956. idVec3 v;
  2957. int i = 0;
  2958. register vector float vecMin, vecMax;
  2959. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  2960. register vector float vecMin1, vecMin2, vecMax1, vecMax2;
  2961. if ( count >= 4 ) {
  2962. vecMin = (vector float)(FLT_MAX);
2963. vecMax = (vector float)(-FLT_MAX);		// lowest finite float, so all-negative inputs still update the max
  2964. vector unsigned char vertPerm1;
  2965. vector unsigned char vertPerm2;
  2966. vector unsigned char vertPerm3;
  2967. vector unsigned char vertPerm4;
  2968. for ( ; i+3 < count; i += 4) {
  2969. const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
  2970. const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
  2971. const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
  2972. const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
  2973. v0 = vec_ld( 0, vertPtr );
  2974. v2 = vec_ld( 0, vertPtr2 );
  2975. v4 = vec_ld( 0, vertPtr3 );
  2976. v6 = vec_ld( 0, vertPtr4 );
  2977. vecMin1 = vec_min( v0, v2 );
  2978. vecMin2 = vec_min( v4, v6 );
  2979. vecMin1 = vec_min( vecMin1, vecMin2 );
  2980. vecMin = vec_min( vecMin, vecMin1 );
  2981. vecMax1 = vec_max( v0, v2 );
  2982. vecMax2 = vec_max( v4, v6 );
  2983. vecMax1 = vec_max( vecMax1, vecMax2 );
  2984. vecMax = vec_max( vecMax, vecMax1 );
  2985. }
  2986. // now we have min/max vectors in X Y Z form, store out
  2987. v0 = vec_splat( vecMin, 0 );
  2988. v1 = vec_splat( vecMin, 1 );
  2989. v2 = vec_splat( vecMin, 2 );
  2990. v3 = vec_splat( vecMax, 0 );
  2991. v4 = vec_splat( vecMax, 1 );
  2992. v5 = vec_splat( vecMax, 2 );
  2993. vec_ste( v0, 0, &min[0] );
  2994. vec_ste( v1, 0, &min[1] );
  2995. vec_ste( v2, 0, &min[2] );
  2996. vec_ste( v3, 0, &max[0] );
  2997. vec_ste( v4, 0, &max[1] );
  2998. vec_ste( v5, 0, &max[2] );
  2999. }
  3000. // cleanup
  3001. for ( ; i < count; i++ ) {
  3002. v = src[indexes[i]].xyz;
  3003. if ( v[0] < min[0] ) {
  3004. min[0] = v[0];
  3005. }
  3006. if ( v[0] > max[0] ) {
  3007. max[0] = v[0];
  3008. }
  3009. if ( v[1] < min[1] ) {
  3010. min[1] = v[1];
  3011. }
  3012. if ( v[1] > max[1] ) {
  3013. max[1] = v[1];
  3014. }
  3015. if ( v[2] > max[2] ) {
  3016. max[2] = v[2];
  3017. }
  3018. if ( v[2] < min[2] ) {
  3019. min[2] = v[2];
  3020. }
  3021. }
  3022. }
  3023. #endif /* DRAWVERT_PADDED */
  3024. #endif /* ENABLE_MINMAX */
  3025. #ifdef ENABLE_CLAMP
  3026. /*
  3027. ============
  3028. idSIMD_AltiVec::Clamp
  3029. ============
  3030. */
  3031. void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
  3032. //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
  3033. register vector float v0, v1, v2, v3, v4, v5;
  3034. register vector unsigned char permVec;
  3035. register vector float v0_low, v0_hi, v1_low, v1_hi;
  3036. vector unsigned char oneVector = (vector unsigned char)(1);
  3037. register vector float minVec, maxVec;
  3038. int i = 0;
  3039. //handle unaligned at start
  3040. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  3041. dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
  3042. }
  3043. //splat min/max into a vector
  3044. minVec = loadSplatUnalignedScalar( &min );
  3045. maxVec = loadSplatUnalignedScalar( &max );
  3046. //calculate permute and do first load
  3047. permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
  3048. v1_hi = vec_ld( 0, &src[i] );
  3049. //vectorize!
  3050. for ( ; i+7 < count; i += 8 ) {
  3051. //load source
  3052. v0_low = v1_hi;
  3053. v0_hi = vec_ld( 15, &src[i] );
  3054. v1_low = v0_hi;
  3055. v1_hi = vec_ld( 31, &src[i] );
  3056. v0 = vec_perm( v0_low, v0_hi, permVec );
  3057. v1 = vec_perm( v1_low, v1_hi, permVec );
  3058. //apply minimum
  3059. v2 = vec_max( v0, minVec );
  3060. v3 = vec_max( v1, minVec );
  3061. //apply maximum
  3062. v4 = vec_min( v2, maxVec );
  3063. v5 = vec_min( v3, maxVec );
  3064. ALIGNED_STORE2( &dst[i], v4, v5 );
  3065. }
  3066. //handle cleanup
  3067. for ( ; i < count ; i++ ) {
  3068. dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
  3069. }
  3070. }
  3071. /*
  3072. ============
  3073. idSIMD_AltiVec::ClampMin
  3074. ============
  3075. */
  3076. void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
  3077. //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
  3078. register vector float v0, v1, v2, v3;
  3079. register vector unsigned char permVec;
  3080. register vector float v0_low, v0_hi, v1_low, v1_hi;
  3081. register vector float constVec;
  3082. vector unsigned char oneVector = (vector unsigned char)(1);
  3083. int i = 0;
  3084. //handle unaligned at start
  3085. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
  3086. dst[i] = src[i] < min ? min : src[i];
  3087. }
  3088. //splat constant into a vector
  3089. constVec = loadSplatUnalignedScalar( &min );
  3090. //calculate permute and do first load
  3091. permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
  3092. v1_hi = vec_ld( 0, &src[i] );
  3093. //vectorize!
  3094. for ( ; i+7 < count; i += 8 ) {
  3095. //load source
  3096. v0_low = v1_hi;
  3097. v0_hi = vec_ld( 15, &src[i] );
  3098. v1_low = v0_hi;
  3099. v1_hi = vec_ld( 31, &src[i] );
  3100. v0 = vec_perm( v0_low, v0_hi, permVec );
  3101. v1 = vec_perm( v1_low, v1_hi, permVec );
  3102. v2 = vec_max( v0, constVec );
  3103. v3 = vec_max( v1, constVec );
  3104. ALIGNED_STORE2( &dst[i], v2, v3 );
  3105. }
  3106. //handle cleanup
  3107. for ( ; i < count ; i++ ) {
  3108. dst[i] = src[i] < min ? min : src[i];
  3109. }
  3110. }
  3111. /*
  3112. ============
  3113. idSIMD_AltiVec::ClampMax
  3114. ============
  3115. */
  3116. void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
  3117. //#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
  3118. register vector float v0, v1, v2, v3;
  3119. register vector unsigned char permVec;
  3120. register vector float constVec;
  3121. register vector float v0_low, v0_hi, v1_low, v1_hi;
  3122. vector unsigned char oneVector = (vector unsigned char)(1);
  3123. int i = 0;
  3124. //handle unaligned at start
  3125. for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3126. dst[i] = src[i] > max ? max : src[i];
  3127. }
  3128. //splat constant into a vector
  3129. constVec = loadSplatUnalignedScalar( &max );
  3130. //calculate permute and do first load
  3131. permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
  3132. v1_hi = vec_ld( 0, &src[i] );
  3133. //vectorize!
  3134. for ( ; i+7 < count; i += 8 ) {
  3135. //load source
  3136. v0_low = v1_hi;
  3137. v0_hi = vec_ld( 15, &src[i] );
  3138. v1_low = v0_hi;
  3139. v1_hi = vec_ld( 31, &src[i] );
  3140. v0 = vec_perm( v0_low, v0_hi, permVec );
  3141. v1 = vec_perm( v1_low, v1_hi, permVec );
  3142. v2 = vec_min( v0, constVec );
  3143. v3 = vec_min( v1, constVec );
  3144. ALIGNED_STORE2( &dst[i], v2, v3 );
  3145. }
  3146. //handle cleanup
  3147. for ( ; i < count ; i++ ) {
3148. dst[i] = src[i] > max ? max : src[i];
  3149. }
  3150. }
  3151. #endif /* ENABLE_CLAMP */
  3152. #ifdef ENABLE_16ROUTINES
  3153. /*
  3154. ============
  3155. idSIMD_AltiVec::Zero16
  3156. ============
  3157. */
  3158. void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
  3159. memset( dst, 0, count * sizeof( float ) );
  3160. }
  3161. /*
  3162. ============
  3163. idSIMD_AltiVec::Negate16
  3164. Assumptions:
  3165. dst is aligned
  3166. ============
  3167. */
  3168. void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
  3169. //#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
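// the vector path below negates by computing ( 0.0f - x ) rather than toggling the sign bit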
  3170. // dst is aligned
  3171. assert( IS_16BYTE_ALIGNED( dst[0] ) );
3172. // round count up to next 4 if need be
  3173. int count2 = ( count + 3 ) & ~3;
  3174. int i = 0;
  3175. vector float v0, v1, v2, v3;
3176. // know it's 16-byte aligned
  3177. for ( ; i + 7 < count2; i += 8 ) {
  3178. v0 = vec_ld( 0, &dst[i] );
  3179. v1 = vec_ld( 16, &dst[i] );
  3180. v2 = vec_sub( (vector float)(0), v0 );
  3181. v3 = vec_sub( (vector float)(0), v1 );
  3182. ALIGNED_STORE2( &dst[i], v2, v3 );
  3183. }
  3184. for ( ; i < count2; i += 4 ) {
  3185. v0 = vec_ld( 0, &dst[i] );
  3186. v1 = vec_sub( (vector float)(0), v0 );
  3187. vec_st( v1, 0, &dst[i] );
  3188. }
  3189. }
  3190. /*
  3191. ============
  3192. idSIMD_AltiVec::Copy16
  3193. ============
  3194. */
  3195. void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
  3196. //#define OPER(X) dst[(X)] = src[(X)]
  3197. memcpy( dst, src, sizeof(float) * count );
  3198. }
  3199. /*
  3200. ============
  3201. idSIMD_AltiVec::Add16
  3202. Assumptions:
  3203. Assumes dst, src1, src2 all start at aligned address
  3204. ============
  3205. */
  3206. void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
  3207. //#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
  3208. // dst is aligned
  3209. assert( IS_16BYTE_ALIGNED( dst[0] ) );
  3210. // src1 is aligned
  3211. assert( IS_16BYTE_ALIGNED( src1[0] ) );
  3212. // src2 is aligned
  3213. assert( IS_16BYTE_ALIGNED( src2[0] ) );
3214. // round count up to next 4 if need be
  3215. int count2 = ( count + 3 ) & ~3;
  3216. register vector float v0, v1, v2, v3, v4, v5;
  3217. int i = 0;
  3218. //know all data is 16-byte aligned, so vectorize!
  3219. for ( ; i+7 < count2; i += 8 ) {
  3220. //load sources
  3221. v0 = vec_ld( 0, &src1[i] );
  3222. v1 = vec_ld( 16, &src1[i] );
  3223. v2 = vec_ld( 0, &src2[i] );
  3224. v3 = vec_ld( 16, &src2[i] );
  3225. v4 = vec_add( v0, v2 );
  3226. v5 = vec_add( v1, v3 );
  3227. ALIGNED_STORE2( &dst[i], v4, v5 );
  3228. }
  3229. for ( ; i < count2; i += 4 ) {
  3230. v0 = vec_ld( 0, &src1[i] );
  3231. v1 = vec_ld( 0, &src2[i] );
  3232. v2 = vec_add( v0, v1 );
  3233. vec_st( v2, 0, &dst[i] );
  3234. }
  3235. }
  3236. /*
  3237. ============
  3238. idSIMD_AltiVec::Sub16
  3239. Assumptions:
  3240. Assumes that dst, src1, and src2 all start at aligned address
  3241. ============
  3242. */
  3243. void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
  3244. //#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
  3245. // dst is aligned
  3246. assert( IS_16BYTE_ALIGNED( dst[0] ) );
  3247. // src1 is aligned
  3248. assert( IS_16BYTE_ALIGNED( src1[0] ) );
  3249. // src2 is aligned
  3250. assert( IS_16BYTE_ALIGNED( src2[0] ) );
3251. // round count up to next 4 if need be
  3252. int count2 = ( count + 3 ) & ~3;
  3253. register vector float v0, v1, v2, v3, v4, v5;
  3254. int i = 0;
  3255. //know data is aligned, so vectorize!
  3256. for ( ; i+7 < count2; i += 8 ) {
  3257. //load sources
  3258. v0 = vec_ld( 0, &src1[i] );
  3259. v1 = vec_ld( 16, &src1[i] );
  3260. v2 = vec_ld( 0, &src2[i] );
  3261. v3 = vec_ld( 16, &src2[i] );
  3262. v4 = vec_sub( v0, v2 );
  3263. v5 = vec_sub( v1, v3 );
  3264. ALIGNED_STORE2( &dst[i], v4, v5 );
  3265. }
  3266. for ( ; i < count2; i += 4 ) {
  3267. v0 = vec_ld( 0, &src1[i] );
  3268. v1 = vec_ld( 0, &src2[i] );
  3269. v2 = vec_sub( v0, v1 );
  3270. vec_st( v2, 0, &dst[i] );
  3271. }
  3272. }
  3273. /*
  3274. ============
  3275. idSIMD_AltiVec::Mul16
  3276. Assumptions:
  3277. Assumes that dst and src1 start at aligned address
  3278. ============
  3279. */
  3280. void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
  3281. //#define OPER(X) dst[(X)] = src1[(X)] * constant
  3282. // dst is aligned
  3283. assert( IS_16BYTE_ALIGNED( dst[0] ) );
  3284. // src1 is aligned
  3285. assert( IS_16BYTE_ALIGNED( src1[0] ) );
3286. // round count up to next 4 if need be
  3287. int count2 = ( count + 3 ) & ~3;
  3288. register vector float v0, v1, v2, v3;
  3289. register vector float constVec;
  3290. register vector float zeroVector = (vector float)(0.0);
  3291. int i = 0;
  3292. //splat constant into a vector
  3293. constVec = loadSplatUnalignedScalar( &constant );
  3294. //know data is aligned, so vectorize!
  3295. for ( ; i+7 < count2; i += 8 ) {
  3296. //load source
  3297. v0 = vec_ld( 0, &src1[i] );
  3298. v1 = vec_ld( 16, &src1[i] );
  3299. v2 = vec_madd( constVec, v0, zeroVector );
  3300. v3 = vec_madd( constVec, v1, zeroVector );
  3301. ALIGNED_STORE2( &dst[i], v2, v3 );
  3302. }
  3303. for ( ; i < count2; i += 4 ) {
  3304. v0 = vec_ld( 0, &src1[i] );
  3305. v1 = vec_madd( constVec, v0, zeroVector );
  3306. vec_st( v1, 0, &dst[i] );
  3307. }
  3308. }
  3309. /*
  3310. ============
  3311. idSIMD_AltiVec::AddAssign16
  3312. Assumptions:
  3313. Assumes that dst and src start at aligned address
  3314. ============
  3315. */
  3316. void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
  3317. //#define OPER(X) dst[(X)] += src[(X)]
  3318. // dst is aligned
  3319. assert( IS_16BYTE_ALIGNED( dst[0] ) );
  3320. // src is aligned
  3321. assert( IS_16BYTE_ALIGNED( src[0] ) );
3322. // round count up to next 4 if need be
  3323. int count2 = ( count + 3 ) & ~3;
  3324. register vector float v0, v1, v2, v3, v4, v5;
  3325. int i = 0;
  3326. //vectorize!
  3327. for ( ; i+7 < count2; i += 8 ) {
  3328. v0 = vec_ld( 0, &src[i] );
  3329. v1 = vec_ld( 16, &src[i] );
  3330. v2 = vec_ld( 0, &dst[i] );
  3331. v3 = vec_ld( 16, &dst[i] );
  3332. v4 = vec_add( v0, v2 );
  3333. v5 = vec_add( v1, v3 );
  3334. ALIGNED_STORE2( &dst[i], v4, v5 );
  3335. }
  3336. for ( ; i < count2; i += 4 ) {
  3337. v0 = vec_ld( 0, &src[i] );
  3338. v1 = vec_ld( 0, &dst[i] );
  3339. v2 = vec_add( v0, v1 );
  3340. vec_st( v2, 0, &dst[i] );
  3341. }
  3342. }
  3343. /*
  3344. ============
  3345. idSIMD_AltiVec::SubAssign16
  3346. Assumptions:
  3347. Assumes that dst and src start at aligned address
  3348. ============
  3349. */
  3350. void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
  3351. //#define OPER(X) dst[(X)] -= src[(X)]
  3352. register vector float v0, v1, v2, v3, v4, v5;
  3353. int i=0;
  3354. // dst is aligned
  3355. assert( IS_16BYTE_ALIGNED( dst[0] ) );
  3356. // src is aligned
  3357. assert( IS_16BYTE_ALIGNED( src[0] ) );
3358. // round count up to next 4 if need be
  3359. int count2 = ( count + 3 ) & ~3;
  3360. //vectorize!
  3361. for ( ; i+7 < count2; i += 8 ) {
  3362. v0 = vec_ld( 0, &src[i] );
  3363. v1 = vec_ld( 16, &src[i] );
  3364. v2 = vec_ld( 0, &dst[i] );
  3365. v3 = vec_ld( 16, &dst[i] );
  3366. v4 = vec_sub( v2, v0 );
  3367. v5 = vec_sub( v3, v1 );
  3368. ALIGNED_STORE2( &dst[i], v4, v5 );
  3369. }
  3370. for ( ; i < count2; i += 4 ) {
  3371. v0 = vec_ld( 0, &src[i] );
  3372. v1 = vec_ld( 0, &dst[i] );
  3373. v2 = vec_sub( v1, v0 );
  3374. vec_st( v2, 0, &dst[i] );
  3375. }
  3376. }
  3377. /*
  3378. ============
  3379. idSIMD_AltiVec::MulAssign16
  3380. Assumptions:
  3381. Assumes that dst starts at aligned address and count is multiple of 4
  3382. ============
  3383. */
  3384. void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
  3385. //#define OPER(X) dst[(X)] *= constant
  3386. // dst is aligned
  3387. assert( IS_16BYTE_ALIGNED( dst[0] ) );
3388. // round count up to next 4 if need be
  3389. int count2 = ( count + 3 ) & ~3;
  3390. register vector float v0, v1, v2, v3;
  3391. register vector float constVec;
  3392. int i = 0;
  3393. register vector float zeroVector = (vector float)(0.0);
  3394. //splat constant into a vector
  3395. constVec = loadSplatUnalignedScalar( &constant );
  3396. //vectorize!
  3397. for ( ; i+7 < count2; i += 8 ) {
  3398. v0 = vec_ld( 0, &dst[i] );
  3399. v1 = vec_ld( 16, &dst[i] );
  3400. v2 = vec_madd( v0, constVec, zeroVector );
  3401. v3 = vec_madd( v1, constVec, zeroVector );
  3402. ALIGNED_STORE2( &dst[i], v2, v3 );
  3403. }
  3404. for ( ; i < count2; i += 4 ) {
  3405. v0 = vec_ld( 0, &dst[i] );
  3406. v1 = vec_madd( v0, constVec, zeroVector );
  3407. vec_st( v1, 0, &dst[i] );
  3408. }
  3409. }
  3410. #endif /* ENABLE_16ROUTINES */
  3411. #ifdef ENABLE_LOWER_TRIANGULAR
  3412. /*
  3413. ============
  3414. idSIMD_AltiVec::MatX_LowerTriangularSolve
  3415. solves x in L * x = b for the first n rows of L
  3416. if skip > 0 the first skip elements of x are assumed to be valid already
  3417. L has to be a lower triangular matrix with (implicit) ones on the diagonal
  3418. x == b is allowed
  3419. ============
  3420. */
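#if 0
// A minimal scalar sketch of the forward substitution the routine below vectorizes, assuming the
// same preconditions as the header comment (unit diagonal, first 'skip' entries of x already valid).
// Illustration only -- the helper name is made up and the block is kept out of the build.
static void LowerTriangularSolve_Ref( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	for ( int i = skip; i < n; i++ ) {
		float sum = b[i];
		const float *lptr = L[i];
		for ( int j = 0; j < i; j++ ) {
			sum -= lptr[j] * x[j];		// subtract contributions from the rows already solved
		}
		x[i] = sum;						// implicit 1.0 on the diagonal, so no divide
	}
}
#endif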
  3421. void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
  3422. int i, j;
  3423. const float *lptr;
  3424. const float *lptr2;
  3425. const float *lptr3;
  3426. const float *lptr4;
  3427. float sum;
  3428. float sum2;
  3429. float sum3;
  3430. float sum4;
  3431. float tempSum;
  3432. float tempSum2;
  3433. float tempSum3;
  3434. float tempSum4;
  3435. vector float vecSum1 = (vector float)(0.0);
  3436. vector float vecSum2 = (vector float)(0.0);
  3437. vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
  3438. vector float zeroVector = (vector float)(0.0);
  3439. vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
  3440. vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
  3441. // unrolled this loop a bit
  3442. for ( i = skip; i+3 < n; i+=4 ) {
  3443. sum = b[i];
  3444. sum2 = b[i+1];
  3445. sum3 = b[i+2];
  3446. sum4 = b[i+3];
  3447. vecSum1 = zeroVector;
  3448. vecSum2 = zeroVector;
  3449. vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
  3450. lptr = L[i];
  3451. lptr2 = L[i+1];
  3452. lptr3 = L[i+2];
  3453. lptr4 = L[i+3];
  3454. vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
  3455. vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
  3456. vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
  3457. vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
  3458. for ( j = 0 ; j+7 < i; j+=8 ) {
  3459. v0 = vec_ld( 0, &x[j] );
  3460. v1 = vec_ld( 15, &x[j] );
  3461. vector float vecExtraX = vec_ld( 31, &x[j] );
  3462. v0 = vec_perm( v0, v1, vecPermX );
  3463. v1 = vec_perm( v1, vecExtraX, vecPermX );
  3464. v2 = vec_ld( 0, lptr + j );
  3465. v3 = vec_ld( 15, lptr + j );
  3466. vector float vecExtra1 = vec_ld( 31, lptr + j );
  3467. v2 = vec_perm( v2, v3, vecPermLptr1 );
  3468. v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
  3469. v4 = vec_ld( 0, lptr2 + j );
  3470. v5 = vec_ld( 15, lptr2 + j );
  3471. vector float vecExtra2 = vec_ld( 31, lptr2 + j );
  3472. v4 = vec_perm( v4, v5, vecPermLptr2 );
  3473. v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
  3474. v6 = vec_ld( 0, lptr3 + j );
  3475. v7 = vec_ld( 15, lptr3 + j );
  3476. vector float vecExtra3 = vec_ld( 31, lptr3 + j );
  3477. v6 = vec_perm( v6, v7, vecPermLptr3 );
  3478. v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
  3479. v8 = vec_ld( 0, lptr4 + j );
  3480. v9 = vec_ld( 15, lptr4 + j );
  3481. vector float vecExtra4 = vec_ld( 31, lptr4 + j );
  3482. v8 = vec_perm( v8, v9, vecPermLptr4 );
  3483. v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
  3484. vecSum1 = vec_madd( v2, v0, vecSum1 );
  3485. vecSum2 = vec_madd( v3, v1, vecSum2 );
  3486. vecSum3 = vec_madd( v4, v0, vecSum3 );
  3487. vecSum4 = vec_madd( v5, v1, vecSum4 );
  3488. vecSum5 = vec_madd( v6, v0, vecSum5 );
  3489. vecSum6 = vec_madd( v7, v1, vecSum6 );
  3490. vecSum7 = vec_madd( v8, v0, vecSum7 );
  3491. vecSum8 = vec_madd( v9, v1, vecSum8 );
  3492. }
3493. // if we ran the unrolled code, we need to sum across the vectors
  3494. // to find out how much to subtract from sum
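// e.g. with vecSum1 = [ a b c d ]:  adding the 8-byte rotation [ c d a b ] gives [ a+c b+d a+c b+d ],
// and adding the 4-byte rotation of that leaves a+b+c+d in element 0, ready for vec_splat/vec_ste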
  3495. if ( j > 0 ) {
  3496. vecSum1 = vec_add( vecSum1, vecSum2 );
  3497. vecSum3 = vec_add( vecSum3, vecSum4 );
  3498. vecSum5 = vec_add( vecSum5, vecSum6 );
  3499. vecSum7 = vec_add( vecSum7, vecSum8 );
3500. // sum across the vectors
  3501. vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
  3502. vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
  3503. vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
  3504. vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
  3505. vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
  3506. vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
  3507. vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
  3508. vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
  3509. //move the result to the FPU
  3510. vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
  3511. vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
  3512. vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
  3513. vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
  3514. sum -= tempSum;
  3515. sum2 -= tempSum2;
  3516. sum3 -= tempSum3;
  3517. sum4 -= tempSum4;
  3518. }
  3519. //cleanup
  3520. for ( ; j < i; j++ ) {
  3521. sum -= lptr[j] * x[j];
  3522. sum2 -= lptr2[j] * x[j];
  3523. sum3 -= lptr3[j] * x[j];
  3524. sum4 -= lptr4[j] * x[j];
  3525. }
3526. // resolve the remaining dependencies within the 4x4 diagonal block, then store the 4 results
  3527. sum2 -= ( lptr2[i] * sum );
  3528. sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
  3529. sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
  3530. x[i] = sum;
  3531. x[i+1] = sum2;
  3532. x[i+2] = sum3;
  3533. x[i+3] = sum4;
  3534. }
  3535. // cleanup
  3536. for ( ; i < n; i++ ) {
  3537. sum = b[i];
  3538. vecSum1 = zeroVector;
  3539. vecSum2 = zeroVector;
  3540. lptr = L[i];
  3541. vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
  3542. for ( j = 0 ; j+7 < i; j+=8 ) {
  3543. v0 = vec_ld( 0, &x[j] );
  3544. v2 = vec_ld( 15, &x[j] );
  3545. vector float vecExtraX = vec_ld( 31, &x[j] );
  3546. v0 = vec_perm( v0, v2, vecPermX );
  3547. v2 = vec_perm( v2, vecExtraX, vecPermX );
  3548. v1 = vec_ld( 0, lptr + j );
  3549. v3 = vec_ld( 15, lptr + j );
  3550. vector float vecExtra = vec_ld( 31, lptr + j );
  3551. v1 = vec_perm( v1, v3, vecPermLptr );
  3552. v3 = vec_perm( v3, vecExtra, vecPermLptr );
  3553. vecSum1 = vec_madd( v1, v0, vecSum1 );
  3554. vecSum2 = vec_madd( v3, v2, vecSum2 );
  3555. }
3556. // if we ran the unrolled code, we need to sum across the vectors
  3557. // to find out how much to subtract from sum
  3558. if ( j > 0 ) {
3559. // sum across the vectors
  3560. vecSum1 = vec_add( vecSum1, vecSum2 );
  3561. vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
  3562. vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
  3563. //move the result to the FPU
  3564. vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
  3565. sum -= tempSum;
  3566. }
  3567. //cleanup
  3568. for ( ; j < i; j++ ) {
  3569. sum -= lptr[j] * x[j];
  3570. }
  3571. x[i] = sum;
  3572. }
  3573. }
  3574. /*
  3575. ============
  3576. idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
  3577. solves x in L.Transpose() * x = b for the first n rows of L
  3578. L has to be a lower triangular matrix with (implicit) ones on the diagonal
  3579. x == b is allowed
  3580. ============
  3581. */
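#if 0
// A minimal scalar sketch of the back substitution the routine below performs for the general case,
// assuming the same preconditions as the header comment. Column i of L is row i of L.Transpose(),
// hence the lptr[j*nc+i] stride. Illustration only -- the helper name is made up and not built.
static void LowerTriangularSolveTranspose_Ref( const idMatX &L, float *x, const float *b, const int n ) {
	const int nc = L.GetNumColumns();
	const float *lptr = L.ToFloatPtr();
	for ( int i = n - 1; i >= 0; i-- ) {
		float sum = b[i];
		for ( int j = i + 1; j < n; j++ ) {
			sum -= lptr[j*nc+i] * x[j];	// L.Transpose()[i][j] == L[j][i]
		}
		x[i] = sum;						// unit diagonal, so no divide
	}
}
#endif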
  3582. void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
  3583. int nc;
  3584. const float *lptr;
  3585. lptr = L.ToFloatPtr();
  3586. nc = L.GetNumColumns();
  3587. float x0, x1, x2, x3, x4, x5, x6;
  3588. // unrolled cases for n < 8
  3589. if ( n < 8 ) {
  3590. switch( n ) {
  3591. // using local variables to avoid aliasing issues
  3592. case 0:
  3593. return;
  3594. case 1:
  3595. x[0] = b[0];
  3596. return;
  3597. case 2:
  3598. x1 = b[1];
  3599. x0 = b[0] - lptr[1*nc+0] * x1;
  3600. x[1] = x1;
  3601. x[0] = x0;
  3602. return;
  3603. case 3:
  3604. x2 = b[2];
  3605. x1 = b[1] - lptr[2*nc+1] * x2;
  3606. x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
  3607. x[2] = x2;
  3608. x[1] = x1;
  3609. x[0] = x0;
  3610. return;
  3611. case 4:
  3612. x3 = b[3];
  3613. x2 = b[2] - lptr[3*nc+2] * x3;
  3614. x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
  3615. x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
  3616. x[3] = x3;
  3617. x[2] = x2;
  3618. x[1] = x1;
  3619. x[0] = x0;
  3620. return;
  3621. case 5:
  3622. x4 = b[4];
  3623. x3 = b[3] - lptr[4*nc+3] * x4;
  3624. x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
  3625. x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
  3626. x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
  3627. x[4] = x4;
  3628. x[3] = x3;
  3629. x[2] = x2;
  3630. x[1] = x1;
  3631. x[0] = x0;
  3632. return;
  3633. case 6:
  3634. x5 = b[5];
  3635. x4 = b[4] - lptr[5*nc+4] * x5;
  3636. x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
  3637. x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
  3638. x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
  3639. x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
  3640. x[5] = x5;
  3641. x[4] = x4;
  3642. x[3] = x3;
  3643. x[2] = x2;
  3644. x[1] = x1;
  3645. x[0] = x0;
  3646. return;
  3647. case 7:
  3648. x6 = b[6];
  3649. x5 = b[5] - lptr[6*nc+5] * x6;
  3650. x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
  3651. x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
  3652. x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
  3653. x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
  3654. x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
  3655. x[6] = x6;
  3656. x[5] = x5;
  3657. x[4] = x4;
  3658. x[3] = x3;
  3659. x[2] = x2;
  3660. x[1] = x1;
  3661. x[0] = x0;
  3662. return;
  3663. }
  3664. return;
  3665. }
  3666. int i, j;
  3667. register float s0, s1, s2, s3;
  3668. float *xptr;
  3669. lptr = L.ToFloatPtr() + n * nc + n - 4;
  3670. xptr = x + n;
  3671. // process 4 rows at a time
  3672. for ( i = n; i >= 4; i -= 4 ) {
  3673. s0 = b[i-4];
  3674. s1 = b[i-3];
  3675. s2 = b[i-2];
  3676. s3 = b[i-1];
  3677. // process 4x4 blocks
  3678. for ( j = 0; j < n-i; j += 4 ) {
  3679. s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
  3680. s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
  3681. s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
  3682. s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
  3683. s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
  3684. s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
  3685. s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
  3686. s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
  3687. s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
  3688. s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
  3689. s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
  3690. s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
  3691. s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
  3692. s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
  3693. s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
  3694. s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
  3695. }
  3696. // process left over of the 4 rows
  3697. s0 -= lptr[0-1*nc] * s3;
  3698. s1 -= lptr[1-1*nc] * s3;
  3699. s2 -= lptr[2-1*nc] * s3;
  3700. s0 -= lptr[0-2*nc] * s2;
  3701. s1 -= lptr[1-2*nc] * s2;
  3702. s0 -= lptr[0-3*nc] * s1;
  3703. // store result
  3704. xptr[-4] = s0;
  3705. xptr[-3] = s1;
  3706. xptr[-2] = s2;
  3707. xptr[-1] = s3;
  3708. // update pointers for next four rows
  3709. lptr -= 4 + 4 * nc;
  3710. xptr -= 4;
  3711. }
  3712. // process left over rows
  3713. for ( i--; i >= 0; i-- ) {
  3714. s0 = b[i];
  3715. lptr = L[0] + i;
  3716. for ( j = i + 1; j < n; j++ ) {
  3717. s0 -= lptr[j*nc] * x[j];
  3718. }
  3719. x[i] = s0;
  3720. }
  3721. }
  3722. /*
  3723. ============
  3724. idSIMD_AltiVec::MatX_LDLTFactor
  3725. ============
  3726. */
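// What the routine below computes, read off from its own loops: the factorization mat = L * D * L.Transpose()
// is built in place, with L (unit diagonal implied) stored below the diagonal, D stored on the diagonal,
// and invDiag[i] receiving 1.0f / D[i]. For each row i the recurrence is
//		D[i]    = A[i][i] - sum( L[i][k] * L[i][k] * D[k],  k < i )
//		L[j][i] = ( A[j][i] - sum( L[j][k] * L[i][k] * D[k], k < i ) ) / D[i]		for j > i
// and a zero pivot D[i] makes the routine bail out with false.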
  3727. bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
  3728. int i, j, k, nc;
  3729. float *v, *diag, *mptr;
  3730. float s0, s1, s2, s3, sum, d;
  3731. float s0_2, s1_2, s2_2, s3_2, sum_2;
  3732. float *mptr2;
  3733. v = (float *) _alloca16( n * sizeof( float ) );
  3734. diag = (float *) _alloca16( n * sizeof( float ) );
  3735. nc = mat.GetNumColumns();
  3736. if ( n <= 0 ) {
  3737. return true;
  3738. }
  3739. mptr = mat[0];
  3740. sum = mptr[0];
  3741. if ( sum == 0.0f ) {
  3742. return false;
  3743. }
  3744. diag[0] = sum;
  3745. invDiag[0] = d = 1.0f / sum;
  3746. if ( n <= 1 ) {
  3747. return true;
  3748. }
  3749. mptr = mat[0];
  3750. for ( j = 1; j < n; j++ ) {
  3751. mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
  3752. }
  3753. mptr = mat[1];
  3754. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  3755. sum = mptr[1] - s0;
  3756. if ( sum == 0.0f ) {
  3757. return false;
  3758. }
  3759. mat[1][1] = sum;
  3760. diag[1] = sum;
  3761. invDiag[1] = d = 1.0f / sum;
  3762. if ( n <= 2 ) {
  3763. return true;
  3764. }
  3765. mptr = mat[0];
  3766. for ( j = 2; j < n; j++ ) {
  3767. mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
  3768. }
  3769. mptr = mat[2];
  3770. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  3771. v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
  3772. sum = mptr[2] - s0 - s1;
  3773. if ( sum == 0.0f ) {
  3774. return false;
  3775. }
  3776. mat[2][2] = sum;
  3777. diag[2] = sum;
  3778. invDiag[2] = d = 1.0f / sum;
  3779. if ( n <= 3 ) {
  3780. return true;
  3781. }
  3782. mptr = mat[0];
  3783. for ( j = 3; j < n; j++ ) {
  3784. mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
  3785. }
  3786. mptr = mat[3];
  3787. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  3788. v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
  3789. v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
  3790. sum = mptr[3] - s0 - s1 - s2;
  3791. if ( sum == 0.0f ) {
  3792. return false;
  3793. }
  3794. mat[3][3] = sum;
  3795. diag[3] = sum;
  3796. invDiag[3] = d = 1.0f / sum;
  3797. if ( n <= 4 ) {
  3798. return true;
  3799. }
  3800. mptr = mat[0];
  3801. for ( j = 4; j < n; j++ ) {
  3802. mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
  3803. }
  3804. for ( i = 4; i < n; i++ ) {
  3805. mptr = mat[i];
  3806. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  3807. v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
  3808. v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
  3809. v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
  3810. for ( k = 4; k < i-3; k += 4 ) {
  3811. v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
  3812. v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
  3813. v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
  3814. v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
  3815. }
  3816. switch( i - k ) {
  3817. case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
  3818. case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
  3819. case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
  3820. }
  3821. sum = s3;
  3822. sum += s2;
  3823. sum += s1;
  3824. sum += s0;
  3825. sum = mptr[i] - sum;
  3826. if ( sum == 0.0f ) {
  3827. return false;
  3828. }
  3829. mat[i][i] = sum;
  3830. diag[i] = sum;
  3831. invDiag[i] = d = 1.0f / sum;
  3832. if ( i + 1 >= n ) {
  3833. return true;
  3834. }
  3835. // unrolling madness!
  3836. mptr = mat[i+1];
  3837. mptr2 = mat[i+1] + nc;
  3838. for ( j = i+1; j+1 < n; j+=2 ) {
  3839. s0 = mptr[0] * v[0];
  3840. s1 = mptr[1] * v[1];
  3841. s2 = mptr[2] * v[2];
  3842. s3 = mptr[3] * v[3];
  3843. s0_2 = mptr2[0] * v[0];
  3844. s1_2 = mptr2[1] * v[1];
  3845. s2_2 = mptr2[2] * v[2];
  3846. s3_2 = mptr2[3] * v[3];
  3847. for ( k = 4; k < i-7; k += 8 ) {
  3848. s0 += mptr[k+0] * v[k+0];
  3849. s1 += mptr[k+1] * v[k+1];
  3850. s2 += mptr[k+2] * v[k+2];
  3851. s3 += mptr[k+3] * v[k+3];
  3852. s0 += mptr[k+4] * v[k+4];
  3853. s1 += mptr[k+5] * v[k+5];
  3854. s2 += mptr[k+6] * v[k+6];
  3855. s3 += mptr[k+7] * v[k+7];
  3856. s0_2 += mptr2[k+0] * v[k+0];
  3857. s1_2 += mptr2[k+1] * v[k+1];
  3858. s2_2 += mptr2[k+2] * v[k+2];
  3859. s3_2 += mptr2[k+3] * v[k+3];
  3860. s0_2 += mptr2[k+4] * v[k+4];
  3861. s1_2 += mptr2[k+5] * v[k+5];
  3862. s2_2 += mptr2[k+6] * v[k+6];
  3863. s3_2 += mptr2[k+7] * v[k+7];
  3864. }
  3865. switch( i - k ) {
  3866. case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
  3867. case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
  3868. case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
  3869. case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
  3870. case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
  3871. case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
  3872. case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
  3873. }
  3874. // disassociate these adds
  3875. s3 += s2;
  3876. s1 += s0;
  3877. sum = s1 + s3;
  3878. s3_2 += s2_2;
  3879. s1_2 += s0_2;
  3880. sum_2 = s1_2 + s3_2;
  3881. mptr[i] = ( mptr[i] - sum ) * d;
  3882. mptr2[i] = ( mptr2[i] - sum_2 ) * d;
  3883. mptr += nc*2;
  3884. mptr2 += nc*2;
  3885. }
  3886. // cleanup
  3887. for ( ; j < n; j++ ) {
  3888. s0 = mptr[0] * v[0];
  3889. s1 = mptr[1] * v[1];
  3890. s2 = mptr[2] * v[2];
  3891. s3 = mptr[3] * v[3];
  3892. for ( k = 4; k < i-7; k += 8 ) {
  3893. s0 += mptr[k+0] * v[k+0];
  3894. s1 += mptr[k+1] * v[k+1];
  3895. s2 += mptr[k+2] * v[k+2];
  3896. s3 += mptr[k+3] * v[k+3];
  3897. s0 += mptr[k+4] * v[k+4];
  3898. s1 += mptr[k+5] * v[k+5];
  3899. s2 += mptr[k+6] * v[k+6];
  3900. s3 += mptr[k+7] * v[k+7];
  3901. }
  3902. switch( i - k ) {
  3903. case 7: s0 += mptr[k+6] * v[k+6];
  3904. case 6: s1 += mptr[k+5] * v[k+5];
  3905. case 5: s2 += mptr[k+4] * v[k+4];
  3906. case 4: s3 += mptr[k+3] * v[k+3];
  3907. case 3: s0 += mptr[k+2] * v[k+2];
  3908. case 2: s1 += mptr[k+1] * v[k+1];
  3909. case 1: s2 += mptr[k+0] * v[k+0];
  3910. }
  3911. // disassociate these adds
  3912. s3 += s2;
  3913. s1 += s0;
  3914. sum = s1 + s3;
  3915. mptr[i] = ( mptr[i] - sum ) * d;
  3916. mptr += nc;
  3917. }
  3918. }
  3919. return true;
  3920. }
  3921. #endif /* ENABLE_LOWER_TRIANGULAR */
  3922. #ifdef LIVE_VICARIOUSLY
  3923. /*
  3924. ============
  3925. idSIMD_AltiVec::BlendJoints
  3926. ============
  3927. */
  3928. void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
  3929. int i;
3930. // since lerp is constant for the whole call, we can special-case the two endpoint values
  3931. if ( lerp <= 0.0f ) {
3932. // a lerp of 0 leaves joints unchanged, so there's nothing to do; just return
  3933. return;
  3934. }
  3935. if ( lerp >= 1.0f ) {
3936. // a lerp of 1 means the result is just blendJoints, so copy every q and t over wholesale
  3937. memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
  3938. return;
  3939. }
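// Scalar sketch of what each 4-wide iteration below does per joint (illustration only, written
// from the vector code itself):
//		cosom = from.q * to.q;											// 4D dot product
//		if ( cosom < 0.0f ) { to.q = -to.q; cosom = -cosom; }
//		if ( 1.0f - cosom > 1e-6f ) {
//			sinom  = sqrt( 1.0f - cosom * cosom );
//			omega  = atan2( sinom, cosom );
//			scale0 = sin( ( 1.0f - lerp ) * omega ) / sinom;
//			scale1 = sin( lerp * omega ) / sinom;
//		} else {
//			scale0 = 1.0f - lerp; scale1 = lerp;						// nearly parallel: plain lerp
//		}
//		joint.q = scale0 * from.q + scale1 * to.q;
//		joint.t = from.t + lerp * ( to.t - from.t );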
  3940. vector float vecLerp = loadSplatUnalignedScalar( &lerp );
  3941. vector float zeroVector = (vector float)(0);
  3942. for ( i = 0; i+3 < numJoints; i+=4 ) {
  3943. int j = index[i];
  3944. int j2 = index[i+1];
  3945. int j3 = index[i+2];
  3946. int j4 = index[i+3];
  3947. // slerp
  3948. const float *jointPtr = joints[j].q.ToFloatPtr();
  3949. const float *blendPtr = blendJoints[j].q.ToFloatPtr();
  3950. const float *jointPtr2 = joints[j2].q.ToFloatPtr();
  3951. const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
  3952. const float *jointPtr3 = joints[j3].q.ToFloatPtr();
  3953. const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
  3954. const float *jointPtr4 = joints[j4].q.ToFloatPtr();
  3955. const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
  3956. vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
  3957. vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
  3958. vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
  3959. vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
  3960. vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
  3961. vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
  3962. vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
  3963. vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
  3964. vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
  3965. vector float v12, v13, v14, v15, v16;
  3966. vector float vecFromX, vecFromY, vecFromZ, vecFromW;
  3967. vector float vecToX, vecToY, vecToZ, vecToW;
3968. // load up the idJointQuats from joints
  3969. v0 = vec_ld( 0, jointPtr );
  3970. v1 = vec_ld( 15, jointPtr );
  3971. v2 = vec_perm( v0, v1, permVec );
  3972. v3 = vec_ld( 0, jointPtr2 );
  3973. v4 = vec_ld( 15, jointPtr2 );
  3974. v5 = vec_perm( v3, v4, permVec2 );
  3975. v6 = vec_ld( 0, jointPtr3 );
  3976. v7 = vec_ld( 15, jointPtr3 );
  3977. v8 = vec_perm( v6, v7, permVec3 );
  3978. v9 = vec_ld( 0, jointPtr4 );
  3979. v10 = vec_ld( 15, jointPtr4 );
  3980. v11 = vec_perm( v9, v10, permVec4 );
  3981. // planarizing, so put each x y z w into its own vector
  3982. v0 = vec_mergeh( v2, v8 );
  3983. v1 = vec_mergeh( v5, v11 );
  3984. v3 = vec_mergel( v2, v8 );
  3985. v4 = vec_mergel( v5, v11 );
  3986. vecFromX = vec_mergeh( v0, v1 );
  3987. vecFromY = vec_mergel( v0, v1 );
  3988. vecFromZ = vec_mergeh( v3, v4 );
  3989. vecFromW = vec_mergel( v3, v4 );
  3990. // load up idJointQuats from blendJoints
  3991. v5 = vec_ld( 0, blendPtr );
  3992. v6 = vec_ld( 15, blendPtr );
  3993. v7 = vec_perm( v5, v6, permVec5 );
  3994. v8 = vec_ld( 0, blendPtr2 );
  3995. v9 = vec_ld( 15, blendPtr2 );
  3996. v10 = vec_perm( v8, v9, permVec6 );
  3997. v11 = vec_ld( 0, blendPtr3 );
  3998. v12 = vec_ld( 15, blendPtr3 );
  3999. v13 = vec_perm( v11, v12, permVec7 );
  4000. v14 = vec_ld( 0, blendPtr4 );
  4001. v15 = vec_ld( 15, blendPtr4 );
  4002. v16 = vec_perm( v14, v15, permVec8 );
  4003. // put these into their own vectors too
  4004. v5 = vec_mergeh( v7, v13 );
  4005. v6 = vec_mergeh( v10, v16 );
  4006. v8 = vec_mergel( v7, v13 );
  4007. v9 = vec_mergel( v10, v16 );
  4008. vecToX = vec_mergeh( v5, v6 );
  4009. vecToY = vec_mergel( v5, v6 );
  4010. vecToZ = vec_mergeh( v8, v9 );
  4011. vecToW = vec_mergel( v8, v9 );
  4012. // calculate cosom
  4013. vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
  4014. vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
  4015. vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
  4016. vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
4017. // if cosom is < 0, negate it and use the negated elements of 'to' as the temp values;
4018. // otherwise use 'to' as-is (vec_sel below selects per lane)
  4019. vector bool int vecCmp, vecCmp2;
  4020. vecCmp = vec_cmplt( vecCosom, zeroVector );
  4021. // negate if needed
  4022. vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
  4023. vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
  4024. vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
  4025. vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
  4026. vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
  4027. // check if we need to calculate scale
  4028. vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
  4029. vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
  4030. vector float vecScale1 = vec_splat( vecLerp, 0 );
  4031. vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
  4032. vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
  4033. vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
  4034. vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
  4035. vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
  4036. // see which ones we have to insert into our scale0 and scale1 vectors
  4037. vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
  4038. vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
  4039. // multiply each element by the scale
  4040. vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
  4041. vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
  4042. vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
  4043. vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
  4044. // multiply temp by scale and add to result
  4045. vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
  4046. vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
  4047. vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
  4048. vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
  4049. // do a transform again to get the results back to vectors we can store out
  4050. v5 = vec_mergeh( vecFromX, vecFromZ );
  4051. v6 = vec_mergeh( vecFromY, vecFromW );
  4052. v8 = vec_mergel( vecFromX, vecFromZ );
  4053. v9 = vec_mergel( vecFromY, vecFromW );
  4054. vecToX = vec_mergeh( v5, v6 );
  4055. vecToY = vec_mergel( v5, v6 );
  4056. vecToZ = vec_mergeh( v8, v9 );
  4057. vecToW = vec_mergel( v8, v9 );
  4058. vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
  4059. vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
  4060. vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
  4061. vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
  4062. // right rotate the input data
  4063. vecToX = vec_perm( vecToX, vecToX, storePerm1 );
  4064. vecToY = vec_perm( vecToY, vecToY, storePerm2 );
  4065. vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
  4066. vecToW = vec_perm( vecToW, vecToW, storePerm4 );
  4067. vec_ste( vecToX, 0, (float*) jointPtr );
  4068. vec_ste( vecToX, 4, (float*) jointPtr );
  4069. vec_ste( vecToX, 8, (float*) jointPtr );
  4070. vec_ste( vecToX, 12, (float*) jointPtr );
  4071. vec_ste( vecToY, 0, (float*) jointPtr2 );
  4072. vec_ste( vecToY, 4, (float*) jointPtr2 );
  4073. vec_ste( vecToY, 8, (float*) jointPtr2 );
  4074. vec_ste( vecToY, 12, (float*) jointPtr2 );
  4075. vec_ste( vecToZ, 0, (float*) jointPtr3 );
  4076. vec_ste( vecToZ, 4, (float*) jointPtr3 );
  4077. vec_ste( vecToZ, 8, (float*) jointPtr3 );
  4078. vec_ste( vecToZ, 12, (float*) jointPtr3 );
  4079. vec_ste( vecToW, 0, (float*) jointPtr4 );
  4080. vec_ste( vecToW, 4, (float*) jointPtr4 );
  4081. vec_ste( vecToW, 8, (float*) jointPtr4 );
  4082. vec_ste( vecToW, 12, (float*) jointPtr4 );
  4083. // lerp is v1 + l * ( v2 - v1 );
4085. // the idVec3 t sits right after the four floats of q inside idJointQuat, so we can reach it by offsetting jointPtr instead of calling ToFloatPtr() again
  4085. float *jointVecPtr = (float*)( jointPtr + 4 );
  4086. float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
  4087. float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
  4088. float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
  4089. v0 = vec_ld( 0, jointVecPtr );
  4090. v1 = vec_ld( 11, jointVecPtr );
  4091. vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
  4092. v2 = vec_ld( 0, jointVecPtr2 );
  4093. v3 = vec_ld( 11, jointVecPtr2 );
  4094. vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
  4095. v4 = vec_ld( 0, jointVecPtr3 );
  4096. v5 = vec_ld( 11, jointVecPtr3 );
  4097. vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
  4098. v6 = vec_ld( 0, jointVecPtr4 );
  4099. v7 = vec_ld( 11, jointVecPtr4 );
  4100. vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
  4101. vector float vecVecX, vecVecY, vecVecZ;
  4102. vecVecX = vecVecY = vecVecZ = zeroVector;
  4103. // planarize
  4104. v0 = vec_mergeh( vecLd1, vecLd3 );
  4105. v1 = vec_mergeh( vecLd2, vecLd4 );
  4106. v3 = vec_mergel( vecLd1, vecLd3 );
  4107. v4 = vec_mergel( vecLd2, vecLd4 );
  4108. vecVecX = vec_mergeh( v0, v1 );
  4109. vecVecY = vec_mergel( v0, v1 );
  4110. vecVecZ = vec_mergeh( v3, v4 );
  4111. // load blend joint idvec3's
  4112. float *blendVecPtr = (float*)( blendPtr + 4 );
  4113. float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
  4114. float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
  4115. float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
  4116. v0 = vec_ld( 0, blendVecPtr );
  4117. v1 = vec_ld( 11, blendVecPtr );
  4118. vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
  4119. v2 = vec_ld( 0, blendVecPtr2 );
  4120. v3 = vec_ld( 11, blendVecPtr2 );
  4121. vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
  4122. v4 = vec_ld( 0, blendVecPtr3 );
  4123. v5 = vec_ld( 11, blendVecPtr3 );
  4124. vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
  4125. v6 = vec_ld( 0, blendVecPtr4 );
  4126. v7 = vec_ld( 11, blendVecPtr4 );
  4127. vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
  4128. vector float vecBlendX, vecBlendY, vecBlendZ;
  4129. vecBlendX = vecBlendY = vecBlendZ = zeroVector;
  4130. // planarize
  4131. v0 = vec_mergeh( vecLd5, vecLd7 );
  4132. v1 = vec_mergeh( vecLd6, vecLd8 );
  4133. v3 = vec_mergel( vecLd5, vecLd7 );
  4134. v4 = vec_mergel( vecLd6, vecLd8 );
  4135. vecBlendX = vec_mergeh( v0, v1 );
  4136. vecBlendY = vec_mergel( v0, v1 );
  4137. vecBlendZ = vec_mergeh( v3, v4 );
  4138. // do subtraction
  4139. vecWork1 = vec_sub( vecBlendX, vecVecX );
  4140. vecWork2 = vec_sub( vecBlendY, vecVecY );
  4141. vecWork3 = vec_sub( vecBlendZ, vecVecZ );
  4142. // multiply by lerp and add to v1
  4143. vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
  4144. vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
  4145. vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
  4146. // put it back in original form
  4147. v0 = vec_mergeh( vecVecX, vecVecZ );
  4148. v1 = vec_mergeh( vecVecY, zeroVector );
  4149. v3 = vec_mergel( vecVecX, vecVecZ );
  4150. v4 = vec_mergel( vecVecY, zeroVector );
  4151. // generate vectors to store
  4152. vecWork1 = vec_mergeh( v0, v1 );
  4153. vecWork2 = vec_mergel( v0, v1 );
  4154. vecWork3 = vec_mergeh( v3, v4 );
  4155. vector float vecWork4 = vec_mergel( v3, v4 );
  4156. // store the T values
  4157. storePerm1 = vec_lvsr( 0, jointVecPtr );
  4158. storePerm2 = vec_lvsr( 0, jointVecPtr2 );
  4159. storePerm3 = vec_lvsr( 0, jointVecPtr3 );
  4160. storePerm4 = vec_lvsr( 0, jointVecPtr4 );
  4161. // right rotate the input data
  4162. vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
  4163. vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
  4164. vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
  4165. vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
  4166. vec_ste( vecWork1, 0, (float*) jointVecPtr );
  4167. vec_ste( vecWork1, 4, (float*) jointVecPtr );
  4168. vec_ste( vecWork1, 8, (float*) jointVecPtr );
  4169. vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
  4170. vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
  4171. vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
  4172. vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
  4173. vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
  4174. vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
  4175. vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
  4176. vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
  4177. vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
  4178. }
  4179. // cleanup
  4180. for ( ; i < numJoints; i++ ) {
  4181. int j = index[i];
  4182. joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
  4183. joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
  4184. }
  4185. }
  4186. /*
  4187. ============
  4188. idSIMD_AltiVec::ConvertJointQuatsToJointMats
  4189. ============
  4190. */
4191. // SSE doesn't vectorize this, and I don't think we should either. It's mainly just copying data, there's very little math involved and
  4192. // it's not easily parallelizable
  4193. void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
  4194. for ( int i = 0; i < numJoints; i++ ) {
  4195. const float *q = jointQuats[i].q.ToFloatPtr();
  4196. float *m = jointMats[i].ToFloatPtr();
  4197. m[0*4+3] = q[4];
  4198. m[1*4+3] = q[5];
  4199. m[2*4+3] = q[6];
  4200. float x2 = q[0] + q[0];
  4201. float y2 = q[1] + q[1];
  4202. float z2 = q[2] + q[2];
  4203. {
  4204. float xx = q[0] * x2;
  4205. float yy = q[1] * y2;
  4206. float zz = q[2] * z2;
  4207. m[0*4+0] = 1.0f - yy - zz;
  4208. m[1*4+1] = 1.0f - xx - zz;
  4209. m[2*4+2] = 1.0f - xx - yy;
  4210. }
  4211. {
  4212. float yz = q[1] * z2;
  4213. float wx = q[3] * x2;
  4214. m[2*4+1] = yz - wx;
  4215. m[1*4+2] = yz + wx;
  4216. }
  4217. {
  4218. float xy = q[0] * y2;
  4219. float wz = q[3] * z2;
  4220. m[1*4+0] = xy - wz;
  4221. m[0*4+1] = xy + wz;
  4222. }
  4223. {
  4224. float xz = q[0] * z2;
  4225. float wy = q[3] * y2;
  4226. m[0*4+2] = xz - wy;
  4227. m[2*4+0] = xz + wy;
  4228. }
  4229. }
  4230. }
  4231. /*
  4232. ============
  4233. idSIMD_AltiVec::ConvertJointMatsToJointQuats
  4234. ============
  4235. */
  4236. void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
  4237. int index;
  4238. // Since we use very little of the data we have to pull in for the altivec version, we end up with
  4239. // a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
  4240. // of InvSqrt for the G5, and made it use that instead. With only this change, we get a little
4241. // better than a 50% speedup, which is not too shabby. Should really replace idMath::InvSqrt with
  4242. // my function so everyone can benefit on G5.
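// (For reference, the usual shape of such a routine is a hardware reciprocal-square-root estimate
// refined by Newton-Raphson steps, y = y0 * ( 1.5f - 0.5f * x * y0 * y0 ); each step roughly doubles
// the number of correct bits. That is the general technique only -- not necessarily what
// FastScalarInvSqrt does internally.)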
  4243. for ( index = 0; index < numJoints; index++ ) {
  4244. idJointQuat jq;
  4245. float trace;
  4246. float s;
  4247. float t;
  4248. int i;
  4249. int j;
  4250. int k;
  4251. static int next[3] = { 1, 2, 0 };
  4252. float *mat = (float*)( jointMats[index].ToFloatPtr() );
  4253. trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
  4254. if ( trace > 0.0f ) {
  4255. t = trace + 1.0f;
  4256. //s = idMath::InvSqrt( t ) * 0.5f;
  4257. s = FastScalarInvSqrt( t ) * 0.5f;
  4258. jq.q[3] = s * t;
  4259. jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
  4260. jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
  4261. jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
  4262. } else {
  4263. i = 0;
  4264. if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
  4265. i = 1;
  4266. }
  4267. if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
  4268. i = 2;
  4269. }
  4270. j = next[i];
  4271. k = next[j];
  4272. t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
  4273. //s = idMath::InvSqrt( t ) * 0.5f;
  4274. s = FastScalarInvSqrt( t ) * 0.5f;
  4275. jq.q[i] = s * t;
  4276. jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
  4277. jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
  4278. jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
  4279. }
  4280. jq.t[0] = mat[0 * 4 + 3];
  4281. jq.t[1] = mat[1 * 4 + 3];
  4282. jq.t[2] = mat[2 * 4 + 3];
  4283. jointQuats[index] = jq;
  4284. }
  4285. }
  4286. /*
  4287. ============
  4288. idSIMD_AltiVec::TransformJoints
  4289. ============
  4290. */
  4291. void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
  4292. int i;
  4293. #if 0
  4294. for( i = firstJoint; i <= lastJoint; i++ ) {
  4295. assert( parents[i] < i );
  4296. jointMats[i] *= jointMats[parents[i]];
  4297. }
  4298. #else
4299. // I don't think you can unroll this since the next iteration of the loop might depend on the previous iteration, depending
  4300. // on what the parents array looks like. This is true in the test code.
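// The madd sequence below builds each result row as a linear combination of the joint matrix's three
// rows weighted by the parent row's entries, then adds the parent's translation into the last column:
// in effect the 3x4 affine product of the two matrices with an implied [ 0 0 0 1 ] bottom row.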
  4301. for ( i = firstJoint; i <= lastJoint; i++ ) {
  4302. assert( parents[i] < i );
  4303. float *jointPtr = jointMats[i].ToFloatPtr();
  4304. float *parentPtr = jointMats[parents[i]].ToFloatPtr();
  4305. vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
  4306. vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
  4307. vector float v0, v1, v2, v3, v4, v5, v6, v7;
  4308. // we need to load up 12 float elements that make up the Mat
  4309. v0 = vec_ld( 0, jointPtr );
  4310. v1 = vec_ld( 15, jointPtr );
  4311. v2 = vec_ld( 31, jointPtr );
  4312. v3 = vec_ld( 47, jointPtr );
  4313. // load parents
  4314. v4 = vec_ld( 0, parentPtr );
  4315. v5 = vec_ld( 15, parentPtr );
  4316. v6 = vec_ld( 31, parentPtr );
  4317. v7 = vec_ld( 47, parentPtr );
  4318. // permute into vectors
  4319. vector float vecJointMat1 = vec_perm( v0, v1, permVec );
  4320. vector float vecJointMat2 = vec_perm( v1, v2, permVec );
  4321. vector float vecJointMat3 = vec_perm( v2, v3, permVec );
  4322. vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
  4323. vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
  4324. vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
  4325. vector float zero = (vector float)(0);
  4326. vector float C1, C2, C3;
  4327. // matrix multiply
  4328. C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
4329. C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(0 to 3) * a(4)
4330. C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(0 to 3) * a(8)
  4331. C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
  4332. C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
  4333. C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
  4334. C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
  4335. C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
  4336. C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
  4337. // do the addition at the end
  4338. vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
  4339. C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
  4340. C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
  4341. C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
  4342. // store results
  4343. UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
  4344. }
  4345. #endif
  4346. }
  4347. /*
  4348. ============
  4349. idSIMD_AltiVec::UntransformJoints
  4350. ============
  4351. */
  4352. void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
  4353. int i;
  4354. #if 0
  4355. for( i = lastJoint; i >= firstJoint; i-- ) {
  4356. assert( parents[i] < i );
  4357. jointMats[i] /= jointMats[parents[i]];
  4358. }
  4359. #else
4360. // I don't think you can unroll this since the next iteration of the loop might depend on the previous iteration, depending
  4361. // on what the parents array looks like. This is true in the test code.
  4362. for ( i = lastJoint; i >= firstJoint; i-- ) {
  4363. assert( parents[i] < i );
  4364. float *jointPtr = jointMats[i].ToFloatPtr();
  4365. float *parentPtr = jointMats[parents[i]].ToFloatPtr();
  4366. vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
  4367. vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
  4368. vector float v0, v1, v2, v3, v4, v5, v6, v7;
  4369. // we need to load up 12 float elements that make up the Mat
  4370. v0 = vec_ld( 0, jointPtr );
  4371. v1 = vec_ld( 15, jointPtr );
  4372. v2 = vec_ld( 31, jointPtr );
  4373. v3 = vec_ld( 47, jointPtr );
  4374. // load parents
  4375. v4 = vec_ld( 0, parentPtr );
  4376. v5 = vec_ld( 15, parentPtr );
  4377. v6 = vec_ld( 31, parentPtr );
  4378. v7 = vec_ld( 47, parentPtr );
  4379. // permute into vectors
  4380. vector float vecJointMat1 = vec_perm( v0, v1, permVec );
  4381. vector float vecJointMat2 = vec_perm( v1, v2, permVec );
  4382. vector float vecJointMat3 = vec_perm( v2, v3, permVec );
  4383. vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
  4384. vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
  4385. vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
  4386. vector float zero = (vector float)(0);
  4387. vector float C1, C2, C3;
  4388. // do subtraction at the beginning
  4389. vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
  4390. vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
  4391. vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
  4392. vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
  4393. // matrix multiply
  4394. C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
  4395. C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
  4396. C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
  4397. C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
  4398. C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
  4399. C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
  4400. C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
  4401. C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
  4402. C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
  4403. // store results back
  4404. vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
  4405. // right rotate the input data
  4406. C1 = vec_perm( C1, C1, storePerm );
  4407. C2 = vec_perm( C2, C2, storePerm );
  4408. C3 = vec_perm( C3, C3, storePerm );
  4409. vec_ste( C1, 0, (float*) jointPtr );
  4410. vec_ste( C1, 4, (float*) jointPtr );
  4411. vec_ste( C1, 8, (float*) jointPtr );
  4412. vec_ste( C1, 12, (float*) jointPtr );
  4413. vec_ste( C2, 16, (float*) jointPtr );
  4414. vec_ste( C2, 20, (float*) jointPtr );
  4415. vec_ste( C2, 24, (float*) jointPtr );
  4416. vec_ste( C2, 28, (float*) jointPtr );
  4417. vec_ste( C3, 32, (float*) jointPtr );
  4418. vec_ste( C3, 36, (float*) jointPtr );
  4419. vec_ste( C3, 40, (float*) jointPtr );
  4420. vec_ste( C3, 44, (float*) jointPtr );
  4421. }
  4422. #endif
  4423. }
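
// The store sequence at the bottom of the loop above is the complementary misaligned-store idiom:
// vec_lvsr gives the rotate needed to move each element back to its natural byte offset, and
// vec_ste then writes one 4-byte element at a time, so memory around the 12 stored floats is never
// touched. A minimal sketch for a single group of four floats (illustrative only; storeUnaligned4
// is a hypothetical helper name):
#if 0
static inline void storeUnaligned4( float *ptr, vector float v ) {
	vector unsigned char perm = vec_lvsr( 0, ptr );	// rotate amount based on ptr's alignment
	v = vec_perm( v, v, perm );						// pre-rotate so each lane lands on its own address
	vec_ste( v, 0, ptr );							// element stores: 16 bytes total, 4 bytes at a time
	vec_ste( v, 4, ptr );
	vec_ste( v, 8, ptr );
	vec_ste( v, 12, ptr );
}
#endif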

/*
============
idSIMD_AltiVec::TransformVerts
============
*/
// Here we don't have much for the vector unit to do, and the gain we get from doing the math
// in parallel is eaten by doing unaligned stores.
void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
	int i, j;
	const byte *jointsPtr = (byte *)joints;

	for( j = i = 0; i < numVerts; i++ ) {
		idVec3 v;
		float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
		float *weightPtr = (float*) weights[j].ToFloatPtr();

		v[0] = matPtrOrig[0] * weightPtr[0];
		v[0] += matPtrOrig[1] * weightPtr[1];
		v[0] += matPtrOrig[2] * weightPtr[2];
		v[0] += matPtrOrig[3] * weightPtr[3];

		v[1] = matPtrOrig[4] * weightPtr[0];
		v[1] += matPtrOrig[5] * weightPtr[1];
		v[1] += matPtrOrig[6] * weightPtr[2];
		v[1] += matPtrOrig[7] * weightPtr[3];

		v[2] = matPtrOrig[8] * weightPtr[0];
		v[2] += matPtrOrig[9] * weightPtr[1];
		v[2] += matPtrOrig[10] * weightPtr[2];
		v[2] += matPtrOrig[11] * weightPtr[3];

		while( index[j*2+1] == 0 ) {
			j++;
			float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
			weightPtr = (float*) weights[j].ToFloatPtr();

			v[0] += matPtr[0] * weightPtr[0];
			v[0] += matPtr[1] * weightPtr[1];
			v[0] += matPtr[2] * weightPtr[2];
			v[0] += matPtr[3] * weightPtr[3];

			v[1] += matPtr[4] * weightPtr[0];
			v[1] += matPtr[5] * weightPtr[1];
			v[1] += matPtr[6] * weightPtr[2];
			v[1] += matPtr[7] * weightPtr[3];

			v[2] += matPtr[8] * weightPtr[0];
			v[2] += matPtr[9] * weightPtr[1];
			v[2] += matPtr[10] * weightPtr[2];
			v[2] += matPtr[11] * weightPtr[3];
		}
		j++;
		verts[i].xyz = v;
	}
}
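
// Reading from the loop above: each weight j evaluates one row of the 3x4 joint matrix against the
// idVec4 weight, i.e. v.x += m[0]*w[0] + m[1]*w[1] + m[2]*w[2] + m[3]*w[3] (likewise for rows 1 and 2),
// with w[3] scaling the translation column. index[] is consumed in pairs: index[j*2+0] is a byte offset
// into the joint matrix array and index[j*2+1] stays zero while more weights follow for the same vertex.
// A compact scalar restatement of the per-weight accumulation (illustrative only, derived from the loop):
#if 0
	const idJointMat &m = *(const idJointMat *)( jointsPtr + index[j*2] );
	const float *mp = m.ToFloatPtr();
	const float *w  = weights[j].ToFloatPtr();
	for ( int r = 0; r < 3; r++ ) {
		v[r] += mp[r*4+0]*w[0] + mp[r*4+1]*w[1] + mp[r*4+2]*w[2] + mp[r*4+3]*w[3];
	}
#endif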
#endif /* LIVE_VICARIOUSLY */
#ifdef ENABLE_CULL

#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::TracePointCull
============
*/
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	byte tOr;
	tOr = 0;

	// pointers
	const float *planePtr = planes[0].ToFloatPtr();

	vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
	vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
	vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
	vector unsigned char vecPerm;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	vector float zeroVector = (vector float)(0);
	vector float vecRadius;
	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
	vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
	vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
	vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
	vector bool int oneIntVector = (vector bool int)(1);
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
	vector unsigned int vecTotals;
	vector unsigned int tempIntSum;
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

	// populate planes
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 4 );
	v3 = vec_ld( 15, planePtr + 4 );
	vecPlane1 = vec_perm( v2, v3, vecPerm );
	v0 = vec_ld( 0, planePtr + 8 );
	v1 = vec_ld( 15, planePtr + 8 );
	vecPlane2 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 12 );
	v3 = vec_ld( 15, planePtr + 12 );
	vecPlane3 = vec_perm( v2, v3, vecPerm );

	// transpose
	v0 = vec_mergeh( vecPlane0, vecPlane2 );
	v1 = vec_mergeh( vecPlane1, vecPlane3 );
	v2 = vec_mergel( vecPlane0, vecPlane2 );
	v3 = vec_mergel( vecPlane1, vecPlane3 );
	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	// load constants
	vecRadius = loadSplatUnalignedScalar( &radius );

	unsigned int cullBitVal[4];
	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
	int i = 0;

	// every fourth one will have the same alignment. Make sure we've got enough here
	if ( i+3 < numVerts ) {
		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}

	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 15, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v3 = vec_ld( 15, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v5 = vec_ld( 15, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );
		v7 = vec_ld( 15, vertPtr4 );

		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );

		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
		vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );

		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
		vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );

		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
		vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );

		// vec1Sum1 now holds d0, d1, d2, d3. calculate the
		// difference with +radius and -radius
		vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
		vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
		vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
		vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
		vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
		vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
		vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
		vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );

		// do compare
		vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
		vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
		vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
		vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
		vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
		vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
		vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
		vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );
		vecCmp3 = vec_and( vecCmp3, oneIntVector );
		vecCmp4 = vec_and( vecCmp4, oneIntVector );
		vecCmp5 = vec_and( vecCmp5, oneIntVector );
		vecCmp6 = vec_and( vecCmp6, oneIntVector );
		vecCmp7 = vec_and( vecCmp7, oneIntVector );
		vecCmp8 = vec_and( vecCmp8, oneIntVector );

		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );

		// OR (add) them all together
		vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
		vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
		vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
		vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );

		vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
		vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
		tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
		vecTotals = vec_mergeh( vecTotals, tempIntSum );
		tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
		tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );

		// store out results
		vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
		tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
		vec_ste( tempSt, 0, &cullBitVal[0] );
		vec_ste( tempSt, 4, &cullBitVal[0] );
		vec_ste( tempSt, 8, &cullBitVal[0] );
		vec_ste( tempSt, 12, &cullBitVal[0] );

		tOr |= cullBitVal[0];
		tOr |= cullBitVal[1];
		tOr |= cullBitVal[2];
		tOr |= cullBitVal[3];

		cullBits[i] = cullBitVal[0];
		cullBits[i+1] = cullBitVal[1];
		cullBits[i+2] = cullBitVal[2];
		cullBits[i+3] = cullBitVal[3];
	}

	// cleanup
	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1, d2, d3, t;
		const idVec3 &v = verts[i].xyz;

		d0 = planes[0].Distance( v );
		d1 = planes[1].Distance( v );
		d2 = planes[2].Distance( v );
		d3 = planes[3].Distance( v );

		t = d0 + radius;
		bits = FLOATSIGNBITSET( t ) << 0;
		t = d1 + radius;
		bits |= FLOATSIGNBITSET( t ) << 1;
		t = d2 + radius;
		bits |= FLOATSIGNBITSET( t ) << 2;
		t = d3 + radius;
		bits |= FLOATSIGNBITSET( t ) << 3;

		t = d0 - radius;
		bits |= FLOATSIGNBITSET( t ) << 4;
		t = d1 - radius;
		bits |= FLOATSIGNBITSET( t ) << 5;
		t = d2 - radius;
		bits |= FLOATSIGNBITSET( t ) << 6;
		t = d3 - radius;
		bits |= FLOATSIGNBITSET( t ) << 7;

		bits ^= 0x0F;	// flip lower four bits

		tOr |= bits;
		cullBits[i] = bits;
	}

	totalOr = tOr;
}
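
// How the per-vertex byte is built above: after the plane transpose, vecPlane0..3 hold the x, y, z
// and d components of all four planes, so splatting one vertex component and madding accumulates all
// four plane distances in one vector. Each compare lane is then masked to 1, shifted left by its lane
// index (vecShift1 = 0,1,2,3 for d+radius, vecShift2 = 4,5,6,7 for d-radius) and the vec_sld/vec_add
// folds sum the lanes so element 0 carries the whole 8-bit mask. A scalar restatement for one vertex
// (illustrative only; dPlusRadius/dMinusRadius stand for the four distances +/- radius):
#if 0
	unsigned int mask = 0;
	for ( int j = 0; j < 4; j++ ) {
		mask += ( dPlusRadius[j] < 0.0f ) << j;			// bits 0..3
		mask += ( dMinusRadius[j] < 0.0f ) << ( j + 4 );	// bits 4..7
	}
	mask ^= 0x0F;	// same flip of the lower four bits the vector path applies with vecFlipBits
#endif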
#else
/*
============
idSIMD_AltiVec::TracePointCull
============
*/
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	byte tOr;
	tOr = 0;

	// pointers
	const float *planePtr = planes[0].ToFloatPtr();

	vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
	vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
	vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
	vector unsigned char vecPerm;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	vector float zeroVector = (vector float)(0);
	vector float vecRadius;
	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
	vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
	vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
	vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
	vector bool int oneIntVector = (vector bool int)(1);
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
	vector unsigned int vecTotals;
	vector unsigned int tempIntSum;
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

	// populate planes
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 4 );
	v3 = vec_ld( 15, planePtr + 4 );
	vecPlane1 = vec_perm( v2, v3, vecPerm );
	v0 = vec_ld( 0, planePtr + 8 );
	v1 = vec_ld( 15, planePtr + 8 );
	vecPlane2 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 12 );
	v3 = vec_ld( 15, planePtr + 12 );
	vecPlane3 = vec_perm( v2, v3, vecPerm );

	// transpose
	v0 = vec_mergeh( vecPlane0, vecPlane2 );
	v1 = vec_mergeh( vecPlane1, vecPlane3 );
	v2 = vec_mergel( vecPlane0, vecPlane2 );
	v3 = vec_mergel( vecPlane1, vecPlane3 );
	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	// load constants
	vecRadius = loadSplatUnalignedScalar( &radius );

	unsigned int cullBitVal[4];
	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
	int i = 0;

	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		vecXYZ1 = vec_ld( 0, vertPtr );
		vecXYZ2 = vec_ld( 0, vertPtr2 );
		vecXYZ3 = vec_ld( 0, vertPtr3 );
		vecXYZ4 = vec_ld( 0, vertPtr4 );

		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
		vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );

		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
		vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );

		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
		vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );

		// vec1Sum1 now holds d0, d1, d2, d3. calculate the
		// difference with +radius and -radius
		vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
		vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
		vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
		vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
		vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
		vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
		vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
		vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );

		// do compare
		vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
		vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
		vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
		vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
		vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
		vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
		vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
		vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );
		vecCmp3 = vec_and( vecCmp3, oneIntVector );
		vecCmp4 = vec_and( vecCmp4, oneIntVector );
		vecCmp5 = vec_and( vecCmp5, oneIntVector );
		vecCmp6 = vec_and( vecCmp6, oneIntVector );
		vecCmp7 = vec_and( vecCmp7, oneIntVector );
		vecCmp8 = vec_and( vecCmp8, oneIntVector );

		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );

		// OR (add) them all together
		vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
		vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
		vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
		vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );

		vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
		vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
		tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
		vecTotals = vec_mergeh( vecTotals, tempIntSum );
		tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
		tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );

		// store out results
		vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
		tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
		vec_ste( tempSt, 0, &cullBitVal[0] );
		vec_ste( tempSt, 4, &cullBitVal[0] );
		vec_ste( tempSt, 8, &cullBitVal[0] );
		vec_ste( tempSt, 12, &cullBitVal[0] );

		tOr |= cullBitVal[0];
		tOr |= cullBitVal[1];
		tOr |= cullBitVal[2];
		tOr |= cullBitVal[3];

		cullBits[i] = cullBitVal[0];
		cullBits[i+1] = cullBitVal[1];
		cullBits[i+2] = cullBitVal[2];
		cullBits[i+3] = cullBitVal[3];
	}

	// cleanup
	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1, d2, d3, t;
		const idVec3 &v = verts[i].xyz;

		d0 = planes[0].Distance( v );
		d1 = planes[1].Distance( v );
		d2 = planes[2].Distance( v );
		d3 = planes[3].Distance( v );

		t = d0 + radius;
		bits = FLOATSIGNBITSET( t ) << 0;
		t = d1 + radius;
		bits |= FLOATSIGNBITSET( t ) << 1;
		t = d2 + radius;
		bits |= FLOATSIGNBITSET( t ) << 2;
		t = d3 + radius;
		bits |= FLOATSIGNBITSET( t ) << 3;

		t = d0 - radius;
		bits |= FLOATSIGNBITSET( t ) << 4;
		t = d1 - radius;
		bits |= FLOATSIGNBITSET( t ) << 5;
		t = d2 - radius;
		bits |= FLOATSIGNBITSET( t ) << 6;
		t = d3 - radius;
		bits |= FLOATSIGNBITSET( t ) << 7;

		bits ^= 0x0F;	// flip lower four bits

		tOr |= bits;
		cullBits[i] = bits;
	}

	totalOr = tOr;
}
#endif /* DRAWVERT_PADDED */

#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::DecalPointCull
============
*/
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	int i;
	const float *planePtr = planes[0].ToFloatPtr();

	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
	vector float zeroVector = (vector float)(0.0);
	vector unsigned char vecPerm;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;

	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

	// populate planes
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 4 );
	v3 = vec_ld( 15, planePtr + 4 );
	vecPlane1 = vec_perm( v2, v3, vecPerm );
	v0 = vec_ld( 0, planePtr + 8 );
	v1 = vec_ld( 15, planePtr + 8 );
	vecPlane2 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 12 );
	v3 = vec_ld( 15, planePtr + 12 );
	vecPlane3 = vec_perm( v2, v3, vecPerm );
	v0 = vec_ld( 0, planePtr + 16 );
	v1 = vec_ld( 15, planePtr + 16 );
	vecPlane4 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 20 );
	v3 = vec_ld( 15, planePtr + 20 );
	vecPlane5 = vec_perm( v2, v3, vecPerm );

	// transpose
	v0 = vec_mergeh( vecPlane0, vecPlane2 );
	v1 = vec_mergeh( vecPlane1, vecPlane3 );
	v2 = vec_mergel( vecPlane0, vecPlane2 );
	v3 = vec_mergel( vecPlane1, vecPlane3 );
	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	v0 = vec_mergeh( vecPlane4, zeroVector );
	v1 = vec_mergeh( vecPlane5, zeroVector );
	v2 = vec_mergel( vecPlane4, zeroVector );
	v3 = vec_mergel( vecPlane5, zeroVector );
	vecPlane4 = vec_mergeh( v0, v1 );
	vecPlane5 = vec_mergel( v0, v1 );
	vecPlane6 = vec_mergeh( v2, v3 );
	vecPlane7 = vec_mergel( v2, v3 );

	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector bool int oneIntVector = (vector bool int)(1);
	vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
	vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
	vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
	vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
	vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
	vector unsigned int vecR1, vecR2, vecR3, vecR4;
	vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

	unsigned int vBits[4];
	vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
	i = 0;

	// every fourth one will have the same alignment. Make sure we've got enough here
	if ( i+3 < numVerts ) {
		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}

	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 15, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v3 = vec_ld( 15, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v5 = vec_ld( 15, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );
		v7 = vec_ld( 15, vertPtr4 );

		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );

		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
		vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );

		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
		vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );

		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
		vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );

		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
		vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );

		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
		vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );

		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
		vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );

		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
		vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );

		vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
		vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
		vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
		vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
		vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
		vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
		vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
		vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );
		vecCmp3 = vec_and( vecCmp3, oneIntVector );
		vecCmp4 = vec_and( vecCmp4, oneIntVector );
		vecCmp5 = vec_and( vecCmp5, oneIntVector );
		vecCmp6 = vec_and( vecCmp6, oneIntVector );
		vecCmp7 = vec_and( vecCmp7, oneIntVector );
		vecCmp8 = vec_and( vecCmp8, oneIntVector );

		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );

		// OR them all together (this is the same as adding them, since they each have only 1 bit set)
		vecR1 = (vector unsigned int)(0); //zeroIntVector;
		vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
		vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
		vecR1 = vec_add( vecR1, vecBitShifted2 );
		vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

		vecR2 = (vector unsigned int)(0); //zeroIntVector;
		vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
		vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
		vecR2 = vec_add( vecR2, vecBitShifted4 );
		vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );

		vecR3 = (vector unsigned int)(0); //zeroIntVector;
		vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
		vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
		vecR3 = vec_add( vecR3, vecBitShifted6 );
		vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );

		vecR4 = (vector unsigned int)(0); //zeroIntVector;
		vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
		vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
		vecR4 = vec_add( vecR4, vecBitShifted8 );
		vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );

		// take the first element from each vector and put them into vecR1
		vecR1 = vec_mergeh( vecR1, vecR2 );
		vecR3 = vec_mergeh( vecR3, vecR4 );
		vecR1 = vec_perm( vecR1, vecR3, permHalves );

		// XOR with 0x3F to flip lower 6 bits
		vecR1 = vec_xor( vecR1, vecFlipBits );

		// store out results. we don't have 16 at a time so let's just
		// do this and avoid alignment concerns
		vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
		vec_ste( vecR1, 0, &vBits[0] );
		vec_ste( vecR1, 4, &vBits[0] );
		vec_ste( vecR1, 8, &vBits[0] );
		vec_ste( vecR1, 12, &vBits[0] );

		cullBits[i] = vBits[0];
		cullBits[i+1] = vBits[1];
		cullBits[i+2] = vBits[2];
		cullBits[i+3] = vBits[3];
	}

	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1, d2, d3, d4, d5;
		const idVec3 &v = verts[i].xyz;

		d0 = planes[0].Distance( v );
		d1 = planes[1].Distance( v );
		d2 = planes[2].Distance( v );
		d3 = planes[3].Distance( v );
		d4 = planes[4].Distance( v );
		d5 = planes[5].Distance( v );

		// FLOATSIGNBITSET checks whether the sign bit is set by casting to long and shifting right 31 places.
		bits = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		bits |= FLOATSIGNBITSET( d2 ) << 2;
		bits |= FLOATSIGNBITSET( d3 ) << 3;
		bits |= FLOATSIGNBITSET( d4 ) << 4;
		bits |= FLOATSIGNBITSET( d5 ) << 5;

		cullBits[i] = bits ^ 0x3F;	// flip lower 6 bits
	}
}
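
// DecalPointCull evaluates six planes per vertex: the first four exactly as in TracePointCull, and
// planes 4-5 through a second transposed set (vecPlane4..7) whose lanes 2 and 3 are zero because there
// is no seventh or eighth plane. vecShift2 = (4,5,0,0) therefore maps only lanes 0 and 1 of the second
// compare into bits 4 and 5; the zero lanes compare false and contribute nothing. A scalar restatement
// of the byte produced for one vertex (illustrative only; the vector path tests d < 0, the cleanup loop
// uses FLOATSIGNBITSET, which agrees for all non-zero distances):
#if 0
	unsigned int mask = 0;
	for ( int j = 0; j < 6; j++ ) {
		mask |= ( planes[j].Distance( v ) < 0.0f ) << j;
	}
	mask ^= 0x3F;	// same flip of the lower six bits the vector path applies with vecFlipBits
#endif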
#else
/*
============
idSIMD_AltiVec::DecalPointCull
============
*/
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	int i;
	const float *planePtr = planes[0].ToFloatPtr();

	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
	vector float zeroVector = (vector float)(0.0);
	vector unsigned char vecPerm;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;

	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );

	// populate planes
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 4 );
	v3 = vec_ld( 15, planePtr + 4 );
	vecPlane1 = vec_perm( v2, v3, vecPerm );
	v0 = vec_ld( 0, planePtr + 8 );
	v1 = vec_ld( 15, planePtr + 8 );
	vecPlane2 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 12 );
	v3 = vec_ld( 15, planePtr + 12 );
	vecPlane3 = vec_perm( v2, v3, vecPerm );
	v0 = vec_ld( 0, planePtr + 16 );
	v1 = vec_ld( 15, planePtr + 16 );
	vecPlane4 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 0, planePtr + 20 );
	v3 = vec_ld( 15, planePtr + 20 );
	vecPlane5 = vec_perm( v2, v3, vecPerm );

	// transpose
	v0 = vec_mergeh( vecPlane0, vecPlane2 );
	v1 = vec_mergeh( vecPlane1, vecPlane3 );
	v2 = vec_mergel( vecPlane0, vecPlane2 );
	v3 = vec_mergel( vecPlane1, vecPlane3 );
	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	v0 = vec_mergeh( vecPlane4, zeroVector );
	v1 = vec_mergeh( vecPlane5, zeroVector );
	v2 = vec_mergel( vecPlane4, zeroVector );
	v3 = vec_mergel( vecPlane5, zeroVector );
	vecPlane4 = vec_mergeh( v0, v1 );
	vecPlane5 = vec_mergel( v0, v1 );
	vecPlane6 = vec_mergeh( v2, v3 );
	vecPlane7 = vec_mergel( v2, v3 );

	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector bool int oneIntVector = (vector bool int)(1);
	vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
	vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
	vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
	vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
	vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
	vector unsigned int vecR1, vecR2, vecR3, vecR4;
	vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

	unsigned int vBits[4];
	vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
	i = 0;

	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		// padded drawverts are aligned, so load the xyz directly into the working vectors
		vecXYZ1 = vec_ld( 0, vertPtr );
		vecXYZ2 = vec_ld( 0, vertPtr2 );
		vecXYZ3 = vec_ld( 0, vertPtr3 );
		vecXYZ4 = vec_ld( 0, vertPtr4 );

		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );

		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
		vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );

		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
		vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );

		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
		vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );

		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
		vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );

		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
		vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );

		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
		vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );

		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
		vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );

		vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
		vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
		vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
		vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
		vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
		vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
		vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
		vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );
		vecCmp3 = vec_and( vecCmp3, oneIntVector );
		vecCmp4 = vec_and( vecCmp4, oneIntVector );
		vecCmp5 = vec_and( vecCmp5, oneIntVector );
		vecCmp6 = vec_and( vecCmp6, oneIntVector );
		vecCmp7 = vec_and( vecCmp7, oneIntVector );
		vecCmp8 = vec_and( vecCmp8, oneIntVector );

		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );

		// OR them all together (this is the same as adding them, since they each have only 1 bit set)
		vecR1 = (vector unsigned int)(0); //zeroIntVector;
		vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
		vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
		vecR1 = vec_add( vecR1, vecBitShifted2 );
		vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

		vecR2 = (vector unsigned int)(0); //zeroIntVector;
		vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
		vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
		vecR2 = vec_add( vecR2, vecBitShifted4 );
		vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );

		vecR3 = (vector unsigned int)(0); //zeroIntVector;
		vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
		vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
		vecR3 = vec_add( vecR3, vecBitShifted6 );
		vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );

		vecR4 = (vector unsigned int)(0); //zeroIntVector;
		vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
		vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
		vecR4 = vec_add( vecR4, vecBitShifted8 );
		vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );

		// take the first element from each vector and put them into vecR1
		vecR1 = vec_mergeh( vecR1, vecR2 );
		vecR3 = vec_mergeh( vecR3, vecR4 );
		vecR1 = vec_perm( vecR1, vecR3, permHalves );

		// XOR with 0x3F to flip lower 6 bits
		vecR1 = vec_xor( vecR1, vecFlipBits );

		// store out results. we don't have 16 at a time so let's just
		// do this and avoid alignment concerns
		vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
		vec_ste( vecR1, 0, &vBits[0] );
		vec_ste( vecR1, 4, &vBits[0] );
		vec_ste( vecR1, 8, &vBits[0] );
		vec_ste( vecR1, 12, &vBits[0] );

		cullBits[i] = vBits[0];
		cullBits[i+1] = vBits[1];
		cullBits[i+2] = vBits[2];
		cullBits[i+3] = vBits[3];
	}

	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1, d2, d3, d4, d5;
		const idVec3 &v = verts[i].xyz;

		d0 = planes[0].Distance( v );
		d1 = planes[1].Distance( v );
		d2 = planes[2].Distance( v );
		d3 = planes[3].Distance( v );
		d4 = planes[4].Distance( v );
		d5 = planes[5].Distance( v );

		// FLOATSIGNBITSET checks whether the sign bit is set by casting to long and shifting right 31 places.
		bits = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		bits |= FLOATSIGNBITSET( d2 ) << 2;
		bits |= FLOATSIGNBITSET( d3 ) << 3;
		bits |= FLOATSIGNBITSET( d4 ) << 4;
		bits |= FLOATSIGNBITSET( d5 ) << 5;

		cullBits[i] = bits ^ 0x3F;	// flip lower 6 bits
	}
}
#endif /* DRAWVERT_PADDED */

#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::OverlayPointCull
============
*/
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	int i;
	float p0x, p0y, p0z, p0d;
	float p1x, p1y, p1z, p1d;
	const float *planePtr = planes[0].ToFloatPtr();
	const float *vertPtr = verts[0].xyz.ToFloatPtr();

	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	vector unsigned char vecPerm;
	vector float zeroVector = (vector float)(0);

	p0x = *(planePtr + 0);
	p0y = *(planePtr + 1);
	p0z = *(planePtr + 2);
	p0d = *(planePtr + 3);
	p1x = *(planePtr + 4);
	p1y = *(planePtr + 5);
	p1z = *(planePtr + 6);
	p1d = *(planePtr + 7);

	// populate the planes
	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );
	v2 = vec_ld( 31, planePtr );
	vecPlane1 = vec_perm( v1, v2, vecPerm );

	// transpose
	v0 = vec_mergeh( vecPlane0, vecPlane0 );
	v1 = vec_mergeh( vecPlane1, vecPlane1 );
	v2 = vec_mergel( vecPlane0, vecPlane0 );
	v3 = vec_mergel( vecPlane1, vecPlane1 );
	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector float oneVector = (vector float)(1);
	vector float vecSum1, vecSum2, vecSum1Inv, vecSum2Inv;
	vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
	vector float negTwoVector = (vector float)(-2);
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
	vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
	vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
	vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
	vector bool int oneIntVector = (vector bool int)(1);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;

	unsigned int cullBitVal[4];
	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
	i = 0;

	// every fourth one will have the same alignment. Make sure we've got enough here
	if ( i+3 < numVerts ) {
		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}

	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 15, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v3 = vec_ld( 15, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v5 = vec_ld( 15, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );
		v7 = vec_ld( 15, vertPtr4 );

		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );

		// like a splat, but only doing halves
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ), vecPlane1, vecSum1 );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
		vecSum1 = vec_add( vecSum1, vecPlane3 );

		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ), vecPlane1, vecSum2 );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
		vecSum2 = vec_add( vecSum2, vecPlane3 );

		// store out results
		UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );

		// bit manipulation
		vecCmp1 = vec_cmplt( vecSum1, zeroVector );
		vecCmp2 = vec_cmplt( vecSum2, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );

		// store out and write to cullBits
		// finally, a use for algebra! 1-x = x + 1 - 2x
		vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
		vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
		vecSum1Inv = vec_add( vecSum1Inv, oneVector );
		vecSum2Inv = vec_add( vecSum2Inv, oneVector );

		// do the same comparisons for the inverted d0/d1
		vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
		vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
		vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );

		// shift them as needed
		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
		vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
		vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );

		// OR them all together. since only 1 bit is set for each value, that's
		// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
		vector unsigned int vecResult;
		vector unsigned int vecResult2;
		vector unsigned int vecResult3;
		vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
		vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

		// vecResult now holds the values without the inverses yet, so add those
		vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
		vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
		vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
		vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
		vecResult = vec_add( vecResult, vecResult2 );

		// store out results
		vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
		vec_ste( vecResult, 0, &cullBitVal[0] );
		vec_ste( vecResult, 4, &cullBitVal[0] );
		vec_ste( vecResult, 8, &cullBitVal[0] );
		vec_ste( vecResult, 12, &cullBitVal[0] );

		cullBits[i] = cullBitVal[0];
		cullBits[i+1] = cullBitVal[1];
		cullBits[i+2] = cullBitVal[2];
		cullBits[i+3] = cullBitVal[3];
	}

	// cleanup
	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1;
		float vx, vy, vz;

		vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
		vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
		vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );

		d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
		d1 = p1x * vx + p1y * vy + p1z * vz + p1d;

		texCoords[i][0] = d0;
		texCoords[i][1] = d1;

		bits = ( d0 >= 0 ) ? 0 : 1;
		d0 = 1.0f - d0;
		bits |= ( d1 >= 0 ) ? 0 : 1*2;
		d1 = 1.0f - d1;
		bits |= ( d0 >= 0 ) ? 0 : 1*4;
		bits |= ( d1 >= 0 ) ? 0 : 1*8;

		cullBits[i] = bits;
	}
}
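
// The "1-x = x + 1 - 2x" comment above is how the inverted overlay distances are formed without a
// separate subtract-from-one: vec_madd( d, -2, d ) yields -d, and adding the one-vector gives 1-d.
// The resulting byte matches the scalar cleanup loop: bits 0/1 are set when d0/d1 are negative, bits
// 2/3 when 1-d0 / 1-d1 are negative. A minimal scalar sketch of the same computation for one pair of
// distances (illustrative only):
#if 0
	float d0Inv = 1.0f - d0;	// computed vectorised as d0*(-2) + d0, then + 1
	float d1Inv = 1.0f - d1;
	byte bits = ( d0 < 0.0f ) << 0;
	bits |= ( d1 < 0.0f ) << 1;
	bits |= ( d0Inv < 0.0f ) << 2;
	bits |= ( d1Inv < 0.0f ) << 3;
#endif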
  5418. #else
  5419. /*
  5420. ============
  5421. idSIMD_AltiVec::OverlayPointCull
  5422. ============
  5423. */
  5424. void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
  5425. // idDrawVert size
  5426. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
  5427. int i;
  5428. float p0x, p0y, p0z, p0d;
  5429. float p1x, p1y, p1z, p1d;
  5430. const float *planePtr = planes[0].ToFloatPtr();
  5431. const float *vertPtr = verts[0].xyz.ToFloatPtr();
  5432. vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
  5433. vector float v0, v1, v2, v3, v4, v5, v6, v7;
  5434. vector unsigned char vecPerm;
  5435. vector float zeroVector = (vector float)(0);
  5436. p0x = *(planePtr + 0);
  5437. p0y = *(planePtr + 1);
  5438. p0z = *(planePtr + 2);
  5439. p0d = *(planePtr + 3);
  5440. p1x = *(planePtr + 4);
  5441. p1y = *(planePtr + 5);
  5442. p1z = *(planePtr + 6);
  5443. p1d = *(planePtr + 7);
  5444. // populate the planes
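// unaligned-load idiom used throughout this file: vec_ld always rounds the
// address down to a 16-byte boundary, so two adjacent quadwords are loaded and
// vec_perm shifts them so that element 0 of the result is *planePtr. The
// vec_lvsl( -1, ptr ) + 1 form gives the same shuffle mask as vec_lvsl( 0, ptr )
// for unaligned pointers, and for an already aligned pointer it simply selects
// everything from the second load.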
  5445. vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
  5446. v0 = vec_ld( 0, planePtr );
  5447. v1 = vec_ld( 15, planePtr );
  5448. vecPlane0 = vec_perm( v0, v1, vecPerm );
  5449. v2 = vec_ld( 31, planePtr );
  5450. vecPlane1 = vec_perm( v1, v2, vecPerm );
  5451. // transpose
  5452. v0 = vec_mergeh( vecPlane0, vecPlane0 );
  5453. v1 = vec_mergeh( vecPlane1, vecPlane1 );
  5454. v2 = vec_mergel( vecPlane0, vecPlane0 );
  5455. v3 = vec_mergel( vecPlane1, vecPlane1);
  5456. vecPlane0 = vec_mergeh( v0, v1 );
  5457. vecPlane1 = vec_mergel( v0, v1 );
  5458. vecPlane2 = vec_mergeh( v2, v3 );
  5459. vecPlane3 = vec_mergel( v2, v3 );
  5460. vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
  5461. vector float oneVector = (vector float)(1);
  5462. vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
  5463. vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
  5464. vector float negTwoVector = (vector float)(-2);
  5465. vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
  5466. vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
  5467. vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
  5468. vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
  5469. vector bool int oneIntVector = (vector bool int)(1);
  5470. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  5471. unsigned int cullBitVal[4];
  5472. vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
  5473. i = 0;
  5474. for ( ; i+3 < numVerts; i+=4 ) {
  5475. const float *vertPtr = verts[i].xyz.ToFloatPtr();
  5476. const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
  5477. const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
  5478. const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
  5479. vecXYZ1 = vec_ld( 0, vertPtr );
  5480. vecXYZ2 = vec_ld( 0, vertPtr2 );
  5481. vecXYZ3 = vec_ld( 0, vertPtr3 );
  5482. vecXYZ4 = vec_ld( 0, vertPtr4 );
  5483. // like a splat, but only doing halves
  5484. vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
  5485. vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
  5486. vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
  5487. vecSum1 = vec_add( vecSum1, vecPlane3 );
  5488. vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
  5489. vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
  5490. vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
  5491. vecSum2 = vec_add( vecSum2, vecPlane3 );
  5492. // store out results
  5493. UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
  5494. // bit manipulation
  5495. vecCmp1 = vec_cmplt( vecSum1, zeroVector );
  5496. vecCmp2 = vec_cmplt( vecSum2, zeroVector );
5497. // AND it with 1 so we multiply by 1 rather than the all-ones compare mask
  5498. vecCmp1 = vec_and( vecCmp1, oneIntVector );
  5499. vecCmp2 = vec_and( vecCmp2, oneIntVector );
  5500. // store out and write to cullBits
  5501. // finally, a use for algebra! 1-x = x + 1 - 2x
  5502. vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
  5503. vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
  5504. vecSum1Inv = vec_add( vecSum1Inv, oneVector );
  5505. vecSum2Inv = vec_add( vecSum2Inv, oneVector );
  5506. // do the same comparisons for the inverted d0/d1
  5507. vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
  5508. vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
5510. // AND it with 1 so we multiply by 1 rather than the all-ones compare mask
  5510. vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
  5511. vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
  5512. // shift them as needed
  5513. vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
  5514. vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
  5515. vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
  5516. vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
5517. // OR them all together. Since only one bit is set for each value, that's
5518. // the same as adding them: add up d0 + d1 + d0Inv + d1Inv
  5519. vector unsigned int vecResult;
  5520. vector unsigned int vecResult2;
  5521. vector unsigned int vecResult3;
  5522. vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
  5523. vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
  5524. // vecResult now holds the values without the inverses yet, so add those
  5525. vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
  5526. vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
  5527. vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
  5528. vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
  5529. vecResult = vec_add( vecResult, vecResult2 );
  5530. //store out results
  5531. vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
  5532. vec_ste( vecResult, 0, &cullBitVal[0] );
  5533. vec_ste( vecResult, 4, &cullBitVal[0] );
  5534. vec_ste( vecResult, 8, &cullBitVal[0] );
  5535. vec_ste( vecResult, 12, &cullBitVal[0] );
  5536. cullBits[i] = cullBitVal[0];
  5537. cullBits[i+1] = cullBitVal[1];
  5538. cullBits[i+2] = cullBitVal[2];
  5539. cullBits[i+3] = cullBitVal[3];
  5540. }
  5541. // cleanup
  5542. for ( ; i < numVerts; i++ ) {
  5543. byte bits;
  5544. float d0, d1;
  5545. float vx, vy, vz;
  5546. vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
  5547. vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
  5548. vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
  5549. d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
  5550. d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
  5551. texCoords[i][0] = d0;
  5552. texCoords[i][1] = d1;
  5553. bits = ( d0 >= 0 ) ? 0 : 1;
  5554. d0 = 1.0f - d0;
  5555. bits |= ( d1 >= 0 ) ? 0 : 1*2;
  5556. d1 = 1.0f - d1;
  5557. bits |= ( d0 >= 0 ) ? 0: 1*4;
  5558. bits |= ( d1 >= 0 ) ? 0: 1*8;
  5559. cullBits[i] = bits;
  5560. }
  5561. }
  5562. #endif /* DRAWVERT_PADDED */
  5563. #endif /* ENABLE_CULL */
  5564. #ifdef ENABLE_DERIVE
  5565. /*
  5566. ============
  5567. idSIMD_AltiVec::DeriveTriPlanes
  5568. Derives a plane equation for each triangle.
  5569. ============
  5570. */
  5571. void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  5572. // idDrawVert size
  5573. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
  5574. // idPlane size
  5575. assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
  5576. int i;
  5577. vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
  5578. vector float vecVertA, vecVertB, vecVertC;
  5579. vector float vecVertA2, vecVertB2, vecVertC2;
  5580. vector float vecVertA3, vecVertB3, vecVertC3;
  5581. vector float vecVertA4, vecVertB4, vecVertC4;
  5582. vector float vecN, vecN2, vecN3, vecN4;
  5583. vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
  5584. vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
  5585. vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
  5586. vector float vecF;
  5587. vector float vecF1, vecF2, vecF3, vecF4;
  5588. vector float zeroVector = (vector float)(0);
  5589. vector float vecNegOne = (vector float)(-1);
  5590. vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;
  5591. vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
  5592. vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
  5593. vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
  5594. vector unsigned char oneVector = (vector unsigned char)(1);
  5595. vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
  5596. vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
  5597. const float *xyzPtr = verts[0].xyz.ToFloatPtr();
  5598. float *planePtr = planes[0].ToFloatPtr();
  5599. int j;
  5600. for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
  5601. #ifndef DRAWVERT_PADDED
  5602. // calculate permute vectors to load as needed. these are all
5603. // triangle indexes and are usually pretty close together but
  5604. // not guaranteed to be in any particular order
  5605. vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
  5606. vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
  5607. vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
  5608. vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
  5609. vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
  5610. vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
  5611. vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
  5612. vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
  5613. vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
  5614. vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
  5615. vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
  5616. vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
  5617. #endif
  5618. #ifndef DRAWVERT_PADDED
  5619. // load first A B C
  5620. vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
  5621. vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
  5622. vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
  5623. vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
  5624. vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
  5625. vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
  5626. vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
  5627. vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
  5628. vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );
  5629. // set the last element to 0
  5630. vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
  5631. vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
  5632. vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
  5633. // load second A B C
  5634. vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
  5635. vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
  5636. vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
  5637. vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
  5638. vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
  5639. vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
  5640. vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
  5641. vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
  5642. vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );
  5643. // set the last element to 0
  5644. vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
  5645. vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
  5646. vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
  5647. // load third A B C
  5648. vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
  5649. vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
  5650. vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
  5651. vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
  5652. vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
  5653. vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
  5654. vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
  5655. vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
  5656. vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );
  5657. // set the last element to 0
5658. vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
5659. vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
5660. vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
  5661. // load the fourth A B C
  5662. vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
  5663. vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
  5664. vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
  5665. vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
  5666. vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
  5667. vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
  5668. vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
  5669. vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
  5670. vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
  5671. // set the last element to 0
  5672. vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
  5673. vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
  5674. vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
  5675. #else
  5676. // load first A B C
  5677. vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
  5678. vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
  5679. vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
  5680. // set the last element to 0
  5681. vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
  5682. vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
  5683. vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
  5684. // load second A B C
  5685. vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
  5686. vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
  5687. vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
  5688. // set the last element to 0
  5689. vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
  5690. vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
  5691. vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
  5692. // load third A B C
  5693. vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
  5694. vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
  5695. vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
  5696. // set the last element to 0
  5697. vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
  5698. vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
  5699. vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
  5700. // load the fourth A B C
  5701. vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
  5702. vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
  5703. vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
  5704. // set the last element to 0
  5705. vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
  5706. vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
  5707. vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
  5708. #endif
  5709. // calculate d0 and d1 for each
  5710. vecD0 = vec_sub( vecVertB, vecVertA );
  5711. vecD1 = vec_sub( vecVertC, vecVertA );
  5712. vecD2 = vec_sub( vecVertB2, vecVertA2 );
  5713. vecD3 = vec_sub( vecVertC2, vecVertA2 );
  5714. vecD4 = vec_sub( vecVertB3, vecVertA3 );
  5715. vecD5 = vec_sub( vecVertC3, vecVertA3 );
  5716. vecD6 = vec_sub( vecVertB4, vecVertA4 );
  5717. vecD7 = vec_sub( vecVertC4, vecVertA4 );
  5718. vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
  5719. vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
  5720. vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
  5721. vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
  5722. vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
  5723. vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
  5724. vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
  5725. vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );
  5726. vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  5727. vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  5728. vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  5729. vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  5730. vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
  5731. vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
  5732. vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
  5733. vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
  5734. vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
  5735. vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
  5736. vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
  5737. vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );
  5738. vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  5739. vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  5740. vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  5741. vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  5742. vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
  5743. vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
  5744. vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
  5745. vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
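// vecN through vecN4 now hold the unnormalized triangle normals n = d1 x d0:
// the ( y, z, x ) / ( z, x, y ) rotate permutes above set up the cross product,
// matching the scalar version in the cleanup loop below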
  5746. // transpose vecNs
  5747. vector float v0, v1, v2, v3;
  5748. v0 = vec_mergeh( vecN, vecN3 );
  5749. v1 = vec_mergeh( vecN2, vecN4 );
  5750. v2 = vec_mergel( vecN, vecN3 );
  5751. v3 = vec_mergel( vecN2, vecN4 );
  5752. vecN = vec_mergeh( v0, v1 );
  5753. vecN2 = vec_mergel( v0, v1 );
  5754. vecN3 = vec_mergeh( v2, v3 );
  5755. vecN4 = vec_mergel( v2, v3 );
  5756. vecF = vec_madd( vecN, vecN, zeroVector );
  5757. vecF = vec_madd( vecN2, vecN2, vecF );
  5758. vecF = vec_madd( vecN3, vecN3, vecF );
  5759. vecF = ReciprocalSquareRoot( vecF );
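// after the transpose, vecN / vecN2 / vecN3 hold the x / y / z components of
// all four normals, so vecF held x*x + y*y + z*z per triangle and now holds
// 1 / |n| for each of the four planes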
  5760. vecF1 = vec_madd( vecF, vecN, zeroVector );
  5761. vecF2 = vec_madd( vecF, vecN2, zeroVector );
  5762. vecF3 = vec_madd( vecF, vecN3, zeroVector );
  5763. vecF4 = vec_madd( vecF, vecN4, zeroVector );
  5764. vector float v8, v9, v10, v11;
  5765. v8 = vecF1;
  5766. v9 = vecF2;
  5767. v10 = vecF3;
  5768. v11 = vecF4;
  5769. // transpose vecVerts
  5770. v0 = vec_mergeh( vecVertA, vecVertA3 );
  5771. v1 = vec_mergeh( vecVertA2, vecVertA4 );
  5772. v2 = vec_mergel( vecVertA, vecVertA3 );
  5773. v3 = vec_mergel( vecVertA2, vecVertA4 );
  5774. vecVertA = vec_mergeh( v0, v1 );
  5775. vecVertA2 = vec_mergel( v0, v1 );
  5776. vecVertA3 = vec_mergeh( v2, v3 );
  5777. vecVertA4 = vec_mergel( v2, v3 );
  5778. vector float vecTotals;
  5779. vecTotals = vec_madd( vecVertA, v8, zeroVector );
  5780. vecTotals = vec_madd( vecVertA2, v9, vecTotals );
  5781. vecTotals = vec_madd( vecVertA3, v10, vecTotals );
  5782. vecTotals = vec_madd( vecVertA4, v11, vecTotals );
  5783. vecF = vec_madd( vecTotals, vecNegOne, zeroVector );
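// vecTotals is the dot product of each triangle's vertex A with its normalized
// normal, so vecF = -( a . n ) is the plane distance, the same value that
// FitThroughPoint() produces in the cleanup loop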
  5784. // transpose vecFs
  5785. v0 = vec_mergeh( vecF1, vecF3 );
  5786. v1 = vec_mergeh( vecF2, vecF );
  5787. v2 = vec_mergel( vecF1, vecF3 );
  5788. v3 = vec_mergel( vecF2, vecF );
  5789. vecF1 = vec_mergeh( v0, v1 );
  5790. vecF2 = vec_mergel( v0, v1 );
  5791. vecF3 = vec_mergeh( v2, v3 );
  5792. vecF4 = vec_mergel( v2, v3 );
  5793. // store results
  5794. UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
  5795. }
  5796. // cleanup
  5797. for ( ; i < numIndexes; i += 3, j++ ) {
  5798. const idDrawVert *a, *b, *c;
  5799. float d0[3], d1[3], f;
  5800. idVec3 n;
  5801. a = verts + indexes[i + 0];
  5802. b = verts + indexes[i + 1];
  5803. c = verts + indexes[i + 2];
  5804. d0[0] = b->xyz[0] - a->xyz[0];
  5805. d0[1] = b->xyz[1] - a->xyz[1];
  5806. d0[2] = b->xyz[2] - a->xyz[2];
  5807. d1[0] = c->xyz[0] - a->xyz[0];
  5808. d1[1] = c->xyz[1] - a->xyz[1];
  5809. d1[2] = c->xyz[2] - a->xyz[2];
  5810. n[0] = d1[1] * d0[2] - d1[2] * d0[1];
  5811. n[1] = d1[2] * d0[0] - d1[0] * d0[2];
  5812. n[2] = d1[0] * d0[1] - d1[1] * d0[0];
  5813. f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
  5814. //idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
  5815. n.x *= f;
  5816. n.y *= f;
  5817. n.z *= f;
  5818. planes[j].SetNormal( n );
  5819. planes[j].FitThroughPoint( a->xyz );
  5820. }
  5821. }
  5822. /*
  5823. ============
  5824. idSIMD_AltiVec::DeriveTangents
  5825. Derives the normal and orthogonal tangent vectors for the triangle vertices.
  5826. For each vertex the normal and tangent vectors are derived from all triangles
  5827. using the vertex which results in smooth tangents across the mesh.
  5828. In the process the triangle planes are calculated as well.
  5829. ============
  5830. */
  5831. void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  5832. int i;
  5833. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  5834. memset( used, 0, numVerts * sizeof( used[0] ) );
  5835. idPlane *planesPtr = planes;
  5836. for ( i = 0; i < numIndexes; i += 3 ) {
  5837. idDrawVert *a, *b, *c;
  5838. // unsigned long signBit;
  5839. float d0[5], d1[5], area;
  5840. idVec3 n, t0, t1;
  5841. float f1, f2, f3;
  5842. int v0 = indexes[i + 0];
  5843. int v1 = indexes[i + 1];
  5844. int v2 = indexes[i + 2];
  5845. a = verts + v0;
  5846. b = verts + v1;
  5847. c = verts + v2;
  5848. d0[0] = b->xyz[0] - a->xyz[0];
  5849. d0[1] = b->xyz[1] - a->xyz[1];
  5850. d0[2] = b->xyz[2] - a->xyz[2];
  5851. d0[3] = b->st[0] - a->st[0];
  5852. d0[4] = b->st[1] - a->st[1];
  5853. d1[0] = c->xyz[0] - a->xyz[0];
  5854. d1[1] = c->xyz[1] - a->xyz[1];
  5855. d1[2] = c->xyz[2] - a->xyz[2];
  5856. d1[3] = c->st[0] - a->st[0];
  5857. d1[4] = c->st[1] - a->st[1];
  5858. // normal
  5859. n[0] = d1[1] * d0[2] - d1[2] * d0[1];
  5860. n[1] = d1[2] * d0[0] - d1[0] * d0[2];
  5861. n[2] = d1[0] * d0[1] - d1[1] * d0[0];
  5862. f1 = n.x * n.x + n.y * n.y + n.z * n.z;
  5863. // area sign bit
  5864. area = d0[3] * d1[4] - d0[4] * d1[3];
  5865. // first tangent
  5866. t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
  5867. t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
  5868. t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
  5869. f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
  5870. // second tangent
  5871. t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
  5872. t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
  5873. t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
  5874. f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;
  5875. // Behold! The power of the pipeline
  5876. FastScalarInvSqrt_x3( &f1, &f2, &f3 );
  5877. #ifdef PPC_INTRINSICS
  5878. f2 = __fsel( area, f2, -f2 );
  5879. f3 = __fsel( area, f3, -f3 );
  5880. #else
  5881. f2 = ( area < 0.0f ) ? -f2 : f2;
  5882. f3 = ( area < 0.0f ) ? -f3 : f3;
  5883. #endif
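// __fsel( x, a, b ) returns a when x >= 0.0 and b otherwise, so this flips the
// tangent scale factors for triangles with negative texture-space area
// (mirrored mappings) without branching; the #else path above does the same
// with conditional expressions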
  5884. t0.x *= f2;
  5885. t0.y *= f2;
  5886. t0.z *= f2;
  5887. n.x *= f1;
  5888. n.y *= f1;
  5889. n.z *= f1;
  5890. planesPtr->SetNormal( n );
  5891. planesPtr->FitThroughPoint( a->xyz );
  5892. planesPtr++;
  5893. t1.x *= f3;
  5894. t1.y *= f3;
  5895. t1.z *= f3;
  5896. if ( used[v0] ) {
  5897. a->normal += n;
  5898. a->tangents[0] += t0;
  5899. a->tangents[1] += t1;
  5900. } else {
  5901. a->normal = n;
  5902. a->tangents[0] = t0;
  5903. a->tangents[1] = t1;
  5904. used[v0] = true;
  5905. }
  5906. if ( used[v1] ) {
  5907. b->normal += n;
  5908. b->tangents[0] += t0;
  5909. b->tangents[1] += t1;
  5910. } else {
  5911. b->normal = n;
  5912. b->tangents[0] = t0;
  5913. b->tangents[1] = t1;
  5914. used[v1] = true;
  5915. }
  5916. if ( used[v2] ) {
  5917. c->normal += n;
  5918. c->tangents[0] += t0;
  5919. c->tangents[1] += t1;
  5920. } else {
  5921. c->normal = n;
  5922. c->tangents[0] = t0;
  5923. c->tangents[1] = t1;
  5924. used[v2] = true;
  5925. }
  5926. }
  5927. }
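/*
A rough sketch of where the tangent formulas above come from: with edge deltas
d0 = b - a and d1 = c - a (positions in d0[0..2] / d1[0..2], texture coordinates
in d0[3..4] / d1[3..4]), the tangent T and bitangent B satisfy

	d0.xyz = d0.s * T + d0.t * B
	d1.xyz = d1.s * T + d1.t * B

Solving the 2x2 system with Cramer's rule gives, up to the common factor
1 / ( d0.s * d1.t - d0.t * d1.s ) that gets folded into the normalization,

	T ~ d0.xyz * d1.t - d1.xyz * d0.t		// t0 above
	B ~ d1.xyz * d0.s - d0.xyz * d1.s		// t1 above

and 'area' is exactly that determinant, which is why only its sign is needed.
*/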
  5928. #ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
  5929. /*
  5930. ============
  5931. idSIMD_AltiVec::DeriveUnsmoothedTangents
  5932. Derives the normal and orthogonal tangent vectors for the triangle vertices.
  5933. For each vertex the normal and tangent vectors are derived from a single dominant triangle.
  5934. ============
  5935. */
  5936. #define DERIVE_UNSMOOTHED_BITANGENT
  5937. void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
  5938. int i;
  5939. // idDrawVert size
  5940. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
  5941. // drawverts aligned
  5942. assert( IS_16BYTE_ALIGNED( verts[0] ) );
  5943. vector float vecVertA, vecVertB, vecVertC;
  5944. vector float vecVertA2, vecVertB2, vecVertC2;
  5945. vector float vecVertA3, vecVertB3, vecVertC3;
  5946. vector float vecVertA4, vecVertB4, vecVertC4;
  5947. vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
  5948. vector float vecS0, vecS1, vecS2;
  5949. vector float vecS0_2, vecS1_2, vecS2_2;
  5950. vector float vecS0_3, vecS1_3, vecS2_3;
  5951. vector float vecS0_4, vecS1_4, vecS2_4;
  5952. vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
  5953. vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
  5954. vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
  5955. vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
  5956. vector float vecN, vecN2, vecN3, vecN4;
  5957. vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
  5958. vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
  5959. vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
  5960. vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
  5961. vector float zeroVector = (vector float)(0);
  5962. vector float vecNegOne = (vector float)(-1.0);
  5963. vector float vecStore1, vecStore2, vecStore3;
  5964. vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
  5965. vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
  5966. vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
  5967. vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
  5968. vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
  5969. vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
  5970. vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
  5971. vector float vecLd1, vecLd2, vecLd3;
  5972. vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
  5973. float *normalPtr = verts[0].normal.ToFloatPtr();
  5974. float *xyzPtr = verts[0].xyz.ToFloatPtr();
  5975. vector float vecFirstHalf, vecSecondHalf;
  5976. vector float vecFirstHalf2, vecSecondHalf2;
  5977. vector float vecFirstHalf3, vecSecondHalf3;
  5978. vector float vecFirstHalf4, vecSecondHalf4;
  5979. for ( i = 0; i+3 < numVerts; i+=4 ) {
  5980. int bOffset1, bOffset2, bOffset3, bOffset4;
  5981. int cOffset1, cOffset2, cOffset3, cOffset4;
  5982. bOffset1 = dominantTris[i].v2;
  5983. cOffset1 = dominantTris[i].v3;
  5984. bOffset2 = dominantTris[i+1].v2;
  5985. cOffset2 = dominantTris[i+1].v3;
  5986. bOffset3 = dominantTris[i+2].v2;
  5987. cOffset3 = dominantTris[i+2].v3;
  5988. bOffset4 = dominantTris[i+3].v2;
  5989. cOffset4 = dominantTris[i+3].v3;
  5990. vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
  5991. v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
  5992. v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
  5993. vecVertA = vec_perm( v0, v1, vecPerm0 );
  5994. vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
  5995. v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
  5996. v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
  5997. vecVertB = vec_perm( v2, v3, vecPerm1 );
  5998. vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
  5999. v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
  6000. v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
  6001. vecVertC = vec_perm( v4, v5, vecPerm2 );
  6002. // put remainder into v2
  6003. v1 = vec_perm( v1, v1, vecPerm0 );
  6004. v3 = vec_perm( v3, v3, vecPerm1 );
  6005. v5 = vec_perm( v5, v5, vecPerm2 );
  6006. v1 = vec_mergeh( v1, v5 );
  6007. v2 = vec_mergeh( v3, zeroVector );
  6008. v2 = vec_mergeh( v1, v2 );
  6009. v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
  6010. // load second one
  6011. vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
  6012. v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
  6013. v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
  6014. vecVertA2 = vec_perm( v0, v1, vecPerm0 );
  6015. vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
  6016. v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
  6017. v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
  6018. vecVertB2 = vec_perm( v3, v4, vecPerm3 );
  6019. vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
  6020. v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
  6021. v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
  6022. vecVertC2 = vec_perm( v5, v6, vecPerm4 );
  6023. // put remainder into v3
  6024. v1 = vec_perm( v1, v1, vecPerm0 );
  6025. v4 = vec_perm( v4, v4, vecPerm3 );
  6026. v5 = vec_perm( v6, v6, vecPerm4 );
  6027. v1 = vec_mergeh( v1, v5 );
  6028. v3 = vec_mergeh( v4, zeroVector );
  6029. v3 = vec_mergeh( v1, v3 );
  6030. v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
  6031. // load third one
  6032. vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
  6033. v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
  6034. v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
  6035. vecVertA3 = vec_perm( v0, v1, vecPerm0 );
  6036. vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
  6037. v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
  6038. v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
  6039. vecVertB3 = vec_perm( v4, v5, vecPerm1 );
  6040. vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
  6041. v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
  6042. v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
  6043. vecVertC3 = vec_perm( v6, v7, vecPerm2 );
  6044. // put remainder into v4
  6045. v1 = vec_perm( v1, v1, vecPerm0 );
  6046. v5 = vec_perm( v5, v5, vecPerm1 );
  6047. v7 = vec_perm( v7, v7, vecPerm2 );
  6048. v1 = vec_mergeh( v1, v7 );
  6049. v4 = vec_mergeh( v5, zeroVector );
  6050. v4 = vec_mergeh( v1, v4 );
  6051. v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
  6052. // load fourth one
  6053. vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
  6054. v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
  6055. v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
  6056. vecVertA4 = vec_perm( v0, v1, vecPerm0 );
  6057. vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
  6058. v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
  6059. v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
  6060. vecVertB4 = vec_perm( v5, v6, vecPerm3 );
  6061. vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
  6062. v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
  6063. v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
  6064. vecVertC4 = vec_perm( v7, v8, vecPerm4 );
  6065. // put remainder into v5
  6066. v1 = vec_perm( v1, v1, vecPerm0 );
  6067. v6 = vec_perm( v6, v6, vecPerm3 );
  6068. v8 = vec_perm( v8, v8, vecPerm4 );
  6069. v1 = vec_mergeh( v1, v8 );
  6070. v5 = vec_mergeh( v6, zeroVector );
  6071. v5 = vec_mergeh( v1, v5 );
  6072. v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
  6073. // remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
  6074. //vecD1 now holds d0, d1, d2, d3
  6075. vecD1 = vec_sub( vecVertB, vecVertA );
  6076. vecD4 = vec_sub( vecVertB2, vecVertA2 );
  6077. vecD7 = vec_sub( vecVertB3, vecVertA3 );
  6078. vecD10 = vec_sub( vecVertB4, vecVertA4 );
6079. // vecD2 now holds d5, d6, d7, d8
  6080. vecD2 = vec_sub( vecVertC, vecVertA );
  6081. vecD5 = vec_sub( vecVertC2, vecVertA2 );
  6082. vecD8 = vec_sub( vecVertC3, vecVertA3 );
  6083. vecD11 = vec_sub( vecVertC4, vecVertA4 );
  6084. // vecD3 now holds d4, crap, d9, crap
  6085. vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
  6086. vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
  6087. vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
  6088. vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
  6089. // get permute vectors for loading from dt
  6090. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
  6091. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
  6092. vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
  6093. vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
  6094. // load S values from dominantTris
  6095. v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
  6096. v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
  6097. v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
  6098. v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
  6099. v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
  6100. v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
  6101. v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
  6102. v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
  6103. v0 = vec_perm( v0, v1, vecPerm1 );
  6104. v2 = vec_perm( v2, v3, vecPerm2 );
  6105. v4 = vec_perm( v4, v5, vecPerm3 );
  6106. v6 = vec_perm( v6, v7, vecPerm4 );
  6107. vecS0 = vec_splat( v0, 0 );
  6108. vecS1 = vec_splat( v0, 1 );
  6109. vecS2 = vec_splat( v0, 2 );
  6110. vecS0_2 = vec_splat( v2, 0);
  6111. vecS1_2 = vec_splat( v2, 1 );
  6112. vecS2_2 = vec_splat( v2, 2 );
  6113. vecS0_3 = vec_splat( v4, 0 );
  6114. vecS1_3 = vec_splat( v4, 1 );
  6115. vecS2_3 = vec_splat( v4, 2 );
  6116. vecS0_4 = vec_splat( v6, 0 );
  6117. vecS1_4 = vec_splat( v6, 1 );
  6118. vecS2_4 = vec_splat( v6, 2 );
  6119. // do calculation
  6120. vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
  6121. vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
  6122. vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
  6123. vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
  6124. vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
  6125. vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
  6126. vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
  6127. vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
  6128. vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  6129. vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  6130. vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  6131. vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  6132. vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
  6133. vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
  6134. vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
  6135. vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
  6136. vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
  6137. vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
  6138. vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
  6139. vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
  6140. vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
  6141. vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
  6142. vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
  6143. vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
  6144. // calculate N values
  6145. vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
  6146. vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
  6147. vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
  6148. vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
  6149. // calculate both halves of the calculation for t
  6150. vecWork1 = vecD1;
  6151. vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
  6152. vecWork3 = vecD4;
  6153. vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
  6154. vecWork5 = vecD7;
  6155. vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
  6156. vecWork7 = vecD10;
  6157. vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
  6158. vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  6159. vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  6160. vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  6161. vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  6162. vecWork1 = vecD2;
  6163. vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
  6164. vecWork3 = vecD5;
  6165. vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
  6166. vecWork5 = vecD8;
  6167. vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
  6168. vecWork7 = vecD11;
  6169. vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
  6170. vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
  6171. vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
  6172. vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
  6173. vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
  6174. // calculate T values
  6175. vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
  6176. vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
  6177. vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
  6178. vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
  6179. #ifndef DERIVE_UNSMOOTHED_BITANGENT
  6180. vecWork1 = vecD1;
  6181. vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
  6182. vecWork3 = vecD4;
  6183. vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
  6184. vecWork5 = vecD7;
  6185. vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
  6186. vecWork7 = vecD10;
  6187. vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
  6188. vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  6189. vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  6190. vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  6191. vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  6192. vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
  6193. vecWork2 = vecD2;
  6194. vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
  6195. vecWork4 = vecD5;
  6196. vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
  6197. vecWork6 = vecD8;
  6198. vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
  6199. vecWork8 = vecD11;
  6200. vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  6201. vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  6202. vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  6203. vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  6204. #else
  6205. vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
  6206. vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
  6207. vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
  6208. vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
  6209. vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
  6210. vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
  6211. vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
  6212. vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
  6213. vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  6214. vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  6215. vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  6216. vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  6217. vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
  6218. vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
  6219. vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
  6220. vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
  6221. vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
  6222. vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
  6223. vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
  6224. vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
  6225. vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
  6226. vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
  6227. vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
  6228. vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
  6229. #endif
  6230. // finish the calculation
  6231. vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
  6232. vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
  6233. vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
  6234. vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
  6235. vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
  6236. vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
  6237. vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
  6238. vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
  6239. // Store results
  6240. // read values that we need to preserve
  6241. vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
  6242. vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
  6243. //generate vectors to store
  6244. vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
  6245. vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
  6246. vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
  6247. // store out results
  6248. ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
  6249. // read values that we need to preserve
  6250. vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
  6251. // generate vectors to store
  6252. vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
  6253. vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
  6254. vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
  6255. // instead of doing permute, shift it where it needs to be and use vec_ste
  6256. // store out vectors
  6257. ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
  6258. // read values that we need to preserve
  6259. vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
  6260. // generate vectors to store
  6261. vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
  6262. vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
  6263. vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
  6264. // store out vectors
  6265. ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
  6266. // read values that we need to preserve
  6267. vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
  6268. vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
  6269. // generate vectors to store
  6270. vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
  6271. vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
  6272. vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
  6273. // store out vectors
  6274. ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
  6275. }
  6276. // cleanup
  6277. for ( ; i < numVerts; i++ ) {
  6278. idDrawVert *a, *b, *c;
  6279. float d0, d1, d2, d3, d4;
  6280. float d5, d6, d7, d8, d9;
  6281. float s0, s1, s2;
  6282. float n0, n1, n2;
  6283. float t0, t1, t2;
  6284. float t3, t4, t5;
  6285. const dominantTri_s &dt = dominantTris[i];
  6286. a = verts + i;
  6287. b = verts + dt.v2;
  6288. c = verts + dt.v3;
  6289. d0 = b->xyz[0] - a->xyz[0];
  6290. d1 = b->xyz[1] - a->xyz[1];
  6291. d2 = b->xyz[2] - a->xyz[2];
  6292. d3 = b->st[0] - a->st[0];
  6293. d4 = b->st[1] - a->st[1];
  6294. d5 = c->xyz[0] - a->xyz[0];
  6295. d6 = c->xyz[1] - a->xyz[1];
  6296. d7 = c->xyz[2] - a->xyz[2];
  6297. d8 = c->st[0] - a->st[0];
  6298. d9 = c->st[1] - a->st[1];
  6299. s0 = dt.normalizationScale[0];
  6300. s1 = dt.normalizationScale[1];
  6301. s2 = dt.normalizationScale[2];
  6302. n0 = s2 * ( d6 * d2 - d7 * d1 );
  6303. n1 = s2 * ( d7 * d0 - d5 * d2 );
  6304. n2 = s2 * ( d5 * d1 - d6 * d0 );
  6305. t0 = s0 * ( d0 * d9 - d4 * d5 );
  6306. t1 = s0 * ( d1 * d9 - d4 * d6 );
  6307. t2 = s0 * ( d2 * d9 - d4 * d7 );
  6308. #ifndef DERIVE_UNSMOOTHED_BITANGENT
  6309. t3 = s1 * ( d3 * d5 - d0 * d8 );
  6310. t4 = s1 * ( d3 * d6 - d1 * d8 );
  6311. t5 = s1 * ( d3 * d7 - d2 * d8 );
  6312. #else
  6313. t3 = s1 * ( n2 * t1 - n1 * t2 );
  6314. t4 = s1 * ( n0 * t2 - n2 * t0 );
  6315. t5 = s1 * ( n1 * t0 - n0 * t1 );
  6316. #endif
  6317. a->normal[0] = n0;
  6318. a->normal[1] = n1;
  6319. a->normal[2] = n2;
  6320. a->tangents[0][0] = t0;
  6321. a->tangents[0][1] = t1;
  6322. a->tangents[0][2] = t2;
  6323. a->tangents[1][0] = t3;
  6324. a->tangents[1][1] = t4;
  6325. a->tangents[1][2] = t5;
  6326. }
  6327. }
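// note: with DERIVE_UNSMOOTHED_BITANGENT defined (as it is above), the second
// tangent is not taken from the texture-space deltas; it is s1 times the cross
// product of the first tangent and the normal, t3..t5 = s1 * ( t x n ), which
// is orthogonal to both the normal and the first tangent by construction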
  6328. #else
  6329. /*
  6330. ============
  6331. idSIMD_AltiVec::DeriveUnsmoothedTangents
  6332. Derives the normal and orthogonal tangent vectors for the triangle vertices.
  6333. For each vertex the normal and tangent vectors are derived from a single dominant triangle.
  6334. ============
  6335. */
  6336. #define DERIVE_UNSMOOTHED_BITANGENT
  6337. void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
  6338. int i;
  6339. for ( i = 0; i < numVerts; i++ ) {
  6340. idDrawVert *a, *b, *c;
  6341. float d0, d1, d2, d3, d4;
  6342. float d5, d6, d7, d8, d9;
  6343. float s0, s1, s2;
  6344. float n0, n1, n2;
  6345. float t0, t1, t2;
  6346. float t3, t4, t5;
  6347. const dominantTri_s &dt = dominantTris[i];
  6348. a = verts + i;
  6349. b = verts + dt.v2;
  6350. c = verts + dt.v3;
  6351. d0 = b->xyz[0] - a->xyz[0];
  6352. d1 = b->xyz[1] - a->xyz[1];
  6353. d2 = b->xyz[2] - a->xyz[2];
  6354. d3 = b->st[0] - a->st[0];
  6355. d4 = b->st[1] - a->st[1];
  6356. d5 = c->xyz[0] - a->xyz[0];
  6357. d6 = c->xyz[1] - a->xyz[1];
  6358. d7 = c->xyz[2] - a->xyz[2];
  6359. d8 = c->st[0] - a->st[0];
  6360. d9 = c->st[1] - a->st[1];
  6361. s0 = dt.normalizationScale[0];
  6362. s1 = dt.normalizationScale[1];
  6363. s2 = dt.normalizationScale[2];
  6364. n0 = s2 * ( d6 * d2 - d7 * d1 );
  6365. n1 = s2 * ( d7 * d0 - d5 * d2 );
  6366. n2 = s2 * ( d5 * d1 - d6 * d0 );
  6367. t0 = s0 * ( d0 * d9 - d4 * d5 );
  6368. t1 = s0 * ( d1 * d9 - d4 * d6 );
  6369. t2 = s0 * ( d2 * d9 - d4 * d7 );
  6370. #ifndef DERIVE_UNSMOOTHED_BITANGENT
  6371. t3 = s1 * ( d3 * d5 - d0 * d8 );
  6372. t4 = s1 * ( d3 * d6 - d1 * d8 );
  6373. t5 = s1 * ( d3 * d7 - d2 * d8 );
  6374. #else
  6375. t3 = s1 * ( n2 * t1 - n1 * t2 );
  6376. t4 = s1 * ( n0 * t2 - n2 * t0 );
  6377. t5 = s1 * ( n1 * t0 - n0 * t1 );
  6378. #endif
  6379. a->normal[0] = n0;
  6380. a->normal[1] = n1;
  6381. a->normal[2] = n2;
  6382. a->tangents[0][0] = t0;
  6383. a->tangents[0][1] = t1;
  6384. a->tangents[0][2] = t2;
  6385. a->tangents[1][0] = t3;
  6386. a->tangents[1][1] = t4;
  6387. a->tangents[1][2] = t5;
  6388. }
  6389. }
  6390. #endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
  6391. /*
  6392. ============
  6393. idSIMD_AltiVec::NormalizeTangents
  6394. Normalizes each vertex normal and projects and normalizes the
  6395. tangent vectors onto the plane orthogonal to the vertex normal.
  6396. ============
  6397. */
  6398. void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
  6399. // idDrawVert size
  6400. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
  6401. float *addr = verts[0].normal.ToFloatPtr();
  6402. float *tAddr = verts[0].tangents[0].ToFloatPtr();
  6403. // v0 through v3 maintain originally loaded values so we don't take
  6404. // as much hit for unaligned stores
  6405. vector float v0, v1, v2, v3;
  6406. // v5 through v8 are the "working" values of the vectors
  6407. vector float v5, v6, v7, v8;
  6408. // working values
  6409. vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
  6410. vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
  6411. vector float vecF, vecF2;
  6412. vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
  6413. register vector float zeroVector = (vector float)(0.0);
  6414. vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
  6415. vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
  6416. vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
  6417. vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
  6418. vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
  6419. vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
  6420. vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
  6421. vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
  6422. vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
  6423. vector unsigned char storeT41, storeT42;
  6424. int i = 0;
  6425. if ( i+3 < numVerts ) {
  6426. // for loading normal from idDrawVert
  6427. vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
  6428. vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6429. vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6430. vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6431. // for loading tangents from idDrawVert
  6432. vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6433. vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
  6434. vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6435. vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
  6436. vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6437. vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
  6438. vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
  6439. vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
  6440. // generate permute vectors to store normals
  6441. storePerm0 = vec_lvsr( 0, addr );
  6442. storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
  6443. storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
  6444. storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
  6445. // generate permute vectors to store tangents
  6446. storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
  6447. storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
  6448. storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
  6449. storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
  6450. storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
  6451. storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
  6452. storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
  6453. storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
  6454. }
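// four vertices span 4 * DRAWVERT_OFFSET floats, which is always a multiple of
// 16 bytes, so the alignment pattern repeats every four vertices; that is why
// the load and store permute masks are computed once here and reused on every
// iteration of the loop below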
  6455. for ( ; i+3 < numVerts; i+=4 ) {
  6456. // load normals
  6457. vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
  6458. vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
  6459. v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
  6460. vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
  6461. vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
  6462. v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
  6463. vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
  6464. vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
  6465. v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
  6466. vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
  6467. vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
  6468. v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
6469. // zero out the unused last element of each vector
  6470. v0 = vec_perm( v0, zeroVector, vecPermLast );
  6471. v1 = vec_perm( v1, zeroVector, vecPermLast );
  6472. v2 = vec_perm( v2, zeroVector, vecPermLast );
  6473. v3 = vec_perm( v3, zeroVector, vecPermLast );
6474. // got 4 vectors in v0 through v3; sum each one across its elements
6475. // and pack the four sums into one vector
  6476. vecTemp = vec_madd( v0, v0, zeroVector );
  6477. vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6478. vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
  6479. // element 0 of vecSum now has sum of v0
  6480. vecTemp2 = vec_madd( v1, v1, zeroVector );
  6481. tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
  6482. tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
  6483. // put this into vecSum
  6484. vecSum = vec_mergeh( vecSum, tempSum );
  6485. vecTemp3 = vec_madd( v2, v2, zeroVector );
  6486. tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
  6487. tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
  6488. // put this into vecSum
  6489. vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
  6490. vecTemp4 = vec_madd( v3, v3, zeroVector );
  6491. tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
  6492. tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
  6493. // put this into vecSum
  6494. vecSum = vec_perm( vecSum, tempSum, vecPermLast );
  6495. // take reciprocal square roots of these
  6496. vecF = ReciprocalSquareRoot( vecSum );
  6497. // multiply each vector by f
  6498. v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
  6499. v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
  6500. v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
  6501. v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
  6502. // load tangents as unaligned
  6503. vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
  6504. vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
  6505. vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
  6506. vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
  6507. vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
  6508. vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
  6509. vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
  6510. vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
  6511. vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
  6512. vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
  6513. vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
  6514. vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
  6515. vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
  6516. vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
  6517. vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
  6518. vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
  6519. vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
  6520. vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
  6521. vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
  6522. vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
  6523. //zero out last element of tangents
  6524. vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
  6525. vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
  6526. vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
  6527. vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
  6528. vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
  6529. vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
  6530. vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
  6531. vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
  6532. // all tangents[0]
  6533. tempSum = zeroVector;
  6534. tempSum = vec_madd( vec1T0, v5, tempSum );
6535. // sum across tempSum
  6536. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6537. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
6538. // splat that sum across vecTSum1
  6539. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6540. vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
6541. // after the subtract below, vec1T0 holds the tangent made orthogonal to the normal
  6542. vec1T0 = vec_sub( vec1T0, vecTSum1 );
  6543. tempSum = zeroVector;
  6544. tempSum = vec_madd( vec2T0, v6, tempSum );
6545. // sum across tempSum
  6546. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6547. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6548. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6549. vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
  6550. vec2T0 = vec_sub( vec2T0, vecTSum1 );
  6551. tempSum = zeroVector;
  6552. tempSum = vec_madd( vec3T0, v7, tempSum );
6553. // sum across tempSum
  6554. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6555. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6556. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6557. vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
  6558. vec3T0 = vec_sub( vec3T0, vecTSum1 );
  6559. tempSum = zeroVector;
  6560. tempSum = vec_madd( vec4T0, v8, tempSum );
6561. // sum across tempSum
  6562. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6563. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6564. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6565. vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
  6566. vec4T0 = vec_sub( vec4T0, vecTSum1 );
  6567. // all tangents[1]
  6568. tempSum = zeroVector;
  6569. tempSum = vec_madd( vec1T1, v5, tempSum );
6570. // sum across tempSum
  6571. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6572. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6573. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6574. vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
6575. // after the subtract below, vec1T1 holds the tangent made orthogonal to the normal
  6576. vec1T1 = vec_sub( vec1T1, vecTSum1 );
  6577. tempSum = zeroVector;
  6578. tempSum = vec_madd( vec2T1, v6, tempSum );
6579. // sum across tempSum
  6580. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6581. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6582. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6583. vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
  6584. vec2T1 = vec_sub( vec2T1, vecTSum1 );
  6585. tempSum = zeroVector;
  6586. tempSum = vec_madd( vec3T1, v7, tempSum );
6587. // sum across tempSum
  6588. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6589. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6590. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6591. vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
  6592. vec3T1 = vec_sub( vec3T1, vecTSum1 );
  6593. tempSum = zeroVector;
  6594. tempSum = vec_madd( vec4T1, v8, tempSum );
6595. // sum across tempSum
  6596. vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6597. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
  6598. vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
  6599. vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
  6600. vec4T1 = vec_sub( vec4T1, vecTSum1 );
6601. // sum across each tangent vector and gather the results into one vector
  6602. vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
  6603. vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6604. vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
6605. // element 0 of vecTSum1 now holds the squared length of vec1T0
  6606. vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
  6607. tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6608. tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
6609. // put this into vecTSum1
  6610. vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
  6611. vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
  6612. tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6613. tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
6614. // put this into vecTSum1
  6615. vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
  6616. vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
  6617. tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6618. tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
6619. // put this into vecTSum1
  6620. vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
  6621. vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
  6622. vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6623. vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
6624. // element 0 of vecTSum2 now holds the squared length of vec1T1
  6625. vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
  6626. tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6627. tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
6628. // put this into vecTSum2
  6629. vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
  6630. vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
  6631. tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6632. tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
6633. // put this into vecTSum2
  6634. vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
  6635. vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
  6636. tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
  6637. tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
6638. // put this into vecTSum2
  6639. vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
  6640. // tangents[0]
  6641. vecF = ReciprocalSquareRoot( vecTSum1 );
  6642. // tangents[1]
  6643. vecF2 = ReciprocalSquareRoot( vecTSum2 );
  6644. // multiply each tangent vector by f
  6645. vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
  6646. vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
  6647. vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
  6648. vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
  6649. vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
  6650. vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
  6651. vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
  6652. vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
  6653. // rotate input data
  6654. v5 = vec_perm( v5, v5, storePerm0 );
  6655. v6 = vec_perm( v6, v6, storePerm1 );
  6656. v7 = vec_perm( v7, v7, storePerm2 );
  6657. v8 = vec_perm( v8, v8, storePerm3 );
  6658. vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
  6659. vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
  6660. vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
  6661. vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
  6662. vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
  6663. vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
  6664. vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
  6665. vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
  6666. vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
  6667. vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
  6668. vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
  6669. vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
  6670. // store tangents[0] and tangents[1]
  6671. vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
  6672. vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
  6673. vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
  6674. vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
  6675. vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
  6676. vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
  6677. vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
  6678. vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
  6679. // store second tangents[0] and tangents[1]
  6680. vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
  6681. vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
  6682. vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
  6683. vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
  6684. vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
  6685. vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
  6686. vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
  6687. vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
  6688. // store third tangents[0] and tangents[1]
  6689. vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
  6690. vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
  6691. vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
  6692. vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
  6693. vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
  6694. vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
  6695. vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
  6696. vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
  6697. // store fourth tangents[0] and tangents[1]
  6698. vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
  6699. vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
  6700. vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
  6701. vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
  6702. vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
  6703. vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
  6704. vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
  6705. vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
  6706. }
  6707. // cleanup
  6708. for ( ; i < numVerts; i++ ) {
  6709. idVec3 &v = verts[i].normal;
  6710. float f;
  6711. //f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
  6712. f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
  6713. v.x *= f; v.y *= f; v.z *= f;
  6714. for ( int j = 0; j < 2; j++ ) {
  6715. idVec3 &t = verts[i].tangents[j];
  6716. t -= ( t * v ) * v;
  6717. // f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
  6718. f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
  6719. t.x *= f; t.y *= f; t.z *= f;
  6720. }
  6721. }
  6722. }
  6723. #endif /* ENABLE_DERIVE */
  6724. #ifdef ENABLE_CREATE
  6725. /*
  6726. ============
  6727. idSIMD_AltiVec::CreateTextureSpaceLightVectors
  6728. Calculates light vectors in texture space for the given triangle vertices.
  6729. For each vertex the direction towards the light origin is projected onto texture space.
  6730. The light vectors are only calculated for the vertices referenced by the indexes.
  6731. ============
  6732. */
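// Scalar sketch of the per-vertex math performed below ( mirrors the cleanup loop at
// the end of this function; illustrative only, assuming idLib's idVec3 dot-product
// operator* ):
//
//	idVec3 lightDir = lightOrigin - verts[i].xyz;
//	lightVectors[i][0] = lightDir * verts[i].tangents[0];
//	lightVectors[i][1] = lightDir * verts[i].tangents[1];
//	lightVectors[i][2] = lightDir * verts[i].normal;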
  6733. void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  6734. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  6735. memset( used, 0, numVerts * sizeof( used[0] ) );
  6736. int i;
  6737. for ( i = 0; i+7 < numIndexes; i+= 8 ) {
  6738. used[indexes[i]] = true;
  6739. used[indexes[i+1]] = true;
  6740. used[indexes[i+2]] = true;
  6741. used[indexes[i+3]] = true;
  6742. used[indexes[i+4]] = true;
  6743. used[indexes[i+5]] = true;
  6744. used[indexes[i+6]] = true;
  6745. used[indexes[i+7]] = true;
  6746. }
  6747. for ( ; i < numIndexes; i++ ) {
  6748. used[indexes[i]] = true;
  6749. }
  6750. for ( i = 0; i+1 < numVerts; i+=2 ) {
  6751. const idDrawVert *v = &verts[i];
  6752. const idDrawVert *v2 = &verts[i+1];
  6753. float x, y, z;
  6754. float x2, y2, z2;
  6755. idVec3 lightDir, lightDir2;
  6756. lightDir[0] = lightOrigin[0] - v->xyz[0];
  6757. lightDir[1] = lightOrigin[1] - v->xyz[1];
  6758. lightDir[2] = lightOrigin[2] - v->xyz[2];
  6759. lightDir2[0] = lightOrigin[0] - v2->xyz[0];
  6760. lightDir2[1] = lightOrigin[1] - v2->xyz[1];
  6761. lightDir2[2] = lightOrigin[2] - v2->xyz[2];
  6762. x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
  6763. y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
  6764. z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
  6765. x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
  6766. y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
  6767. z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
  6768. if ( used[i] ) {
  6769. lightVectors[i][0] = x;
  6770. lightVectors[i][1] = y;
  6771. lightVectors[i][2] = z;
  6772. }
  6773. if ( used[i+1] ) {
  6774. lightVectors[i+1][0] = x2;
  6775. lightVectors[i+1][1] = y2;
  6776. lightVectors[i+1][2] = z2;
  6777. }
  6778. }
  6779. // cleanup
  6780. for ( ; i < numVerts; i++ ) {
  6781. if ( !used[i] ) {
  6782. continue;
  6783. }
  6784. const idDrawVert *v = &verts[i];
  6785. idVec3 lightDir;
  6786. lightDir[0] = lightOrigin[0] - v->xyz[0];
  6787. lightDir[1] = lightOrigin[1] - v->xyz[1];
  6788. lightDir[2] = lightOrigin[2] - v->xyz[2];
  6789. lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
  6790. lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
  6791. lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
  6792. }
  6793. }
  6794. #if 1
  6795. /*
  6796. ============
  6797. idSIMD_AltiVec::CreateSpecularTextureCoords
  6798. Calculates specular texture coordinates for the given triangle vertices.
  6799. For each vertex the normalized direction towards the light origin is added to the
  6800. normalized direction towards the view origin and the result is projected onto texture space.
  6801. The texture coordinates are only calculated for the vertices referenced by the indexes.
  6802. ============
  6803. */
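// Scalar sketch of the per-vertex math vectorized below ( illustrative only, assuming
// idLib's idVec3 dot-product operator* and scalar idMath::RSqrt() ):
//
//	idVec3 lightDir = lightOrigin - verts[i].xyz;
//	idVec3 viewDir = viewOrigin - verts[i].xyz;
//	lightDir *= idMath::RSqrt( lightDir * lightDir );	// normalize direction to light
//	viewDir *= idMath::RSqrt( viewDir * viewDir );		// normalize direction to viewer
//	lightDir += viewDir;								// un-normalized half-angle direction
//	texCoords[i][0] = lightDir * verts[i].tangents[0];
//	texCoords[i][1] = lightDir * verts[i].tangents[1];
//	texCoords[i][2] = lightDir * verts[i].normal;
//	texCoords[i][3] = 1.0f;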
  6804. void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  6805. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  6806. memset( used, 0, numVerts * sizeof( used[0] ) );
  6807. int i;
  6808. for ( i = 0; i+7 < numIndexes; i+= 8 ) {
  6809. used[indexes[i]] = true;
  6810. used[indexes[i+1]] = true;
  6811. used[indexes[i+2]] = true;
  6812. used[indexes[i+3]] = true;
  6813. used[indexes[i+4]] = true;
  6814. used[indexes[i+5]] = true;
  6815. used[indexes[i+6]] = true;
  6816. used[indexes[i+7]] = true;
  6817. }
  6818. for ( ; i < numIndexes; i++ ) {
  6819. used[indexes[i]] = true;
  6820. }
  6821. // load lightOrigin and viewOrigin into vectors
  6822. const float *lightOriginPtr = lightOrigin.ToFloatPtr();
  6823. const float *viewOriginPtr = viewOrigin.ToFloatPtr();
  6824. vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
  6825. vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
  6826. vector float v0 = vec_ld( 0, lightOriginPtr );
  6827. vector float v1 = vec_ld( 15, lightOriginPtr );
  6828. vector float v2 = vec_ld( 0, viewOriginPtr );
  6829. vector float v3 = vec_ld( 15, viewOriginPtr );
  6830. vector float vecLightOrigin = vec_perm( v0, v1, permVec );
  6831. vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
  6832. const vector float zeroVector = (vector float)(0);
  6833. int index;
  6834. for ( index = 0; index+1 < numVerts; index+=2 ) {
  6835. const float *vertPtr = verts[index].xyz.ToFloatPtr();
  6836. const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
  6837. permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
  6838. permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
  6839. v0 = vec_ld( 0, vertPtr );
  6840. v1 = vec_ld( 15, vertPtr );
  6841. vector float v2 = vec_ld( 31, vertPtr );
  6842. vector float v3 = vec_ld( 47, vertPtr );
  6843. vector float v4 = vec_ld( 63, vertPtr );
  6844. vector float v5 = vec_ld( 0, vertPtr2 );
  6845. vector float v6 = vec_ld( 15, vertPtr2 );
  6846. vector float v7 = vec_ld( 31, vertPtr2 );
  6847. vector float v8 = vec_ld( 47, vertPtr2 );
  6848. vector float v9 = vec_ld( 63, vertPtr2 );
  6849. // figure out what values go where
  6850. vector float vecXYZ = vec_perm( v0, v1, permVec );
  6851. vector float vecNormal = vec_perm( v1, v2, permVec );
  6852. vecNormal = vec_sld( vecNormal, vecNormal, 4 );
  6853. const vector float vecTangent0 = vec_perm( v2, v3, permVec );
6854. permVec = vec_add( permVec, (vector unsigned char)(-4) ); // shift the permute window back one float so the next perm starts at tangents[1]
  6855. const vector float vecTangent1 = vec_perm( v3, v4, permVec );
  6856. vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
  6857. vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
  6858. vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
  6859. const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
  6860. permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
  6861. const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
  6862. // calculate lightDir
  6863. vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
  6864. vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
  6865. vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
  6866. vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
  6867. // calculate distance
  6868. vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
  6869. vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
  6870. vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
  6871. vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
6872. // sum across first 3 elements of vector
  6873. vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
  6874. vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6875. vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
  6876. vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
  6877. vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
  6878. vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
  6879. vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
  6880. vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
6881. // splat sum across the whole vector
  6882. vecTempLight = vec_splat( vecTempLight, 0 );
  6883. vecTempView = vec_splat( vecTempView, 0 );
  6884. vecTempLight2 = vec_splat( vecTempLight2, 0 );
  6885. vecTempView2 = vec_splat( vecTempView2, 0 );
  6886. vecTempLight = ReciprocalSquareRoot( vecTempLight );
  6887. vecTempView = ReciprocalSquareRoot( vecTempView );
  6888. vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
  6889. vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
  6890. // modify light and view vectors based on ilength
  6891. vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
  6892. vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
  6893. vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
  6894. vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
  6895. // calculate what to store in each texture coord
  6896. vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
  6897. vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
  6898. vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
  6899. vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
  6900. vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
  6901. vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
6902. // sum across first 3 elements of vector
  6903. vector float tempSum3;
  6904. tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
  6905. vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
  6906. tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
  6907. vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
  6908. tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
  6909. vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
  6910. tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
  6911. vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
  6912. tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
  6913. vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
  6914. vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
  6915. vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
  6916. vecTC0 = vec_splat( vecTC0, 0 );
  6917. vecTC1 = vec_splat( vecTC1, 0 );
  6918. vecTC2 = vec_splat( vecTC2, 0 );
  6919. vecTC3 = vec_splat( vecTC3, 0 );
  6920. vecTC4 = vec_splat( vecTC4, 0 );
  6921. vecTC5 = vec_splat( vecTC5, 0 );
  6922. if ( used[index] ) {
  6923. // store out results
  6924. vec_ste( vecTC0, 0, &texCoords[index][0] );
  6925. vec_ste( vecTC1, 0, &texCoords[index][1] );
  6926. vec_ste( vecTC2, 0, &texCoords[index][2] );
  6927. vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
  6928. }
  6929. if ( used[index+1] ) {
  6930. vec_ste( vecTC3, 0, &texCoords[index+1][0] );
  6931. vec_ste( vecTC4, 0, &texCoords[index+1][1] );
  6932. vec_ste( vecTC5, 0, &texCoords[index+1][2] );
  6933. vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
  6934. }
  6935. }
  6936. // cleanup
  6937. for ( ; index < numVerts; index++ ) {
  6938. if ( !used[index] ) {
  6939. continue;
  6940. }
  6941. const float *vertPtr = verts[index].xyz.ToFloatPtr();
  6942. permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
  6943. v0 = vec_ld( 0, vertPtr );
  6944. v1 = vec_ld( 15, vertPtr );
  6945. vector float v2 = vec_ld( 31, vertPtr );
  6946. vector float v3 = vec_ld( 47, vertPtr );
  6947. vector float v4 = vec_ld( 63, vertPtr );
  6948. // figure out what values go where
  6949. vector float vecXYZ = vec_perm( v0, v1, permVec );
  6950. vector float vecNormal = vec_perm( v1, v2, permVec );
  6951. vecNormal = vec_sld( vecNormal, vecNormal, 4 );
  6952. const vector float vecTangent0 = vec_perm( v2, v3, permVec );
6953. permVec = vec_add( permVec, (vector unsigned char)(-4) ); // shift the permute window back one float so the next perm starts at tangents[1]
  6954. const vector float vecTangent1 = vec_perm( v3, v4, permVec );
  6955. // calculate lightDir
  6956. vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
  6957. vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
  6958. // calculate distance
  6959. vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
  6960. vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
6961. // sum across first 3 elements of vector
  6962. vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
  6963. vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
  6964. vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
  6965. vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
6966. // splat sum across the whole vector
  6967. vecTempLight = vec_splat( vecTempLight, 0 );
  6968. vecTempView = vec_splat( vecTempView, 0 );
  6969. vecTempLight = ReciprocalSquareRoot( vecTempLight );
  6970. vecTempView = ReciprocalSquareRoot( vecTempView );
  6971. // modify light and view vectors based on ilength
  6972. vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
  6973. vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
  6974. // calculate what to store in each texture coord
  6975. vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
  6976. vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
  6977. vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
6978. // sum across first 3 elements of vector
  6979. vector float tempSum3;
  6980. tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
  6981. vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
  6982. tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
  6983. vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
  6984. tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
  6985. vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
  6986. vecTC0 = vec_splat( vecTC0, 0 );
  6987. vecTC1 = vec_splat( vecTC1, 0 );
  6988. vecTC2 = vec_splat( vecTC2, 0 );
  6989. // store out results
  6990. vec_ste( vecTC0, 0, &texCoords[index][0] );
  6991. vec_ste( vecTC1, 0, &texCoords[index][1] );
  6992. vec_ste( vecTC2, 0, &texCoords[index][2] );
  6993. vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
  6994. }
  6995. }
  6996. #endif /* 0 for disable spec coord */
  6997. #if 1
  6998. #ifdef VERTEXCACHE_ALIGNED
  6999. /*
  7000. ============
  7001. idSIMD_AltiVec::CreateShadowCache
  7002. ============
  7003. */
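// For each vertex that has not been remapped yet, the loop below emits two idVec4s
// and records their position in vertRemap, exactly as the scalar cleanup loop at the
// bottom does ( sketch only, written with the idVec4 constructor for brevity ):
//
//	vertexCache[outVerts+0] = idVec4( v[0], v[1], v[2], 1.0f );
//	vertexCache[outVerts+1] = idVec4( v[0] - lightOrigin[0], v[1] - lightOrigin[1], v[2] - lightOrigin[2], 0.0f );
//	vertRemap[i] = outVerts;
//	outVerts += 2;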
  7004. int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
  7005. int outVerts = 0;
  7006. int i = 0;
  7007. assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
  7008. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  7009. register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
  7010. register vector float zeroVector = (vector float)(0.0);
  7011. register vector float oneVector = (vector float)(1);
  7012. register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
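// vecPermZeroLast keeps bytes 0-11 ( elements 0-2 ) of the first operand and replaces
// element 3 of the result with element 0 of the second operand, so a perm against
// zeroVector or oneVector forces the w component to 0.0f or 1.0f respectively.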
  7013. const float *lPtr = lightOrigin.ToFloatPtr();
  7014. const float *vPtr;
  7015. const float *vPtr2;
  7016. const float *vPtr3;
  7017. const float *vPtr4;
  7018. // put values into a vector
  7019. vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
  7020. v0 = vec_ld( 0, lPtr );
  7021. v1 = vec_ld( 15, lPtr );
  7022. v0 = vec_perm( v0, v1, vecPerm );
  7023. v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
  7024. //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
  7025. for ( ; i+3 < numVerts; i+= 4 ) {
  7026. if ( ! vertRemap[i] ) {
  7027. vPtr = verts[i].xyz.ToFloatPtr();
  7028. #ifndef DRAWVERT_PADDED
  7029. vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
  7030. v2 = vec_ld( 0, vPtr );
  7031. v3 = vec_ld( 15, vPtr );
  7032. v7 = vec_perm( v2, v3, vecPerm2 );
  7033. #else
  7034. v7 = vec_ld( 0, vPtr );
  7035. #endif
  7036. v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
  7037. v3 = vec_perm( v7, oneVector, vecPermZeroLast );
  7038. v1 = vec_sub( v2, v0 );
  7039. vec_st( v3, 0, &vertexCache[outVerts][0] );
  7040. vec_st( v1, 0, &vertexCache[outVerts+1][0] );
  7041. vertRemap[i] = outVerts;
  7042. outVerts += 2;
  7043. }
  7044. if ( ! vertRemap[i+1] ) {
  7045. vPtr2 = verts[i+1].xyz.ToFloatPtr();
  7046. #ifndef DRAWVERT_PADDED
  7047. vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
  7048. v4 = vec_ld( 0, vPtr2 );
  7049. v5 = vec_ld( 15, vPtr2 );
  7050. v6 = vec_perm( v4, v5, vecPerm3 );
  7051. #else
  7052. v6 = vec_ld( 0, vPtr2 );
  7053. #endif
  7054. v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
  7055. v5 = vec_perm( v6, oneVector, vecPermZeroLast );
  7056. v6 = vec_sub( v4, v0 );
  7057. vec_st( v5, 0, &vertexCache[outVerts][0] );
  7058. vec_st( v6, 0, &vertexCache[outVerts+1][0] );
  7059. vertRemap[i+1] = outVerts;
  7060. outVerts += 2;
  7061. }
  7062. if ( ! vertRemap[i+2] ) {
  7063. vPtr3 = verts[i+2].xyz.ToFloatPtr();
  7064. #ifndef DRAWVERT_PADDED
  7065. vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
  7066. v1 = vec_ld( 0, vPtr3 );
  7067. v2 = vec_ld( 15, vPtr3 );
  7068. v3 = vec_perm( v1, v2, vecPerm4 );
  7069. #else
  7070. v3 = vec_ld( 0, vPtr3 );
  7071. #endif
  7072. v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
  7073. v2 = vec_perm( v3, oneVector, vecPermZeroLast );
  7074. v3 = vec_sub( v1, v0 );
  7075. vec_st( v2, 0, &vertexCache[outVerts][0] );
  7076. vec_st( v3, 0, &vertexCache[outVerts+1][0] );
  7077. vertRemap[i+2] = outVerts;
  7078. outVerts += 2;
  7079. }
  7080. if ( ! vertRemap[i+3] ) {
  7081. vPtr4 = verts[i+3].xyz.ToFloatPtr();
  7082. #ifndef DRAWVERT_PADDED
  7083. vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
  7084. v4 = vec_ld( 0, vPtr4 );
7085. v5 = vec_ld( 15, vPtr4 );
  7086. v6 = vec_perm( v4, v5, vecPerm5 );
  7087. #else
  7088. v6 = vec_ld( 0, vPtr4 );
  7089. #endif
  7090. v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
  7091. v5 = vec_perm( v6, oneVector, vecPermZeroLast );
  7092. v6 = vec_sub( v4, v0 );
  7093. vec_st( v5, 0, &vertexCache[outVerts][0] );
  7094. vec_st( v6, 0, &vertexCache[outVerts+1][0] );
  7095. vertRemap[i+3] = outVerts;
  7096. outVerts += 2;
  7097. }
  7098. }
  7099. // cleanup
  7100. for (; i < numVerts; i++ ) {
  7101. if ( vertRemap[i] ) {
  7102. continue;
  7103. }
  7104. const float *v = verts[i].xyz.ToFloatPtr();
  7105. vertexCache[outVerts+0][0] = v[0];
  7106. vertexCache[outVerts+0][1] = v[1];
  7107. vertexCache[outVerts+0][2] = v[2];
  7108. vertexCache[outVerts+0][3] = 1.0f;
  7109. // R_SetupProjection() builds the projection matrix with a slight crunch
  7110. // for depth, which keeps this w=0 division from rasterizing right at the
  7111. // wrap around point and causing depth fighting with the rear caps
  7112. vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
  7113. vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
  7114. vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
  7115. vertexCache[outVerts+1][3] = 0.0f;
  7116. vertRemap[i] = outVerts;
  7117. outVerts += 2;
  7118. }
  7119. return outVerts;
  7120. }
  7121. #else
  7122. /*
  7123. ============
  7124. idSIMD_AltiVec::CreateShadowCache
  7125. ============
  7126. */
  7127. int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
  7128. int outVerts = 0;
  7129. int i = 0;
  7130. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  7131. register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
  7132. register vector float zeroVector = (vector float)(0.0);
  7133. register vector float oneVector = (vector float)(1);
  7134. register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
  7135. const float *lPtr = lightOrigin.ToFloatPtr();
  7136. const float *vPtr;
  7137. const float *vPtr2;
  7138. const float *vPtr3;
  7139. const float *vPtr4;
  7140. // put values into a vector
  7141. vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
  7142. v0 = vec_ld( 0, lPtr );
  7143. v1 = vec_ld( 15, lPtr );
  7144. v0 = vec_perm( v0, v1, vecPerm );
  7145. v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
  7146. //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
  7147. for ( ; i+3 < numVerts; i+= 4 ) {
  7148. if ( ! vertRemap[i] ) {
  7149. vPtr = verts[i].xyz.ToFloatPtr();
  7150. #ifndef DRAWVERT_PADDED
  7151. vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
  7152. v2 = vec_ld( 0, vPtr );
  7153. v3 = vec_ld( 15, vPtr );
  7154. v7 = vec_perm( v2, v3, vecPerm2 );
  7155. #else
  7156. v7 = vec_ld( 0, vPtr );
  7157. #endif
  7158. v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
  7159. v3 = vec_perm( v7, oneVector, vecPermZeroLast );
  7160. v1 = vec_sub( v2, v0 );
  7161. // store results
  7162. UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
  7163. vertRemap[i] = outVerts;
  7164. outVerts += 2;
  7165. }
  7166. if ( ! vertRemap[i+1] ) {
  7167. vPtr2 = verts[i+1].xyz.ToFloatPtr();
  7168. #ifndef DRAWVERT_PADDED
  7169. vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
  7170. v4 = vec_ld( 0, vPtr2 );
  7171. v5 = vec_ld( 15, vPtr2 );
  7172. v6 = vec_perm( v4, v5, vecPerm3 );
  7173. #else
  7174. v6 = vec_ld( 0, vPtr2 );
  7175. #endif
  7176. v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
  7177. v5 = vec_perm( v6, oneVector, vecPermZeroLast );
  7178. v6 = vec_sub( v4, v0 );
  7179. // store results
  7180. UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
  7181. vertRemap[i+1] = outVerts;
  7182. outVerts += 2;
  7183. }
  7184. if ( ! vertRemap[i+2] ) {
  7185. vPtr3 = verts[i+2].xyz.ToFloatPtr();
  7186. #ifndef DRAWVERT_PADDED
  7187. vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
  7188. v1 = vec_ld( 0, vPtr3 );
  7189. v2 = vec_ld( 15, vPtr3 );
  7190. v3 = vec_perm( v1, v2, vecPerm4 );
  7191. #else
  7192. v3 = vec_ld( 0, vPtr3 );
  7193. #endif
  7194. v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
  7195. v2 = vec_perm( v3, oneVector, vecPermZeroLast );
  7196. v3 = vec_sub( v1, v0 );
  7197. // store results
  7198. UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
  7199. vertRemap[i+2] = outVerts;
  7200. outVerts += 2;
  7201. }
  7202. if ( ! vertRemap[i+3] ) {
  7203. vPtr4 = verts[i+3].xyz.ToFloatPtr();
  7204. #ifndef DRAWVERT_PADDED
  7205. vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
  7206. v4 = vec_ld( 0, vPtr4 );
7207. v5 = vec_ld( 15, vPtr4 );
  7208. v6 = vec_perm( v4, v5, vecPerm5 );
  7209. #else
  7210. v6 = vec_ld( 0, vPtr4 );
  7211. #endif
  7212. v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
  7213. v5 = vec_perm( v6, oneVector, vecPermZeroLast );
  7214. v6 = vec_sub( v4, v0 );
  7215. // store results
  7216. UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
  7217. vertRemap[i+3] = outVerts;
  7218. outVerts += 2;
  7219. }
  7220. }
  7221. // cleanup
  7222. for (; i < numVerts; i++ ) {
  7223. if ( vertRemap[i] ) {
  7224. continue;
  7225. }
  7226. const float *v = verts[i].xyz.ToFloatPtr();
  7227. vertexCache[outVerts+0][0] = v[0];
  7228. vertexCache[outVerts+0][1] = v[1];
  7229. vertexCache[outVerts+0][2] = v[2];
  7230. vertexCache[outVerts+0][3] = 1.0f;
  7231. // R_SetupProjection() builds the projection matrix with a slight crunch
  7232. // for depth, which keeps this w=0 division from rasterizing right at the
  7233. // wrap around point and causing depth fighting with the rear caps
  7234. vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
  7235. vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
  7236. vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
  7237. vertexCache[outVerts+1][3] = 0.0f;
  7238. vertRemap[i] = outVerts;
  7239. outVerts += 2;
  7240. }
  7241. return outVerts;
  7242. }
  7243. #endif /* VERTEXCACHE_ALIGNED */
  7244. #endif /* 0 to disable shadow cache */
  7245. #if 1
  7246. #ifdef VERTEXCACHE_ALIGNED
  7247. /*
  7248. ============
  7249. idSIMD_AltiVec::CreateVertexProgramShadowCache
  7250. ============
  7251. */
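// The vertex-program path emits every input vertex twice, once with w = 1.0f and once
// with w = 0.0f, leaving any projection away from the light to be done later
// ( presumably in the vertex program ); the scalar equivalent is the cleanup loop
// below ( sketch only ):
//
//	vertexCache[i*2+0] = idVec4( v[0], v[1], v[2], 1.0f );
//	vertexCache[i*2+1] = idVec4( v[0], v[1], v[2], 0.0f );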
  7252. int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
  7253. // vertexCache aligned
  7254. assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
  7255. // idDrawVert size
  7256. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
  7257. // idVec4 size
  7258. assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
  7259. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  7260. register vector float zeroVector = (vector float)(0.0);
  7261. register vector float oneVector = (vector float)(1);
  7262. register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
  7263. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  7264. int i = 0;
  7265. #ifndef DRAWVERT_PADDED
  7266. // every fourth one will have the same alignment. Make sure we've got enough here
  7267. if ( i+3 < numVerts ) {
  7268. vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7269. vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7270. vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7271. vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7272. }
  7273. #endif
  7274. for ( ; i+3 < numVerts; i+=4 ) {
  7275. const float *vertPtr = verts[i].xyz.ToFloatPtr();
  7276. const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
  7277. const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
  7278. const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
  7279. #ifndef DRAWVERT_PADDED
  7280. v0 = vec_ld( 0, vertPtr );
  7281. v1 = vec_ld( 15, vertPtr );
  7282. v2 = vec_ld( 0, vertPtr2 );
  7283. v3 = vec_ld( 15, vertPtr2 );
  7284. v4 = vec_ld( 0, vertPtr3 );
  7285. v5 = vec_ld( 15, vertPtr3 );
  7286. v6 = vec_ld( 0, vertPtr4 );
  7287. v7 = vec_ld( 15, vertPtr4 );
  7288. v0 = vec_perm( v0, v1, vertPerm1 );
  7289. v1 = vec_perm( v2, v3, vertPerm2 );
  7290. v2 = vec_perm( v4, v5, vertPerm3 );
  7291. v3 = vec_perm( v6, v7, vertPerm4 );
  7292. #else
  7293. v0 = vec_ld( 0, vertPtr );
  7294. v1 = vec_ld( 0, vertPtr2 );
  7295. v2 = vec_ld( 0, vertPtr3 );
  7296. v3 = vec_ld( 0, vertPtr4 );
  7297. #endif
  7298. v0 = vec_perm( v0, oneVector, vecPermThreeOne );
  7299. v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
  7300. v1 = vec_perm( v1, oneVector, vecPermThreeOne );
  7301. v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
  7302. v2 = vec_perm( v2, oneVector, vecPermThreeOne );
  7303. v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
  7304. v3 = vec_perm( v3, oneVector, vecPermThreeOne );
  7305. v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
  7306. // store results
  7307. ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
  7308. ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
  7309. }
  7310. // cleanup
  7311. for ( ; i < numVerts; i++ ) {
  7312. const float *v = verts[i].xyz.ToFloatPtr();
  7313. vertexCache[i*2+0][0] = v[0];
  7314. vertexCache[i*2+1][0] = v[0];
  7315. vertexCache[i*2+0][1] = v[1];
  7316. vertexCache[i*2+1][1] = v[1];
  7317. vertexCache[i*2+0][2] = v[2];
  7318. vertexCache[i*2+1][2] = v[2];
  7319. vertexCache[i*2+0][3] = 1.0f;
  7320. vertexCache[i*2+1][3] = 0.0f;
  7321. }
  7322. return numVerts * 2;
  7323. }
  7324. #else
  7325. /*
  7326. ============
  7327. idSIMD_AltiVec::CreateVertexProgramShadowCache
  7328. ============
  7329. */
  7330. int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
  7331. // idDrawVert size
  7332. assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
  7333. // idVec4 size
  7334. assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
  7335. register vector float v0, v1, v2, v3, v4, v5, v6, v7;
  7336. register vector float zeroVector = (vector float)(0.0);
  7337. register vector float oneVector = (vector float)(1);
  7338. register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
  7339. vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
  7340. int i = 0;
  7341. #ifndef DRAWVERT_PADDED
  7342. // every fourth one will have the same alignment. Make sure we've got enough here
  7343. if ( i+3 < numVerts ) {
  7344. vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7345. vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7346. vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7347. vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
  7348. }
  7349. #endif
  7350. for ( ; i+3 < numVerts; i+=4 ) {
  7351. const float *vertPtr = verts[i].xyz.ToFloatPtr();
  7352. const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
  7353. const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
  7354. const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
  7355. #ifndef DRAWVERT_PADDED
  7356. v0 = vec_ld( 0, vertPtr );
  7357. v1 = vec_ld( 15, vertPtr );
  7358. v2 = vec_ld( 0, vertPtr2 );
  7359. v3 = vec_ld( 15, vertPtr2 );
  7360. v4 = vec_ld( 0, vertPtr3 );
  7361. v5 = vec_ld( 15, vertPtr3 );
  7362. v6 = vec_ld( 0, vertPtr4 );
  7363. v7 = vec_ld( 15, vertPtr4 );
  7364. v0 = vec_perm( v0, v1, vertPerm1 );
  7365. v1 = vec_perm( v2, v3, vertPerm2 );
  7366. v2 = vec_perm( v4, v5, vertPerm3 );
  7367. v3 = vec_perm( v6, v7, vertPerm4 );
  7368. #else
  7369. v0 = vec_ld( 0, vertPtr );
  7370. v1 = vec_ld( 0, vertPtr2 );
  7371. v2 = vec_ld( 0, vertPtr3 );
  7372. v3 = vec_ld( 0, vertPtr4 );
  7373. #endif
  7374. v0 = vec_perm( v0, oneVector, vecPermThreeOne );
  7375. v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
  7376. v1 = vec_perm( v1, oneVector, vecPermThreeOne );
  7377. v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
  7378. v2 = vec_perm( v2, oneVector, vecPermThreeOne );
  7379. v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
  7380. v3 = vec_perm( v3, oneVector, vecPermThreeOne );
  7381. v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
  7382. // store results as unaligned
  7383. vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
  7384. vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
  7385. vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
  7386. vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
  7387. // right rotate input data
  7388. v0 = vec_perm( v0, v0, storePerm );
  7389. v4 = vec_perm( v4, v4, storePerm );
  7390. v1 = vec_perm( v1, v1, storePerm );
  7391. v5 = vec_perm( v5, v5, storePerm );
  7392. v2 = vec_perm( v2, v2, storePerm );
  7393. v6 = vec_perm( v6, v6, storePerm );
  7394. v3 = vec_perm( v3, v3, storePerm );
  7395. v7 = vec_perm( v7, v7, storePerm );
  7396. vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
  7397. vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
  7398. vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
  7399. vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
  7400. vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
  7401. vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
  7402. vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
  7403. vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
  7404. vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
  7405. }
  7406. // cleanup
  7407. for ( ; i < numVerts; i++ ) {
  7408. const float *v = verts[i].xyz.ToFloatPtr();
  7409. vertexCache[i*2+0][0] = v[0];
  7410. vertexCache[i*2+1][0] = v[0];
  7411. vertexCache[i*2+0][1] = v[1];
  7412. vertexCache[i*2+1][1] = v[1];
  7413. vertexCache[i*2+0][2] = v[2];
  7414. vertexCache[i*2+1][2] = v[2];
  7415. vertexCache[i*2+0][3] = 1.0f;
  7416. vertexCache[i*2+1][3] = 0.0f;
  7417. }
  7418. return numVerts * 2;
  7419. }
  7420. #endif /* VERTEXCACHE_ALIGNED */
  7421. #endif /* 0 to kill VP shader cache */
  7422. #endif /* ENABLE_CREATE */
  7423. #ifdef ENABLE_SOUND_ROUTINES
  7424. #ifdef SOUND_DEST_ALIGNED
  7425. /*
  7426. ============
  7427. idSIMD_AltiVec::UpSamplePCMTo44kHz
  7428. Duplicate samples for 44kHz output.
  7429. Assumptions:
7430. dest starts at a 16-byte aligned address
  7431. ============
  7432. */
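// Scalar equivalents of the three rate branches below ( per mono sample; for stereo
// the same duplication factors apply with left/right samples kept interleaved ):
//
//	kHz == 11025: dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i];	// 4x duplication
//	kHz == 22050: dest[i*2+0] = dest[i*2+1] = (float) src[i];								// 2x duplication
//	kHz == 44100: dest[i] = (float) src[i];													// straight short -> float copy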
  7433. void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
  7434. // dest is aligned
  7435. assert( IS_16BYTE_ALIGNED( dest[0] ) );
  7436. vector signed short vs0, vs1;
  7437. register vector signed int vi0, vi1;
  7438. register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
  7439. // permute vectors
  7440. register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
  7441. register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
  7442. register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
  7443. register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
  7444. // If this can be assumed true, we can eliminate another conditional that checks to see if we can
  7445. // load up a vector before the loop
  7446. assert( numSamples >= 12 );
  7447. if ( kHz == 11025 ) {
  7448. if ( numChannels == 1 ) {
  7449. // 8 at a time
  7450. int i = 0;
  7451. vector signed short vsOld = vec_ld( 0, &src[i] );
  7452. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
  7453. for ( ; i+7 < numSamples; i+= 8 ) {
  7454. // load src
  7455. vs1 = vec_ld( 15, &src[i] );
  7456. vs0 = vec_perm( vsOld, vs1, permVec );
  7457. vsOld = vs1;
  7458. // unpack shorts to ints
  7459. vi0 = vec_unpackh( vs0 );
  7460. vi1 = vec_unpackl( vs0 );
  7461. // convert ints to floats
  7462. v0 = vec_ctf( vi0, 0 );
  7463. v1 = vec_ctf( vi1, 0 );
  7464. // permute into vectors in the order to store
  7465. v2 = vec_splat( v0, 0 );
  7466. v3 = vec_splat( v0, 1 );
  7467. v4 = vec_splat( v0, 2 );
  7468. v5 = vec_splat( v0, 3 );
  7469. v6 = vec_splat( v1, 0 );
  7470. v7 = vec_splat( v1, 1 );
  7471. v8 = vec_splat( v1, 2 );
  7472. v9 = vec_splat( v1, 3 );
  7473. // store results
  7474. ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
  7475. }
  7476. // cleanup
  7477. for (; i < numSamples; i++ ) {
  7478. dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
  7479. }
  7480. } else {
  7481. int i = 0;
  7482. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7483. vector signed short vsOld = vec_ld( 0, &src[0] );
  7484. for ( ; i+7 < numSamples; i += 8 ) {
  7485. // load src
  7486. vs1 = vec_ld( 15, &src[i] );
  7487. vs0 = vec_perm( vsOld, vs1, permVec );
  7488. vsOld = vs1;
  7489. // unpack shorts to ints
  7490. vi0 = vec_unpackh( vs0 );
  7491. vi1 = vec_unpackl( vs0 );
  7492. // convert ints to floats
  7493. v0 = vec_ctf( vi0, 0 );
  7494. v1 = vec_ctf( vi1, 0 );
  7495. // put into vectors in order to store
  7496. v2 = vec_perm( v0, v0, vecFirstHalf );
  7497. v3 = v2;
  7498. v4 = vec_perm( v0, v0, vecSecondHalf );
  7499. v5 = v4;
  7500. v6 = vec_perm( v1, v1, vecFirstHalf );
  7501. v7 = v6;
  7502. v8 = vec_perm (v1, v1, vecSecondHalf );
  7503. v9 = v8;
  7504. // store results
  7505. ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
  7506. }
  7507. for ( ; i < numSamples; i += 2 ) {
  7508. dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
  7509. dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
  7510. }
  7511. }
  7512. } else if ( kHz == 22050 ) {
  7513. if ( numChannels == 1 ) {
  7514. int i;
  7515. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7516. vector signed short vsOld = vec_ld( 0, &src[0] );
  7517. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7518. // load src
7519. vs1 = vec_ld( 15, &src[i] );
  7520. vs0 = vec_perm( vsOld, vs1, permVec );
  7521. vsOld = vs1;
  7522. // unpack shorts to ints
  7523. vi0 = vec_unpackh( vs0 );
  7524. vi1 = vec_unpackl( vs0 );
  7525. // convert ints to floats
  7526. v0 = vec_ctf( vi0, 0 );
  7527. v1 = vec_ctf( vi1, 0 );
  7528. // put into vectors in order to store
  7529. v2 = vec_perm( v0, v0, vecBottom );
  7530. v3 = vec_perm( v0, v0, vecTop );
  7531. v4 = vec_perm( v1, v1, vecBottom );
  7532. v5 = vec_perm (v1, v1, vecTop );
  7533. // store results
  7534. ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
  7535. }
  7536. // cleanup
  7537. for ( ; i < numSamples; i++ ) {
  7538. dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
  7539. }
  7540. } else {
  7541. int i;
  7542. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7543. vector signed short vsOld = vec_ld( 0, &src[0] );
  7544. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7545. // load src
  7546. vs1 = vec_ld( 15, &src[i] );
  7547. vs0 = vec_perm( vsOld, vs1, permVec );
  7548. vsOld = vs1;
  7549. // unpack shorts to ints
  7550. vi0 = vec_unpackh( vs0 );
  7551. vi1 = vec_unpackl( vs0 );
  7552. // convert ints to floats
  7553. v0 = vec_ctf( vi0, 0 );
  7554. v1 = vec_ctf( vi1, 0 );
  7555. // put into vectors in order to store
  7556. v2 = vec_perm( v0, v0, vecFirstHalf );
  7557. v3 = vec_perm( v0, v0, vecSecondHalf );
  7558. v4 = vec_perm( v1, v1, vecFirstHalf );
  7559. v5 = vec_perm (v1, v1, vecSecondHalf );
  7560. // store results
  7561. ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
  7562. }
  7563. // cleanup
  7564. for ( ; i < numSamples; i += 2 ) {
  7565. dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
  7566. dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
  7567. }
  7568. }
  7569. } else if ( kHz == 44100 ) {
  7570. int i;
  7571. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7572. vector signed short vsOld = vec_ld( 0, &src[0] );
  7573. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7574. vs1 = vec_ld( 15, &src[i] );
  7575. vs0 = vec_perm( vsOld, vs1, permVec );
  7576. vsOld = vs1;
  7577. //unpack shorts to ints
  7578. vi0 = vec_unpackh( vs0 );
  7579. vi1 = vec_unpackl( vs0 );
  7580. //convert ints to floats
  7581. v0 = vec_ctf( vi0, 0 );
  7582. v1 = vec_ctf( vi1, 0 );
  7583. //store results
  7584. ALIGNED_STORE2( &dest[i], v0, v1 );
  7585. }
  7586. // cleanup
  7587. for ( ; i < numSamples; i++ ) {
  7588. dest[i] = (float) src[i];
  7589. }
  7590. } else {
  7591. assert( 0 );
  7592. }
  7593. }
  7594. #else
  7595. /*
  7596. ============
  7597. idSIMD_AltiVec::UpSamplePCMTo44kHz
  7598. Duplicate samples for 44kHz output.
  7599. Assumptions:
  7600. No assumptions
  7601. ============
  7602. */
  7603. void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
  7604. vector signed short vs0, vs1;
  7605. register vector signed int vi0, vi1;
  7606. register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
  7607. // permute vectors
  7608. register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
  7609. register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
  7610. register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
  7611. register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
  7612. // calculate perm vector and masks for stores
  7613. vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
  7614. // original values of dest
  7615. vector float vecDest = vec_ld( 0, &dest[0] );
  7616. vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
  7617. if ( kHz == 11025 ) {
  7618. if ( numChannels == 1 ) {
  7619. // 8 at a time
  7620. int i = 0;
  7621. vector signed short vsOld = vec_ld( 0, &src[i] );
  7622. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
  7623. for ( ; i+7 < numSamples; i+= 8 ) {
  7624. // load src
  7625. vs1 = vec_ld( 15, &src[i] );
  7626. vs0 = vec_perm( vsOld, vs1, permVec );
  7627. vsOld = vs1;
  7628. vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
  7629. // unpack shorts to ints
  7630. vi0 = vec_unpackh( vs0 );
  7631. vi1 = vec_unpackl( vs0 );
  7632. // convert ints to floats
  7633. v0 = vec_ctf( vi0, 0 );
  7634. v1 = vec_ctf( vi1, 0 );
  7635. // permute into vectors in the order to store
  7636. v2 = vec_splat( v0, 0 );
  7637. v3 = vec_splat( v0, 1 );
  7638. v4 = vec_splat( v0, 2 );
  7639. v5 = vec_splat( v0, 3 );
  7640. v6 = vec_splat( v1, 0 );
  7641. v7 = vec_splat( v1, 1 );
  7642. v8 = vec_splat( v1, 2 );
  7643. v9 = vec_splat( v1, 3 );
  7644. v2 = vec_perm( v2, v2, storePerm );
  7645. v3 = vec_perm( v3, v3, storePerm );
  7646. v4 = vec_perm( v4, v4, storePerm );
  7647. v5 = vec_perm( v5, v5, storePerm );
  7648. v6 = vec_perm( v6, v6, storePerm );
  7649. v7 = vec_perm( v7, v7, storePerm );
  7650. v8 = vec_perm( v8, v8, storePerm );
  7651. v9 = vec_perm( v9, v9, storePerm );
  7652. // store results
  7653. vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
  7654. vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
  7655. vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
  7656. vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
  7657. vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
  7658. vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
  7659. vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
  7660. vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
  7661. vecDest = vec_sel( v9, vecDestEnd, mask );
  7662. vec_st( vecDest, 127, &dest[i*4] );
  7663. }
  7664. // cleanup
  7665. for (; i < numSamples; i++ ) {
  7666. dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
  7667. }
  7668. } else {
  7669. int i = 0;
  7670. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7671. vector signed short vsOld = vec_ld( 0, &src[0] );
  7672. for ( ; i+7 < numSamples; i += 8 ) {
  7673. // load src
  7674. vs1 = vec_ld( 15, &src[i] );
  7675. vs0 = vec_perm( vsOld, vs1, permVec );
  7676. vsOld = vs1;
  7677. vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
  7678. // unpack shorts to ints
  7679. vi0 = vec_unpackh( vs0 );
  7680. vi1 = vec_unpackl( vs0 );
  7681. // convert ints to floats
  7682. v0 = vec_ctf( vi0, 0 );
  7683. v1 = vec_ctf( vi1, 0 );
  7684. // put into vectors in order to store
  7685. v2 = vec_perm( v0, v0, vecFirstHalf );
  7686. v3 = v2;
  7687. v4 = vec_perm( v0, v0, vecSecondHalf );
  7688. v5 = v4;
  7689. v6 = vec_perm( v1, v1, vecFirstHalf );
  7690. v7 = v6;
  7691. v8 = vec_perm (v1, v1, vecSecondHalf );
  7692. v9 = v8;
  7693. v2 = vec_perm( v2, v2, storePerm );
  7694. v3 = vec_perm( v3, v3, storePerm );
  7695. v4 = vec_perm( v4, v4, storePerm );
  7696. v5 = vec_perm( v5, v5, storePerm );
  7697. v6 = vec_perm( v6, v6, storePerm );
  7698. v7 = vec_perm( v7, v7, storePerm );
  7699. v8 = vec_perm( v8, v8, storePerm );
  7700. v9 = vec_perm( v9, v9, storePerm );
  7701. // store results
  7702. vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
  7703. vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
  7704. vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
  7705. vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
  7706. vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
  7707. vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
  7708. vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
  7709. vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
  7710. vecDest = vec_sel( v9, vecDestEnd, mask );
  7711. vec_st( vecDest, 127, &dest[i*4] );
  7712. }
  7713. for ( ; i < numSamples; i += 2 ) {
  7714. dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
  7715. dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
  7716. }
  7717. }
  7718. } else if ( kHz == 22050 ) {
  7719. if ( numChannels == 1 ) {
  7720. int i;
  7721. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7722. vector signed short vsOld = vec_ld( 0, &src[0] );
  7723. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7724. // load src
7725. vs1 = vec_ld( 15, &src[i] ); // load the block past src[i]; offset 15 matches the other branches and is required when src is not 16-byte aligned
  7726. vs0 = vec_perm( vsOld, vs1, permVec );
  7727. vsOld = vs1;
  7728. vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
  7729. // unpack shorts to ints
  7730. vi0 = vec_unpackh( vs0 );
  7731. vi1 = vec_unpackl( vs0 );
  7732. // convert ints to floats
  7733. v0 = vec_ctf( vi0, 0 );
  7734. v1 = vec_ctf( vi1, 0 );
  7735. // put into vectors in order to store
  7736. v2 = vec_perm( v0, v0, vecBottom );
  7737. v3 = vec_perm( v0, v0, vecTop );
  7738. v4 = vec_perm( v1, v1, vecBottom );
7739. v5 = vec_perm( v1, v1, vecTop );
  7740. v2 = vec_perm( v2, v2, storePerm );
  7741. v3 = vec_perm( v3, v3, storePerm );
  7742. v4 = vec_perm( v4, v4, storePerm );
  7743. v5 = vec_perm( v5, v5, storePerm );
  7744. // store results
  7745. vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
  7746. vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
  7747. vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
  7748. vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
  7749. vecDest = vec_sel( v5, vecDestEnd, mask );
  7750. vec_st( vecDest, 63, &dest[i*2] );
  7751. }
  7752. // cleanup
  7753. for ( ; i < numSamples; i++ ) {
  7754. dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
  7755. }
  7756. } else {
  7757. int i;
  7758. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7759. vector signed short vsOld = vec_ld( 0, &src[0] );
  7760. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7761. // load src
  7762. vs1 = vec_ld( 15, &src[i] );
  7763. vs0 = vec_perm( vsOld, vs1, permVec );
  7764. vsOld = vs1;
  7765. vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
  7766. // unpack shorts to ints
  7767. vi0 = vec_unpackh( vs0 );
  7768. vi1 = vec_unpackl( vs0 );
  7769. // convert ints to floats
  7770. v0 = vec_ctf( vi0, 0 );
  7771. v1 = vec_ctf( vi1, 0 );
  7772. // put into vectors in order to store
  7773. v2 = vec_perm( v0, v0, vecFirstHalf );
  7774. v3 = vec_perm( v0, v0, vecSecondHalf );
  7775. v4 = vec_perm( v1, v1, vecFirstHalf );
7776. v5 = vec_perm( v1, v1, vecSecondHalf );
  7777. v2 = vec_perm( v2, v2, storePerm );
  7778. v3 = vec_perm( v3, v3, storePerm );
  7779. v4 = vec_perm( v4, v4, storePerm );
  7780. v5 = vec_perm( v5, v5, storePerm );
  7781. // store results
  7782. vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
  7783. vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
  7784. vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
  7785. vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
  7786. vecDest = vec_sel( v5, vecDestEnd, mask );
  7787. vec_st( vecDest, 63, &dest[i*2] );
  7788. }
  7789. // cleanup
  7790. for ( ; i < numSamples; i += 2 ) {
  7791. dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
  7792. dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
  7793. }
  7794. }
  7795. } else if ( kHz == 44100 ) {
  7796. int i;
  7797. vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
  7798. vector signed short vsOld = vec_ld( 0, &src[0] );
  7799. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7800. //vs0 = vec_ld( 0, &src[i] );
  7801. vs1 = vec_ld( 15, &src[i] );
  7802. vs0 = vec_perm( vsOld, vs1, permVec );
  7803. vsOld = vs1;
  7804. vector float vecDestEnd = vec_ld( 31, &dest[i] );
  7805. //unpack shorts to ints
  7806. vi0 = vec_unpackh( vs0 );
  7807. vi1 = vec_unpackl( vs0 );
  7808. //convert ints to floats
  7809. v0 = vec_ctf( vi0, 0 );
  7810. v1 = vec_ctf( vi1, 0 );
  7811. v0 = vec_perm( v0, v0, storePerm );
  7812. v1 = vec_perm( v1, v1, storePerm );
  7813. // store results
  7814. vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
  7815. vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
  7816. vecDest = vec_sel( v1, vecDestEnd, mask );
  7817. vec_st( vecDest, 31, &dest[i] );
  7818. }
  7819. // cleanup
  7820. for ( ; i < numSamples; i++ ) {
  7821. dest[i] = (float) src[i];
  7822. }
  7823. } else {
  7824. assert( 0 );
  7825. }
  7826. }
  7827. #endif
  7828. #ifdef SOUND_DEST_ALIGNED
  7829. /*
  7830. ============
  7831. idSIMD_AltiVec::UpSampleOGGTo44kHz
  7832. Duplicate samples for 44kHz output.
  7833. Assumptions:
  7834. Assumes that dest starts at aligned address
  7835. ============
  7836. */
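// Note: the decoder hands us floating point samples (nominally in [-1, 1]), so each
// value is scaled by 32768.0f up to the 16-bit PCM range the mixer works in, then
// duplicated 4x (11 kHz source), 2x (22 kHz), or copied straight through (44.1 kHz)
// per channel.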
  7837. void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
  7838. // dest is aligned
  7839. assert( IS_16BYTE_ALIGNED( dest[0] ) );
  7840. register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
  7841. register vector float constVec, zeroVector;
  7842. register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
  7843. vector unsigned char vecPerm1;
  7844. vector unsigned char vecPerm2;
  7845. vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
  7846. vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
  7847. vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
  7848. vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
  7849. vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
  7850. vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
  7851. constVec = (vector float)(32768.0f);
  7852. zeroVector = (vector float)(0.0);
  7853. if ( kHz == 11025 ) {
  7854. if ( numChannels == 1 ) {
  7855. // calculate perm vector and do first load
  7856. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  7857. v10 = vec_ld( 0, &ogg[0][0] );
  7858. int i;
  7859. for ( i = 0; i+7 < numSamples; i += 8 ) {
7860. // as it happens, ogg[0][i] through ogg[0][i+7] are contiguous in memory
  7861. v8 = v10;
  7862. v9 = vec_ld( 15, &ogg[0][i] );
  7863. v10 = vec_ld( 31, &ogg[0][i] );
  7864. v0 = vec_perm( v8, v9, vecPerm1 );
  7865. v1 = vec_perm( v9, v10, vecPerm1 );
7866. // now that the elements are in a vector, splat each one
7867. // across its own vector
  7868. oggVec1 = vec_splat( v0, 0 );
  7869. oggVec2 = vec_splat( v0, 1 );
  7870. oggVec3 = vec_splat( v0, 2 );
  7871. oggVec4 = vec_splat( v0, 3 );
  7872. oggVec5 = vec_splat( v1, 0 );
  7873. oggVec6 = vec_splat( v1, 1 );
  7874. oggVec7 = vec_splat( v1, 2 );
  7875. oggVec8 = vec_splat( v1, 3 );
  7876. v0 = vec_madd( oggVec1, constVec, zeroVector );
  7877. v1 = vec_madd( oggVec2, constVec, zeroVector );
  7878. v2 = vec_madd( oggVec3, constVec, zeroVector );
  7879. v3 = vec_madd( oggVec4, constVec, zeroVector );
  7880. v4 = vec_madd( oggVec5, constVec, zeroVector );
  7881. v5 = vec_madd( oggVec6, constVec, zeroVector );
  7882. v6 = vec_madd( oggVec7, constVec, zeroVector );
  7883. v7 = vec_madd( oggVec8, constVec, zeroVector );
  7884. //store results
  7885. ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
  7886. }
  7887. //cleanup
  7888. for ( ; i < numSamples; i++ ) {
  7889. dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
  7890. }
  7891. } else {
  7892. // calculate perm vec for ogg
  7893. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  7894. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
  7895. v7 = vec_ld( 0, &ogg[1][0] );
  7896. v9 = vec_ld( 0, &ogg[0][0] );
  7897. int i;
7898. for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // 4 frames per channel per iteration
  7899. // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
  7900. v8 = v9;
  7901. v9 = vec_ld( 15, &ogg[0][i] );
  7902. v0 = vec_perm( v8, v9, vecPerm1 );
7903. // now that the elements are in a vector, splat each one
7904. // across its own vector
  7905. oggVec1 = vec_splat( v0, 0 );
  7906. oggVec2 = vec_splat( v0, 1 );
  7907. oggVec3 = vec_splat( v0, 2 );
  7908. oggVec4 = vec_splat( v0, 3 );
  7909. // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
  7910. v6 = v7;
  7911. v7 = vec_ld( 15, &ogg[1][i] );
  7912. v1 = vec_perm( v6, v7, vecPerm2 );
7913. // now that the elements are in a vector, splat each one
7914. // across its own vector
  7915. oggVec5 = vec_splat( v1, 0 );
  7916. oggVec6 = vec_splat( v1, 1 );
  7917. oggVec7 = vec_splat( v1, 2 );
  7918. oggVec8 = vec_splat( v1, 3 );
  7919. oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
  7920. oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
  7921. oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
  7922. oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
  7923. oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
  7924. oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
  7925. oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
  7926. oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
  7927. //merge generates the interleaved pattern that we want and it
  7928. //doesn't require a permute vector, so use that instead
  7929. v0 = vec_mergeh( oggVec1, oggVec5 );
  7930. v1 = vec_mergel( oggVec1, oggVec5 );
  7931. v2 = vec_mergeh( oggVec2, oggVec6 );
  7932. v3 = vec_mergel( oggVec2, oggVec6 );
  7933. v4 = vec_mergeh( oggVec3, oggVec7 );
  7934. v5 = vec_mergel( oggVec3, oggVec7 );
  7935. v6 = vec_mergeh( oggVec4, oggVec8 );
  7936. v10 = vec_mergel( oggVec4, oggVec8 );
  7937. //store results
  7938. ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
  7939. }
  7940. //cleanup
  7941. for ( ; i < numSamples >> 1; i++ ) {
  7942. dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
  7943. dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
  7944. }
  7945. }
  7946. } else if ( kHz == 22050 ) {
  7947. if ( numChannels == 1 ) {
  7948. // calculate perm vector and do first load
  7949. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  7950. v10 = vec_ld( 0, &ogg[0][0] );
  7951. int i;
  7952. for ( i = 0; i+7 < numSamples; i += 8 ) {
  7953. // load values from ogg
  7954. v8 = v10;
  7955. v9 = vec_ld( 15, &ogg[0][i] );
  7956. v10 = vec_ld( 31, &ogg[0][i] );
  7957. v0 = vec_perm( v8, v9, vecPerm1 );
  7958. v1 = vec_perm( v9, v10, vecPerm1 );
  7959. // multiply
  7960. v0 = vec_madd( v0, constVec, zeroVector );
  7961. v1 = vec_madd( v1, constVec, zeroVector );
7962. // permute into result vectors to store
  7963. v5 = vec_perm( v0, v0, vecOneTwo );
  7964. v6 = vec_perm( v0, v0, vecThreeFour);
  7965. v7 = vec_perm( v1, v1, vecOneTwo );
  7966. v8 = vec_perm( v1, v1, vecThreeFour );
  7967. //store results
  7968. ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
  7969. }
  7970. // cleanup
  7971. for ( ; i < numSamples; i++ ) {
  7972. dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
  7973. }
  7974. } else {
  7975. // calculate perm vector and do first load
  7976. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  7977. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
  7978. v7 = vec_ld( 0, &ogg[1][0] );
  7979. v9 = vec_ld( 0, &ogg[0][0] );
  7980. int i;
  7981. for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
7982. // load ogg[0][i] to ogg[0][i+3]
  7983. v8 = v9;
  7984. v9 = vec_ld( 15, &ogg[0][i] );
  7985. v0 = vec_perm( v8, v9, vecPerm1 );
  7986. // load ogg[1][i] to ogg[1][i+3]
  7987. v6 = v7;
  7988. v7 = vec_ld( 15, &ogg[1][i] );
  7989. v1 = vec_perm( v6, v7, vecPerm2 );
  7990. // multiply
  7991. v0 = vec_madd( v0, constVec, zeroVector );
  7992. v1 = vec_madd( v1, constVec, zeroVector );
  7993. // generate result vectors to store
  7994. v2 = vec_perm( v0, v1, vecFirst );
  7995. v3 = vec_perm( v0, v1, vecSecond );
  7996. v4 = vec_perm( v0, v1, vecThird );
  7997. v5 = vec_perm( v0, v1, vecFourth );
  7998. // store results
  7999. ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
  8000. }
  8001. // cleanup
  8002. for ( ; i < numSamples >> 1; i++ ) {
  8003. dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
  8004. dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
  8005. }
  8006. }
  8007. } else if ( kHz == 44100 ) {
  8008. if ( numChannels == 1 ) {
  8009. // calculate perm vector and do first load
  8010. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8011. v9 = vec_ld( 0, &ogg[0][0] );
  8012. int i;
  8013. for ( i = 0; i+7 < numSamples; i += 8 ) {
  8014. // load values from ogg
  8015. v8 = v9;
  8016. v7 = vec_ld( 15, &ogg[0][i] );
  8017. v6 = v7;
  8018. v9 = vec_ld( 31, &ogg[0][i] );
  8019. v0 = vec_perm( v8, v7, vecPerm1 );
  8020. v1 = vec_perm( v6, v9, vecPerm1 );
  8021. // multiply
  8022. v0 = vec_madd( v0, constVec, zeroVector );
  8023. v1 = vec_madd( v1, constVec, zeroVector );
  8024. ALIGNED_STORE2( &dest[i], v0, v1 );
  8025. }
  8026. // cleanup
  8027. for ( ; i < numSamples; i++ ) {
  8028. dest[i*1+0] = ogg[0][i] * 32768.0f;
  8029. }
  8030. } else {
  8031. // calculate perm vector and do first load
  8032. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8033. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
  8034. v7 = vec_ld( 0, &ogg[1][0] );
  8035. v9 = vec_ld( 0, &ogg[0][0] );
  8036. int i;
  8037. for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
  8038. v8 = v9;
  8039. v9 = vec_ld( 15, &ogg[0][i] );
  8040. v0 = vec_perm( v8, v9, vecPerm1 );
  8041. // load ogg[1][i] to ogg[1][i+3]
  8042. v6 = v7;
  8043. v7 = vec_ld( 15, &ogg[1][i] );
  8044. v1 = vec_perm( v6, v7, vecPerm2 );
  8045. // multiply
  8046. v0 = vec_madd( v0, constVec, zeroVector );
  8047. v1 = vec_madd( v1, constVec, zeroVector );
  8048. // generate result vectors
  8049. v2 = vec_mergeh( v0, v1 );
  8050. v3 = vec_mergel( v0, v1 );
  8051. // store results
  8052. ALIGNED_STORE2( &dest[i*2], v2, v3 );
  8053. }
  8054. // cleanup
  8055. for ( ; i < numSamples >> 1; i++ ) {
  8056. dest[i*2+0] = ogg[0][i] * 32768.0f;
  8057. dest[i*2+1] = ogg[1][i] * 32768.0f;
  8058. }
  8059. }
  8060. } else {
  8061. assert( 0 );
  8062. }
  8063. }
  8064. #else
  8065. /*
  8066. ============
  8067. idSIMD_AltiVec::UpSampleOGGTo44kHz
  8068. Duplicate samples for 44kHz output.
  8069. Assumptions:
  8070. No assumptions
  8071. ============
  8072. */
  8073. void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
  8074. register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
  8075. register vector float constVec, zeroVector;
  8076. register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
  8077. vector unsigned char vecPerm1;
  8078. vector unsigned char vecPerm2;
  8079. vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
  8080. vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
  8081. vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
  8082. vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
  8083. vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
  8084. vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
  8085. vector unsigned char storePerm;
  8086. constVec = (vector float)(32768.0f);
  8087. zeroVector = (vector float)(0.0);
  8088. // calculate perm vector and masks for stores
  8089. storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
  8090. // original values of dest
  8091. vector float vecDest = vec_ld( 0, &dest[0] );
  8092. vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
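// Same rotate-and-select store technique as the unaligned PCM upsampling path above:
// rotate each result through storePerm, then blend adjacent vectors with vec_sel and
// 'mask' so every store goes to an aligned address and untouched dest bytes survive.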
  8093. if ( kHz == 11025 ) {
  8094. if ( numChannels == 1 ) {
  8095. // calculate perm vector and do first load
  8096. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8097. v10 = vec_ld( 0, &ogg[0][0] );
  8098. int i;
  8099. for ( i = 0; i+7 < numSamples; i += 8 ) {
8100. // as it happens, ogg[0][i] through ogg[0][i+7] are contiguous in memory
  8101. v8 = v10;
  8102. v9 = vec_ld( 15, &ogg[0][i] );
  8103. v10 = vec_ld( 31, &ogg[0][i] );
  8104. vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
  8105. v0 = vec_perm( v8, v9, vecPerm1 );
  8106. v1 = vec_perm( v9, v10, vecPerm1 );
8107. // now that the elements are in a vector, splat each one
8108. // across its own vector
  8109. oggVec1 = vec_splat( v0, 0 );
  8110. oggVec2 = vec_splat( v0, 1 );
  8111. oggVec3 = vec_splat( v0, 2 );
  8112. oggVec4 = vec_splat( v0, 3 );
  8113. oggVec5 = vec_splat( v1, 0 );
  8114. oggVec6 = vec_splat( v1, 1 );
  8115. oggVec7 = vec_splat( v1, 2 );
  8116. oggVec8 = vec_splat( v1, 3 );
  8117. v0 = vec_madd( oggVec1, constVec, zeroVector );
  8118. v1 = vec_madd( oggVec2, constVec, zeroVector );
  8119. v2 = vec_madd( oggVec3, constVec, zeroVector );
  8120. v3 = vec_madd( oggVec4, constVec, zeroVector );
  8121. v4 = vec_madd( oggVec5, constVec, zeroVector );
  8122. v5 = vec_madd( oggVec6, constVec, zeroVector );
  8123. v6 = vec_madd( oggVec7, constVec, zeroVector );
  8124. v7 = vec_madd( oggVec8, constVec, zeroVector );
  8125. // rotate input data
  8126. v0 = vec_perm( v0, v0, storePerm );
  8127. v1 = vec_perm( v1, v1, storePerm );
  8128. v2 = vec_perm( v2, v2, storePerm );
  8129. v3 = vec_perm( v3, v3, storePerm );
  8130. v4 = vec_perm( v4, v4, storePerm );
  8131. v5 = vec_perm( v5, v5, storePerm );
  8132. v6 = vec_perm( v6, v6, storePerm );
  8133. v7 = vec_perm( v7, v7, storePerm );
  8134. // store results
  8135. vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
  8136. vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
  8137. vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
  8138. vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
  8139. vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
  8140. vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
  8141. vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
  8142. vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
  8143. vecDest = vec_sel( v7, vecDestEnd, mask );
  8144. vec_st( vecDest, 127, &dest[i*4] );
  8145. }
  8146. //cleanup
  8147. for ( ; i < numSamples; i++ ) {
  8148. dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
  8149. }
  8150. } else {
  8151. // calculate perm vec for ogg
  8152. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8153. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
  8154. v7 = vec_ld( 0, &ogg[1][0] );
  8155. v9 = vec_ld( 0, &ogg[0][0] );
  8156. int i;
8157. for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // 4 frames per channel per iteration
  8158. // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
  8159. v8 = v9;
  8160. v9 = vec_ld( 15, &ogg[0][i] );
  8161. vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
  8162. v0 = vec_perm( v8, v9, vecPerm1 );
8163. // now that the elements are in a vector, splat each one
8164. // across its own vector
  8165. oggVec1 = vec_splat( v0, 0 );
  8166. oggVec2 = vec_splat( v0, 1 );
  8167. oggVec3 = vec_splat( v0, 2 );
  8168. oggVec4 = vec_splat( v0, 3 );
  8169. // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
  8170. v6 = v7;
  8171. v7 = vec_ld( 15, &ogg[1][i] );
  8172. v1 = vec_perm( v6, v7, vecPerm2 );
8173. // now that the elements are in a vector, splat each one
8174. // across its own vector
  8175. oggVec5 = vec_splat( v1, 0 );
  8176. oggVec6 = vec_splat( v1, 1 );
  8177. oggVec7 = vec_splat( v1, 2 );
  8178. oggVec8 = vec_splat( v1, 3 );
  8179. oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
  8180. oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
  8181. oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
  8182. oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
  8183. oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
  8184. oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
  8185. oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
  8186. oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
  8187. //merge generates the interleaved pattern that we want and it
  8188. //doesn't require a permute vector, so use that instead
  8189. v0 = vec_mergeh( oggVec1, oggVec5 );
  8190. v1 = vec_mergel( oggVec1, oggVec5 );
  8191. v2 = vec_mergeh( oggVec2, oggVec6 );
  8192. v3 = vec_mergel( oggVec2, oggVec6 );
  8193. v4 = vec_mergeh( oggVec3, oggVec7 );
  8194. v5 = vec_mergel( oggVec3, oggVec7 );
  8195. v6 = vec_mergeh( oggVec4, oggVec8 );
  8196. v10 = vec_mergel( oggVec4, oggVec8 );
  8197. // rotate input data
  8198. v0 = vec_perm( v0, v0, storePerm );
  8199. v1 = vec_perm( v1, v1, storePerm );
  8200. v2 = vec_perm( v2, v2, storePerm );
  8201. v3 = vec_perm( v3, v3, storePerm );
  8202. v4 = vec_perm( v4, v4, storePerm );
  8203. v5 = vec_perm( v5, v5, storePerm );
  8204. v6 = vec_perm( v6, v6, storePerm );
  8205. v10 = vec_perm( v10, v10, storePerm );
  8206. // store results
  8207. vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
  8208. vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
  8209. vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
  8210. vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
  8211. vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
  8212. vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
  8213. vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
  8214. vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
  8215. vecDest = vec_sel( v10, vecDestEnd, mask );
  8216. vec_st( vecDest, 127, &dest[i*8] );
  8217. }
  8218. //cleanup
  8219. for ( ; i < numSamples >> 1; i++ ) {
  8220. dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
  8221. dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
  8222. }
  8223. }
  8224. } else if ( kHz == 22050 ) {
  8225. if ( numChannels == 1 ) {
  8226. // calculate perm vector and do first load
  8227. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8228. v10 = vec_ld( 0, &ogg[0][0] );
  8229. int i;
  8230. for ( i = 0; i+7 < numSamples; i += 8 ) {
  8231. // load values from ogg
  8232. v8 = v10;
  8233. v9 = vec_ld( 15, &ogg[0][i] );
  8234. v10 = vec_ld( 31, &ogg[0][i] );
  8235. vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
  8236. v0 = vec_perm( v8, v9, vecPerm1 );
  8237. v1 = vec_perm( v9, v10, vecPerm1 );
  8238. // multiply
  8239. v0 = vec_madd( v0, constVec, zeroVector );
  8240. v1 = vec_madd( v1, constVec, zeroVector );
8241. // permute into result vectors to store
  8242. v5 = vec_perm( v0, v0, vecOneTwo );
  8243. v6 = vec_perm( v0, v0, vecThreeFour);
  8244. v7 = vec_perm( v1, v1, vecOneTwo );
  8245. v8 = vec_perm( v1, v1, vecThreeFour );
  8246. // rotate input data
  8247. v5 = vec_perm( v5, v5, storePerm );
  8248. v6 = vec_perm( v6, v6, storePerm );
  8249. v7 = vec_perm( v7, v7, storePerm );
  8250. v8 = vec_perm( v8, v8, storePerm );
  8251. // store results
  8252. vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
  8253. vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
  8254. vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
  8255. vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
  8256. vecDest = vec_sel( v8, vecDestEnd, mask );
  8257. vec_st( vecDest, 63, &dest[i*2] );
  8258. }
  8259. // cleanup
  8260. for ( ; i < numSamples; i++ ) {
  8261. dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
  8262. }
  8263. } else {
  8264. // calculate perm vector and do first load
  8265. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8266. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
  8267. v7 = vec_ld( 0, &ogg[1][0] );
  8268. v9 = vec_ld( 0, &ogg[0][0] );
  8269. int i;
  8270. for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
8271. // load ogg[0][i] to ogg[0][i+3]
  8272. v8 = v9;
  8273. v9 = vec_ld( 15, &ogg[0][i] );
  8274. vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
  8275. v0 = vec_perm( v8, v9, vecPerm1 );
  8276. // load ogg[1][i] to ogg[1][i+3]
  8277. v6 = v7;
  8278. v7 = vec_ld( 15, &ogg[1][i] );
  8279. v1 = vec_perm( v6, v7, vecPerm2 );
  8280. // multiply
  8281. v0 = vec_madd( v0, constVec, zeroVector );
  8282. v1 = vec_madd( v1, constVec, zeroVector );
  8283. // generate result vectors to store
  8284. v2 = vec_perm( v0, v1, vecFirst );
  8285. v3 = vec_perm( v0, v1, vecSecond );
  8286. v4 = vec_perm( v0, v1, vecThird );
  8287. v5 = vec_perm( v0, v1, vecFourth );
  8288. // rotate input data
  8289. v2 = vec_perm( v2, v2, storePerm );
  8290. v3 = vec_perm( v3, v3, storePerm );
  8291. v4 = vec_perm( v4, v4, storePerm );
  8292. v5 = vec_perm( v5, v5, storePerm );
  8293. // store results
  8294. vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
  8295. vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
  8296. vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
  8297. vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
  8298. vecDest = vec_sel( v5, vecDestEnd, mask );
  8299. vec_st( vecDest, 63, &dest[i*4] );
  8300. }
  8301. // cleanup
  8302. for ( ; i < numSamples >> 1; i++ ) {
  8303. dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
  8304. dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
  8305. }
  8306. }
  8307. } else if ( kHz == 44100 ) {
  8308. if ( numChannels == 1 ) {
  8309. // calculate perm vector and do first load
  8310. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8311. v9 = vec_ld( 0, &ogg[0][0] );
  8312. int i;
  8313. for ( i = 0; i+7 < numSamples; i += 8 ) {
  8314. // load values from ogg
  8315. v8 = v9;
  8316. v7 = vec_ld( 15, &ogg[0][i] );
  8317. v6 = v7;
  8318. v9 = vec_ld( 31, &ogg[0][i] );
  8319. vector float vecDestEnd = vec_ld( 31, &dest[i] );
  8320. v0 = vec_perm( v8, v7, vecPerm1 );
  8321. v1 = vec_perm( v6, v9, vecPerm1 );
  8322. // multiply
  8323. v0 = vec_madd( v0, constVec, zeroVector );
  8324. v1 = vec_madd( v1, constVec, zeroVector );
  8325. // rotate data
  8326. v0 = vec_perm( v0, v0, storePerm );
  8327. v1 = vec_perm( v1, v1, storePerm );
  8328. // store results
  8329. vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
  8330. vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
  8331. vecDest = vec_sel( v1, vecDestEnd, mask );
  8332. vec_st( vecDest, 31, &dest[i] );
  8333. }
  8334. // cleanup
  8335. for ( ; i < numSamples; i++ ) {
  8336. dest[i*1+0] = ogg[0][i] * 32768.0f;
  8337. }
  8338. } else {
  8339. // calculate perm vector and do first load
  8340. vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
  8341. vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
  8342. v7 = vec_ld( 0, &ogg[1][0] );
  8343. v9 = vec_ld( 0, &ogg[0][0] );
  8344. int i;
  8345. for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
  8346. v8 = v9;
  8347. v9 = vec_ld( 15, &ogg[0][i] );
  8348. v0 = vec_perm( v8, v9, vecPerm1 );
  8349. // load ogg[1][i] to ogg[1][i+3]
  8350. v6 = v7;
  8351. v7 = vec_ld( 15, &ogg[1][i] );
  8352. v1 = vec_perm( v6, v7, vecPerm2 );
  8353. // multiply
  8354. v0 = vec_madd( v0, constVec, zeroVector );
  8355. v1 = vec_madd( v1, constVec, zeroVector );
  8356. // generate result vectors
  8357. v2 = vec_mergeh( v0, v1 );
  8358. v3 = vec_mergel( v0, v1 );
  8359. // store results
  8360. UNALIGNED_STORE2( &dest[i*2], v2, v3 );
  8361. }
  8362. // cleanup
  8363. for ( ; i < numSamples >> 1; i++ ) {
  8364. dest[i*2+0] = ogg[0][i] * 32768.0f;
  8365. dest[i*2+1] = ogg[1][i] * 32768.0f;
  8366. }
  8367. }
  8368. } else {
  8369. assert( 0 );
  8370. }
  8371. }
  8372. #endif /* SOUND_DEST_ALIGNED */
  8373. #ifdef SOUND_DEST_ALIGNED
  8374. /*
  8375. ============
  8376. idSIMD_AltiVec::MixSoundTwoSpeakerMono
  8377. Assumptions:
  8378. Assumes that mixBuffer starts at aligned address
  8379. ============
  8380. */
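// Roughly the scalar loop this routine vectorizes (names below are illustrative only):
//
//   float sL = lastV[0], sR = lastV[1];
//   float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
//   float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
//   for ( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
//       mixBuffer[j*2+0] += samples[j] * sL;
//       mixBuffer[j*2+1] += samples[j] * sR;
//       sL += incL; sR += incR;
//   }
//
// The vector version below processes 8 mono samples (16 mix buffer floats) per iteration.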
  8381. void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
  8382. // mixBuffer is aligned
  8383. assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
  8384. int i;
  8385. float inc[2];
  8386. float spkr[4];
  8387. register vector float vecInc;
  8388. register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
  8389. register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
  8390. register vector float vecSamplesLd1, vecSamplesLd2;
  8391. register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
  8392. register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
  8393. register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
  8394. register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
  8395. register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
  8396. //constants
  8397. vector float fourVec = (vector float)(4.0);
  8398. vector float zeroVec = (vector float)(0.0);
  8399. inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  8400. inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  8401. spkr[0] = lastV[0];
  8402. spkr[1] = lastV[1];
  8403. spkr[2] = lastV[0] + inc[0];
  8404. spkr[3] = lastV[1] + inc[1];
  8405. assert( numSamples == MIXBUFFER_SAMPLES );
  8406. inc[0] *= 2;
  8407. inc[1] *= 2;
  8408. //load data into registers
  8409. vector float v0 = loadSplatUnalignedScalar( &inc[0] );
  8410. vector float v1 = loadSplatUnalignedScalar( &inc[1] );
  8411. vecInc = vec_mergeh( v0, v1 );
  8412. vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
  8413. vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
  8414. vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
  8415. vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
  8416. // load spkr array
  8417. v0 = vec_mergeh( v2, v4 );
  8418. v1 = vec_mergeh( v3, v5 );
  8419. vecSpeaker1 = vec_mergeh( v0, v1 );
  8420. vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
  8421. vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
  8422. vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
  8423. vecInc = vec_madd( vecInc, fourVec, zeroVec );
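// Each vecSpeaker vector holds the (L,R) gain pairs for two consecutive output frames,
// so the four vectors together cover 8 frames. inc[] was doubled above, making vecInc a
// 2-frame step; scaling it by 4 here turns it into the 8-frame step applied once per
// loop iteration.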
  8424. vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
  8425. vector float vecSamplesLast = vec_ld( 0, &samples[0] );
  8426. //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
  8427. //need a cleanup loop
  8428. for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
  8429. //load samples and mix buffers
  8430. vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
  8431. vecSamplesLd2 = vec_ld( 15, &samples[i] );
  8432. vecSamplesLast = vec_ld( 31, &samples[i] );
  8433. vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
  8434. vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
  8435. vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
  8436. vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
  8437. vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
  8438. vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
  8439. vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
  8440. vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
  8441. vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
  8442. vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
  8443. vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
  8444. vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
  8445. vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
  8446. vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
  8447. // store results
  8448. ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
  8449. //add for next iteration
  8450. vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
  8451. vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
  8452. vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
  8453. vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
  8454. }
  8455. }
  8456. #else
  8457. /*
  8458. ============
  8459. idSIMD_AltiVec::MixSoundTwoSpeakerMono
  8460. Assumptions:
  8461. No assumptions
  8462. ============
  8463. */
  8464. void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
  8465. int i;
  8466. float inc[2];
  8467. float spkr[4];
  8468. register vector float vecInc;
  8469. register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
  8470. register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
  8471. register vector float vecSamplesLd1, vecSamplesLd2;
  8472. register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
  8473. register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
  8474. register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
  8475. register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
  8476. register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
  8477. //constants
  8478. vector float fourVec = (vector float)(4.0);
  8479. vector float zeroVec = (vector float)(0.0);
  8480. inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  8481. inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  8482. spkr[0] = lastV[0];
  8483. spkr[1] = lastV[1];
  8484. spkr[2] = lastV[0] + inc[0];
  8485. spkr[3] = lastV[1] + inc[1];
  8486. assert( numSamples == MIXBUFFER_SAMPLES );
  8487. inc[0] *= 2;
  8488. inc[1] *= 2;
  8489. //load data into registers
  8490. vector float v0 = loadSplatUnalignedScalar( &inc[0] );
  8491. vector float v1 = loadSplatUnalignedScalar( &inc[1] );
  8492. vecInc = vec_mergeh( v0, v1 );
  8493. vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
  8494. vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
  8495. vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
  8496. vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
  8497. // load spkr array
  8498. v0 = vec_mergeh( v2, v4 );
  8499. v1 = vec_mergeh( v3, v5 );
  8500. vecSpeaker1 = vec_mergeh( v0, v1 );
  8501. vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
  8502. vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
  8503. vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
  8504. vecInc = vec_madd( vecInc, fourVec, zeroVec );
  8505. vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
  8506. vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
  8507. vector float vecSamplesLast = vec_ld( 0, &samples[0] );
  8508. vector float vecDest = vec_ld( 0, &mixBuffer[0] );
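// mixBuffer may be unaligned here: vecDest/vecDestEnd together with mixBufferPerm
// realign it on load, and the results are written back through the UNALIGNED_STORE4
// helper.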
  8509. //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
  8510. //need a cleanup loop
  8511. for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
  8512. //load samples and mix buffers
  8513. vecSamplesLd1 = vecSamplesLast;
  8514. vecSamplesLd2 = vec_ld( 15, &samples[i] );
  8515. vecSamplesLast = vec_ld( 31, &samples[i] );
  8516. vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
  8517. vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
  8518. vecMixBuffer1 = vecDest;
  8519. vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
  8520. vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
  8521. vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
  8522. vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
  8523. vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
  8524. vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
  8525. vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
  8526. vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
  8527. vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
  8528. vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
  8529. vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
  8530. vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
  8531. vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
  8532. vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
  8533. vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
  8534. vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
  8535. // store results
  8536. UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
  8537. //add for next iteration
  8538. vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
  8539. vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
  8540. vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
  8541. vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
  8542. }
  8543. }
  8544. #endif /* SOUND_DEST_ALIGNED */
  8545. #ifdef SOUND_DEST_ALIGNED
  8546. /*
  8547. ============
  8548. idSIMD_AltiVec::MixSoundTwoSpeakerStereo
  8549. Assumptions:
  8550. Assumes that mixBuffer starts at aligned address
  8551. ============
  8552. */
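// Roughly the scalar loop this routine vectorizes (samples are interleaved stereo;
// names below are illustrative only):
//
//   float sL = lastV[0], sR = lastV[1];
//   float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
//   float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
//   for ( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
//       mixBuffer[j*2+0] += samples[j*2+0] * sL;
//       mixBuffer[j*2+1] += samples[j*2+1] * sR;
//       sL += incL; sR += incR;
//   }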
  8553. void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
  8554. // mixBuffer is aligned
  8555. assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
  8556. int i, k;
  8557. float inc[2];
  8558. float spkr[4];
8559. // mix buffer vectors
  8560. register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
8561. // sample vectors
  8562. register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
  8563. register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
  8564. register vector float vecInc;
  8565. vector float fourVec = (vector float)(4.0);
  8566. vector float zeroVec = (vector float)(0.0);
  8567. assert( numSamples == MIXBUFFER_SAMPLES );
  8568. inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  8569. inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  8570. spkr[0] = lastV[0];
  8571. spkr[1] = lastV[1];
  8572. spkr[2] = lastV[0] + inc[0];
  8573. spkr[3] = lastV[1] + inc[1];
  8574. for ( k = 0; k < 2; k++ ) {
  8575. inc[k] *= 2;
  8576. }
  8577. // load data in vectors
  8578. vector float v0 = loadSplatUnalignedScalar( &inc[0] );
  8579. vector float v1 = loadSplatUnalignedScalar( &inc[1] );
  8580. vecInc = vec_mergeh( v0, v1 );
  8581. vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
  8582. vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
  8583. vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
  8584. vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
  8585. // load spkr array
  8586. v0 = vec_mergeh( v2, v4 );
  8587. v1 = vec_mergeh( v3, v5 );
  8588. vecSpeaker1 = vec_mergeh( v0, v1 );
  8589. vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
  8590. vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
  8591. vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
  8592. vecInc = vec_madd( vecInc, fourVec, zeroVec );
  8593. vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
  8594. vector float vecSamplesLast = vec_ld( 0, &samples[0] );
  8595. //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
  8596. //need a cleanup loop
  8597. for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
  8598. // load mix buffers and samples
  8599. vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
  8600. vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
  8601. vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
  8602. vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
  8603. vecSamples1 = vecSamplesLast;
  8604. vecSamples2 = vec_ld( 15, &samples[i*2] );
  8605. vecSamples3 = vec_ld( 31, &samples[i*2] );
  8606. vecSamples4 = vec_ld( 47, &samples[i*2] );
  8607. vecSamplesLast = vec_ld( 63, &samples[i*2] );
  8608. vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
  8609. vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
  8610. vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
  8611. vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
  8612. vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
  8613. vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
  8614. vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
  8615. vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
  8616. vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
  8617. vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
  8618. vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
  8619. vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
  8620. //store results
  8621. ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
  8622. }
  8623. }
  8624. #else
  8625. /*
  8626. ============
  8627. idSIMD_AltiVec::MixSoundTwoSpeakerStereo
  8628. Assumptions:
  8629. No assumptions
  8630. ============
  8631. */
  8632. void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
  8633. int i, k;
  8634. float inc[2];
  8635. float spkr[4];
8636. // mix buffer vectors
  8637. register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
8638. // sample vectors
  8639. register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
  8640. register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
  8641. register vector float vecInc;
  8642. vector float fourVec = (vector float)(4.0);
  8643. vector float zeroVec = (vector float)(0.0);
  8644. assert( numSamples == MIXBUFFER_SAMPLES );
  8645. inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  8646. inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  8647. spkr[0] = lastV[0];
  8648. spkr[1] = lastV[1];
  8649. spkr[2] = lastV[0] + inc[0];
  8650. spkr[3] = lastV[1] + inc[1];
  8651. for ( k = 0; k < 2; k++ ) {
  8652. inc[k] *= 2;
  8653. }
  8654. // load data in vectors
  8655. vector float v0 = loadSplatUnalignedScalar( &inc[0] );
  8656. vector float v1 = loadSplatUnalignedScalar( &inc[1] );
  8657. vecInc = vec_mergeh( v0, v1 );
  8658. vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
  8659. vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
  8660. vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
  8661. vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
  8662. // load spkr array
  8663. v0 = vec_mergeh( v2, v4 );
  8664. v1 = vec_mergeh( v3, v5 );
  8665. vecSpeaker1 = vec_mergeh( v0, v1 );
  8666. vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
  8667. vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
  8668. vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
  8669. vecInc = vec_madd( vecInc, fourVec, zeroVec );
  8670. vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
  8671. vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
  8672. vector float vecSamplesLast = vec_ld( 0, &samples[0] );
  8673. vector float vecDest = vec_ld( 0, &mixBuffer[0] );
  8674. //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
  8675. //need a cleanup loop
  8676. for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
  8677. // load mix buffers and samples
  8678. vecMixBuffer1 = vecDest;
  8679. vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
  8680. vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
  8681. vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
  8682. vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
  8683. vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
  8684. vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
  8685. vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
  8686. vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
  8687. vecSamples1 = vecSamplesLast;
  8688. vecSamples2 = vec_ld( 15, &samples[i*2] );
  8689. vecSamples3 = vec_ld( 31, &samples[i*2] );
  8690. vecSamples4 = vec_ld( 47, &samples[i*2] );
  8691. vecSamplesLast = vec_ld( 63, &samples[i*2] );
  8692. vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
  8693. vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
  8694. vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
  8695. vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
  8696. vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
  8697. vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
  8698. vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
  8699. vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
  8700. vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
  8701. vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
  8702. vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
  8703. vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
  8704. // store results
  8705. UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
  8706. }
  8707. }
  8708. #endif /* SOUND_DEST_ALIGNED */
  8709. #ifdef SOUND_DEST_ALIGNED
  8710. /*
  8711. ============
  8712. idSIMD_AltiVec::MixSoundSixSpeakerMono
  8713. Assumptions:
  8714. Assumes that mixBuffer starts at aligned address
  8715. ============
  8716. */
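// Roughly the scalar loop this routine vectorizes: each mono sample is mixed into all
// six output channels with per-channel gains that ramp from lastV[] to currentV[]
// (names below are illustrative only):
//
//   float s[6], inc[6];
//   for ( int c = 0; c < 6; c++ ) {
//       inc[c] = ( currentV[c] - lastV[c] ) / MIXBUFFER_SAMPLES;
//       s[c] = lastV[c];
//   }
//   for ( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
//       for ( int c = 0; c < 6; c++ ) {
//           mixBuffer[j*6+c] += samples[j] * s[c];
//           s[c] += inc[c];
//       }
//   }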
  8717. void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
  8718. // mixBuffer is aligned
  8719. assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
  8720. float incL[24];
  8721. float sL[24];
  8722. int i, k;
  8723. vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
  8724. vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
  8725. vector float vecSamplesLd;
  8726. vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
  8727. vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
  8728. // permute vectors for sample
  8729. vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
  8730. vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
  8731. assert( numSamples == MIXBUFFER_SAMPLES );
  8732. assert( SPEAKER_RIGHT == 1 );
  8733. assert( SPEAKER_BACKRIGHT == 5 );
  8734. // incL array, 6 elements repeated
  8735. incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  8736. incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  8737. incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
  8738. incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
  8739. incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
  8740. incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
  8741. // sL array repeated
  8742. for ( k = 0; k < 6; k++ ) {
  8743. sL[k] = lastV[k];
  8744. }
  8745. for ( k = 6; k < 12; k++ ) {
  8746. sL[k] = lastV[k-6] + incL[k];
  8747. }
  8748. for ( k = 12; k < 18; k++ ) {
  8749. sL[k] = lastV[k-12] + incL[k] + incL[k];
  8750. }
  8751. for ( k = 18; k < 24; k++ ) {
  8752. sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
  8753. }
8754. // scale the per-sample increments by 4, since each loop iteration advances 4 samples (24 interleaved channel values)
  8755. for ( k = 0; k < 24; k++ ) {
  8756. incL[k] *= 4;
  8757. }
  8758. //load the data
  8759. vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
  8760. vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
  8761. vecIncl1 = vec_ld( 0, &incL[0] );
  8762. vecIncl2 = vec_ld( 15, &incL[0] );
  8763. vecIncl3 = vec_ld( 31, &incL[0] );
  8764. vecIncl4 = vec_ld( 47, &incL[0] );
  8765. vecIncl5 = vec_ld( 63, &incL[0] );
  8766. vecIncl6 = vec_ld( 79, &incL[0] );
  8767. vecIncl7 = vec_ld( 95, &incL[0] );
  8768. vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
  8769. vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
  8770. vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
  8771. vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
  8772. vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
  8773. vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
  8774. vecSL1 = vec_ld( 0, &sL[0] );
  8775. vecSL2 = vec_ld( 15, &sL[0] );
  8776. vecSL3 = vec_ld( 31, &sL[0] );
  8777. vecSL4 = vec_ld( 47, &sL[0] );
  8778. vecSL5 = vec_ld( 63, &sL[0] );
  8779. vecSL6 = vec_ld( 79, &sL[0] );
  8780. vecSL7 = vec_ld( 95, &sL[0] );
  8781. vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
  8782. vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
  8783. vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
  8784. vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
  8785. vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
  8786. vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
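// vecSL1..vecSL6 now hold the 24 interleaved channel gains (6 channels x 4 frames) for
// samples i..i+3, and vecIncl1..vecIncl6 advance them by 4 frames each iteration.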
  8787. vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
  8788. vector float vecSamplesLast = vec_ld( 0, &samples[0] );
  8789. //since MIXBUFFER_SAMPLES is a multiple of 4, we don't
  8790. //need a cleanup loop
  8791. for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
  8792. //load mix buffer into vectors, assume aligned
  8793. vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
  8794. vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
  8795. vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
  8796. vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
  8797. vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
  8798. vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
  8799. //load samples into vector
  8800. vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
  8801. vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
  8802. vecSamplesLast = vecSamplesLd2;
8803. // permute so each group of four mix buffer floats (4 frames x 6 channels = 24 floats) lines up with the source sample that scales it
  8804. vecSamples1 = vec_splat( vecSamplesLd, 0 );
  8805. vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
  8806. vecSamples3 = vec_splat( vecSamplesLd, 1 );
  8807. vecSamples4 = vec_splat( vecSamplesLd, 2 );
  8808. vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
  8809. vecSamples6 = vec_splat( vecSamplesLd, 3 );
  8810. //do calculation
  8811. vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
  8812. vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
  8813. vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
  8814. vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
  8815. vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
  8816. vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
  8817. //store out results
  8818. ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
  8819. // add for next iteration
  8820. vecSL1 = vec_add( vecSL1, vecIncl1 );
  8821. vecSL2 = vec_add( vecSL2, vecIncl2 );
  8822. vecSL3 = vec_add( vecSL3, vecIncl3 );
  8823. vecSL4 = vec_add( vecSL4, vecIncl4 );
  8824. vecSL5 = vec_add( vecSL5, vecIncl5 );
  8825. vecSL6 = vec_add( vecSL6, vecIncl6 );
  8826. }
  8827. }
  8828. #else
  8829. /*
  8830. ============
  8831. idSIMD_AltiVec::MixSoundSixSpeakerMono
  8832. Assumptions:
  8833. No assumptions
  8834. ============
  8835. */
  8836. void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
  8837. float incL[24];
  8838. float sL[24];
  8839. int i, k;
  8840. vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
  8841. vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
  8842. vector float vecSamplesLd;
  8843. vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
  8844. vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
  8845. // permute vectors for sample
  8846. register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
  8847. register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
  8848. assert( numSamples == MIXBUFFER_SAMPLES );
  8849. assert( SPEAKER_RIGHT == 1 );
  8850. assert( SPEAKER_BACKRIGHT == 5 );
  8851. // incL array, 6 elements repeated
  8852. incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  8853. incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  8854. incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
  8855. incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
  8856. incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
  8857. incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
  8858. // sL array repeated
  8859. for ( k = 0; k < 6; k++ ) {
  8860. sL[k] = lastV[k];
  8861. }
  8862. for ( k = 6; k < 12; k++ ) {
  8863. sL[k] = lastV[k-6] + incL[k];
  8864. }
  8865. for ( k = 12; k < 18; k++ ) {
  8866. sL[k] = lastV[k-12] + incL[k] + incL[k];
  8867. }
  8868. for ( k = 18; k < 24; k++ ) {
  8869. sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
  8870. }
8871. // scale the per-sample increments by 4, since each loop iteration advances 4 samples (24 interleaved channel values)
  8872. for ( k = 0; k < 24; k++ ) {
  8873. incL[k] *= 4;
  8874. }
  8875. // load the data
  8876. vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
  8877. vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );
	vecIncl5 = vec_ld( 63, &incL[0] );
	vecIncl6 = vec_ld( 79, &incL[0] );
	vecIncl7 = vec_ld( 95, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );
	vecSL5 = vec_ld( 63, &sL[0] );
	vecSL6 = vec_ld( 79, &sL[0] );
	vecSL7 = vec_ld( 95, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );

	//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
	//need a cleanup loop
	for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
		//load mix buffer into vectors
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
		vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
		vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
		vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
		vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
		vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );

		// carry the trailing mixBuffer block forward; the next iteration's first
		// unaligned load starts from this block
		vecDest = vecDestEnd;

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want
		vecSamples1 = vec_splat( vecSamplesLd, 0 );
		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
		vecSamples3 = vec_splat( vecSamplesLd, 1 );
		vecSamples4 = vec_splat( vecSamplesLd, 2 );
		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
		vecSamples6 = vec_splat( vecSamplesLd, 3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );

		// store results
		UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
		vecSL4 = vec_add( vecSL4, vecIncl4 );
		vecSL5 = vec_add( vecSL5, vecIncl5 );
		vecSL6 = vec_add( vecSL6, vecIncl6 );
	}
}
#endif /* SOUND_DEST_ALIGNED */

#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::MixSoundSixSpeakerStereo

	Assumptions:
		Assumes that mixBuffer starts at aligned address
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {

	// mixBuffer is aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );

	float incL[12];
	float sL[12];
	int i;

	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
	vector float vecSL1, vecSL2, vecSL3, vecSL4;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;

	// permute vectors for sample
	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
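	// Each iteration mixes 2 stereo frames into 12 interleaved outputs (2 frames x 6 channels).
	// With SPEAKER_RIGHT == 1 and SPEAKER_BACKRIGHT == 5, channels 1 and 5 take the right sample
	// and the rest take the left one, so the pattern built from the loaded [L0 R0 L1 R1] is
	// L0 R0 L0 L0 | L0 R0 L1 R1 | L1 L1 L1 R1. samplePerm1 and samplePerm3 produce the first and
	// third vectors; the middle vector is the loaded data unchanged.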
	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// incL array, 6 elements repeated
	incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	// sL array repeated, second copy advanced by one increment
	sL[0] = lastV[0];
	sL[1] = lastV[1];
	sL[2] = lastV[2];
	sL[3] = lastV[3];
	sL[4] = lastV[4];
	sL[5] = lastV[5];
	sL[6] = lastV[0] + incL[0];
	sL[7] = lastV[1] + incL[1];
	sL[8] = lastV[2] + incL[2];
	sL[9] = lastV[3] + incL[3];
	sL[10] = lastV[4] + incL[4];
	sL[11] = lastV[5] + incL[5];

	// multiply by 2 since doing 2 samples (12 interleaved outputs) at a time
	incL[0] *= 2;
	incL[1] *= 2;
	incL[2] *= 2;
	incL[3] *= 2;
	incL[4] *= 2;
	incL[5] *= 2;
	incL[6] *= 2;
	incL[7] *= 2;
	incL[8] *= 2;
	incL[9] *= 2;
	incL[10] *= 2;
	incL[11] *= 2;

	// load the data
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
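	// vecSamplesLast carries the last-loaded aligned block of samples across iterations, so each
	// pass through the loop issues only one new sample load plus one permute to realign the data.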
	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		//load mix buffer into vectors, assume aligned
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want. For the 2nd vector,
		//the order happens to be the same as the order we loaded them
		//in, so there's no need to permute that one
		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
		vecSamples2 = vecSamplesLd;
		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );

		//store out results
		ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
	}
}
#else

/*
============
idSIMD_AltiVec::MixSoundSixSpeakerStereo

	Assumptions:
		No assumptions
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {

	float incL[12];
	float sL[12];
	int i;

	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
	vector float vecSL1, vecSL2, vecSL3, vecSL4;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;

	// permute vectors for sample
	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// incL array, 6 elements repeated
	incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	// sL array repeated, second copy advanced by one increment
	sL[0] = lastV[0];
	sL[1] = lastV[1];
	sL[2] = lastV[2];
	sL[3] = lastV[3];
	sL[4] = lastV[4];
	sL[5] = lastV[5];
	sL[6] = lastV[0] + incL[0];
	sL[7] = lastV[1] + incL[1];
	sL[8] = lastV[2] + incL[2];
	sL[9] = lastV[3] + incL[3];
	sL[10] = lastV[4] + incL[4];
	sL[11] = lastV[5] + incL[5];

	// multiply by 2 since doing 2 samples (12 interleaved outputs) at a time
	incL[0] *= 2;
	incL[1] *= 2;
	incL[2] *= 2;
	incL[3] *= 2;
	incL[4] *= 2;
	incL[5] *= 2;
	incL[6] *= 2;
	incL[7] *= 2;
	incL[8] *= 2;
	incL[9] *= 2;
	incL[10] *= 2;
	incL[11] *= 2;

	// load the data
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		//load mix buffer into vectors
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
		vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );

		// carry the trailing mixBuffer block forward; the next iteration's first
		// unaligned load starts from this block
		vecDest = vecDestEnd;

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want. For the 2nd vector,
		//the order happens to be the same as the order we loaded them
		//in, so there's no need to permute that one
		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
		vecSamples2 = vecSamplesLd;
		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );

		// store results
		UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
	}
}
#endif

/*
============
idSIMD_AltiVec::MixedSoundToSamples
============
*/
void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
	//this is basically a clamp for sound mixing
	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector signed int vi0, vi1, vi2, vi3;
	register vector signed short vs0, vs1;
	register vector float minVec, maxVec, constVec;
	int i = 0;

	//handle leading elements one at a time, in case samples is not 16-byte aligned
	for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}

	constVec = (vector float)(65536.0f);

	//splat min/max into a vector
	minVec = (vector float)(-32768.0f);
	maxVec = (vector float)(32767.0f);
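	// The explicit min/max clamp below matters because vec_pack truncates 32-bit ints to 16 bits
	// without saturating; clamping to [-32768, 32767] in float space first keeps the packed
	// shorts from wrapping.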
	vector float vecOld = vec_ld( 0, &mixBuffer[i] );
	vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );

	//vectorize!
	for ( ; i+15 < numSamples; i += 16 ) {
		//load source, 16 floats (64 bytes) per iteration
		v0 = vecOld;
		v1 = vec_ld( 15, &mixBuffer[i] );
		v2 = vec_ld( 31, &mixBuffer[i] );
		v3 = vec_ld( 47, &mixBuffer[i] );
		vecOld = vec_ld( 63, &mixBuffer[i] );

		v0 = vec_perm( v0, v1, permVec );
		v1 = vec_perm( v1, v2, permVec );
		v2 = vec_perm( v2, v3, permVec );
		v3 = vec_perm( v3, vecOld, permVec );

		//apply minimum
		v4 = vec_max( v0, minVec );
		v5 = vec_max( v1, minVec );
		v6 = vec_max( v2, minVec );
		v7 = vec_max( v3, minVec );

		//apply maximum
		v4 = vec_min( v4, maxVec );
		v5 = vec_min( v5, maxVec );
		v6 = vec_min( v6, maxVec );
		v7 = vec_min( v7, maxVec );

		// convert floats to ints
		vi0 = vec_cts( v4, 0 );
		vi1 = vec_cts( v5, 0 );
		vi2 = vec_cts( v6, 0 );
		vi3 = vec_cts( v7, 0 );

		// pack ints into shorts
		vs0 = vec_pack( vi0, vi1 );
		vs1 = vec_pack( vi2, vi3 );
		ALIGNED_STORE2( &samples[i], vs0, vs1 );
	}

	//handle cleanup
	for ( ; i < numSamples ; i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}
}
#endif /* ENABLE_SOUND_ROUTINES */
#endif /* MACOS_X */