- /*
- ===========================================================================
- Doom 3 GPL Source Code
- Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
- Doom 3 Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #include "../precompiled.h"
- #pragma hdrstop
- #include "Simd_Generic.h"
- #include "Simd_AltiVec.h"
- #include <math.h>
- #include <float.h>
- #ifdef PPC_INTRINSICS
- #include <ppc_intrinsics.h>
- #endif
- // Doom3 SIMD Library version 0.5
- // Patrick Flanagan (pflanagan@apple.com)
- // Sanjay Patel (spatel@apple.com)
- // Architecture & Performance Group, Apple Computer
- //===============================================================
- //
- // AltiVec implementation of idSIMDProcessor
- //
- //===============================================================
- #if defined(MACOS_X) && defined(__ppc__)
- // Data struct sizes
- #ifndef DRAWVERT_PADDED
- // 60 bytes, 15 floats at 4 bytes each
- #define DRAWVERT_OFFSET 15
- #else
- // 64 bytes, 16 floats
- #define DRAWVERT_OFFSET 16
- #endif
- // 16 bytes each, 4 floats
- #define PLANE_OFFSET 4
- // 16 bytes each, 4 floats
- #define IDVEC4_OFFSET 4
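- // Each *_OFFSET above is the element size in floats; the routines below use
- // these values as strides when stepping a float pointer across arrays of the type.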
- // Alignment tests
- #define IS_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) == 0 )
- #define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F) != 0 )
- // Aligned storing floats
- #define ALIGNED_STORE2( ADDR, V0, V1 ) \
- vec_st( V0, 0, ADDR ); \
- vec_st( V1, 16, ADDR )
-
- #define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
- vec_st( V0, 0, ADDR ); \
- vec_st( V1, 16, ADDR ); \
- vec_st( V2, 32, ADDR )
-
- #define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
- vec_st( V0, 0, ADDR ); \
- vec_st( V1, 16, ADDR ); \
- vec_st( V2, 32, ADDR ); \
- vec_st( V3, 48, ADDR )
- #define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
- vec_st( V0, 0, ADDR ); \
- vec_st( V1, 16, ADDR ); \
- vec_st( V2, 32, ADDR ); \
- vec_st( V3, 48, ADDR ); \
- vec_st( V4, 64, ADDR ); \
- vec_st( V5, 80, ADDR )
-
- #define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
- vec_st( V0, 0, ADDR ); \
- vec_st( V1, 16, ADDR ); \
- vec_st( V2, 32, ADDR ); \
- vec_st( V3, 48, ADDR ); \
- vec_st( V4, 64, ADDR ); \
- vec_st( V5, 80, ADDR ); \
- vec_st( V6, 96, ADDR ); \
- vec_st( V7, 112, ADDR )
-
- // Unaligned storing floats. These assume that we can trash the input
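- // The multi-vector macros below rotate each input right by the address
- // misalignment (permute from vec_lvsr), then use vec_sel to merge the edge
- // vectors with the bytes already in memory before writing whole 16-byte
- // blocks with vec_st. vec_ld/vec_st ignore the low 4 bits of the effective
- // address, so the odd-looking offsets ( 15, 31, 47, ... ) simply select
- // successive 16-byte blocks.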
- #define UNALIGNED_STORE1( ADDR, V0 ) { \
- /* use store element */ \
- vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
- V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
- vec_ste( V0, 0, ADDR ); \
- vec_ste( V0, 4, ADDR ); \
- vec_ste( V0, 8, ADDR ); \
- vec_ste( V0, 12, ADDR ); \
- }
- #define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
- /* load up the values that are there now */ \
- vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
- vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
- /* generate permute vector and mask */ \
- vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
- vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
- /* right rotate input data */ \
- V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
- V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
- /* setup the output vectors */ \
- vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
- ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
- ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
- ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
- /* store results */ \
- vec_st( ULStoreVal1, 0, ADDR ); \
- vec_st( ULStoreVal2, 15, ADDR ); \
- vec_st( ULStoreVal3, 31, ADDR ); }
- #define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
- /* load up the values that are there now */ \
- vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
- vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
- /* generate permute vector and mask */ \
- vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
- vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
- /* right rotate input data */ \
- V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
- V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
- V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
- /* setup the output vectors */ \
- vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
- ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
- ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
- ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
- ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
- /* store results */ \
- vec_st( ULStoreVal1, 0, ADDR ); \
- vec_st( ULStoreVal2, 15, ADDR ); \
- vec_st( ULStoreVal3, 31, ADDR ); \
- vec_st( ULStoreVal4, 47, ADDR ); }
- #define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
- /* load up the values that are there now */ \
- vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
- vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
- /* generate permute vector and mask */ \
- vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
- vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
- /* right rotate input data */ \
- V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
- V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
- V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
- V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
- /* setup the output vectors */ \
- vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
- ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
- ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
- ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
- ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
- ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
- /* store results */ \
- vec_st( ULStoreVal1, 0, ADDR ); \
- vec_st( ULStoreVal2, 15, ADDR ); \
- vec_st( ULStoreVal3, 31, ADDR ); \
- vec_st( ULStoreVal4, 47, ADDR ); \
- vec_st( ULStoreVal5, 63, ADDR ); }
- #define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
- /* load up the values that are there now */ \
- vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
- vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
- /* generate permute vector and mask */ \
- vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
- vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
- /* right rotate input data */ \
- V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
- V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
- V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
- V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
- V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
- V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
- /* setup the output vectors */ \
- vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
- ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
- ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
- ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
- ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
- ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
- ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
- ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
- /* store results */ \
- vec_st( ULStoreVal1, 0, ADDR ); \
- vec_st( ULStoreVal2, 15, ADDR ); \
- vec_st( ULStoreVal3, 31, ADDR ); \
- vec_st( ULStoreVal4, 47, ADDR ); \
- vec_st( ULStoreVal5, 63, ADDR ); \
- vec_st( ULStoreVal6, 79, ADDR ); \
- vec_st( ULStoreVal7, 95, ADDR ); }
- #define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
- /* load up the values that are there now */ \
- vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
- vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
- /* generate permute vector and mask */ \
- vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
- vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
- /* right rotate input data */ \
- V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
- V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
- V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
- V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
- V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
- V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
- V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
- V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
- V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
- /* setup the output vectors */ \
- vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
- vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
- ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
- ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
- ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
- ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
- ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
- ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
- ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
- ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
- ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
- ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
- /* store results */ \
- vec_st( ULStoreVal1, 0, ADDR ); \
- vec_st( ULStoreVal2, 15, ADDR ); \
- vec_st( ULStoreVal3, 31, ADDR ); \
- vec_st( ULStoreVal4, 47, ADDR ); \
- vec_st( ULStoreVal5, 63, ADDR ); \
- vec_st( ULStoreVal6, 79, ADDR ); \
- vec_st( ULStoreVal7, 95, ADDR ); \
- vec_st( ULStoreVal8, 111, ADDR ); \
- vec_st( ULStoreVal9, 127, ADDR ); \
- vec_st( ULStoreVal10, 143, ADDR ); }
- /*
- ============
- idSIMD_AltiVec::GetName
- ============
- */
- const char *idSIMD_AltiVec::GetName( void ) const {
- return "AltiVec";
- }
- /*
- Helper Functions
- */
- #if 0
- // Prints the values of a vector, useful for debugging but
- // should never be called in real code
- inline void debugPrintVector( vector float v, char *msg ) {
- printf("%s -- %vf\n", msg, v );
- }
- inline void debugPrintVector( vector unsigned int v, char *msg ) {
- printf("%s -- %vd\n", msg, v );
- }
- inline void debugPrintVector( vector bool int v, char *msg ) {
- printf("%s -- %vi\n", msg, v );
- }
- inline void debugPrintVector( vector unsigned char v, char *msg ) {
- printf("%s -- %vuc\n", msg, v );
- }
- inline void debugPrintVector( vector unsigned short v, char *msg ) {
- printf("%s -- %vs\n", msg, v );
- }
- #endif
- /*
- ===============
- Reciprocal
- For each element in vector:
- n = 1 / n
- ===============
- */
-
- // Use Newton-Raphson to calculate reciprocal of a vector
- inline vector float Reciprocal( vector float v ) {
- //Get the reciprocal estimate
- vector float estimate = vec_re( v );
- //One round of Newton-Raphson refinement
- return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
- }
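- // The refinement above computes e' = e * ( 2 - v * e ): vec_nmsub( e, v, 1.0 )
- // yields ( 1 - v * e ) and the outer vec_madd folds the multiply by e and the
- // add of e into a single fused operation.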
- /*
- ===============
- ReciprocalSquareRoot
- For each element in vector:
- n = 1 / sqrt(n)
- ===============
- */
- // Reciprocal square root estimate of a vector
- inline vector float ReciprocalSquareRoot( vector float v ) {
- //Get the reciprocal square root estimate
- vector float zero = (vector float)(0);
- vector float oneHalf = (vector float)(0.5);
- vector float one = (vector float)(1.0);
- vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
-
- //One round of Newton-Raphson refinement
- vector float estimateSquared = vec_madd( estimate, estimate, zero );
- vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
- return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
- }
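- // The refinement above is the standard Newton-Raphson step for 1/sqrt(v):
- // e' = e + 0.5 * e * ( 1 - v * e * e ). vec_nmsub( v, e*e, 1.0 ) produces
- // ( 1 - v * e * e ), which the final vec_madd multiplies by 0.5 * e and adds
- // to e. Clamping v to FLT_MIN first keeps vec_rsqrte from returning infinity
- // for zero inputs.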
- /*
- ===============
- Divide
- For each element in vectors:
- n = a / b
- ===============
- */
- // Use reciprocal estimate and multiply to divide a vector
- inline vector float Divide( vector float a, vector float b ) {
- return vec_madd( a, Reciprocal( b ), (vector float)(0) );
- }
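- // Note that this is an estimate-based divide (vec_re plus one Newton-Raphson
- // round), so the vectorized paths below can differ slightly from the scalar
- // '/' operator used in their cleanup loops; the results are close but not
- // necessarily bit-identical.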
- /*
- ===============
- loadSplatUnalignedScalar
- For each element in vector:
- n = s
- ===============
- */
- inline vector float loadSplatUnalignedScalar( const float *s ) {
- vector unsigned char splatMap = vec_lvsl( 0, s );
- vector float v = vec_ld( 0, s );
- splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
- return vec_perm( v, v, splatMap );
- }
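- // Why this works: vec_lvsl( 0, s ) yields the byte indices { k, k+1, ..., k+15 }
- // where k is the misalignment of s. Splatting word 0 of that map replicates
- // { k, k+1, k+2, k+3 } into every word, so the final vec_perm copies the four
- // bytes of *s into all four lanes. Because floats are 4-byte aligned, k is a
- // multiple of 4 and the scalar never straddles the 16-byte block loaded above.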
- /*
- ===============
- VectorATan16
- For each element in vector:
- n = idMath::ATan16( x, y )
- ===============
- */
- // calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
- inline vector float VectorATan16( vector float x, vector float y ) {
-
- vector float xDivY = Divide( x, y );
- vector float yDivX = Divide( y, x );
- vector float zeroVector = (vector float)(0);
-
- vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
- vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
- vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
- vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
-
- // do calculation for S
- vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
- vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
-
- // get the regular S value
- vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
-
- // calculate what to return if y > x
- vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
- vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
- vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
-
- return vec_sel( modRet, vecS, vecCmp );
- }
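- // The branch in the scalar ATan16 becomes per-lane selects here: lanes where
- // |y| > |x| evaluate the odd polynomial on a = x/y and take s directly, while
- // the remaining lanes evaluate it on a = y/x and take +/- HALF_PI - s, with
- // the sign chosen by the a < 0 compare mask.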
- /*
- ===============
- VectorSin16
- For each element in vector:
- n = idMath::Sin16( v )
- ===============
- */
- inline vector float VectorSin16( vector float v ) {
- vector float zero = (vector float)(0);
- #if 0
- // load up half PI and use it to calculate the rest of the values. This is
- // sometimes cheaper than loading them from memory
-
- vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
- vector float PI = vec_add( halfPI, halfPI );
- vector float oneandhalfPI = vec_add( PI, halfPI );
- vector float twoPI = vec_add( oneandhalfPI, halfPI );
- #else
- vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
- vector float PI = (vector float)(3.14159265358979323846f);
- vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
- vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
- #endif
-
- vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
-
- vector float vecMod;
- vector float vecResult;
-
- // fix the range if need be
- vecMod = vec_floor( Divide( v, twoPI ) );
- vecResult = vec_nmsub( vecMod, twoPI, v );
- vector float vecPIminusA = vec_sub( PI, vecResult );
- vector float vecAminus2PI = vec_sub( vecResult, twoPI );
- vecCmp1 = vec_cmplt( vecResult, PI );
- vecCmp2 = vec_cmpgt( vecResult, halfPI );
- // these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
- vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
-
- // we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
- vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
-
- // these are the ones where a < PI and a > HALF_PI so we set a = PI - a
- vecCmp1 = vec_and( vecCmp1, vecCmp2 );
- vecCmp1 = vec_or( vecCmp1, vecCmp4 );
-
- // put the correct values into place
- vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
- vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
- // calculate answer
- vector float vecASquared = vec_madd( vecResult, vecResult, zero );
- vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
- vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
- vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
- vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
- vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
- return vec_madd( vecResult, vecEst, zero );
- }
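- // Range reduction above mirrors idMath::Sin16: a is wrapped into [0, 2*PI)
- // with the floor-based modulus, then lanes with HALF_PI < a <= PI + HALF_PI
- // are folded to PI - a and lanes with a > PI + HALF_PI are folded to a - 2*PI,
- // after which the same minimax polynomial as the scalar version is evaluated.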
- /*
- ===============
- vecSplatWithRunTime
- For each element in vector:
- n = v(i)
- ===============
- */
- // splats an element across a vector using a runtime variable
- inline vector float vecSplatWithRunTime( vector float v, int i ) {
- vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
- v = vec_perm( v, v, rotate );
- return vec_splat( v, 0 );
- }
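- // vec_lvsl( i * sizeof( float ), (int*) 0L ) produces the permute map starting
- // at byte 4*i, so the vec_perm rotates the vector left by i words (i in 0..3),
- // moving element i into slot 0 where vec_splat can broadcast it.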
- /*
- ===============
- FastScalarInvSqrt
-
- n = 1 / sqrt( f )
- ===============
- */
- inline float FastScalarInvSqrt( float f ) {
- #ifdef PPC_INTRINSICS
- float estimate;
- const float kSmallestFloat = FLT_MIN;
- //Calculate a 5 bit starting estimate for the reciprocal sqrt
- estimate = __frsqrte ( f + kSmallestFloat );
- // If you require less precision, you may drop one of the refinement steps below.
- // As written, this does 2 rounds of Newton-Raphson.
- estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
- estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
- return estimate;
- #else
- return idMath::InvSqrt( f );
- #endif
- }
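- // __frsqrte maps to the PowerPC frsqrte instruction, which only guarantees a
- // rough (about 5 bit) estimate; each Newton-Raphson round above roughly doubles
- // the number of correct bits, so two rounds get close to full single precision.
- // Adding FLT_MIN keeps a zero input from producing an infinity.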
- /*
- ===============
- FastScalarInvSqrt_x3
-
- arg1 = 1 / sqrt( arg1 )
- arg2 = 1 / sqrt( arg2 )
- arg3 = 1 / sqrt( arg3 )
- ===============
- */
- inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
- #ifdef PPC_INTRINSICS
- register float estimate1, estimate2, estimate3;
- const float kSmallestFloat = FLT_MIN;
- //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
- estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
- estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
- estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
- // two rounds of Newton-Raphson
- estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
- estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
- estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
- estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
- estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
- estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
- *arg1 = estimate1;
- *arg2 = estimate2;
- *arg3 = estimate3;
- #else
- *arg1 = idMath::InvSqrt( *arg1 );
- *arg2 = idMath::InvSqrt( *arg2 );
- *arg3 = idMath::InvSqrt( *arg3 );
- #endif
- }
- /*
- ===============
- FastScalarInvSqrt_x6
-
- arg1 = 1 / sqrt( arg1 )
- arg2 = 1 / sqrt( arg2 )
- arg3 = 1 / sqrt( arg3 )
- arg4 = 1 / sqrt( arg4 )
- arg5 = 1 / sqrt( arg5 )
- arg6 = 1 / sqrt( arg6 )
- On a G5 you've got two FPU pipelines to fill (2 FPUs, 6 stages each), which is
- why the six estimates are interleaved below.
- ===============
- */
- inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
- #ifdef PPC_INTRINSICS
- register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
- const float kSmallestFloat = FLT_MIN;
- //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
- estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
- estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
- estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
- estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
- estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
- estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
- // two rounds of Newton-Raphson
- estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
- estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
- estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
- estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
- estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
- estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
- estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
- estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
- estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
- estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
- estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
- estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
- *arg1 = estimate1;
- *arg2 = estimate2;
- *arg3 = estimate3;
- *arg4 = estimate4;
- *arg5 = estimate5;
- *arg6 = estimate6;
- #else
- *arg1 = idMath::InvSqrt( *arg1 );
- *arg2 = idMath::InvSqrt( *arg2 );
- *arg3 = idMath::InvSqrt( *arg3 );
- *arg4 = idMath::InvSqrt( *arg4 );
- *arg5 = idMath::InvSqrt( *arg5 );
- *arg6 = idMath::InvSqrt( *arg6 );
- #endif
- }
- // End Helper Functions
- #ifdef ENABLE_SIMPLE_MATH
- /*
- ============
- idSIMD_AltiVec::Add
- dst[i] = constant + src[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
- vector float v0, v1, v2, v3;
- vector float v0_low, v0_hi, v1_hi;
- vector unsigned char permVec;
- vector float constVec;
- int i;
-
- // handle unaligned cases at beginning
- for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant + src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do first load
- permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
- v1_hi = vec_ld( 0, &src[i] );
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_hi = vec_ld( 31, &src[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v0_hi, v1_hi, permVec );
-
- v2 = vec_add( v0, constVec );
- v3 = vec_add( v1, constVec );
-
- // store results
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = constant + src[i];
- }
- }
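- /*
- The load pattern used above (and in the rest of the routines in this file):
- permVec = vec_add( vec_lvsl( -1, src ), 1 ) builds the shift permute for src's
- alignment, every 16 unaligned source bytes are assembled with vec_perm from two
- aligned vec_ld's, and the previous high vector is reused as the next low vector
- so each aligned block is only loaded once. dst is known to be 16-byte aligned
- once the scalar prologue exits, so ALIGNED_STORE2 can store with plain vec_st.
- The sketch below shows the idiom in isolation; it is illustrative only, the
- helper name is made up for this comment, and nothing else references it.
- */
- #if 0
- inline vector float UnalignedLoadSketch( const float *p ) {
- 	// works whether or not p is 16-byte aligned; like the loops above it may
- 	// read up to 15 bytes past the last float requested
- 	vector unsigned char perm = vec_add( vec_lvsl( -1, (int*) p ), (vector unsigned char)(1) );
- 	vector float lo = vec_ld( 0, p );
- 	vector float hi = vec_ld( 15, p );
- 	return vec_perm( lo, hi, perm );
- }
- #endif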
- /*
- ============
- idSIMD_AltiVec::Add
- dst[i] = src0[i] + src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
-
- register vector float v0, v1, v2, v3, v4, v5;
- //src0
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //src1
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- //permute vectors
- register vector unsigned char permVec1, permVec2;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
-
- int i;
-
- //unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = src0[i] + src1[i];
- }
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src0[i] );
- v3_hi = vec_ld( 0, &src1[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src0[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src1[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src1[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
-
- v4 = vec_add( v0, v1 );
- v5 = vec_add( v2, v3 );
-
- ALIGNED_STORE2( &dst[i], v4, v5 );
-
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] + src1[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Sub
- dst[i] = constant - src[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
- register vector float v0, v1, v2, v3;
- register vector float v0_low, v0_hi, v1_low, v1_hi;
- register vector unsigned char permVec;
- register vector float constVec;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
-
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant - src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute vector and do first load
- permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
- v1_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
-
- v2 = vec_sub( constVec, v0 );
- v3 = vec_sub( constVec, v1 );
-
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = constant - src[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Sub
- dst[i] = src0[i] - src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
- register vector float v0, v1, v2, v3, v4, v5;
- //src0
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //src1
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- register vector unsigned char permVec1, permVec2;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = src0[i] - src1[i];
- }
-
- //calculate permute and do first loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src0[i] );
- v3_hi = vec_ld( 0, &src1[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src0[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src1[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src1[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
-
- v4 = vec_sub( v0, v1 );
- v5 = vec_sub( v2, v3 );
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] - src1[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Mul
- dst[i] = constant * src[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
- register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
- register vector float constVec;
- register vector unsigned char permVec;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- register vector float zeroVector = (vector float)(0.0);
- int i;
-
- // handle unaligned data at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = constant * src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
- v1_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src[i] );
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
-
- v2 = vec_madd( constVec, v0, zeroVector );
- v3 = vec_madd( constVec, v1, zeroVector );
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = constant * src[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Mul
- dst[i] = src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
- register vector float v0, v1, v2, v3, v4, v5;
- //src0
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //src1
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- //permute vectors
- register vector unsigned char permVec1, permVec2;
- register vector float constVec = (vector float)(0.0);
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
-
- //handle unaligned at start
- for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = src0[i] * src1[i];
- }
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src0[i] );
- v3_hi = vec_ld( 0, &src1[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src0[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src1[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src1[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
-
- //there is no plain vector multiply instruction, so we
- //multiply and add zero
- v4 = vec_madd( v0, v1, constVec );
- v5 = vec_madd( v2, v3, constVec );
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] * src1[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Div
- dst[i] = constant / divisor[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
- register vector float v0, v1, v2, v3;
- register vector float v0_low, v0_hi, v1_low, v1_hi;
- register vector unsigned char permVec;
- register vector float constVec;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
-
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = constant / divisor[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do first loads
- permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
- v1_hi = vec_ld( 0, &divisor[i] );
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &divisor[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &divisor[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
-
- v2 = Divide( constVec, v0 );
- v3 = Divide( constVec, v1 );
-
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = constant / divisor[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Div
- dst[i] = src0[i] / src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
- register vector float v0, v1, v2, v3, v4, v5;
- //src0
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //src1
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- //permute vectors
- register vector unsigned char permVec1, permVec2;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
- //handle unaligned at start
- for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = src0[i] / src1[i];
- }
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src0[i] );
- v3_hi = vec_ld( 0, &src1[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src0[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src1[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src1[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
-
- v4 = Divide( v0, v1 );
- v5 = Divide( v2, v3 );
-
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] / src1[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::MulAdd
- dst[i] += constant * src[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
-
- register vector float v0, v1, v2, v3, v4, v5;
- register vector float constVec;
- //src
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //permute vectors
- register vector unsigned char permVec1;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] += constant * src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- // at this point, dst is known to be aligned
- v1 = vec_ld( 0, &dst[i] );
- v3 = vec_ld( 16, &dst[i] );
-
- v4 = vec_madd( constVec, v0, v1 );
- v5 = vec_madd( constVec, v2, v3 );
-
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] += constant * src[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::MulAdd
- dst[i] += src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- //src0
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //src1
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- //permute vectors
- register vector unsigned char permVec1, permVec2;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
-
- //unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] += src0[i] * src1[i];
- }
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src0[i] );
- v3_hi = vec_ld( 0, &src1[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- // load sources
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src0[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src1[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src1[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
-
- //we know dst is aligned because we handled unaligned cases
- //up front
- v4 = vec_ld( 0, &dst[i] );
- v5 = vec_ld( 16, &dst[i] );
-
- v6 = vec_madd( v0, v1, v4 );
- v7 = vec_madd( v2, v3, v5 );
-
- ALIGNED_STORE2( &dst[i], v6, v7 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] += src0[i] * src1[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::MulSub
- dst[i] -= constant * src[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
- register vector float v0, v1, v2, v3, v4, v5;
- register vector float constVec;
- //src
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //permute vectors
- register vector unsigned char permVec1;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] -= constant * src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
-
- //we know dst will be aligned here because we already handled the preceding
- //unaligned cases
- v1 = vec_ld( 0, &dst[i] );
- v3 = vec_ld( 16, &dst[i] );
-
- v4 = vec_nmsub( v0, constVec, v1 );
- v5 = vec_nmsub( v2, constVec, v3 );
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] -= constant * src[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::MulSub
- dst[i] -= src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- //src0
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- //src1
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- //permute vectors
- register vector unsigned char permVec1, permVec2;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i;
-
- //unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] -= src0[i] * src1[i];
- }
-
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src0[i] );
- v3_hi = vec_ld( 0, &src1[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- // load sources
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src0[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src1[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src1[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
-
- //we know dst is aligned because we handled unaligned cases
- //up front
- v4 = vec_ld( 0, &dst[i] );
- v5 = vec_ld( 16, &dst[i] );
-
- v6 = vec_nmsub( v0, v1, v4 );
- v7 = vec_nmsub( v2, v3, v5 );
- ALIGNED_STORE2( &dst[i], v6, v7 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] -= src0[i] * src1[i];
- }
- }
- #endif /* ENABLE_SIMPLE_MATH */
- #ifdef ENABLE_DOT
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant * src[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
-
- register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
- register vector float vecX, vecY, vecZ;
- vector float vecX2, vecY2, vecZ2;
- const float *addr = src[0].ToFloatPtr();
- float tempVal[4];
- float constVal[4];
- register vector float zeroVector = (vector float)(0.0);
- register vector float vecConstX, vecConstY, vecConstZ;
-
- // permute vectors
- register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
- register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
-
- register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
- register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
-
- register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
- register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
-
- int i;
-
- // for scalar cleanup, if necessary
- constVal[0] = constant[0];
- constVal[1] = constant[1];
- constVal[2] = constant[2];
- constVal[3] = 0;
-
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
- vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
- vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
-
-
- // populate const vectors
- vecConstX = vec_splat( vecLd1, 0 );
- vecConstY = vec_splat( vecLd1, 1 );
- vecConstZ = vec_splat( vecLd1, 2 );
- vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
- vector float vecOld = vec_ld( 0, addr );
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant * src[i];
- }
- for ( ; i + 7 < count; i += 8 ) {
- float *vecPtr = (float*)( addr + (i*3) );
- vector float v0, v1, v2, v3, v4, v5;
-
- v0 = vecOld; //vec_ld( 0, vecPtr );
- v1 = vec_ld( 15, vecPtr );
- v2 = vec_ld( 31, vecPtr );
- v3 = vec_ld( 47, vecPtr );
- v4 = vec_ld( 63, vecPtr );
- v5 = vec_ld( 79, vecPtr );
- vecOld = vec_ld( 95, vecPtr );
- vecLd1 = vec_perm( v0, v1, permVec );
- vecLd2 = vec_perm( v1, v2, permVec );
- vecLd3 = vec_perm( v2, v3, permVec );
-
- vecLd4 = vec_perm( v3, v4, permVec );
- vecLd5 = vec_perm( v4, v5, permVec );
- vecLd6 = vec_perm( v5, vecOld, permVec );
-
- // permute into X Y Z vectors
- vecX = vec_perm( vecLd1, vecLd2, permX1 );
- vecY = vec_perm( vecLd1, vecLd2, permY1 );
- vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
- vecX = vec_perm( vecX, vecLd3, permX2 );
- vecY = vec_perm( vecY, vecLd3, permY2 );
- vecZ = vec_perm( vecZ, vecLd3, permZ2 );
-
- vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
- vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
- vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
- vecX2 = vec_perm( vecX2, vecLd6, permX2 );
- vecY2 = vec_perm( vecY2, vecLd6, permY2 );
- vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
-
- // do multiply
- vecX = vec_madd( vecX, vecConstX, zeroVector );
- vecY = vec_madd( vecY, vecConstY, vecX );
- vecZ = vec_madd( vecZ, vecConstZ, vecY );
-
- vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
- vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
- vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
-
- // store out results
- ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
- }
-
- //cleanup
- for ( ; i < count; i++ ) {
- // look up what's at the address we want, cast it as a float pointer, then
- // dereference that pointer
- tempVal[0] = *( addr + (i*3) + 0 );
- tempVal[1] = *( addr + (i*3) + 1 );
- tempVal[2] = *( addr + (i*3) + 2 );
- dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
- }
- }
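- /*
- The permX/permY/permZ pairs above de-interleave the packed idVec3 stream into
- SOA form: the first permute of each pair gathers a component from two adjacent
- loaded vectors (its trailing lanes are junk, as noted), and the second permute
- replaces that junk with the component from the third vector, producing four
- X's, four Y's and four Z's per group. Once in SOA form each group of four dot
- products is just three vec_madd's against the splatted constant.
- */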
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant * src[i].Normal() + src[i][3];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
- assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
-
- int i;
- float constVal[4];
- float srcVal[3];
- float srcI3;
- float tempVal;
-
- vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
- vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
- vector float vecX, vecY, vecZ, vecI3;
- vector float vecX2, vecY2, vecZ2, vecI32;
- vector float vecConstX, vecConstY, vecConstZ;
-
- constVal[0] = constant[0];
- constVal[1] = constant[1];
- constVal[2] = constant[2];
- constVal[3] = 1;
-
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
- vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
- vector float vecConst = vec_perm( v0, v1, constPerm );
-
- vecConstX = vec_splat( vecConst, 0 );
- vecConstY = vec_splat( vecConst, 1 );
- vecConstZ = vec_splat( vecConst, 2 );
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant * src[i].Normal() + src[i][3];
- }
- const float *addr = src[i].ToFloatPtr();
- vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
- vector float vecOld = vec_ld( 0, addr );
-
- for ( ; i + 7 < count; i += 8 ) {
- float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
-
- v0 = vecOld; //vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- v2 = vec_ld( 31, planePtr );
- v3 = vec_ld( 47, planePtr );
- v4 = vec_ld( 63, planePtr );
- v5 = vec_ld( 79, planePtr );
- v6 = vec_ld( 95, planePtr );
- v7 = vec_ld( 111, planePtr );
- vecOld = vec_ld( 127, planePtr );
-
- vecPlaneLd1 = vec_perm( v0, v1, permVec );
- vecPlaneLd2 = vec_perm( v1, v2, permVec );
- vecPlaneLd3 = vec_perm( v2, v3, permVec );
- vecPlaneLd4 = vec_perm( v3, v4, permVec );
-
- vecPlaneLd5 = vec_perm( v4, v5, permVec );
- vecPlaneLd6 = vec_perm( v5, v6, permVec );
- vecPlaneLd7 = vec_perm( v6, v7, permVec );
- vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
-
- // permute into X Y Z vectors; since this is square it's basically
- // a matrix transpose
- v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
- v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
- v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
- v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
-
- vecX = vec_mergeh( v0, v1 );
- vecY = vec_mergel( v0, v1 );
- vecZ = vec_mergeh( v2, v3 );
- vecI3 = vec_mergel( v2, v3 );
-
- v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
- v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
- v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
- v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
-
- vecX2 = vec_mergeh( v4, v5 );
- vecY2 = vec_mergel( v4, v5 );
- vecZ2 = vec_mergeh( v6, v7 );
- vecI32 = vec_mergel( v6, v7 );
- // do calculation
- v6 = vec_madd( vecZ, vecConstZ, vecI3 );
- v5 = vec_madd( vecY, vecConstY, v6 );
- v4 = vec_madd( vecX, vecConstX, v5 );
-
- v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
- v1 = vec_madd( vecY2, vecConstY, v0 );
- v2 = vec_madd( vecX2, vecConstX, v1 );
-
- // store results
- ALIGNED_STORE2( &dst[i], v4, v2 );
- }
- // cleanup
- for ( ; i < count; i++ ) {
- // populate srcVal with src X Y Z
- srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
- srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
- srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
-
- // put src[i][3] into srcI3
- srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
- tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
- dst[i] = tempVal + srcI3;
- }
- }
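- /*
- Because idPlane is exactly four floats, the de-interleave above is a plain 4x4
- transpose done with two levels of vec_mergeh/vec_mergel:
-   vecPlaneLd1..4 = { x0 y0 z0 d0 } ... { x3 y3 z3 d3 }
-   after merging:   vecX = { x0 x1 x2 x3 }   vecY  = { y0 y1 y2 y3 }
-                    vecZ = { z0 z1 z2 z3 }   vecI3 = { d0 d1 d2 d3 }
- Feeding vecI3 (the src[i][3] terms) in as the addend of the first vec_madd
- picks up the "+ src[i][3]" part of the result with no extra instruction.
- */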
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant * src[i].xyz;
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
-
- // idDrawVert size is 60 bytes
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- int i;
- register vector float vecConstX, vecConstY, vecConstZ;
- register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
- register vector float zeroVector = (vector float)(0.0);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
-
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- v0 = vec_ld( 0, constant.ToFloatPtr() );
- v1 = vec_ld( 11, constant.ToFloatPtr() );
- v0 = vec_perm( v0, v1, constPerm );
-
- // permute into constant vectors
- vecConstX = vec_splat( v0, 0 );
- vecConstY = vec_splat( v0, 1 );
- vecConstZ = vec_splat( v0, 2 );
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant * src[i].xyz;
- }
-
- // every fourth one will have the same alignment. Make sure we've got enough here
- if ( i+3 < count ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
-
- for ( ; i+3 < count; i += 4 ) {
- const float *vertPtr = src[i].xyz.ToFloatPtr();
- const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 11, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 11, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 11, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 11, vertPtr4 );
-
- v0 = vec_perm( v0, v1, vertPerm1 );
- v2 = vec_perm( v2, v3, vertPerm2 );
- v4 = vec_perm( v4, v5, vertPerm3 );
- v6 = vec_perm( v6, v7, vertPerm4 );
- // transpose into X Y Z vectors
- v1 = vec_mergeh( v0, v4 );
- v3 = vec_mergeh( v2, v6 );
- v5 = vec_mergel( v0, v4 );
- v7 = vec_mergel( v2, v6 );
-
- vecSrcX1 = vec_mergeh( v1, v3 );
- vecSrcY1 = vec_mergel( v1, v3 );
- vecSrcZ1 = vec_mergeh( v5, v7 );
-
- // now calculate dot product
- vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
- vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
- vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
-
- // store results
- vec_st( vecSrcZ1, 0, &dst[i] );
- }
-
- for ( ; i < count; i++ ) {
- dst[i] = constant * src[i].xyz;
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant * src[i].xyz;
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
-
- // idDrawVert size is 64 bytes
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- int i;
- register vector float vecConstX, vecConstY, vecConstZ;
- register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
- register vector float zeroVector = (vector float)(0.0);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
-
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- v0 = vec_ld( 0, constant.ToFloatPtr() );
- v1 = vec_ld( 11, constant.ToFloatPtr() );
- v0 = vec_perm( v0, v1, constPerm );
-
- // permute into constant vectors
- vecConstX = vec_splat( v0, 0 );
- vecConstY = vec_splat( v0, 1 );
- vecConstZ = vec_splat( v0, 2 );
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant * src[i].xyz;
- }
-
- for ( ; i+3 < count; i += 4 ) {
- const float *vertPtr = src[i].xyz.ToFloatPtr();
- const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
-
- // transpose into X Y Z vectors
- v1 = vec_mergeh( v0, v4 );
- v3 = vec_mergeh( v2, v6 );
- v5 = vec_mergel( v0, v4 );
- v7 = vec_mergel( v2, v6 );
-
- vecSrcX1 = vec_mergeh( v1, v3 );
- vecSrcY1 = vec_mergel( v1, v3 );
- vecSrcZ1 = vec_mergeh( v5, v7 );
-
- // now calculate dot product
- vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
- vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
- vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
-
- // store results
- vec_st( vecSrcZ1, 0, &dst[i] );
- }
-
- for ( ; i < count; i++ ) {
- dst[i] = constant * src[i].xyz;
- }
- }
- #endif /* DRAWVERT_PADDED */
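- /*
- The two idDrawVert variants differ only in how xyz is fetched: with the
- unpadded 60 byte layout a vertex is not 16-byte aligned in general, so each
- xyz needs two vec_ld's plus a cached permute (only four permutes are needed
- because 4 * 60 is a multiple of 16, so the alignment repeats every fourth
- vertex), while the padded 64 byte layout keeps every vertex 16-byte aligned
- (given an aligned array) and a single vec_ld per vertex is enough.
- */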
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant.Normal() * src[i] + constant[3];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
-
- register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
- register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
- register vector float zeroVector = (vector float)(0.0);
- register vector float vecConstX, vecConstY, vecConstZ;
- register vector float vecConst3;
-
- idVec3 constNormal = constant.Normal();
- float const3 = constant[3];
-
- // permute vectors
- register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
- register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
-
- register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
- register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
-
- register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
- register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
-
- int i;
-
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
- vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
- vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
-
- // populate const vec
- vecConstX = vec_splat( vecLd1, 0 );
- vecConstY = vec_splat( vecLd1, 1 );
- vecConstZ = vec_splat( vecLd1, 2 );
- // put constant to add in vector
- vecConst3 = loadSplatUnalignedScalar( &const3 );
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant.Normal() * src[i] + constant[3];
- }
- const float *addr = src[i].ToFloatPtr();
- vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
- vector float vecOld = vec_ld( 0, addr );
-
- for ( ; i+7 < count; i += 8 ) {
- float *vecPtr = (float*)( addr + (i*3) );
- vector float v0, v1, v2, v3, v4, v5;
-
- v0 = vecOld; //vec_ld( 0, vecPtr );
- v1 = vec_ld( 15, vecPtr );
- v2 = vec_ld( 31, vecPtr );
- v3 = vec_ld( 47, vecPtr );
- v4 = vec_ld( 63, vecPtr );
- v5 = vec_ld( 79, vecPtr );
- vecOld = vec_ld( 95, vecPtr );
-
- vecLd1 = vec_perm( v0, v1, permVec );
- vecLd2 = vec_perm( v1, v2, permVec );
- vecLd3 = vec_perm( v2, v3, permVec );
-
- vecLd4 = vec_perm( v3, v4, permVec );
- vecLd5 = vec_perm( v4, v5, permVec );
- vecLd6 = vec_perm( v5, vecOld, permVec );
-
- // permute into X Y Z vectors
- vecX = vec_perm( vecLd1, vecLd2, permX1 );
- vecY = vec_perm( vecLd1, vecLd2, permY1 );
- vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
- vecX = vec_perm( vecX, vecLd3, permX2 );
- vecY = vec_perm( vecY, vecLd3, permY2 );
- vecZ = vec_perm( vecZ, vecLd3, permZ2 );
- vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
- vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
- vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
- vecX2 = vec_perm( vecX2, vecLd6, permX2 );
- vecY2 = vec_perm( vecY2, vecLd6, permY2 );
- vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
-
- // calculate dot product
- vecX = vec_madd( vecX, vecConstX, zeroVector );
- vecY = vec_madd( vecY, vecConstY, vecX );
- vecZ = vec_madd( vecZ, vecConstZ, vecY );
-
- vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
- vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
- vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
-
- // add in constant[3]
- vecZ = vec_add( vecZ, vecConst3 );
- vecZ2 = vec_add( vecZ2, vecConst3 );
-
- // store out results
- ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
- }
-
- //cleanup
- for ( ; i < count; i++ ) {
- dst[i] = constNormal * src[i] + const3;
- }
- }
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
- // check plane size
- assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
-
- float constVal[4];
- float srcVal[4];
-
- int i;
- const float *constPtr = constant.ToFloatPtr();
-
- register vector float vecX, vecY, vecZ, vecI3;
- register vector float vecX2, vecY2, vecZ2, vecI32;
- vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
- vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
- register vector float zeroVector = (vector float)(0.0);
- register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
-
- constVal[0] = *(constPtr);
- constVal[1] = *(constPtr+1);
- constVal[2] = *(constPtr+2);
- constVal[3] = *(constPtr+3);
-
- // populate const vector
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
- vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
- vector float vecConst = vec_perm( v0, v1, constPerm );
-
- vecConstX = vec_splat( vecConst, 0 );
- vecConstY = vec_splat( vecConst, 1 );
- vecConstZ = vec_splat( vecConst, 2 );
- vecConstI3 = vec_splat( vecConst, 3 );
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
- }
-
- const float *srcPtr = src[i].ToFloatPtr();
- vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
- vector float vecOld = vec_ld( 0, srcPtr );
-
- for ( ; i+7 < count; i += 8 ) {
- float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
-
- v0 = vecOld; // vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- v2 = vec_ld( 31, planePtr );
- v3 = vec_ld( 47, planePtr );
- v4 = vec_ld( 63, planePtr );
- v5 = vec_ld( 79, planePtr );
- v6 = vec_ld( 95, planePtr );
- v7 = vec_ld( 111, planePtr );
- vecOld = vec_ld( 127, planePtr );
-
- vecPlaneLd1 = vec_perm( v0, v1, permVec );
- vecPlaneLd2 = vec_perm( v1, v2, permVec );
- vecPlaneLd3 = vec_perm( v2, v3, permVec );
- vecPlaneLd4 = vec_perm( v3, v4, permVec );
-
- vecPlaneLd5 = vec_perm( v4, v5, permVec );
- vecPlaneLd6 = vec_perm( v5, v6, permVec );
- vecPlaneLd7 = vec_perm( v6, v7, permVec );
- vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
-
- // permute into X Y Z vectors; since this is square it's basically
- // a matrix transpose
- v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
- v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
- v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
- v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
-
- vecX = vec_mergeh( v0, v1 );
- vecY = vec_mergel( v0, v1 );
- vecZ = vec_mergeh( v2, v3 );
- vecI3 = vec_mergel( v2, v3 );
-
- v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
- v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
- v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
- v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
-
- vecX2 = vec_mergeh( v4, v5 );
- vecY2 = vec_mergel( v4, v5 );
- vecZ2 = vec_mergeh( v6, v7 );
- vecI32 = vec_mergel( v6, v7 );
-
- // do calculation
- v4 = vec_madd( vecConstX, vecX, zeroVector );
- v5 = vec_madd( vecConstY, vecY, v4 );
- v6 = vec_madd( vecConstZ, vecZ, v5 );
- v7 = vec_madd( vecConstI3, vecI3, v6 );
- v0 = vec_madd( vecConstX, vecX2, zeroVector );
- v1 = vec_madd( vecConstY, vecY2, v0 );
- v2 = vec_madd( vecConstZ, vecZ2, v1 );
- v3 = vec_madd( vecConstI3, vecI32, v2 );
-
- //store result
- ALIGNED_STORE2( &dst[i], v7, v3 );
- }
- // cleanup
- for ( ; i < count; i++ ) {
- //dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
- srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
- srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
- srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
- srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
- dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
- }
- }
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant.Normal() * src[i].xyz + constant[3];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
-
- // idDrawVert size is 60 bytes
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
-
- int i;
- const float *constPtr = constant.ToFloatPtr();
- const float *srcPtr = src[0].xyz.ToFloatPtr();
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
- register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
- register vector float vecDest1;
- register vector float zeroVector = (vector float)(0.0);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
-
- float constVal[4];
- float srcVal[3];
-
- constVal[0] = *(constPtr+0);
- constVal[1] = *(constPtr+1);
- constVal[2] = *(constPtr+2);
- constVal[3] = *(constPtr+3);
- // populate const vec
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- v0 = vec_ld( 0, constant.ToFloatPtr() );
- v1 = vec_ld( 15, constant.ToFloatPtr() );
- v0 = vec_perm( v0, v1, constPerm );
-
- vecConstX = vec_splat( v0, 0 );
- vecConstY = vec_splat( v0, 1 );
- vecConstZ = vec_splat( v0, 2 );
- vecConstI3 = vec_splat( v0, 3 );
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant.Normal() * src[i].xyz + constant[3];
- }
-
- // every fourth one will have the same alignment, so we can store these. Make sure we
- // have enough so we don't run off the end of the array
- if ( i+3 < count ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
-
- for ( ; i+3 < count; i+=4 ) {
- const float *vertPtr = src[i].xyz.ToFloatPtr();
- const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 11, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 11, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 11, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 11, vertPtr4 );
-
- v0 = vec_perm( v0, v1, vertPerm1 );
- v2 = vec_perm( v2, v3, vertPerm2 );
- v4 = vec_perm( v4, v5, vertPerm3 );
- v6 = vec_perm( v6, v7, vertPerm4 );
- // transpose into X Y Z vectors
- v1 = vec_mergeh( v0, v4 );
- v3 = vec_mergeh( v2, v6 );
- v5 = vec_mergel( v0, v4 );
- v7 = vec_mergel( v2, v6 );
-
- vecSrcX1 = vec_mergeh( v1, v3 );
- vecSrcY1 = vec_mergel( v1, v3 );
- vecSrcZ1 = vec_mergeh( v5, v7 );
- // now calculate dot product
- vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
- vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
- vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
- vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
-
- // store results
- vec_st( vecDest1, 0, &dst[i] );
- }
- // cleanup
- for ( ; i < count; i++ ) {
- srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
- srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
- srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
- // dst[i] = constant.Normal() * src[i].xyz + constant[3];
-
- dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
- dst[i] += constVal[3];
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = constant.Normal() * src[i].xyz + constant[3];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
- //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
-
- // idDrawVert size is 64 bytes
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
-
- int i;
- const float *constPtr = constant.ToFloatPtr();
- const float *srcPtr = src[0].xyz.ToFloatPtr();
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
- register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
- register vector float vecDest1;
- register vector float zeroVector = (vector float)(0.0);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
-
- float constVal[4];
- float srcVal[3];
-
- constVal[0] = *(constPtr+0);
- constVal[1] = *(constPtr+1);
- constVal[2] = *(constPtr+2);
- constVal[3] = *(constPtr+3);
- // populate const vec
- vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
- v0 = vec_ld( 0, constant.ToFloatPtr() );
- v1 = vec_ld( 15, constant.ToFloatPtr() );
- v0 = vec_perm( v0, v1, constPerm );
-
- vecConstX = vec_splat( v0, 0 );
- vecConstY = vec_splat( v0, 1 );
- vecConstZ = vec_splat( v0, 2 );
- vecConstI3 = vec_splat( v0, 3 );
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = constant.Normal() * src[i].xyz + constant[3];
- }
-
- for ( ; i+3 < count; i+=4 ) {
- const float *vertPtr = src[i].xyz.ToFloatPtr();
- const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- // transpose into X Y Z vectors
- v1 = vec_mergeh( v0, v4 );
- v3 = vec_mergeh( v2, v6 );
- v5 = vec_mergel( v0, v4 );
- v7 = vec_mergel( v2, v6 );
-
- vecSrcX1 = vec_mergeh( v1, v3 );
- vecSrcY1 = vec_mergel( v1, v3 );
- vecSrcZ1 = vec_mergeh( v5, v7 );
- // now calculate dot product
- vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
- vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
- vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
- vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
-
- // store results
- vec_st( vecDest1, 0, &dst[i] );
- }
- // cleanup
- for ( ; i < count; i++ ) {
- srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
- srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
- srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
- // dst[i] = constant.Normal() * src[i].xyz + constant[3];
-
- dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
- dst[i] += constVal[3];
- }
- }
- #endif /* DRAWVERT_PADDED */
- /*
- ============
- idSIMD_AltiVec::Dot
- dst[i] = src0[i] * src1[i];
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
- //#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
-
- int i;
- float src0Val[3];
- float src1Val[3];
-
- register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
- vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
- register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
- register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
- register vector float zeroVector = (vector float)(0.0);
- // permute vectors
- register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
- register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
- register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
- register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
- register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
- register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
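- // The permute masks gather the X, Y and Z components out of the packed 12-byte idVec3 stream:
- // each group of four vectors (three 16-byte blocks) is combined in two stages, first
- // permX1/permY1/permZ1 pull components from the first two blocks (trailing lanes are junk),
- // then permX2/permY2/permZ2 merge in the remaining component from the third block, leaving
- // e.g. vecX0 = ( x0 x1 x2 x3 ).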
-
- // handle unaligned case at beginning
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = src0[i] * src1[i];
- }
-
- const float *src0Ptr = src0[i].ToFloatPtr();
- const float *src1Ptr = src1[i].ToFloatPtr();
- vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
- vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
- vector float vecOld0 = vec_ld( 0, src0Ptr );
- vector float vecOld1 = vec_ld( 0, src1Ptr );
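- // The trailing quadword loaded in each iteration (vecOld0/vecOld1) is carried over as the
- // first input of the next iteration's vec_perm stitch, so every 16-byte block of the packed
- // idVec3 data is loaded exactly once. Note that the vector loop below restarts at i = 0 while
- // src0Ptr/src1Ptr were taken at the element reached by the alignment prologue; this is only
- // consistent when dst was already 16-byte aligned (i.e. the prologue ran zero times), which
- // appears to be the assumption here.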
-
- for ( i = 0; i+7 < count; i += 8 ) {
- float *s0Ptr = (float*)( src0Ptr + (i*3) );
- float *s1Ptr = (float*)( src1Ptr + (i*3) );
-
- vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
- v0 = vecOld0;
- v1 = vec_ld( 15, s0Ptr );
- v2 = vec_ld( 31, s0Ptr );
- v3 = vec_ld( 47, s0Ptr );
- v4 = vec_ld( 63, s0Ptr );
- v5 = vec_ld( 79, s0Ptr );
- vecOld0 = vec_ld( 95, s0Ptr );
-
- v6 = vecOld1;
- v7 = vec_ld( 15, s1Ptr );
- v8 = vec_ld( 31, s1Ptr );
- v9 = vec_ld( 47, s1Ptr );
- v10 = vec_ld( 63, s1Ptr );
- v11 = vec_ld( 79, s1Ptr );
- vecOld1 = vec_ld( 95, s1Ptr );
-
- vecLd1 = vec_perm( v0, v1, permVec1 );
- vecLd2 = vec_perm( v1, v2, permVec1 );
- vecLd3 = vec_perm( v2, v3, permVec1 );
- vecLd4 = vec_perm( v3, v4, permVec1 );
- vecLd5 = vec_perm( v4, v5, permVec1 );
- vecLd6 = vec_perm( v5, vecOld0, permVec1 );
-
- vecLd7 = vec_perm( v6, v7, permVec2 );
- vecLd8 = vec_perm( v7, v8, permVec2 );
- vecLd9 = vec_perm( v8, v9, permVec2 );
- vecLd10 = vec_perm( v9, v10, permVec2 );
- vecLd11 = vec_perm( v10, v11, permVec2 );
- vecLd12 = vec_perm( v11, vecOld1, permVec2 );
-
- // permute into X Y Z vectors
- vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
- vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
- vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
- vecX0 = vec_perm( vecX0, vecLd3, permX2 );
- vecY0 = vec_perm( vecY0, vecLd3, permY2 );
- vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
-
- vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
- vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
- vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
- vecX02 = vec_perm( vecX02, vecLd6, permX2 );
- vecY02 = vec_perm( vecY02, vecLd6, permY2 );
- vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
-
- vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
- vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
- vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
- vecX1 = vec_perm( vecX1, vecLd9, permX2 );
- vecY1 = vec_perm( vecY1, vecLd9, permY2 );
- vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
-
- vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
- vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
- vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
- vecX12 = vec_perm( vecX12, vecLd12, permX2 );
- vecY12 = vec_perm( vecY12, vecLd12, permY2 );
- vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
-
- // do multiply
- vecX0 = vec_madd( vecX0, vecX1, zeroVector );
- vecY0 = vec_madd( vecY0, vecY1, vecX0 );
- vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
- vecX02 = vec_madd( vecX02, vecX12, zeroVector );
- vecY02 = vec_madd( vecY02, vecY12, vecX02 );
- vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
-
- // store out results
- ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
- }
-
- // cleanup
- for ( ; i < count; i++ ) {
- // dst[i] = src0[i] * src1[i];
- src0Val[0] = *( src0Ptr + (i*3) + 0 );
- src0Val[1] = *( src0Ptr + (i*3) + 1 );
- src0Val[2] = *( src0Ptr + (i*3) + 2 );
-
- src1Val[0] = *( src1Ptr + (i*3) + 0 );
- src1Val[1] = *( src1Ptr + (i*3) + 1 );
- src1Val[2] = *( src1Ptr + (i*3) + 2 );
-
- dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
- }
- }
- /*
- ============
- idSIMD_AltiVec::Dot
- dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
- ============
- */
- void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
- dot = 0.0f;
- register vector float v0, v1, v2, v3;
- register vector float zeroVector;
- register vector float runningTotal1, runningTotal2;
- // src1 operands
- register vector float v0_low, v0_hi, v2_low, v2_hi;
- // src2 operands
- register vector float v1_low, v1_hi, v3_low, v3_hi;
- //permute vectors
- register vector unsigned char permVec1, permVec2;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i = 0;
-
- runningTotal1 = (vector float)(0.0);
- runningTotal2 = (vector float)(0.0);
- zeroVector = (vector float)(0.0);
-
- if ( count >= 8 ) {
- //calculate permute and do loads
- permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
- permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
- v2_hi = vec_ld( 0, &src1[i] );
- v3_hi = vec_ld( 0, &src2[i] );
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load sources
- v0_low = v2_hi;
- v0_hi = vec_ld( 15, &src1[i] );
- v2_low = v0_hi;
- v2_hi = vec_ld( 31, &src1[i] );
-
- v1_low = v3_hi;
- v1_hi = vec_ld( 15, &src2[i] );
- v3_low = v1_hi;
- v3_hi = vec_ld( 31, &src2[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec1 );
- v1 = vec_perm( v1_low, v1_hi, permVec2 );
- v2 = vec_perm( v2_low, v2_hi, permVec1 );
- v3 = vec_perm( v3_low, v3_hi, permVec2 );
- //multiply together and keep running sum
- runningTotal1 = vec_madd( v0, v1, runningTotal1 );
- runningTotal2 = vec_madd( v2, v3, runningTotal2 );
- }
-
- runningTotal1 = vec_add( runningTotal1, runningTotal2 );
-
- // sum across the vector
- v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
- v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
- runningTotal1 = vec_splat( v1, 0 );
- vec_ste( runningTotal1, 0, &dot );
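- // The two running totals are folded together, then vec_sld rotates the sum by 8 and 4 bytes
- // so the adds collapse the four lanes into a single total; vec_splat/vec_ste move that one
- // float back out to the scalar result.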
- }
-
- // handle cleanup. When profiling the game, we found that most calls to this function use small
- // counts, so it spends a lot of time in this scalar code, which is already very fast (e.g. one
- // TB tick) for counts under 50; there is little point in trying to vectorize that path.
- for ( ; i < count ; i++ ) {
- dot += src1[i] * src2[i];
- }
-
- }
- #endif /* ENABLE_DOT */
- #ifdef ENABLE_COMPARES
- /*
- ============
- idSIMD_AltiVec::CmpGT
- dst[i] = src0[i] > constant;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] = src0[(X)] > constant;
- register vector float v0, v1, v2, v3;
- register vector bool int vr1, vr2, vr3, vr4;
- register vector bool short vs1, vs2;
- register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
- register vector unsigned char vc1;
- register vector bool char vbc1;
- register vector float constVec;
- register vector unsigned char oneVector = (vector unsigned char)(1);
- register vector unsigned char permVec;
- int i;
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = src0[i] > constant;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- v3_hi = vec_ld( 0, &src0[i] );
-
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- // load values
- v0_low = v3_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src0[i] );
- v2_low = v1_hi;
- v2_hi = vec_ld( 47, &src0[i] );
- v3_low = v2_hi;
- v3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
- v2 = vec_perm( v2_low, v2_hi, permVec );
- v3 = vec_perm( v3_low, v3_hi, permVec );
-
- //do comparison
- vr1 = vec_cmpgt( v0, constVec );
- vr2 = vec_cmpgt( v1, constVec );
- vr3 = vec_cmpgt( v2, constVec );
- vr4 = vec_cmpgt( v3, constVec );
-
- // pack results into shorts
- vs1 = vec_pack(vr1, vr2);
- vs2 = vec_pack(vr3, vr4);
-
- // pack results into byte
- vbc1 = vec_pack(vs1, vs2);
-
- //AND with 1 to get true=1 not true=255
- vc1 = vec_and( vbc1, oneVector );
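- // The 16 comparison results start as four vectors of 32-bit booleans; two rounds of vec_pack
- // narrow them to 16 bytes of 0x00/0xFF, and the AND with the one-vector turns 0xFF into 1 so
- // the stored bytes match the scalar (src0[i] > constant) result.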
-
- //store results
- vec_st( vc1, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] > constant;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpGT
- dst[i] |= ( src0[i] > constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
- // Temp vector registers
- register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
- register vector bool short vtbs0, vtbs1;
- register vector bool char vtbc0;
- register vector unsigned char vtuc0;
- register vector unsigned char permVec, permVec2;
-
- // dest vectors
- register vector unsigned char vd;
- // bitNum vectors
- register vector unsigned char bitNumVec;
- // src0 vectors
- register vector float vs0, vs1, vs2, vs3;
- register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
- // constant vector
- register vector float constVec;
- // vector with every byte set to 1
- register vector unsigned char oneVector = (vector unsigned char)(1);
- int i = 0;
-
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] |= ( src0[i] > constant ) << bitNum;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //bitNum is unaligned.
- permVec2 = vec_lvsl( 0, &bitNum );
- vtuc0 = vec_ld( 0, &bitNum );
- bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
- bitNumVec = vec_splat( bitNumVec, 0 );
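- // bitNum is a single byte somewhere on the stack: vec_lvsl/vec_perm rotate it into lane 0 of
- // the loaded quadword and vec_splat broadcasts it, so vec_sl below can shift every result
- // byte left by the same bit count.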
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- vs3_hi = vec_ld( 0, &src0[i] );
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- //load sources (floats)
- vs0_low = vs3_hi;
- vs0_hi = vec_ld( 15, &src0[i] );
- vs1_low = vs0_hi;
- vs1_hi = vec_ld( 31, &src0[i] );
- vs2_low = vs1_hi;
- vs2_hi = vec_ld( 47, &src0[i] );
- vs3_low = vs2_hi;
- vs3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- vs0 = vec_perm( vs0_low, vs0_hi, permVec );
- vs1 = vec_perm( vs1_low, vs1_hi, permVec );
- vs2 = vec_perm( vs2_low, vs2_hi, permVec );
- vs3 = vec_perm( vs3_low, vs3_hi, permVec );
-
- //load dest (bytes) as unsigned char
- vd = vec_ld( 0, &dst[i] );
-
- // do comparison and get bool int result
- vtbi0 = vec_cmpgt( vs0, constVec );
- vtbi1 = vec_cmpgt( vs1, constVec );
- vtbi2 = vec_cmpgt( vs2, constVec );
- vtbi3 = vec_cmpgt( vs3, constVec );
-
- // pack results into shorts
- vtbs0 = vec_pack(vtbi0, vtbi1);
- vtbs1 = vec_pack(vtbi2, vtbi3);
-
- // pack results into byte
- vtbc0 = vec_pack(vtbs0, vtbs1);
-
- //and with 1 to get true=1 instead of true=255
- vtuc0 = vec_and(vtbc0, oneVector);
- vtuc0 = vec_sl(vtuc0, bitNumVec );
-
- //or with original
- vd = vec_or( vd, vtuc0 );
-
- vec_st( vd, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] |= ( src0[i] > constant ) << bitNum;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpGE
- dst[i] = src0[i] >= constant;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
- register vector float v0, v1, v2, v3;
- register vector bool int vr1, vr2, vr3, vr4;
- register vector bool short vs1, vs2;
- register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
- register vector unsigned char vc1;
- register vector bool char vbc1;
- register vector float constVec;
- register vector unsigned char oneVector = (vector unsigned char)(1);
- register vector unsigned char permVec;
- int i = 0;
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = src0[i] >= constant;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- v3_hi = vec_ld( 0, &src0[i] );
-
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- // load values
- v0_low = v3_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src0[i] );
- v2_low = v1_hi;
- v2_hi = vec_ld( 47, &src0[i] );
- v3_low = v2_hi;
- v3_hi = vec_ld( 63, &src0[i] );
- //permute into the vectors we want
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
- v2 = vec_perm( v2_low, v2_hi, permVec );
- v3 = vec_perm( v3_low, v3_hi, permVec );
- //do comparison
- vr1 = vec_cmpge( v0, constVec );
- vr2 = vec_cmpge( v1, constVec );
- vr3 = vec_cmpge( v2, constVec );
- vr4 = vec_cmpge( v3, constVec );
- // pack results into shorts
- vs1 = vec_pack(vr1, vr2);
- vs2 = vec_pack(vr3, vr4);
- // pack results into byte
- vbc1 = vec_pack(vs1, vs2);
- //AND with 1 to get true=1 not true=255
- vc1 = vec_and( vbc1, oneVector );
- //store results
- vec_st( vc1, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] >= constant;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpGE
- dst[i] |= ( src0[i] >= constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
- register vector bool short vtbs0, vtbs1;
- register vector bool char vtbc0;
- register vector unsigned char vtuc0;
- register vector unsigned char permVec, permVec2;
-
- // dest vectors
- register vector unsigned char vd;
- // bitNum vectors
- register vector unsigned char bitNumVec;
- // src0 vectors
- register vector float vs0, vs1, vs2, vs3;
- register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
- // constant vector
- register vector float constVec;
- // vector with every byte set to 1
- register vector unsigned char oneVector = (vector unsigned char)(1);
- int i = 0;
-
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] |= ( src0[i] >= constant ) << bitNum;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //bitNum is unaligned.
- permVec2 = vec_lvsl( 0, &bitNum );
- vtuc0 = vec_ld( 0, &bitNum );
- bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
- bitNumVec = vec_splat( bitNumVec, 0 );
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- vs3_hi = vec_ld( 0, &src0[i] );
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- //load sources (floats)
- vs0_low = vs3_hi;
- vs0_hi = vec_ld( 15, &src0[i] );
- vs1_low = vs0_hi;
- vs1_hi = vec_ld( 31, &src0[i] );
- vs2_low = vs1_hi;
- vs2_hi = vec_ld( 47, &src0[i] );
- vs3_low = vs2_hi;
- vs3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- vs0 = vec_perm( vs0_low, vs0_hi, permVec );
- vs1 = vec_perm( vs1_low, vs1_hi, permVec );
- vs2 = vec_perm( vs2_low, vs2_hi, permVec );
- vs3 = vec_perm( vs3_low, vs3_hi, permVec );
-
- //load dest (bytes) as unsigned char
- vd = vec_ld( 0, &dst[i] );
-
- // do comparison and get bool int result
- vtbi0 = vec_cmpge( vs0, constVec );
- vtbi1 = vec_cmpge( vs1, constVec );
- vtbi2 = vec_cmpge( vs2, constVec );
- vtbi3 = vec_cmpge( vs3, constVec );
-
- // pack results into shorts
- vtbs0 = vec_pack(vtbi0, vtbi1);
- vtbs1 = vec_pack(vtbi2, vtbi3);
-
- // pack results into byte
- vtbc0 = vec_pack(vtbs0, vtbs1);
-
- // AND with 1 to get true=1 instead of true=255
- vtuc0 = vec_and(vtbc0, oneVector);
- vtuc0 = vec_sl(vtuc0, bitNumVec );
-
- //or with original
- vd = vec_or( vd, vtuc0 );
-
- vec_st( vd, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] |= ( src0[i] >= constant ) << bitNum;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpLT
- dst[i] = src0[i] < constant;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] = src0[(X)] < constant;
- register vector float v0, v1, v2, v3;
- register vector bool int vr1, vr2, vr3, vr4;
- register vector bool short vs1, vs2;
- register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
- register vector unsigned char vc1;
- register vector bool char vbc1;
- register vector float constVec;
- register vector unsigned char oneVector = (vector unsigned char)(1);
- register vector unsigned char permVec;
- int i = 0;
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = src0[i] < constant;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- v3_hi = vec_ld( 0, &src0[i] );
-
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- // load values
- v0_low = v3_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src0[i] );
- v2_low = v1_hi;
- v2_hi = vec_ld( 47, &src0[i] );
- v3_low = v2_hi;
- v3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
- v2 = vec_perm( v2_low, v2_hi, permVec );
- v3 = vec_perm( v3_low, v3_hi, permVec );
-
- //do comparison
- vr1 = vec_cmplt( v0, constVec );
- vr2 = vec_cmplt( v1, constVec );
- vr3 = vec_cmplt( v2, constVec );
- vr4 = vec_cmplt( v3, constVec );
-
- // pack results into shorts
- vs1 = vec_pack(vr1, vr2);
- vs2 = vec_pack(vr3, vr4);
-
- // pack results into byte
- vbc1 = vec_pack(vs1, vs2);
-
- //AND with 1 to get true=1 not true=255
- vc1 = vec_and( vbc1, oneVector );
-
- //store results
- vec_st( vc1, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] < constant;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpLT
- dst[i] |= ( src0[i] < constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
- register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
- register vector bool short vtbs0, vtbs1;
- register vector bool char vtbc0;
- register vector unsigned char vtuc0;
- register vector unsigned char permVec, permVec2;
-
- // dest vectors
- register vector unsigned char vd;
- // bitNum vectors
- register vector unsigned char bitNumVec;
- // src0 vectors
- register vector float vs0, vs1, vs2, vs3;
- register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
- // constant vector
- register vector float constVec;
- // vector with every byte set to 1
- register vector unsigned char oneVector = (vector unsigned char)(1);
- int i = 0;
-
- //handle unaligned at start
- for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] |= ( src0[i] < constant ) << bitNum;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //bitNum is unaligned.
- permVec2 = vec_lvsl( 0, &bitNum );
- vtuc0 = vec_ld( 0, &bitNum );
- bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
- bitNumVec = vec_splat( bitNumVec, 0 );
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- vs3_hi = vec_ld( 0, &src0[i] );
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- //load sources (floats)
- vs0_low = vs3_hi;
- vs0_hi = vec_ld( 15, &src0[i] );
- vs1_low = vs0_hi;
- vs1_hi = vec_ld( 31, &src0[i] );
- vs2_low = vs1_hi;
- vs2_hi = vec_ld( 47, &src0[i] );
- vs3_low = vs2_hi;
- vs3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- vs0 = vec_perm( vs0_low, vs0_hi, permVec );
- vs1 = vec_perm( vs1_low, vs1_hi, permVec );
- vs2 = vec_perm( vs2_low, vs2_hi, permVec );
- vs3 = vec_perm( vs3_low, vs3_hi, permVec );
-
- //load dest (bytes) as unsigned char
- vd = vec_ld( 0, &dst[i] );
-
- // do comparison and get bool int result
- vtbi0 = vec_cmplt( vs0, constVec );
- vtbi1 = vec_cmplt( vs1, constVec );
- vtbi2 = vec_cmplt( vs2, constVec );
- vtbi3 = vec_cmplt( vs3, constVec );
-
- // pack results into shorts
- vtbs0 = vec_pack(vtbi0, vtbi1);
- vtbs1 = vec_pack(vtbi2, vtbi3);
-
- // pack results into byte
- vtbc0 = vec_pack(vtbs0, vtbs1);
-
- // AND with 1 to get true=1 instead of true=255
- vtuc0 = vec_and(vtbc0, oneVector);
- vtuc0 = vec_sl(vtuc0, bitNumVec );
-
- //or with original
- vd = vec_or( vd, vtuc0 );
-
- vec_st( vd, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] |= ( src0[i] < constant ) << bitNum;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpLE
- dst[i] = src0[i] <= constant;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] = src0[(X)] <= constant;
- register vector float v0, v1, v2, v3;
- register vector bool int vr1, vr2, vr3, vr4;
- register vector bool short vs1, vs2;
- register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
- register vector unsigned char vc1;
- register vector bool char vbc1;
- register vector float constVec;
- register vector unsigned char oneVector = (vector unsigned char)(1);
- register vector unsigned char permVec;
- int i = 0;
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] = src0[i] <= constant;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- v3_hi = vec_ld( 0, &src0[i] );
-
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- // load values
- v0_low = v3_hi;
- v0_hi = vec_ld( 15, &src0[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src0[i] );
- v2_low = v1_hi;
- v2_hi = vec_ld( 47, &src0[i] );
- v3_low = v2_hi;
- v3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
- v2 = vec_perm( v2_low, v2_hi, permVec );
- v3 = vec_perm( v3_low, v3_hi, permVec );
-
- //do comparison
- vr1 = vec_cmple( v0, constVec );
- vr2 = vec_cmple( v1, constVec );
- vr3 = vec_cmple( v2, constVec );
- vr4 = vec_cmple( v3, constVec );
-
- // pack results into shorts
- vs1 = vec_pack(vr1, vr2);
- vs2 = vec_pack(vr3, vr4);
-
- // pack results into byte
- vbc1 = vec_pack(vs1, vs2);
-
- //AND with 1 to get true=1 not true=255
- vc1 = vec_and( vbc1, oneVector );
-
- //store results
- vec_st( vc1, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src0[i] <= constant;
- }
- }
- /*
- ============
- idSIMD_AltiVec::CmpLE
- dst[i] |= ( src0[i] <= constant ) << bitNum;
- ============
- */
- void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
- register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
- register vector bool short vtbs0, vtbs1;
- register vector bool char vtbc0;
- register vector unsigned char vtuc0;
- register vector unsigned char permVec, permVec2;
-
- // dest vectors
- register vector unsigned char vd;
- // bitNum vectors
- register vector unsigned char bitNumVec;
- // src0 vectors
- register vector float vs0, vs1, vs2, vs3;
- register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
- // constant vector
- register vector float constVec;
- // vector with every byte set to 1
- register vector unsigned char oneVector = (vector unsigned char)(1);
- int i = 0;
-
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
- dst[i] |= ( src0[i] <= constant ) << bitNum;
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //bitNum is unaligned.
- permVec2 = vec_lvsl( 0, &bitNum );
- vtuc0 = vec_ld( 0, &bitNum );
- bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
- bitNumVec = vec_splat( bitNumVec, 0 );
- //calculate permute and do loads
- permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
- vs3_hi = vec_ld( 0, &src0[i] );
- //vectorize!
- for ( ; i+15 < count; i += 16 ) {
- //load sources (floats)
- vs0_low = vs3_hi;
- vs0_hi = vec_ld( 15, &src0[i] );
- vs1_low = vs0_hi;
- vs1_hi = vec_ld( 31, &src0[i] );
- vs2_low = vs1_hi;
- vs2_hi = vec_ld( 47, &src0[i] );
- vs3_low = vs2_hi;
- vs3_hi = vec_ld( 63, &src0[i] );
-
- //permute into the vectors we want
- vs0 = vec_perm( vs0_low, vs0_hi, permVec );
- vs1 = vec_perm( vs1_low, vs1_hi, permVec );
- vs2 = vec_perm( vs2_low, vs2_hi, permVec );
- vs3 = vec_perm( vs3_low, vs3_hi, permVec );
-
- //load dest (bytes) as unsigned char
- vd = vec_ld( 0, &dst[i] );
-
- // do comparison and get bool int result
- vtbi0 = vec_cmple( vs0, constVec );
- vtbi1 = vec_cmple( vs1, constVec );
- vtbi2 = vec_cmple( vs2, constVec );
- vtbi3 = vec_cmple( vs3, constVec );
-
- // pack results into shorts
- vtbs0 = vec_pack(vtbi0, vtbi1);
- vtbs1 = vec_pack(vtbi2, vtbi3);
-
- // pack results into byte
- vtbc0 = vec_pack(vtbs0, vtbs1);
-
- // AND with 1 to get true=1 instead of true=255
- vtuc0 = vec_and(vtbc0, oneVector);
- vtuc0 = vec_sl(vtuc0, bitNumVec );
-
- //or with original
- vd = vec_or( vd, vtuc0 );
-
- vec_st( vd, 0, &dst[i] );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] |= ( src0[i] <= constant ) << bitNum;
- }
- }
- #endif /* ENABLE_COMPARES */
- #ifdef ENABLE_MINMAX
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
- min = idMath::INFINITY; max = -idMath::INFINITY;
- //#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
- register vector float v0, v1, v2, v3;
- register vector float maxVec, minVec, tempMin, tempMax;
- register vector unsigned char permVec;
- register vector float v0_low, v0_hi, v1_low, v1_hi;
- vector unsigned char oneCharVector = (vector unsigned char)(1);
- int i = 0;
-
- if ( count >= 4 ) {
-
- //calculate permute and do first load to
- //get a starting point for min and max
- permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
- v1_hi = vec_ld( 0, &src[0] );
-
- maxVec = loadSplatUnalignedScalar( &max );
- minVec = loadSplatUnalignedScalar( &min );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load sources
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src[i] );
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
-
- // minimum
- v2 = vec_min( v0, v1 );
- minVec = vec_min( minVec, v2 );
- // maximum
- v3 = vec_max( v0, v1 );
- maxVec = vec_max( maxVec, v3 );
- }
-
- //minVec and maxVec hold the min/max elements from the array, but now
- //we need to figure out which particular element it is
-
- tempMin = minVec;
- tempMax = maxVec;
- // rotate vector around and compare to itself to find the real min/max
- tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
- tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
- tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
- tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
- minVec = vec_splat( tempMin, 0 );
- maxVec = vec_splat( tempMax, 0 );
- vec_ste( minVec, 0, &min );
- vec_ste( maxVec, 0, &max );
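- // minVec/maxVec were seeded from the scalar +/-infinity values, so rotating each vector by 8
- // and then 4 bytes while re-applying min/max leaves the overall extreme in every lane; lane 0
- // is then splatted and stored back through the scalar references.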
- }
-
- //cleanup
- for ( ; i < count; i++ ) {
- if ( src[i] < min ) {
- min = src[i];
- }
- if ( src[i] > max ) {
- max = src[i];
- }
- }
- }
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
- min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
- //#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
- idVec2 v;
- int i = 0;
- int j;
-
- const float *srcPtr = src[0].ToFloatPtr();
- register vector float vecLd1, vecLd2, vecLd3, vecLd4;
- register vector float vecMin, vecMax;
-
- register vector float v0, v1, v2, v3;
-
- if ( count > 4 ) {
-
- vecMin = (vector float)(FLT_MAX);
- vecMax = (vector float)(-FLT_MAX); // most negative float, so all-negative inputs still produce the right max
-
- vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
- vector float vecOld = vec_ld( 0, srcPtr );
- for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
- // load data
- float *vecPtr = (float*)( srcPtr + (j*4) );
- vector float v0, v1, v2, v3;
-
- v0 = vecOld;
- v1 = vec_ld( 15, vecPtr );
- v2 = vec_ld( 31, vecPtr );
- v3 = vec_ld( 47, vecPtr );
- vecOld = vec_ld( 63, vecPtr );
-
- vecLd1 = vec_perm( v0, v1, permVec );
- vecLd2 = vec_perm( v1, v2, permVec );
- vecLd3 = vec_perm( v2, v3, permVec );
- vecLd4 = vec_perm( v3, vecOld, permVec );
- // each of these vectors contains 2 elements
- // looks like | X Y X Y | X Y X Y
- v0 = vec_min( vecLd1, vecLd2 );
- v1 = vec_min( vecLd3, vecLd4 );
- v0 = vec_min( v0, v1 );
-
- v2 = vec_max( vecLd1, vecLd2 );
- v3 = vec_max( vecLd3, vecLd4 );
- v2 = vec_max( v2, v3 );
-
- // since it's always X Y X Y we don't have to re-merge each time; we can wait
- // until the end
- vecMin = vec_min( v0, vecMin );
- vecMax = vec_max( v2, vecMax );
- }
- vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
- vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
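- // The lanes alternate X Y X Y, so a single 8-byte rotate folds the two X lanes together and
- // the two Y lanes together; lanes 0 and 1 now hold the component-wise extremes and are
- // written back into the idVec2 results below.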
- v0 = vec_splat( vecMin, 0 );
- v1 = vec_splat( vecMin, 1 );
- v2 = vec_splat( vecMax, 0 );
- v3 = vec_splat( vecMax, 1 );
-
- vec_ste( v0, 0, &min[0] );
- vec_ste( v1, 0, &min[1] );
- vec_ste( v2, 0, &max[0] );
- vec_ste( v3, 0, &max[1] );
- }
-
- // cleanup
- for ( ; i < count; i++ ) {
- v = src[i];
-
- if ( v[0] < min[0] ) {
- min[0] = v[0];
- }
- if ( v[0] > max[0] ) {
- max[0] = v[0];
- }
-
- if ( v[1] < min[1] ) {
- min[1] = v[1];
- }
- if ( v[1] > max[1] ) {
- max[1] = v[1];
- }
- }
- }
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
- min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
- //#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
- int i = 0;
- const float *srcPtr = src[0].ToFloatPtr();
- idVec3 v;
- register vector float vecLd1, vecLd2, vecLd3;
- register vector float vecMin, vecMax;
- register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
- register vector float vecMin1, vecMin2, vecMax1, vecMax2;
-
- if ( count >= 4 ) {
-
- vecMin = (vector float)(FLT_MAX);
- vecMax = (vector float)(-FLT_MAX); // most negative float, so all-negative inputs still produce the right max
-
- vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
- vector float vecOld = vec_ld( 0, srcPtr );
- // 4 elements at a time
- for ( ; i+3 < count; i += 4 ) {
- float *vecPtr = (float*)( srcPtr + (i*3) );
- vector float v0, v1, v2;
-
- v0 = vecOld;
- v1 = vec_ld( 15, vecPtr );
- v2 = vec_ld( 31, vecPtr );
- vecOld = vec_ld( 47, vecPtr );
-
- vecLd1 = vec_perm( v0, v1, permVec );
- vecLd2 = vec_perm( v1, v2, permVec );
- vecLd3 = vec_perm( v2, vecOld, permVec );
-
- // put each idVec3 into its own vector as X Y Z (the fourth lane is junk)
- vecSrc1 = vecLd1;
- vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
- vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
- vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
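- // vecLd1..vecLd3 hold four packed idVec3s (12 floats); the vec_sld shifts realign each vertex
- // so vecSrc1..vecSrc4 are ( xN yN zN junk ). The junk fourth lane never matters because only
- // lanes 0..2 are read out when the results are stored.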
-
- // do min and max
- vecMin1 = vec_min( vecSrc1, vecSrc2 );
- vecMin2 = vec_min( vecSrc3, vecSrc4 );
- vecMin1 = vec_min( vecMin1, vecMin2 );
- vecMin = vec_min( vecMin, vecMin1 );
-
- vecMax1 = vec_max( vecSrc1, vecSrc2 );
- vecMax2 = vec_max( vecSrc3, vecSrc4 );
- vecMax1 = vec_max( vecMax1, vecMax2 );
- vecMax = vec_max( vecMax1, vecMax );
- }
-
- // store results
- vector float v0, v1, v2, v3, v4, v5;
- v0 = vec_splat( vecMin, 0 );
- v1 = vec_splat( vecMin, 1 );
- v2 = vec_splat( vecMin, 2 );
- v3 = vec_splat( vecMax, 0 );
- v4 = vec_splat( vecMax, 1 );
- v5 = vec_splat( vecMax, 2 );
- vec_ste( v0, 0, &min[0] );
- vec_ste( v1, 0, &min[1] );
- vec_ste( v2, 0, &min[2] );
- vec_ste( v3, 0, &max[0] );
- vec_ste( v4, 0, &max[1] );
- vec_ste( v5, 0, &max[2] );
- }
-
- // cleanup
- for ( ; i < count; i ++ ) {
- v = src[i];
-
- if ( v[0] < min[0] ) {
- min[0] = v[0];
- }
- if ( v[0] > max[0] ) {
- max[0] = v[0];
- }
- if ( v[1] < min[1] ) {
- min[1] = v[1];
- }
- if ( v[1] > max[1] ) {
- max[1] = v[1];
- }
- if ( v[2] < min[2] ) {
- min[2] = v[2];
- }
- if ( v[2] > max[2] ) {
- max[2] = v[2];
- }
- }
- }
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
-
- min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
- idVec3 v;
- int i = 0;
- register vector float vecMin, vecMax;
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float vecMin1, vecMin2, vecMax1, vecMax2;
-
- if ( count >= 4 ) {
- vecMin = (vector float)(FLT_MAX);
- vecMax = (vector float)(-FLT_MAX); // most negative float, so all-negative inputs still produce the right max
-
- vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
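- // The four permute masks can be hoisted out of the loop: with idDrawVert at 60 bytes (see the
- // size assert in the Dot routines above), stepping four verts per iteration advances each
- // pointer by 240 bytes, a multiple of 16, so the alignment of each of the four slots (and
- // therefore its vec_lvsl mask) never changes.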
- for ( ; i+3 < count; i += 4) {
- const float *vertPtr = src[i].xyz.ToFloatPtr();
- const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 11, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 11, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 11, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 11, vertPtr4 );
-
- v0 = vec_perm( v0, v1, vertPerm1 );
- v2 = vec_perm( v2, v3, vertPerm2 );
- v4 = vec_perm( v4, v5, vertPerm3 );
- v6 = vec_perm( v6, v7, vertPerm4 );
-
- vecMin1 = vec_min( v0, v2 );
- vecMin2 = vec_min( v4, v6 );
- vecMin1 = vec_min( vecMin1, vecMin2 );
- vecMin = vec_min( vecMin, vecMin1 );
-
- vecMax1 = vec_max( v0, v2 );
- vecMax2 = vec_max( v4, v6 );
- vecMax1 = vec_max( vecMax1, vecMax2 );
- vecMax = vec_max( vecMax, vecMax1 );
- }
-
- // now we have min/max vectors in X Y Z form, store out
- v0 = vec_splat( vecMin, 0 );
- v1 = vec_splat( vecMin, 1 );
- v2 = vec_splat( vecMin, 2 );
- v3 = vec_splat( vecMax, 0 );
- v4 = vec_splat( vecMax, 1 );
- v5 = vec_splat( vecMax, 2 );
- vec_ste( v0, 0, &min[0] );
- vec_ste( v1, 0, &min[1] );
- vec_ste( v2, 0, &min[2] );
- vec_ste( v3, 0, &max[0] );
- vec_ste( v4, 0, &max[1] );
- vec_ste( v5, 0, &max[2] );
- }
-
- // cleanup
- for ( ; i < count; i++ ) {
- v = src[i].xyz;
-
- if ( v[0] < min[0] ) {
- min[0] = v[0];
- }
- if ( v[0] > max[0] ) {
- max[0] = v[0];
- }
-
- if ( v[1] < min[1] ) {
- min[1] = v[1];
- }
- if ( v[1] > max[1] ) {
- max[1] = v[1];
- }
-
- if ( v[2] > max[2] ) {
- max[2] = v[2];
- }
-
- if ( v[2] < min[2] ) {
- min[2] = v[2];
- }
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
-
- min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
- idVec3 v;
- int i = 0;
- register vector float vecMin, vecMax;
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float vecMin1, vecMin2, vecMax1, vecMax2;
-
- if ( count >= 4 ) {
- vecMin = (vector float)(FLT_MAX);
- vecMax = (vector float)(-FLT_MAX); // most negative float, so all-negative inputs still produce the right max
- for ( ; i+3 < count; i += 4) {
- const float *vertPtr = src[i].xyz.ToFloatPtr();
- const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
- v0 = vec_ld( 0, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
-
- vecMin1 = vec_min( v0, v2 );
- vecMin2 = vec_min( v4, v6 );
- vecMin1 = vec_min( vecMin1, vecMin2 );
- vecMin = vec_min( vecMin, vecMin1 );
-
- vecMax1 = vec_max( v0, v2 );
- vecMax2 = vec_max( v4, v6 );
- vecMax1 = vec_max( vecMax1, vecMax2 );
- vecMax = vec_max( vecMax, vecMax1 );
- }
-
- // now we have min/max vectors in X Y Z form, store out
- v0 = vec_splat( vecMin, 0 );
- v1 = vec_splat( vecMin, 1 );
- v2 = vec_splat( vecMin, 2 );
- v3 = vec_splat( vecMax, 0 );
- v4 = vec_splat( vecMax, 1 );
- v5 = vec_splat( vecMax, 2 );
- vec_ste( v0, 0, &min[0] );
- vec_ste( v1, 0, &min[1] );
- vec_ste( v2, 0, &min[2] );
- vec_ste( v3, 0, &max[0] );
- vec_ste( v4, 0, &max[1] );
- vec_ste( v5, 0, &max[2] );
- }
-
- // cleanup
- for ( ; i < count; i++ ) {
- v = src[i].xyz;
-
- if ( v[0] < min[0] ) {
- min[0] = v[0];
- }
- if ( v[0] > max[0] ) {
- max[0] = v[0];
- }
-
- if ( v[1] < min[1] ) {
- min[1] = v[1];
- }
- if ( v[1] > max[1] ) {
- max[1] = v[1];
- }
-
- if ( v[2] > max[2] ) {
- max[2] = v[2];
- }
-
- if ( v[2] < min[2] ) {
- min[2] = v[2];
- }
- }
- }
- #endif /* DRAWVERT_PADDED */
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
- min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
-
- idVec3 v;
- int i = 0;
-
- register vector float vecMin, vecMax;
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float vecMin1, vecMin2, vecMax1, vecMax2;
-
- if ( count >= 4 ) {
-
- vecMin = (vector float)(FLT_MAX);
- vecMax = (vector float)(-FLT_MAX); // most negative float, so all-negative inputs still produce the right max
-
- vector unsigned char vertPerm1;
- vector unsigned char vertPerm2;
- vector unsigned char vertPerm3;
- vector unsigned char vertPerm4;
- for ( ; i+3 < count; i += 4) {
- const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
- const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
- const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
- const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
- vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 15, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 15, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 15, vertPtr4 );
-
- v0 = vec_perm( v0, v1, vertPerm1 );
- v2 = vec_perm( v2, v3, vertPerm2 );
- v4 = vec_perm( v4, v5, vertPerm3 );
- v6 = vec_perm( v6, v7, vertPerm4 );
-
- vecMin1 = vec_min( v0, v2 );
- vecMin2 = vec_min( v4, v6 );
- vecMin1 = vec_min( vecMin1, vecMin2 );
- vecMin = vec_min( vecMin, vecMin1 );
-
- vecMax1 = vec_max( v0, v2 );
- vecMax2 = vec_max( v4, v6 );
- vecMax1 = vec_max( vecMax1, vecMax2 );
- vecMax = vec_max( vecMax, vecMax1 );
- }
-
- // now we have min/max vectors in X Y Z form, store out
- v0 = vec_splat( vecMin, 0 );
- v1 = vec_splat( vecMin, 1 );
- v2 = vec_splat( vecMin, 2 );
- v3 = vec_splat( vecMax, 0 );
- v4 = vec_splat( vecMax, 1 );
- v5 = vec_splat( vecMax, 2 );
- vec_ste( v0, 0, &min[0] );
- vec_ste( v1, 0, &min[1] );
- vec_ste( v2, 0, &min[2] );
- vec_ste( v3, 0, &max[0] );
- vec_ste( v4, 0, &max[1] );
- vec_ste( v5, 0, &max[2] );
- }
-
- // cleanup
- for ( ; i < count; i++ ) {
- v = src[indexes[i]].xyz;
-
- if ( v[0] < min[0] ) {
- min[0] = v[0];
- }
- if ( v[0] > max[0] ) {
- max[0] = v[0];
- }
-
- if ( v[1] < min[1] ) {
- min[1] = v[1];
- }
- if ( v[1] > max[1] ) {
- max[1] = v[1];
- }
-
- if ( v[2] > max[2] ) {
- max[2] = v[2];
- }
-
- if ( v[2] < min[2] ) {
- min[2] = v[2];
- }
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::MinMax
- ============
- */
- void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
- min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
-
- idVec3 v;
- int i = 0;
-
- register vector float vecMin, vecMax;
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float vecMin1, vecMin2, vecMax1, vecMax2;
-
- if ( count >= 4 ) {
-
- vecMin = (vector float)(FLT_MAX);
- vecMax = (vector float)(-FLT_MAX); // most negative float, so all-negative inputs still produce the right max
-
- vector unsigned char vertPerm1;
- vector unsigned char vertPerm2;
- vector unsigned char vertPerm3;
- vector unsigned char vertPerm4;
- for ( ; i+3 < count; i += 4) {
- const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
- const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
- const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
- const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
-
- vecMin1 = vec_min( v0, v2 );
- vecMin2 = vec_min( v4, v6 );
- vecMin1 = vec_min( vecMin1, vecMin2 );
- vecMin = vec_min( vecMin, vecMin1 );
-
- vecMax1 = vec_max( v0, v2 );
- vecMax2 = vec_max( v4, v6 );
- vecMax1 = vec_max( vecMax1, vecMax2 );
- vecMax = vec_max( vecMax, vecMax1 );
- }
-
- // now we have min/max vectors in X Y Z form, store out
- v0 = vec_splat( vecMin, 0 );
- v1 = vec_splat( vecMin, 1 );
- v2 = vec_splat( vecMin, 2 );
- v3 = vec_splat( vecMax, 0 );
- v4 = vec_splat( vecMax, 1 );
- v5 = vec_splat( vecMax, 2 );
- vec_ste( v0, 0, &min[0] );
- vec_ste( v1, 0, &min[1] );
- vec_ste( v2, 0, &min[2] );
- vec_ste( v3, 0, &max[0] );
- vec_ste( v4, 0, &max[1] );
- vec_ste( v5, 0, &max[2] );
- }
-
- // cleanup
- for ( ; i < count; i++ ) {
- v = src[indexes[i]].xyz;
-
- if ( v[0] < min[0] ) {
- min[0] = v[0];
- }
- if ( v[0] > max[0] ) {
- max[0] = v[0];
- }
-
- if ( v[1] < min[1] ) {
- min[1] = v[1];
- }
- if ( v[1] > max[1] ) {
- max[1] = v[1];
- }
-
- if ( v[2] > max[2] ) {
- max[2] = v[2];
- }
-
- if ( v[2] < min[2] ) {
- min[2] = v[2];
- }
- }
- }
- #endif /* DRAWVERT_PADDED */
- #endif /* ENABLE_MINMAX */
- #ifdef ENABLE_CLAMP
- /*
- ============
- idSIMD_AltiVec::Clamp
- ============
- */
- void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
- //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
- register vector float v0, v1, v2, v3, v4, v5;
- register vector unsigned char permVec;
- register vector float v0_low, v0_hi, v1_low, v1_hi;
- vector unsigned char oneVector = (vector unsigned char)(1);
- register vector float minVec, maxVec;
- int i = 0;
-
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
- }
-
- //splat min/max into a vector
- minVec = loadSplatUnalignedScalar( &min );
- maxVec = loadSplatUnalignedScalar( &max );
-
- //calculate permute and do first load
- permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
- v1_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src[i] );
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
-
- //apply minimum
- v2 = vec_max( v0, minVec );
- v3 = vec_max( v1, minVec );
-
- //apply maximum
- v4 = vec_min( v2, maxVec );
- v5 = vec_min( v3, maxVec );
-
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::ClampMin
- ============
- */
- void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
- //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
- register vector float v0, v1, v2, v3;
- register vector unsigned char permVec;
- register vector float v0_low, v0_hi, v1_low, v1_hi;
- register vector float constVec;
- vector unsigned char oneVector = (vector unsigned char)(1);
- int i = 0;
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = src[i] < min ? min : src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &min );
-
- //calculate permute and do first load
- permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
- v1_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
- v2 = vec_max( v0, constVec );
- v3 = vec_max( v1, constVec );
-
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src[i] < min ? min : src[i];
- }
- }
- /*
- ============
- idSIMD_AltiVec::ClampMax
- ============
- */
- void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
- //#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
- register vector float v0, v1, v2, v3;
- register vector unsigned char permVec;
- register vector float constVec;
- register vector float v0_low, v0_hi, v1_low, v1_hi;
- vector unsigned char oneVector = (vector unsigned char)(1);
- int i = 0;
-
- //handle unaligned at start
- for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
- dst[i] = src[i] > max ? max : src[i];
- }
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &max );
-
- //calculate permute and do first load
- permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
- v1_hi = vec_ld( 0, &src[i] );
-
- //vectorize!
- for ( ; i+7 < count; i += 8 ) {
- //load source
- v0_low = v1_hi;
- v0_hi = vec_ld( 15, &src[i] );
- v1_low = v0_hi;
- v1_hi = vec_ld( 31, &src[i] );
-
- v0 = vec_perm( v0_low, v0_hi, permVec );
- v1 = vec_perm( v1_low, v1_hi, permVec );
- v2 = vec_min( v0, constVec );
- v3 = vec_min( v1, constVec );
-
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- //handle cleanup
- for ( ; i < count ; i++ ) {
- dst[i] = src[i] > max ? max : src[i];
- }
- }
- #endif /* ENABLE_CLAMP */
- #ifdef ENABLE_16ROUTINES
- /*
- ============
- idSIMD_AltiVec::Zero16
- ============
- */
- void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
- memset( dst, 0, count * sizeof( float ) );
- }
- /*
- ============
- idSIMD_AltiVec::Negate16
- Assumptions:
- dst is aligned
- ============
- */
- void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
- //#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
-
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
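- // ( count + 3 ) & ~3 rounds count up to the next multiple of 4 (e.g. 5 -> 8, 12 -> 12), so the
- // loops below only ever issue whole, aligned vector loads/stores; this can touch up to three
- // floats past count, which presumably relies on the caller padding these "16" buffers to a
- // multiple of 4 floats.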
-
- int i = 0;
- vector float v0, v1, v2, v3;
-
- // we know it's 16-byte aligned
- for ( ; i + 7 < count2; i += 8 ) {
- v0 = vec_ld( 0, &dst[i] );
- v1 = vec_ld( 16, &dst[i] );
-
- v2 = vec_sub( (vector float)(0), v0 );
- v3 = vec_sub( (vector float)(0), v1 );
-
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &dst[i] );
- v1 = vec_sub( (vector float)(0), v0 );
- vec_st( v1, 0, &dst[i] );
- }
- }
- /*
- ============
- idSIMD_AltiVec::Copy16
- ============
- */
- void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
- //#define OPER(X) dst[(X)] = src[(X)]
- memcpy( dst, src, sizeof(float) * count );
- }
- /*
- ============
- idSIMD_AltiVec::Add16
-
- Assumptions:
- Assumes dst, src1, src2 all start at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
- //#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // src1 is aligned
- assert( IS_16BYTE_ALIGNED( src1[0] ) );
- // src2 is aligned
- assert( IS_16BYTE_ALIGNED( src2[0] ) );
-
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
-
- register vector float v0, v1, v2, v3, v4, v5;
- int i = 0;
-
- //know all data is 16-byte aligned, so vectorize!
- for ( ; i+7 < count2; i += 8 ) {
- //load sources
- v0 = vec_ld( 0, &src1[i] );
- v1 = vec_ld( 16, &src1[i] );
- v2 = vec_ld( 0, &src2[i] );
- v3 = vec_ld( 16, &src2[i] );
- v4 = vec_add( v0, v2 );
- v5 = vec_add( v1, v3 );
-
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &src1[i] );
- v1 = vec_ld( 0, &src2[i] );
- v2 = vec_add( v0, v1 );
- vec_st( v2, 0, &dst[i] );
- }
- }
- /*
- ============
- idSIMD_AltiVec::Sub16
- Assumptions:
- Assumes that dst, src1, and src2 all start at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
- //#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // src1 is aligned
- assert( IS_16BYTE_ALIGNED( src1[0] ) );
- // src2 is aligned
- assert( IS_16BYTE_ALIGNED( src2[0] ) );
-
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
- register vector float v0, v1, v2, v3, v4, v5;
- int i = 0;
-
- //know data is aligned, so vectorize!
- for ( ; i+7 < count2; i += 8 ) {
- //load sources
- v0 = vec_ld( 0, &src1[i] );
- v1 = vec_ld( 16, &src1[i] );
- v2 = vec_ld( 0, &src2[i] );
- v3 = vec_ld( 16, &src2[i] );
- v4 = vec_sub( v0, v2 );
- v5 = vec_sub( v1, v3 );
-
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &src1[i] );
- v1 = vec_ld( 0, &src2[i] );
- v2 = vec_sub( v0, v1 );
- vec_st( v2, 0, &dst[i] );
- }
- }
- /*
- ============
- idSIMD_AltiVec::Mul16
- Assumptions:
- Assumes that dst and src1 start at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] = src1[(X)] * constant
-
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // src1 is aligned
- assert( IS_16BYTE_ALIGNED( src1[0] ) );
-
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
-
- register vector float v0, v1, v2, v3;
- register vector float constVec;
- register vector float zeroVector = (vector float)(0.0);
- int i = 0;
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //know data is aligned, so vectorize!
- for ( ; i+7 < count2; i += 8 ) {
- //load source
- v0 = vec_ld( 0, &src1[i] );
- v1 = vec_ld( 16, &src1[i] );
- v2 = vec_madd( constVec, v0, zeroVector );
- v3 = vec_madd( constVec, v1, zeroVector );
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &src1[i] );
- v1 = vec_madd( constVec, v0, zeroVector );
- vec_st( v1, 0, &dst[i] );
- }
- }
- /*
- ============
- idSIMD_AltiVec::AddAssign16
- Assumptions:
- Assumes that dst and src start at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
- //#define OPER(X) dst[(X)] += src[(X)]
-
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // src is aligned
- assert( IS_16BYTE_ALIGNED( src[0] ) );
-
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
-
- register vector float v0, v1, v2, v3, v4, v5;
- int i = 0;
-
- //vectorize!
- for ( ; i+7 < count2; i += 8 ) {
- v0 = vec_ld( 0, &src[i] );
- v1 = vec_ld( 16, &src[i] );
- v2 = vec_ld( 0, &dst[i] );
- v3 = vec_ld( 16, &dst[i] );
- v4 = vec_add( v0, v2 );
- v5 = vec_add( v1, v3 );
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &src[i] );
- v1 = vec_ld( 0, &dst[i] );
- v2 = vec_add( v0, v1 );
- vec_st( v2, 0, &dst[i] );
- }
- }
- /*
- ============
- idSIMD_AltiVec::SubAssign16
- Assumptions:
- Assumes that dst and src start at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
- //#define OPER(X) dst[(X)] -= src[(X)]
- register vector float v0, v1, v2, v3, v4, v5;
- int i=0;
-
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // src is aligned
- assert( IS_16BYTE_ALIGNED( src[0] ) );
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
-
- //vectorize!
- for ( ; i+7 < count2; i += 8 ) {
- v0 = vec_ld( 0, &src[i] );
- v1 = vec_ld( 16, &src[i] );
- v2 = vec_ld( 0, &dst[i] );
- v3 = vec_ld( 16, &dst[i] );
- v4 = vec_sub( v2, v0 );
- v5 = vec_sub( v3, v1 );
- ALIGNED_STORE2( &dst[i], v4, v5 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &src[i] );
- v1 = vec_ld( 0, &dst[i] );
- v2 = vec_sub( v1, v0 );
- vec_st( v2, 0, &dst[i] );
- }
- }
- /*
- ============
- idSIMD_AltiVec::MulAssign16
- Assumptions:
- Assumes that dst starts at aligned address and count is multiple of 4
- ============
- */
- void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
- //#define OPER(X) dst[(X)] *= constant
- // dst is aligned
- assert( IS_16BYTE_ALIGNED( dst[0] ) );
- // round count up to the next multiple of 4 if need be
- int count2 = ( count + 3 ) & ~3;
-
- register vector float v0, v1, v2, v3;
- register vector float constVec;
- int i = 0;
- register vector float zeroVector = (vector float)(0.0);
-
- //splat constant into a vector
- constVec = loadSplatUnalignedScalar( &constant );
-
- //vectorize!
- for ( ; i+7 < count2; i += 8 ) {
- v0 = vec_ld( 0, &dst[i] );
- v1 = vec_ld( 16, &dst[i] );
- v2 = vec_madd( v0, constVec, zeroVector );
- v3 = vec_madd( v1, constVec, zeroVector );
- ALIGNED_STORE2( &dst[i], v2, v3 );
- }
-
- for ( ; i < count2; i += 4 ) {
- v0 = vec_ld( 0, &dst[i] );
- v1 = vec_madd( v0, constVec, zeroVector );
- vec_st( v1, 0, &dst[i] );
- }
- }
- #endif /* ENABLE_16ROUTINES */
- #ifdef ENABLE_LOWER_TRIANGULAR
- /*
- ============
- idSIMD_AltiVec::MatX_LowerTriangularSolve
- solves x in L * x = b for the first n rows of L
- if skip > 0 the first skip elements of x are assumed to be valid already
- L has to be a lower triangular matrix with (implicit) ones on the diagonal
- x == b is allowed
- ============
- */
- void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
-
- int i, j;
- const float *lptr;
- const float *lptr2;
- const float *lptr3;
- const float *lptr4;
- float sum;
- float sum2;
- float sum3;
- float sum4;
- float tempSum;
- float tempSum2;
- float tempSum3;
- float tempSum4;
- vector float vecSum1 = (vector float)(0.0);
- vector float vecSum2 = (vector float)(0.0);
- vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
- vector float zeroVector = (vector float)(0.0);
- vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
-
- vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
-
- // unrolled this loop a bit
- for ( i = skip; i+3 < n; i+=4 ) {
- sum = b[i];
- sum2 = b[i+1];
- sum3 = b[i+2];
- sum4 = b[i+3];
-
- vecSum1 = zeroVector;
- vecSum2 = zeroVector;
- vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
- lptr = L[i];
- lptr2 = L[i+1];
- lptr3 = L[i+2];
- lptr4 = L[i+3];
-
- vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
- vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
- vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
- vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
-
- for ( j = 0 ; j+7 < i; j+=8 ) {
-
- v0 = vec_ld( 0, &x[j] );
- v1 = vec_ld( 15, &x[j] );
- vector float vecExtraX = vec_ld( 31, &x[j] );
- v0 = vec_perm( v0, v1, vecPermX );
- v1 = vec_perm( v1, vecExtraX, vecPermX );
-
- v2 = vec_ld( 0, lptr + j );
- v3 = vec_ld( 15, lptr + j );
- vector float vecExtra1 = vec_ld( 31, lptr + j );
- v2 = vec_perm( v2, v3, vecPermLptr1 );
- v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
-
- v4 = vec_ld( 0, lptr2 + j );
- v5 = vec_ld( 15, lptr2 + j );
- vector float vecExtra2 = vec_ld( 31, lptr2 + j );
- v4 = vec_perm( v4, v5, vecPermLptr2 );
- v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
-
- v6 = vec_ld( 0, lptr3 + j );
- v7 = vec_ld( 15, lptr3 + j );
- vector float vecExtra3 = vec_ld( 31, lptr3 + j );
- v6 = vec_perm( v6, v7, vecPermLptr3 );
- v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
-
- v8 = vec_ld( 0, lptr4 + j );
- v9 = vec_ld( 15, lptr4 + j );
- vector float vecExtra4 = vec_ld( 31, lptr4 + j );
- v8 = vec_perm( v8, v9, vecPermLptr4 );
- v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
-
- vecSum1 = vec_madd( v2, v0, vecSum1 );
- vecSum2 = vec_madd( v3, v1, vecSum2 );
-
- vecSum3 = vec_madd( v4, v0, vecSum3 );
- vecSum4 = vec_madd( v5, v1, vecSum4 );
-
- vecSum5 = vec_madd( v6, v0, vecSum5 );
- vecSum6 = vec_madd( v7, v1, vecSum6 );
-
- vecSum7 = vec_madd( v8, v0, vecSum7 );
- vecSum8 = vec_madd( v9, v1, vecSum8 );
- }
-
- // if we ran the unrolled code, we need to sum across the vectors
- // to find out how much to subtract from sum
- if ( j > 0 ) {
- vecSum1 = vec_add( vecSum1, vecSum2 );
- vecSum3 = vec_add( vecSum3, vecSum4 );
- vecSum5 = vec_add( vecSum5, vecSum6 );
- vecSum7 = vec_add( vecSum7, vecSum8 );
- // sum across the vectors
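- // (vec_sld rotates the vector by 8 and then 4 bytes, so after the two add passes every lane
- // holds the total of all four elements; vec_splat / vec_ste below then move lane 0 to memory)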
- vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
- vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
-
- vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
- vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
-
- vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
- vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
-
- vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
- vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
- //move the result to the FPU
- vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
- vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
- vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
- vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
-
- sum -= tempSum;
- sum2 -= tempSum2;
- sum3 -= tempSum3;
- sum4 -= tempSum4;
- }
-
- //cleanup
- for ( ; j < i; j++ ) {
- sum -= lptr[j] * x[j];
- sum2 -= lptr2[j] * x[j];
- sum3 -= lptr3[j] * x[j];
- sum4 -= lptr4[j] * x[j];
- }
- // resolve the dependencies within this block of 4, then store the 4 results
- sum2 -= ( lptr2[i] * sum );
- sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
- sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
-
- x[i] = sum;
- x[i+1] = sum2;
- x[i+2] = sum3;
- x[i+3] = sum4;
- }
-
- // cleanup
- for ( ; i < n; i++ ) {
- sum = b[i];
- vecSum1 = zeroVector;
- vecSum2 = zeroVector;
- lptr = L[i];
- vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
-
- for ( j = 0 ; j+7 < i; j+=8 ) {
-
- v0 = vec_ld( 0, &x[j] );
- v2 = vec_ld( 15, &x[j] );
- vector float vecExtraX = vec_ld( 31, &x[j] );
- v0 = vec_perm( v0, v2, vecPermX );
- v2 = vec_perm( v2, vecExtraX, vecPermX );
-
- v1 = vec_ld( 0, lptr + j );
- v3 = vec_ld( 15, lptr + j );
- vector float vecExtra = vec_ld( 31, lptr + j );
- v1 = vec_perm( v1, v3, vecPermLptr );
- v3 = vec_perm( v3, vecExtra, vecPermLptr );
-
- vecSum1 = vec_madd( v1, v0, vecSum1 );
- vecSum2 = vec_madd( v3, v2, vecSum2 );
- }
-
- // if we ran the unrolled code, we need to sum across the vectors
- // to find out how much to subtract from sum
- if ( j > 0 ) {
- // sum across the vectors
- vecSum1 = vec_add( vecSum1, vecSum2 );
- vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
- vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
- //move the result to the FPU
- vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
- sum -= tempSum;
- }
-
- //cleanup
- for ( ; j < i; j++ ) {
- sum -= lptr[j] * x[j];
- }
- x[i] = sum;
- }
- }
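- /*
- 	For reference, the loops above are an unrolled forward substitution. A minimal scalar sketch of the
- 	same recurrence ( x[i] = b[i] - sum of L[i][j] * x[j] for j < i, with an implicit 1 on the diagonal ).
- 	The helper name is illustrative only and the block is kept out of the build.
- */
- #if 0
- static void LowerTriangularSolve_ScalarSketch( const idMatX &L, float *x, const float *b, const int n, int skip ) {
- 	for ( int i = skip; i < n; i++ ) {
- 		float sum = b[i];
- 		const float *lptr = L[i];
- 		for ( int j = 0; j < i; j++ ) {
- 			sum -= lptr[j] * x[j];
- 		}
- 		x[i] = sum;
- 	}
- }
- #endif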
- /*
- ============
- idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
- solves x in L.Transpose() * x = b for the first n rows of L
- L has to be a lower triangular matrix with (implicit) ones on the diagonal
- x == b is allowed
- ============
- */
- void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
- int nc;
- const float *lptr;
- lptr = L.ToFloatPtr();
- nc = L.GetNumColumns();
- float x0, x1, x2, x3, x4, x5, x6;
- // unrolled cases for n < 8
- if ( n < 8 ) {
- switch( n ) {
- // using local variables to avoid aliasing issues
- case 0:
- return;
- case 1:
- x[0] = b[0];
- return;
- case 2:
- x1 = b[1];
- x0 = b[0] - lptr[1*nc+0] * x1;
-
- x[1] = x1;
- x[0] = x0;
- return;
- case 3:
- x2 = b[2];
- x1 = b[1] - lptr[2*nc+1] * x2;
- x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
-
- x[2] = x2;
- x[1] = x1;
- x[0] = x0;
- return;
- case 4:
- x3 = b[3];
- x2 = b[2] - lptr[3*nc+2] * x3;
- x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
- x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
-
- x[3] = x3;
- x[2] = x2;
- x[1] = x1;
- x[0] = x0;
-
- return;
- case 5:
- x4 = b[4];
- x3 = b[3] - lptr[4*nc+3] * x4;
- x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
- x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
- x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
-
- x[4] = x4;
- x[3] = x3;
- x[2] = x2;
- x[1] = x1;
- x[0] = x0;
- return;
- case 6:
- x5 = b[5];
- x4 = b[4] - lptr[5*nc+4] * x5;
- x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
- x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
- x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
- x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
-
- x[5] = x5;
- x[4] = x4;
- x[3] = x3;
- x[2] = x2;
- x[1] = x1;
- x[0] = x0;
-
- return;
- case 7:
- x6 = b[6];
- x5 = b[5] - lptr[6*nc+5] * x6;
- x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
- x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
- x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
- x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
- x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
-
- x[6] = x6;
- x[5] = x5;
- x[4] = x4;
- x[3] = x3;
- x[2] = x2;
- x[1] = x1;
- x[0] = x0;
- return;
- }
- return;
- }
- int i, j;
- register float s0, s1, s2, s3;
- float *xptr;
- lptr = L.ToFloatPtr() + n * nc + n - 4;
- xptr = x + n;
- // process 4 rows at a time
- for ( i = n; i >= 4; i -= 4 ) {
- s0 = b[i-4];
- s1 = b[i-3];
- s2 = b[i-2];
- s3 = b[i-1];
- // process 4x4 blocks
- for ( j = 0; j < n-i; j += 4 ) {
- s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
- s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
- s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
- s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
- s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
- s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
- s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
- s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
- s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
- s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
- s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
- s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
- s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
- s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
- s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
- s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
- }
- // process the left-over dependencies among the 4 rows
- s0 -= lptr[0-1*nc] * s3;
- s1 -= lptr[1-1*nc] * s3;
- s2 -= lptr[2-1*nc] * s3;
- s0 -= lptr[0-2*nc] * s2;
- s1 -= lptr[1-2*nc] * s2;
- s0 -= lptr[0-3*nc] * s1;
- // store result
- xptr[-4] = s0;
- xptr[-3] = s1;
- xptr[-2] = s2;
- xptr[-1] = s3;
- // update pointers for next four rows
- lptr -= 4 + 4 * nc;
- xptr -= 4;
- }
- // process the remaining rows
- for ( i--; i >= 0; i-- ) {
- s0 = b[i];
- lptr = L[0] + i;
- for ( j = i + 1; j < n; j++ ) {
- s0 -= lptr[j*nc] * x[j];
- }
- x[i] = s0;
- }
- }
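- /*
- 	For reference, the scalar recurrence the code above unrolls: solving L.Transpose() * x = b runs
- 	backwards, x[i] = b[i] - sum of L[j][i] * x[j] for j > i, again with an implicit unit diagonal.
- 	Illustrative sketch only (the helper name is not part of the interface), kept out of the build.
- */
- #if 0
- static void LowerTriangularSolveTranspose_ScalarSketch( const idMatX &L, float *x, const float *b, const int n ) {
- 	const float *lptr = L.ToFloatPtr();
- 	const int nc = L.GetNumColumns();
- 	for ( int i = n - 1; i >= 0; i-- ) {
- 		float sum = b[i];
- 		for ( int j = i + 1; j < n; j++ ) {
- 			sum -= lptr[j*nc+i] * x[j];
- 		}
- 		x[i] = sum;
- 	}
- }
- #endif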
- /*
- ============
- idSIMD_AltiVec::MatX_LDLTFactor
- ============
- */
- bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
- int i, j, k, nc;
- float *v, *diag, *mptr;
- float s0, s1, s2, s3, sum, d;
- float s0_2, s1_2, s2_2, s3_2, sum_2;
- float *mptr2;
- v = (float *) _alloca16( n * sizeof( float ) );
- diag = (float *) _alloca16( n * sizeof( float ) );
- nc = mat.GetNumColumns();
- if ( n <= 0 ) {
- return true;
- }
- mptr = mat[0];
- sum = mptr[0];
- if ( sum == 0.0f ) {
- return false;
- }
- diag[0] = sum;
- invDiag[0] = d = 1.0f / sum;
- if ( n <= 1 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 1; j < n; j++ ) {
- mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
- }
- mptr = mat[1];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- sum = mptr[1] - s0;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[1][1] = sum;
- diag[1] = sum;
- invDiag[1] = d = 1.0f / sum;
- if ( n <= 2 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 2; j < n; j++ ) {
- mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
- }
- mptr = mat[2];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- sum = mptr[2] - s0 - s1;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[2][2] = sum;
- diag[2] = sum;
- invDiag[2] = d = 1.0f / sum;
- if ( n <= 3 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 3; j < n; j++ ) {
- mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
- }
- mptr = mat[3];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
- sum = mptr[3] - s0 - s1 - s2;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[3][3] = sum;
- diag[3] = sum;
- invDiag[3] = d = 1.0f / sum;
- if ( n <= 4 ) {
- return true;
- }
- mptr = mat[0];
- for ( j = 4; j < n; j++ ) {
- mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
- }
- for ( i = 4; i < n; i++ ) {
- mptr = mat[i];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
- v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
- for ( k = 4; k < i-3; k += 4 ) {
- v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
- v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
- v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
- v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
- }
- switch( i - k ) {
- case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
- case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
- case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
- }
- sum = s3;
- sum += s2;
- sum += s1;
- sum += s0;
- sum = mptr[i] - sum;
- if ( sum == 0.0f ) {
- return false;
- }
- mat[i][i] = sum;
- diag[i] = sum;
- invDiag[i] = d = 1.0f / sum;
- if ( i + 1 >= n ) {
- return true;
- }
- // unrolling madness!
- mptr = mat[i+1];
- mptr2 = mat[i+1] + nc;
-
- for ( j = i+1; j+1 < n; j+=2 ) {
- s0 = mptr[0] * v[0];
- s1 = mptr[1] * v[1];
- s2 = mptr[2] * v[2];
- s3 = mptr[3] * v[3];
-
- s0_2 = mptr2[0] * v[0];
- s1_2 = mptr2[1] * v[1];
- s2_2 = mptr2[2] * v[2];
- s3_2 = mptr2[3] * v[3];
-
- for ( k = 4; k < i-7; k += 8 ) {
- s0 += mptr[k+0] * v[k+0];
- s1 += mptr[k+1] * v[k+1];
- s2 += mptr[k+2] * v[k+2];
- s3 += mptr[k+3] * v[k+3];
- s0 += mptr[k+4] * v[k+4];
- s1 += mptr[k+5] * v[k+5];
- s2 += mptr[k+6] * v[k+6];
- s3 += mptr[k+7] * v[k+7];
-
- s0_2 += mptr2[k+0] * v[k+0];
- s1_2 += mptr2[k+1] * v[k+1];
- s2_2 += mptr2[k+2] * v[k+2];
- s3_2 += mptr2[k+3] * v[k+3];
- s0_2 += mptr2[k+4] * v[k+4];
- s1_2 += mptr2[k+5] * v[k+5];
- s2_2 += mptr2[k+6] * v[k+6];
- s3_2 += mptr2[k+7] * v[k+7];
- }
-
- switch( i - k ) {
- case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
- case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
- case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
- case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
- case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
- case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
- case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
- }
- // reassociate these adds to shorten the dependency chain
- s3 += s2;
- s1 += s0;
- sum = s1 + s3;
-
- s3_2 += s2_2;
- s1_2 += s0_2;
- sum_2 = s1_2 + s3_2;
-
- mptr[i] = ( mptr[i] - sum ) * d;
- mptr2[i] = ( mptr2[i] - sum_2 ) * d;
-
- mptr += nc*2;
- mptr2 += nc*2;
- }
- // cleanup
- for ( ; j < n; j++ ) {
- s0 = mptr[0] * v[0];
- s1 = mptr[1] * v[1];
- s2 = mptr[2] * v[2];
- s3 = mptr[3] * v[3];
- for ( k = 4; k < i-7; k += 8 ) {
- s0 += mptr[k+0] * v[k+0];
- s1 += mptr[k+1] * v[k+1];
- s2 += mptr[k+2] * v[k+2];
- s3 += mptr[k+3] * v[k+3];
- s0 += mptr[k+4] * v[k+4];
- s1 += mptr[k+5] * v[k+5];
- s2 += mptr[k+6] * v[k+6];
- s3 += mptr[k+7] * v[k+7];
- }
- switch( i - k ) {
- case 7: s0 += mptr[k+6] * v[k+6];
- case 6: s1 += mptr[k+5] * v[k+5];
- case 5: s2 += mptr[k+4] * v[k+4];
- case 4: s3 += mptr[k+3] * v[k+3];
- case 3: s0 += mptr[k+2] * v[k+2];
- case 2: s1 += mptr[k+1] * v[k+1];
- case 1: s2 += mptr[k+0] * v[k+0];
- }
- // reassociate these adds to shorten the dependency chain
- s3 += s2;
- s1 += s0;
- sum = s1 + s3;
- mptr[i] = ( mptr[i] - sum ) * d;
- mptr += nc;
- }
- }
- return true;
- }
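- /*
- 	For reference, what the unrolled code above computes, written as a plain LDL' factorization:
- 	the strictly lower triangle of mat ends up holding L, mat[i][i] holds D[i], and invDiag[i] holds
- 	1 / D[i]. Illustrative sketch only (the helper name is made up), kept out of the build.
- */
- #if 0
- static bool LDLTFactor_ScalarSketch( idMatX &mat, idVecX &invDiag, const int n ) {
- 	for ( int i = 0; i < n; i++ ) {
- 		float *mptr = mat[i];
- 		// D[i] = a(i,i) - sum of L[i][k] * D[k] * L[i][k] for k < i
- 		float sum = mptr[i];
- 		for ( int k = 0; k < i; k++ ) {
- 			sum -= mat[k][k] * mptr[k] * mptr[k];
- 		}
- 		if ( sum == 0.0f ) {
- 			return false;
- 		}
- 		mptr[i] = sum;
- 		invDiag[i] = 1.0f / sum;
- 		// L[j][i] = ( a(j,i) - sum of L[j][k] * D[k] * L[i][k] for k < i ) / D[i]
- 		for ( int j = i + 1; j < n; j++ ) {
- 			float *row = mat[j];
- 			float s = row[i];
- 			for ( int k = 0; k < i; k++ ) {
- 				s -= row[k] * mat[k][k] * mptr[k];
- 			}
- 			row[i] = s * invDiag[i];
- 		}
- 	}
- 	return true;
- }
- #endif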
- #endif /* ENABLE_LOWER_TRIANGULAR */
- #ifdef LIVE_VICARIOUSLY
- /*
- ============
- idSIMD_AltiVec::BlendJoints
- ============
- */
- void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
- int i;
-
- // since lerp is a constant, we can special-case the two extreme values
- if ( lerp <= 0.0f ) {
- // lerp == 0 leaves joints unchanged, so there is no work to do; just return
- return;
- }
-
- if ( lerp >= 1.0f ) {
- // lerp == 1 copies every q and t straight from blendJoints to joints
- memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
- return;
- }
- vector float vecLerp = loadSplatUnalignedScalar( &lerp );
- vector float zeroVector = (vector float)(0);
-
- for ( i = 0; i+3 < numJoints; i+=4 ) {
- int j = index[i];
- int j2 = index[i+1];
- int j3 = index[i+2];
- int j4 = index[i+3];
-
- // slerp
- const float *jointPtr = joints[j].q.ToFloatPtr();
- const float *blendPtr = blendJoints[j].q.ToFloatPtr();
- const float *jointPtr2 = joints[j2].q.ToFloatPtr();
- const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
- const float *jointPtr3 = joints[j3].q.ToFloatPtr();
- const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
- const float *jointPtr4 = joints[j4].q.ToFloatPtr();
- const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
-
- vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
- vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
- vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
- vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
- vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
- vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
- vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
- vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
- vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
- vector float v12, v13, v14, v15, v16;
- vector float vecFromX, vecFromY, vecFromZ, vecFromW;
- vector float vecToX, vecToY, vecToZ, vecToW;
-
- // load up the idJointQuats from joints
- v0 = vec_ld( 0, jointPtr );
- v1 = vec_ld( 15, jointPtr );
- v2 = vec_perm( v0, v1, permVec );
-
- v3 = vec_ld( 0, jointPtr2 );
- v4 = vec_ld( 15, jointPtr2 );
- v5 = vec_perm( v3, v4, permVec2 );
-
- v6 = vec_ld( 0, jointPtr3 );
- v7 = vec_ld( 15, jointPtr3 );
- v8 = vec_perm( v6, v7, permVec3 );
- v9 = vec_ld( 0, jointPtr4 );
- v10 = vec_ld( 15, jointPtr4 );
- v11 = vec_perm( v9, v10, permVec4 );
-
- // planarizing, so put each x y z w into its own vector
- v0 = vec_mergeh( v2, v8 );
- v1 = vec_mergeh( v5, v11 );
- v3 = vec_mergel( v2, v8 );
- v4 = vec_mergel( v5, v11 );
- vecFromX = vec_mergeh( v0, v1 );
- vecFromY = vec_mergel( v0, v1 );
- vecFromZ = vec_mergeh( v3, v4 );
- vecFromW = vec_mergel( v3, v4 );
- // load up idJointQuats from blendJoints
- v5 = vec_ld( 0, blendPtr );
- v6 = vec_ld( 15, blendPtr );
- v7 = vec_perm( v5, v6, permVec5 );
-
- v8 = vec_ld( 0, blendPtr2 );
- v9 = vec_ld( 15, blendPtr2 );
- v10 = vec_perm( v8, v9, permVec6 );
-
- v11 = vec_ld( 0, blendPtr3 );
- v12 = vec_ld( 15, blendPtr3 );
- v13 = vec_perm( v11, v12, permVec7 );
- v14 = vec_ld( 0, blendPtr4 );
- v15 = vec_ld( 15, blendPtr4 );
- v16 = vec_perm( v14, v15, permVec8 );
-
- // put these into their own vectors too
- v5 = vec_mergeh( v7, v13 );
- v6 = vec_mergeh( v10, v16 );
- v8 = vec_mergel( v7, v13 );
- v9 = vec_mergel( v10, v16 );
- vecToX = vec_mergeh( v5, v6 );
- vecToY = vec_mergel( v5, v6 );
- vecToZ = vec_mergeh( v8, v9 );
- vecToW = vec_mergel( v8, v9 );
- // calculate cosom
- vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
- vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
- vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
- vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
- // if cosom < 0, negate cosom and use the negated elements of 'to'; otherwise use 'to' as-is
- vector bool int vecCmp, vecCmp2;
- vecCmp = vec_cmplt( vecCosom, zeroVector );
-
- // negate if needed
- vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
- vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
- vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
- vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
- vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
-
- // check if we need to calculate scale
- vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
- vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
- vector float vecScale1 = vec_splat( vecLerp, 0 );
-
- vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
- vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
- vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
-
- vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
- vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
-
- // see which ones we have to insert into our scale0 and scale1 vectors
- vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
- vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
-
- // multiply each element by the scale
- vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
- vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
- vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
- vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
- // multiply temp by scale and add to result
- vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
- vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
- vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
- vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
- // do a transform again to get the results back to vectors we can store out
- v5 = vec_mergeh( vecFromX, vecFromZ );
- v6 = vec_mergeh( vecFromY, vecFromW );
- v8 = vec_mergel( vecFromX, vecFromZ );
- v9 = vec_mergel( vecFromY, vecFromW );
- vecToX = vec_mergeh( v5, v6 );
- vecToY = vec_mergel( v5, v6 );
- vecToZ = vec_mergeh( v8, v9 );
- vecToW = vec_mergel( v8, v9 );
- vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
- vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
- vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
- vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
- // right rotate the input data
- vecToX = vec_perm( vecToX, vecToX, storePerm1 );
- vecToY = vec_perm( vecToY, vecToY, storePerm2 );
- vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
- vecToW = vec_perm( vecToW, vecToW, storePerm4 );
- vec_ste( vecToX, 0, (float*) jointPtr );
- vec_ste( vecToX, 4, (float*) jointPtr );
- vec_ste( vecToX, 8, (float*) jointPtr );
- vec_ste( vecToX, 12, (float*) jointPtr );
-
- vec_ste( vecToY, 0, (float*) jointPtr2 );
- vec_ste( vecToY, 4, (float*) jointPtr2 );
- vec_ste( vecToY, 8, (float*) jointPtr2 );
- vec_ste( vecToY, 12, (float*) jointPtr2 );
-
- vec_ste( vecToZ, 0, (float*) jointPtr3 );
- vec_ste( vecToZ, 4, (float*) jointPtr3 );
- vec_ste( vecToZ, 8, (float*) jointPtr3 );
- vec_ste( vecToZ, 12, (float*) jointPtr3 );
- vec_ste( vecToW, 0, (float*) jointPtr4 );
- vec_ste( vecToW, 4, (float*) jointPtr4 );
- vec_ste( vecToW, 8, (float*) jointPtr4 );
- vec_ste( vecToW, 12, (float*) jointPtr4 );
- // lerp is v1 + l * ( v2 - v1 );
- // the idVec3 t immediately follows the idQuat q in an idJointQuat, so jointPtr + 4 addresses it without calling ToFloatPtr() again
- float *jointVecPtr = (float*)( jointPtr + 4 );
- float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
- float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
- float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
- v0 = vec_ld( 0, jointVecPtr );
- v1 = vec_ld( 11, jointVecPtr );
- vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
-
- v2 = vec_ld( 0, jointVecPtr2 );
- v3 = vec_ld( 11, jointVecPtr2 );
- vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
-
- v4 = vec_ld( 0, jointVecPtr3 );
- v5 = vec_ld( 11, jointVecPtr3 );
- vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
- v6 = vec_ld( 0, jointVecPtr4 );
- v7 = vec_ld( 11, jointVecPtr4 );
- vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
-
- vector float vecVecX, vecVecY, vecVecZ;
- vecVecX = vecVecY = vecVecZ = zeroVector;
- // planarize
- v0 = vec_mergeh( vecLd1, vecLd3 );
- v1 = vec_mergeh( vecLd2, vecLd4 );
- v3 = vec_mergel( vecLd1, vecLd3 );
- v4 = vec_mergel( vecLd2, vecLd4 );
- vecVecX = vec_mergeh( v0, v1 );
- vecVecY = vec_mergel( v0, v1 );
- vecVecZ = vec_mergeh( v3, v4 );
- // load blend joint idvec3's
- float *blendVecPtr = (float*)( blendPtr + 4 );
- float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
- float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
- float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
- v0 = vec_ld( 0, blendVecPtr );
- v1 = vec_ld( 11, blendVecPtr );
- vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
-
- v2 = vec_ld( 0, blendVecPtr2 );
- v3 = vec_ld( 11, blendVecPtr2 );
- vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
-
- v4 = vec_ld( 0, blendVecPtr3 );
- v5 = vec_ld( 11, blendVecPtr3 );
- vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
- v6 = vec_ld( 0, blendVecPtr4 );
- v7 = vec_ld( 11, blendVecPtr4 );
- vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
- vector float vecBlendX, vecBlendY, vecBlendZ;
- vecBlendX = vecBlendY = vecBlendZ = zeroVector;
- // planarize
- v0 = vec_mergeh( vecLd5, vecLd7 );
- v1 = vec_mergeh( vecLd6, vecLd8 );
- v3 = vec_mergel( vecLd5, vecLd7 );
- v4 = vec_mergel( vecLd6, vecLd8 );
- vecBlendX = vec_mergeh( v0, v1 );
- vecBlendY = vec_mergel( v0, v1 );
- vecBlendZ = vec_mergeh( v3, v4 );
- // do subtraction
- vecWork1 = vec_sub( vecBlendX, vecVecX );
- vecWork2 = vec_sub( vecBlendY, vecVecY );
- vecWork3 = vec_sub( vecBlendZ, vecVecZ );
-
- // multiply by lerp and add to v1
- vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
- vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
- vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
-
- // put it back in original form
- v0 = vec_mergeh( vecVecX, vecVecZ );
- v1 = vec_mergeh( vecVecY, zeroVector );
- v3 = vec_mergel( vecVecX, vecVecZ );
- v4 = vec_mergel( vecVecY, zeroVector );
-
- // generate vectors to store
- vecWork1 = vec_mergeh( v0, v1 );
- vecWork2 = vec_mergel( v0, v1 );
- vecWork3 = vec_mergeh( v3, v4 );
- vector float vecWork4 = vec_mergel( v3, v4 );
- // store the T values
- storePerm1 = vec_lvsr( 0, jointVecPtr );
- storePerm2 = vec_lvsr( 0, jointVecPtr2 );
- storePerm3 = vec_lvsr( 0, jointVecPtr3 );
- storePerm4 = vec_lvsr( 0, jointVecPtr4 );
-
- // right rotate the input data
- vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
- vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
- vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
- vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
- vec_ste( vecWork1, 0, (float*) jointVecPtr );
- vec_ste( vecWork1, 4, (float*) jointVecPtr );
- vec_ste( vecWork1, 8, (float*) jointVecPtr );
- vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
- vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
- vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
-
- vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
- vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
- vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
-
- vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
- vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
- vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
- }
-
- // cleanup
- for ( ; i < numJoints; i++ ) {
- int j = index[i];
- joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
- joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
- }
- }
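- /*
- 	For reference, the per-joint math the vector code above implements is an ordinary quaternion slerp
- 	followed by a linear blend of the translation, exactly what the cleanup loop does through
- 	idQuat::Slerp and idVec3::Lerp. An illustrative scalar sketch of that math (the helper name is
- 	made up for this sketch and the block is kept out of the build):
- */
- #if 0
- static void BlendOneJoint_ScalarSketch( idJointQuat &joint, const idJointQuat &blend, const float lerp ) {
- 	float cosom = joint.q.x * blend.q.x + joint.q.y * blend.q.y + joint.q.z * blend.q.z + joint.q.w * blend.q.w;
- 	float sign = ( cosom < 0.0f ) ? -1.0f : 1.0f;
- 	cosom *= sign;
- 	float scale0, scale1;
- 	if ( ( 1.0f - cosom ) > 1e-6f ) {
- 		// far enough apart for a true slerp, same formulation as the vector path above
- 		float sinSqr = 1.0f - cosom * cosom;
- 		float invSin = 1.0f / sqrtf( sinSqr );
- 		float omega = atan2f( sinSqr * invSin, cosom );
- 		scale0 = sinf( ( 1.0f - lerp ) * omega ) * invSin;
- 		scale1 = sinf( lerp * omega ) * invSin;
- 	} else {
- 		// nearly parallel quaternions: fall back to a plain lerp
- 		scale0 = 1.0f - lerp;
- 		scale1 = lerp;
- 	}
- 	joint.q.x = scale0 * joint.q.x + scale1 * sign * blend.q.x;
- 	joint.q.y = scale0 * joint.q.y + scale1 * sign * blend.q.y;
- 	joint.q.z = scale0 * joint.q.z + scale1 * sign * blend.q.z;
- 	joint.q.w = scale0 * joint.q.w + scale1 * sign * blend.q.w;
- 	joint.t = joint.t + lerp * ( blend.t - joint.t );
- }
- #endif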
- /*
- ============
- idSIMD_AltiVec::ConvertJointQuatsToJointMats
- ============
- */
- // SSE doesn't vectorize this, and I don't think we should either. It's mainly just copying data; there's very little math involved and
- // it's not easily parallelizable
- void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
-
- for ( int i = 0; i < numJoints; i++ ) {
- const float *q = jointQuats[i].q.ToFloatPtr();
- float *m = jointMats[i].ToFloatPtr();
- m[0*4+3] = q[4];
- m[1*4+3] = q[5];
- m[2*4+3] = q[6];
- float x2 = q[0] + q[0];
- float y2 = q[1] + q[1];
- float z2 = q[2] + q[2];
- {
- float xx = q[0] * x2;
- float yy = q[1] * y2;
- float zz = q[2] * z2;
- m[0*4+0] = 1.0f - yy - zz;
- m[1*4+1] = 1.0f - xx - zz;
- m[2*4+2] = 1.0f - xx - yy;
- }
- {
- float yz = q[1] * z2;
- float wx = q[3] * x2;
- m[2*4+1] = yz - wx;
- m[1*4+2] = yz + wx;
- }
- {
- float xy = q[0] * y2;
- float wz = q[3] * z2;
- m[1*4+0] = xy - wz;
- m[0*4+1] = xy + wz;
- }
- {
- float xz = q[0] * z2;
- float wy = q[3] * y2;
- m[0*4+2] = xz - wy;
- m[2*4+0] = xz + wy;
- }
- }
- }
- /*
- ============
- idSIMD_AltiVec::ConvertJointMatsToJointQuats
- ============
- */
- void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
- int index;
- // Since we use very little of the data we have to pull in for the altivec version, we end up with
- // a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
- // of InvSqrt for the G5, and made it use that instead. With only this change, we get a bit
- // more than a 50% speedup, which is not too shabby. Should really replace idMath::InvSqrt with
- // my function so everyone can benefit on G5.
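- // (FastScalarInvSqrt is defined earlier in this file; the usual shape of such a routine, as an
- // illustrative note and not necessarily that exact implementation, is a hardware reciprocal square
- // root estimate refined by Newton-Raphson steps of the form y = y * ( 1.5f - 0.5f * x * y * y ))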
-
- for ( index = 0; index < numJoints; index++ ) {
- idJointQuat jq;
- float trace;
- float s;
- float t;
- int i;
- int j;
- int k;
-
- static int next[3] = { 1, 2, 0 };
- float *mat = (float*)( jointMats[index].ToFloatPtr() );
- trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
- if ( trace > 0.0f ) {
- t = trace + 1.0f;
- //s = idMath::InvSqrt( t ) * 0.5f;
- s = FastScalarInvSqrt( t ) * 0.5f;
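- // with t = trace + 1 and s = 0.5 / sqrt( t ), this is the standard extraction:
- // q.w = s * t = 0.5 * sqrt( trace + 1 ), and the off-diagonal differences times s give q.x, q.y, q.z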
- jq.q[3] = s * t;
- jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
- jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
- jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
- } else {
- i = 0;
- if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
- i = 1;
- }
- if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
- i = 2;
- }
- j = next[i];
- k = next[j];
- t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
- //s = idMath::InvSqrt( t ) * 0.5f;
- s = FastScalarInvSqrt( t ) * 0.5f;
- jq.q[i] = s * t;
- jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
- jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
- jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
- }
- jq.t[0] = mat[0 * 4 + 3];
- jq.t[1] = mat[1 * 4 + 3];
- jq.t[2] = mat[2 * 4 + 3];
- jointQuats[index] = jq;
- }
- }
- /*
- ============
- idSIMD_AltiVec::TransformJoints
- ============
- */
- void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
- int i;
- #if 0
- for( i = firstJoint; i <= lastJoint; i++ ) {
- assert( parents[i] < i );
- jointMats[i] *= jointMats[parents[i]];
- }
- #else
- // I don't think you can unroll this, since the next iteration of the loop may depend on the result of the previous one,
- // depending on what the parents array looks like. This is true in the test code.
- for ( i = firstJoint; i <= lastJoint; i++ ) {
- assert( parents[i] < i );
- float *jointPtr = jointMats[i].ToFloatPtr();
- float *parentPtr = jointMats[parents[i]].ToFloatPtr();
- vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
- vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
-
- // we need to load up 12 float elements that make up the Mat
- v0 = vec_ld( 0, jointPtr );
- v1 = vec_ld( 15, jointPtr );
- v2 = vec_ld( 31, jointPtr );
- v3 = vec_ld( 47, jointPtr );
-
- // load parents
- v4 = vec_ld( 0, parentPtr );
- v5 = vec_ld( 15, parentPtr );
- v6 = vec_ld( 31, parentPtr );
- v7 = vec_ld( 47, parentPtr );
-
- // permute into vectors
- vector float vecJointMat1 = vec_perm( v0, v1, permVec );
- vector float vecJointMat2 = vec_perm( v1, v2, permVec );
- vector float vecJointMat3 = vec_perm( v2, v3, permVec );
-
- vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
- vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
- vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
- vector float zero = (vector float)(0);
- vector float C1, C2, C3;
- // matrix multiply
- C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
- C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(4 to 7) * a(4)
- C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
-
- C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
- C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
- C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
-
- C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
- C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
- C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
- // do the addition at the end
- vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
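- // permZeroAndLast selects { 0, 0, 0, last element of the parent row }, so only the parent's
- // translation column is added to each result row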
- C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
- C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
- C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
- // store results
- UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
- }
- #endif
- }
- /*
- ============
- idSIMD_AltiVec::UntransformJoints
- ============
- */
- void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
- int i;
- #if 0
- for( i = lastJoint; i >= firstJoint; i-- ) {
- assert( parents[i] < i );
- jointMats[i] /= jointMats[parents[i]];
- }
- #else
- // I don't think you can unroll this, since the next iteration of the loop may depend on the result of the previous one,
- // depending on what the parents array looks like. This is true in the test code.
- for ( i = lastJoint; i >= firstJoint; i-- ) {
- assert( parents[i] < i );
- float *jointPtr = jointMats[i].ToFloatPtr();
- float *parentPtr = jointMats[parents[i]].ToFloatPtr();
- vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
- vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
-
- // we need to load up 12 float elements that make up the Mat
- v0 = vec_ld( 0, jointPtr );
- v1 = vec_ld( 15, jointPtr );
- v2 = vec_ld( 31, jointPtr );
- v3 = vec_ld( 47, jointPtr );
-
- // load parents
- v4 = vec_ld( 0, parentPtr );
- v5 = vec_ld( 15, parentPtr );
- v6 = vec_ld( 31, parentPtr );
- v7 = vec_ld( 47, parentPtr );
-
- // permute into vectors
- vector float vecJointMat1 = vec_perm( v0, v1, permVec );
- vector float vecJointMat2 = vec_perm( v1, v2, permVec );
- vector float vecJointMat3 = vec_perm( v2, v3, permVec );
-
- vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
- vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
- vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
- vector float zero = (vector float)(0);
- vector float C1, C2, C3;
-
- // do subtraction at the beginning
- vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
- vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
- vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
- vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
-
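- // with the parent translation removed above, recombining the joint rows with the splatted parent
- // columns below applies the transpose of the parent rotation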
- // matrix multiply
- C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
- C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
- C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
-
- C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
- C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
- C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
-
- C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
- C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
- C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
- // store results back
- vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
-
- // right rotate the input data
- C1 = vec_perm( C1, C1, storePerm );
- C2 = vec_perm( C2, C2, storePerm );
- C3 = vec_perm( C3, C3, storePerm );
- vec_ste( C1, 0, (float*) jointPtr );
- vec_ste( C1, 4, (float*) jointPtr );
- vec_ste( C1, 8, (float*) jointPtr );
- vec_ste( C1, 12, (float*) jointPtr );
- vec_ste( C2, 16, (float*) jointPtr );
- vec_ste( C2, 20, (float*) jointPtr );
- vec_ste( C2, 24, (float*) jointPtr );
- vec_ste( C2, 28, (float*) jointPtr );
- vec_ste( C3, 32, (float*) jointPtr );
- vec_ste( C3, 36, (float*) jointPtr );
- vec_ste( C3, 40, (float*) jointPtr );
- vec_ste( C3, 44, (float*) jointPtr );
- }
- #endif
- }
- /*
- ============
- idSIMD_AltiVec::TransformVerts
- ============
- */
- // Here we don't have much for the vector unit to do, and the gain we get from doing the math
- // in parallel is eaten by doing unaligned stores.
- void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
- int i, j;
- const byte *jointsPtr = (byte *)joints;
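- // index[] holds pairs per weight: index[j*2] is the byte offset of the joint matrix for weight j,
- // and index[j*2+1] is non-zero on the last weight belonging to the current vertex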
- for( j = i = 0; i < numVerts; i++ ) {
- idVec3 v;
- float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
- float *weightPtr = (float*) weights[j].ToFloatPtr();
-
- v[0] = matPtrOrig[0] * weightPtr[0];
- v[0] += matPtrOrig[1] * weightPtr[1];
- v[0] += matPtrOrig[2] * weightPtr[2];
- v[0] += matPtrOrig[3] * weightPtr[3];
- v[1] = matPtrOrig[4] * weightPtr[0];
- v[1] += matPtrOrig[5] * weightPtr[1];
- v[1] += matPtrOrig[6] * weightPtr[2];
- v[1] += matPtrOrig[7] * weightPtr[3];
-
- v[2] = matPtrOrig[8] * weightPtr[0];
- v[2] += matPtrOrig[9] * weightPtr[1];
- v[2] += matPtrOrig[10] * weightPtr[2];
- v[2] += matPtrOrig[11] * weightPtr[3];
- while( index[j*2+1] == 0 ) {
- j++;
- float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
- weightPtr = (float*) weights[j].ToFloatPtr();
-
- v[0] += matPtr[0] * weightPtr[0];
- v[0] += matPtr[1] * weightPtr[1];
- v[0] += matPtr[2] * weightPtr[2];
- v[0] += matPtr[3] * weightPtr[3];
-
- v[1] += matPtr[4] * weightPtr[0];
- v[1] += matPtr[5] * weightPtr[1];
- v[1] += matPtr[6] * weightPtr[2];
- v[1] += matPtr[7] * weightPtr[3];
-
- v[2] += matPtr[8] * weightPtr[0];
- v[2] += matPtr[9] * weightPtr[1];
- v[2] += matPtr[10] * weightPtr[2];
- v[2] += matPtr[11] * weightPtr[3];
- }
- j++;
- verts[i].xyz = v;
- }
- }
- #endif /* LIVE_VICARIOUSLY */
- #ifdef ENABLE_CULL
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::TracePointCull
- ============
- */
- void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
-
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
-
- byte tOr;
- tOr = 0;
- // pointers
- const float *planePtr = planes[0].ToFloatPtr();
-
- vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
- vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
- vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
- vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
- vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
- vector unsigned char vecPerm;
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
- vector float zeroVector = (vector float)(0);
- vector float vecRadius;
- vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
- vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
- vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
- vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
- vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
- vector bool int oneIntVector = (vector bool int)(1);
- vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
- vector unsigned int vecTotals;
- vector unsigned int tempIntSum;
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
-
- vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
-
- // populate planes
- v0 = vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- vecPlane0 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 4 );
- v3 = vec_ld( 15, planePtr + 4 );
- vecPlane1 = vec_perm( v2, v3, vecPerm );
- v0 = vec_ld( 0, planePtr + 8 );
- v1 = vec_ld( 15, planePtr + 8 );
- vecPlane2 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 12 );
- v3 = vec_ld( 15, planePtr + 12 );
- vecPlane3 = vec_perm( v2, v3, vecPerm );
-
- // transpose
- v0 = vec_mergeh( vecPlane0, vecPlane2 );
- v1 = vec_mergeh( vecPlane1, vecPlane3 );
- v2 = vec_mergel( vecPlane0, vecPlane2 );
- v3 = vec_mergel( vecPlane1, vecPlane3 );
- vecPlane0 = vec_mergeh( v0, v1 );
- vecPlane1 = vec_mergel( v0, v1 );
- vecPlane2 = vec_mergeh( v2, v3 );
- vecPlane3 = vec_mergel( v2, v3 );
-
- // load constants
- vecRadius = loadSplatUnalignedScalar( &radius );
-
- unsigned int cullBitVal[4];
- vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
- int i = 0;
-
- // every fourth one will have the same alignment. Make sure we've got enough here
- if ( i+3 < numVerts ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 15, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 15, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 15, vertPtr4 );
-
- vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
- vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
- vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
- vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
- vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
-
- vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
- vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
-
- vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
- vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
- vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
- vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
-
- vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
- vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
- vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
- vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
- // vec1Sum1 through vec1Sum4 now hold the four plane distances d0, d1, d2, d3 for their vertices;
- // compute d + radius and d - radius
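- // the compares and shifts below set bit k of each per-vertex value when d(k) + radius < 0 and
- // bit k+4 when d(k) - radius < 0; the final xor with 0x0F then flips the lower four bits, matching
- // the scalar cleanup loop at the end of this function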
- vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
- vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
- vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
- vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
- vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
- vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
- vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
- vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
-
- // do compare
- vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
- vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
- vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
- vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
- vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
- vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
- vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
- vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
-
- // AND with 1 so each element is 0 or 1 rather than all ones
- vecCmp1 = vec_and( vecCmp1, oneIntVector );
- vecCmp2 = vec_and( vecCmp2, oneIntVector );
- vecCmp3 = vec_and( vecCmp3, oneIntVector );
- vecCmp4 = vec_and( vecCmp4, oneIntVector );
- vecCmp5 = vec_and( vecCmp5, oneIntVector );
- vecCmp6 = vec_and( vecCmp6, oneIntVector );
- vecCmp7 = vec_and( vecCmp7, oneIntVector );
- vecCmp8 = vec_and( vecCmp8, oneIntVector );
-
- vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
- vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
- vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
- vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
- vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
- vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
- vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
- vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
-
- // OR (add) them all together
- vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
- vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
- vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
- vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
-
- vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
- vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
- tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
- tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
- vecTotals = vec_mergeh( vecTotals, tempIntSum );
- tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
- tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
- vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
- tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
- tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
- vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
-
- // store out results
- vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
- tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
- vec_ste( tempSt, 0, &cullBitVal[0] );
- vec_ste( tempSt, 4, &cullBitVal[0] );
- vec_ste( tempSt, 8, &cullBitVal[0] );
- vec_ste( tempSt, 12, &cullBitVal[0] );
-
- tOr |= cullBitVal[0];
- tOr |= cullBitVal[1];
- tOr |= cullBitVal[2];
- tOr |= cullBitVal[3];
-
- cullBits[i] = cullBitVal[0];
- cullBits[i+1] = cullBitVal[1];
- cullBits[i+2] = cullBitVal[2];
- cullBits[i+3] = cullBitVal[3];
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- byte bits;
- float d0, d1, d2, d3, t;
- const idVec3 &v = verts[i].xyz;
- d0 = planes[0].Distance( v );
- d1 = planes[1].Distance( v );
- d2 = planes[2].Distance( v );
- d3 = planes[3].Distance( v );
- t = d0 + radius;
- bits = FLOATSIGNBITSET( t ) << 0;
- t = d1 + radius;
- bits |= FLOATSIGNBITSET( t ) << 1;
- t = d2 + radius;
- bits |= FLOATSIGNBITSET( t ) << 2;
- t = d3 + radius;
- bits |= FLOATSIGNBITSET( t ) << 3;
- t = d0 - radius;
- bits |= FLOATSIGNBITSET( t ) << 4;
- t = d1 - radius;
- bits |= FLOATSIGNBITSET( t ) << 5;
- t = d2 - radius;
- bits |= FLOATSIGNBITSET( t ) << 6;
- t = d3 - radius;
- bits |= FLOATSIGNBITSET( t ) << 7;
- bits ^= 0x0F; // flip lower four bits
- tOr |= bits;
- cullBits[i] = bits;
- }
- totalOr = tOr;
- }
- #else
- /*
- ============
- idSIMD_AltiVec::TracePointCull
- ============
- */
- void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
-
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
-
- byte tOr;
- tOr = 0;
- // pointers
- const float *planePtr = planes[0].ToFloatPtr();
-
- vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
- vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
- vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
- vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
- vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
- vector unsigned char vecPerm;
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
- vector float zeroVector = (vector float)(0);
- vector float vecRadius;
- vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
- vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
- vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
- vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
- vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
- vector bool int oneIntVector = (vector bool int)(1);
- vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
- vector unsigned int vecTotals;
- vector unsigned int tempIntSum;
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
-
- vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
-
- // populate planes
- v0 = vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- vecPlane0 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 4 );
- v3 = vec_ld( 15, planePtr + 4 );
- vecPlane1 = vec_perm( v2, v3, vecPerm );
- v0 = vec_ld( 0, planePtr + 8 );
- v1 = vec_ld( 15, planePtr + 8 );
- vecPlane2 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 12 );
- v3 = vec_ld( 15, planePtr + 12 );
- vecPlane3 = vec_perm( v2, v3, vecPerm );
-
- // transpose
- v0 = vec_mergeh( vecPlane0, vecPlane2 );
- v1 = vec_mergeh( vecPlane1, vecPlane3 );
- v2 = vec_mergel( vecPlane0, vecPlane2 );
- v3 = vec_mergel( vecPlane1, vecPlane3 );
- vecPlane0 = vec_mergeh( v0, v1 );
- vecPlane1 = vec_mergel( v0, v1 );
- vecPlane2 = vec_mergeh( v2, v3 );
- vecPlane3 = vec_mergel( v2, v3 );
-
- // load constants
- vecRadius = loadSplatUnalignedScalar( &radius );
-
- unsigned int cullBitVal[4];
- vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
- int i = 0;
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- vecXYZ1 = vec_ld( 0, vertPtr );
- vecXYZ2 = vec_ld( 0, vertPtr2 );
- vecXYZ3 = vec_ld( 0, vertPtr3 );
- vecXYZ4 = vec_ld( 0, vertPtr4 );
-
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
- vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
-
- vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
- vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
-
- vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
- vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
- vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
- vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
-
- vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
- vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
- vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
- vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
- // vec1Sum1 through vec1Sum4 now hold the four plane distances d0, d1, d2, d3 for their vertices;
- // compute d + radius and d - radius
- vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
- vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
- vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
- vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
- vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
- vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
- vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
- vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
-
- // do compare
- vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
- vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
- vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
- vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
- vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
- vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
- vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
- vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
-
- // AND with 1 so each element is 0 or 1 rather than all ones
- vecCmp1 = vec_and( vecCmp1, oneIntVector );
- vecCmp2 = vec_and( vecCmp2, oneIntVector );
- vecCmp3 = vec_and( vecCmp3, oneIntVector );
- vecCmp4 = vec_and( vecCmp4, oneIntVector );
- vecCmp5 = vec_and( vecCmp5, oneIntVector );
- vecCmp6 = vec_and( vecCmp6, oneIntVector );
- vecCmp7 = vec_and( vecCmp7, oneIntVector );
- vecCmp8 = vec_and( vecCmp8, oneIntVector );
-
- vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
- vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
- vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
- vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
- vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
- vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
- vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
- vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
-
- // OR (add) them all together
- vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
- vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
- vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
- vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
-
- vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
- vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
- tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
- tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
- vecTotals = vec_mergeh( vecTotals, tempIntSum );
- tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
- tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
- vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
- tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
- tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
- vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
-
- // store out results
- vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
- tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
- vec_ste( tempSt, 0, &cullBitVal[0] );
- vec_ste( tempSt, 4, &cullBitVal[0] );
- vec_ste( tempSt, 8, &cullBitVal[0] );
- vec_ste( tempSt, 12, &cullBitVal[0] );
-
- tOr |= cullBitVal[0];
- tOr |= cullBitVal[1];
- tOr |= cullBitVal[2];
- tOr |= cullBitVal[3];
-
- cullBits[i] = cullBitVal[0];
- cullBits[i+1] = cullBitVal[1];
- cullBits[i+2] = cullBitVal[2];
- cullBits[i+3] = cullBitVal[3];
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- byte bits;
- float d0, d1, d2, d3, t;
- const idVec3 &v = verts[i].xyz;
- d0 = planes[0].Distance( v );
- d1 = planes[1].Distance( v );
- d2 = planes[2].Distance( v );
- d3 = planes[3].Distance( v );
- t = d0 + radius;
- bits = FLOATSIGNBITSET( t ) << 0;
- t = d1 + radius;
- bits |= FLOATSIGNBITSET( t ) << 1;
- t = d2 + radius;
- bits |= FLOATSIGNBITSET( t ) << 2;
- t = d3 + radius;
- bits |= FLOATSIGNBITSET( t ) << 3;
- t = d0 - radius;
- bits |= FLOATSIGNBITSET( t ) << 4;
- t = d1 - radius;
- bits |= FLOATSIGNBITSET( t ) << 5;
- t = d2 - radius;
- bits |= FLOATSIGNBITSET( t ) << 6;
- t = d3 - radius;
- bits |= FLOATSIGNBITSET( t ) << 7;
- bits ^= 0x0F; // flip lower four bits
- tOr |= bits;
- cullBits[i] = bits;
- }
- totalOr = tOr;
- }
- #endif /* DRAWVERT_PADDED */
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::DecalPointCull
- ============
- */
- void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
-
- int i;
- const float *planePtr = planes[0].ToFloatPtr();
-
- vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
- vector float zeroVector = (vector float)(0.0);
- vector unsigned char vecPerm;
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
-
- vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
-
- // populate planes
- v0 = vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- vecPlane0 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 4 );
- v3 = vec_ld( 15, planePtr + 4 );
- vecPlane1 = vec_perm( v2, v3, vecPerm );
- v0 = vec_ld( 0, planePtr + 8 );
- v1 = vec_ld( 15, planePtr + 8 );
- vecPlane2 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 12 );
- v3 = vec_ld( 15, planePtr + 12 );
- vecPlane3 = vec_perm( v2, v3, vecPerm );
-
- v0 = vec_ld( 0, planePtr + 16 );
- v1 = vec_ld( 15, planePtr + 16 );
- vecPlane4 = vec_perm( v0, v1, vecPerm );
-
- v2 = vec_ld( 0, planePtr + 20 );
- v3 = vec_ld( 15, planePtr + 20 );
- vecPlane5 = vec_perm( v2, v3, vecPerm );
-
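- 	// transpose so each vecPlaneN holds one component (x, y, z or d) of the planes:
- 	// vecPlane0-3 cover planes 0-3, vecPlane4-7 cover planes 4-5 padded with zeros,
- 	// letting a vertex be tested against several planes at once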
- // transpose
- v0 = vec_mergeh( vecPlane0, vecPlane2 );
- v1 = vec_mergeh( vecPlane1, vecPlane3 );
- v2 = vec_mergel( vecPlane0, vecPlane2 );
- v3 = vec_mergel( vecPlane1, vecPlane3 );
- vecPlane0 = vec_mergeh( v0, v1 );
- vecPlane1 = vec_mergel( v0, v1 );
- vecPlane2 = vec_mergeh( v2, v3 );
- vecPlane3 = vec_mergel( v2, v3 );
-
- v0 = vec_mergeh( vecPlane4, zeroVector );
- v1 = vec_mergeh( vecPlane5, zeroVector );
- v2 = vec_mergel( vecPlane4, zeroVector );
- v3 = vec_mergel( vecPlane5, zeroVector );
- vecPlane4 = vec_mergeh( v0, v1 );
- vecPlane5 = vec_mergel( v0, v1 );
- vecPlane6 = vec_mergeh( v2, v3 );
- vecPlane7 = vec_mergel( v2, v3 );
-
-
- vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
- vector bool int oneIntVector = (vector bool int)(1);
- vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
- vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
- vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
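- 	// per-lane shift amounts: plane k's 0/1 compare result is shifted into bit k, building
- 	// a 6-bit cull mask per vertex (the last two lanes of vecShift2 are unused)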
-
- vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
- vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
- vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
- vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
- vector unsigned int vecR1, vecR2, vecR3, vecR4;
- vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
- unsigned int vBits[4];
- vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
-
- i = 0;
- // every fourth one will have the same alignment. Make sure we've got enough here
- if ( i+3 < numVerts ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 15, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 15, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 15, vertPtr4 );
-
- vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
- vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
- vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
- vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
-
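- 		// evaluate the plane equations for each vertex: splat x, y and z and multiply-accumulate
- 		// against the transposed plane rows, then add the distance row. SumN1 covers planes 0-3,
- 		// SumN2 covers planes 4-5 (padded with zeros)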
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
- vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
-
- vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
- vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
-
- vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
- vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
- vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
- vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
-
- vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
- vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
- vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
- vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
-
- vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
- vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
- vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
- vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
-
- vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
- vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
- vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
- vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
-
- vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
- vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
- vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
- vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
-
- vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
- vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
- vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
- vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
- vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
- vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
- vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
- vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
- vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
- vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
- vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
- vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
-
- 		// AND with 1 so each lane holds 0 or 1 instead of the all-ones compare mask
- vecCmp1 = vec_and( vecCmp1, oneIntVector );
- vecCmp2 = vec_and( vecCmp2, oneIntVector );
- vecCmp3 = vec_and( vecCmp3, oneIntVector );
- vecCmp4 = vec_and( vecCmp4, oneIntVector );
- vecCmp5 = vec_and( vecCmp5, oneIntVector );
- vecCmp6 = vec_and( vecCmp6, oneIntVector );
- vecCmp7 = vec_and( vecCmp7, oneIntVector );
- vecCmp8 = vec_and( vecCmp8, oneIntVector );
- vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
- vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
- vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
- vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
- vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
- vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
- vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
- vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
-
- 		// OR them all together (adding works here, since each value has only one bit set)
- vecR1 = (vector unsigned int)(0); //zeroIntVector;
- vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
- vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
- vecR1 = vec_add(vecR1, vecBitShifted2 );
- vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
-
- vecR2 = (vector unsigned int)(0); //zeroIntVector;
- vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
- vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
- vecR2 = vec_add(vecR2, vecBitShifted4 );
- vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
-
- vecR3 = (vector unsigned int)(0); //zeroIntVector;
- vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
- vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
- vecR3 = vec_add(vecR3, vecBitShifted6 );
- vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
-
- vecR4 = (vector unsigned int)(0); //zeroIntVector;
- vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
- vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
- vecR4 = vec_add(vecR4, vecBitShifted8 );
- vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
-
- // take the first element from each vector and put them into vecR1
- vecR1 = vec_mergeh( vecR1, vecR2 );
- vecR3 = vec_mergeh( vecR3, vecR4 );
- vecR1 = vec_perm( vecR1, vecR3, permHalves );
- // XOR with 0x3F to flip lower 6 bits
- vecR1 = vec_xor( vecR1, vecFlipBits );
-
- 		// store out results. we don't produce 16 results at a time, so store the
- 		// four words individually with vec_ste to avoid alignment concerns
- vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
- vec_ste( vecR1, 0, &vBits[0] );
- vec_ste( vecR1, 4, &vBits[0] );
- vec_ste( vecR1, 8, &vBits[0] );
- vec_ste( vecR1, 12, &vBits[0] );
-
- cullBits[i] = vBits[0];
- cullBits[i+1] = vBits[1];
- cullBits[i+2] = vBits[2];
- cullBits[i+3] = vBits[3];
- }
-
- for ( ; i < numVerts; i++ ) {
- byte bits;
- float d0, d1, d2, d3, d4, d5;
- const idVec3 &v = verts[i].xyz;
- d0 = planes[0].Distance( v );
- d1 = planes[1].Distance( v );
- d2 = planes[2].Distance( v );
- d3 = planes[3].Distance( v );
- d4 = planes[4].Distance( v );
- d5 = planes[5].Distance( v );
- 		// FLOATSIGNBITSET checks whether the sign bit is set by reinterpreting the float as a long and shifting right 31 places.
- bits = FLOATSIGNBITSET( d0 ) << 0;
- bits |= FLOATSIGNBITSET( d1 ) << 1;
- bits |= FLOATSIGNBITSET( d2 ) << 2;
- bits |= FLOATSIGNBITSET( d3 ) << 3;
- bits |= FLOATSIGNBITSET( d4 ) << 4;
- bits |= FLOATSIGNBITSET( d5 ) << 5;
- cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::DecalPointCull
- ============
- */
- void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
-
- int i;
- const float *planePtr = planes[0].ToFloatPtr();
-
- vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
- vector float zeroVector = (vector float)(0.0);
- vector unsigned char vecPerm;
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
-
- vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
-
- // populate planes
- v0 = vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- vecPlane0 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 4 );
- v3 = vec_ld( 15, planePtr + 4 );
- vecPlane1 = vec_perm( v2, v3, vecPerm );
- v0 = vec_ld( 0, planePtr + 8 );
- v1 = vec_ld( 15, planePtr + 8 );
- vecPlane2 = vec_perm( v0, v1, vecPerm );
- v2 = vec_ld( 0, planePtr + 12 );
- v3 = vec_ld( 15, planePtr + 12 );
- vecPlane3 = vec_perm( v2, v3, vecPerm );
-
- v0 = vec_ld( 0, planePtr + 16 );
- v1 = vec_ld( 15, planePtr + 16 );
- vecPlane4 = vec_perm( v0, v1, vecPerm );
-
- v2 = vec_ld( 0, planePtr + 20 );
- v3 = vec_ld( 15, planePtr + 20 );
- vecPlane5 = vec_perm( v2, v3, vecPerm );
-
- // transpose
- v0 = vec_mergeh( vecPlane0, vecPlane2 );
- v1 = vec_mergeh( vecPlane1, vecPlane3 );
- v2 = vec_mergel( vecPlane0, vecPlane2 );
- v3 = vec_mergel( vecPlane1, vecPlane3 );
- vecPlane0 = vec_mergeh( v0, v1 );
- vecPlane1 = vec_mergel( v0, v1 );
- vecPlane2 = vec_mergeh( v2, v3 );
- vecPlane3 = vec_mergel( v2, v3 );
-
- v0 = vec_mergeh( vecPlane4, zeroVector );
- v1 = vec_mergeh( vecPlane5, zeroVector );
- v2 = vec_mergel( vecPlane4, zeroVector );
- v3 = vec_mergel( vecPlane5, zeroVector );
- vecPlane4 = vec_mergeh( v0, v1 );
- vecPlane5 = vec_mergel( v0, v1 );
- vecPlane6 = vec_mergeh( v2, v3 );
- vecPlane7 = vec_mergel( v2, v3 );
-
-
- vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
- vector bool int oneIntVector = (vector bool int)(1);
- vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
- vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
- vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
-
- vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
- vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
- vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
- vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
- vector unsigned int vecR1, vecR2, vecR3, vecR4;
- vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
- unsigned int vBits[4];
- vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
-
- i = 0;
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- 		// padded idDrawVerts are 16-byte aligned, so the xyz data can be loaded directly
- 		vecXYZ1 = vec_ld( 0, vertPtr );
- 		vecXYZ2 = vec_ld( 0, vertPtr2 );
- 		vecXYZ3 = vec_ld( 0, vertPtr3 );
- 		vecXYZ4 = vec_ld( 0, vertPtr4 );
-
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
- vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
- vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
-
- vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
- vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
- vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
-
- vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
- vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
- vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
- vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
-
- vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
- vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
- vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
- vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
-
- vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
- vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
- vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
- vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
-
- vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
- vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
- vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
- vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
-
- vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
- vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
- vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
- vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
-
- vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
- vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
- vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
- vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
- vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
- vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
- vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
- vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
- vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
- vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
- vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
- vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
-
- 		// AND with 1 so each lane holds 0 or 1 instead of the all-ones compare mask
- vecCmp1 = vec_and( vecCmp1, oneIntVector );
- vecCmp2 = vec_and( vecCmp2, oneIntVector );
- vecCmp3 = vec_and( vecCmp3, oneIntVector );
- vecCmp4 = vec_and( vecCmp4, oneIntVector );
- vecCmp5 = vec_and( vecCmp5, oneIntVector );
- vecCmp6 = vec_and( vecCmp6, oneIntVector );
- vecCmp7 = vec_and( vecCmp7, oneIntVector );
- vecCmp8 = vec_and( vecCmp8, oneIntVector );
- vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
- vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
- vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
- vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
- vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
- vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
- vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
- vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
-
- 		// OR them all together (adding works here, since each value has only one bit set)
- vecR1 = (vector unsigned int)(0); //zeroIntVector;
- vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
- vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
- vecR1 = vec_add(vecR1, vecBitShifted2 );
- vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
-
- vecR2 = (vector unsigned int)(0); //zeroIntVector;
- vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
- vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
- vecR2 = vec_add(vecR2, vecBitShifted4 );
- vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
-
- vecR3 = (vector unsigned int)(0); //zeroIntVector;
- vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
- vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
- vecR3 = vec_add(vecR3, vecBitShifted6 );
- vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
-
- vecR4 = (vector unsigned int)(0); //zeroIntVector;
- vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
- vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
- vecR4 = vec_add(vecR4, vecBitShifted8 );
- vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
-
- // take the first element from each vector and put them into vecR1
- vecR1 = vec_mergeh( vecR1, vecR2 );
- vecR3 = vec_mergeh( vecR3, vecR4 );
- vecR1 = vec_perm( vecR1, vecR3, permHalves );
- // XOR with 0x3F to flip lower 6 bits
- vecR1 = vec_xor( vecR1, vecFlipBits );
-
- 		// store out results. we don't produce 16 results at a time, so store the
- 		// four words individually with vec_ste to avoid alignment concerns
- vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
- vec_ste( vecR1, 0, &vBits[0] );
- vec_ste( vecR1, 4, &vBits[0] );
- vec_ste( vecR1, 8, &vBits[0] );
- vec_ste( vecR1, 12, &vBits[0] );
-
- cullBits[i] = vBits[0];
- cullBits[i+1] = vBits[1];
- cullBits[i+2] = vBits[2];
- cullBits[i+3] = vBits[3];
- }
-
- for ( ; i < numVerts; i++ ) {
- byte bits;
- float d0, d1, d2, d3, d4, d5;
- const idVec3 &v = verts[i].xyz;
- d0 = planes[0].Distance( v );
- d1 = planes[1].Distance( v );
- d2 = planes[2].Distance( v );
- d3 = planes[3].Distance( v );
- d4 = planes[4].Distance( v );
- d5 = planes[5].Distance( v );
- 		// FLOATSIGNBITSET checks whether the sign bit is set by reinterpreting the float as a long and shifting right 31 places.
- bits = FLOATSIGNBITSET( d0 ) << 0;
- bits |= FLOATSIGNBITSET( d1 ) << 1;
- bits |= FLOATSIGNBITSET( d2 ) << 2;
- bits |= FLOATSIGNBITSET( d3 ) << 3;
- bits |= FLOATSIGNBITSET( d4 ) << 4;
- bits |= FLOATSIGNBITSET( d5 ) << 5;
- cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
- }
- }
- #endif /*DRAWVERT_PADDED */
- #ifndef DRAWVERT_PADDED
- /*
- ============
- idSIMD_AltiVec::OverlayPointCull
- ============
- */
- void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
-
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
-
- int i;
- float p0x, p0y, p0z, p0d;
- float p1x, p1y, p1z, p1d;
-
- const float *planePtr = planes[0].ToFloatPtr();
- const float *vertPtr = verts[0].xyz.ToFloatPtr();
-
- vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
- vector unsigned char vecPerm;
- vector float zeroVector = (vector float)(0);
-
- p0x = *(planePtr + 0);
- p0y = *(planePtr + 1);
- p0z = *(planePtr + 2);
- p0d = *(planePtr + 3);
- p1x = *(planePtr + 4);
- p1y = *(planePtr + 5);
- p1z = *(planePtr + 6);
- p1d = *(planePtr + 7);
-
- // populate the planes
- vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
- v0 = vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- vecPlane0 = vec_perm( v0, v1, vecPerm );
-
- v2 = vec_ld( 31, planePtr );
- vecPlane1 = vec_perm( v1, v2, vecPerm );
-
- // transpose
- v0 = vec_mergeh( vecPlane0, vecPlane0 );
- v1 = vec_mergeh( vecPlane1, vecPlane1 );
- v2 = vec_mergel( vecPlane0, vecPlane0 );
- v3 = vec_mergel( vecPlane1, vecPlane1);
- vecPlane0 = vec_mergeh( v0, v1 );
- vecPlane1 = vec_mergel( v0, v1 );
- vecPlane2 = vec_mergeh( v2, v3 );
- vecPlane3 = vec_mergel( v2, v3 );
- vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
- vector float oneVector = (vector float)(1);
- vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
- vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
- vector float negTwoVector = (vector float)(-2);
- vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
- vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
- vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
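- 	// shift amounts: the d0/d1 sign bits go to bits 0-1 of each vertex's cull mask,
- 	// the (1 - d0)/(1 - d1) sign bits go to bits 2-3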
- vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
- vector bool int oneIntVector = (vector bool int)(1);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
- unsigned int cullBitVal[4];
- vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
- i = 0;
- // every fourth one will have the same alignment. Make sure we've got enough here
- if ( i+3 < numVerts ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 15, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 15, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 15, vertPtr4 );
-
- vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
- vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
- vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
- vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
- // like a splat, but only doing halves
- vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
- vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
- vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
- vecSum1 = vec_add( vecSum1, vecPlane3 );
- vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
- vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
- vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
- vecSum2 = vec_add( vecSum2, vecPlane3 );
- // store out results
- UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
- // bit manipulation
- vecCmp1 = vec_cmplt( vecSum1, zeroVector );
- vecCmp2 = vec_cmplt( vecSum2, zeroVector );
-
- 		// AND with 1 so each lane holds 0 or 1 instead of the all-ones compare mask
- vecCmp1 = vec_and( vecCmp1, oneIntVector );
- vecCmp2 = vec_and( vecCmp2, oneIntVector );
-
- // store out and write to cullBits
- // finally, a use for algebra! 1-x = x + 1 - 2x
- vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
- vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
- vecSum1Inv = vec_add( vecSum1Inv, oneVector );
- vecSum2Inv = vec_add( vecSum2Inv, oneVector );
-
- // do the same comparisons for the inverted d0/d1
- vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
- vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
-
- 		// AND with 1 so each lane holds 0 or 1 instead of the all-ones compare mask
- vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
- vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
- // shift them as needed
- vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
- vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
- vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
- vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
- 		// OR them all together. since only one bit is set in each value, that's
- 		// the same as adding them: add up d0 + d1 + d0Inv + d1Inv
- vector unsigned int vecResult;
- vector unsigned int vecResult2;
- vector unsigned int vecResult3;
- vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
-
- vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
-
- // vecResult now holds the values without the inverses yet, so add those
- vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
- vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
- vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
- vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
-
- vecResult = vec_add( vecResult, vecResult2 );
-
- //store out results
- vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
- vec_ste( vecResult, 0, &cullBitVal[0] );
- vec_ste( vecResult, 4, &cullBitVal[0] );
- vec_ste( vecResult, 8, &cullBitVal[0] );
- vec_ste( vecResult, 12, &cullBitVal[0] );
-
- cullBits[i] = cullBitVal[0];
- cullBits[i+1] = cullBitVal[1];
- cullBits[i+2] = cullBitVal[2];
- cullBits[i+3] = cullBitVal[3];
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- byte bits;
- float d0, d1;
- float vx, vy, vz;
-
- vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
- vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
- vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
-
- d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
- d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
- texCoords[i][0] = d0;
- texCoords[i][1] = d1;
-
- bits = ( d0 >= 0 ) ? 0 : 1;
- d0 = 1.0f - d0;
- bits |= ( d1 >= 0 ) ? 0 : 1*2;
- d1 = 1.0f - d1;
-
- bits |= ( d0 >= 0 ) ? 0: 1*4;
- bits |= ( d1 >= 0 ) ? 0: 1*8;
- cullBits[i] = bits;
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::OverlayPointCull
- ============
- */
- void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
-
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
-
- int i;
- float p0x, p0y, p0z, p0d;
- float p1x, p1y, p1z, p1d;
-
- const float *planePtr = planes[0].ToFloatPtr();
- const float *vertPtr = verts[0].xyz.ToFloatPtr();
-
- vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
- vector float v0, v1, v2, v3, v4, v5, v6, v7;
- vector unsigned char vecPerm;
- vector float zeroVector = (vector float)(0);
-
- p0x = *(planePtr + 0);
- p0y = *(planePtr + 1);
- p0z = *(planePtr + 2);
- p0d = *(planePtr + 3);
- p1x = *(planePtr + 4);
- p1y = *(planePtr + 5);
- p1z = *(planePtr + 6);
- p1d = *(planePtr + 7);
-
- // populate the planes
- vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
- v0 = vec_ld( 0, planePtr );
- v1 = vec_ld( 15, planePtr );
- vecPlane0 = vec_perm( v0, v1, vecPerm );
-
- v2 = vec_ld( 31, planePtr );
- vecPlane1 = vec_perm( v1, v2, vecPerm );
-
- // transpose
- v0 = vec_mergeh( vecPlane0, vecPlane0 );
- v1 = vec_mergeh( vecPlane1, vecPlane1 );
- v2 = vec_mergel( vecPlane0, vecPlane0 );
- v3 = vec_mergel( vecPlane1, vecPlane1);
- vecPlane0 = vec_mergeh( v0, v1 );
- vecPlane1 = vec_mergel( v0, v1 );
- vecPlane2 = vec_mergeh( v2, v3 );
- vecPlane3 = vec_mergel( v2, v3 );
- vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
- vector float oneVector = (vector float)(1);
- vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
- vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
- vector float negTwoVector = (vector float)(-2);
- vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
- vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
- vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
- vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
- vector bool int oneIntVector = (vector bool int)(1);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
- unsigned int cullBitVal[4];
- vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
- i = 0;
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- vecXYZ1 = vec_ld( 0, vertPtr );
- vecXYZ2 = vec_ld( 0, vertPtr2 );
- vecXYZ3 = vec_ld( 0, vertPtr3 );
- vecXYZ4 = vec_ld( 0, vertPtr4 );
-
- // like a splat, but only doing halves
- vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
- vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
- vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
- vecSum1 = vec_add( vecSum1, vecPlane3 );
- vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
- vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
- vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
- vecSum2 = vec_add( vecSum2, vecPlane3 );
- // store out results
- UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
- // bit manipulation
- vecCmp1 = vec_cmplt( vecSum1, zeroVector );
- vecCmp2 = vec_cmplt( vecSum2, zeroVector );
-
- 		// AND with 1 so each lane holds 0 or 1 instead of the all-ones compare mask
- vecCmp1 = vec_and( vecCmp1, oneIntVector );
- vecCmp2 = vec_and( vecCmp2, oneIntVector );
-
- // store out and write to cullBits
- // finally, a use for algebra! 1-x = x + 1 - 2x
- vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
- vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
- vecSum1Inv = vec_add( vecSum1Inv, oneVector );
- vecSum2Inv = vec_add( vecSum2Inv, oneVector );
-
- // do the same comparisons for the inverted d0/d1
- vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
- vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
-
- 		// AND with 1 so each lane holds 0 or 1 instead of the all-ones compare mask
- vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
- vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
- // shift them as needed
- vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
- vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
- vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
- vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
- 		// OR them all together. since only one bit is set in each value, that's
- 		// the same as adding them: add up d0 + d1 + d0Inv + d1Inv
- vector unsigned int vecResult;
- vector unsigned int vecResult2;
- vector unsigned int vecResult3;
- vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
-
- vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
-
- // vecResult now holds the values without the inverses yet, so add those
- vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
- vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
- vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
- vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
-
- vecResult = vec_add( vecResult, vecResult2 );
-
- //store out results
- vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
- vec_ste( vecResult, 0, &cullBitVal[0] );
- vec_ste( vecResult, 4, &cullBitVal[0] );
- vec_ste( vecResult, 8, &cullBitVal[0] );
- vec_ste( vecResult, 12, &cullBitVal[0] );
-
- cullBits[i] = cullBitVal[0];
- cullBits[i+1] = cullBitVal[1];
- cullBits[i+2] = cullBitVal[2];
- cullBits[i+3] = cullBitVal[3];
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- byte bits;
- float d0, d1;
- float vx, vy, vz;
-
- vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
- vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
- vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
-
- d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
- d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
- texCoords[i][0] = d0;
- texCoords[i][1] = d1;
-
- bits = ( d0 >= 0 ) ? 0 : 1;
- d0 = 1.0f - d0;
- bits |= ( d1 >= 0 ) ? 0 : 1*2;
- d1 = 1.0f - d1;
-
- bits |= ( d0 >= 0 ) ? 0: 1*4;
- bits |= ( d1 >= 0 ) ? 0: 1*8;
- cullBits[i] = bits;
- }
- }
- #endif /* DRAWVERT_PADDED */
- #endif /* ENABLE_CULL */
- #ifdef ENABLE_DERIVE
- /*
- ============
- idSIMD_AltiVec::DeriveTriPlanes
- Derives a plane equation for each triangle.
- ============
- */
- void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
- // idPlane size
- assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
- int i;
- vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
- vector float vecVertA, vecVertB, vecVertC;
- vector float vecVertA2, vecVertB2, vecVertC2;
- vector float vecVertA3, vecVertB3, vecVertC3;
- vector float vecVertA4, vecVertB4, vecVertC4;
- vector float vecN, vecN2, vecN3, vecN4;
- vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
- vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
- vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
- vector float vecF;
- vector float vecF1, vecF2, vecF3, vecF4;
- vector float zeroVector = (vector float)(0);
- vector float vecNegOne = (vector float)(-1);
- vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;
-
- vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
- vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
- vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
-
- vector unsigned char oneVector = (vector unsigned char)(1);
- vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
- vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- const float *xyzPtr = verts[0].xyz.ToFloatPtr();
- float *planePtr = planes[0].ToFloatPtr();
- int j;
- for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
- #ifndef DRAWVERT_PADDED
- // calculate permute vectors to load as needed. these are all
- 		// triangle indexes and are usually pretty close together but
- // not guaranteed to be in any particular order
- vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
- vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
- #endif
- #ifndef DRAWVERT_PADDED
- // load first A B C
- vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
- vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
- vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
- vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
- vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
- vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
-
- vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
- vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
- vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );
- // set the last element to 0
- vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
- vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
- vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
-
- // load second A B C
- vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
- vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
- vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
- vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
- vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
- vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
-
- vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
- vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
- vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );
- // set the last element to 0
- vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
- vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
- vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
- // load third A B C
- vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
- vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
- vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
- vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
- vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
- vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
-
- vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
- vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
- vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );
- // set the last element to 0
- 		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
- 		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
- 		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
- // load the fourth A B C
- vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
- vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
- vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
- vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
- vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
- vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
-
- vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
- vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
- vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
- // set the last element to 0
- vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
- vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
- vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
- #else
- // load first A B C
- vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
- vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
- vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
- // set the last element to 0
- vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
- vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
- vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
-
- // load second A B C
- vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
- vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
- vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
-
- // set the last element to 0
- vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
- vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
- vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
- // load third A B C
- vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
- vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
- vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
- // set the last element to 0
- vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
- vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
- vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
- // load the fourth A B C
- vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
- vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
- vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
-
- // set the last element to 0
- vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
- vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
- vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
- #endif
- // calculate d0 and d1 for each
- vecD0 = vec_sub( vecVertB, vecVertA );
- vecD1 = vec_sub( vecVertC, vecVertA );
-
- vecD2 = vec_sub( vecVertB2, vecVertA2 );
- vecD3 = vec_sub( vecVertC2, vecVertA2 );
-
- vecD4 = vec_sub( vecVertB3, vecVertA3 );
- vecD5 = vec_sub( vecVertC3, vecVertA3 );
-
- vecD6 = vec_sub( vecVertB4, vecVertA4 );
- vecD7 = vec_sub( vecVertC4, vecVertA4 );
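- 		// cross product n = d1 x d0 built from lane rotations: rotating one operand by one lane
- 		// and the other by two lanes lines up the y*z, z*x and x*y products; the second set of
- 		// permutes builds the terms to subtract, applied below via madd with -1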
- vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
- vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
- vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
- vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
- vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
- vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
- vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
- vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );
-
- vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
-
- vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
- vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
- vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
- vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
- vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
- vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
- vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
- vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );
-
- vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
-
- vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
- vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
- vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
- vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
-
- // transpose vecNs
- vector float v0, v1, v2, v3;
- v0 = vec_mergeh( vecN, vecN3 );
- v1 = vec_mergeh( vecN2, vecN4 );
- v2 = vec_mergel( vecN, vecN3 );
- v3 = vec_mergel( vecN2, vecN4 );
-
- vecN = vec_mergeh( v0, v1 );
- vecN2 = vec_mergel( v0, v1 );
- vecN3 = vec_mergeh( v2, v3 );
- vecN4 = vec_mergel( v2, v3 );
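- 		// after the transpose vecN, vecN2 and vecN3 hold the x, y and z components of all four
- 		// normals, so accumulating their squares gives each normal's squared length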
- vecF = vec_madd( vecN, vecN, zeroVector );
- vecF = vec_madd( vecN2, vecN2, vecF );
- vecF = vec_madd( vecN3, vecN3, vecF );
-
- vecF = ReciprocalSquareRoot( vecF );
-
- vecF1 = vec_madd( vecF, vecN, zeroVector );
- vecF2 = vec_madd( vecF, vecN2, zeroVector );
- vecF3 = vec_madd( vecF, vecN3, zeroVector );
- vecF4 = vec_madd( vecF, vecN4, zeroVector );
- vector float v8, v9, v10, v11;
- v8 = vecF1;
- v9 = vecF2;
- v10 = vecF3;
- v11 = vecF4;
- // transpose vecVerts
- v0 = vec_mergeh( vecVertA, vecVertA3 );
- v1 = vec_mergeh( vecVertA2, vecVertA4 );
- v2 = vec_mergel( vecVertA, vecVertA3 );
- v3 = vec_mergel( vecVertA2, vecVertA4 );
-
- vecVertA = vec_mergeh( v0, v1 );
- vecVertA2 = vec_mergel( v0, v1 );
- vecVertA3 = vec_mergeh( v2, v3 );
- vecVertA4 = vec_mergel( v2, v3 );
-
- vector float vecTotals;
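- 		// dot each triangle's vertex A with its unit normal; negating that gives the plane's
- 		// d term, fitting the plane through vertex A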
- vecTotals = vec_madd( vecVertA, v8, zeroVector );
- vecTotals = vec_madd( vecVertA2, v9, vecTotals );
- vecTotals = vec_madd( vecVertA3, v10, vecTotals );
- vecTotals = vec_madd( vecVertA4, v11, vecTotals );
- vecF = vec_madd( vecTotals, vecNegOne, zeroVector );
-
- // transpose vecFs
- v0 = vec_mergeh( vecF1, vecF3 );
- v1 = vec_mergeh( vecF2, vecF );
- v2 = vec_mergel( vecF1, vecF3 );
- v3 = vec_mergel( vecF2, vecF );
-
- vecF1 = vec_mergeh( v0, v1 );
- vecF2 = vec_mergel( v0, v1 );
- vecF3 = vec_mergeh( v2, v3 );
- vecF4 = vec_mergel( v2, v3 );
- // store results
- UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
- }
- // cleanup
- for ( ; i < numIndexes; i += 3, j++ ) {
- const idDrawVert *a, *b, *c;
- float d0[3], d1[3], f;
- idVec3 n;
- a = verts + indexes[i + 0];
- b = verts + indexes[i + 1];
- c = verts + indexes[i + 2];
- d0[0] = b->xyz[0] - a->xyz[0];
- d0[1] = b->xyz[1] - a->xyz[1];
- d0[2] = b->xyz[2] - a->xyz[2];
- d1[0] = c->xyz[0] - a->xyz[0];
- d1[1] = c->xyz[1] - a->xyz[1];
- d1[2] = c->xyz[2] - a->xyz[2];
- n[0] = d1[1] * d0[2] - d1[2] * d0[1];
- n[1] = d1[2] * d0[0] - d1[0] * d0[2];
- n[2] = d1[0] * d0[1] - d1[1] * d0[0];
- f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
- //idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
- n.x *= f;
- n.y *= f;
- n.z *= f;
- planes[j].SetNormal( n );
- planes[j].FitThroughPoint( a->xyz );
- }
- }
- /*
- ============
- idSIMD_AltiVec::DeriveTangents
- Derives the normal and orthogonal tangent vectors for the triangle vertices.
- For each vertex the normal and tangent vectors are derived from all triangles
- using the vertex which results in smooth tangents across the mesh.
- In the process the triangle planes are calculated as well.
- ============
- */
- void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
- int i;
- bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
- memset( used, 0, numVerts * sizeof( used[0] ) );
-
- idPlane *planesPtr = planes;
- for ( i = 0; i < numIndexes; i += 3 ) {
- idDrawVert *a, *b, *c;
- // unsigned long signBit;
- float d0[5], d1[5], area;
- idVec3 n, t0, t1;
- float f1, f2, f3;
- int v0 = indexes[i + 0];
- int v1 = indexes[i + 1];
- int v2 = indexes[i + 2];
- a = verts + v0;
- b = verts + v1;
- c = verts + v2;
- d0[0] = b->xyz[0] - a->xyz[0];
- d0[1] = b->xyz[1] - a->xyz[1];
- d0[2] = b->xyz[2] - a->xyz[2];
- d0[3] = b->st[0] - a->st[0];
- d0[4] = b->st[1] - a->st[1];
- d1[0] = c->xyz[0] - a->xyz[0];
- d1[1] = c->xyz[1] - a->xyz[1];
- d1[2] = c->xyz[2] - a->xyz[2];
- d1[3] = c->st[0] - a->st[0];
- d1[4] = c->st[1] - a->st[1];
- // normal
- n[0] = d1[1] * d0[2] - d1[2] * d0[1];
- n[1] = d1[2] * d0[0] - d1[0] * d0[2];
- n[2] = d1[0] * d0[1] - d1[1] * d0[0];
- f1 = n.x * n.x + n.y * n.y + n.z * n.z;
-
- // area sign bit
- area = d0[3] * d1[4] - d0[4] * d1[3];
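- 		// a negative texture-space area means the triangle's UVs are mirrored, so the tangent
- 		// scales f2 and f3 are negated below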
- // first tangent
- t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
- t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
- t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
-
- f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
- // second tangent
- t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
- t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
- t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
- f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;
- // Behold! The power of the pipeline
- FastScalarInvSqrt_x3( &f1, &f2, &f3 );
- #ifdef PPC_INTRINSICS
- f2 = __fsel( area, f2, -f2 );
- f3 = __fsel( area, f3, -f3 );
- #else
- f2 = ( area < 0.0f ) ? -f2 : f2;
- f3 = ( area < 0.0f ) ? -f3 : f3;
- #endif
- t0.x *= f2;
- t0.y *= f2;
- t0.z *= f2;
-
- n.x *= f1;
- n.y *= f1;
- n.z *= f1;
- planesPtr->SetNormal( n );
- planesPtr->FitThroughPoint( a->xyz );
- planesPtr++;
- t1.x *= f3;
- t1.y *= f3;
- t1.z *= f3;
- if ( used[v0] ) {
- a->normal += n;
- a->tangents[0] += t0;
- a->tangents[1] += t1;
- } else {
- a->normal = n;
- a->tangents[0] = t0;
- a->tangents[1] = t1;
- used[v0] = true;
- }
- if ( used[v1] ) {
- b->normal += n;
- b->tangents[0] += t0;
- b->tangents[1] += t1;
- } else {
- b->normal = n;
- b->tangents[0] = t0;
- b->tangents[1] = t1;
- used[v1] = true;
- }
- if ( used[v2] ) {
- c->normal += n;
- c->tangents[0] += t0;
- c->tangents[1] += t1;
- } else {
- c->normal = n;
- c->tangents[0] = t0;
- c->tangents[1] = t1;
- used[v2] = true;
- }
- }
- }
- #ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
- /*
- ============
- idSIMD_AltiVec::DeriveUnsmoothedTangents
- Derives the normal and orthogonal tangent vectors for the triangle vertices.
- For each vertex the normal and tangent vectors are derived from a single dominant triangle.
- ============
- */
- #define DERIVE_UNSMOOTHED_BITANGENT
- void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
- int i;
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
- // drawverts aligned
- assert( IS_16BYTE_ALIGNED( verts[0] ) );
-
- vector float vecVertA, vecVertB, vecVertC;
- vector float vecVertA2, vecVertB2, vecVertC2;
- vector float vecVertA3, vecVertB3, vecVertC3;
- vector float vecVertA4, vecVertB4, vecVertC4;
- vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
- vector float vecS0, vecS1, vecS2;
- vector float vecS0_2, vecS1_2, vecS2_2;
- vector float vecS0_3, vecS1_3, vecS2_3;
- vector float vecS0_4, vecS1_4, vecS2_4;
-
- vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
- vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
- vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
- vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
- vector float vecN, vecN2, vecN3, vecN4;
-
- vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
- vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
- vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
- vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
- vector float zeroVector = (vector float)(0);
-
- vector float vecNegOne = (vector float)(-1.0);
-
- vector float vecStore1, vecStore2, vecStore3;
- vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
- vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
- vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
- vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
- vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
- vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
-
- vector float vecLd1, vecLd2, vecLd3;
- vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
-
- float *normalPtr = verts[0].normal.ToFloatPtr();
- float *xyzPtr = verts[0].xyz.ToFloatPtr();
- vector float vecFirstHalf, vecSecondHalf;
- vector float vecFirstHalf2, vecSecondHalf2;
- vector float vecFirstHalf3, vecSecondHalf3;
- vector float vecFirstHalf4, vecSecondHalf4;
-
- for ( i = 0; i+3 < numVerts; i+=4 ) {
- int bOffset1, bOffset2, bOffset3, bOffset4;
- int cOffset1, cOffset2, cOffset3, cOffset4;
-
- bOffset1 = dominantTris[i].v2;
- cOffset1 = dominantTris[i].v3;
- bOffset2 = dominantTris[i+1].v2;
- cOffset2 = dominantTris[i+1].v3;
- bOffset3 = dominantTris[i+2].v2;
- cOffset3 = dominantTris[i+2].v3;
- bOffset4 = dominantTris[i+3].v2;
- cOffset4 = dominantTris[i+3].v3;
-
- vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
- v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
- v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
- vecVertA = vec_perm( v0, v1, vecPerm0 );
- vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
- v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
- v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
- vecVertB = vec_perm( v2, v3, vecPerm1 );
-
- vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
- v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
- v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
- vecVertC = vec_perm( v4, v5, vecPerm2 );
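- 		// the second quadword of each vertex holds st[1] and the normal; gather the st[1]
- 		// values here for the texture-coordinate deltas used in the tangent math below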
- // put remainder into v2
- v1 = vec_perm( v1, v1, vecPerm0 );
- v3 = vec_perm( v3, v3, vecPerm1 );
- v5 = vec_perm( v5, v5, vecPerm2 );
-
- v1 = vec_mergeh( v1, v5 );
- v2 = vec_mergeh( v3, zeroVector );
- v2 = vec_mergeh( v1, v2 );
- v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
-
- // load second one
- vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
- v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
- v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
- vecVertA2 = vec_perm( v0, v1, vecPerm0 );
-
- vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
- v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
- v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
- vecVertB2 = vec_perm( v3, v4, vecPerm3 );
-
- vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
- v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
- v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
- vecVertC2 = vec_perm( v5, v6, vecPerm4 );
- // put remainder into v3
- v1 = vec_perm( v1, v1, vecPerm0 );
- v4 = vec_perm( v4, v4, vecPerm3 );
- v5 = vec_perm( v6, v6, vecPerm4 );
-
- v1 = vec_mergeh( v1, v5 );
- v3 = vec_mergeh( v4, zeroVector );
- v3 = vec_mergeh( v1, v3 );
- v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
- // load third one
- vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
- v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
- v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
- vecVertA3 = vec_perm( v0, v1, vecPerm0 );
-
- vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
- v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
- v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
- vecVertB3 = vec_perm( v4, v5, vecPerm1 );
-
- vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
- v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
- v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
- vecVertC3 = vec_perm( v6, v7, vecPerm2 );
- // put remainder into v4
- v1 = vec_perm( v1, v1, vecPerm0 );
- v5 = vec_perm( v5, v5, vecPerm1 );
- v7 = vec_perm( v7, v7, vecPerm2 );
-
- v1 = vec_mergeh( v1, v7 );
- v4 = vec_mergeh( v5, zeroVector );
- v4 = vec_mergeh( v1, v4 );
- v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
-
- // load fourth one
- vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
- v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
- v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
- vecVertA4 = vec_perm( v0, v1, vecPerm0 );
-
- vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
- v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
- v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
- vecVertB4 = vec_perm( v5, v6, vecPerm3 );
-
- vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
- v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
- v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
- vecVertC4 = vec_perm( v7, v8, vecPerm4 );
- // put remainder into v5
- v1 = vec_perm( v1, v1, vecPerm0 );
- v6 = vec_perm( v6, v6, vecPerm3 );
- v8 = vec_perm( v8, v8, vecPerm4 );
-
- v1 = vec_mergeh( v1, v8 );
- v5 = vec_mergeh( v6, zeroVector );
- v5 = vec_mergeh( v1, v5 );
- v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
- // remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
- //vecD1 now holds d0, d1, d2, d3
- vecD1 = vec_sub( vecVertB, vecVertA );
- vecD4 = vec_sub( vecVertB2, vecVertA2 );
- vecD7 = vec_sub( vecVertB3, vecVertA3 );
- vecD10 = vec_sub( vecVertB4, vecVertA4 );
- // vecD2 now holds d5, d6, d7, d8
- vecD2 = vec_sub( vecVertC, vecVertA );
- vecD5 = vec_sub( vecVertC2, vecVertA2 );
- vecD8 = vec_sub( vecVertC3, vecVertA3 );
- vecD11 = vec_sub( vecVertC4, vecVertA4 );
-
- // vecD3 now holds d4, crap, d9, crap
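- // (each is the remainder vector minus a copy of itself shifted left 4 bytes:
- //  element 0 = b->st[1] - a->st[1] = d4, element 2 = c->st[1] - a->st[1] = d9)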
- vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
- vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
- vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
- vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
- // get permute vectors for loading from dt
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
- vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
- vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
- // load S values from dominantTris
- v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
- v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
- v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
- v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
- v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
- v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
- v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
- v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
- v0 = vec_perm( v0, v1, vecPerm1 );
- v2 = vec_perm( v2, v3, vecPerm2 );
- v4 = vec_perm( v4, v5, vecPerm3 );
- v6 = vec_perm( v6, v7, vecPerm4 );
-
- vecS0 = vec_splat( v0, 0 );
- vecS1 = vec_splat( v0, 1 );
- vecS2 = vec_splat( v0, 2 );
-
- vecS0_2 = vec_splat( v2, 0);
- vecS1_2 = vec_splat( v2, 1 );
- vecS2_2 = vec_splat( v2, 2 );
- vecS0_3 = vec_splat( v4, 0 );
- vecS1_3 = vec_splat( v4, 1 );
- vecS2_3 = vec_splat( v4, 2 );
-
- vecS0_4 = vec_splat( v6, 0 );
- vecS1_4 = vec_splat( v6, 1 );
- vecS2_4 = vec_splat( v6, 2 );
- // do calculation
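- // the perm/madd/nmsub sequences below build the cross-product terms of the
- // scalar version ( d6 * d2 - d7 * d1, etc. ); they are scaled by s2 below to
- // form the normals, four vertices at a time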
- vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
- vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
- vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
- vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
- vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
- vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
- vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
- vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
-
- vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
-
- vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
- vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
- vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
- vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
- vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
- vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
- vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
- vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
-
- vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
- vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
- vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
- vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
-
-
- // calculate N values
- vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
- vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
- vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
- vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
-
- // calculate both halves of the calculation for t
- vecWork1 = vecD1;
- vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
- vecWork3 = vecD4;
- vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
- vecWork5 = vecD7;
- vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
- vecWork7 = vecD10;
- vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
-
- vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
-
- vecWork1 = vecD2;
- vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
- vecWork3 = vecD5;
- vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
- vecWork5 = vecD8;
- vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
- vecWork7 = vecD11;
- vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
- vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
- vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
- vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
- vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
- // calculate T values
- vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
- vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
- vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
- vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- vecWork1 = vecD1;
- vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
- vecWork3 = vecD4;
- vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
- vecWork5 = vecD7;
- vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
- vecWork7 = vecD10;
- vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
- vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
-
- vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
- vecWork2 = vecD2;
- vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
- vecWork4 = vecD5;
- vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
- vecWork6 = vecD8;
- vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
- vecWork8 = vecD11;
- vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
- #else
- vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
- vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
- vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
- vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
- vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
- vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
- vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
- vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
- vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
-
- vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
- vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
- vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
- vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
- vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
- vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
- vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
- vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
- vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
- vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
- vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
- vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
- #endif
- // finish the calculation
- vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
- vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
- vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
- vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
- vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
- vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
- vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
- vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
- // Store results
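- // each ALIGNED_STORE3 below writes 48 bytes starting at the normal; the vecLd
- // loads are merged back in so the neighboring idDrawVert fields are preserved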
- // read values that we need to preserve
- vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
- vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
-
- //generate vectors to store
- vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
- vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
- vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
-
- // store out results
- ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
- // read values that we need to preserve
- vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
-
- // generate vectors to store
- vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
- vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
- vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
-
- // (possible optimization: instead of doing the permute, shift it where it needs to be and use vec_ste)
- // store out vectors
- ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
- // read values that we need to preserve
- vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
- // generate vectors to store
- vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
- vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
- vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
-
- // store out vectors
- ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
- // read values that we need to preserve
- vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
- vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
-
- // generate vectors to store
- vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
- vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
- vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
-
- // store out vectors
- ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- idDrawVert *a, *b, *c;
- float d0, d1, d2, d3, d4;
- float d5, d6, d7, d8, d9;
- float s0, s1, s2;
- float n0, n1, n2;
- float t0, t1, t2;
- float t3, t4, t5;
- const dominantTri_s &dt = dominantTris[i];
- a = verts + i;
- b = verts + dt.v2;
- c = verts + dt.v3;
-
- d0 = b->xyz[0] - a->xyz[0];
- d1 = b->xyz[1] - a->xyz[1];
- d2 = b->xyz[2] - a->xyz[2];
- d3 = b->st[0] - a->st[0];
-
- d4 = b->st[1] - a->st[1];
- d5 = c->xyz[0] - a->xyz[0];
- d6 = c->xyz[1] - a->xyz[1];
- d7 = c->xyz[2] - a->xyz[2];
- d8 = c->st[0] - a->st[0];
-
- d9 = c->st[1] - a->st[1];
- s0 = dt.normalizationScale[0];
- s1 = dt.normalizationScale[1];
- s2 = dt.normalizationScale[2];
- n0 = s2 * ( d6 * d2 - d7 * d1 );
- n1 = s2 * ( d7 * d0 - d5 * d2 );
- n2 = s2 * ( d5 * d1 - d6 * d0 );
- t0 = s0 * ( d0 * d9 - d4 * d5 );
- t1 = s0 * ( d1 * d9 - d4 * d6 );
- t2 = s0 * ( d2 * d9 - d4 * d7 );
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- t3 = s1 * ( d3 * d5 - d0 * d8 );
- t4 = s1 * ( d3 * d6 - d1 * d8 );
- t5 = s1 * ( d3 * d7 - d2 * d8 );
- #else
- t3 = s1 * ( n2 * t1 - n1 * t2 );
- t4 = s1 * ( n0 * t2 - n2 * t0 );
- t5 = s1 * ( n1 * t0 - n0 * t1 );
- #endif
- a->normal[0] = n0;
- a->normal[1] = n1;
- a->normal[2] = n2;
- a->tangents[0][0] = t0;
- a->tangents[0][1] = t1;
- a->tangents[0][2] = t2;
- a->tangents[1][0] = t3;
- a->tangents[1][1] = t4;
- a->tangents[1][2] = t5;
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::DeriveUnsmoothedTangents
- Derives the normal and orthogonal tangent vectors for the triangle vertices.
- For each vertex the normal and tangent vectors are derived from a single dominant triangle.
- ============
- */
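- // DERIVE_UNSMOOTHED_BITANGENT selects how the second tangent is computed below:
- // when defined it is built from the normal and the first tangent, otherwise it
- // is computed directly from the position and texture-space deltas ( d3, d8 )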
- #define DERIVE_UNSMOOTHED_BITANGENT
- void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
- int i;
-
- for ( i = 0; i < numVerts; i++ ) {
- idDrawVert *a, *b, *c;
- float d0, d1, d2, d3, d4;
- float d5, d6, d7, d8, d9;
- float s0, s1, s2;
- float n0, n1, n2;
- float t0, t1, t2;
- float t3, t4, t5;
- const dominantTri_s &dt = dominantTris[i];
- a = verts + i;
- b = verts + dt.v2;
- c = verts + dt.v3;
-
- d0 = b->xyz[0] - a->xyz[0];
- d1 = b->xyz[1] - a->xyz[1];
- d2 = b->xyz[2] - a->xyz[2];
- d3 = b->st[0] - a->st[0];
-
- d4 = b->st[1] - a->st[1];
- d5 = c->xyz[0] - a->xyz[0];
- d6 = c->xyz[1] - a->xyz[1];
- d7 = c->xyz[2] - a->xyz[2];
- d8 = c->st[0] - a->st[0];
-
- d9 = c->st[1] - a->st[1];
- s0 = dt.normalizationScale[0];
- s1 = dt.normalizationScale[1];
- s2 = dt.normalizationScale[2];
- n0 = s2 * ( d6 * d2 - d7 * d1 );
- n1 = s2 * ( d7 * d0 - d5 * d2 );
- n2 = s2 * ( d5 * d1 - d6 * d0 );
- t0 = s0 * ( d0 * d9 - d4 * d5 );
- t1 = s0 * ( d1 * d9 - d4 * d6 );
- t2 = s0 * ( d2 * d9 - d4 * d7 );
- #ifndef DERIVE_UNSMOOTHED_BITANGENT
- t3 = s1 * ( d3 * d5 - d0 * d8 );
- t4 = s1 * ( d3 * d6 - d1 * d8 );
- t5 = s1 * ( d3 * d7 - d2 * d8 );
- #else
- t3 = s1 * ( n2 * t1 - n1 * t2 );
- t4 = s1 * ( n0 * t2 - n2 * t0 );
- t5 = s1 * ( n1 * t0 - n0 * t1 );
- #endif
- a->normal[0] = n0;
- a->normal[1] = n1;
- a->normal[2] = n2;
- a->tangents[0][0] = t0;
- a->tangents[0][1] = t1;
- a->tangents[0][2] = t2;
- a->tangents[1][0] = t3;
- a->tangents[1][1] = t4;
- a->tangents[1][2] = t5;
- }
- }
- #endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
- /*
- ============
- idSIMD_AltiVec::NormalizeTangents
- Normalizes each vertex normal and projects and normalizes the
- tangent vectors onto the plane orthogonal to the vertex normal.
- ============
- */
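- // Reference math (see the scalar cleanup loop at the end of this routine):
- //   n = n / |n|
- //   t = t - ( t . n ) * n   for each of the two tangents
- //   t = t / |t|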
- void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
- float *addr = verts[0].normal.ToFloatPtr();
- float *tAddr = verts[0].tangents[0].ToFloatPtr();
- // v0 through v3 keep the originally loaded values so we don't take
- // as much of a hit for unaligned stores
- vector float v0, v1, v2, v3;
- // v5 through v8 are the "working" values of the vectors
- vector float v5, v6, v7, v8;
- // working values
- vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
- vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
- vector float vecF, vecF2;
- vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
-
- register vector float zeroVector = (vector float)(0.0);
-
- vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
- vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
- vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
- vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
- vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
- vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
-
- vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
- vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
- vector unsigned char storeT41, storeT42;
-
- int i = 0;
-
- if ( i+3 < numVerts ) {
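- // precompute the load and store permute vectors from the first four vertices;
- // the unrolled loop below reuses them for each group of four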
- // for loading normal from idDrawVert
- vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
- vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
- vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
-
- // for loading tangents from idDrawVert
- vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
- vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
- vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
- vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
- vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
- vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
- vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
- vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
-
- // generate permute vectors to store normals
- storePerm0 = vec_lvsr( 0, addr );
- storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
- storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
- storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
- // generate permute vectors to store tangents
- storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
- storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
- storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
- storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
-
- storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
- storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
-
- storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
- storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
- }
-
- for ( ; i+3 < numVerts; i+=4 ) {
-
- // load normals
- vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
- vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
- v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
-
- vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
- vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
- v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
-
- vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
- vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
- v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
-
- vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
- vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
- v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
-
- // zero out the useless last element of each vector
- v0 = vec_perm( v0, zeroVector, vecPermLast );
- v1 = vec_perm( v1, zeroVector, vecPermLast );
- v2 = vec_perm( v2, zeroVector, vecPermLast );
- v3 = vec_perm( v3, zeroVector, vecPermLast );
- // got 4 vectors in v0 through v3, sum each one across
- // and put the results into one vector
- vecTemp = vec_madd( v0, v0, zeroVector );
-
- vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
- // element 0 of vecSum now has sum of v0
-
- vecTemp2 = vec_madd( v1, v1, zeroVector );
- tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
- tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
- // put this into vecSum
- vecSum = vec_mergeh( vecSum, tempSum );
-
- vecTemp3 = vec_madd( v2, v2, zeroVector );
- tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
- tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
- // put this into vecSum
- vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
-
- vecTemp4 = vec_madd( v3, v3, zeroVector );
- tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
- tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
- // put this into vecSum
- vecSum = vec_perm( vecSum, tempSum, vecPermLast );
- // take reciprocal square roots of these
- vecF = ReciprocalSquareRoot( vecSum );
-
- // multiply each vector by f
- v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
- v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
- v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
- v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
- // load tangents as unaligned
- vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
- vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
- vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
- vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
- vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
- vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
-
- vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
- vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
- vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
-
- vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
- vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
- vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
- vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
- vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
- vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
- vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
- vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
- vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
- vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
- vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
- //zero out last element of tangents
- vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
- vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
- vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
- vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
- vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
- vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
- vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
- vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
- // all tangents[0]
- tempSum = zeroVector;
- tempSum = vec_madd( vec1T0, v5, tempSum );
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- // put tempSum splatted across vecTSum1
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
-
- //vec1T0 now contains what needs to be rsqrt'd and multiplied by f
- vec1T0 = vec_sub( vec1T0, vecTSum1 );
-
- tempSum = zeroVector;
- tempSum = vec_madd( vec2T0, v6, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
- vec2T0 = vec_sub( vec2T0, vecTSum1 );
-
- tempSum = zeroVector;
- tempSum = vec_madd( vec3T0, v7, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
- vec3T0 = vec_sub( vec3T0, vecTSum1 );
-
- tempSum = zeroVector;
- tempSum = vec_madd( vec4T0, v8, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
- vec4T0 = vec_sub( vec4T0, vecTSum1 );
-
- // all tangents[1]
- tempSum = zeroVector;
- tempSum = vec_madd( vec1T1, v5, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
-
- // vec1T1 now contains what needs to be rsqrt'd and multiplied by f
- vec1T1 = vec_sub( vec1T1, vecTSum1 );
-
- tempSum = zeroVector;
- tempSum = vec_madd( vec2T1, v6, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
- vec2T1 = vec_sub( vec2T1, vecTSum1 );
-
- tempSum = zeroVector;
- tempSum = vec_madd( vec3T1, v7, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
- vec3T1 = vec_sub( vec3T1, vecTSum1 );
-
- tempSum = zeroVector;
- tempSum = vec_madd( vec4T1, v8, tempSum );
-
- // sum across tempSum
- vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
- vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
- vec4T1 = vec_sub( vec4T1, vecTSum1 );
- // sum across vectors and put into one vector
- vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
- vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
- // element 0 of vecTSum1 now has the sum for vec1T0
- vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
- tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
- // put this into vecTSum1
- vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
- vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
- tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
- // put this into vecTSum1
- vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
- vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
- tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
- // put this into vecTSum1
- vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
-
- vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
- vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
- // element 0 of vecTSum2 now has the sum for vec1T1
- vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
- tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
- // put this into vecTSum2
- vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
- vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
- tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
- // put this into vecTSum2
- vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
- vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
- tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
- tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
- // put this into vecTSum2
- vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
-
- // tangents[0]
- vecF = ReciprocalSquareRoot( vecTSum1 );
- // tangents[1]
- vecF2 = ReciprocalSquareRoot( vecTSum2 );
-
- // multiply each tangent vector by f
- vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
- vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
- vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
- vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
- vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
- vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
- vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
- vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
- // rotate input data
- v5 = vec_perm( v5, v5, storePerm0 );
- v6 = vec_perm( v6, v6, storePerm1 );
- v7 = vec_perm( v7, v7, storePerm2 );
- v8 = vec_perm( v8, v8, storePerm3 );
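- // vec_ste writes a single float, so only the three normal components are
- // stored and the rest of each idDrawVert is left untouched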
- vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
- vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
- vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
- vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
- vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
- vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
-
- vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
- vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
- vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
- vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
- vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
- vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
- // store tangents[0] and tangents[1]
- vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
- vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
-
- vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
- vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
- vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
- vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
- vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
- vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
- // store second tangents[0] and tangents[1]
- vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
- vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
-
- vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
- vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
- vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
- vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
- vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
- vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
- // store third tangents[0] and tangents[1]
- vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
- vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
-
- vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
- vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
- vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
- vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
- vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
- vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
- // store fourth tangents[0] and tangents[1]
- vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
- vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
-
- vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
- vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
- vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
- vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
- vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
- vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- idVec3 &v = verts[i].normal;
- float f;
- //f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
- f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
- v.x *= f; v.y *= f; v.z *= f;
- for ( int j = 0; j < 2; j++ ) {
- idVec3 &t = verts[i].tangents[j];
- t -= ( t * v ) * v;
- // f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
- f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
- t.x *= f; t.y *= f; t.z *= f;
- }
- }
- }
- #endif /* ENABLE_DERIVE */
- #ifdef ENABLE_CREATE
- /*
- ============
- idSIMD_AltiVec::CreateTextureSpaceLightVectors
- Calculates light vectors in texture space for the given triangle vertices.
- For each vertex the direction towards the light origin is projected onto texture space.
- The light vectors are only calculated for the vertices referenced by the indexes.
- ============
- */
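- // Per vertex: lightDir = lightOrigin - xyz, then
- //   lightVectors[i] = ( lightDir . tangents[0], lightDir . tangents[1], lightDir . normal )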
- void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
-
- bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
- memset( used, 0, numVerts * sizeof( used[0] ) );
- int i;
- for ( i = 0; i+7 < numIndexes; i+= 8 ) {
- used[indexes[i]] = true;
- used[indexes[i+1]] = true;
- used[indexes[i+2]] = true;
- used[indexes[i+3]] = true;
- used[indexes[i+4]] = true;
- used[indexes[i+5]] = true;
- used[indexes[i+6]] = true;
- used[indexes[i+7]] = true;
- }
-
- for ( ; i < numIndexes; i++ ) {
- used[indexes[i]] = true;
- }
- for ( i = 0; i+1 < numVerts; i+=2 ) {
- const idDrawVert *v = &verts[i];
- const idDrawVert *v2 = &verts[i+1];
-
- float x, y, z;
- float x2, y2, z2;
- idVec3 lightDir, lightDir2;
- lightDir[0] = lightOrigin[0] - v->xyz[0];
- lightDir[1] = lightOrigin[1] - v->xyz[1];
- lightDir[2] = lightOrigin[2] - v->xyz[2];
-
- lightDir2[0] = lightOrigin[0] - v2->xyz[0];
- lightDir2[1] = lightOrigin[1] - v2->xyz[1];
- lightDir2[2] = lightOrigin[2] - v2->xyz[2];
-
- x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
- y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
- z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
-
- x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
- y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
- z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
-
- if ( used[i] ) {
- lightVectors[i][0] = x;
- lightVectors[i][1] = y;
- lightVectors[i][2] = z;
- }
-
- if ( used[i+1] ) {
- lightVectors[i+1][0] = x2;
- lightVectors[i+1][1] = y2;
- lightVectors[i+1][2] = z2;
- }
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- if ( !used[i] ) {
- continue;
- }
- const idDrawVert *v = &verts[i];
- idVec3 lightDir;
- lightDir[0] = lightOrigin[0] - v->xyz[0];
- lightDir[1] = lightOrigin[1] - v->xyz[1];
- lightDir[2] = lightOrigin[2] - v->xyz[2];
- lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
- lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
- lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
- }
- }
- #if 1
- /*
- ============
- idSIMD_AltiVec::CreateSpecularTextureCoords
- Calculates specular texture coordinates for the given triangle vertices.
- For each vertex the normalized direction towards the light origin is added to the
- normalized direction towards the view origin and the result is projected onto texture space.
- The texture coordinates are only calculated for the vertices referenced by the indexes.
- ============
- */
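- // Per vertex: normalize( lightOrigin - xyz ) + normalize( viewOrigin - xyz ) is
- // projected onto ( tangents[0], tangents[1], normal ) and stored with w = 1.0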
- void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
-
- bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
- memset( used, 0, numVerts * sizeof( used[0] ) );
- int i;
- for ( i = 0; i+7 < numIndexes; i+= 8 ) {
- used[indexes[i]] = true;
- used[indexes[i+1]] = true;
- used[indexes[i+2]] = true;
- used[indexes[i+3]] = true;
- used[indexes[i+4]] = true;
- used[indexes[i+5]] = true;
- used[indexes[i+6]] = true;
- used[indexes[i+7]] = true;
- }
-
- for ( ; i < numIndexes; i++ ) {
- used[indexes[i]] = true;
- }
- // load lightOrigin and viewOrigin into vectors
- const float *lightOriginPtr = lightOrigin.ToFloatPtr();
- const float *viewOriginPtr = viewOrigin.ToFloatPtr();
- vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
- vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
- vector float v0 = vec_ld( 0, lightOriginPtr );
- vector float v1 = vec_ld( 15, lightOriginPtr );
- vector float v2 = vec_ld( 0, viewOriginPtr );
- vector float v3 = vec_ld( 15, viewOriginPtr );
- vector float vecLightOrigin = vec_perm( v0, v1, permVec );
- vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
- const vector float zeroVector = (vector float)(0);
- int index;
- for ( index = 0; index+1 < numVerts; index+=2 ) {
- const float *vertPtr = verts[index].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
- permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
- permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- vector float v2 = vec_ld( 31, vertPtr );
- vector float v3 = vec_ld( 47, vertPtr );
- vector float v4 = vec_ld( 63, vertPtr );
-
- vector float v5 = vec_ld( 0, vertPtr2 );
- vector float v6 = vec_ld( 15, vertPtr2 );
- vector float v7 = vec_ld( 31, vertPtr2 );
- vector float v8 = vec_ld( 47, vertPtr2 );
- vector float v9 = vec_ld( 63, vertPtr2 );
-
- // figure out what values go where
- vector float vecXYZ = vec_perm( v0, v1, permVec );
- vector float vecNormal = vec_perm( v1, v2, permVec );
- vecNormal = vec_sld( vecNormal, vecNormal, 4 );
- const vector float vecTangent0 = vec_perm( v2, v3, permVec );
- permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
- const vector float vecTangent1 = vec_perm( v3, v4, permVec );
-
- vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
- vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
- vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
- const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
- permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
- const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
-
- // calculate lightDir
- vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
- vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
-
- vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
- vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
- // calculate distance
- vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
- vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
-
- vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
- vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
-
- // sum across first 3 elements of vector
- vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
- vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
- vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
-
- vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
- vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
- vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
- vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
-
- // splat sum across the whole vector
- vecTempLight = vec_splat( vecTempLight, 0 );
- vecTempView = vec_splat( vecTempView, 0 );
-
- vecTempLight2 = vec_splat( vecTempLight2, 0 );
- vecTempView2 = vec_splat( vecTempView2, 0 );
-
- vecTempLight = ReciprocalSquareRoot( vecTempLight );
- vecTempView = ReciprocalSquareRoot( vecTempView );
-
- vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
- vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
- // modify light and view vectors based on ilength
- vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
- vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
-
- vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
- vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
-
- // calculate what to store in each texture coord
- vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
- vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
- vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
-
- vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
- vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
- vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
- // sum across first 3 elements of vector
- vector float tempSum3;
- tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
- vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
- tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
- vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
- tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
- vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
-
- tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
- vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
- tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
- vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
- vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
- vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
-
- vecTC0 = vec_splat( vecTC0, 0 );
- vecTC1 = vec_splat( vecTC1, 0 );
- vecTC2 = vec_splat( vecTC2, 0 );
- vecTC3 = vec_splat( vecTC3, 0 );
- vecTC4 = vec_splat( vecTC4, 0 );
- vecTC5 = vec_splat( vecTC5, 0 );
-
- if ( used[index] ) {
- // store out results
- vec_ste( vecTC0, 0, &texCoords[index][0] );
- vec_ste( vecTC1, 0, &texCoords[index][1] );
- vec_ste( vecTC2, 0, &texCoords[index][2] );
- vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
- }
-
- if ( used[index+1] ) {
- vec_ste( vecTC3, 0, &texCoords[index+1][0] );
- vec_ste( vecTC4, 0, &texCoords[index+1][1] );
- vec_ste( vecTC5, 0, &texCoords[index+1][2] );
- vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
- }
- }
- // cleanup
- for ( ; index < numVerts; index++ ) {
- if ( !used[index] ) {
- continue;
- }
- const float *vertPtr = verts[index].xyz.ToFloatPtr();
- permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
-
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- vector float v2 = vec_ld( 31, vertPtr );
- vector float v3 = vec_ld( 47, vertPtr );
- vector float v4 = vec_ld( 63, vertPtr );
-
- // figure out what values go where
- vector float vecXYZ = vec_perm( v0, v1, permVec );
- vector float vecNormal = vec_perm( v1, v2, permVec );
- vecNormal = vec_sld( vecNormal, vecNormal, 4 );
- const vector float vecTangent0 = vec_perm( v2, v3, permVec );
- permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
- const vector float vecTangent1 = vec_perm( v3, v4, permVec );
-
- // calculate lightDir
- vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
- vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
- // calculate distance
- vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
- vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
-
- // sum across first 3 elements of vector
- vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
- vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
- vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
- vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
-
- // splat sum across the whole vector
- vecTempLight = vec_splat( vecTempLight, 0 );
- vecTempView = vec_splat( vecTempView, 0 );
-
- vecTempLight = ReciprocalSquareRoot( vecTempLight );
- vecTempView = ReciprocalSquareRoot( vecTempView );
-
- // modify light and view vectors based on ilength
- vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
- vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
-
- // calculate what to store in each texture coord
- vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
- vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
- vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
-
- // sum across first 3 elements of vector
- vector float tempSum3;
- tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
- vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
- tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
- vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
- tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
- vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
-
- vecTC0 = vec_splat( vecTC0, 0 );
- vecTC1 = vec_splat( vecTC1, 0 );
- vecTC2 = vec_splat( vecTC2, 0 );
-
- // store out results
- vec_ste( vecTC0, 0, &texCoords[index][0] );
- vec_ste( vecTC1, 0, &texCoords[index][1] );
- vec_ste( vecTC2, 0, &texCoords[index][2] );
- vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
-
- }
- }
- #endif /* set #if to 0 to disable spec coord */
- #if 1
- #ifdef VERTEXCACHE_ALIGNED
- /*
- ============
- idSIMD_AltiVec::CreateShadowCache
- ============
- */
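- // Each remapped vertex emits two idVec4s: ( xyz, 1 ) for the near cap and
- // ( xyz - lightOrigin, 0 ) for the copy projected away from the light;
- // the scalar cleanup loop at the end shows the reference math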
- int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
- int outVerts = 0;
- int i = 0;
-
- assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
- register vector float zeroVector = (vector float)(0.0);
- register vector float oneVector = (vector float)(1);
- register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
-
- const float *lPtr = lightOrigin.ToFloatPtr();
- const float *vPtr;
- const float *vPtr2;
- const float *vPtr3;
- const float *vPtr4;
-
- // put values into a vector
- vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
- v0 = vec_ld( 0, lPtr );
- v1 = vec_ld( 15, lPtr );
- v0 = vec_perm( v0, v1, vecPerm );
- v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
-
- //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
- for ( ; i+3 < numVerts; i+= 4 ) {
- if ( ! vertRemap[i] ) {
- vPtr = verts[i].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
- v2 = vec_ld( 0, vPtr );
- v3 = vec_ld( 15, vPtr );
- v7 = vec_perm( v2, v3, vecPerm2 );
- #else
- v7 = vec_ld( 0, vPtr );
- #endif
- v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
- v3 = vec_perm( v7, oneVector, vecPermZeroLast );
- v1 = vec_sub( v2, v0 );
-
- vec_st( v3, 0, &vertexCache[outVerts][0] );
- vec_st( v1, 0, &vertexCache[outVerts+1][0] );
- vertRemap[i] = outVerts;
- outVerts += 2;
- }
-
- if ( ! vertRemap[i+1] ) {
- vPtr2 = verts[i+1].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
- v4 = vec_ld( 0, vPtr2 );
- v5 = vec_ld( 15, vPtr2 );
- v6 = vec_perm( v4, v5, vecPerm3 );
- #else
- v6 = vec_ld( 0, vPtr2 );
- #endif
- v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
- v5 = vec_perm( v6, oneVector, vecPermZeroLast );
- v6 = vec_sub( v4, v0 );
-
- vec_st( v5, 0, &vertexCache[outVerts][0] );
- vec_st( v6, 0, &vertexCache[outVerts+1][0] );
-
- vertRemap[i+1] = outVerts;
- outVerts += 2;
- }
-
- if ( ! vertRemap[i+2] ) {
- vPtr3 = verts[i+2].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
- v1 = vec_ld( 0, vPtr3 );
- v2 = vec_ld( 15, vPtr3 );
- v3 = vec_perm( v1, v2, vecPerm4 );
- #else
- v3 = vec_ld( 0, vPtr3 );
- #endif
- v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
- v2 = vec_perm( v3, oneVector, vecPermZeroLast );
- v3 = vec_sub( v1, v0 );
- vec_st( v2, 0, &vertexCache[outVerts][0] );
- vec_st( v3, 0, &vertexCache[outVerts+1][0] );
-
- vertRemap[i+2] = outVerts;
- outVerts += 2;
- }
-
- if ( ! vertRemap[i+3] ) {
- vPtr4 = verts[i+3].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
- v4 = vec_ld( 0, vPtr4 );
- v5 = vec_ld( 16, vPtr4 );
- v6 = vec_perm( v4, v5, vecPerm5 );
- #else
- v6 = vec_ld( 0, vPtr4 );
- #endif
- v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
- v5 = vec_perm( v6, oneVector, vecPermZeroLast );
- v6 = vec_sub( v4, v0 );
-
- vec_st( v5, 0, &vertexCache[outVerts][0] );
- vec_st( v6, 0, &vertexCache[outVerts+1][0] );
-
- vertRemap[i+3] = outVerts;
- outVerts += 2;
- }
- }
- // cleanup
- for (; i < numVerts; i++ ) {
- if ( vertRemap[i] ) {
- continue;
- }
- const float *v = verts[i].xyz.ToFloatPtr();
- vertexCache[outVerts+0][0] = v[0];
- vertexCache[outVerts+0][1] = v[1];
- vertexCache[outVerts+0][2] = v[2];
- vertexCache[outVerts+0][3] = 1.0f;
- // R_SetupProjection() builds the projection matrix with a slight crunch
- // for depth, which keeps this w=0 division from rasterizing right at the
- // wrap around point and causing depth fighting with the rear caps
- vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
- vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
- vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
- vertexCache[outVerts+1][3] = 0.0f;
- vertRemap[i] = outVerts;
- outVerts += 2;
- }
- return outVerts;
- }
- #else
- /*
- ============
- idSIMD_AltiVec::CreateShadowCache
- ============
- */
- int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
- int outVerts = 0;
- int i = 0;
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
- register vector float zeroVector = (vector float)(0.0);
- register vector float oneVector = (vector float)(1);
- register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
-
- const float *lPtr = lightOrigin.ToFloatPtr();
- const float *vPtr;
- const float *vPtr2;
- const float *vPtr3;
- const float *vPtr4;
-
- // put values into a vector
- vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
- v0 = vec_ld( 0, lPtr );
- v1 = vec_ld( 15, lPtr );
- v0 = vec_perm( v0, v1, vecPerm );
- v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
-
- //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
- for ( ; i+3 < numVerts; i+= 4 ) {
- if ( ! vertRemap[i] ) {
- vPtr = verts[i].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
- v2 = vec_ld( 0, vPtr );
- v3 = vec_ld( 15, vPtr );
- v7 = vec_perm( v2, v3, vecPerm2 );
- #else
- v7 = vec_ld( 0, vPtr );
- #endif
- v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
- v3 = vec_perm( v7, oneVector, vecPermZeroLast );
- v1 = vec_sub( v2, v0 );
-
- // store results
- UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
-
- vertRemap[i] = outVerts;
- outVerts += 2;
- }
-
- if ( ! vertRemap[i+1] ) {
- vPtr2 = verts[i+1].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
- v4 = vec_ld( 0, vPtr2 );
- v5 = vec_ld( 15, vPtr2 );
- v6 = vec_perm( v4, v5, vecPerm3 );
- #else
- v6 = vec_ld( 0, vPtr2 );
- #endif
- v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
- v5 = vec_perm( v6, oneVector, vecPermZeroLast );
- v6 = vec_sub( v4, v0 );
- // store results
- UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
-
- vertRemap[i+1] = outVerts;
- outVerts += 2;
- }
-
- if ( ! vertRemap[i+2] ) {
- vPtr3 = verts[i+2].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
- v1 = vec_ld( 0, vPtr3 );
- v2 = vec_ld( 15, vPtr3 );
- v3 = vec_perm( v1, v2, vecPerm4 );
- #else
- v3 = vec_ld( 0, vPtr3 );
- #endif
- v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
- v2 = vec_perm( v3, oneVector, vecPermZeroLast );
- v3 = vec_sub( v1, v0 );
-
- // store results
- UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
-
- vertRemap[i+2] = outVerts;
- outVerts += 2;
- }
- if ( ! vertRemap[i+3] ) {
- vPtr4 = verts[i+3].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
- v4 = vec_ld( 0, vPtr4 );
- v5 = vec_ld( 16, vPtr4 );
- v6 = vec_perm( v4, v5, vecPerm5 );
- #else
- v6 = vec_ld( 0, vPtr4 );
- #endif
- v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
- v5 = vec_perm( v6, oneVector, vecPermZeroLast );
- v6 = vec_sub( v4, v0 );
-
- // store results
- UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
-
-
- vertRemap[i+3] = outVerts;
- outVerts += 2;
- }
- }
- // cleanup
- for (; i < numVerts; i++ ) {
- if ( vertRemap[i] ) {
- continue;
- }
- const float *v = verts[i].xyz.ToFloatPtr();
- vertexCache[outVerts+0][0] = v[0];
- vertexCache[outVerts+0][1] = v[1];
- vertexCache[outVerts+0][2] = v[2];
- vertexCache[outVerts+0][3] = 1.0f;
- // R_SetupProjection() builds the projection matrix with a slight crunch
- // for depth, which keeps this w=0 division from rasterizing right at the
- // wrap around point and causing depth fighting with the rear caps
- vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
- vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
- vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
- vertexCache[outVerts+1][3] = 0.0f;
- vertRemap[i] = outVerts;
- outVerts += 2;
- }
- return outVerts;
- }
- #endif /* VERTEXCACHE_ALIGNED */
- #endif /* set #if to 0 to disable shadow cache */
- #if 1
- #ifdef VERTEXCACHE_ALIGNED
- /*
- ============
- idSIMD_AltiVec::CreateVertexProgramShadowCache
- ============
- */
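- // Every vertex emits two idVec4s, ( xyz, 1 ) and ( xyz, 0 ); the projection
- // away from the light is left to the vertex program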
- int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
- // vertexCache aligned
- assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
- // idVec4 size
- assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float zeroVector = (vector float)(0.0);
- register vector float oneVector = (vector float)(1);
- register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
- int i = 0;
- #ifndef DRAWVERT_PADDED
- // every fourth one will have the same alignment. Make sure we've got enough here
- if ( i+3 < numVerts ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
- #endif
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
-
- #ifndef DRAWVERT_PADDED
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 15, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 15, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 15, vertPtr4 );
- v0 = vec_perm( v0, v1, vertPerm1 );
- v1 = vec_perm( v2, v3, vertPerm2 );
- v2 = vec_perm( v4, v5, vertPerm3 );
- v3 = vec_perm( v6, v7, vertPerm4 );
- #else
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 0, vertPtr2 );
- v2 = vec_ld( 0, vertPtr3 );
- v3 = vec_ld( 0, vertPtr4 );
- #endif
- v0 = vec_perm( v0, oneVector, vecPermThreeOne );
- v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
-
- v1 = vec_perm( v1, oneVector, vecPermThreeOne );
- v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
-
- v2 = vec_perm( v2, oneVector, vecPermThreeOne );
- v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
-
- v3 = vec_perm( v3, oneVector, vecPermThreeOne );
- v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
- // store results
- ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
- ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- const float *v = verts[i].xyz.ToFloatPtr();
- vertexCache[i*2+0][0] = v[0];
- vertexCache[i*2+1][0] = v[0];
- vertexCache[i*2+0][1] = v[1];
- vertexCache[i*2+1][1] = v[1];
- vertexCache[i*2+0][2] = v[2];
- vertexCache[i*2+1][2] = v[2];
- vertexCache[i*2+0][3] = 1.0f;
- vertexCache[i*2+1][3] = 0.0f;
- }
- return numVerts * 2;
- }
- #else
- /*
- ============
- idSIMD_AltiVec::CreateVertexProgramShadowCache
- ============
- */
- int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
- // idDrawVert size
- assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
- // idVec4 size
- assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
-
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector float zeroVector = (vector float)(0.0);
- register vector float oneVector = (vector float)(1);
- register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
- vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
- int i = 0;
- #ifndef DRAWVERT_PADDED
- // every fourth vert has the same alignment, so the four permute vectors can be computed once up front; make sure there are at least four verts
- if ( i+3 < numVerts ) {
- vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
- }
- #endif
-
- for ( ; i+3 < numVerts; i+=4 ) {
- const float *vertPtr = verts[i].xyz.ToFloatPtr();
- const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
- const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
- const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
- #ifndef DRAWVERT_PADDED
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 15, vertPtr );
- v2 = vec_ld( 0, vertPtr2 );
- v3 = vec_ld( 15, vertPtr2 );
- v4 = vec_ld( 0, vertPtr3 );
- v5 = vec_ld( 15, vertPtr3 );
- v6 = vec_ld( 0, vertPtr4 );
- v7 = vec_ld( 15, vertPtr4 );
-
- v0 = vec_perm( v0, v1, vertPerm1 );
- v1 = vec_perm( v2, v3, vertPerm2 );
- v2 = vec_perm( v4, v5, vertPerm3 );
- v3 = vec_perm( v6, v7, vertPerm4 );
- #else
- v0 = vec_ld( 0, vertPtr );
- v1 = vec_ld( 0, vertPtr2 );
- v2 = vec_ld( 0, vertPtr3 );
- v3 = vec_ld( 0, vertPtr4 );
- #endif
- v0 = vec_perm( v0, oneVector, vecPermThreeOne );
- v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
-
- v1 = vec_perm( v1, oneVector, vecPermThreeOne );
- v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
-
- v2 = vec_perm( v2, oneVector, vecPermThreeOne );
- v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
-
- v3 = vec_perm( v3, oneVector, vecPermThreeOne );
- v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
-
- // store results to a possibly unaligned destination
- vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
- vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
- vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
- vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
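- // Standard AltiVec unaligned-store idiom: storePerm (derived from vec_lvsr) rotates each result
- // vector into the destination's alignment, and mask has zeros in the leading misaligned bytes
- // and ones after them. The first store merges the rotated data with the bytes already in memory
- // ahead of the destination (vc1), each middle store stitches the tail of one rotated vector to
- // the head of the next, and the final store merges with the original bytes past the end of the
- // range (vc2), so no memory outside the eight output vectors is touched.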
-
- // right rotate input data
- v0 = vec_perm( v0, v0, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v1 = vec_perm( v1, v1, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
- v2 = vec_perm( v2, v2, storePerm );
- v6 = vec_perm( v6, v6, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v7 = vec_perm( v7, v7, storePerm );
-
- vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
- vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
- }
- // cleanup
- for ( ; i < numVerts; i++ ) {
- const float *v = verts[i].xyz.ToFloatPtr();
- vertexCache[i*2+0][0] = v[0];
- vertexCache[i*2+1][0] = v[0];
- vertexCache[i*2+0][1] = v[1];
- vertexCache[i*2+1][1] = v[1];
- vertexCache[i*2+0][2] = v[2];
- vertexCache[i*2+1][2] = v[2];
- vertexCache[i*2+0][3] = 1.0f;
- vertexCache[i*2+1][3] = 0.0f;
- }
- return numVerts * 2;
- }
- #endif /* VERTEXCACHE_ALIGNED */
- #endif /* 0 to kill VP shader cache */
- #endif /* ENABLE_CREATE */
- #ifdef ENABLE_SOUND_ROUTINES
- #ifdef SOUND_DEST_ALIGNED
- /*
- ============
- idSIMD_AltiVec::UpSamplePCMTo44kHz
- Duplicate samples for 44kHz output.
-
- Assumptions:
- Assumes that dest starts at aligned address
- ============
- */
- void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
-
- // dest is aligned
- assert( IS_16BYTE_ALIGNED( dest[0] ) );
- vector signed short vs0, vs1;
- register vector signed int vi0, vi1;
- register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
- // permute vectors
- register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
- register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
-
- register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
- register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
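- // Permute constants for the duplication patterns below: vecFirstHalf/vecSecondHalf turn
- // (a,b,c,d) into (a,b,a,b) and (c,d,c,d) (repeat a stereo frame), while vecBottom/vecTop turn
- // it into (a,a,b,b) and (c,c,d,d) (repeat a mono sample); the 11 kHz mono branch uses
- // vec_splat for its four-way duplication instead.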
-
- // requiring at least 12 samples lets us skip the extra check that would otherwise be needed
- // before the first vector load ahead of the loop
- assert( numSamples >= 12 );
-
- if ( kHz == 11025 ) {
- if ( numChannels == 1 ) {
- // 8 at a time
- int i = 0;
-
- vector signed short vsOld = vec_ld( 0, &src[i] );
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
-
- for ( ; i+7 < numSamples; i+= 8 ) {
- // load src
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
-
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // permute into vectors in the order to store
- v2 = vec_splat( v0, 0 );
- v3 = vec_splat( v0, 1 );
- v4 = vec_splat( v0, 2 );
- v5 = vec_splat( v0, 3 );
- v6 = vec_splat( v1, 0 );
- v7 = vec_splat( v1, 1 );
- v8 = vec_splat( v1, 2 );
- v9 = vec_splat( v1, 3 );
- // store results
- ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
- }
- // cleanup
- for (; i < numSamples; i++ ) {
- dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
- }
- } else {
- int i = 0;
-
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( ; i+7 < numSamples; i += 8 ) {
- // load src
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
-
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // put into vectors in order to store
- v2 = vec_perm( v0, v0, vecFirstHalf );
- v3 = v2;
- v4 = vec_perm( v0, v0, vecSecondHalf );
- v5 = v4;
- v6 = vec_perm( v1, v1, vecFirstHalf );
- v7 = v6;
- v8 = vec_perm( v1, v1, vecSecondHalf );
- v9 = v8;
-
- // store results
- ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
- }
-
- for ( ; i < numSamples; i += 2 ) {
- dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
- dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
- }
- }
- } else if ( kHz == 22050 ) {
- if ( numChannels == 1 ) {
- int i;
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load src
- vs1 = vec_ld( 0, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
-
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // put into vectors in order to store
- v2 = vec_perm( v0, v0, vecBottom );
- v3 = vec_perm( v0, v0, vecTop );
- v4 = vec_perm( v1, v1, vecBottom );
- v5 = vec_perm( v1, v1, vecTop );
-
- // store results
- ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
- }
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
- }
- } else {
- int i;
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load src
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
-
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // put into vectors in order to store
- v2 = vec_perm( v0, v0, vecFirstHalf );
- v3 = vec_perm( v0, v0, vecSecondHalf );
- v4 = vec_perm( v1, v1, vecFirstHalf );
- v5 = vec_perm( v1, v1, vecSecondHalf );
-
- // store results
- ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
- }
- // cleanup
- for ( ; i < numSamples; i += 2 ) {
- dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
- dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
- }
- }
- } else if ( kHz == 44100 ) {
- int i;
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
-
- //unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
-
- //convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
-
- //store results
- ALIGNED_STORE2( &dest[i], v0, v1 );
- }
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i] = (float) src[i];
- }
- } else {
- assert( 0 );
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::UpSamplePCMTo44kHz
- Duplicate samples for 44kHz output.
-
- Assumptions:
- No assumptions
- ============
- */
- void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
-
- vector signed short vs0, vs1;
- register vector signed int vi0, vi1;
- register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
- // permute vectors
- register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
- register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
-
- register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
- register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
-
- // calculate perm vector and masks for stores
- vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
- // original values of dest
- vector float vecDest = vec_ld( 0, &dest[0] );
- vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
-
- if ( kHz == 11025 ) {
- if ( numChannels == 1 ) {
- // 8 at a time
- int i = 0;
-
- vector signed short vsOld = vec_ld( 0, &src[i] );
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
-
- for ( ; i+7 < numSamples; i+= 8 ) {
- // load src
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
- vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
-
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // permute into vectors in the order to store
- v2 = vec_splat( v0, 0 );
- v3 = vec_splat( v0, 1 );
- v4 = vec_splat( v0, 2 );
- v5 = vec_splat( v0, 3 );
- v6 = vec_splat( v1, 0 );
- v7 = vec_splat( v1, 1 );
- v8 = vec_splat( v1, 2 );
- v9 = vec_splat( v1, 3 );
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
- v6 = vec_perm( v6, v6, storePerm );
- v7 = vec_perm( v7, v7, storePerm );
- v8 = vec_perm( v8, v8, storePerm );
- v9 = vec_perm( v9, v9, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
- vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
- vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
- vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
- vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
- vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
- vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
- vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
- vecDest = vec_sel( v9, vecDestEnd, mask );
- vec_st( vecDest, 127, &dest[i*4] );
- }
- // cleanup
- for (; i < numSamples; i++ ) {
- dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
- }
- } else {
- int i = 0;
-
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( ; i+7 < numSamples; i += 8 ) {
- // load src
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
- vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // put into vectors in order to store
- v2 = vec_perm( v0, v0, vecFirstHalf );
- v3 = v2;
- v4 = vec_perm( v0, v0, vecSecondHalf );
- v5 = v4;
- v6 = vec_perm( v1, v1, vecFirstHalf );
- v7 = v6;
- v8 = vec_perm( v1, v1, vecSecondHalf );
- v9 = v8;
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
- v6 = vec_perm( v6, v6, storePerm );
- v7 = vec_perm( v7, v7, storePerm );
- v8 = vec_perm( v8, v8, storePerm );
- v9 = vec_perm( v9, v9, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
- vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
- vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
- vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
- vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
- vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
- vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
- vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
- vecDest = vec_sel( v9, vecDestEnd, mask );
- vec_st( vecDest, 127, &dest[i*4] );
- }
-
- for ( ; i < numSamples; i += 2 ) {
- dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
- dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
- }
- }
- } else if ( kHz == 22050 ) {
- if ( numChannels == 1 ) {
- int i;
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load src
- vs1 = vec_ld( 0, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
- vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
-
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // put into vectors in order to store
- v2 = vec_perm( v0, v0, vecBottom );
- v3 = vec_perm( v0, v0, vecTop );
- v4 = vec_perm( v1, v1, vecBottom );
- v5 = vec_perm( v1, v1, vecTop );
-
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
- vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
- vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
- vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
- vecDest = vec_sel( v5, vecDestEnd, mask );
- vec_st( vecDest, 63, &dest[i*2] );
-
- }
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
- }
- } else {
- int i;
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load src
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
- vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
- // unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
- // convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
- // put into vectors in order to store
- v2 = vec_perm( v0, v0, vecFirstHalf );
- v3 = vec_perm( v0, v0, vecSecondHalf );
- v4 = vec_perm( v1, v1, vecFirstHalf );
- v5 = vec_perm( v1, v1, vecSecondHalf );
-
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
- vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
- vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
- vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
- vecDest = vec_sel( v5, vecDestEnd, mask );
- vec_st( vecDest, 63, &dest[i*2] );
- }
- // cleanup
- for ( ; i < numSamples; i += 2 ) {
- dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
- dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
- }
- }
- } else if ( kHz == 44100 ) {
- int i;
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
- vector signed short vsOld = vec_ld( 0, &src[0] );
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- //vs0 = vec_ld( 0, &src[i] );
- vs1 = vec_ld( 15, &src[i] );
- vs0 = vec_perm( vsOld, vs1, permVec );
- vsOld = vs1;
- vector float vecDestEnd = vec_ld( 31, &dest[i] );
-
- //unpack shorts to ints
- vi0 = vec_unpackh( vs0 );
- vi1 = vec_unpackl( vs0 );
-
- //convert ints to floats
- v0 = vec_ctf( vi0, 0 );
- v1 = vec_ctf( vi1, 0 );
-
- v0 = vec_perm( v0, v0, storePerm );
- v1 = vec_perm( v1, v1, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
- vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
- vecDest = vec_sel( v1, vecDestEnd, mask );
- vec_st( vecDest, 31, &dest[i] );
- }
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i] = (float) src[i];
- }
- } else {
- assert( 0 );
- }
- }
- #endif
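- /*
- For reference, a minimal scalar sketch (not part of the original file) of the mapping both
- UpSamplePCMTo44kHz variants above vectorize; it simply mirrors their cleanup loops. Each
- source sample is converted to float and written 4x for 11025 Hz input, 2x for 22050 Hz and
- 1x for 44100 Hz, with stereo sources handled one frame (two shorts) at a time.
- */
- #if 0   // illustrative only
- static void UpSamplePCMTo44kHz_Scalar( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
-     int rep = 44100 / kHz;                              // 4, 2 or 1
-     if ( numChannels == 1 ) {
-         for ( int i = 0; i < numSamples; i++ ) {
-             for ( int j = 0; j < rep; j++ ) {
-                 dest[i*rep+j] = (float) src[i];
-             }
-         }
-     } else {
-         for ( int i = 0; i < numSamples; i += 2 ) {     // numSamples counts both channels
-             for ( int j = 0; j < rep; j++ ) {
-                 dest[i*rep+j*2+0] = (float) src[i+0];
-                 dest[i*rep+j*2+1] = (float) src[i+1];
-             }
-         }
-     }
- }
- #endif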
- #ifdef SOUND_DEST_ALIGNED
- /*
- ============
- idSIMD_AltiVec::UpSampleOGGTo44kHz
- Duplicate samples for 44kHz output.
-
- Assumptions:
- Assumes that dest starts at aligned address
- ============
- */
- void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
- // dest is aligned
- assert( IS_16BYTE_ALIGNED( dest[0] ) );
-
- register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
- register vector float constVec, zeroVector;
- register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
- vector unsigned char vecPerm1;
- vector unsigned char vecPerm2;
- vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
- vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
- vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
- vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
- vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
- vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
-
- constVec = (vector float)(32768.0f);
- zeroVector = (vector float)(0.0);
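- // The decoded OGG samples arrive as floats in roughly [-1,1] per channel, so constVec rescales
- // them by 32768 to the 16-bit PCM range the mixer uses. vecOneTwo/vecThreeFour duplicate mono
- // samples ((a,b,c,d) -> (a,a,b,b) and (c,c,d,d)) for the 22 kHz mono branch, while
- // vecFirst..vecFourth build one repeated L/R pair per vector ((L[k],R[k],L[k],R[k])) for the
- // 22 kHz stereo branch; the 11 kHz branches use vec_splat/vec_merge instead.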
-
- if ( kHz == 11025 ) {
- if ( numChannels == 1 ) {
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- v10 = vec_ld( 0, &ogg[0][0] );
-
- int i;
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
- v8 = v10;
- v9 = vec_ld( 15, &ogg[0][i] );
- v10 = vec_ld( 31, &ogg[0][i] );
- v0 = vec_perm( v8, v9, vecPerm1 );
- v1 = vec_perm( v9, v10, vecPerm1 );
-
- // now that we have the elements in a vector, we want
- // to splat each of them across its own vector
- oggVec1 = vec_splat( v0, 0 );
- oggVec2 = vec_splat( v0, 1 );
- oggVec3 = vec_splat( v0, 2 );
- oggVec4 = vec_splat( v0, 3 );
- oggVec5 = vec_splat( v1, 0 );
- oggVec6 = vec_splat( v1, 1 );
- oggVec7 = vec_splat( v1, 2 );
- oggVec8 = vec_splat( v1, 3 );
-
- v0 = vec_madd( oggVec1, constVec, zeroVector );
- v1 = vec_madd( oggVec2, constVec, zeroVector );
- v2 = vec_madd( oggVec3, constVec, zeroVector );
- v3 = vec_madd( oggVec4, constVec, zeroVector );
- v4 = vec_madd( oggVec5, constVec, zeroVector );
- v5 = vec_madd( oggVec6, constVec, zeroVector );
- v6 = vec_madd( oggVec7, constVec, zeroVector );
- v7 = vec_madd( oggVec8, constVec, zeroVector );
-
- //store results
- ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
- }
-
- //cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
- }
-
- } else {
-
- // calculate perm vec for ogg
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
- v7 = vec_ld( 0, &ogg[1][0] );
- v9 = vec_ld( 0, &ogg[0][0] );
- int i;
-
- for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // numSamples counts both channels, so this walks numSamples >> 1 frames, four per pass
- // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
- v8 = v9;
- v9 = vec_ld( 15, &ogg[0][i] );
- v0 = vec_perm( v8, v9, vecPerm1 );
-
- // now that we have the elements in a vector, we want
- // to splat each of them across its own vector
- oggVec1 = vec_splat( v0, 0 );
- oggVec2 = vec_splat( v0, 1 );
- oggVec3 = vec_splat( v0, 2 );
- oggVec4 = vec_splat( v0, 3 );
-
- // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
- v6 = v7;
- v7 = vec_ld( 15, &ogg[1][i] );
- v1 = vec_perm( v6, v7, vecPerm2 );
-
- // now that we have the elements in a vector, we want
- // to splat each of them across its own vector
- oggVec5 = vec_splat( v1, 0 );
- oggVec6 = vec_splat( v1, 1 );
- oggVec7 = vec_splat( v1, 2 );
- oggVec8 = vec_splat( v1, 3 );
-
- oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
- oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
- oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
- oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
- oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
- oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
- oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
- oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
-
- //merge generates the interleaved pattern that we want and it
- //doesn't require a permute vector, so use that instead
- v0 = vec_mergeh( oggVec1, oggVec5 );
- v1 = vec_mergel( oggVec1, oggVec5 );
- v2 = vec_mergeh( oggVec2, oggVec6 );
- v3 = vec_mergel( oggVec2, oggVec6 );
-
- v4 = vec_mergeh( oggVec3, oggVec7 );
- v5 = vec_mergel( oggVec3, oggVec7 );
- v6 = vec_mergeh( oggVec4, oggVec8 );
- v10 = vec_mergel( oggVec4, oggVec8 );
-
- //store results
- ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
- }
-
- //cleanup
- for ( ; i < numSamples >> 1; i++ ) {
- dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
- dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
- }
- }
- } else if ( kHz == 22050 ) {
- if ( numChannels == 1 ) {
-
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- v10 = vec_ld( 0, &ogg[0][0] );
-
- int i;
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load values from ogg
- v8 = v10;
- v9 = vec_ld( 15, &ogg[0][i] );
- v10 = vec_ld( 31, &ogg[0][i] );
- v0 = vec_perm( v8, v9, vecPerm1 );
- v1 = vec_perm( v9, v10, vecPerm1 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // permute into result vectors to store
- v5 = vec_perm( v0, v0, vecOneTwo );
- v6 = vec_perm( v0, v0, vecThreeFour );
- v7 = vec_perm( v1, v1, vecOneTwo );
- v8 = vec_perm( v1, v1, vecThreeFour );
-
- //store results
- ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
- }
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
- }
- } else {
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
- v7 = vec_ld( 0, &ogg[1][0] );
- v9 = vec_ld( 0, &ogg[0][0] );
-
- int i;
- for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
- // load ogg[0][i] to ogg[0][i+3]
- v8 = v9;
- v9 = vec_ld( 15, &ogg[0][i] );
- v0 = vec_perm( v8, v9, vecPerm1 );
-
- // load ogg[1][i] to ogg[1][i+3]
- v6 = v7;
- v7 = vec_ld( 15, &ogg[1][i] );
- v1 = vec_perm( v6, v7, vecPerm2 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // generate result vectors to store
- v2 = vec_perm( v0, v1, vecFirst );
- v3 = vec_perm( v0, v1, vecSecond );
- v4 = vec_perm( v0, v1, vecThird );
- v5 = vec_perm( v0, v1, vecFourth );
- // store results
- ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
- }
- // cleanup
- for ( ; i < numSamples >> 1; i++ ) {
- dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
- dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
- }
- }
- } else if ( kHz == 44100 ) {
- if ( numChannels == 1 ) {
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
-
- v9 = vec_ld( 0, &ogg[0][0] );
- int i;
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load values from ogg
- v8 = v9;
- v7 = vec_ld( 15, &ogg[0][i] );
- v6 = v7;
- v9 = vec_ld( 31, &ogg[0][i] );
-
- v0 = vec_perm( v8, v7, vecPerm1 );
- v1 = vec_perm( v6, v9, vecPerm1 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
- ALIGNED_STORE2( &dest[i], v0, v1 );
- }
-
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*1+0] = ogg[0][i] * 32768.0f;
- }
- } else {
-
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
- v7 = vec_ld( 0, &ogg[1][0] );
- v9 = vec_ld( 0, &ogg[0][0] );
- int i;
-
- for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
- v8 = v9;
- v9 = vec_ld( 15, &ogg[0][i] );
- v0 = vec_perm( v8, v9, vecPerm1 );
-
- // load ogg[1][i] to ogg[1][i+3]
- v6 = v7;
- v7 = vec_ld( 15, &ogg[1][i] );
- v1 = vec_perm( v6, v7, vecPerm2 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // generate result vectors
- v2 = vec_mergeh( v0, v1 );
- v3 = vec_mergel( v0, v1 );
-
- // store results
- ALIGNED_STORE2( &dest[i*2], v2, v3 );
- }
- // cleanup
- for ( ; i < numSamples >> 1; i++ ) {
- dest[i*2+0] = ogg[0][i] * 32768.0f;
- dest[i*2+1] = ogg[1][i] * 32768.0f;
- }
- }
- } else {
- assert( 0 );
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::UpSampleOGGTo44kHz
- Duplicate samples for 44kHz output.
-
- Assumptions:
- No assumptions
- ============
- */
- void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
-
- register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
- register vector float constVec, zeroVector;
- register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
- vector unsigned char vecPerm1;
- vector unsigned char vecPerm2;
- vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
- vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
- vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
- vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
- vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
- vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
-
- vector unsigned char storePerm;
-
- constVec = (vector float)(32768.0f);
- zeroVector = (vector float)(0.0);
-
- // calculate perm vector and masks for stores
- storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
- // original values of dest
- vector float vecDest = vec_ld( 0, &dest[0] );
- vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
-
- if ( kHz == 11025 ) {
- if ( numChannels == 1 ) {
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- v10 = vec_ld( 0, &ogg[0][0] );
-
- int i;
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
- v8 = v10;
- v9 = vec_ld( 15, &ogg[0][i] );
- v10 = vec_ld( 31, &ogg[0][i] );
- vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
- v0 = vec_perm( v8, v9, vecPerm1 );
- v1 = vec_perm( v9, v10, vecPerm1 );
-
- // now that we have the elements in a vector, we want
- // to splat each of them across its own vector
- oggVec1 = vec_splat( v0, 0 );
- oggVec2 = vec_splat( v0, 1 );
- oggVec3 = vec_splat( v0, 2 );
- oggVec4 = vec_splat( v0, 3 );
- oggVec5 = vec_splat( v1, 0 );
- oggVec6 = vec_splat( v1, 1 );
- oggVec7 = vec_splat( v1, 2 );
- oggVec8 = vec_splat( v1, 3 );
-
- v0 = vec_madd( oggVec1, constVec, zeroVector );
- v1 = vec_madd( oggVec2, constVec, zeroVector );
- v2 = vec_madd( oggVec3, constVec, zeroVector );
- v3 = vec_madd( oggVec4, constVec, zeroVector );
- v4 = vec_madd( oggVec5, constVec, zeroVector );
- v5 = vec_madd( oggVec6, constVec, zeroVector );
- v6 = vec_madd( oggVec7, constVec, zeroVector );
- v7 = vec_madd( oggVec8, constVec, zeroVector );
-
- // rotate input data
- v0 = vec_perm( v0, v0, storePerm );
- v1 = vec_perm( v1, v1, storePerm );
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
- v6 = vec_perm( v6, v6, storePerm );
- v7 = vec_perm( v7, v7, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
- vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
- vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
- vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
- vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
- vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
- vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
- vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
- vecDest = vec_sel( v7, vecDestEnd, mask );
- vec_st( vecDest, 127, &dest[i*4] );
- }
-
- //cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
- }
-
- } else {
-
- // calculate perm vec for ogg
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
- v7 = vec_ld( 0, &ogg[1][0] );
- v9 = vec_ld( 0, &ogg[0][0] );
- int i;
-
- for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // numSamples counts both channels, so this walks numSamples >> 1 frames, four per pass
- // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
- v8 = v9;
- v9 = vec_ld( 15, &ogg[0][i] );
- vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
- v0 = vec_perm( v8, v9, vecPerm1 );
-
- // now that we have the elements in a vector, we want
- // to splat each of them across its own vector
- oggVec1 = vec_splat( v0, 0 );
- oggVec2 = vec_splat( v0, 1 );
- oggVec3 = vec_splat( v0, 2 );
- oggVec4 = vec_splat( v0, 3 );
-
- // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
- v6 = v7;
- v7 = vec_ld( 15, &ogg[1][i] );
- v1 = vec_perm( v6, v7, vecPerm2 );
-
- // now that we have the elements in a vector, we want
- // to splat each of them across its own vector
- oggVec5 = vec_splat( v1, 0 );
- oggVec6 = vec_splat( v1, 1 );
- oggVec7 = vec_splat( v1, 2 );
- oggVec8 = vec_splat( v1, 3 );
-
- oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
- oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
- oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
- oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
- oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
- oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
- oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
- oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
-
- //merge generates the interleaved pattern that we want and it
- //doesn't require a permute vector, so use that instead
- v0 = vec_mergeh( oggVec1, oggVec5 );
- v1 = vec_mergel( oggVec1, oggVec5 );
- v2 = vec_mergeh( oggVec2, oggVec6 );
- v3 = vec_mergel( oggVec2, oggVec6 );
-
- v4 = vec_mergeh( oggVec3, oggVec7 );
- v5 = vec_mergel( oggVec3, oggVec7 );
- v6 = vec_mergeh( oggVec4, oggVec8 );
- v10 = vec_mergel( oggVec4, oggVec8 );
-
- // rotate input data
- v0 = vec_perm( v0, v0, storePerm );
- v1 = vec_perm( v1, v1, storePerm );
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
- v6 = vec_perm( v6, v6, storePerm );
- v10 = vec_perm( v10, v10, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
- vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
- vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
- vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
- vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
- vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
- vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
- vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
- vecDest = vec_sel( v10, vecDestEnd, mask );
- vec_st( vecDest, 127, &dest[i*8] );
- }
-
- //cleanup
- for ( ; i < numSamples >> 1; i++ ) {
- dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
- dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
- }
- }
- } else if ( kHz == 22050 ) {
- if ( numChannels == 1 ) {
-
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- v10 = vec_ld( 0, &ogg[0][0] );
-
- int i;
-
- for ( i = 0; i+7 < numSamples; i += 8 ) {
-
- // load values from ogg
- v8 = v10;
- v9 = vec_ld( 15, &ogg[0][i] );
- v10 = vec_ld( 31, &ogg[0][i] );
- vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
- v0 = vec_perm( v8, v9, vecPerm1 );
- v1 = vec_perm( v9, v10, vecPerm1 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // permute into result vectors to store
- v5 = vec_perm( v0, v0, vecOneTwo );
- v6 = vec_perm( v0, v0, vecThreeFour );
- v7 = vec_perm( v1, v1, vecOneTwo );
- v8 = vec_perm( v1, v1, vecThreeFour );
-
- // rotate input data
- v5 = vec_perm( v5, v5, storePerm );
- v6 = vec_perm( v6, v6, storePerm );
- v7 = vec_perm( v7, v7, storePerm );
- v8 = vec_perm( v8, v8, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
- vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
- vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
- vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
- vecDest = vec_sel( v8, vecDestEnd, mask );
- vec_st( vecDest, 63, &dest[i*2] );
- }
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
- }
- } else {
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
- v7 = vec_ld( 0, &ogg[1][0] );
- v9 = vec_ld( 0, &ogg[0][0] );
-
- int i;
- for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
- // load ogg[0][i] to ogg[0][i+3]
- v8 = v9;
- v9 = vec_ld( 15, &ogg[0][i] );
- vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
- v0 = vec_perm( v8, v9, vecPerm1 );
-
- // load ogg[1][i] to ogg[1][i+3]
- v6 = v7;
- v7 = vec_ld( 15, &ogg[1][i] );
- v1 = vec_perm( v6, v7, vecPerm2 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // generate result vectors to store
- v2 = vec_perm( v0, v1, vecFirst );
- v3 = vec_perm( v0, v1, vecSecond );
- v4 = vec_perm( v0, v1, vecThird );
- v5 = vec_perm( v0, v1, vecFourth );
-
- // rotate input data
- v2 = vec_perm( v2, v2, storePerm );
- v3 = vec_perm( v3, v3, storePerm );
- v4 = vec_perm( v4, v4, storePerm );
- v5 = vec_perm( v5, v5, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
- vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
- vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
- vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
- vecDest = vec_sel( v5, vecDestEnd, mask );
- vec_st( vecDest, 63, &dest[i*4] );
- }
-
- // cleanup
- for ( ; i < numSamples >> 1; i++ ) {
- dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
- dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
- }
- }
- } else if ( kHz == 44100 ) {
- if ( numChannels == 1 ) {
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
-
- v9 = vec_ld( 0, &ogg[0][0] );
- int i;
- for ( i = 0; i+7 < numSamples; i += 8 ) {
- // load values from ogg
- v8 = v9;
- v7 = vec_ld( 15, &ogg[0][i] );
- v6 = v7;
- v9 = vec_ld( 31, &ogg[0][i] );
- vector float vecDestEnd = vec_ld( 31, &dest[i] );
-
- v0 = vec_perm( v8, v7, vecPerm1 );
- v1 = vec_perm( v6, v9, vecPerm1 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // rotate data
- v0 = vec_perm( v0, v0, storePerm );
- v1 = vec_perm( v1, v1, storePerm );
-
- // store results
- vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
- vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
- vecDest = vec_sel( v1, vecDestEnd, mask );
- vec_st( vecDest, 31, &dest[i] );
- }
-
- // cleanup
- for ( ; i < numSamples; i++ ) {
- dest[i*1+0] = ogg[0][i] * 32768.0f;
- }
- } else {
-
- // calculate perm vector and do first load
- vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
- vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
- v7 = vec_ld( 0, &ogg[1][0] );
- v9 = vec_ld( 0, &ogg[0][0] );
- int i;
-
- for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
- v8 = v9;
- v9 = vec_ld( 15, &ogg[0][i] );
- v0 = vec_perm( v8, v9, vecPerm1 );
-
- // load ogg[1][i] to ogg[1][i+3]
- v6 = v7;
- v7 = vec_ld( 15, &ogg[1][i] );
- v1 = vec_perm( v6, v7, vecPerm2 );
-
- // multiply
- v0 = vec_madd( v0, constVec, zeroVector );
- v1 = vec_madd( v1, constVec, zeroVector );
-
- // generate result vectors
- v2 = vec_mergeh( v0, v1 );
- v3 = vec_mergel( v0, v1 );
-
- // store results
- UNALIGNED_STORE2( &dest[i*2], v2, v3 );
- }
- // cleanup
- for ( ; i < numSamples >> 1; i++ ) {
- dest[i*2+0] = ogg[0][i] * 32768.0f;
- dest[i*2+1] = ogg[1][i] * 32768.0f;
- }
- }
- } else {
- assert( 0 );
- }
- }
- #endif /* SOUND_DEST_ALIGNED */
- #ifdef SOUND_DEST_ALIGNED
- /*
- ============
- idSIMD_AltiVec::MixSoundTwoSpeakerMono
- Assumptions:
- Assumes that mixBuffer starts at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
- // mixBuffer is aligned
- assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
-
- int i;
- float inc[2];
- float spkr[4];
- register vector float vecInc;
- register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
- register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
- register vector float vecSamplesLd1, vecSamplesLd2;
- register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
-
- register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
- register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
- register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
- register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
-
- //constants
- vector float fourVec = (vector float)(4.0);
- vector float zeroVec = (vector float)(0.0);
-
- inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- spkr[0] = lastV[0];
- spkr[1] = lastV[1];
- spkr[2] = lastV[0] + inc[0];
- spkr[3] = lastV[1] + inc[1];
- assert( numSamples == MIXBUFFER_SAMPLES );
-
- inc[0] *= 2;
- inc[1] *= 2;
- //load data into registers
- vector float v0 = loadSplatUnalignedScalar( &inc[0] );
- vector float v1 = loadSplatUnalignedScalar( &inc[1] );
- vecInc = vec_mergeh( v0, v1 );
-
- vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
- vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
- vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
- vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
-
- // load spkr array
- v0 = vec_mergeh( v2, v4 );
- v1 = vec_mergeh( v3, v5 );
- vecSpeaker1 = vec_mergeh( v0, v1 );
-
- vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
- vecInc = vec_madd( vecInc, fourVec, zeroVec );
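- // vecSpeaker1..4 hold the ramped left/right volumes for output frames 0-1, 2-3, 4-5 and 6-7 of
- // the current block (interleaved (L,R,L,R) layout), and vecInc (after the multiply by four) is
- // the per-iteration step of eight frames. Each pass therefore accumulates 8 mono input samples
- // into 16 interleaved mixBuffer floats while the volumes ramp linearly from lastV toward
- // currentV.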
-
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
-
- //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
- //need a cleanup loop
- for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
- //load samples and mix buffers
- vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
- vecSamplesLd2 = vec_ld( 15, &samples[i] );
- vecSamplesLast = vec_ld( 31, &samples[i] );
-
- vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
- vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
-
- vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
- vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
- vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
- vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
- vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
- vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
- vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
- vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
- vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
- vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
-
- // store results
- ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
-
- //add for next iteration
- vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
- }
- }
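- /*
- Scalar sketch (for reference only, not part of the original file) of the operation the vector
- loop above performs, under the same numSamples == MIXBUFFER_SAMPLES assumption:
-
-     float sL0 = lastV[0], sL1 = lastV[1];
-     float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
-     float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
-     for ( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
-         mixBuffer[j*2+0] += samples[j] * sL0;
-         mixBuffer[j*2+1] += samples[j] * sL1;
-         sL0 += incL0;
-         sL1 += incL1;
-     }
- */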
- #else
- /*
- ============
- idSIMD_AltiVec::MixSoundTwoSpeakerMono
- Assumptions:
- No assumptions
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
-
- int i;
- float inc[2];
- float spkr[4];
- register vector float vecInc;
- register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
- register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
- register vector float vecSamplesLd1, vecSamplesLd2;
- register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
-
- register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
- register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
- register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
- register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
-
- //constants
- vector float fourVec = (vector float)(4.0);
- vector float zeroVec = (vector float)(0.0);
-
- inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- spkr[0] = lastV[0];
- spkr[1] = lastV[1];
- spkr[2] = lastV[0] + inc[0];
- spkr[3] = lastV[1] + inc[1];
- assert( numSamples == MIXBUFFER_SAMPLES );
-
- inc[0] *= 2;
- inc[1] *= 2;
- //load data into registers
- vector float v0 = loadSplatUnalignedScalar( &inc[0] );
- vector float v1 = loadSplatUnalignedScalar( &inc[1] );
- vecInc = vec_mergeh( v0, v1 );
-
- vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
- vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
- vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
- vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
-
- // load spkr array
- v0 = vec_mergeh( v2, v4 );
- v1 = vec_mergeh( v3, v5 );
- vecSpeaker1 = vec_mergeh( v0, v1 );
- vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
- vecInc = vec_madd( vecInc, fourVec, zeroVec );
-
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
- vector float vecDest = vec_ld( 0, &mixBuffer[0] );
-
- //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
- //need a cleanup loop
- for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
- //load samples and mix buffers
- vecSamplesLd1 = vecSamplesLast;
- vecSamplesLd2 = vec_ld( 15, &samples[i] );
- vecSamplesLast = vec_ld( 31, &samples[i] );
-
- vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
- vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
-
- vecMixBuffer1 = vecDest;
- vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
- vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
- vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
- vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
- vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
- vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
- vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
- vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
- vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
- vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
- vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
- vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
- vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
- vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
-
- // store results
- UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
-
- //add for next iteration
- vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
- }
- }
- #endif /* SOUND_DEST_ALIGNED */
- #ifdef SOUND_DEST_ALIGNED
- /*
- ============
- idSIMD_AltiVec::MixSoundTwoSpeakerStereo
- Assumptions:
- Assumes that mixBuffer starts at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
- // mixBuffer is aligned
- assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
-
- int i, k;
- float inc[2];
- float spkr[4];
-
- // mix buffer vectors
- register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
- // sample vectors
- register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
- register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
- register vector float vecInc;
- vector float fourVec = (vector float)(4.0);
- vector float zeroVec = (vector float)(0.0);
-
- assert( numSamples == MIXBUFFER_SAMPLES );
- inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- spkr[0] = lastV[0];
- spkr[1] = lastV[1];
- spkr[2] = lastV[0] + inc[0];
- spkr[3] = lastV[1] + inc[1];
- for ( k = 0; k < 2; k++ ) {
- inc[k] *= 2;
- }
- // load data in vectors
- vector float v0 = loadSplatUnalignedScalar( &inc[0] );
- vector float v1 = loadSplatUnalignedScalar( &inc[1] );
- vecInc = vec_mergeh( v0, v1 );
-
- vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
- vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
- vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
- vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
-
- // load spkr array
- v0 = vec_mergeh( v2, v4 );
- v1 = vec_mergeh( v3, v5 );
- vecSpeaker1 = vec_mergeh( v0, v1 );
- vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
- vecInc = vec_madd( vecInc, fourVec, zeroVec );
-
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
-
- //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
- //need a cleanup loop
- for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
- // load mix buffers and samples
- vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
- vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
- vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
- vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
-
- vecSamples1 = vecSamplesLast;
- vecSamples2 = vec_ld( 15, &samples[i*2] );
- vecSamples3 = vec_ld( 31, &samples[i*2] );
- vecSamples4 = vec_ld( 47, &samples[i*2] );
- vecSamplesLast = vec_ld( 63, &samples[i*2] );
-
- vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
- vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
- vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
- vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
-
- vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
- vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
-
- vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
- //store results
- ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
- }
- }
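- // The stereo variant differs from the mono one above only in its input: samples[] is already
- // interleaved L/R, so each output float is a sample multiplied by its ramped volume rather
- // than a splatted mono sample, using the same (L,R,L,R) volume vectors and the same
- // eight-frames-per-iteration step.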
- #else
- /*
- ============
- idSIMD_AltiVec::MixSoundTwoSpeakerStereo
- Assumptions:
- No assumptions
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
-
- int i, k;
- float inc[2];
- float spkr[4];
- // mix buffer vectors
- register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
- // sample vectors
- register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
- register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
- register vector float vecInc;
- vector float fourVec = (vector float)(4.0);
- vector float zeroVec = (vector float)(0.0);
-
- assert( numSamples == MIXBUFFER_SAMPLES );
- inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- spkr[0] = lastV[0];
- spkr[1] = lastV[1];
- spkr[2] = lastV[0] + inc[0];
- spkr[3] = lastV[1] + inc[1];
- for ( k = 0; k < 2; k++ ) {
- inc[k] *= 2;
- }
- // load data in vectors
- vector float v0 = loadSplatUnalignedScalar( &inc[0] );
- vector float v1 = loadSplatUnalignedScalar( &inc[1] );
- vecInc = vec_mergeh( v0, v1 );
-
- vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
- vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
- vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
- vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
-
- // load spkr array
- v0 = vec_mergeh( v2, v4 );
- v1 = vec_mergeh( v3, v5 );
- vecSpeaker1 = vec_mergeh( v0, v1 );
- vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
- vecInc = vec_madd( vecInc, fourVec, zeroVec );
-
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
- vector float vecDest = vec_ld( 0, &mixBuffer[0] );
-
- //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
- //need a cleanup loop
- for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
- // load mix buffers and samples
- vecMixBuffer1 = vecDest;
- vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
- vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
- vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
- vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
- vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
- vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
- vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
- vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
-
- vecSamples1 = vecSamplesLast;
- vecSamples2 = vec_ld( 15, &samples[i*2] );
- vecSamples3 = vec_ld( 31, &samples[i*2] );
- vecSamples4 = vec_ld( 47, &samples[i*2] );
- vecSamplesLast = vec_ld( 63, &samples[i*2] );
-
- vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
- vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
- vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
- vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
-
- vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
- vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
-
- vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
- vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
- vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
- vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
- // store results
- UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
- }
- }
- #endif /* SOUND_DEST_ALIGNED */
- #ifdef SOUND_DEST_ALIGNED
- /*
- ============
- idSIMD_AltiVec::MixSoundSixSpeakerMono
- Assumptions:
- Assumes that mixBuffer starts at aligned address
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
-
- // mixBuffer is aligned
- assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
-
- float incL[24];
- float sL[24];
- int i, k;
-
- vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
- vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
- vector float vecSamplesLd;
- vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
- vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
- // permute vectors for sample
- vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
- vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
- assert( numSamples == MIXBUFFER_SAMPLES );
- assert( SPEAKER_RIGHT == 1 );
- assert( SPEAKER_BACKRIGHT == 5 );
-
- // incL array, 6 elements repeated
- incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
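- // each channel's gain ramps linearly from lastV[c] toward currentV[c] over the buffer; incL[c] is the per-sample step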
-
- // sL array repeated
- for ( k = 0; k < 6; k++ ) {
- sL[k] = lastV[k];
- }
- for ( k = 6; k < 12; k++ ) {
- sL[k] = lastV[k-6] + incL[k];
- }
- for ( k = 12; k < 18; k++ ) {
- sL[k] = lastV[k-12] + incL[k] + incL[k];
- }
- for ( k = 18; k < 24; k++ ) {
- sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
- }
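- // sL now holds the gains for four consecutive samples: sL[6*j + c] = lastV[c] + j * incL[c] for j = 0..3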
-
- // multiply by 4 since each loop iteration below advances by 4 samples ( 24 mix buffer floats )
- for ( k = 0; k < 24; k++ ) {
- incL[k] *= 4;
- }
- //load the data
- vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
- vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
-
- vecIncl1 = vec_ld( 0, &incL[0] );
- vecIncl2 = vec_ld( 15, &incL[0] );
- vecIncl3 = vec_ld( 31, &incL[0] );
- vecIncl4 = vec_ld( 47, &incL[0] );
- vecIncl5 = vec_ld( 63, &incL[0] );
- vecIncl6 = vec_ld( 79, &incL[0] );
- vecIncl7 = vec_ld( 95, &incL[0] );
-
- vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
- vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
- vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
- vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
- vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
- vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
-
- vecSL1 = vec_ld( 0, &sL[0] );
- vecSL2 = vec_ld( 15, &sL[0] );
- vecSL3 = vec_ld( 31, &sL[0] );
- vecSL4 = vec_ld( 47, &sL[0] );
- vecSL5 = vec_ld( 63, &sL[0] );
- vecSL6 = vec_ld( 79, &sL[0] );
- vecSL7 = vec_ld( 95, &sL[0] );
-
- vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
- vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
- vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
- vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
- vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
- vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
-
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
- //since MIXBUFFER_SAMPLES is a multiple of 4, we don't
- //need a cleanup loop
- for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
- //load mix buffer into vectors, assume aligned
- vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
- vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
- vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
- vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
- vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
- vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
-
- //load samples into vector
- vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
- vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
- vecSamplesLast = vecSamplesLd2;
-
- //permute to get them ordered how we want
- vecSamples1 = vec_splat( vecSamplesLd, 0 );
- vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
- vecSamples3 = vec_splat( vecSamplesLd, 1 );
- vecSamples4 = vec_splat( vecSamplesLd, 2 );
- vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
- vecSamples6 = vec_splat( vecSamplesLd, 3 );
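- // vecSamples1..6 now hold each of the four mono samples replicated across its six output channels: ( s0 x6, s1 x6, s2 x6, s3 x6 )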
- //do calculation
- vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
- vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
- vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
- vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
-
- //store out results
- ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
- // add for next iteration
- vecSL1 = vec_add( vecSL1, vecIncl1 );
- vecSL2 = vec_add( vecSL2, vecIncl2 );
- vecSL3 = vec_add( vecSL3, vecIncl3 );
- vecSL4 = vec_add( vecSL4, vecIncl4 );
- vecSL5 = vec_add( vecSL5, vecIncl5 );
- vecSL6 = vec_add( vecSL6, vecIncl6 );
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::MixSoundSixSpeakerMono
- Assumptions:
- No assumptions
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
-
- float incL[24];
- float sL[24];
- int i, k;
- vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
- vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
- vector float vecSamplesLd;
- vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
- vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
- // permute vectors for sample
- register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
- register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
- assert( numSamples == MIXBUFFER_SAMPLES );
- assert( SPEAKER_RIGHT == 1 );
- assert( SPEAKER_BACKRIGHT == 5 );
-
- // incL array, 6 elements repeated
- incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
-
- // sL array repeated
- for ( k = 0; k < 6; k++ ) {
- sL[k] = lastV[k];
- }
- for ( k = 6; k < 12; k++ ) {
- sL[k] = lastV[k-6] + incL[k];
- }
- for ( k = 12; k < 18; k++ ) {
- sL[k] = lastV[k-12] + incL[k] + incL[k];
- }
- for ( k = 18; k < 24; k++ ) {
- sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
- }
-
- // multiply by 4 since each loop iteration below advances by 4 samples ( 24 mix buffer floats )
- for ( k = 0; k < 24; k++ ) {
- incL[k] *= 4;
- }
- // load the data
- vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
- vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
-
- vecIncl1 = vec_ld( 0, &incL[0] );
- vecIncl2 = vec_ld( 15, &incL[0] );
- vecIncl3 = vec_ld( 31, &incL[0] );
- vecIncl4 = vec_ld( 47, &incL[0] );
- vecIncl5 = vec_ld( 63, &incL[0] );
- vecIncl6 = vec_ld( 79, &incL[0] );
- vecIncl7 = vec_ld( 95, &incL[0] );
-
- vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
- vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
- vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
- vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
- vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
- vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
-
- vecSL1 = vec_ld( 0, &sL[0] );
- vecSL2 = vec_ld( 15, &sL[0] );
- vecSL3 = vec_ld( 31, &sL[0] );
- vecSL4 = vec_ld( 47, &sL[0] );
- vecSL5 = vec_ld( 63, &sL[0] );
- vecSL6 = vec_ld( 79, &sL[0] );
- vecSL7 = vec_ld( 95, &sL[0] );
-
- vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
- vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
- vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
- vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
- vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
- vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
- vector float vecDest = vec_ld( 0, &mixBuffer[0] );
- //since MIXBUFFER_SAMPLES is a multiple of 4, we don't
- //need a cleanup loop
- for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
- //load mix buffer into vectors
- vecMixBuffer1 = vecDest;
- vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
- vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
- vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
- vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
- vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
- vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
-
- vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
- vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
- vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
- vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
- vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
- vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
-
- //load samples into vector
- vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
- vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
- vecSamplesLast = vecSamplesLd2;
-
- //permute to get them ordered how we want
- vecSamples1 = vec_splat( vecSamplesLd, 0 );
- vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
- vecSamples3 = vec_splat( vecSamplesLd, 1 );
- vecSamples4 = vec_splat( vecSamplesLd, 2 );
- vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
- vecSamples6 = vec_splat( vecSamplesLd, 3 );
- //do calculation
- vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
- vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
- vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
- vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
-
- // store results
- UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
- // add for next iteration
- vecSL1 = vec_add( vecSL1, vecIncl1 );
- vecSL2 = vec_add( vecSL2, vecIncl2 );
- vecSL3 = vec_add( vecSL3, vecIncl3 );
- vecSL4 = vec_add( vecSL4, vecIncl4 );
- vecSL5 = vec_add( vecSL5, vecIncl5 );
- vecSL6 = vec_add( vecSL6, vecIncl6 );
- // carry the quadword loaded at offset 95 into the next iteration's vecMixBuffer1
- vecDest = vecDestEnd;
- }
- }
- #endif /* SOUND_DEST_ALIGNED */
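- /*
- For reference, the vector code above computes the same result as this scalar loop: each mono
- sample is scaled by six per-speaker gains that ramp linearly from lastV[] toward currentV[]
- across the buffer, and the products are accumulated into the interleaved six-channel mix
- buffer. A sketch under those assumptions; the function name is illustrative and nothing calls it:
- */
- static void MixSoundSixSpeakerMonoScalarSketch( float *mixBuffer, const float *samples,
- const int numSamples, const float lastV[6], const float currentV[6] ) {
- float gain[6], inc[6];
- for ( int c = 0; c < 6; c++ ) {
- inc[c] = ( currentV[c] - lastV[c] ) / numSamples;
- gain[c] = lastV[c];
- }
- for ( int i = 0; i < numSamples; i++ ) {
- for ( int c = 0; c < 6; c++ ) {
- mixBuffer[i*6 + c] += samples[i] * gain[c]; // accumulate into interleaved channel c
- gain[c] += inc[c]; // ramp the gain once per sample
- }
- }
- }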
- #ifdef SOUND_DEST_ALIGNED
- /*
- ============
- idSIMD_AltiVec::MixSoundSixSpeakerStereo
- Assumptions:
- Assumes that mixBuffer starts at aligned address
- ============
- */
-
- void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
- // mixBuffer is aligned
- assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
-
- float incL[12];
- float sL[12];
- int i;
- vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
- vector float vecSL1, vecSL2, vecSL3, vecSL4;
- vector float vecSamplesLd;
- vector float vecSamples1, vecSamples2, vecSamples3;
- vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
- // permute vectors for sample
- vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
- vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
- assert( numSamples == MIXBUFFER_SAMPLES );
- assert( SPEAKER_RIGHT == 1 );
- assert( SPEAKER_BACKRIGHT == 5 );
-
- // incL array, 6 elements repeated
- incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
-
- // sL array repeated
- sL[0] = lastV[0];
- sL[1] = lastV[1];
- sL[2] = lastV[2];
- sL[3] = lastV[3];
- sL[4] = lastV[4];
- sL[5] = lastV[5];
- sL[6] = lastV[0] + incL[0];
- sL[7] = lastV[1] + incL[1];
- sL[8] = lastV[2] + incL[2];
- sL[9] = lastV[3] + incL[3];
- sL[10] = lastV[4] + incL[4];
- sL[11] = lastV[5] + incL[5];
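- // sL[0..5] are the channel gains for sample i, sL[6..11] the gains for sample i+1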
-
- // multiply by 2 since each loop iteration below advances by 2 samples ( 12 mix buffer floats )
- incL[0] *= 2;
- incL[1] *= 2;
- incL[2] *= 2;
- incL[3] *= 2;
- incL[4] *= 2;
- incL[5] *= 2;
- incL[6] *= 2;
- incL[7] *= 2;
- incL[8] *= 2;
- incL[9] *= 2;
- incL[10] *= 2;
- incL[11] *= 2;
-
- // incL and sL are stack arrays with no alignment guarantee, so use the lvsl/perm unaligned loads
- vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
- vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
- vecIncl1 = vec_ld( 0, &incL[0] );
- vecIncl2 = vec_ld( 15, &incL[0] );
- vecIncl3 = vec_ld( 31, &incL[0] );
- vecIncl4 = vec_ld( 47, &incL[0] );
-
- vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
- vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
- vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
-
- vecSL1 = vec_ld( 0, &sL[0] );
- vecSL2 = vec_ld( 15, &sL[0] );
- vecSL3 = vec_ld( 31, &sL[0] );
- vecSL4 = vec_ld( 47, &sL[0] );
-
- vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
- vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
- vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
-
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
- for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
-
- //load mix buffer into vectors, assume aligned
- vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
- vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
- vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
-
- //load samples into vector
- vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
- vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
- vecSamplesLast = vecSamplesLd2;
-
- //permute to get them ordered how we want. For the 2nd vector,
- //the order happens to be the same as the order we loaded them
- //in, so there's no need to permute that one
- vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
- vecSamples2 = vecSamplesLd;
- vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
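- // per sample, channels 0, 2, 3 and 4 receive the left input and channels 1 and 5 the right input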
- //do calculation
- vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
-
- //store out results
- ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
- // add for next iteration
- vecSL1 = vec_add( vecSL1, vecIncl1 );
- vecSL2 = vec_add( vecSL2, vecIncl2 );
- vecSL3 = vec_add( vecSL3, vecIncl3 );
- }
- }
- #else
- /*
- ============
- idSIMD_AltiVec::MixSoundSixSpeakerStereo
- Assumptions:
- No assumptions
- ============
- */
- void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
-
- float incL[12];
- float sL[12];
-
- int i;
- vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
- vector float vecSL1, vecSL2, vecSL3, vecSL4;
- vector float vecSamplesLd;
- vector float vecSamples1, vecSamples2, vecSamples3;
- vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
- // permute vectors for sample
- vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
- vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
- assert( numSamples == MIXBUFFER_SAMPLES );
- assert( SPEAKER_RIGHT == 1 );
- assert( SPEAKER_BACKRIGHT == 5 );
-
- // incL array, 6 elements repeated
- incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
- incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
- incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
- incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
- incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
- incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
-
- // sL array repeated
- sL[0] = lastV[0];
- sL[1] = lastV[1];
- sL[2] = lastV[2];
- sL[3] = lastV[3];
- sL[4] = lastV[4];
- sL[5] = lastV[5];
- sL[6] = lastV[0] + incL[0];
- sL[7] = lastV[1] + incL[1];
- sL[8] = lastV[2] + incL[2];
- sL[9] = lastV[3] + incL[3];
- sL[10] = lastV[4] + incL[4];
- sL[11] = lastV[5] + incL[5];
-
- // multiply by 2 since each loop iteration below advances by 2 samples ( 12 mix buffer floats )
- incL[0] *= 2;
- incL[1] *= 2;
- incL[2] *= 2;
- incL[3] *= 2;
- incL[4] *= 2;
- incL[5] *= 2;
- incL[6] *= 2;
- incL[7] *= 2;
- incL[8] *= 2;
- incL[9] *= 2;
- incL[10] *= 2;
- incL[11] *= 2;
-
- // load the data
- vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
- vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
- vecIncl1 = vec_ld( 0, &incL[0] );
- vecIncl2 = vec_ld( 15, &incL[0] );
- vecIncl3 = vec_ld( 31, &incL[0] );
- vecIncl4 = vec_ld( 47, &incL[0] );
-
- vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
- vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
- vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
-
- vecSL1 = vec_ld( 0, &sL[0] );
- vecSL2 = vec_ld( 15, &sL[0] );
- vecSL3 = vec_ld( 31, &sL[0] );
- vecSL4 = vec_ld( 47, &sL[0] );
-
- vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
- vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
- vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
- vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
- vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
- vector float vecSamplesLast = vec_ld( 0, &samples[0] );
- vector float vecDest = vec_ld( 0, &mixBuffer[0] );
- for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
-
- //load mix buffer into vectors
- vecMixBuffer1 = vecDest;
- vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
- vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
- vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
-
- vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
- vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
- vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
-
- //load samples into vector
- vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
- vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
- vecSamplesLast = vecSamplesLd2;
-
- //permute to get them ordered how we want. For the 2nd vector,
- //the order happens to be the same as the order we loaded them
- //in, so there's no need to permute that one
- vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
- vecSamples2 = vecSamplesLd;
- vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
- //do calculation
- vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
- vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
- vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
-
- // store results
- UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
- // add for next iteration
- vecSL1 = vec_add( vecSL1, vecIncl1 );
- vecSL2 = vec_add( vecSL2, vecIncl2 );
- vecSL3 = vec_add( vecSL3, vecIncl3 );
- // carry the quadword loaded at offset 47 into the next iteration's vecMixBuffer1
- vecDest = vecDestEnd;
- }
- }
- #endif
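- /*
- For reference, the six-speaker stereo mix above is equivalent to the scalar loop below. The
- channel mapping is inferred from the permute vectors: channels 0, 2, 3 and 4 take the left
- input sample while channels 1 (SPEAKER_RIGHT) and 5 (SPEAKER_BACKRIGHT) take the right input
- sample, each scaled by a linearly ramping per-channel gain. The function name is illustrative
- and nothing calls it:
- */
- static void MixSoundSixSpeakerStereoScalarSketch( float *mixBuffer, const float *samples,
- const int numSamples, const float lastV[6], const float currentV[6] ) {
- float gain[6], inc[6];
- for ( int c = 0; c < 6; c++ ) {
- inc[c] = ( currentV[c] - lastV[c] ) / numSamples;
- gain[c] = lastV[c];
- }
- for ( int i = 0; i < numSamples; i++ ) {
- const float left = samples[i*2+0];
- const float right = samples[i*2+1];
- mixBuffer[i*6+0] += left * gain[0];
- mixBuffer[i*6+1] += right * gain[1]; // SPEAKER_RIGHT
- mixBuffer[i*6+2] += left * gain[2];
- mixBuffer[i*6+3] += left * gain[3];
- mixBuffer[i*6+4] += left * gain[4];
- mixBuffer[i*6+5] += right * gain[5]; // SPEAKER_BACKRIGHT
- for ( int c = 0; c < 6; c++ ) {
- gain[c] += inc[c];
- }
- }
- }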
- /*
- ============
- idSIMD_AltiVec::MixedSoundToSamples
- ============
- */
- void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
- // convert the float mix buffer to 16-bit samples, clamping each value to [ -32768, 32767 ]
- register vector float v0, v1, v2, v3, v4, v5, v6, v7;
- register vector signed int vi0, vi1, vi2, vi3;
- register vector signed short vs0, vs1;
- register vector float minVec, maxVec, constVec;
- int i = 0;
- // scalar loop until samples[i] reaches a 16-byte boundary ( samples is not guaranteed to be aligned )
- for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
- samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
- }
-
- constVec = (vector float)(65536.0f);
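- // note: constVec is set up here but never referenced in the loop below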
- //splat min/max into a vector
- minVec = (vector float)(-32768.0f);
- maxVec = (vector float)(32767.0f);
-
- vector float vecOld = vec_ld( 0, &mixBuffer[i] );
- vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
-
- //vectorize!
- for ( ; i+15 < numSamples; i += 16 ) {
- //load source
- v0 = vecOld;
- v1 = vec_ld( 15, &mixBuffer[i] );
- v2 = vec_ld( 31, &mixBuffer[i] );
- v3 = vec_ld( 47, &mixBuffer[i] );
- vecOld = vec_ld( 63, &mixBuffer[i] );
- v0 = vec_perm( v0, v1, permVec );
- v1 = vec_perm( v1, v2, permVec );
- v2 = vec_perm( v2, v3, permVec );
- v3 = vec_perm( v3, vecOld, permVec );
-
- //apply minimum
- v4 = vec_max( v0, minVec );
- v5 = vec_max( v1, minVec );
- v6 = vec_max( v2, minVec );
- v7 = vec_max( v3, minVec );
-
- //apply maximum
- v4 = vec_min( v4, maxVec );
- v5 = vec_min( v5, maxVec );
- v6 = vec_min( v6, maxVec );
- v7 = vec_min( v7, maxVec );
-
- // convert floats to ints
- vi0 = vec_cts( v4, 0 );
- vi1 = vec_cts( v5, 0 );
- vi2 = vec_cts( v6, 0 );
- vi3 = vec_cts( v7, 0 );
-
- // pack ints into shorts
- vs0 = vec_pack( vi0, vi1 );
- vs1 = vec_pack( vi2, vi3 );
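- // vec_pack truncates each 32-bit int to 16 bits, which is safe here because the values were already clamped to [ -32768, 32767 ] ( the saturating vec_packs is not needed )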
- ALIGNED_STORE2( &samples[i], vs0, vs1 );
- }
-
- //handle cleanup
- for ( ; i < numSamples ; i++ ) {
- samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
- }
- }
- #endif /* ENABLE_SOUND_ROUTINES */
- #endif /* MACOS_X */