RenderMatrix.cpp 187 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
7427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439
  1. /*
  2. ===========================================================================
  3. Doom 3 BFG Edition GPL Source Code
  4. Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
  6. Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #include "../ParallelJobList_JobHeaders.h"
  21. #include "../math/Math.h"
  22. #include "../math/Vector.h"
  23. #include "../math/Matrix.h"
  24. #include "../math/Rotation.h"
  25. #include "../math/Plane.h"
  26. #include "../bv/Sphere.h"
  27. #include "../bv/Bounds.h"
  28. #include "RenderMatrix.h"
  29. // FIXME: it would be nice if all render matrices were 16-byte aligned
  30. // so there is no need for unaligned loads and stores everywhere
  // When running under lint, drop ID_WIN_X86_SSE2_INTRIN so the sections guarded by
  // #ifdef ID_WIN_X86_SSE2_INTRIN are skipped and the non-SSE code is analyzed instead.
  31. #ifdef _lint
  32. #undef ID_WIN_X86_SSE2_INTRIN
  33. #endif
  34. //lint -e438 // the non-SSE code isn't lint friendly, either
  35. //lint -e550
  // Epsilon used by the matrix inverse code; also broadcast into vector_float_inverse_epsilon.
  36. #define RENDER_MATRIX_INVERSE_EPSILON 1e-16f // JDC: changed from 1e-14f to allow full wasteland parallel light projections to invert
  // Large finite stand-in for infinity ( not an IEEE infinity ).
  37. #define RENDER_MATRIX_INFINITY 1e30f // NOTE: cannot initialize a vec_float4 with idMath::INFINITY on the SPU
  // Not referenced in this part of the file; presumably a tolerance for the projection code -- confirm at the use sites.
  38. #define RENDER_MATRIX_PROJECTION_EPSILON 0.1f
  39. #define CLIP_SPACE_OGL // the OpenGL clip space Z is in the range [-1, 1]
  40. /*
  41. ================================================================================================
  42. Constant render matrices
  43. ================================================================================================
  44. */
  45. // identity matrix
  // ( ones on the diagonal, zeros everywhere else )
  46. ALIGNTYPE16 const idRenderMatrix renderMatrix_identity(
  47. 1.0f, 0.0f, 0.0f, 0.0f,
  48. 0.0f, 1.0f, 0.0f, 0.0f,
  49. 0.0f, 0.0f, 1.0f, 0.0f,
  50. 0.0f, 0.0f, 0.0f, 1.0f
  51. );
  52. // convert from our coordinate system (looking down X) to OpenGL's coordinate system (looking down -Z)
  // NOTE(review): assuming each group of four values is one row applied to a column vector
  // ( x, y, z, 1 ), this yields out.x = -in.y, out.y = in.z, out.z = -in.x -- confirm against
  // the idRenderMatrix constructor.
  53. ALIGNTYPE16 const idRenderMatrix renderMatrix_flipToOpenGL(
  54. 0.0f, -1.0f, 0.0f, 0.0f,
  55. 0.0f, 0.0f, 1.0f, 0.0f,
  56. -1.0f, 0.0f, 0.0f, 0.0f,
  57. 0.0f, 0.0f, 0.0f, 1.0f
  58. );
  59. // OpenGL -1 to 1.
  // Scales x, y and z by 2 and offsets each by -1.
  // NOTE(review): presumably this maps a [0, 1] window-space range onto the [-1, 1] clip-space
  // range, with the fourth value of each row acting as the translation term -- confirm against
  // the idRenderMatrix layout.
  60. ALIGNTYPE16 const idRenderMatrix renderMatrix_windowSpaceToClipSpace(
  61. 2.0f, 0.0f, 0.0f, -1.0f,
  62. 0.0f, 2.0f, 0.0f, -1.0f,
  63. 0.0f, 0.0f, 2.0f, -1.0f,
  64. 0.0f, 0.0f, 0.0f, 1.0f
  65. );
  66. /*
  67. ================================================================================================
  68. SIMD constants
  69. ================================================================================================
  70. */
  // File-local SSE2 constants; only compiled when ID_WIN_X86_SSE2_INTRIN is defined.
  71. #ifdef ID_WIN_X86_SSE2_INTRIN
  // Integer vectors: 1 in every element, 4 in every element, and the element indices
  // ( _mm_set_epi32 takes the highest element first, so element 0 holds 0 and element 3 holds 3 ).
  72. static const __m128i vector_int_1 = _mm_set1_epi32( 1 );
  73. static const __m128i vector_int_4 = _mm_set1_epi32( 4 );
  74. static const __m128i vector_int_0123 = _mm_set_epi32( 3, 2, 1, 0 );
  // Single-bit integer patterns ( 1 << 0 ) .. ( 1 << 5 ) in every element, passed through __m128c
  // ( presumably a bit-preserving __m128i -> __m128 cast -- confirm where __m128c is declared ).
  75. static const __m128 vector_float_mask0 = __m128c( _mm_set1_epi32( 1<<0 ) );
  76. static const __m128 vector_float_mask1 = __m128c( _mm_set1_epi32( 1<<1 ) );
  77. static const __m128 vector_float_mask2 = __m128c( _mm_set1_epi32( 1<<2 ) );
  78. static const __m128 vector_float_mask3 = __m128c( _mm_set1_epi32( 1<<3 ) );
  79. static const __m128 vector_float_mask4 = __m128c( _mm_set1_epi32( 1<<4 ) );
  80. static const __m128 vector_float_mask5 = __m128c( _mm_set1_epi32( 1<<5 ) );
  // IEEE_FLT_SIGN_MASK and its bitwise complement in every element.
  81. static const __m128 vector_float_sign_bit = __m128c( _mm_set1_epi32( IEEE_FLT_SIGN_MASK ) );
  82. static const __m128 vector_float_abs_mask = __m128c( _mm_set1_epi32( ~IEEE_FLT_SIGN_MASK ) );
  // All bits set in the highest element only; the other three elements are zero.
  83. static const __m128 vector_float_keep_last = __m128c( _mm_set_epi32( -1, 0, 0, 0 ) );
  // RENDER_MATRIX_INVERSE_EPSILON broadcast to all four elements.
  84. static const __m128 vector_float_inverse_epsilon = { RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON };
  // 1.1754944e-038f is the smallest normalized single-precision float ( FLT_MIN ).
  85. static const __m128 vector_float_smallest_non_denorm = { 1.1754944e-038f, 1.1754944e-038f, 1.1754944e-038f, 1.1754944e-038f };
  // "Infinity" here is the large finite RENDER_MATRIX_INFINITY value, not an IEEE infinity.
  86. static const __m128 vector_float_pos_infinity = { RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY };
  87. static const __m128 vector_float_neg_infinity = { -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY };
  // Common scalar values broadcast to all four elements.
  88. static const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
  89. static const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f };
  90. static const __m128 vector_float_neg_half = { -0.5f, -0.5f, -0.5f, -0.5f };
  // NOTE: vector_float_one and vector_float_pos_one hold identical values.
  91. static const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
  92. static const __m128 vector_float_pos_one = { +1.0f, +1.0f, +1.0f, +1.0f };
  93. static const __m128 vector_float_neg_one = { -1.0f, -1.0f, -1.0f, -1.0f };
  // ( 0, 0, 0, 1 ): one in the last element only.
  94. static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f };
  95. #endif
  96. /*
  97. ================================================================================================
  98. Box definition
  99. ================================================================================================
  100. */
  101. /*
  102.               4----{E}---5
  103.    +         /|         /|
  104.    Z      {H} {I}    {F} |
  105.    -     /    |     /   {J}
  106.         7--{G}-----6     |
  107.         |     |    |     |
  108.       {L}     0----|-{A}-1
  109.         |    /    {K}   /       -
  110.         | {D}      | {B}       Y
  111.         |/         |/         +
  112.         3---{C}----2
  113.            - X +
  114. */
  // The four corner vertices ( numbered 0..7 as in the box diagram above ) of each of the
  // six box polygons. The row index is the polygon number: 0..2 are the negative X/Y/Z faces
  // and 3..5 the positive X/Y/Z faces, so polygons j and j + 3 are opposite faces on one axis.
  115. static const short boxPolygonVertices[6][4] = {
  116. { 0, 3, 7, 4 }, // neg-X
  117. { 0, 1, 5, 4 }, // neg-Y
  118. { 0, 1, 2, 3 }, // neg-Z
  119. { 1, 2, 6, 5 }, // pos-X
  120. { 2, 3, 7, 6 }, // pos-Y
  121. { 4, 5, 6, 7 } // pos-Z
  122. };
  // The two end vertices of each of the twelve box edges, in the order A..L used in the
  // box diagram above: edges 0..3 ( A-D ) run around the bottom face, 4..7 ( E-H ) around
  // the top face, and 8..11 ( I-L ) connect bottom vertex n to top vertex n + 4.
  123. static const short boxEdgeVertices[12][2] = {
  124. /* A = */ { 0, 1 }, /* B = */ { 1, 2 }, /* C = */ { 2, 3 }, /* D = */ { 3, 0 }, // bottom
  125. /* E = */ { 4, 5 }, /* F = */ { 5, 6 }, /* G = */ { 6, 7 }, /* H = */ { 7, 4 }, // top
  126. /* I = */ { 0, 4 }, /* J = */ { 1, 5 }, /* K = */ { 2, 6 }, /* L = */ { 3, 7 } // sides
  127. };
  // For each of the twelve box edges A..L ( same order as boxEdgeVertices ), the indices of
  // the two box polygons that share that edge, using the boxPolygonVertices row numbering
  // ( 0 = neg-X, 1 = neg-Y, 2 = neg-Z, 3 = pos-X, 4 = pos-Y, 5 = pos-Z ).
  // An edge is on the silhouette when exactly one of its two polygons is front facing.
  // This is read-only lookup data, so it is declared const like the other box tables.
  static const int boxEdgePolygons[12][2] = {
      /* A = */ { 1, 2 }, /* B = */ { 3, 2 }, /* C = */ { 4, 2 }, /* D = */ { 0, 2 }, // bottom
      /* E = */ { 1, 5 }, /* F = */ { 3, 5 }, /* G = */ { 4, 5 }, /* H = */ { 0, 5 }, // top
      /* I = */ { 0, 1 }, /* J = */ { 3, 1 }, /* K = */ { 3, 4 }, /* L = */ { 0, 4 }  // sides
  };
  133. /*
  134. #include <Windows.h>
  135. class idCreateBoxFrontPolygonsForFrontBits {
  136. public:
  137. idCreateBoxFrontPolygonsForFrontBits() {
  138. for ( int i = 0; i < 64; i++ ) {
  139. int frontPolygons[7] = { 0 };
  140. int numFrontPolygons = 0;
  141. char bits[7] = { 0 };
  142. for ( int j = 0; j < 6; j++ ) {
  143. if ( ( i & ( 1 << j ) ) != 0 ) {
  144. frontPolygons[numFrontPolygons++] = j;
  145. bits[5 - j] = '1';
  146. } else {
  147. bits[5 - j] = '0';
  148. }
  149. }
  150. const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
  151. if ( i == 0 ) {
  152. comment = " inside the box, every polygon is considered front facing";
  153. numFrontPolygons = 6;
  154. for ( int j = 0; j < 6; j++ ) {
  155. frontPolygons[j] = j;
  156. }
  157. }
  158. char buffer[1024];
  159. sprintf( buffer, "{ { %d, %d, %d, %d, %d, %d, %d }, %d }, // %s = %d%s\n",
  160. frontPolygons[0], frontPolygons[1], frontPolygons[2], frontPolygons[3],
  161. frontPolygons[4], frontPolygons[5], frontPolygons[6],
  162. numFrontPolygons, bits, i, comment );
  163. OutputDebugString( buffer );
  164. }
  165. }
  166. } createBoxFrontPolygonsForFrontBits;
  167. */
  168. // make sure this is a power of two for fast addressing an array of these without integer multiplication
  // ( "this" is sizeof( frontPolygons_t ): 7 index bytes + 1 count byte, i.e. 8 bytes if byte
  // is a one-byte type -- confirm against the byte typedef )
  //
  // Lookup table indexed by a 6-bit "front bits" mask in which bit j set means box polygon j
  // ( 0 = neg-X, 1 = neg-Y, 2 = neg-Z, 3 = pos-X, 4 = pos-Y, 5 = pos-Z ) is front facing.
  // indices[0..count-1] lists the front facing polygons in ascending order; unused slots are 0.
  // Mask 0 is special-cased as "inside the box" and lists all six polygons.
  // Entries marked "invalid" have both the negative and the positive face of at least one axis
  // set ( ( i & ( i >> 3 ) & 7 ) != 0 ). The table was produced by the commented-out
  // idCreateBoxFrontPolygonsForFrontBits generator above.
  169. static const struct frontPolygons_t {
  170. byte indices[7];
  171. byte count;
  172. } boxFrontPolygonsForFrontBits[64] = {
  173. { { 0, 1, 2, 3, 4, 5, 0 }, 6 }, // 000000 = 0 inside the box, every polygon is considered front facing
  174. { { 0, 0, 0, 0, 0, 0, 0 }, 1 }, // 000001 = 1
  175. { { 1, 0, 0, 0, 0, 0, 0 }, 1 }, // 000010 = 2
  176. { { 0, 1, 0, 0, 0, 0, 0 }, 2 }, // 000011 = 3
  177. { { 2, 0, 0, 0, 0, 0, 0 }, 1 }, // 000100 = 4
  178. { { 0, 2, 0, 0, 0, 0, 0 }, 2 }, // 000101 = 5
  179. { { 1, 2, 0, 0, 0, 0, 0 }, 2 }, // 000110 = 6
  180. { { 0, 1, 2, 0, 0, 0, 0 }, 3 }, // 000111 = 7
  181. { { 3, 0, 0, 0, 0, 0, 0 }, 1 }, // 001000 = 8
  182. { { 0, 3, 0, 0, 0, 0, 0 }, 2 }, // 001001 = 9 invalid
  183. { { 1, 3, 0, 0, 0, 0, 0 }, 2 }, // 001010 = 10
  184. { { 0, 1, 3, 0, 0, 0, 0 }, 3 }, // 001011 = 11 invalid
  185. { { 2, 3, 0, 0, 0, 0, 0 }, 2 }, // 001100 = 12
  186. { { 0, 2, 3, 0, 0, 0, 0 }, 3 }, // 001101 = 13 invalid
  187. { { 1, 2, 3, 0, 0, 0, 0 }, 3 }, // 001110 = 14
  188. { { 0, 1, 2, 3, 0, 0, 0 }, 4 }, // 001111 = 15 invalid
  189. { { 4, 0, 0, 0, 0, 0, 0 }, 1 }, // 010000 = 16
  190. { { 0, 4, 0, 0, 0, 0, 0 }, 2 }, // 010001 = 17
  191. { { 1, 4, 0, 0, 0, 0, 0 }, 2 }, // 010010 = 18 invalid
  192. { { 0, 1, 4, 0, 0, 0, 0 }, 3 }, // 010011 = 19 invalid
  193. { { 2, 4, 0, 0, 0, 0, 0 }, 2 }, // 010100 = 20
  194. { { 0, 2, 4, 0, 0, 0, 0 }, 3 }, // 010101 = 21
  195. { { 1, 2, 4, 0, 0, 0, 0 }, 3 }, // 010110 = 22 invalid
  196. { { 0, 1, 2, 4, 0, 0, 0 }, 4 }, // 010111 = 23 invalid
  197. { { 3, 4, 0, 0, 0, 0, 0 }, 2 }, // 011000 = 24
  198. { { 0, 3, 4, 0, 0, 0, 0 }, 3 }, // 011001 = 25 invalid
  199. { { 1, 3, 4, 0, 0, 0, 0 }, 3 }, // 011010 = 26 invalid
  200. { { 0, 1, 3, 4, 0, 0, 0 }, 4 }, // 011011 = 27 invalid
  201. { { 2, 3, 4, 0, 0, 0, 0 }, 3 }, // 011100 = 28
  202. { { 0, 2, 3, 4, 0, 0, 0 }, 4 }, // 011101 = 29 invalid
  203. { { 1, 2, 3, 4, 0, 0, 0 }, 4 }, // 011110 = 30 invalid
  204. { { 0, 1, 2, 3, 4, 0, 0 }, 5 }, // 011111 = 31 invalid
  205. { { 5, 0, 0, 0, 0, 0, 0 }, 1 }, // 100000 = 32
  206. { { 0, 5, 0, 0, 0, 0, 0 }, 2 }, // 100001 = 33
  207. { { 1, 5, 0, 0, 0, 0, 0 }, 2 }, // 100010 = 34
  208. { { 0, 1, 5, 0, 0, 0, 0 }, 3 }, // 100011 = 35
  209. { { 2, 5, 0, 0, 0, 0, 0 }, 2 }, // 100100 = 36 invalid
  210. { { 0, 2, 5, 0, 0, 0, 0 }, 3 }, // 100101 = 37 invalid
  211. { { 1, 2, 5, 0, 0, 0, 0 }, 3 }, // 100110 = 38 invalid
  212. { { 0, 1, 2, 5, 0, 0, 0 }, 4 }, // 100111 = 39 invalid
  213. { { 3, 5, 0, 0, 0, 0, 0 }, 2 }, // 101000 = 40
  214. { { 0, 3, 5, 0, 0, 0, 0 }, 3 }, // 101001 = 41 invalid
  215. { { 1, 3, 5, 0, 0, 0, 0 }, 3 }, // 101010 = 42
  216. { { 0, 1, 3, 5, 0, 0, 0 }, 4 }, // 101011 = 43 invalid
  217. { { 2, 3, 5, 0, 0, 0, 0 }, 3 }, // 101100 = 44 invalid
  218. { { 0, 2, 3, 5, 0, 0, 0 }, 4 }, // 101101 = 45 invalid
  219. { { 1, 2, 3, 5, 0, 0, 0 }, 4 }, // 101110 = 46 invalid
  220. { { 0, 1, 2, 3, 5, 0, 0 }, 5 }, // 101111 = 47 invalid
  221. { { 4, 5, 0, 0, 0, 0, 0 }, 2 }, // 110000 = 48
  222. { { 0, 4, 5, 0, 0, 0, 0 }, 3 }, // 110001 = 49
  223. { { 1, 4, 5, 0, 0, 0, 0 }, 3 }, // 110010 = 50 invalid
  224. { { 0, 1, 4, 5, 0, 0, 0 }, 4 }, // 110011 = 51 invalid
  225. { { 2, 4, 5, 0, 0, 0, 0 }, 3 }, // 110100 = 52 invalid
  226. { { 0, 2, 4, 5, 0, 0, 0 }, 4 }, // 110101 = 53 invalid
  227. { { 1, 2, 4, 5, 0, 0, 0 }, 4 }, // 110110 = 54 invalid
  228. { { 0, 1, 2, 4, 5, 0, 0 }, 5 }, // 110111 = 55 invalid
  229. { { 3, 4, 5, 0, 0, 0, 0 }, 3 }, // 111000 = 56
  230. { { 0, 3, 4, 5, 0, 0, 0 }, 4 }, // 111001 = 57 invalid
  231. { { 1, 3, 4, 5, 0, 0, 0 }, 4 }, // 111010 = 58 invalid
  232. { { 0, 1, 3, 4, 5, 0, 0 }, 5 }, // 111011 = 59 invalid
  233. { { 2, 3, 4, 5, 0, 0, 0 }, 4 }, // 111100 = 60 invalid
  234. { { 0, 2, 3, 4, 5, 0, 0 }, 5 }, // 111101 = 61 invalid
  235. { { 1, 2, 3, 4, 5, 0, 0 }, 5 }, // 111110 = 62 invalid
  236. { { 0, 1, 2, 3, 4, 5, 0 }, 6 }, // 111111 = 63 invalid
  237. };
  238. /*
  239. #include <Windows.h>
  240. class idCreateBoxSilhouetteEdgesForFrontBits {
  241. public:
  242. idCreateBoxSilhouetteEdgesForFrontBits() {
  243. for ( int i = 0; i < 64; i++ ) {
  244. int silhouetteEdges[12] = { 0 };
  245. int numSilhouetteEdges = 0;
  246. for ( int j = 0; j < 12; j++ ) {
  247. if ( i == 0 || ( ( i >> boxEdgePolygons[j][0] ) & 1 ) != ( ( i >> boxEdgePolygons[j][1] ) & 1 ) ) {
  248. silhouetteEdges[numSilhouetteEdges++] = j;
  249. }
  250. }
  251. char bits[7] = { 0 };
  252. for ( int j = 0; j < 6; j++ ) {
  253. if ( ( i & ( 1 << j ) ) != 0 ) {
  254. bits[5 - j] = '1';
  255. } else {
  256. bits[5 - j] = '0';
  257. }
  258. }
  259. const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
  260. if ( i == 0 ) {
  261. comment = " inside the box, every edge is considered part of the silhouette";
  262. }
  263. char buffer[1024];
  264. sprintf( buffer, "{ { %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d }, %2d }, // %s = %d%s\n",
  265. silhouetteEdges[0], silhouetteEdges[1], silhouetteEdges[2], silhouetteEdges[3],
  266. silhouetteEdges[4], silhouetteEdges[5], silhouetteEdges[6], silhouetteEdges[7],
  267. silhouetteEdges[8], silhouetteEdges[9], silhouetteEdges[10], silhouetteEdges[11],
  268. numSilhouetteEdges, bits, i, comment );
  269. OutputDebugString( buffer );
  270. }
  271. }
  272. } createBoxSilhouetteEdgesForFrontBits;
  273. */
  274. // make sure this is a power of two for fast addressing an array of these without integer multiplication
  275. static const struct silhouetteEdges_t {
  276. byte indices[12];
  277. int32 count;
  278. } boxSilhouetteEdgesForFrontBits[64] = {
  279. { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, 12 }, // 000000 = 0 inside the box, every edge is considered part of the silhouette
  280. { { 3, 7, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 000001 = 1
  281. { { 0, 4, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 000010 = 2
  282. { { 0, 3, 4, 7, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 000011 = 3
  283. { { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 000100 = 4
  284. { { 0, 1, 2, 7, 8, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 000101 = 5
  285. { { 1, 2, 3, 4, 8, 9, 0, 0, 0, 0, 0, 0 }, 6 }, // 000110 = 6
  286. { { 1, 2, 4, 7, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 000111 = 7
  287. { { 1, 5, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 001000 = 8
  288. { { 1, 3, 5, 7, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 001001 = 9 invalid
  289. { { 0, 1, 4, 5, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 001010 = 10
  290. { { 0, 1, 3, 4, 5, 7, 10, 11, 0, 0, 0, 0 }, 8 }, // 001011 = 11 invalid
  291. { { 0, 2, 3, 5, 9, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 001100 = 12
  292. { { 0, 2, 5, 7, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 001101 = 13 invalid
  293. { { 2, 3, 4, 5, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 001110 = 14
  294. { { 2, 4, 5, 7, 10, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 001111 = 15 invalid
  295. { { 2, 6, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 010000 = 16
  296. { { 2, 3, 6, 7, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 010001 = 17
  297. { { 0, 2, 4, 6, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 010010 = 18 invalid
  298. { { 0, 2, 3, 4, 6, 7, 9, 10, 0, 0, 0, 0 }, 8 }, // 010011 = 19 invalid
  299. { { 0, 1, 3, 6, 10, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 010100 = 20
  300. { { 0, 1, 6, 7, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 010101 = 21
  301. { { 1, 3, 4, 6, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 010110 = 22 invalid
  302. { { 1, 4, 6, 7, 9, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 010111 = 23 invalid
  303. { { 1, 2, 5, 6, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 011000 = 24
  304. { { 1, 2, 3, 5, 6, 7, 8, 9, 0, 0, 0, 0 }, 8 }, // 011001 = 25 invalid
  305. { { 0, 1, 2, 4, 5, 6, 8, 11, 0, 0, 0, 0 }, 8 }, // 011010 = 26 invalid
  306. { { 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0 }, 8 }, // 011011 = 27 invalid
  307. { { 0, 3, 5, 6, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 011100 = 28
  308. { { 0, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0 }, 6 }, // 011101 = 29 invalid
  309. { { 3, 4, 5, 6, 8, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 011110 = 30 invalid
  310. { { 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 011111 = 31 invalid
  311. { { 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 100000 = 32
  312. { { 3, 4, 5, 6, 8, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 100001 = 33
  313. { { 0, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0 }, 6 }, // 100010 = 34
  314. { { 0, 3, 5, 6, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 100011 = 35
  315. { { 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0 }, 8 }, // 100100 = 36 invalid
  316. { { 0, 1, 2, 4, 5, 6, 8, 11, 0, 0, 0, 0 }, 8 }, // 100101 = 37 invalid
  317. { { 1, 2, 3, 5, 6, 7, 8, 9, 0, 0, 0, 0 }, 8 }, // 100110 = 38 invalid
  318. { { 1, 2, 5, 6, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 100111 = 39 invalid
  319. { { 1, 4, 6, 7, 9, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 101000 = 40
  320. { { 1, 3, 4, 6, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 101001 = 41 invalid
  321. { { 0, 1, 6, 7, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 101010 = 42
  322. { { 0, 1, 3, 6, 10, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 101011 = 43 invalid
  323. { { 0, 2, 3, 4, 6, 7, 9, 10, 0, 0, 0, 0 }, 8 }, // 101100 = 44 invalid
  324. { { 0, 2, 4, 6, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 101101 = 45 invalid
  325. { { 2, 3, 6, 7, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 101110 = 46 invalid
  326. { { 2, 6, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 101111 = 47 invalid
  327. { { 2, 4, 5, 7, 10, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 110000 = 48
  328. { { 2, 3, 4, 5, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 110001 = 49
  329. { { 0, 2, 5, 7, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 110010 = 50 invalid
  330. { { 0, 2, 3, 5, 9, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 110011 = 51 invalid
  331. { { 0, 1, 3, 4, 5, 7, 10, 11, 0, 0, 0, 0 }, 8 }, // 110100 = 52 invalid
  332. { { 0, 1, 4, 5, 8, 10, 0, 0, 0, 0, 0, 0 }, 6 }, // 110101 = 53 invalid
  333. { { 1, 3, 5, 7, 8, 9, 10, 11, 0, 0, 0, 0 }, 8 }, // 110110 = 54 invalid
  334. { { 1, 5, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 110111 = 55 invalid
  335. { { 1, 2, 4, 7, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 111000 = 56
  336. { { 1, 2, 3, 4, 8, 9, 0, 0, 0, 0, 0, 0 }, 6 }, // 111001 = 57 invalid
  337. { { 0, 1, 2, 7, 8, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 111010 = 58 invalid
  338. { { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 111011 = 59 invalid
  339. { { 0, 3, 4, 7, 9, 11, 0, 0, 0, 0, 0, 0 }, 6 }, // 111100 = 60 invalid
  340. { { 0, 4, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 111101 = 61 invalid
  341. { { 3, 7, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0 }, 4 }, // 111110 = 62 invalid
  342. { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 0 }, // 111111 = 63 invalid
  343. };
  344. /*
  345. #include <Windows.h>
  346. class idCreateBoxSilhouetteVerticesForFrontBits {
  347. public:
  348. idCreateBoxSilhouetteVerticesForFrontBits() {
  349. for ( int i = 0; i < 64; i++ ) {
  350. int silhouetteEdges[12] = { 0 };
  351. int numSilhouetteEdges = 0;
  352. for ( int j = 0; j < 12; j++ ) {
  353. if ( i == 0 || ( ( i >> boxEdgePolygons[j][0] ) & 1 ) != ( ( i >> boxEdgePolygons[j][1] ) & 1 ) ) {
  354. silhouetteEdges[numSilhouetteEdges++] = j;
  355. }
  356. }
  357. int silhouetteVertices[8] = { 0 };
  358. int numSilhouetteVertices = 0;
  359. int vertex = boxEdgeVertices[silhouetteEdges[0]][0];
  360. for ( int j = 0; j < 7; j++ ) {
  361. int newVertex = -1;
  362. for ( int j = 0; j < numSilhouetteEdges; j++ ) {
  363. if ( silhouetteEdges[j] == -1 ) {
  364. continue;
  365. }
  366. if ( boxEdgeVertices[silhouetteEdges[j]][0] == vertex ) {
  367. newVertex = boxEdgeVertices[silhouetteEdges[j]][1];
  368. silhouetteEdges[j] = -1;
  369. break;
  370. } else if ( boxEdgeVertices[silhouetteEdges[j]][1] == vertex ) {
  371. newVertex = boxEdgeVertices[silhouetteEdges[j]][0];
  372. silhouetteEdges[j] = -1;
  373. break;
  374. }
  375. }
  376. if ( newVertex == -1 ) {
  377. break;
  378. }
  379. silhouetteVertices[numSilhouetteVertices++] = newVertex;
  380. vertex = newVertex;
  381. }
  382. char bits[7] = { 0 };
  383. for ( int j = 0; j < 6; j++ ) {
  384. if ( ( i & ( 1 << j ) ) != 0 ) {
  385. bits[5 - j] = '1';
  386. } else {
  387. bits[5 - j] = '0';
  388. }
  389. }
  390. const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
  391. if ( i == 0 ) {
  392. comment = " inside the box, no silhouette";
  393. }
  394. char buffer[1024];
  395. sprintf( buffer, "{ { %d, %d, %d, %d, %d, %d, %d }, %d }, // %s = %d%s\n",
  396. silhouetteVertices[0], silhouetteVertices[1], silhouetteVertices[2], silhouetteVertices[3],
  397. silhouetteVertices[4], silhouetteVertices[5], silhouetteVertices[6], numSilhouetteVertices, bits, i, comment );
  398. OutputDebugString( buffer );
  399. }
  400. }
  401. } createBoxSilhouetteVerticesForFrontBits;
  402. */
  403. // make sure this is a power of two for fast addressing an array of these without integer multiplication
  404. static const struct silhouetteVertices_t {
  405. byte indices[7];
  406. byte count;
  407. } boxSilhouetteVerticesForFrontBits[64] = {
  408. { { 1, 2, 3, 0, 4, 5, 6 }, 7 }, // 000000 = 0 inside the box, no vertex is considered part of the silhouette
  409. { { 0, 4, 7, 3, 0, 0, 0 }, 4 }, // 000001 = 1
  410. { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 000010 = 2
  411. { { 1, 5, 4, 7, 3, 0, 0 }, 6 }, // 000011 = 3
  412. { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 000100 = 4
  413. { { 1, 2, 3, 7, 4, 0, 0 }, 6 }, // 000101 = 5
  414. { { 2, 3, 0, 4, 5, 1, 0 }, 6 }, // 000110 = 6
  415. { { 2, 3, 7, 4, 5, 1, 0 }, 6 }, // 000111 = 7
  416. { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 001000 = 8
  417. { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 001001 = 9 invalid
  418. { { 1, 2, 6, 5, 4, 0, 0 }, 6 }, // 001010 = 10
  419. { { 1, 2, 6, 5, 4, 7, 3 }, 7 }, // 001011 = 11 invalid
  420. { { 1, 5, 6, 2, 3, 0, 0 }, 6 }, // 001100 = 12
  421. { { 1, 5, 6, 2, 3, 7, 4 }, 7 }, // 001101 = 13 invalid
  422. { { 3, 0, 4, 5, 6, 2, 0 }, 6 }, // 001110 = 14
  423. { { 3, 7, 4, 5, 6, 2, 0 }, 6 }, // 001111 = 15 invalid
  424. { { 3, 7, 6, 2, 0, 0, 0 }, 4 }, // 010000 = 16
  425. { { 3, 0, 4, 7, 6, 2, 0 }, 6 }, // 010001 = 17
  426. { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 010010 = 18 invalid
  427. { { 1, 5, 4, 7, 6, 2, 3 }, 7 }, // 010011 = 19 invalid
  428. { { 1, 2, 6, 7, 3, 0, 0 }, 6 }, // 010100 = 20
  429. { { 1, 2, 6, 7, 4, 0, 0 }, 6 }, // 010101 = 21
  430. { { 2, 6, 7, 3, 0, 4, 5 }, 7 }, // 010110 = 22 invalid
  431. { { 2, 6, 7, 4, 5, 1, 0 }, 6 }, // 010111 = 23 invalid
  432. { { 2, 3, 7, 6, 5, 1, 0 }, 6 }, // 011000 = 24
  433. { { 2, 3, 0, 4, 7, 6, 5 }, 7 }, // 011001 = 25 invalid
  434. { { 1, 2, 3, 7, 6, 5, 4 }, 7 }, // 011010 = 26 invalid
  435. { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 011011 = 27 invalid
  436. { { 1, 5, 6, 7, 3, 0, 0 }, 6 }, // 011100 = 28
  437. { { 1, 5, 6, 7, 4, 0, 0 }, 6 }, // 011101 = 29 invalid
  438. { { 0, 4, 5, 6, 7, 3, 0 }, 6 }, // 011110 = 30 invalid
  439. { { 5, 6, 7, 4, 0, 0, 0 }, 4 }, // 011111 = 31 invalid
  440. { { 5, 6, 7, 4, 0, 0, 0 }, 4 }, // 100000 = 32
  441. { { 0, 4, 5, 6, 7, 3, 0 }, 6 }, // 100001 = 33
  442. { { 1, 5, 6, 7, 4, 0, 0 }, 6 }, // 100010 = 34
  443. { { 1, 5, 6, 7, 3, 0, 0 }, 6 }, // 100011 = 35
  444. { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 100100 = 36 invalid
  445. { { 1, 2, 3, 7, 6, 5, 4 }, 7 }, // 100101 = 37 invalid
  446. { { 2, 3, 0, 4, 7, 6, 5 }, 7 }, // 100110 = 38 invalid
  447. { { 2, 3, 7, 6, 5, 1, 0 }, 6 }, // 100111 = 39 invalid
  448. { { 2, 6, 7, 4, 5, 1, 0 }, 6 }, // 101000 = 40
  449. { { 2, 6, 7, 3, 0, 4, 5 }, 7 }, // 101001 = 41 invalid
  450. { { 1, 2, 6, 7, 4, 0, 0 }, 6 }, // 101010 = 42
  451. { { 1, 2, 6, 7, 3, 0, 0 }, 6 }, // 101011 = 43 invalid
  452. { { 1, 5, 4, 7, 6, 2, 3 }, 7 }, // 101100 = 44 invalid
  453. { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 101101 = 45 invalid
  454. { { 3, 0, 4, 7, 6, 2, 0 }, 6 }, // 101110 = 46 invalid
  455. { { 3, 7, 6, 2, 0, 0, 0 }, 4 }, // 101111 = 47 invalid
  456. { { 3, 7, 4, 5, 6, 2, 0 }, 6 }, // 110000 = 48
  457. { { 3, 0, 4, 5, 6, 2, 0 }, 6 }, // 110001 = 49
  458. { { 1, 5, 6, 2, 3, 7, 4 }, 7 }, // 110010 = 50 invalid
  459. { { 1, 5, 6, 2, 3, 0, 0 }, 6 }, // 110011 = 51 invalid
  460. { { 1, 2, 6, 5, 4, 7, 3 }, 7 }, // 110100 = 52 invalid
  461. { { 1, 2, 6, 5, 4, 0, 0 }, 6 }, // 110101 = 53 invalid
  462. { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 110110 = 54 invalid
  463. { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 110111 = 55 invalid
  464. { { 2, 3, 7, 4, 5, 1, 0 }, 6 }, // 111000 = 56
  465. { { 2, 3, 0, 4, 5, 1, 0 }, 6 }, // 111001 = 57 invalid
  466. { { 1, 2, 3, 7, 4, 0, 0 }, 6 }, // 111010 = 58 invalid
  467. { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 111011 = 59 invalid
  468. { { 1, 5, 4, 7, 3, 0, 0 }, 6 }, // 111100 = 60 invalid
  469. { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 111101 = 61 invalid
  470. { { 0, 4, 7, 3, 0, 0, 0 }, 4 }, // 111110 = 62 invalid
  471. { { 0, 0, 0, 0, 0, 0, 0 }, 0 }, // 111111 = 63 invalid
  472. };
  473. /*
  474. ========================
  475. GetBoxFrontBits
  476. front bits:
  477. bit 0 = neg-X is front facing
  478. bit 1 = neg-Y is front facing
  479. bit 2 = neg-Z is front facing
  480. bit 3 = pos-X is front facing
  481. bit 4 = pos-Y is front facing
  482. bit 5 = pos-Z is front facing
  483. ========================
  484. */
  485. #ifdef ID_WIN_X86_SSE2_INTRIN
  // Computes the 6-bit front-bits mask of a box for a view origin (SSE2 path).
  //
  //   b0, b1      box minimum / maximum in lanes 0..2 (x, y, z)
  //   viewOrigin  view position in lanes 0..2
  //
  // Bits 0..2 are set where viewOrigin is below b0 on that axis, bits 3..5 where
  // it is above b1. Lane 3 of the inputs is ignored: _mm_movemask_ps returns four
  // bits per register, so each mask is limited to its three low bits. Without that
  // a non-zero w lane could leak into bit 3 or bit 6 and produce a value outside
  // the 0..63 range the 64-entry lookup tables are sized for.
  static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m128 & viewOrigin ) {
      const __m128 zero = _mm_setzero_ps();
      const __m128 dir0 = _mm_sub_ps( viewOrigin, b0 );
      const __m128 dir1 = _mm_sub_ps( b1, viewOrigin );
      const int negBits = _mm_movemask_ps( _mm_cmplt_ps( dir0, zero ) ) & 7;
      const int posBits = _mm_movemask_ps( _mm_cmplt_ps( dir1, zero ) ) & 7;
      return negBits | ( posBits << 3 );
  }
  494. #else
  495. static int GetBoxFrontBits_Generic( const idBounds & bounds, const idVec3 & viewOrigin ) {
  496. idVec3 dir0 = viewOrigin - bounds[0];
  497. idVec3 dir1 = bounds[1] - viewOrigin;
  498. int frontBits = 0;
  499. frontBits |= IEEE_FLT_SIGNBITSET( dir0.x ) << 0;
  500. frontBits |= IEEE_FLT_SIGNBITSET( dir0.y ) << 1;
  501. frontBits |= IEEE_FLT_SIGNBITSET( dir0.z ) << 2;
  502. frontBits |= IEEE_FLT_SIGNBITSET( dir1.x ) << 3;
  503. frontBits |= IEEE_FLT_SIGNBITSET( dir1.y ) << 4;
  504. frontBits |= IEEE_FLT_SIGNBITSET( dir1.z ) << 5;
  505. return frontBits;
  506. }
  507. #endif
  508. /*
  509. ================================================================================================
  510. idRenderMatrix implementation
  511. ================================================================================================
  512. */
  513. /*
  514. ========================
  515. idRenderMatrix::CreateFromOriginAxis
  516. ========================
  517. */
  518. void idRenderMatrix::CreateFromOriginAxis( const idVec3 & origin, const idMat3 & axis, idRenderMatrix & out ) {
  519. out[0][0] = axis[0][0];
  520. out[0][1] = axis[1][0];
  521. out[0][2] = axis[2][0];
  522. out[0][3] = origin[0];
  523. out[1][0] = axis[0][1];
  524. out[1][1] = axis[1][1];
  525. out[1][2] = axis[2][1];
  526. out[1][3] = origin[1];
  527. out[2][0] = axis[0][2];
  528. out[2][1] = axis[1][2];
  529. out[2][2] = axis[2][2];
  530. out[2][3] = origin[2];
  531. out[3][0] = 0.0f;
  532. out[3][1] = 0.0f;
  533. out[3][2] = 0.0f;
  534. out[3][3] = 1.0f;
  535. }
  536. /*
  537. ========================
  538. idRenderMatrix::CreateFromOriginAxisScale
  539. ========================
  540. */
  541. void idRenderMatrix::CreateFromOriginAxisScale( const idVec3 & origin, const idMat3 & axis, const idVec3 & scale, idRenderMatrix & out ) {
  542. out[0][0] = axis[0][0] * scale[0];
  543. out[0][1] = axis[1][0] * scale[1];
  544. out[0][2] = axis[2][0] * scale[2];
  545. out[0][3] = origin[0];
  546. out[1][0] = axis[0][1] * scale[0];
  547. out[1][1] = axis[1][1] * scale[1];
  548. out[1][2] = axis[2][1] * scale[2];
  549. out[1][3] = origin[1];
  550. out[2][0] = axis[0][2] * scale[0];
  551. out[2][1] = axis[1][2] * scale[1];
  552. out[2][2] = axis[2][2] * scale[2];
  553. out[2][3] = origin[2];
  554. out[3][0] = 0.0f;
  555. out[3][1] = 0.0f;
  556. out[3][2] = 0.0f;
  557. out[3][3] = 1.0f;
  558. }
  559. /*
  560. ========================
  561. idRenderMatrix::CreateViewMatrix
  562. Our axis looks down positive +X, render matrix looks down -Z.
  563. ========================
  564. */
  565. void idRenderMatrix::CreateViewMatrix( const idVec3 & origin, const idMat3 & axis, idRenderMatrix & out ) {
  566. out[0][0] = -axis[1][0];
  567. out[0][1] = -axis[1][1];
  568. out[0][2] = -axis[1][2];
  569. out[0][3] = origin[0] * axis[1][0] + origin[1] * axis[1][1] + origin[2] * axis[1][2];
  570. out[1][0] = axis[2][0];
  571. out[1][1] = axis[2][1];
  572. out[1][2] = axis[2][2];
  573. out[1][3] = -( origin[0] * axis[2][0] + origin[1] * axis[2][1] + origin[2] * axis[2][2] );
  574. out[2][0] = -axis[0][0];
  575. out[2][1] = -axis[0][1];
  576. out[2][2] = -axis[0][2];
  577. out[2][3] = origin[0] * axis[0][0] + origin[1] * axis[0][1] + origin[2] * axis[0][2];
  578. out[3][0] = 0.0f;
  579. out[3][1] = 0.0f;
  580. out[3][2] = 0.0f;
  581. out[3][3] = 1.0f;
  582. }
  583. /*
  584. ========================
  585. idRenderMatrix::CreateProjectionMatrix
  586. If zFar <= zNear (for example zFar == 0), an infinite far plane will be used.
  587. ========================
  588. */
  589. void idRenderMatrix::CreateProjectionMatrix( float xMin, float xMax, float yMin, float yMax, float zNear, float zFar, idRenderMatrix & out ) {
  590. const float width = xMax - xMin;
  591. const float height = yMax - yMin;
  592. out[0][0] = 2.0f * zNear / width;
  593. out[0][1] = 0.0f;
  594. out[0][2] = ( xMax + xMin ) / width; // normally 0
  595. out[0][3] = 0.0f;
  596. out[1][0] = 0.0f;
  597. out[1][1] = 2.0f * zNear / height;
  598. out[1][2] = ( yMax + yMin ) / height; // normally 0
  599. out[1][3] = 0.0f;
  600. if ( zFar <= zNear ) {
  601. // this is the far-plane-at-infinity formulation
  602. out[2][0] = 0.0f;
  603. out[2][1] = 0.0f;
  604. out[2][2] = -1.0f;
  605. #if defined( CLIP_SPACE_D3D )
  606. // the D3D clip space Z is in range [0,1] instead of [-1,1]
  607. out[2][3] = -zNear;
  608. #else
  609. out[2][3] = -2.0f * zNear;
  610. #endif
  611. } else {
  612. out[2][0] = 0.0f;
  613. out[2][1] = 0.0f;
  614. #if defined( CLIP_SPACE_D3D )
  615. // the D3D clip space Z is in range [0,1] instead of [-1,1]
  616. out[2][2] = -( zFar ) / ( zFar - zNear );
  617. out[2][3] = -( zFar * zNear ) / ( zFar - zNear );
  618. #else
  619. out[2][2] = -( zFar + zNear ) / ( zFar - zNear );
  620. out[2][3] = -( 2.0f * zFar * zNear ) / ( zFar - zNear );
  621. #endif
  622. }
  623. out[3][0] = 0.0f;
  624. out[3][1] = 0.0f;
  625. out[3][2] = -1.0f;
  626. out[3][3] = 0.0f;
  627. }
  628. /*
  629. ========================
  630. idRenderMatrix::CreateProjectionMatrixFov
  631. xOffset and yOffset should be in the -1 to 1 range for sub-pixel accumulation jitter.
  632. xOffset can also be used for eye separation when rendering stereo.
  633. ========================
  634. */
  635. void idRenderMatrix::CreateProjectionMatrixFov( float xFovDegrees, float yFovDegrees, float zNear, float zFar, float xOffset, float yOffset, idRenderMatrix & out ) {
  636. float xMax = zNear * idMath::Tan( DEG2RAD( xFovDegrees ) * 0.5f );
  637. float xMin = -xMax;
  638. float yMax = zNear * idMath::Tan( DEG2RAD( yFovDegrees ) * 0.5f );
  639. float yMin = -yMax;
  640. xMin += xOffset;
  641. xMax += xOffset;
  642. yMin += yOffset;
  643. yMax += yOffset;
  644. CreateProjectionMatrix( xMin, xMax, yMin, yMax, zNear, zFar, out );
  645. }
  646. /*
  647. ========================
  648. idRenderMatrix::OffsetScaleForBounds
  649. Add the offset to the center of the bounds and scale for the width of the bounds.
  650. The result matrix will transform the unit-cube to exactly cover the bounds.
  651. ========================
  652. */
  653. void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
  654. assert( &src != &out );
  655. #ifdef ID_WIN_X86_SSE2_INTRIN
  656. __m128 b0 = _mm_loadu_bounds_0( bounds );
  657. __m128 b1 = _mm_loadu_bounds_1( bounds );
  658. __m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_half );
  659. __m128 scale = _mm_mul_ps( _mm_sub_ps( b1, b0 ), vector_float_half );
  660. scale = _mm_or_ps( scale, vector_float_last_one );
  661. __m128 a0 = _mm_loadu_ps( src.m + 0*4 );
  662. __m128 a1 = _mm_loadu_ps( src.m + 1*4 );
  663. __m128 a2 = _mm_loadu_ps( src.m + 2*4 );
  664. __m128 a3 = _mm_loadu_ps( src.m + 3*4 );
  665. __m128 d0 = _mm_mul_ps( a0, offset );
  666. __m128 d1 = _mm_mul_ps( a1, offset );
  667. __m128 d2 = _mm_mul_ps( a2, offset );
  668. __m128 d3 = _mm_mul_ps( a3, offset );
  669. __m128 s0 = _mm_unpacklo_ps( d0, d2 ); // a0, c0, a1, c1
  670. __m128 s1 = _mm_unpackhi_ps( d0, d2 ); // a2, c2, a3, c3
  671. __m128 s2 = _mm_unpacklo_ps( d1, d3 ); // b0, d0, b1, d1
  672. __m128 s3 = _mm_unpackhi_ps( d1, d3 ); // b2, d2, b3, d3
  673. __m128 t0 = _mm_unpacklo_ps( s0, s2 ); // a0, b0, c0, d0
  674. __m128 t1 = _mm_unpackhi_ps( s0, s2 ); // a1, b1, c1, d1
  675. __m128 t2 = _mm_unpacklo_ps( s1, s3 ); // a2, b2, c2, d2
  676. t0 = _mm_add_ps( t0, t1 );
  677. t0 = _mm_add_ps( t0, t2 );
  678. __m128 n0 = _mm_and_ps( _mm_splat_ps( t0, 0 ), vector_float_keep_last );
  679. __m128 n1 = _mm_and_ps( _mm_splat_ps( t0, 1 ), vector_float_keep_last );
  680. __m128 n2 = _mm_and_ps( _mm_splat_ps( t0, 2 ), vector_float_keep_last );
  681. __m128 n3 = _mm_and_ps( _mm_splat_ps( t0, 3 ), vector_float_keep_last );
  682. a0 = _mm_madd_ps( a0, scale, n0 );
  683. a1 = _mm_madd_ps( a1, scale, n1 );
  684. a2 = _mm_madd_ps( a2, scale, n2 );
  685. a3 = _mm_madd_ps( a3, scale, n3 );
  686. _mm_storeu_ps( out.m + 0*4, a0 );
  687. _mm_storeu_ps( out.m + 1*4, a1 );
  688. _mm_storeu_ps( out.m + 2*4, a2 );
  689. _mm_storeu_ps( out.m + 3*4, a3 );
  690. #else
  691. const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
  692. const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;
  693. out[0][0] = src[0][0] * scale[0];
  694. out[0][1] = src[0][1] * scale[1];
  695. out[0][2] = src[0][2] * scale[2];
  696. out[0][3] = src[0][3] + src[0][0] * offset[0] + src[0][1] * offset[1] + src[0][2] * offset[2];
  697. out[1][0] = src[1][0] * scale[0];
  698. out[1][1] = src[1][1] * scale[1];
  699. out[1][2] = src[1][2] * scale[2];
  700. out[1][3] = src[1][3] + src[1][0] * offset[0] + src[1][1] * offset[1] + src[1][2] * offset[2];
  701. out[2][0] = src[2][0] * scale[0];
  702. out[2][1] = src[2][1] * scale[1];
  703. out[2][2] = src[2][2] * scale[2];
  704. out[2][3] = src[2][3] + src[2][0] * offset[0] + src[2][1] * offset[1] + src[2][2] * offset[2];
  705. out[3][0] = src[3][0] * scale[0];
  706. out[3][1] = src[3][1] * scale[1];
  707. out[3][2] = src[3][2] * scale[2];
  708. out[3][3] = src[3][3] + src[3][0] * offset[0] + src[3][1] * offset[1] + src[3][2] * offset[2];
  709. #endif
  710. }
  711. /*
  712. ========================
  713. idRenderMatrix::InverseOffsetScaleForBounds
  714. Subtract the offset to the center of the bounds and inverse scale for the width of the bounds.
  715. The result matrix will transform the bounds to exactly cover the unit-cube.
  716. ========================
  717. */
  718. void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
  719. assert( &src != &out );
  720. #ifdef ID_WIN_X86_SSE2_INTRIN
  721. __m128 b0 = _mm_loadu_bounds_0( bounds );
  722. __m128 b1 = _mm_loadu_bounds_1( bounds );
  723. __m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_neg_half );
  724. __m128 scale = _mm_mul_ps( _mm_sub_ps( b0, b1 ), vector_float_neg_half );
  725. scale = _mm_max_ps( scale, vector_float_smallest_non_denorm );
  726. __m128 rscale = _mm_rcp32_ps( scale );
  727. offset = _mm_mul_ps( offset, rscale );
  728. __m128 d0 = _mm_and_ps( _mm_splat_ps( offset, 0 ), vector_float_keep_last );
  729. __m128 d1 = _mm_and_ps( _mm_splat_ps( offset, 1 ), vector_float_keep_last );
  730. __m128 d2 = _mm_and_ps( _mm_splat_ps( offset, 2 ), vector_float_keep_last );
  731. __m128 a0 = _mm_loadu_ps( src.m + 0*4 );
  732. __m128 a1 = _mm_loadu_ps( src.m + 1*4 );
  733. __m128 a2 = _mm_loadu_ps( src.m + 2*4 );
  734. __m128 a3 = _mm_loadu_ps( src.m + 3*4 );
  735. a0 = _mm_madd_ps( a0, _mm_splat_ps( rscale, 0 ), d0 );
  736. a1 = _mm_madd_ps( a1, _mm_splat_ps( rscale, 1 ), d1 );
  737. a2 = _mm_madd_ps( a2, _mm_splat_ps( rscale, 2 ), d2 );
  738. _mm_storeu_ps( out.m + 0*4, a0 );
  739. _mm_storeu_ps( out.m + 1*4, a1 );
  740. _mm_storeu_ps( out.m + 2*4, a2 );
  741. _mm_storeu_ps( out.m + 3*4, a3 );
  742. #else
  743. const idVec3 offset = -0.5f * ( bounds[1] + bounds[0] );
  744. const idVec3 scale = 2.0f / ( bounds[1] - bounds[0] );
  745. out[0][0] = scale[0] * src[0][0];
  746. out[0][1] = scale[0] * src[0][1];
  747. out[0][2] = scale[0] * src[0][2];
  748. out[0][3] = scale[0] * ( src[0][3] + offset[0] );
  749. out[1][0] = scale[1] * src[1][0];
  750. out[1][1] = scale[1] * src[1][1];
  751. out[1][2] = scale[1] * src[1][2];
  752. out[1][3] = scale[1] * ( src[1][3] + offset[1] );
  753. out[2][0] = scale[2] * src[2][0];
  754. out[2][1] = scale[2] * src[2][1];
  755. out[2][2] = scale[2] * src[2][2];
  756. out[2][3] = scale[2] * ( src[2][3] + offset[2] );
  757. out[3][0] = src[3][0];
  758. out[3][1] = src[3][1];
  759. out[3][2] = src[3][2];
  760. out[3][3] = src[3][3];
  761. #endif
  762. }
  763. /*
  764. ========================
  765. idRenderMatrix::Transpose
  766. ========================
  767. */
  768. void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out ) {
  769. assert( &src != &out );
  770. #ifdef ID_WIN_X86_SSE2_INTRIN
  771. const __m128 a0 = _mm_loadu_ps( src.m + 0*4 );
  772. const __m128 a1 = _mm_loadu_ps( src.m + 1*4 );
  773. const __m128 a2 = _mm_loadu_ps( src.m + 2*4 );
  774. const __m128 a3 = _mm_loadu_ps( src.m + 3*4 );
  775. const __m128 r0 = _mm_unpacklo_ps( a0, a2 );
  776. const __m128 r1 = _mm_unpackhi_ps( a0, a2 );
  777. const __m128 r2 = _mm_unpacklo_ps( a1, a3 );
  778. const __m128 r3 = _mm_unpackhi_ps( a1, a3 );
  779. const __m128 t0 = _mm_unpacklo_ps( r0, r2 );
  780. const __m128 t1 = _mm_unpackhi_ps( r0, r2 );
  781. const __m128 t2 = _mm_unpacklo_ps( r1, r3 );
  782. const __m128 t3 = _mm_unpackhi_ps( r1, r3 );
  783. _mm_storeu_ps( out.m + 0*4, t0 );
  784. _mm_storeu_ps( out.m + 1*4, t1 );
  785. _mm_storeu_ps( out.m + 2*4, t2 );
  786. _mm_storeu_ps( out.m + 3*4, t3 );
  787. #else
  788. for ( int i = 0; i < 4; i++ ) {
  789. for ( int j = 0; j < 4; j++ ) {
  790. out[i][j] = src[j][i];
  791. }
  792. }
  793. #endif
  794. }
  795. /*
  796. ========================
  797. idRenderMatrix::Multiply
  798. ========================
  799. */
  800. void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix & b, idRenderMatrix & out ) {
  801. #ifdef ID_WIN_X86_SSE2_INTRIN
  802. __m128 a0 = _mm_loadu_ps( a.m + 0*4 );
  803. __m128 a1 = _mm_loadu_ps( a.m + 1*4 );
  804. __m128 a2 = _mm_loadu_ps( a.m + 2*4 );
  805. __m128 a3 = _mm_loadu_ps( a.m + 3*4 );
  806. __m128 b0 = _mm_loadu_ps( b.m + 0*4 );
  807. __m128 b1 = _mm_loadu_ps( b.m + 1*4 );
  808. __m128 b2 = _mm_loadu_ps( b.m + 2*4 );
  809. __m128 b3 = _mm_loadu_ps( b.m + 3*4 );
  810. __m128 t0 = _mm_mul_ps( _mm_splat_ps( a0, 0 ), b0 );
  811. __m128 t1 = _mm_mul_ps( _mm_splat_ps( a1, 0 ), b0 );
  812. __m128 t2 = _mm_mul_ps( _mm_splat_ps( a2, 0 ), b0 );
  813. __m128 t3 = _mm_mul_ps( _mm_splat_ps( a3, 0 ), b0 );
  814. t0 = _mm_madd_ps( _mm_splat_ps( a0, 1 ), b1, t0 );
  815. t1 = _mm_madd_ps( _mm_splat_ps( a1, 1 ), b1, t1 );
  816. t2 = _mm_madd_ps( _mm_splat_ps( a2, 1 ), b1, t2 );
  817. t3 = _mm_madd_ps( _mm_splat_ps( a3, 1 ), b1, t3 );
  818. t0 = _mm_madd_ps( _mm_splat_ps( a0, 2 ), b2, t0 );
  819. t1 = _mm_madd_ps( _mm_splat_ps( a1, 2 ), b2, t1 );
  820. t2 = _mm_madd_ps( _mm_splat_ps( a2, 2 ), b2, t2 );
  821. t3 = _mm_madd_ps( _mm_splat_ps( a3, 2 ), b2, t3 );
  822. t0 = _mm_madd_ps( _mm_splat_ps( a0, 3 ), b3, t0 );
  823. t1 = _mm_madd_ps( _mm_splat_ps( a1, 3 ), b3, t1 );
  824. t2 = _mm_madd_ps( _mm_splat_ps( a2, 3 ), b3, t2 );
  825. t3 = _mm_madd_ps( _mm_splat_ps( a3, 3 ), b3, t3 );
  826. _mm_storeu_ps( out.m + 0*4, t0 );
  827. _mm_storeu_ps( out.m + 1*4, t1 );
  828. _mm_storeu_ps( out.m + 2*4, t2 );
  829. _mm_storeu_ps( out.m + 3*4, t3 );
  830. #else
  831. /*
  832. for ( int i = 0 ; i < 4 ; i++ ) {
  833. for ( int j = 0 ; j < 4 ; j++ ) {
  834. out.m[ i * 4 + j ] =
  835. a.m[ i * 4 + 0 ] * b.m[ 0 * 4 + j ] +
  836. a.m[ i * 4 + 1 ] * b.m[ 1 * 4 + j ] +
  837. a.m[ i * 4 + 2 ] * b.m[ 2 * 4 + j ] +
  838. a.m[ i * 4 + 3 ] * b.m[ 3 * 4 + j ];
  839. }
  840. }
  841. */
  842. out.m[0*4+0] = a.m[0*4+0]*b.m[0*4+0] + a.m[0*4+1]*b.m[1*4+0] + a.m[0*4+2]*b.m[2*4+0] + a.m[0*4+3]*b.m[3*4+0];
  843. out.m[0*4+1] = a.m[0*4+0]*b.m[0*4+1] + a.m[0*4+1]*b.m[1*4+1] + a.m[0*4+2]*b.m[2*4+1] + a.m[0*4+3]*b.m[3*4+1];
  844. out.m[0*4+2] = a.m[0*4+0]*b.m[0*4+2] + a.m[0*4+1]*b.m[1*4+2] + a.m[0*4+2]*b.m[2*4+2] + a.m[0*4+3]*b.m[3*4+2];
  845. out.m[0*4+3] = a.m[0*4+0]*b.m[0*4+3] + a.m[0*4+1]*b.m[1*4+3] + a.m[0*4+2]*b.m[2*4+3] + a.m[0*4+3]*b.m[3*4+3];
  846. out.m[1*4+0] = a.m[1*4+0]*b.m[0*4+0] + a.m[1*4+1]*b.m[1*4+0] + a.m[1*4+2]*b.m[2*4+0] + a.m[1*4+3]*b.m[3*4+0];
  847. out.m[1*4+1] = a.m[1*4+0]*b.m[0*4+1] + a.m[1*4+1]*b.m[1*4+1] + a.m[1*4+2]*b.m[2*4+1] + a.m[1*4+3]*b.m[3*4+1];
  848. out.m[1*4+2] = a.m[1*4+0]*b.m[0*4+2] + a.m[1*4+1]*b.m[1*4+2] + a.m[1*4+2]*b.m[2*4+2] + a.m[1*4+3]*b.m[3*4+2];
  849. out.m[1*4+3] = a.m[1*4+0]*b.m[0*4+3] + a.m[1*4+1]*b.m[1*4+3] + a.m[1*4+2]*b.m[2*4+3] + a.m[1*4+3]*b.m[3*4+3];
  850. out.m[2*4+0] = a.m[2*4+0]*b.m[0*4+0] + a.m[2*4+1]*b.m[1*4+0] + a.m[2*4+2]*b.m[2*4+0] + a.m[2*4+3]*b.m[3*4+0];
  851. out.m[2*4+1] = a.m[2*4+0]*b.m[0*4+1] + a.m[2*4+1]*b.m[1*4+1] + a.m[2*4+2]*b.m[2*4+1] + a.m[2*4+3]*b.m[3*4+1];
  852. out.m[2*4+2] = a.m[2*4+0]*b.m[0*4+2] + a.m[2*4+1]*b.m[1*4+2] + a.m[2*4+2]*b.m[2*4+2] + a.m[2*4+3]*b.m[3*4+2];
  853. out.m[2*4+3] = a.m[2*4+0]*b.m[0*4+3] + a.m[2*4+1]*b.m[1*4+3] + a.m[2*4+2]*b.m[2*4+3] + a.m[2*4+3]*b.m[3*4+3];
  854. out.m[3*4+0] = a.m[3*4+0]*b.m[0*4+0] + a.m[3*4+1]*b.m[1*4+0] + a.m[3*4+2]*b.m[2*4+0] + a.m[3*4+3]*b.m[3*4+0];
  855. out.m[3*4+1] = a.m[3*4+0]*b.m[0*4+1] + a.m[3*4+1]*b.m[1*4+1] + a.m[3*4+2]*b.m[2*4+1] + a.m[3*4+3]*b.m[3*4+1];
  856. out.m[3*4+2] = a.m[3*4+0]*b.m[0*4+2] + a.m[3*4+1]*b.m[1*4+2] + a.m[3*4+2]*b.m[2*4+2] + a.m[3*4+3]*b.m[3*4+2];
  857. out.m[3*4+3] = a.m[3*4+0]*b.m[0*4+3] + a.m[3*4+1]*b.m[1*4+3] + a.m[3*4+2]*b.m[2*4+3] + a.m[3*4+3]*b.m[3*4+3];
  858. #endif
  859. }
  860. /*
  861. ========================
  862. idRenderMatrix::Inverse
  863. inverse( M ) = ( 1 / determinant( M ) ) * transpose( cofactor( M ) )
  864. This code is based on the code written by Cédric Lallain, published on "Cell Performance"
  865. (by Mike Acton) and released under the BSD 3-Clause ("BSD New" or "BSD Simplified") license.
  866. https://code.google.com/p/cellperformance-snippets/
  867. Note that large parallel lights can have very small values in the projection matrix,
  868. scaling tens of thousands of world units down to a 0-1 range, so the determinants
  869. can get really, really small.
  870. ========================
  871. */
  872. bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out ) {
  873. #ifdef ID_WIN_X86_SSE2_INTRIN
  874. const __m128 r0 = _mm_loadu_ps( src.m + 0 * 4 );
  875. const __m128 r1 = _mm_loadu_ps( src.m + 1 * 4 );
  876. const __m128 r2 = _mm_loadu_ps( src.m + 2 * 4 );
  877. const __m128 r3 = _mm_loadu_ps( src.m + 3 * 4 );
  878. // rXuY = row X rotated up by Y floats.
  879. const __m128 r0u1 = _mm_perm_ps( r0, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  880. const __m128 r0u2 = _mm_perm_ps( r0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  881. const __m128 r0u3 = _mm_perm_ps( r0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  882. const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  883. const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  884. const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  885. const __m128 r2u1 = _mm_perm_ps( r2, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  886. const __m128 r2u2 = _mm_perm_ps( r2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  887. const __m128 r2u3 = _mm_perm_ps( r2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  888. const __m128 r3u1 = _mm_perm_ps( r3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  889. const __m128 r3u2 = _mm_perm_ps( r3, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  890. const __m128 r3u3 = _mm_perm_ps( r3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  891. const __m128 m_r2u2_r3u3 = _mm_mul_ps( r2u2, r3u3 );
  892. const __m128 m_r1u1_r2u2_r3u3 = _mm_mul_ps( r1u1, m_r2u2_r3u3 );
  893. const __m128 m_r2u3_r3u1 = _mm_mul_ps( r2u3, r3u1 );
  894. const __m128 a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 = _mm_madd_ps( r1u2, m_r2u3_r3u1, m_r1u1_r2u2_r3u3 );
  895. const __m128 m_r2u1_r3u2 = _mm_perm_ps( m_r2u2_r3u3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  896. const __m128 pos_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u1_r3u2, a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 );
  897. const __m128 m_r2u3_r3u2 = _mm_mul_ps( r2u3, r3u2 );
  898. const __m128 m_r1u1_r2u3_r3u2 = _mm_mul_ps( r1u1, m_r2u3_r3u2 );
  899. const __m128 m_r2u1_r3u3 = _mm_perm_ps( m_r2u3_r3u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  900. const __m128 a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 = _mm_madd_ps( r1u2, m_r2u1_r3u3, m_r1u1_r2u3_r3u2 );
  901. const __m128 m_r2u2_r3u1 = _mm_perm_ps( m_r2u3_r3u2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  902. const __m128 neg_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u2_r3u1, a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 );
  903. const __m128 det3x3_r0 = _mm_sub_ps( pos_part_det3x3_r0, neg_part_det3x3_r0 );
  904. const __m128 m_r0u1_r2u2_r3u3 = _mm_mul_ps( r0u1, m_r2u2_r3u3 );
  905. const __m128 a_m_r0u2_r2u3_r3u1_m_r0u1_r2u2_r3u3 = _mm_madd_ps( r0u2, m_r2u3_r3u1, m_r0u1_r2u2_r3u3 );
  906. const __m128 pos_part_det3x3_r1 = _mm_madd_ps( r0u3, m_r2u1_r3u2, a_m_r0u2_r2u3_r3u1_m_r0u1_r2u2_r3u3 );
  907. const __m128 m_r0u1_r2u3_r3u2 = _mm_mul_ps( r0u1, m_r2u3_r3u2 );
  908. const __m128 a_m_r0u2_r2u1_r3u3_m_r0u1_r2u3_r3u2 = _mm_madd_ps( r0u2, m_r2u1_r3u3, m_r0u1_r2u3_r3u2 );
  909. const __m128 neg_part_det3x3_r1 = _mm_madd_ps( r0u3, m_r2u2_r3u1, a_m_r0u2_r2u1_r3u3_m_r0u1_r2u3_r3u2 );
  910. const __m128 det3x3_r1 = _mm_sub_ps( pos_part_det3x3_r1, neg_part_det3x3_r1 );
  911. const __m128 m_r0u1_r1u2 = _mm_mul_ps( r0u1, r1u2 );
  912. const __m128 m_r0u1_r1u2_r2u3 = _mm_mul_ps( m_r0u1_r1u2, r2u3 );
  913. const __m128 m_r0u2_r1u3 = _mm_perm_ps( m_r0u1_r1u2, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  914. const __m128 a_m_r0u2_r1u3_r2u1_m_r0u1_r1u2_r2u3 = _mm_madd_ps( m_r0u2_r1u3, r2u1, m_r0u1_r1u2_r2u3 );
  915. const __m128 m_r0u3_r1u1 = _mm_mul_ps( r0u3, r1u1 );
  916. const __m128 pos_part_det3x3_r3 = _mm_madd_ps( m_r0u3_r1u1, r2u2, a_m_r0u2_r1u3_r2u1_m_r0u1_r1u2_r2u3 );
  917. const __m128 m_r0u1_r1u3 = _mm_perm_ps( m_r0u3_r1u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  918. const __m128 m_r0u1_r1u3_r2u2 = _mm_mul_ps( m_r0u1_r1u3, r2u2 );
  919. const __m128 m_r0u2_r1u1 = _mm_mul_ps( r0u2, r1u1 );
  920. const __m128 a_m_r0u2_r1u1_r2u3_m_r0u1_r1u3_r2u2 = _mm_madd_ps( m_r0u2_r1u1, r2u3, m_r0u1_r1u3_r2u2 );
  921. const __m128 m_r0u3_r1u2 = _mm_perm_ps( m_r0u2_r1u1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  922. const __m128 neg_part_det3x3_r3 = _mm_madd_ps( m_r0u3_r1u2, r2u1, a_m_r0u2_r1u1_r2u3_m_r0u1_r1u3_r2u2 );
  923. const __m128 det3x3_r3 = _mm_sub_ps( pos_part_det3x3_r3, neg_part_det3x3_r3 );
  924. const __m128 m_r0u1_r1u2_r3u3 = _mm_mul_ps( m_r0u1_r1u2, r3u3 );
  925. const __m128 a_m_r0u2_r1u3_r3u1_m_r0u1_r1u2_r3u3 = _mm_madd_ps( m_r0u2_r1u3, r3u1, m_r0u1_r1u2_r3u3 );
  926. const __m128 pos_part_det3x3_r2 = _mm_madd_ps( m_r0u3_r1u1, r3u2, a_m_r0u2_r1u3_r3u1_m_r0u1_r1u2_r3u3 );
  927. const __m128 m_r0u1_r1u3_r3u2 = _mm_mul_ps( m_r0u1_r1u3, r3u2 );
  928. const __m128 a_m_r0u2_r1u1_r3u3_m_r0u1_r1u3_r3u2 = _mm_madd_ps( m_r0u2_r1u1, r3u3, m_r0u1_r1u3_r3u2 );
  929. const __m128 neg_part_det3x3_r2 = _mm_madd_ps( m_r0u3_r1u2, r3u1, a_m_r0u2_r1u1_r3u3_m_r0u1_r1u3_r3u2 );
  930. const __m128 det3x3_r2 = _mm_sub_ps( pos_part_det3x3_r2, neg_part_det3x3_r2 );
  931. const __m128 c_zero = _mm_setzero_ps();
  932. const __m128 c_mask = _mm_cmpeq_ps( c_zero, c_zero );
  933. const __m128 c_signmask = _mm_castsi128_ps( _mm_slli_epi32( _mm_castps_si128( c_mask ), 31 ) );
  934. const __m128 c_znzn = _mm_unpacklo_ps( c_zero, c_signmask );
  935. const __m128 c_nznz = _mm_unpacklo_ps( c_signmask, c_zero );
  936. const __m128 cofactor_r0 = _mm_xor_ps( det3x3_r0, c_znzn );
  937. const __m128 cofactor_r1 = _mm_xor_ps( det3x3_r1, c_nznz );
  938. const __m128 cofactor_r2 = _mm_xor_ps( det3x3_r2, c_znzn );
  939. const __m128 cofactor_r3 = _mm_xor_ps( det3x3_r3, c_nznz );
  940. const __m128 dot0 = _mm_mul_ps( r0, cofactor_r0 );
  941. const __m128 dot1 = _mm_add_ps( dot0, _mm_perm_ps( dot0, _MM_SHUFFLE( 2, 1, 0, 3 ) ) );
  942. const __m128 det = _mm_add_ps( dot1, _mm_perm_ps( dot1, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  943. const __m128 absDet = _mm_andnot_ps( c_signmask, det );
  944. if ( _mm_movemask_ps( _mm_cmplt_ps( absDet, vector_float_inverse_epsilon ) ) & 15 ) {
  945. return false;
  946. }
  947. const __m128 rcpDet = _mm_rcp32_ps( det );
  948. const __m128 hi_part_r0_r2 = _mm_unpacklo_ps( cofactor_r0, cofactor_r2 );
  949. const __m128 lo_part_r0_r2 = _mm_unpackhi_ps( cofactor_r0, cofactor_r2 );
  950. const __m128 hi_part_r1_r3 = _mm_unpacklo_ps( cofactor_r1, cofactor_r3 );
  951. const __m128 lo_part_r1_r3 = _mm_unpackhi_ps( cofactor_r1, cofactor_r3 );
  952. const __m128 adjoint_r0 = _mm_unpacklo_ps( hi_part_r0_r2, hi_part_r1_r3 );
  953. const __m128 adjoint_r1 = _mm_unpackhi_ps( hi_part_r0_r2, hi_part_r1_r3 );
  954. const __m128 adjoint_r2 = _mm_unpacklo_ps( lo_part_r0_r2, lo_part_r1_r3 );
  955. const __m128 adjoint_r3 = _mm_unpackhi_ps( lo_part_r0_r2, lo_part_r1_r3 );
  956. _mm_storeu_ps( out.m + 0 * 4, _mm_mul_ps( adjoint_r0, rcpDet ) );
  957. _mm_storeu_ps( out.m + 1 * 4, _mm_mul_ps( adjoint_r1, rcpDet ) );
  958. _mm_storeu_ps( out.m + 2 * 4, _mm_mul_ps( adjoint_r2, rcpDet ) );
  959. _mm_storeu_ps( out.m + 3 * 4, _mm_mul_ps( adjoint_r3, rcpDet ) );
  960. #else
  961. const int FRL = 4;
  962. // 84+4+16 = 104 multiplications
  963. // 1 division
  964. // 2x2 sub-determinants required to calculate 4x4 determinant
  965. const float det2_01_01 = src.m[0*FRL+0] * src.m[1*FRL+1] - src.m[0*FRL+1] * src.m[1*FRL+0];
  966. const float det2_01_02 = src.m[0*FRL+0] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+0];
  967. const float det2_01_03 = src.m[0*FRL+0] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+0];
  968. const float det2_01_12 = src.m[0*FRL+1] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+1];
  969. const float det2_01_13 = src.m[0*FRL+1] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+1];
  970. const float det2_01_23 = src.m[0*FRL+2] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+2];
  971. // 3x3 sub-determinants required to calculate 4x4 determinant
  972. const float det3_201_012 = src.m[2*FRL+0] * det2_01_12 - src.m[2*FRL+1] * det2_01_02 + src.m[2*FRL+2] * det2_01_01;
  973. const float det3_201_013 = src.m[2*FRL+0] * det2_01_13 - src.m[2*FRL+1] * det2_01_03 + src.m[2*FRL+3] * det2_01_01;
  974. const float det3_201_023 = src.m[2*FRL+0] * det2_01_23 - src.m[2*FRL+2] * det2_01_03 + src.m[2*FRL+3] * det2_01_02;
  975. const float det3_201_123 = src.m[2*FRL+1] * det2_01_23 - src.m[2*FRL+2] * det2_01_13 + src.m[2*FRL+3] * det2_01_12;
  976. const float det = ( - det3_201_123 * src.m[3*FRL+0] + det3_201_023 * src.m[3*FRL+1] - det3_201_013 * src.m[3*FRL+2] + det3_201_012 * src.m[3*FRL+3] );
  977. if ( idMath::Fabs( det ) < RENDER_MATRIX_INVERSE_EPSILON ) {
  978. return false;
  979. }
  980. const float rcpDet = 1.0f / det;
  981. // remaining 2x2 sub-determinants
  982. const float det2_03_01 = src.m[0*FRL+0] * src.m[3*FRL+1] - src.m[0*FRL+1] * src.m[3*FRL+0];
  983. const float det2_03_02 = src.m[0*FRL+0] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+0];
  984. const float det2_03_03 = src.m[0*FRL+0] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+0];
  985. const float det2_03_12 = src.m[0*FRL+1] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+1];
  986. const float det2_03_13 = src.m[0*FRL+1] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+1];
  987. const float det2_03_23 = src.m[0*FRL+2] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+2];
  988. const float det2_13_01 = src.m[1*FRL+0] * src.m[3*FRL+1] - src.m[1*FRL+1] * src.m[3*FRL+0];
  989. const float det2_13_02 = src.m[1*FRL+0] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+0];
  990. const float det2_13_03 = src.m[1*FRL+0] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+0];
  991. const float det2_13_12 = src.m[1*FRL+1] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+1];
  992. const float det2_13_13 = src.m[1*FRL+1] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+1];
  993. const float det2_13_23 = src.m[1*FRL+2] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+2];
  994. // remaining 3x3 sub-determinants
  995. const float det3_203_012 = src.m[2*FRL+0] * det2_03_12 - src.m[2*FRL+1] * det2_03_02 + src.m[2*FRL+2] * det2_03_01;
  996. const float det3_203_013 = src.m[2*FRL+0] * det2_03_13 - src.m[2*FRL+1] * det2_03_03 + src.m[2*FRL+3] * det2_03_01;
  997. const float det3_203_023 = src.m[2*FRL+0] * det2_03_23 - src.m[2*FRL+2] * det2_03_03 + src.m[2*FRL+3] * det2_03_02;
  998. const float det3_203_123 = src.m[2*FRL+1] * det2_03_23 - src.m[2*FRL+2] * det2_03_13 + src.m[2*FRL+3] * det2_03_12;
  999. const float det3_213_012 = src.m[2*FRL+0] * det2_13_12 - src.m[2*FRL+1] * det2_13_02 + src.m[2*FRL+2] * det2_13_01;
  1000. const float det3_213_013 = src.m[2*FRL+0] * det2_13_13 - src.m[2*FRL+1] * det2_13_03 + src.m[2*FRL+3] * det2_13_01;
  1001. const float det3_213_023 = src.m[2*FRL+0] * det2_13_23 - src.m[2*FRL+2] * det2_13_03 + src.m[2*FRL+3] * det2_13_02;
  1002. const float det3_213_123 = src.m[2*FRL+1] * det2_13_23 - src.m[2*FRL+2] * det2_13_13 + src.m[2*FRL+3] * det2_13_12;
  1003. const float det3_301_012 = src.m[3*FRL+0] * det2_01_12 - src.m[3*FRL+1] * det2_01_02 + src.m[3*FRL+2] * det2_01_01;
  1004. const float det3_301_013 = src.m[3*FRL+0] * det2_01_13 - src.m[3*FRL+1] * det2_01_03 + src.m[3*FRL+3] * det2_01_01;
  1005. const float det3_301_023 = src.m[3*FRL+0] * det2_01_23 - src.m[3*FRL+2] * det2_01_03 + src.m[3*FRL+3] * det2_01_02;
  1006. const float det3_301_123 = src.m[3*FRL+1] * det2_01_23 - src.m[3*FRL+2] * det2_01_13 + src.m[3*FRL+3] * det2_01_12;
  1007. out.m[0*FRL+0] = - det3_213_123 * rcpDet;
  1008. out.m[1*FRL+0] = + det3_213_023 * rcpDet;
  1009. out.m[2*FRL+0] = - det3_213_013 * rcpDet;
  1010. out.m[3*FRL+0] = + det3_213_012 * rcpDet;
  1011. out.m[0*FRL+1] = + det3_203_123 * rcpDet;
  1012. out.m[1*FRL+1] = - det3_203_023 * rcpDet;
  1013. out.m[2*FRL+1] = + det3_203_013 * rcpDet;
  1014. out.m[3*FRL+1] = - det3_203_012 * rcpDet;
  1015. out.m[0*FRL+2] = + det3_301_123 * rcpDet;
  1016. out.m[1*FRL+2] = - det3_301_023 * rcpDet;
  1017. out.m[2*FRL+2] = + det3_301_013 * rcpDet;
  1018. out.m[3*FRL+2] = - det3_301_012 * rcpDet;
  1019. out.m[0*FRL+3] = - det3_201_123 * rcpDet;
  1020. out.m[1*FRL+3] = + det3_201_023 * rcpDet;
  1021. out.m[2*FRL+3] = - det3_201_013 * rcpDet;
  1022. out.m[3*FRL+3] = + det3_201_012 * rcpDet;
  1023. #endif
  1024. return true;
  1025. }
  1026. /*
  1027. ========================
  1028. idRenderMatrix::InverseByTranspose
  1029. ========================
  1030. */
  1031. void idRenderMatrix::InverseByTranspose( const idRenderMatrix & src, idRenderMatrix & out ) {
  1032. assert( &src != &out );
  1033. assert( src.IsAffineTransform( 0.01f ) );
  1034. out[0][0] = src[0][0];
  1035. out[1][0] = src[0][1];
  1036. out[2][0] = src[0][2];
  1037. out[3][0] = 0.0f;
  1038. out[0][1] = src[1][0];
  1039. out[1][1] = src[1][1];
  1040. out[2][1] = src[1][2];
  1041. out[3][1] = 0.0f;
  1042. out[0][2] = src[2][0];
  1043. out[1][2] = src[2][1];
  1044. out[2][2] = src[2][2];
  1045. out[3][2] = 0.0f;
  1046. out[0][3] = -( src[0][0] * src[0][3] + src[1][0] * src[1][3] + src[2][0] * src[2][3] );
  1047. out[1][3] = -( src[0][1] * src[0][3] + src[1][1] * src[1][3] + src[2][1] * src[2][3] );
  1048. out[2][3] = -( src[0][2] * src[0][3] + src[1][2] * src[1][3] + src[2][2] * src[2][3] );
  1049. out[3][3] = 1.0f;
  1050. }
  1051. /*
  1052. ========================
  1053. idRenderMatrix::InverseByDoubles
  1054. This should never be used at run-time.
  1055. This is only for tools where more precision is needed.
  1056. ========================
  1057. */
  1058. bool idRenderMatrix::InverseByDoubles( const idRenderMatrix & src, idRenderMatrix & out ) {
  1059. const int FRL = 4;
  1060. // 84+4+16 = 104 multiplications
  1061. // 1 division
  1062. // 2x2 sub-determinants required to calculate 4x4 determinant
  1063. const double det2_01_01 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+1] - (double)src.m[0*FRL+1] * (double)src.m[1*FRL+0];
  1064. const double det2_01_02 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[1*FRL+0];
  1065. const double det2_01_03 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+0];
  1066. const double det2_01_12 = (double)src.m[0*FRL+1] * (double)src.m[1*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[1*FRL+1];
  1067. const double det2_01_13 = (double)src.m[0*FRL+1] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+1];
  1068. const double det2_01_23 = (double)src.m[0*FRL+2] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+2];
  1069. // 3x3 sub-determinants required to calculate 4x4 determinant
  1070. const double det3_201_012 = (double)src.m[2*FRL+0] * det2_01_12 - (double)src.m[2*FRL+1] * det2_01_02 + (double)src.m[2*FRL+2] * det2_01_01;
  1071. const double det3_201_013 = (double)src.m[2*FRL+0] * det2_01_13 - (double)src.m[2*FRL+1] * det2_01_03 + (double)src.m[2*FRL+3] * det2_01_01;
  1072. const double det3_201_023 = (double)src.m[2*FRL+0] * det2_01_23 - (double)src.m[2*FRL+2] * det2_01_03 + (double)src.m[2*FRL+3] * det2_01_02;
  1073. const double det3_201_123 = (double)src.m[2*FRL+1] * det2_01_23 - (double)src.m[2*FRL+2] * det2_01_13 + (double)src.m[2*FRL+3] * det2_01_12;
  1074. const double det = ( - det3_201_123 * (double)src.m[3*FRL+0] + det3_201_023 * (double)src.m[3*FRL+1] - det3_201_013 * (double)src.m[3*FRL+2] + det3_201_012 * (double)src.m[3*FRL+3] );
  1075. const double rcpDet = 1.0f / det;
  1076. // remaining 2x2 sub-determinants
  1077. const double det2_03_01 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+1] - (double)src.m[0*FRL+1] * (double)src.m[3*FRL+0];
  1078. const double det2_03_02 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[3*FRL+0];
  1079. const double det2_03_03 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+0];
  1080. const double det2_03_12 = (double)src.m[0*FRL+1] * (double)src.m[3*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[3*FRL+1];
  1081. const double det2_03_13 = (double)src.m[0*FRL+1] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+1];
  1082. const double det2_03_23 = (double)src.m[0*FRL+2] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+2];
  1083. const double det2_13_01 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+1] - (double)src.m[1*FRL+1] * (double)src.m[3*FRL+0];
  1084. const double det2_13_02 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+2] - (double)src.m[1*FRL+2] * (double)src.m[3*FRL+0];
  1085. const double det2_13_03 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+0];
  1086. const double det2_13_12 = (double)src.m[1*FRL+1] * (double)src.m[3*FRL+2] - (double)src.m[1*FRL+2] * (double)src.m[3*FRL+1];
  1087. const double det2_13_13 = (double)src.m[1*FRL+1] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+1];
  1088. const double det2_13_23 = (double)src.m[1*FRL+2] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+2];
  1089. // remaining 3x3 sub-determinants
  1090. const double det3_203_012 = (double)src.m[2*FRL+0] * det2_03_12 - (double)src.m[2*FRL+1] * det2_03_02 + (double)src.m[2*FRL+2] * det2_03_01;
  1091. const double det3_203_013 = (double)src.m[2*FRL+0] * det2_03_13 - (double)src.m[2*FRL+1] * det2_03_03 + (double)src.m[2*FRL+3] * det2_03_01;
  1092. const double det3_203_023 = (double)src.m[2*FRL+0] * det2_03_23 - (double)src.m[2*FRL+2] * det2_03_03 + (double)src.m[2*FRL+3] * det2_03_02;
  1093. const double det3_203_123 = (double)src.m[2*FRL+1] * det2_03_23 - (double)src.m[2*FRL+2] * det2_03_13 + (double)src.m[2*FRL+3] * det2_03_12;
  1094. const double det3_213_012 = (double)src.m[2*FRL+0] * det2_13_12 - (double)src.m[2*FRL+1] * det2_13_02 + (double)src.m[2*FRL+2] * det2_13_01;
  1095. const double det3_213_013 = (double)src.m[2*FRL+0] * det2_13_13 - (double)src.m[2*FRL+1] * det2_13_03 + (double)src.m[2*FRL+3] * det2_13_01;
  1096. const double det3_213_023 = (double)src.m[2*FRL+0] * det2_13_23 - (double)src.m[2*FRL+2] * det2_13_03 + (double)src.m[2*FRL+3] * det2_13_02;
  1097. const double det3_213_123 = (double)src.m[2*FRL+1] * det2_13_23 - (double)src.m[2*FRL+2] * det2_13_13 + (double)src.m[2*FRL+3] * det2_13_12;
  1098. const double det3_301_012 = (double)src.m[3*FRL+0] * det2_01_12 - (double)src.m[3*FRL+1] * det2_01_02 + (double)src.m[3*FRL+2] * det2_01_01;
  1099. const double det3_301_013 = (double)src.m[3*FRL+0] * det2_01_13 - (double)src.m[3*FRL+1] * det2_01_03 + (double)src.m[3*FRL+3] * det2_01_01;
  1100. const double det3_301_023 = (double)src.m[3*FRL+0] * det2_01_23 - (double)src.m[3*FRL+2] * det2_01_03 + (double)src.m[3*FRL+3] * det2_01_02;
  1101. const double det3_301_123 = (double)src.m[3*FRL+1] * det2_01_23 - (double)src.m[3*FRL+2] * det2_01_13 + (double)src.m[3*FRL+3] * det2_01_12;
  1102. out.m[0*FRL+0] = (float)( - det3_213_123 * rcpDet );
  1103. out.m[1*FRL+0] = (float)( + det3_213_023 * rcpDet );
  1104. out.m[2*FRL+0] = (float)( - det3_213_013 * rcpDet );
  1105. out.m[3*FRL+0] = (float)( + det3_213_012 * rcpDet );
  1106. out.m[0*FRL+1] = (float)( + det3_203_123 * rcpDet );
  1107. out.m[1*FRL+1] = (float)( - det3_203_023 * rcpDet );
  1108. out.m[2*FRL+1] = (float)( + det3_203_013 * rcpDet );
  1109. out.m[3*FRL+1] = (float)( - det3_203_012 * rcpDet );
  1110. out.m[0*FRL+2] = (float)( + det3_301_123 * rcpDet );
  1111. out.m[1*FRL+2] = (float)( - det3_301_023 * rcpDet );
  1112. out.m[2*FRL+2] = (float)( + det3_301_013 * rcpDet );
  1113. out.m[3*FRL+2] = (float)( - det3_301_012 * rcpDet );
  1114. out.m[0*FRL+3] = (float)( - det3_201_123 * rcpDet );
  1115. out.m[1*FRL+3] = (float)( + det3_201_023 * rcpDet );
  1116. out.m[2*FRL+3] = (float)( - det3_201_013 * rcpDet );
  1117. out.m[3*FRL+3] = (float)( + det3_201_012 * rcpDet );
  1118. return true;
  1119. }
  1120. /*
  1121. ========================
  1122. DeterminantIsNegative
  1123. ========================
  1124. */
  1125. #ifdef ID_WIN_X86_SSE2_INTRIN
  1126. void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const __m128 & r1, const __m128 & r2, const __m128 & r3 ) {
  1127. const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  1128. const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  1129. const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1130. const __m128 r2u2 = _mm_perm_ps( r2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  1131. const __m128 r2u3 = _mm_perm_ps( r2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1132. const __m128 r3u1 = _mm_perm_ps( r3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
  1133. const __m128 r3u2 = _mm_perm_ps( r3, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  1134. const __m128 r3u3 = _mm_perm_ps( r3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1135. const __m128 m_r2u2_r3u3 = _mm_mul_ps( r2u2, r3u3 );
  1136. const __m128 m_r1u1_r2u2_r3u3 = _mm_mul_ps( r1u1, m_r2u2_r3u3 );
  1137. const __m128 m_r2u3_r3u1 = _mm_mul_ps( r2u3, r3u1 );
  1138. const __m128 a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 = _mm_madd_ps( r1u2, m_r2u3_r3u1, m_r1u1_r2u2_r3u3 );
  1139. const __m128 m_r2u1_r3u2 = _mm_perm_ps( m_r2u2_r3u3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1140. const __m128 pos_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u1_r3u2, a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 );
  1141. const __m128 m_r2u3_r3u2 = _mm_mul_ps( r2u3, r3u2 );
  1142. const __m128 m_r1u1_r2u3_r3u2 = _mm_mul_ps( r1u1, m_r2u3_r3u2 );
  1143. const __m128 m_r2u1_r3u3 = _mm_perm_ps( m_r2u3_r3u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
  1144. const __m128 a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 = _mm_madd_ps( r1u2, m_r2u1_r3u3, m_r1u1_r2u3_r3u2 );
  1145. const __m128 m_r2u2_r3u1 = _mm_perm_ps( m_r2u3_r3u2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1146. const __m128 neg_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u2_r3u1, a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 );
  1147. const __m128 det3x3_r0 = _mm_sub_ps( pos_part_det3x3_r0, neg_part_det3x3_r0 );
  1148. const __m128 c_zero = _mm_setzero_ps();
  1149. const __m128 c_mask = _mm_cmpeq_ps( c_zero, c_zero );
  1150. const __m128 c_signmask = _mm_castsi128_ps( _mm_slli_epi32( _mm_castps_si128( c_mask ), 31 ) );
  1151. const __m128 c_znzn = _mm_unpacklo_ps( c_zero, c_signmask );
  1152. const __m128 cofactor_r0 = _mm_xor_ps( det3x3_r0, c_znzn );
  1153. const __m128 dot0 = _mm_mul_ps( r0, cofactor_r0 );
  1154. const __m128 dot1 = _mm_add_ps( dot0, _mm_perm_ps( dot0, _MM_SHUFFLE( 2, 1, 0, 3 ) ) );
  1155. const __m128 det = _mm_add_ps( dot1, _mm_perm_ps( dot1, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1156. const __m128 result = _mm_cmpgt_ps( c_zero, det );
  1157. negativeDeterminant = _mm_movemask_ps( result ) & 1;
  1158. }
  1159. #else
  void DeterminantIsNegative( bool & negativeDeterminant, const float * row0, const float * row1, const float * row2, const float * row3 ) {
      // Sets 'negativeDeterminant' to true when the determinant of the 4x4 matrix
      // given by its four rows (four floats each) is below zero.

      // 2x2 determinants taken from rows 0 and 1; the suffix names the column pair
      const float pair01 = row0[0] * row1[1] - row0[1] * row1[0];
      const float pair02 = row0[0] * row1[2] - row0[2] * row1[0];
      const float pair03 = row0[0] * row1[3] - row0[3] * row1[0];
      const float pair12 = row0[1] * row1[2] - row0[2] * row1[1];
      const float pair13 = row0[1] * row1[3] - row0[3] * row1[1];
      const float pair23 = row0[2] * row1[3] - row0[3] * row1[2];

      // 3x3 determinants over rows 0..2; the suffix names the column triple
      const float triple012 = row2[0] * pair12 - row2[1] * pair02 + row2[2] * pair01;
      const float triple013 = row2[0] * pair13 - row2[1] * pair03 + row2[3] * pair01;
      const float triple023 = row2[0] * pair23 - row2[2] * pair03 + row2[3] * pair02;
      const float triple123 = row2[1] * pair23 - row2[2] * pair13 + row2[3] * pair12;

      // expand along row 3 with alternating signs
      const float determinant = ( - triple123 * row3[0] + triple023 * row3[1] - triple013 * row3[2] + triple012 * row3[3] );

      negativeDeterminant = ( determinant < 0.0f );
  }
  1176. #endif
  1177. /*
  1178. ========================
  1179. idRenderMatrix::CopyMatrix
  1180. ========================
  1181. */
  1182. void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3 ) {
  1183. assert_16_byte_aligned( row0.ToFloatPtr() );
  1184. assert_16_byte_aligned( row1.ToFloatPtr() );
  1185. assert_16_byte_aligned( row2.ToFloatPtr() );
  1186. assert_16_byte_aligned( row3.ToFloatPtr() );
  1187. #ifdef ID_WIN_X86_SSE2_INTRIN
  1188. const __m128 r0 = _mm_loadu_ps( matrix.m + 0 * 4 );
  1189. const __m128 r1 = _mm_loadu_ps( matrix.m + 1 * 4 );
  1190. const __m128 r2 = _mm_loadu_ps( matrix.m + 2 * 4 );
  1191. const __m128 r3 = _mm_loadu_ps( matrix.m + 3 * 4 );
  1192. _mm_store_ps( row0.ToFloatPtr(), r0 );
  1193. _mm_store_ps( row1.ToFloatPtr(), r1 );
  1194. _mm_store_ps( row2.ToFloatPtr(), r2 );
  1195. _mm_store_ps( row3.ToFloatPtr(), r3 );
  1196. #else
  1197. memcpy( row0.ToFloatPtr(), matrix[0], sizeof( idVec4 ) );
  1198. memcpy( row1.ToFloatPtr(), matrix[1], sizeof( idVec4 ) );
  1199. memcpy( row2.ToFloatPtr(), matrix[2], sizeof( idVec4 ) );
  1200. memcpy( row3.ToFloatPtr(), matrix[3], sizeof( idVec4 ) );
  1201. #endif
  1202. }
  1203. /*
  1204. ========================
  1205. idRenderMatrix::SetMVP
  1206. ========================
  1207. */
  1208. void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) {
  1209. assert_16_byte_aligned( row0.ToFloatPtr() );
  1210. assert_16_byte_aligned( row1.ToFloatPtr() );
  1211. assert_16_byte_aligned( row2.ToFloatPtr() );
  1212. assert_16_byte_aligned( row3.ToFloatPtr() );
  1213. #ifdef ID_WIN_X86_SSE2_INTRIN
  1214. const __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
  1215. const __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
  1216. const __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 );
  1217. const __m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 );
  1218. _mm_store_ps( row0.ToFloatPtr(), r0 );
  1219. _mm_store_ps( row1.ToFloatPtr(), r1 );
  1220. _mm_store_ps( row2.ToFloatPtr(), r2 );
  1221. _mm_store_ps( row3.ToFloatPtr(), r3 );
  1222. DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 );
  1223. #else
  1224. memcpy( row0.ToFloatPtr(), mvp[0], sizeof( idVec4 ) );
  1225. memcpy( row1.ToFloatPtr(), mvp[1], sizeof( idVec4 ) );
  1226. memcpy( row2.ToFloatPtr(), mvp[2], sizeof( idVec4 ) );
  1227. memcpy( row3.ToFloatPtr(), mvp[3], sizeof( idVec4 ) );
  1228. DeterminantIsNegative( negativeDeterminant, mvp[0], mvp[1], mvp[2], mvp[3] );
  1229. #endif
  1230. }
  1231. /*
  1232. ========================
  1233. idRenderMatrix::SetMVPForBounds
  1234. ========================
  1235. */
  1236. void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds & bounds, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) {
  1237. assert_16_byte_aligned( row0.ToFloatPtr() );
  1238. assert_16_byte_aligned( row1.ToFloatPtr() );
  1239. assert_16_byte_aligned( row2.ToFloatPtr() );
  1240. assert_16_byte_aligned( row3.ToFloatPtr() );
  1241. #ifdef ID_WIN_X86_SSE2_INTRIN
  1242. __m128 b0 = _mm_loadu_bounds_0( bounds );
  1243. __m128 b1 = _mm_loadu_bounds_1( bounds );
  1244. __m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_half );
  1245. __m128 scale = _mm_mul_ps( _mm_sub_ps( b1, b0 ), vector_float_half );
  1246. scale = _mm_or_ps( scale, vector_float_last_one );
  1247. __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
  1248. __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
  1249. __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 );
  1250. __m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 );
  1251. __m128 d0 = _mm_mul_ps( r0, offset );
  1252. __m128 d1 = _mm_mul_ps( r1, offset );
  1253. __m128 d2 = _mm_mul_ps( r2, offset );
  1254. __m128 d3 = _mm_mul_ps( r3, offset );
  1255. __m128 s0 = _mm_unpacklo_ps( d0, d2 ); // a0, c0, a1, c1
  1256. __m128 s1 = _mm_unpackhi_ps( d0, d2 ); // a2, c2, a3, c3
  1257. __m128 s2 = _mm_unpacklo_ps( d1, d3 ); // b0, d0, b1, d1
  1258. __m128 s3 = _mm_unpackhi_ps( d1, d3 ); // b2, d2, b3, d3
  1259. __m128 t0 = _mm_unpacklo_ps( s0, s2 ); // a0, b0, c0, d0
  1260. __m128 t1 = _mm_unpackhi_ps( s0, s2 ); // a1, b1, c1, d1
  1261. __m128 t2 = _mm_unpacklo_ps( s1, s3 ); // a2, b2, c2, d2
  1262. t0 = _mm_add_ps( t0, t1 );
  1263. t0 = _mm_add_ps( t0, t2 );
  1264. __m128 n0 = _mm_and_ps( _mm_splat_ps( t0, 0 ), vector_float_keep_last );
  1265. __m128 n1 = _mm_and_ps( _mm_splat_ps( t0, 1 ), vector_float_keep_last );
  1266. __m128 n2 = _mm_and_ps( _mm_splat_ps( t0, 2 ), vector_float_keep_last );
  1267. __m128 n3 = _mm_and_ps( _mm_splat_ps( t0, 3 ), vector_float_keep_last );
  1268. r0 = _mm_madd_ps( r0, scale, n0 );
  1269. r1 = _mm_madd_ps( r1, scale, n1 );
  1270. r2 = _mm_madd_ps( r2, scale, n2 );
  1271. r3 = _mm_madd_ps( r3, scale, n3 );
  1272. _mm_store_ps( row0.ToFloatPtr(), r0 );
  1273. _mm_store_ps( row1.ToFloatPtr(), r1 );
  1274. _mm_store_ps( row2.ToFloatPtr(), r2 );
  1275. _mm_store_ps( row3.ToFloatPtr(), r3 );
  1276. DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 );
  1277. #else
  1278. const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
  1279. const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;
  1280. row0[0] = mvp[0][0] * scale[0];
  1281. row0[1] = mvp[0][1] * scale[1];
  1282. row0[2] = mvp[0][2] * scale[2];
  1283. row0[3] = mvp[0][3] + mvp[0][0] * offset[0] + mvp[0][1] * offset[1] + mvp[0][2] * offset[2];
  1284. row1[0] = mvp[1][0] * scale[0];
  1285. row1[1] = mvp[1][1] * scale[1];
  1286. row1[2] = mvp[1][2] * scale[2];
  1287. row1[3] = mvp[1][3] + mvp[1][0] * offset[0] + mvp[1][1] * offset[1] + mvp[1][2] * offset[2];
  1288. row2[0] = mvp[2][0] * scale[0];
  1289. row2[1] = mvp[2][1] * scale[1];
  1290. row2[2] = mvp[2][2] * scale[2];
  1291. row2[3] = mvp[2][3] + mvp[2][0] * offset[0] + mvp[2][1] * offset[1] + mvp[2][2] * offset[2];
  1292. row3[0] = mvp[3][0] * scale[0];
  1293. row3[1] = mvp[3][1] * scale[1];
  1294. row3[2] = mvp[3][2] * scale[2];
  1295. row3[3] = mvp[3][3] + mvp[3][0] * offset[0] + mvp[3][1] * offset[1] + mvp[3][2] * offset[2];
  1296. DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() );
  1297. #endif
  1298. }
  1299. /*
  1300. ========================
  1301. idRenderMatrix::SetMVPForInverseProject
  1302. ========================
  1303. */
  1304. void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const idRenderMatrix & inverseProject, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) {
  1305. assert_16_byte_aligned( row0.ToFloatPtr() );
  1306. assert_16_byte_aligned( row1.ToFloatPtr() );
  1307. assert_16_byte_aligned( row2.ToFloatPtr() );
  1308. assert_16_byte_aligned( row3.ToFloatPtr() );
  1309. #ifdef ID_WIN_X86_SSE2_INTRIN
  1310. __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
  1311. __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
  1312. __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 );
  1313. __m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 );
  1314. __m128 p0 = _mm_loadu_ps( inverseProject.m + 0 * 4 );
  1315. __m128 p1 = _mm_loadu_ps( inverseProject.m + 1 * 4 );
  1316. __m128 p2 = _mm_loadu_ps( inverseProject.m + 2 * 4 );
  1317. __m128 p3 = _mm_loadu_ps( inverseProject.m + 3 * 4 );
  1318. __m128 t0 = _mm_mul_ps( _mm_splat_ps( r0, 0 ), p0 );
  1319. __m128 t1 = _mm_mul_ps( _mm_splat_ps( r1, 0 ), p0 );
  1320. __m128 t2 = _mm_mul_ps( _mm_splat_ps( r2, 0 ), p0 );
  1321. __m128 t3 = _mm_mul_ps( _mm_splat_ps( r3, 0 ), p0 );
  1322. t0 = _mm_madd_ps( _mm_splat_ps( r0, 1 ), p1, t0 );
  1323. t1 = _mm_madd_ps( _mm_splat_ps( r1, 1 ), p1, t1 );
  1324. t2 = _mm_madd_ps( _mm_splat_ps( r2, 1 ), p1, t2 );
  1325. t3 = _mm_madd_ps( _mm_splat_ps( r3, 1 ), p1, t3 );
  1326. t0 = _mm_madd_ps( _mm_splat_ps( r0, 2 ), p2, t0 );
  1327. t1 = _mm_madd_ps( _mm_splat_ps( r1, 2 ), p2, t1 );
  1328. t2 = _mm_madd_ps( _mm_splat_ps( r2, 2 ), p2, t2 );
  1329. t3 = _mm_madd_ps( _mm_splat_ps( r3, 2 ), p2, t3 );
  1330. t0 = _mm_madd_ps( _mm_splat_ps( r0, 3 ), p3, t0 );
  1331. t1 = _mm_madd_ps( _mm_splat_ps( r1, 3 ), p3, t1 );
  1332. t2 = _mm_madd_ps( _mm_splat_ps( r2, 3 ), p3, t2 );
  1333. t3 = _mm_madd_ps( _mm_splat_ps( r3, 3 ), p3, t3 );
  1334. _mm_store_ps( row0.ToFloatPtr(), t0 );
  1335. _mm_store_ps( row1.ToFloatPtr(), t1 );
  1336. _mm_store_ps( row2.ToFloatPtr(), t2 );
  1337. _mm_store_ps( row3.ToFloatPtr(), t3 );
  1338. DeterminantIsNegative( negativeDeterminant, t0, t1, t2, t3 );
  1339. #else
  1340. row0[0] = mvp.m[0*4+0]*inverseProject.m[0*4+0] + mvp.m[0*4+1]*inverseProject.m[1*4+0] + mvp.m[0*4+2]*inverseProject.m[2*4+0] + mvp.m[0*4+3]*inverseProject.m[3*4+0];
  1341. row0[1] = mvp.m[0*4+0]*inverseProject.m[0*4+1] + mvp.m[0*4+1]*inverseProject.m[1*4+1] + mvp.m[0*4+2]*inverseProject.m[2*4+1] + mvp.m[0*4+3]*inverseProject.m[3*4+1];
  1342. row0[2] = mvp.m[0*4+0]*inverseProject.m[0*4+2] + mvp.m[0*4+1]*inverseProject.m[1*4+2] + mvp.m[0*4+2]*inverseProject.m[2*4+2] + mvp.m[0*4+3]*inverseProject.m[3*4+2];
  1343. row0[3] = mvp.m[0*4+0]*inverseProject.m[0*4+3] + mvp.m[0*4+1]*inverseProject.m[1*4+3] + mvp.m[0*4+2]*inverseProject.m[2*4+3] + mvp.m[0*4+3]*inverseProject.m[3*4+3];
  1344. row1[0] = mvp.m[1*4+0]*inverseProject.m[0*4+0] + mvp.m[1*4+1]*inverseProject.m[1*4+0] + mvp.m[1*4+2]*inverseProject.m[2*4+0] + mvp.m[1*4+3]*inverseProject.m[3*4+0];
  1345. row1[1] = mvp.m[1*4+0]*inverseProject.m[0*4+1] + mvp.m[1*4+1]*inverseProject.m[1*4+1] + mvp.m[1*4+2]*inverseProject.m[2*4+1] + mvp.m[1*4+3]*inverseProject.m[3*4+1];
  1346. row1[2] = mvp.m[1*4+0]*inverseProject.m[0*4+2] + mvp.m[1*4+1]*inverseProject.m[1*4+2] + mvp.m[1*4+2]*inverseProject.m[2*4+2] + mvp.m[1*4+3]*inverseProject.m[3*4+2];
  1347. row1[3] = mvp.m[1*4+0]*inverseProject.m[0*4+3] + mvp.m[1*4+1]*inverseProject.m[1*4+3] + mvp.m[1*4+2]*inverseProject.m[2*4+3] + mvp.m[1*4+3]*inverseProject.m[3*4+3];
  1348. row2[0] = mvp.m[2*4+0]*inverseProject.m[0*4+0] + mvp.m[2*4+1]*inverseProject.m[1*4+0] + mvp.m[2*4+2]*inverseProject.m[2*4+0] + mvp.m[2*4+3]*inverseProject.m[3*4+0];
  1349. row2[1] = mvp.m[2*4+0]*inverseProject.m[0*4+1] + mvp.m[2*4+1]*inverseProject.m[1*4+1] + mvp.m[2*4+2]*inverseProject.m[2*4+1] + mvp.m[2*4+3]*inverseProject.m[3*4+1];
  1350. row2[2] = mvp.m[2*4+0]*inverseProject.m[0*4+2] + mvp.m[2*4+1]*inverseProject.m[1*4+2] + mvp.m[2*4+2]*inverseProject.m[2*4+2] + mvp.m[2*4+3]*inverseProject.m[3*4+2];
  1351. row2[3] = mvp.m[2*4+0]*inverseProject.m[0*4+3] + mvp.m[2*4+1]*inverseProject.m[1*4+3] + mvp.m[2*4+2]*inverseProject.m[2*4+3] + mvp.m[2*4+3]*inverseProject.m[3*4+3];
  1352. row3[0] = mvp.m[3*4+0]*inverseProject.m[0*4+0] + mvp.m[3*4+1]*inverseProject.m[1*4+0] + mvp.m[3*4+2]*inverseProject.m[2*4+0] + mvp.m[3*4+3]*inverseProject.m[3*4+0];
  1353. row3[1] = mvp.m[3*4+0]*inverseProject.m[0*4+1] + mvp.m[3*4+1]*inverseProject.m[1*4+1] + mvp.m[3*4+2]*inverseProject.m[2*4+1] + mvp.m[3*4+3]*inverseProject.m[3*4+1];
  1354. row3[2] = mvp.m[3*4+0]*inverseProject.m[0*4+2] + mvp.m[3*4+1]*inverseProject.m[1*4+2] + mvp.m[3*4+2]*inverseProject.m[2*4+2] + mvp.m[3*4+3]*inverseProject.m[3*4+2];
  1355. row3[3] = mvp.m[3*4+0]*inverseProject.m[0*4+3] + mvp.m[3*4+1]*inverseProject.m[1*4+3] + mvp.m[3*4+2]*inverseProject.m[2*4+3] + mvp.m[3*4+3]*inverseProject.m[3*4+3];
  1356. DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() );
  1357. #endif
  1358. }
  1359. /*
  1360. ========================
  1361. idRenderMatrix::CullPointToMVPbits
  1362. Returns true if the point transformed by the given Model View Projection (MVP) matrix is
  1363. outside the clip space.
  1364. Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
  1365. to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
  1366. ========================
  1367. */
  1368. bool idRenderMatrix::CullPointToMVPbits( const idRenderMatrix & mvp, const idVec3 & p, byte * outBits, bool zeroToOne ) {
  1369. idVec4 c;
  1370. for ( int i = 0; i < 4; i++ ) {
  1371. c[i] = p[0] * mvp[i][0] + p[1] * mvp[i][1] + p[2] * mvp[i][2] + mvp[i][3];
  1372. }
  1373. const float minW = zeroToOne ? 0.0f : -c[3];
  1374. const float maxW = c[3];
  1375. #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
  1376. const float minZ = 0.0f;
  1377. #else
  1378. const float minZ = minW;
  1379. #endif
  1380. int bits = 0;
  1381. if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
  1382. if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
  1383. if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
  1384. if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
  1385. if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ
  1386. if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
  1387. // store out a bit set for each side where the point is outside the clip space
  1388. *outBits = (byte)( bits ^ 63 );
  1389. // if any bits weren't set, the point is completely off one side of the frustum
  1390. return ( bits != 63 );
  1391. }
  1392. /*
  1393. ========================
  1394. idRenderMatrix::CullBoundsToMVPbits
  1395. Returns true if nothing contained in the bounds is transformed by the given
  1396. Model View Projection (MVP) matrix to anything inside the clip space.
  1397. Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
  1398. to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
  1399. When all the corners of the bounding box are behind one of the six frustum planes, the box is
  1400. culled. This is conservative, because some boxes may "cross corners" and can be in front of a
  1401. frustum plane, but only while also being behind another one.
  1402. ========================
  1403. */
  1404. bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, byte * outBits, bool zeroToOne ) {
  1405. #ifdef ID_WIN_X86_SSE2_INTRIN
  1406. __m128 mvp0 = _mm_loadu_ps( mvp[0] );
  1407. __m128 mvp1 = _mm_loadu_ps( mvp[1] );
  1408. __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  1409. __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  1410. __m128 minMul = zeroToOne ? vector_float_zero : vector_float_neg_one;
  1411. __m128 b0 = _mm_loadu_bounds_0( bounds );
  1412. __m128 b1 = _mm_loadu_bounds_1( bounds );
  1413. // take the four points on the X-Y plane
  1414. __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  1415. __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  1416. __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  1417. __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  1418. __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  1419. // compute four partial X,Y,Z,W values
  1420. __m128 parx = _mm_splat_ps( mvp0, 3 );
  1421. __m128 pary = _mm_splat_ps( mvp1, 3 );
  1422. __m128 parz = _mm_splat_ps( mvp2, 3 );
  1423. __m128 parw = _mm_splat_ps( mvp3, 3 );
  1424. parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
  1425. pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
  1426. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  1427. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  1428. parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
  1429. pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
  1430. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  1431. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  1432. // compute full X,Y,Z,W values
  1433. __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
  1434. __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
  1435. __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  1436. __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  1437. __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );
  1438. __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
  1439. __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
  1440. __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
  1441. __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );
  1442. __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
  1443. __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
  1444. __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
  1445. __m128 maxW0 = w0;
  1446. __m128 maxW1 = w1;
  1447. __m128 minW0 = _mm_mul_ps( w0, minMul );
  1448. __m128 minW1 = _mm_mul_ps( w1, minMul );
  1449. #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
  1450. __m128 minZ0 = vector_float_zero;
  1451. __m128 minZ1 = vector_float_zero;
  1452. #else
  1453. __m128 minZ0 = minW0;
  1454. __m128 minZ1 = minW1;
  1455. #endif
  1456. __m128 cullBits0 = _mm_cmpgt_ps( x0, minW0 );
  1457. __m128 cullBits1 = _mm_cmpgt_ps( maxW0, x0 );
  1458. __m128 cullBits2 = _mm_cmpgt_ps( y0, minW0 );
  1459. __m128 cullBits3 = _mm_cmpgt_ps( maxW0, y0 );
  1460. __m128 cullBits4 = _mm_cmpgt_ps( z0, minZ0 ); // NOTE: using minZ0
  1461. __m128 cullBits5 = _mm_cmpgt_ps( maxW0, z0 );
  1462. cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) );
  1463. cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) );
  1464. cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) );
  1465. cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) );
  1466. cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) ); // NOTE: using minZ1
  1467. cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) );
  1468. cullBits0 = _mm_and_ps( cullBits0, vector_float_mask0 );
  1469. cullBits1 = _mm_and_ps( cullBits1, vector_float_mask1 );
  1470. cullBits2 = _mm_and_ps( cullBits2, vector_float_mask2 );
  1471. cullBits3 = _mm_and_ps( cullBits3, vector_float_mask3 );
  1472. cullBits4 = _mm_and_ps( cullBits4, vector_float_mask4 );
  1473. cullBits5 = _mm_and_ps( cullBits5, vector_float_mask5 );
  1474. cullBits0 = _mm_or_ps( cullBits0, cullBits1 );
  1475. cullBits2 = _mm_or_ps( cullBits2, cullBits3 );
  1476. cullBits4 = _mm_or_ps( cullBits4, cullBits5 );
  1477. cullBits0 = _mm_or_ps( cullBits0, cullBits2 );
  1478. cullBits0 = _mm_or_ps( cullBits0, cullBits4 );
  1479. cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1480. cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 0, 1, 0, 1 ) ) );
  1481. int bits = _mm_cvtsi128_si32( (const __m128i &)cullBits0 );
  1482. *outBits = (byte)( bits ^ 63 );
  1483. return ( bits != 63 );
  1484. #else
  1485. int bits = 0;
  1486. idVec3 v;
  1487. for ( int x = 0; x < 2; x++ ) {
  1488. v[0] = bounds[x][0];
  1489. for ( int y = 0; y < 2; y++ ) {
  1490. v[1] = bounds[y][1];
  1491. for ( int z = 0; z < 2; z++ ) {
  1492. v[2] = bounds[z][2];
  1493. idVec4 c;
  1494. for ( int i = 0; i < 4; i++ ) {
  1495. c[i] = v[0] * mvp[i][0] + v[1] * mvp[i][1] + v[2] * mvp[i][2] + mvp[i][3];
  1496. }
  1497. const float minW = zeroToOne ? 0.0f : -c[3];
  1498. const float maxW = c[3];
  1499. #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
  1500. const float minZ = 0.0f;
  1501. #else
  1502. const float minZ = minW;
  1503. #endif
  1504. if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
  1505. if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
  1506. if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
  1507. if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
  1508. if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ
  1509. if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
  1510. }
  1511. }
  1512. }
  1513. // store out a bit set for each side where the bounds is outside the clip space
  1514. *outBits = (byte)( bits ^ 63 );
  1515. // if any bits weren't set, the bounds is completely off one side of the frustum
  1516. return ( bits != 63 );
  1517. #endif
  1518. }
  1519. /*
  1520. ========================
  1521. idRenderMatrix::CullExtrudedBoundsToMVPbits
  1522. Returns true if nothing contained in the extruded bounds is transformed by the
  1523. given Model View Projection (MVP) matrix to anything inside the clip space.
  1524. The given bounds is extruded in the 'extrudeDirection' up to the 'clipPlane'.
  1525. Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
  1526. to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
  1527. When all the corners of the bounding box are behind one of the six frustum planes, the box is
  1528. culled. This is conservative, because some boxes may "cross corners" and can be in front of a
  1529. frustum plane, but only while also being behind another one.
  1530. ========================
  1531. */
  1532. bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, byte * outBits, bool zeroToOne ) {
  1533. assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL );
  1534. #ifdef ID_WIN_X86_SSE2_INTRIN
  1535. __m128 mvp0 = _mm_loadu_ps( mvp[0] );
  1536. __m128 mvp1 = _mm_loadu_ps( mvp[1] );
  1537. __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  1538. __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  1539. __m128 minMul = zeroToOne ? vector_float_zero : vector_float_neg_one;
  1540. __m128 b0 = _mm_loadu_bounds_0( bounds );
  1541. __m128 b1 = _mm_loadu_bounds_1( bounds );
  1542. // take the four points on the X-Y plane
  1543. __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  1544. __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  1545. __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  1546. __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  1547. __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  1548. __m128 cullBits0;
  1549. __m128 cullBits1;
  1550. __m128 cullBits2;
  1551. __m128 cullBits3;
  1552. __m128 cullBits4;
  1553. __m128 cullBits5;
  1554. // calculate the cull bits for the bounding box corners
  1555. {
  1556. // compute four partial X,Y,Z,W values
  1557. __m128 parx = _mm_splat_ps( mvp0, 3 );
  1558. __m128 pary = _mm_splat_ps( mvp1, 3 );
  1559. __m128 parz = _mm_splat_ps( mvp2, 3 );
  1560. __m128 parw = _mm_splat_ps( mvp3, 3 );
  1561. parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
  1562. pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
  1563. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  1564. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  1565. parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
  1566. pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
  1567. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  1568. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  1569. // compute full X,Y,Z,W values
  1570. __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
  1571. __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
  1572. __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  1573. __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  1574. __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );
  1575. __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
  1576. __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
  1577. __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
  1578. __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );
  1579. __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
  1580. __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
  1581. __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
  1582. __m128 maxW0 = w0;
  1583. __m128 maxW1 = w1;
  1584. __m128 minW0 = _mm_mul_ps( w0, minMul );
  1585. __m128 minW1 = _mm_mul_ps( w1, minMul );
  1586. #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
  1587. __m128 minZ0 = vector_float_zero;
  1588. __m128 minZ1 = vector_float_zero;
  1589. #else
  1590. __m128 minZ0 = minW0;
  1591. __m128 minZ1 = minW1;
  1592. #endif
  1593. cullBits0 = _mm_cmpgt_ps( x0, minW0 );
  1594. cullBits1 = _mm_cmpgt_ps( maxW0, x0 );
  1595. cullBits2 = _mm_cmpgt_ps( y0, minW0 );
  1596. cullBits3 = _mm_cmpgt_ps( maxW0, y0 );
  1597. cullBits4 = _mm_cmpgt_ps( z0, minZ0 ); // NOTE: using minZ0
  1598. cullBits5 = _mm_cmpgt_ps( maxW0, z0 );
  1599. cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) );
  1600. cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) );
  1601. cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) );
  1602. cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) );
  1603. cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) ); // NOTE: using minZ1
  1604. cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) );
  1605. }
  1606. // calculate and include the cull bits for the extruded bounding box corners
  1607. {
  1608. __m128 clipX = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 0 ), 0 );
  1609. __m128 clipY = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 1 ), 0 );
  1610. __m128 clipZ = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 2 ), 0 );
  1611. __m128 clipW = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 3 ), 0 );
  1612. __m128 extrudeX = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 0 ), 0 );
  1613. __m128 extrudeY = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 1 ), 0 );
  1614. __m128 extrudeZ = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 2 ), 0 );
  1615. __m128 closing = _mm_madd_ps( clipX, extrudeX, _mm_madd_ps( clipY, extrudeY, _mm_mul_ps( clipZ, extrudeZ ) ) );
  1616. __m128 invClosing = _mm_rcp32_ps( closing );
  1617. invClosing = _mm_xor_ps( invClosing, vector_float_sign_bit );
  1618. __m128 dt = _mm_madd_ps( clipX, vx, _mm_madd_ps( clipY, vy, clipW ) );
  1619. __m128 d0 = _mm_madd_ps( clipZ, vz0, dt );
  1620. __m128 d1 = _mm_madd_ps( clipZ, vz1, dt );
  1621. d0 = _mm_mul_ps( d0, invClosing );
  1622. d1 = _mm_mul_ps( d1, invClosing );
  1623. __m128 vx0 = _mm_madd_ps( extrudeX, d0, vx );
  1624. __m128 vx1 = _mm_madd_ps( extrudeX, d1, vx );
  1625. __m128 vy0 = _mm_madd_ps( extrudeY, d0, vy );
  1626. __m128 vy1 = _mm_madd_ps( extrudeY, d1, vy );
  1627. vz0 = _mm_madd_ps( extrudeZ, d0, vz0 );
  1628. vz1 = _mm_madd_ps( extrudeZ, d1, vz1 );
  1629. __m128 mvp0X = _mm_splat_ps( mvp0, 0 );
  1630. __m128 mvp1X = _mm_splat_ps( mvp1, 0 );
  1631. __m128 mvp2X = _mm_splat_ps( mvp2, 0 );
  1632. __m128 mvp3X = _mm_splat_ps( mvp3, 0 );
  1633. __m128 mvp0W = _mm_splat_ps( mvp0, 3 );
  1634. __m128 mvp1W = _mm_splat_ps( mvp1, 3 );
  1635. __m128 mvp2W = _mm_splat_ps( mvp2, 3 );
  1636. __m128 mvp3W = _mm_splat_ps( mvp3, 3 );
  1637. __m128 x0 = _mm_madd_ps( vx0, mvp0X, mvp0W );
  1638. __m128 y0 = _mm_madd_ps( vx0, mvp1X, mvp1W );
  1639. __m128 z0 = _mm_madd_ps( vx0, mvp2X, mvp2W );
  1640. __m128 w0 = _mm_madd_ps( vx0, mvp3X, mvp3W );
  1641. __m128 x1 = _mm_madd_ps( vx1, mvp0X, mvp0W );
  1642. __m128 y1 = _mm_madd_ps( vx1, mvp1X, mvp1W );
  1643. __m128 z1 = _mm_madd_ps( vx1, mvp2X, mvp2W );
  1644. __m128 w1 = _mm_madd_ps( vx1, mvp3X, mvp3W );
  1645. __m128 mvp0Y = _mm_splat_ps( mvp0, 1 );
  1646. __m128 mvp1Y = _mm_splat_ps( mvp1, 1 );
  1647. __m128 mvp2Y = _mm_splat_ps( mvp2, 1 );
  1648. __m128 mvp3Y = _mm_splat_ps( mvp3, 1 );
  1649. x0 = _mm_madd_ps( vy0, mvp0Y, x0 ); //-V537
  1650. y0 = _mm_madd_ps( vy0, mvp1Y, y0 );
  1651. z0 = _mm_madd_ps( vy0, mvp2Y, z0 ); //-V537
  1652. w0 = _mm_madd_ps( vy0, mvp3Y, w0 );
  1653. x1 = _mm_madd_ps( vy1, mvp0Y, x1 ); //-V537
  1654. y1 = _mm_madd_ps( vy1, mvp1Y, y1 );
  1655. z1 = _mm_madd_ps( vy1, mvp2Y, z1 ); //-V537
  1656. w1 = _mm_madd_ps( vy1, mvp3Y, w1 );
  1657. __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
  1658. __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
  1659. __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  1660. __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  1661. x0 = _mm_madd_ps( vz0, mvp0Z, x0 );
  1662. y0 = _mm_madd_ps( vz0, mvp1Z, y0 ); //-V537
  1663. z0 = _mm_madd_ps( vz0, mvp2Z, z0 );
  1664. w0 = _mm_madd_ps( vz0, mvp3Z, w0 );
  1665. x1 = _mm_madd_ps( vz1, mvp0Z, x1 );
  1666. y1 = _mm_madd_ps( vz1, mvp1Z, y1 ); //-V537
  1667. z1 = _mm_madd_ps( vz1, mvp2Z, z1 );
  1668. w1 = _mm_madd_ps( vz1, mvp3Z, w1 );
  1669. __m128 maxW0 = w0;
  1670. __m128 maxW1 = w1;
  1671. __m128 minW0 = _mm_mul_ps( w0, minMul );
  1672. __m128 minW1 = _mm_mul_ps( w1, minMul );
  1673. #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
  1674. __m128 minZ0 = vector_float_zero;
  1675. __m128 minZ1 = vector_float_zero;
  1676. #else
  1677. __m128 minZ0 = minW0;
  1678. __m128 minZ1 = minW1;
  1679. #endif
  1680. cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x0, minW0 ) );
  1681. cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW0, x0 ) );
  1682. cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y0, minW0 ) );
  1683. cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW0, y0 ) );
  1684. cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z0, minZ0 ) ); // NOTE: using minZ0
  1685. cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW0, z0 ) );
  1686. cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) );
  1687. cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) );
  1688. cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) );
  1689. cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) );
  1690. cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) ); // NOTE: using minZ1
  1691. cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) );
  1692. }
  1693. cullBits0 = _mm_and_ps( cullBits0, vector_float_mask0 );
  1694. cullBits1 = _mm_and_ps( cullBits1, vector_float_mask1 );
  1695. cullBits2 = _mm_and_ps( cullBits2, vector_float_mask2 );
  1696. cullBits3 = _mm_and_ps( cullBits3, vector_float_mask3 );
  1697. cullBits4 = _mm_and_ps( cullBits4, vector_float_mask4 );
  1698. cullBits5 = _mm_and_ps( cullBits5, vector_float_mask5 );
  1699. cullBits0 = _mm_or_ps( cullBits0, cullBits1 );
  1700. cullBits2 = _mm_or_ps( cullBits2, cullBits3 );
  1701. cullBits4 = _mm_or_ps( cullBits4, cullBits5 );
  1702. cullBits0 = _mm_or_ps( cullBits0, cullBits2 );
  1703. cullBits0 = _mm_or_ps( cullBits0, cullBits4 );
  1704. cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1705. cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 0, 1, 0, 1 ) ) );
  1706. int bits = _mm_cvtsi128_si32( (const __m128i &)cullBits0 );
  1707. *outBits = (byte)(bits ^ 63);
  1708. return ( bits != 63 );
  1709. #else
  1710. int bits = 0;
  1711. float closing = extrudeDirection * clipPlane.Normal();
  1712. float invClosing = -1.0f / closing;
  1713. idVec3 v;
  1714. for ( int x = 0; x < 2; x++ ) {
  1715. v[0] = bounds[x][0];
  1716. for ( int y = 0; y < 2; y++ ) {
  1717. v[1] = bounds[y][1];
  1718. for ( int z = 0; z < 2; z++ ) {
  1719. v[2] = bounds[z][2];
  1720. for ( int extrude = 0; extrude <= 1; extrude++ ) {
  1721. idVec3 test;
  1722. if ( extrude ) {
  1723. const float extrudeDist = clipPlane.Distance( v ) * invClosing;
  1724. test = v + extrudeDirection * extrudeDist;
  1725. } else {
  1726. test = v;
  1727. }
  1728. idVec4 c;
  1729. for ( int i = 0; i < 4; i++ ) {
  1730. c[i] = test[0] * mvp[i][0] + test[1] * mvp[i][1] + test[2] * mvp[i][2] + mvp[i][3];
  1731. }
  1732. const float minW = zeroToOne ? 0.0f : -c[3];
  1733. const float maxW = c[3];
  1734. #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
  1735. const float minZ = 0.0f;
  1736. #else
  1737. const float minZ = minW;
  1738. #endif
  1739. if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
  1740. if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
  1741. if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
  1742. if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
  1743. if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ
  1744. if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
  1745. }
  1746. }
  1747. }
  1748. }
  1749. // store out a bit set for each side where the bounds is outside the clip space
  1750. *outBits = (byte)(bits ^ 63);
  1751. // if any bits weren't set, the bounds is completely off one side of the frustum
  1752. return ( bits != 63 );
  1753. #endif
  1754. }
  1755. /*
  1756. ========================
  1757. idRenderMatrix::ProjectedBounds
  1758. Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
  1759. If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range.
  1760. The given bounding box is not clipped to the MVP so the projected bounds may not be as tight as possible.
  1761. If the given bounding box is W=0 clipped then the projected bounds will cover the full X-Y range.
  1762. Note that while projected[0][1] will be set to the minimum when the given bounding box is W=0 clipped,
  1763. projected[1][1] will still be valid and will NOT be set to the maximum when the given bounding box
  1764. is W=0 clipped.
  1765. ========================
  1766. */
  1767. void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
  1768. #ifdef ID_WIN_X86_SSE2_INTRIN
  1769. __m128 mvp0 = _mm_loadu_ps( mvp[0] );
  1770. __m128 mvp1 = _mm_loadu_ps( mvp[1] );
  1771. __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  1772. __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  1773. __m128 b0 = _mm_loadu_bounds_0( bounds );
  1774. __m128 b1 = _mm_loadu_bounds_1( bounds );
  1775. // take the four points on the X-Y plane
  1776. __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  1777. __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  1778. __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  1779. __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  1780. __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  1781. // compute four partial X,Y,Z,W values
  1782. __m128 parx = _mm_splat_ps( mvp0, 3 );
  1783. __m128 pary = _mm_splat_ps( mvp1, 3 );
  1784. __m128 parz = _mm_splat_ps( mvp2, 3 );
  1785. __m128 parw = _mm_splat_ps( mvp3, 3 );
  1786. parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
  1787. pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
  1788. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  1789. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  1790. parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
  1791. pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
  1792. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  1793. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  1794. // compute full X,Y,Z,W values
  1795. __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
  1796. __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
  1797. __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  1798. __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  1799. __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );
  1800. __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
  1801. __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
  1802. __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
  1803. __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );
  1804. __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
  1805. __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
  1806. __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
  1807. __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
  1808. __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
  1809. w0 = _mm_sel_ps( w0, vector_float_one, s0 );
  1810. w1 = _mm_sel_ps( w1, vector_float_one, s1 );
  1811. __m128 rw0 = _mm_rcp32_ps( w0 );
  1812. __m128 rw1 = _mm_rcp32_ps( w1 );
  1813. x0 = _mm_mul_ps( x0, rw0 );
  1814. y0 = _mm_mul_ps( y0, rw0 );
  1815. z0 = _mm_mul_ps( z0, rw0 );
  1816. x1 = _mm_mul_ps( x1, rw1 );
  1817. y1 = _mm_mul_ps( y1, rw1 );
  1818. z1 = _mm_mul_ps( z1, rw1 );
  1819. __m128 minX = _mm_min_ps( x0, x1 );
  1820. __m128 minY = _mm_min_ps( y0, y1 );
  1821. __m128 minZ = _mm_min_ps( z0, z1 );
  1822. __m128 maxX = _mm_max_ps( x0, x1 );
  1823. __m128 maxY = _mm_max_ps( y0, y1 );
  1824. __m128 maxZ = _mm_max_ps( z0, z1 );
  1825. minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1826. minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1827. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1828. minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1829. minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1830. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1831. maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1832. maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1833. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1834. maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1835. maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1836. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1837. s0 = _mm_or_ps( s0, s1 );
  1838. s0 = _mm_or_ps( s0, _mm_perm_ps( s0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  1839. s0 = _mm_or_ps( s0, _mm_perm_ps( s0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  1840. minX = _mm_sel_ps( minX, vector_float_neg_infinity, s0 );
  1841. minY = _mm_sel_ps( minY, vector_float_neg_infinity, s0 );
  1842. minZ = _mm_sel_ps( minZ, vector_float_neg_infinity, s0 );
  1843. maxX = _mm_sel_ps( maxX, vector_float_pos_infinity, s0 );
  1844. maxY = _mm_sel_ps( maxY, vector_float_pos_infinity, s0 );
  1845. // NOTE: maxZ is valid either way
  1846. if ( windowSpace ) {
  1847. minX = _mm_madd_ps( minX, vector_float_half, vector_float_half );
  1848. maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half );
  1849. minY = _mm_madd_ps( minY, vector_float_half, vector_float_half );
  1850. maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half );
  1851. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  1852. minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
  1853. maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
  1854. #endif
  1855. minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero );
  1856. maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero );
  1857. minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero );
  1858. maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero );
  1859. minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
  1860. maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
  1861. }
  1862. _mm_store_ss( & projected[0].x, minX );
  1863. _mm_store_ss( & projected[0].y, minY );
  1864. _mm_store_ss( & projected[0].z, minZ );
  1865. _mm_store_ss( & projected[1].x, maxX );
  1866. _mm_store_ss( & projected[1].y, maxY );
  1867. _mm_store_ss( & projected[1].z, maxZ );
  1868. #else
  1869. for ( int i = 0; i < 3; i++ ) {
  1870. projected[0][i] = RENDER_MATRIX_INFINITY;
  1871. projected[1][i] = - RENDER_MATRIX_INFINITY;
  1872. }
  1873. idVec3 v;
  1874. for ( int x = 0; x < 2; x++ ) {
  1875. v[0] = bounds[x][0];
  1876. for ( int y = 0; y < 2; y++ ) {
  1877. v[1] = bounds[y][1];
  1878. for ( int z = 0; z < 2; z++ ) {
  1879. v[2] = bounds[z][2];
  1880. float tx = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
  1881. float ty = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
  1882. float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
  1883. float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
  1884. if ( tw <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
  1885. projected[0][0] = -RENDER_MATRIX_INFINITY;
  1886. projected[0][1] = -RENDER_MATRIX_INFINITY;
  1887. projected[0][2] = -RENDER_MATRIX_INFINITY;
  1888. projected[1][0] = RENDER_MATRIX_INFINITY;
  1889. projected[1][1] = RENDER_MATRIX_INFINITY;
  1890. // NOTE: projected[1][1] is still valid
  1891. continue;
  1892. }
  1893. float rw = 1.0f / tw;
  1894. tx = tx * rw;
  1895. ty = ty * rw;
  1896. tz = tz * rw;
  1897. projected[0][0] = Min( projected[0][0], tx );
  1898. projected[0][1] = Min( projected[0][1], ty );
  1899. projected[0][2] = Min( projected[0][2], tz );
  1900. projected[1][0] = Max( projected[1][0], tx );
  1901. projected[1][1] = Max( projected[1][1], ty );
  1902. projected[1][2] = Max( projected[1][2], tz );
  1903. }
  1904. }
  1905. }
  1906. if ( windowSpace ) {
  1907. // convert to window coords
  1908. projected[0][0] = projected[0][0] * 0.5f + 0.5f;
  1909. projected[1][0] = projected[1][0] * 0.5f + 0.5f;
  1910. projected[0][1] = projected[0][1] * 0.5f + 0.5f;
  1911. projected[1][1] = projected[1][1] * 0.5f + 0.5f;
  1912. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  1913. projected[0][2] = projected[0][2] * 0.5f + 0.5f;
  1914. projected[1][2] = projected[1][2] * 0.5f + 0.5f;
  1915. #endif
  1916. // clamp to [0, 1] range
  1917. projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
  1918. projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
  1919. projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
  1920. projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
  1921. projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
  1922. projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
  1923. }
  1924. #endif
  1925. }
  1926. /*
  1927. ========================
  1928. idRenderMatrix::ProjectedNearClippedBounds
  1929. Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
  1930. If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range.
  1931. The given bounding box is first near clipped so the projected bounds do not cover the full X-Y range when
  1932. the given bounding box crosses the W=0 plane. However, the given bounding box is not clipped against the
  1933. other planes so the projected bounds are still not as tight as they could be if the given bounding box
  1934. crosses a corner. Fortunately, clipping to the near clipping planes typically provides more than 50% of
  1935. the gain between not clipping at all and fully clipping the bounding box to all planes. Only clipping to
  1936. the near clipping plane is much cheaper than clipping to all planes and can be easily implemented with
  1937. completely branchless SIMD.
  1938. ========================
  1939. */
  1940. void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
  1941. /*
  1942. 4----{E}---5
  1943. + /| /|
  1944. Z {H} {I} {F} |
  1945. - / | / {J}
  1946. 7--{G}-----6 |
  1947. | | | |
  1948. {L} 0----|-{A}-1
  1949. | / {K} / -
  1950. | {D} | {B} Y
  1951. |/ |/ +
  1952. 3---{C}----2
  1953. - X +
  1954. */
  1955. #ifdef ID_WIN_X86_SSE2_INTRIN
  1956. const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
  1957. const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
  1958. const __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  1959. const __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  1960. const __m128 b0 = _mm_loadu_bounds_0( bounds );
  1961. const __m128 b1 = _mm_loadu_bounds_1( bounds );
  1962. // take the four points on the X-Y plane
  1963. const __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  1964. const __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  1965. const __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  1966. const __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  1967. const __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  1968. // compute four partial X,Y,Z,W values
  1969. __m128 parx = _mm_splat_ps( mvp0, 3 );
  1970. __m128 pary = _mm_splat_ps( mvp1, 3 );
  1971. __m128 parz = _mm_splat_ps( mvp2, 3 );
  1972. __m128 parw = _mm_splat_ps( mvp3, 3 );
  1973. parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
  1974. pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
  1975. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  1976. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  1977. parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
  1978. pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
  1979. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  1980. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  1981. // compute full X,Y,Z,W values
  1982. const __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
  1983. const __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
  1984. const __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  1985. const __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  1986. const __m128 x_0123 = _mm_madd_ps( vz0, mvp0Z, parx );
  1987. const __m128 y_0123 = _mm_madd_ps( vz0, mvp1Z, pary );
  1988. const __m128 z_0123 = _mm_madd_ps( vz0, mvp2Z, parz );
  1989. const __m128 w_0123 = _mm_madd_ps( vz0, mvp3Z, parw );
  1990. const __m128 x_4567 = _mm_madd_ps( vz1, mvp0Z, parx );
  1991. const __m128 y_4567 = _mm_madd_ps( vz1, mvp1Z, pary );
  1992. const __m128 z_4567 = _mm_madd_ps( vz1, mvp2Z, parz );
  1993. const __m128 w_4567 = _mm_madd_ps( vz1, mvp3Z, parw );
  1994. // rotate the X,Y,Z,W values up by one
  1995. const __m128 x_1230 = _mm_perm_ps( x_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1996. const __m128 y_1230 = _mm_perm_ps( y_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1997. const __m128 z_1230 = _mm_perm_ps( z_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1998. const __m128 w_1230 = _mm_perm_ps( w_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  1999. const __m128 x_5674 = _mm_perm_ps( x_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  2000. const __m128 y_5674 = _mm_perm_ps( y_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  2001. const __m128 z_5674 = _mm_perm_ps( z_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  2002. const __m128 w_5674 = _mm_perm_ps( w_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  2003. #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1
  2004. const __m128 d_0123 = z_0123;
  2005. const __m128 d_4567 = z_4567;
  2006. const __m128 d_1230 = z_1230;
  2007. const __m128 d_5674 = z_5674;
  2008. #else
  2009. const __m128 d_0123 = _mm_add_ps( z_0123, w_0123 );
  2010. const __m128 d_4567 = _mm_add_ps( z_4567, w_4567 );
  2011. const __m128 d_1230 = _mm_add_ps( z_1230, w_1230 );
  2012. const __m128 d_5674 = _mm_add_ps( z_5674, w_5674 );
  2013. #endif
  2014. const __m128 deltaABCD = _mm_sub_ps( d_0123, d_1230 );
  2015. const __m128 deltaEFGH = _mm_sub_ps( d_4567, d_5674 );
  2016. const __m128 deltaIJKL = _mm_sub_ps( d_0123, d_4567 );
  2017. const __m128 maskABCD = _mm_cmpgt_ps( _mm_and_ps( deltaABCD, vector_float_abs_mask ), vector_float_smallest_non_denorm );
  2018. const __m128 maskEFGH = _mm_cmpgt_ps( _mm_and_ps( deltaEFGH, vector_float_abs_mask ), vector_float_smallest_non_denorm );
  2019. const __m128 maskIJKL = _mm_cmpgt_ps( _mm_and_ps( deltaIJKL, vector_float_abs_mask ), vector_float_smallest_non_denorm );
  2020. const __m128 fractionABCD = _mm_and_ps( _mm_div32_ps( d_0123, _mm_sel_ps( vector_float_one, deltaABCD, maskABCD ) ), maskABCD );
  2021. const __m128 fractionEFGH = _mm_and_ps( _mm_div32_ps( d_4567, _mm_sel_ps( vector_float_one, deltaEFGH, maskEFGH ) ), maskEFGH );
  2022. const __m128 fractionIJKL = _mm_and_ps( _mm_div32_ps( d_0123, _mm_sel_ps( vector_float_one, deltaIJKL, maskIJKL ) ), maskIJKL );
  2023. const __m128 clipABCD = _mm_and_ps( _mm_cmpgt_ps( fractionABCD, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionABCD ) );
  2024. const __m128 clipEFGH = _mm_and_ps( _mm_cmpgt_ps( fractionEFGH, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionEFGH ) );
  2025. const __m128 clipIJKL = _mm_and_ps( _mm_cmpgt_ps( fractionIJKL, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionIJKL ) );
  2026. const __m128 intersectionABCD_x = _mm_madd_ps( fractionABCD, _mm_sub_ps( x_1230, x_0123 ), x_0123 );
  2027. const __m128 intersectionABCD_y = _mm_madd_ps( fractionABCD, _mm_sub_ps( y_1230, y_0123 ), y_0123 );
  2028. const __m128 intersectionABCD_z = _mm_madd_ps( fractionABCD, _mm_sub_ps( z_1230, z_0123 ), z_0123 );
  2029. const __m128 intersectionABCD_w = _mm_madd_ps( fractionABCD, _mm_sub_ps( w_1230, w_0123 ), w_0123 );
  2030. const __m128 intersectionEFGH_x = _mm_madd_ps( fractionEFGH, _mm_sub_ps( x_5674, x_4567 ), x_4567 );
  2031. const __m128 intersectionEFGH_y = _mm_madd_ps( fractionEFGH, _mm_sub_ps( y_5674, y_4567 ), y_4567 );
  2032. const __m128 intersectionEFGH_z = _mm_madd_ps( fractionEFGH, _mm_sub_ps( z_5674, z_4567 ), z_4567 );
  2033. const __m128 intersectionEFGH_w = _mm_madd_ps( fractionEFGH, _mm_sub_ps( w_5674, w_4567 ), w_4567 );
  2034. const __m128 intersectionIJKL_x = _mm_madd_ps( fractionIJKL, _mm_sub_ps( x_4567, x_0123 ), x_0123 );
  2035. const __m128 intersectionIJKL_y = _mm_madd_ps( fractionIJKL, _mm_sub_ps( y_4567, y_0123 ), y_0123 );
  2036. const __m128 intersectionIJKL_z = _mm_madd_ps( fractionIJKL, _mm_sub_ps( z_4567, z_0123 ), z_0123 );
  2037. const __m128 intersectionIJKL_w = _mm_madd_ps( fractionIJKL, _mm_sub_ps( w_4567, w_0123 ), w_0123 );
  2038. const __m128 mask_0123 = _mm_cmpgt_ps( vector_float_zero, d_0123 );
  2039. const __m128 mask_1230 = _mm_cmpgt_ps( vector_float_zero, d_1230 );
  2040. const __m128 mask_4567 = _mm_cmpgt_ps( vector_float_zero, d_4567 );
  2041. const __m128 mask_5674 = _mm_cmpgt_ps( vector_float_zero, d_5674 );
  2042. const __m128 maskABCD_0123 = _mm_and_ps( clipABCD, mask_0123 );
  2043. const __m128 maskABCD_1230 = _mm_and_ps( clipABCD, mask_1230 );
  2044. const __m128 maskEFGH_4567 = _mm_and_ps( clipEFGH, mask_4567 );
  2045. const __m128 maskEFGH_5674 = _mm_and_ps( clipEFGH, mask_5674 );
  2046. const __m128 maskIJKL_0123 = _mm_and_ps( clipIJKL, mask_0123 );
  2047. const __m128 maskIJKL_4567 = _mm_and_ps( clipIJKL, mask_4567 );
  2048. __m128 edgeVertsABCD_x0 = _mm_sel_ps( x_0123, intersectionABCD_x, maskABCD_0123 );
  2049. __m128 edgeVertsABCD_y0 = _mm_sel_ps( y_0123, intersectionABCD_y, maskABCD_0123 );
  2050. __m128 edgeVertsABCD_z0 = _mm_sel_ps( z_0123, intersectionABCD_z, maskABCD_0123 );
  2051. __m128 edgeVertsABCD_w0 = _mm_sel_ps( w_0123, intersectionABCD_w, maskABCD_0123 );
  2052. __m128 edgeVertsABCD_x1 = _mm_sel_ps( x_1230, intersectionABCD_x, maskABCD_1230 );
  2053. __m128 edgeVertsABCD_y1 = _mm_sel_ps( y_1230, intersectionABCD_y, maskABCD_1230 );
  2054. __m128 edgeVertsABCD_z1 = _mm_sel_ps( z_1230, intersectionABCD_z, maskABCD_1230 );
  2055. __m128 edgeVertsABCD_w1 = _mm_sel_ps( w_1230, intersectionABCD_w, maskABCD_1230 );
  2056. __m128 edgeVertsEFGH_x0 = _mm_sel_ps( x_4567, intersectionEFGH_x, maskEFGH_4567 );
  2057. __m128 edgeVertsEFGH_y0 = _mm_sel_ps( y_4567, intersectionEFGH_y, maskEFGH_4567 );
  2058. __m128 edgeVertsEFGH_z0 = _mm_sel_ps( z_4567, intersectionEFGH_z, maskEFGH_4567 );
  2059. __m128 edgeVertsEFGH_w0 = _mm_sel_ps( w_4567, intersectionEFGH_w, maskEFGH_4567 );
  2060. __m128 edgeVertsEFGH_x1 = _mm_sel_ps( x_5674, intersectionEFGH_x, maskEFGH_5674 );
  2061. __m128 edgeVertsEFGH_y1 = _mm_sel_ps( y_5674, intersectionEFGH_y, maskEFGH_5674 );
  2062. __m128 edgeVertsEFGH_z1 = _mm_sel_ps( z_5674, intersectionEFGH_z, maskEFGH_5674 );
  2063. __m128 edgeVertsEFGH_w1 = _mm_sel_ps( w_5674, intersectionEFGH_w, maskEFGH_5674 );
  2064. __m128 edgeVertsIJKL_x0 = _mm_sel_ps( x_0123, intersectionIJKL_x, maskIJKL_0123 );
  2065. __m128 edgeVertsIJKL_y0 = _mm_sel_ps( y_0123, intersectionIJKL_y, maskIJKL_0123 );
  2066. __m128 edgeVertsIJKL_z0 = _mm_sel_ps( z_0123, intersectionIJKL_z, maskIJKL_0123 );
  2067. __m128 edgeVertsIJKL_w0 = _mm_sel_ps( w_0123, intersectionIJKL_w, maskIJKL_0123 );
  2068. __m128 edgeVertsIJKL_x1 = _mm_sel_ps( x_4567, intersectionIJKL_x, maskIJKL_4567 );
  2069. __m128 edgeVertsIJKL_y1 = _mm_sel_ps( y_4567, intersectionIJKL_y, maskIJKL_4567 );
  2070. __m128 edgeVertsIJKL_z1 = _mm_sel_ps( z_4567, intersectionIJKL_z, maskIJKL_4567 );
  2071. __m128 edgeVertsIJKL_w1 = _mm_sel_ps( w_4567, intersectionIJKL_w, maskIJKL_4567 );
  2072. const __m128 maskABCD_w0 = _mm_cmpgt_ps( edgeVertsABCD_w0, vector_float_smallest_non_denorm );
  2073. const __m128 maskABCD_w1 = _mm_cmpgt_ps( edgeVertsABCD_w1, vector_float_smallest_non_denorm );
  2074. const __m128 maskEFGH_w0 = _mm_cmpgt_ps( edgeVertsEFGH_w0, vector_float_smallest_non_denorm );
  2075. const __m128 maskEFGH_w1 = _mm_cmpgt_ps( edgeVertsEFGH_w1, vector_float_smallest_non_denorm );
  2076. const __m128 maskIJKL_w0 = _mm_cmpgt_ps( edgeVertsIJKL_w0, vector_float_smallest_non_denorm );
  2077. const __m128 maskIJKL_w1 = _mm_cmpgt_ps( edgeVertsIJKL_w1, vector_float_smallest_non_denorm );
  2078. edgeVertsABCD_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsABCD_w0, maskABCD_w0 ) );
  2079. edgeVertsABCD_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsABCD_w1, maskABCD_w1 ) );
  2080. edgeVertsEFGH_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsEFGH_w0, maskEFGH_w0 ) );
  2081. edgeVertsEFGH_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsEFGH_w1, maskEFGH_w1 ) );
  2082. edgeVertsIJKL_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsIJKL_w0, maskIJKL_w0 ) );
  2083. edgeVertsIJKL_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsIJKL_w1, maskIJKL_w1 ) );
  2084. edgeVertsABCD_x0 = _mm_mul_ps( edgeVertsABCD_x0, edgeVertsABCD_w0 );
  2085. edgeVertsABCD_x1 = _mm_mul_ps( edgeVertsABCD_x1, edgeVertsABCD_w1 );
  2086. edgeVertsEFGH_x0 = _mm_mul_ps( edgeVertsEFGH_x0, edgeVertsEFGH_w0 );
  2087. edgeVertsEFGH_x1 = _mm_mul_ps( edgeVertsEFGH_x1, edgeVertsEFGH_w1 );
  2088. edgeVertsIJKL_x0 = _mm_mul_ps( edgeVertsIJKL_x0, edgeVertsIJKL_w0 );
  2089. edgeVertsIJKL_x1 = _mm_mul_ps( edgeVertsIJKL_x1, edgeVertsIJKL_w1 );
  2090. edgeVertsABCD_y0 = _mm_mul_ps( edgeVertsABCD_y0, edgeVertsABCD_w0 );
  2091. edgeVertsABCD_y1 = _mm_mul_ps( edgeVertsABCD_y1, edgeVertsABCD_w1 );
  2092. edgeVertsEFGH_y0 = _mm_mul_ps( edgeVertsEFGH_y0, edgeVertsEFGH_w0 );
  2093. edgeVertsEFGH_y1 = _mm_mul_ps( edgeVertsEFGH_y1, edgeVertsEFGH_w1 );
  2094. edgeVertsIJKL_y0 = _mm_mul_ps( edgeVertsIJKL_y0, edgeVertsIJKL_w0 );
  2095. edgeVertsIJKL_y1 = _mm_mul_ps( edgeVertsIJKL_y1, edgeVertsIJKL_w1 );
  2096. edgeVertsABCD_z0 = _mm_mul_ps( edgeVertsABCD_z0, edgeVertsABCD_w0 );
  2097. edgeVertsABCD_z1 = _mm_mul_ps( edgeVertsABCD_z1, edgeVertsABCD_w1 );
  2098. edgeVertsEFGH_z0 = _mm_mul_ps( edgeVertsEFGH_z0, edgeVertsEFGH_w0 );
  2099. edgeVertsEFGH_z1 = _mm_mul_ps( edgeVertsEFGH_z1, edgeVertsEFGH_w1 );
  2100. edgeVertsIJKL_z0 = _mm_mul_ps( edgeVertsIJKL_z0, edgeVertsIJKL_w0 );
  2101. edgeVertsIJKL_z1 = _mm_mul_ps( edgeVertsIJKL_z1, edgeVertsIJKL_w1 );
  2102. const __m128 posInf = vector_float_pos_infinity;
  2103. const __m128 negInf = vector_float_neg_infinity;
  2104. const __m128 minX0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_x0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_x1, maskABCD_w1 ) );
  2105. const __m128 minX1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_x0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_x1, maskEFGH_w1 ) );
  2106. const __m128 minX2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_x0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_x1, maskIJKL_w1 ) );
  2107. const __m128 minY0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_y0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_y1, maskABCD_w1 ) );
  2108. const __m128 minY1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_y0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_y1, maskEFGH_w1 ) );
  2109. const __m128 minY2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_y0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_y1, maskIJKL_w1 ) );
  2110. const __m128 minZ0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_z0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_z1, maskABCD_w1 ) );
  2111. const __m128 minZ1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_z0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_z1, maskEFGH_w1 ) );
  2112. const __m128 minZ2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_z0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_z1, maskIJKL_w1 ) );
  2113. const __m128 maxX0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_x0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_x1, maskABCD_w1 ) );
  2114. const __m128 maxX1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_x0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_x1, maskEFGH_w1 ) );
  2115. const __m128 maxX2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_x0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_x1, maskIJKL_w1 ) );
  2116. const __m128 maxY0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_y0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_y1, maskABCD_w1 ) );
  2117. const __m128 maxY1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_y0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_y1, maskEFGH_w1 ) );
  2118. const __m128 maxY2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_y0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_y1, maskIJKL_w1 ) );
  2119. const __m128 maxZ0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_z0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_z1, maskABCD_w1 ) );
  2120. const __m128 maxZ1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_z0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_z1, maskEFGH_w1 ) );
  2121. const __m128 maxZ2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_z0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_z1, maskIJKL_w1 ) );
  2122. __m128 minX = _mm_min_ps( minX0, _mm_min_ps( minX1, minX2 ) );
  2123. __m128 minY = _mm_min_ps( minY0, _mm_min_ps( minY1, minY2 ) );
  2124. __m128 minZ = _mm_min_ps( minZ0, _mm_min_ps( minZ1, minZ2 ) );
  2125. __m128 maxX = _mm_max_ps( maxX0, _mm_max_ps( maxX1, maxX2 ) );
  2126. __m128 maxY = _mm_max_ps( maxY0, _mm_max_ps( maxY1, maxY2 ) );
  2127. __m128 maxZ = _mm_max_ps( maxZ0, _mm_max_ps( maxZ1, maxZ2 ) );
  2128. minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2129. minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2130. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2131. minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2132. minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2133. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2134. maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2135. maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2136. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2137. maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2138. maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2139. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2140. if ( windowSpace ) {
  2141. minX = _mm_madd_ps( minX, vector_float_half, vector_float_half );
  2142. maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half );
  2143. minY = _mm_madd_ps( minY, vector_float_half, vector_float_half );
  2144. maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half );
  2145. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2146. minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
  2147. maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
  2148. #endif
  2149. minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero );
  2150. maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero );
  2151. minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero );
  2152. maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero );
  2153. minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
  2154. maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
  2155. }
  2156. _mm_store_ss( & projected[0].x, minX );
  2157. _mm_store_ss( & projected[0].y, minY );
  2158. _mm_store_ss( & projected[0].z, minZ );
  2159. _mm_store_ss( & projected[1].x, maxX );
  2160. _mm_store_ss( & projected[1].y, maxY );
  2161. _mm_store_ss( & projected[1].z, maxZ );
  2162. #elif 1
  2163. {
  2164. const idVec3 points[8] = {
  2165. idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
  2166. idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
  2167. idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
  2168. idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
  2169. idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
  2170. idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
  2171. idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
  2172. idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
  2173. };
  2174. idVec4 projectedPoints[8];
  2175. for ( int i = 0; i < 8; i++ ) {
  2176. const idVec3 & v = points[i];
  2177. projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
  2178. projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
  2179. projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
  2180. projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
  2181. }
  2182. const idVec4 & p0 = projectedPoints[0];
  2183. const idVec4 & p1 = projectedPoints[1];
  2184. const idVec4 & p2 = projectedPoints[2];
  2185. const idVec4 & p3 = projectedPoints[3];
  2186. const idVec4 & p4 = projectedPoints[4];
  2187. const idVec4 & p5 = projectedPoints[5];
  2188. const idVec4 & p6 = projectedPoints[6];
  2189. const idVec4 & p7 = projectedPoints[7];
  2190. #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1
  2191. const float d0 = p0.z;
  2192. const float d1 = p1.z;
  2193. const float d2 = p2.z;
  2194. const float d3 = p3.z;
  2195. const float d4 = p4.z;
  2196. const float d5 = p5.z;
  2197. const float d6 = p6.z;
  2198. const float d7 = p7.z;
  2199. #else
  2200. const float d0 = p0.z + p0.w;
  2201. const float d1 = p1.z + p1.w;
  2202. const float d2 = p2.z + p2.w;
  2203. const float d3 = p3.z + p3.w;
  2204. const float d4 = p4.z + p4.w;
  2205. const float d5 = p5.z + p5.w;
  2206. const float d6 = p6.z + p6.w;
  2207. const float d7 = p7.z + p7.w;
  2208. #endif
  2209. const float deltaA = d0 - d1;
  2210. const float deltaB = d1 - d2;
  2211. const float deltaC = d2 - d3;
  2212. const float deltaD = d3 - d0;
  2213. const float deltaE = d4 - d5;
  2214. const float deltaF = d5 - d6;
  2215. const float deltaG = d6 - d7;
  2216. const float deltaH = d7 - d4;
  2217. const float deltaI = d0 - d4;
  2218. const float deltaJ = d1 - d5;
  2219. const float deltaK = d2 - d6;
  2220. const float deltaL = d3 - d7;
  2221. const float fractionA = ( fabs( deltaA ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaA ) : 0.0f;
  2222. const float fractionB = ( fabs( deltaB ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaB ) : 0.0f;
  2223. const float fractionC = ( fabs( deltaC ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaC ) : 0.0f;
  2224. const float fractionD = ( fabs( deltaD ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaD ) : 0.0f;
  2225. const float fractionE = ( fabs( deltaE ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d4 / deltaE ) : 0.0f;
  2226. const float fractionF = ( fabs( deltaF ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d5 / deltaF ) : 0.0f;
  2227. const float fractionG = ( fabs( deltaG ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d6 / deltaG ) : 0.0f;
  2228. const float fractionH = ( fabs( deltaH ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d7 / deltaH ) : 0.0f;
  2229. const float fractionI = ( fabs( deltaI ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaI ) : 0.0f;
  2230. const float fractionJ = ( fabs( deltaJ ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaJ ) : 0.0f;
  2231. const float fractionK = ( fabs( deltaK ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaK ) : 0.0f;
  2232. const float fractionL = ( fabs( deltaL ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaL ) : 0.0f;
  2233. const bool clipA = ( fractionA > 0.0f && fractionA < 1.0f );
  2234. const bool clipB = ( fractionB > 0.0f && fractionB < 1.0f );
  2235. const bool clipC = ( fractionC > 0.0f && fractionC < 1.0f );
  2236. const bool clipD = ( fractionD > 0.0f && fractionD < 1.0f );
  2237. const bool clipE = ( fractionE > 0.0f && fractionE < 1.0f );
  2238. const bool clipF = ( fractionF > 0.0f && fractionF < 1.0f );
  2239. const bool clipG = ( fractionG > 0.0f && fractionG < 1.0f );
  2240. const bool clipH = ( fractionH > 0.0f && fractionH < 1.0f );
  2241. const bool clipI = ( fractionI > 0.0f && fractionI < 1.0f );
  2242. const bool clipJ = ( fractionJ > 0.0f && fractionJ < 1.0f );
  2243. const bool clipK = ( fractionK > 0.0f && fractionK < 1.0f );
  2244. const bool clipL = ( fractionL > 0.0f && fractionL < 1.0f );
  2245. const idVec4 intersectionA = p0 + fractionA * ( p1 - p0 );
  2246. const idVec4 intersectionB = p1 + fractionB * ( p2 - p1 );
  2247. const idVec4 intersectionC = p2 + fractionC * ( p3 - p2 );
  2248. const idVec4 intersectionD = p3 + fractionD * ( p0 - p3 );
  2249. const idVec4 intersectionE = p4 + fractionE * ( p5 - p4 );
  2250. const idVec4 intersectionF = p5 + fractionF * ( p6 - p5 );
  2251. const idVec4 intersectionG = p6 + fractionG * ( p7 - p6 );
  2252. const idVec4 intersectionH = p7 + fractionH * ( p4 - p7 );
  2253. const idVec4 intersectionI = p0 + fractionI * ( p4 - p0 );
  2254. const idVec4 intersectionJ = p1 + fractionJ * ( p5 - p1 );
  2255. const idVec4 intersectionK = p2 + fractionK * ( p6 - p2 );
  2256. const idVec4 intersectionL = p3 + fractionL * ( p7 - p3 );
  2257. idVec4 edgeVerts[24];
  2258. edgeVerts[ 0] = ( clipA && d0 < 0.0f ) ? intersectionA : p0;
  2259. edgeVerts[ 2] = ( clipB && d1 < 0.0f ) ? intersectionB : p1;
  2260. edgeVerts[ 4] = ( clipC && d2 < 0.0f ) ? intersectionC : p2;
  2261. edgeVerts[ 6] = ( clipD && d3 < 0.0f ) ? intersectionD : p3;
  2262. edgeVerts[ 1] = ( clipA && d1 < 0.0f ) ? intersectionA : p1;
  2263. edgeVerts[ 3] = ( clipB && d2 < 0.0f ) ? intersectionB : p2;
  2264. edgeVerts[ 5] = ( clipC && d3 < 0.0f ) ? intersectionC : p3;
  2265. edgeVerts[ 7] = ( clipD && d0 < 0.0f ) ? intersectionD : p0;
  2266. edgeVerts[ 8] = ( clipE && d4 < 0.0f ) ? intersectionE : p4;
  2267. edgeVerts[10] = ( clipF && d5 < 0.0f ) ? intersectionF : p5;
  2268. edgeVerts[12] = ( clipG && d6 < 0.0f ) ? intersectionG : p6;
  2269. edgeVerts[14] = ( clipH && d7 < 0.0f ) ? intersectionH : p7;
  2270. edgeVerts[ 9] = ( clipE && d5 < 0.0f ) ? intersectionE : p5;
  2271. edgeVerts[11] = ( clipF && d6 < 0.0f ) ? intersectionF : p6;
  2272. edgeVerts[13] = ( clipG && d7 < 0.0f ) ? intersectionG : p7;
  2273. edgeVerts[15] = ( clipH && d4 < 0.0f ) ? intersectionH : p4;
  2274. edgeVerts[16] = ( clipI && d0 < 0.0f ) ? intersectionI : p0;
  2275. edgeVerts[18] = ( clipJ && d1 < 0.0f ) ? intersectionJ : p1;
  2276. edgeVerts[20] = ( clipK && d2 < 0.0f ) ? intersectionK : p2;
  2277. edgeVerts[22] = ( clipL && d3 < 0.0f ) ? intersectionL : p3;
  2278. edgeVerts[17] = ( clipI && d4 < 0.0f ) ? intersectionI : p4;
  2279. edgeVerts[19] = ( clipJ && d5 < 0.0f ) ? intersectionJ : p5;
  2280. edgeVerts[21] = ( clipK && d6 < 0.0f ) ? intersectionK : p6;
  2281. edgeVerts[23] = ( clipL && d7 < 0.0f ) ? intersectionL : p7;
  2282. idBounds projBnds;
  2283. for ( int i = 0; i < 3; i++ ) {
  2284. projBnds[0][i] = RENDER_MATRIX_INFINITY;
  2285. projBnds[1][i] = - RENDER_MATRIX_INFINITY;
  2286. }
  2287. for ( int i = 0; i < 24; i++ ) {
  2288. const idVec4 & v = edgeVerts[i];
  2289. if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
  2290. continue;
  2291. }
  2292. const float rw = 1.0f / v.w;
  2293. const float px = v.x * rw;
  2294. const float py = v.y * rw;
  2295. const float pz = v.z * rw;
  2296. projBnds[0][0] = Min( projBnds[0][0], px );
  2297. projBnds[0][1] = Min( projBnds[0][1], py );
  2298. projBnds[0][2] = Min( projBnds[0][2], pz );
  2299. projBnds[1][0] = Max( projBnds[1][0], px );
  2300. projBnds[1][1] = Max( projBnds[1][1], py );
  2301. projBnds[1][2] = Max( projBnds[1][2], pz );
  2302. }
  2303. if ( windowSpace ) {
  2304. // convert to window coords
  2305. projBnds[0][0] = projBnds[0][0] * 0.5f + 0.5f;
  2306. projBnds[1][0] = projBnds[1][0] * 0.5f + 0.5f;
  2307. projBnds[0][1] = projBnds[0][1] * 0.5f + 0.5f;
  2308. projBnds[1][1] = projBnds[1][1] * 0.5f + 0.5f;
  2309. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2310. projBnds[0][2] = projBnds[0][2] * 0.5f + 0.5f;
  2311. projBnds[1][2] = projBnds[1][2] * 0.5f + 0.5f;
  2312. #endif
  2313. // clamp to [0, 1] range
  2314. projBnds[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][0] );
  2315. projBnds[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][0] );
  2316. projBnds[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][1] );
  2317. projBnds[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][1] );
  2318. projBnds[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][2] );
  2319. projBnds[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][2] );
  2320. }
  2321. assert( projected[0].Compare( projBnds[0], 0.01f ) );
  2322. assert( projected[1].Compare( projBnds[1], 0.01f ) );
  2323. }
  2324. #else
  2325. const idVec3 points[8] = {
  2326. idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
  2327. idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
  2328. idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
  2329. idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
  2330. idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
  2331. idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
  2332. idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
  2333. idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
  2334. };
  2335. idVec4 projectedPoints[8];
  2336. for ( int i = 0; i < 8; i++ ) {
  2337. const idVec3 & v = points[i];
  2338. projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
  2339. projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
  2340. projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
  2341. projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
  2342. }
  2343. idVec4 edgeVerts[24];
  2344. for ( int i = 0; i < 3; i++ ) {
  2345. int offset0 = ( i & 1 ) * 4;
  2346. int offset1 = ( i & 1 ) * 4 + ( i & 2 ) * 2;
  2347. int offset3 = ~( i >> 1 ) & 1;
  2348. for ( int j = 0; j < 4; j++ ) {
  2349. const idVec4 p0 = projectedPoints[offset0 + ( ( j + 0 ) & 3 )];
  2350. const idVec4 p1 = projectedPoints[offset1 + ( ( j + offset3 ) & 3 )];
  2351. #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1
  2352. const float d0 = p0.z;
  2353. const float d1 = p1.z;
  2354. #else
  2355. const float d0 = p0.z + p0.w;
  2356. const float d1 = p1.z + p1.w;
  2357. #endif
  2358. const float delta = d0 - d1;
  2359. const float fraction = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f;
  2360. const bool clip = ( fraction > 0.0f && fraction < 1.0f );
  2361. const idVec4 intersection = p0 + fraction * ( p1 - p0 );
  2362. edgeVerts[i * 8 + j * 2 + 0] = ( clip && d0 < 0.0f ) ? intersection : p0;
  2363. edgeVerts[i * 8 + j * 2 + 1] = ( clip && d1 < 0.0f ) ? intersection : p1;
  2364. }
  2365. }
  2366. for ( int i = 0; i < 3; i++ ) {
  2367. projected[0][i] = RENDER_MATRIX_INFINITY;
  2368. projected[1][i] = - RENDER_MATRIX_INFINITY;
  2369. }
  2370. for ( int i = 0; i < 24; i++ ) {
  2371. const idVec4 & v = edgeVerts[i];
  2372. if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
  2373. continue;
  2374. }
  2375. const float rw = 1.0f / v.w;
  2376. const float px = v.x * rw;
  2377. const float py = v.y * rw;
  2378. const float pz = v.z * rw;
  2379. projected[0][0] = Min( projected[0][0], px );
  2380. projected[0][1] = Min( projected[0][1], py );
  2381. projected[0][2] = Min( projected[0][2], pz );
  2382. projected[1][0] = Max( projected[1][0], px );
  2383. projected[1][1] = Max( projected[1][1], py );
  2384. projected[1][2] = Max( projected[1][2], pz );
  2385. }
  2386. if ( windowSpace ) {
  2387. // convert to window coords
  2388. projected[0][0] = projected[0][0] * 0.5f + 0.5f;
  2389. projected[1][0] = projected[1][0] * 0.5f + 0.5f;
  2390. projected[0][1] = projected[0][1] * 0.5f + 0.5f;
  2391. projected[1][1] = projected[1][1] * 0.5f + 0.5f;
  2392. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2393. projected[0][2] = projected[0][2] * 0.5f + 0.5f;
  2394. projected[1][2] = projected[1][2] * 0.5f + 0.5f;
  2395. #endif
  2396. // clamp to [0, 1] range
  2397. projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
  2398. projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
  2399. projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
  2400. projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
  2401. projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
  2402. projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
  2403. }
  2404. #endif
  2405. }
  2406. #if 0
  2407. /*
  2408. ========================
  2409. LocalViewOriginFromMVP
  2410. ========================
  2411. */
  2412. static idVec3 LocalViewOriginFromMVP( const idRenderMatrix & mvp ) {
  2413. const float nearX = mvp[3][0] + mvp[2][0];
  2414. const float nearY = mvp[3][1] + mvp[2][1];
  2415. const float nearZ = mvp[3][2] + mvp[2][2];
  2416. const float s = idMath::InvSqrt( nearX * nearX + nearY * nearY + nearZ * nearZ );
  2417. idRenderMatrix inverseMVP;
  2418. idRenderMatrix::Inverse( mvp, inverseMVP );
  2419. const float invW = 1.0f / inverseMVP[3][3];
  2420. const float x = ( inverseMVP[0][3] - nearX * s ) * invW;
  2421. const float y = ( inverseMVP[1][3] - nearY * s ) * invW;
  2422. const float z = ( inverseMVP[2][3] - nearZ * s ) * invW;
  2423. return idVec3( x, y, z );
  2424. }
  2425. #endif
  2426. /*
  2427. ========================
  2428. LocalNearClipCenterFromMVP
  2429. Based on whether the depth range is [0,1] or [-1,1], either transform (0,0,0) or (0,0,-1) with the inverse MVP.
  2430. ========================
  2431. */
  2432. static idVec3 LocalNearClipCenterFromMVP( const idRenderMatrix & mvp ) {
  2433. idRenderMatrix inverseMVP;
  2434. idRenderMatrix::Inverse( mvp, inverseMVP );
  2435. #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1
  2436. const float x = inverseMVP[0][3];
  2437. const float y = inverseMVP[1][3];
  2438. const float z = inverseMVP[2][3];
  2439. const float w = inverseMVP[3][3];
  2440. #else
  2441. const float x = inverseMVP[0][3] - inverseMVP[0][2];
  2442. const float y = inverseMVP[1][3] - inverseMVP[1][2];
  2443. const float z = inverseMVP[2][3] - inverseMVP[2][2];
  2444. const float w = inverseMVP[3][3] - inverseMVP[3][2];
  2445. #endif
  2446. const float invW = 1.0f / w;
  2447. return idVec3( x * invW, y * invW, z * invW );
  2448. }
  2449. #ifdef ID_WIN_X86_SSE2_INTRIN
  2450. /*
  2451. ========================
  2452. ClipHomogeneousPolygonToSide_SSE2
  2453. Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset.
  Reads 'numPoints' points from 'points', writes the clipped polygon to 'newPoints' and stores the
  new point count back into 'numPoints'. A point is kept when offset * w > sign * point[axis].
  The points are processed in groups of four; indices past the end of the polygon wrap to point 0
  so that the last edge closes the polygon.
  'points' and 'newPoints' must be different buffers (asserted) and both are accessed with aligned
  16-byte loads / stores. Entries that are not kept are still stored, at the slot of the next kept
  entry, so 'newPoints' can be written one slot past the returned point count.
  2454. ========================
  2455. */
  2456. static void ClipHomogeneousPolygonToSide_SSE2( idVec4 * __restrict newPoints, idVec4 * __restrict points, int & numPoints,
  2457. const int axis, const __m128 & sign, const __m128 & offset ) {
  2458. assert( newPoints != points );
  2459. const __m128 side = _mm_mul_ps( sign, offset );
  // lane k = k - numPoints (assuming vector_int_0123 holds 0,1,2,3), i.e. negative only for lanes that map to an existing point
  2460. __m128i mask = _mm_sub_epi32( vector_int_0123, _mm_shuffle_epi32( _mm_cvtsi32_si128( numPoints ), 0 ) );
  // running output slot counter, kept in 16-bit lanes
  2461. __m128i index = _mm_setzero_si128();
  // output slots, interleaved as point 0, edge 0, point 1, edge 1, ...
  2462. ALIGNTYPE16 unsigned short indices[16 * 2];
  // interpolation fraction along the edge that starts at each point
  2463. ALIGNTYPE16 float clipFractions[16];
  2464. int localNumPoint = numPoints;
  // first pass: classify the points, assign output slots and calculate the edge fractions
  2465. for ( int i = 0; i < localNumPoint; i += 4 ) {
  // indices of the four points of this group plus the next one; an index >= localNumPoint becomes 0
  // (relies on >> of a negative int being an arithmetic shift)
  2466. const int i0 = ( i + 0 ) & ( ( i + 0 - localNumPoint ) >> 31 );
  2467. const int i1 = ( i + 1 ) & ( ( i + 1 - localNumPoint ) >> 31 );
  2468. const int i2 = ( i + 2 ) & ( ( i + 2 - localNumPoint ) >> 31 );
  2469. const int i3 = ( i + 3 ) & ( ( i + 3 - localNumPoint ) >> 31 );
  2470. const int i4 = ( i + 4 ) & ( ( i + 4 - localNumPoint ) >> 31 );
  // gather the 'axis' component of the five points
  2471. const __m128 p0A = _mm_load_ss( &points[i0][axis] );
  2472. const __m128 p1A = _mm_load_ss( &points[i1][axis] );
  2473. const __m128 p2A = _mm_load_ss( &points[i2][axis] );
  2474. const __m128 p3A = _mm_load_ss( &points[i3][axis] );
  2475. const __m128 p4A = _mm_load_ss( &points[i4][axis] );
  // gather the W component of the five points
  2476. const __m128 p0W = _mm_load_ss( &points[i0][3] );
  2477. const __m128 p1W = _mm_load_ss( &points[i1][3] );
  2478. const __m128 p2W = _mm_load_ss( &points[i2][3] );
  2479. const __m128 p3W = _mm_load_ss( &points[i3][3] );
  2480. const __m128 p4W = _mm_load_ss( &points[i4][3] );
  // pa0 = axis component of points 0,1,2,3
  2481. const __m128 t0 = _mm_unpacklo_ps( p0A, p2A );
  2482. const __m128 t1 = _mm_unpacklo_ps( p1A, p3A );
  2483. const __m128 pa0 = _mm_unpacklo_ps( t0, t1 );
  // NOTE(review): _mm_sld_ps is a project macro not visible here; presumably yields points 1,2,3,4 (the edge end points) - confirm
  2484. const __m128 pa1 = _mm_sld_ps( pa0, p4A, 4 );
  // same for W
  2485. const __m128 r0 = _mm_unpacklo_ps( p0W, p2W );
  2486. const __m128 r1 = _mm_unpacklo_ps( p1W, p3W );
  2487. const __m128 pw0 = _mm_unpacklo_ps( r0, r1 );
  2488. const __m128 pw1 = _mm_sld_ps( pw0, p4W, 4 );
  2489. {
  // all bits set where offset * w > sign * p[axis], i.e. where the edge start (bside0) / edge end (bside1) is kept
  2490. const __m128 bside0 = _mm_cmpgt_ps( _mm_mul_ps( offset, pw0 ), _mm_mul_ps( sign, pa0 ) );
  2491. const __m128 bside1 = _mm_cmpgt_ps( _mm_mul_ps( offset, pw1 ), _mm_mul_ps( sign, pa1 ) );
  // reduce the compare masks to 0 / 1 (assuming vector_int_1 holds 1 in every lane)
  2492. const __m128i side0 = _mm_and_si128( __m128c( bside0 ), vector_int_1 );
  2493. const __m128i side1 = _mm_and_si128( __m128c( bside1 ), vector_int_1 );
  // 1 where the two end points of an edge are on different sides, i.e. the edge crosses the plane
  2494. const __m128i xorSide = _mm_xor_si128( side0, side1 );
  // eight 16-bit flags in output order: keep point 0, keep edge 0, keep point 1, keep edge 1, ...
  2495. const __m128i interleavedSide0 = _mm_unpacklo_epi32( side0, xorSide );
  2496. const __m128i interleavedSide1 = _mm_unpackhi_epi32( side0, xorSide );
  2497. const __m128i packedSide = _mm_packs_epi32( interleavedSide0, interleavedSide1 );
  // clear the flags of the lanes that are past the end of the polygon
  2498. const __m128i packedMaskedSide = _mm_and_si128( packedSide, _mm_srai_epi32( mask, 31 ) );
  // exclusive prefix sum: every 16-bit lane ends up with the running counter plus the flags of all lanes before it
  2499. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 2 ) );
  2500. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 4 ) );
  2501. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 6 ) );
  2502. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 8 ) );
  2503. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 10 ) );
  2504. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 12 ) );
  2505. index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 14 ) );
  // these are the output slots of the eight candidates of this group
  2506. _mm_store_si128( (__m128i *)&indices[i * 2], index );
  // move the lane mask on to the next group of points (assuming vector_int_4 holds 4 in every lane)
  2507. mask = _mm_add_epi32( mask, vector_int_4 );
  // add each lane's own flag and broadcast the last 16-bit lane: every lane now holds the number of entries emitted so far
  2508. index = _mm_add_epi16( index, packedMaskedSide );
  2509. index = _mm_shufflehi_epi16( index, _MM_SHUFFLE( 3, 3, 3, 3 ) );
  2510. index = _mm_shuffle_epi32( index, _MM_SHUFFLE( 3, 3, 3, 3 ) );
  2511. }
  2512. {
  // clip distances of the edge start (d0) and edge end (d1)
  // NOTE(review): _mm_nmsub_ps is a project macro not visible here; presumably pa - pw * side - confirm
  2513. const __m128 d0 = _mm_nmsub_ps( pw0, side, pa0 );
  2514. const __m128 d1 = _mm_nmsub_ps( pw1, side, pa1 );
  2515. const __m128 delta = _mm_sub_ps( d0, d1 );
  2516. const __m128 deltaAbs = _mm_and_ps( delta, vector_float_abs_mask );
  // set where | delta | is below the smallest non-denormal float
  2517. const __m128 clamp = _mm_cmpgt_ps( vector_float_smallest_non_denorm, deltaAbs );
  // presumably _mm_sel_ps picks the second operand where 'clamp' is set, so such edges divide by one and get a fraction of one
  2518. const __m128 deltaClamped = _mm_sel_ps( delta, vector_float_one, clamp );
  2519. const __m128 fraction = _mm_mul_ps( d0, _mm_rcp32_ps( deltaClamped ) );
  2520. const __m128 fractionClamped0 = _mm_sel_ps( fraction, vector_float_one, clamp );
  // clamp the fraction to the [0, 1] range
  2521. const __m128 fractionClamped1 = _mm_max_ps( fractionClamped0, vector_float_zero );
  2522. const __m128 fractionClamped2 = _mm_min_ps( fractionClamped1, vector_float_one );
  2523. _mm_store_ps( &clipFractions[i], fractionClamped2 );
  2524. }
  2525. }
  // the low 16 bits of the counter hold the total number of emitted points
  2526. numPoints = _mm_cvtsi128_si32( index ) & 0xFFFF;
  // second pass: build all candidates and scatter them to their output slots
  2527. for ( int i = 0; i < localNumPoint; i += 4 ) {
  2528. const int i0 = ( i + 0 ) & ( ( i + 0 - localNumPoint ) >> 31 );
  2529. const int i1 = ( i + 1 ) & ( ( i + 1 - localNumPoint ) >> 31 );
  2530. const int i2 = ( i + 2 ) & ( ( i + 2 - localNumPoint ) >> 31 );
  2531. const int i3 = ( i + 3 ) & ( ( i + 3 - localNumPoint ) >> 31 );
  2532. const int i4 = ( i + 4 ) & ( ( i + 4 - localNumPoint ) >> 31 );
  2533. const __m128 p0 = _mm_load_ps( points[i0].ToFloatPtr() );
  2534. const __m128 p1 = _mm_load_ps( points[i1].ToFloatPtr() );
  2535. const __m128 p2 = _mm_load_ps( points[i2].ToFloatPtr() );
  2536. const __m128 p3 = _mm_load_ps( points[i3].ToFloatPtr() );
  2537. const __m128 p4 = _mm_load_ps( points[i4].ToFloatPtr() );
  2538. const __m128 fraction = _mm_load_ps( &clipFractions[i] );
  // point on each edge at its clip fraction (presumably _mm_madd_ps( a, b, c ) is a * b + c, giving start + fraction * ( end - start ))
  2539. const __m128 c0 = _mm_madd_ps( _mm_splat_ps( fraction, 0 ), _mm_sub_ps( p1, p0 ), p0 );
  2540. const __m128 c1 = _mm_madd_ps( _mm_splat_ps( fraction, 1 ), _mm_sub_ps( p2, p1 ), p1 );
  2541. const __m128 c2 = _mm_madd_ps( _mm_splat_ps( fraction, 2 ), _mm_sub_ps( p3, p2 ), p2 );
  2542. const __m128 c3 = _mm_madd_ps( _mm_splat_ps( fraction, 3 ), _mm_sub_ps( p4, p3 ), p3 );
  // store every candidate in order; a candidate that is not kept has the same slot as the next kept one
  // and is overwritten by that later store (or it lands just past the end of the output)
  2543. _mm_store_ps( newPoints[indices[i * 2 + 0]].ToFloatPtr(), p0 );
  2544. _mm_store_ps( newPoints[indices[i * 2 + 1]].ToFloatPtr(), c0 );
  2545. _mm_store_ps( newPoints[indices[i * 2 + 2]].ToFloatPtr(), p1 );
  2546. _mm_store_ps( newPoints[indices[i * 2 + 3]].ToFloatPtr(), c1 );
  2547. _mm_store_ps( newPoints[indices[i * 2 + 4]].ToFloatPtr(), p2 );
  2548. _mm_store_ps( newPoints[indices[i * 2 + 5]].ToFloatPtr(), c2 );
  2549. _mm_store_ps( newPoints[indices[i * 2 + 6]].ToFloatPtr(), p3 );
  2550. _mm_store_ps( newPoints[indices[i * 2 + 7]].ToFloatPtr(), c3 );
  2551. }
  2552. }
  2553. /*
  2554. ========================
  2555. ClipHomogeneousPolygonToUnitCube
  2556. Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes.
  2557. ========================
  2558. */
  2559. static int ClipHomogeneousPolygonToUnitCube_SSE2( idVec4 * points, int numPoints ) {
  2560. assert( numPoints < 16 - 6 );
  2561. ALIGNTYPE16 idVec4 newPoints[16 * 2];
  2562. #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1
  2563. ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 2, vector_float_neg_one, vector_float_zero ); // near
  2564. #else
  2565. ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 2, vector_float_neg_one, vector_float_one ); // near
  2566. #endif
  2567. ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 2, vector_float_pos_one, vector_float_one ); // far
  2568. ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 1, vector_float_neg_one, vector_float_one ); // bottom
  2569. ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 1, vector_float_pos_one, vector_float_one ); // top
  2570. ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 0, vector_float_neg_one, vector_float_one ); // left
  2571. ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 0, vector_float_pos_one, vector_float_one ); // right
  2572. return numPoints;
  2573. }
  2574. #else
  2575. /*
  2576. ========================
  2577. ClipHomogeneousLineToSide
  2578. Clips a line with homogeneous coordinates to the axis aligned plane[axis] = side.
  2579. ========================
  2580. */
  2581. static idVec4 ClipHomogeneousLineToSide( const idVec4 & p0, const idVec4 & p1, int axis, float side ) {
  2582. const float d0 = p0.w * side - p0[axis];
  2583. const float d1 = p1.w * side - p1[axis];
  2584. const float delta = d0 - d1;
  2585. const float f = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f;
  2586. const float c = idMath::ClampFloat( 0.0f, 1.0f, f );
  2587. return p0 + c * ( p1 - p0 );
  2588. }
  2589. /*
  2590. ========================
  2591. ClipHomogeneousPolygonToSide
  2592. Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset.
  2593. ========================
  2594. */
  2595. static int ClipHomogeneousPolygonToSide_Generic( idVec4 * __restrict newPoints, idVec4 * __restrict points, int numPoints, int axis, float sign, float offset ) {
  2596. assert( newPoints != points );
  2597. assert( numPoints < 16 );
  2598. int sides[16];
  2599. const float side = sign * offset;
  2600. // calculate the plane side for each original point and calculate all potential new points
  2601. for ( int i = 0; i < numPoints; i++ ) {
  2602. int j = ( i + 1 ) & ( ( i + 1 - numPoints ) >> 31 );
  2603. sides[i] = sign * points[i][axis] < offset * points[i].w;
  2604. newPoints[i * 2 + 0] = points[i];
  2605. newPoints[i * 2 + 1] = ClipHomogeneousLineToSide( points[i], points[j], axis, side );
  2606. };
  2607. // repeat the first side at the end to avoid having to wrap around
  2608. sides[numPoints] = sides[0];
  2609. // compact the array of points
  2610. int numNewPoints = 0;
  2611. for ( int i = 0; i < numPoints; i++ ) {
  2612. if ( sides[i + 0] != 0 ) {
  2613. newPoints[numNewPoints++] = newPoints[i * 2 + 0];
  2614. }
  2615. if ( ( sides[i + 0] ^ sides[i + 1] ) != 0 ) {
  2616. newPoints[numNewPoints++] = newPoints[i * 2 + 1];
  2617. }
  2618. }
  2619. assert( numNewPoints <= 16 );
  2620. return numNewPoints;
  2621. }
  2622. /*
  2623. ========================
  2624. ClipHomogeneousPolygonToUnitCube
  2625. Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes.
  2626. ========================
  2627. */
  2628. static int ClipHomogeneousPolygonToUnitCube_Generic( idVec4 * points, int numPoints ) {
  2629. assert( numPoints < 16 - 6 );
  2630. ALIGNTYPE16 idVec4 newPoints[2 * 16]; // the C clip code temporarily doubles the points
  2631. #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1
  2632. numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 0.0f ); // near
  2633. #else
  2634. numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 1.0f ); // near
  2635. #endif
  2636. numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 2, +1.0f, 1.0f ); // far
  2637. numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 1, -1.0f, 1.0f ); // bottom
  2638. numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 1, +1.0f, 1.0f ); // top
  2639. numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 0, -1.0f, 1.0f ); // left
  2640. numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 0, +1.0f, 1.0f ); // right
  2641. return numPoints;
  2642. }
  2643. #endif
  2644. /*
  2645. ========================
  2646. idRenderMatrix::ProjectedFullyClippedBounds
  2647. Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
  2648. If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range.
  2649. The given bounding box is first fully clipped to the MVP to get the smallest projected bounds.
  2650. Note that this code assumes the MVP matrix has an infinite far clipping plane. When the far plane is at
  2651. infinity the bounds are never far clipped and it is sufficient to test whether or not the center of the
  2652. near clip plane is inside the bounds to calculate the correct minimum Z. If the far plane is not at
  2653. infinity then this code would also have to test for the view frustum being completely contained inside
  2654. the given bounds in which case the projected bounds should be set to fully cover the view frustum.
  If no point survives the clipping then the minimum stays above the maximum on the X and Y axes
  (unless 'windowSpace' clamps both to the [0, 1] range).
  2655. ========================
  2656. */
  2657. void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
  2658. #ifdef ID_WIN_X86_SSE2_INTRIN
  // load the four rows of the MVP and transpose them so that each register holds one column
  2659. const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
  2660. const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
  2661. const __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  2662. const __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  2663. const __m128 t0 = _mm_unpacklo_ps( mvp0, mvp2 ); // mvp[0][0], mvp[2][0], mvp[0][1], mvp[2][1]
  2664. const __m128 t1 = _mm_unpackhi_ps( mvp0, mvp2 ); // mvp[0][2], mvp[2][2], mvp[0][3], mvp[2][3]
  2665. const __m128 t2 = _mm_unpacklo_ps( mvp1, mvp3 ); // mvp[1][0], mvp[3][0], mvp[1][1], mvp[3][1]
  2666. const __m128 t3 = _mm_unpackhi_ps( mvp1, mvp3 ); // mvp[1][2], mvp[3][2], mvp[1][3], mvp[3][3]
  2667. const __m128 mvpX = _mm_unpacklo_ps( t0, t2 ); // mvp[0][0], mvp[1][0], mvp[2][0], mvp[3][0]
  2668. const __m128 mvpY = _mm_unpackhi_ps( t0, t2 ); // mvp[0][1], mvp[1][1], mvp[2][1], mvp[3][1]
  2669. const __m128 mvpZ = _mm_unpacklo_ps( t1, t3 ); // mvp[0][2], mvp[1][2], mvp[2][2], mvp[3][2]
  2670. const __m128 mvpW = _mm_unpackhi_ps( t1, t3 ); // mvp[0][3], mvp[1][3], mvp[2][3], mvp[3][3]
  // splat the individual components of the two bounds corners
  2671. const __m128 b0 = _mm_loadu_bounds_0( bounds );
  2672. const __m128 b1 = _mm_loadu_bounds_1( bounds );
  2673. const __m128 b0X = _mm_splat_ps( b0, 0 );
  2674. const __m128 b0Y = _mm_splat_ps( b0, 1 );
  2675. const __m128 b0Z = _mm_splat_ps( b0, 2 );
  2676. const __m128 b1X = _mm_splat_ps( b1, 0 );
  2677. const __m128 b1Y = _mm_splat_ps( b1, 1 );
  2678. const __m128 b1Z = _mm_splat_ps( b1, 2 );
  // transform the eight box corners into homogeneous clip space, in the same corner order as the generic path below
  2679. const __m128 p0 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  2680. const __m128 p1 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  2681. const __m128 p2 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  2682. const __m128 p3 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  2683. const __m128 p4 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  2684. const __m128 p5 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  2685. const __m128 p6 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  2686. const __m128 p7 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  2687. ALIGNTYPE16 idVec4 projectedPoints[8];
  2688. _mm_store_ps( projectedPoints[0].ToFloatPtr(), p0 );
  2689. _mm_store_ps( projectedPoints[1].ToFloatPtr(), p1 );
  2690. _mm_store_ps( projectedPoints[2].ToFloatPtr(), p2 );
  2691. _mm_store_ps( projectedPoints[3].ToFloatPtr(), p3 );
  2692. _mm_store_ps( projectedPoints[4].ToFloatPtr(), p4 );
  2693. _mm_store_ps( projectedPoints[5].ToFloatPtr(), p5 );
  2694. _mm_store_ps( projectedPoints[6].ToFloatPtr(), p6 );
  2695. _mm_store_ps( projectedPoints[7].ToFloatPtr(), p7 );
  // clip each of the six box polygons (four corners taken from boxPolygonVertices) to the unit cube
  // and append the points that remain to clippedPoints
  2696. ALIGNTYPE16 idVec4 clippedPoints[6 * 16];
  2697. int numClippedPoints = 0;
  2698. for ( int i = 0; i < 6; i++ ) {
  2699. _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][0]].ToFloatPtr() ) );
  2700. _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][1]].ToFloatPtr() ) );
  2701. _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][2]].ToFloatPtr() ) );
  2702. _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][3]].ToFloatPtr() ) );
  2703. numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
  2704. }
  2705. // repeat the first clipped point at the end to get a multiple of 4 clipped points
  2706. const __m128 point0 = _mm_load_ps( clippedPoints[0].ToFloatPtr() );
  2707. for ( int i = numClippedPoints; ( i & 3 ) != 0; i++ ) {
  2708. _mm_store_ps( clippedPoints[i].ToFloatPtr(), point0 );
  2709. }
  2710. // test if the center of the near clip plane is inside the given bounding box
  2711. const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
  2712. const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter );
  // start with inverted bounds; the minimum Z starts at -1 when the near clip plane center is inside the box
  // NOTE(review): -1 is used even when CLIP_SPACE_D3D puts the near plane at Z=0; only the 'windowSpace' clamp maps it to 0 - confirm this is intended
  2713. __m128 minX = vector_float_pos_infinity;
  2714. __m128 minY = vector_float_pos_infinity;
  2715. __m128 minZ = inside ? vector_float_neg_one : vector_float_pos_infinity;
  2716. __m128 maxX = vector_float_neg_infinity;
  2717. __m128 maxY = vector_float_neg_infinity;
  2718. __m128 maxZ = vector_float_neg_infinity;
  // process four clipped points per iteration
  2719. for ( int i = 0; i < numClippedPoints; i += 4 ) {
  2720. const __m128 cp0 = _mm_load_ps( clippedPoints[i + 0].ToFloatPtr() );
  2721. const __m128 cp1 = _mm_load_ps( clippedPoints[i + 1].ToFloatPtr() );
  2722. const __m128 cp2 = _mm_load_ps( clippedPoints[i + 2].ToFloatPtr() );
  2723. const __m128 cp3 = _mm_load_ps( clippedPoints[i + 3].ToFloatPtr() );
  2724. const __m128 s0 = _mm_unpacklo_ps( cp0, cp2 ); // cp0[0], cp2[0], cp0[1], cp2[1]
  2725. const __m128 s1 = _mm_unpackhi_ps( cp0, cp2 ); // cp0[2], cp2[2], cp0[3], cp2[3]
  2726. const __m128 s2 = _mm_unpacklo_ps( cp1, cp3 ); // cp1[0], cp3[0], cp1[1], cp3[1]
  2727. const __m128 s3 = _mm_unpackhi_ps( cp1, cp3 ); // cp1[2], cp3[2], cp1[3], cp3[3]
  2728. const __m128 cpX = _mm_unpacklo_ps( s0, s2 ); // cp0[0], cp1[0], cp2[0], cp3[0]
  2729. const __m128 cpY = _mm_unpackhi_ps( s0, s2 ); // cp0[1], cp1[1], cp2[1], cp3[1]
  2730. const __m128 cpZ = _mm_unpacklo_ps( s1, s3 ); // cp0[2], cp1[2], cp2[2], cp3[2]
  2731. const __m128 cpW = _mm_unpackhi_ps( s1, s3 ); // cp0[3], cp1[3], cp2[3], cp3[3]
  // perspective divide; W is not tested here (the generic path asserts that it is positive)
  2732. const __m128 rW = _mm_rcp32_ps( cpW );
  2733. const __m128 rX = _mm_mul_ps( cpX, rW );
  2734. const __m128 rY = _mm_mul_ps( cpY, rW );
  2735. const __m128 rZ = _mm_mul_ps( cpZ, rW );
  2736. minX = _mm_min_ps( minX, rX );
  2737. minY = _mm_min_ps( minY, rY );
  2738. minZ = _mm_min_ps( minZ, rZ );
  2739. maxX = _mm_max_ps( maxX, rX );
  2740. maxY = _mm_max_ps( maxY, rY );
  2741. maxZ = _mm_max_ps( maxZ, rZ );
  2742. }
  // reduce the four lanes of every register to a single minimum / maximum
  2743. minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2744. minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2745. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2746. minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2747. minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2748. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2749. maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2750. maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2751. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2752. maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2753. maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2754. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2755. if ( windowSpace ) {
  // convert to window coords: value * 0.5 + 0.5
  2756. minX = _mm_madd_ps( minX, vector_float_half, vector_float_half );
  2757. maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half );
  2758. minY = _mm_madd_ps( minY, vector_float_half, vector_float_half );
  2759. maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half );
  2760. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2761. minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
  2762. maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
  2763. #endif
  // clamp to [0, 1] range
  2764. minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero );
  2765. maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero );
  2766. minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero );
  2767. maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero );
  2768. minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
  2769. maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
  2770. }
  // write the lowest lane of each register to the output bounds
  2771. _mm_store_ss( & projected[0].x, minX );
  2772. _mm_store_ss( & projected[0].y, minY );
  2773. _mm_store_ss( & projected[0].z, minZ );
  2774. _mm_store_ss( & projected[1].x, maxX );
  2775. _mm_store_ss( & projected[1].y, maxY );
  2776. _mm_store_ss( & projected[1].z, maxZ );
  2777. #else
  // the eight corners of the bounding box
  2778. const idVec3 points[8] = {
  2779. idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
  2780. idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
  2781. idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
  2782. idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
  2783. idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
  2784. idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
  2785. idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
  2786. idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
  2787. };
  // transform the corners into homogeneous clip space
  2788. idVec4 projectedPoints[8];
  2789. for ( int i = 0; i < 8; i++ ) {
  2790. const idVec3 & v = points[i];
  2791. projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
  2792. projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
  2793. projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
  2794. projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
  2795. }
  // clip each of the six box polygons to the unit cube and append the points that remain
  2796. idVec4 clippedPoints[6 * 16];
  2797. int numClippedPoints = 0;
  2798. for ( int i = 0; i < 6; i++ ) {
  2799. clippedPoints[numClippedPoints + 0] = projectedPoints[boxPolygonVertices[i][0]];
  2800. clippedPoints[numClippedPoints + 1] = projectedPoints[boxPolygonVertices[i][1]];
  2801. clippedPoints[numClippedPoints + 2] = projectedPoints[boxPolygonVertices[i][2]];
  2802. clippedPoints[numClippedPoints + 3] = projectedPoints[boxPolygonVertices[i][3]];
  2803. numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
  2804. }
  2805. // test if the center of the near clip plane is inside the given bounding box
  2806. const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
  2807. const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter );
  // start with inverted bounds
  2808. for ( int i = 0; i < 3; i++ ) {
  2809. projected[0][i] = RENDER_MATRIX_INFINITY;
  2810. projected[1][i] = - RENDER_MATRIX_INFINITY;
  2811. }
  // the minimum Z starts at -1 when the near clip plane center is inside the box
  // NOTE(review): -1 is used even when CLIP_SPACE_D3D puts the near plane at Z=0 - confirm this is intended
  2812. if ( inside ) {
  2813. projected[0][2] = -1.0f;
  2814. }
  // perspective divide every clipped point and grow the bounds
  2815. for ( int i = 0; i < numClippedPoints; i++ ) {
  2816. const idVec4 & c = clippedPoints[i];
  2817. assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL );
  2818. const float rw = 1.0f / c.w;
  2819. const float px = c.x * rw;
  2820. const float py = c.y * rw;
  2821. const float pz = c.z * rw;
  2822. projected[0][0] = Min( projected[0][0], px );
  2823. projected[0][1] = Min( projected[0][1], py );
  2824. projected[0][2] = Min( projected[0][2], pz );
  2825. projected[1][0] = Max( projected[1][0], px );
  2826. projected[1][1] = Max( projected[1][1], py );
  2827. projected[1][2] = Max( projected[1][2], pz );
  2828. }
  2829. if ( windowSpace ) {
  2830. // convert to window coords
  2831. projected[0][0] = projected[0][0] * 0.5f + 0.5f;
  2832. projected[1][0] = projected[1][0] * 0.5f + 0.5f;
  2833. projected[0][1] = projected[0][1] * 0.5f + 0.5f;
  2834. projected[1][1] = projected[1][1] * 0.5f + 0.5f;
  2835. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2836. projected[0][2] = projected[0][2] * 0.5f + 0.5f;
  2837. projected[1][2] = projected[1][2] * 0.5f + 0.5f;
  2838. #endif
  2839. // clamp to [0, 1] range
  2840. projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
  2841. projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
  2842. projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
  2843. projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
  2844. projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
  2845. projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
  2846. }
  2847. #endif
  2848. }
  2849. /*
  2850. ========================
  2851. idRenderMatrix::DepthBoundsForBounds
  2852. Calculates the depth bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
  2853. If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range.
  2854. The given bounding box is not clipped to the MVP so the depth bounds may not be as tight as possible.
  2855. ========================
  2856. */
  2857. void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
  2858. #ifdef ID_WIN_X86_SSE2_INTRIN
  2859. __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  2860. __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  2861. __m128 b0 = _mm_loadu_bounds_0( bounds );
  2862. __m128 b1 = _mm_loadu_bounds_1( bounds );
  2863. // take the four points on the X-Y plane
  2864. __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  2865. __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  2866. __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  2867. __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  2868. __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  2869. // compute four partial Z,W values
  2870. __m128 parz = _mm_splat_ps( mvp2, 3 );
  2871. __m128 parw = _mm_splat_ps( mvp3, 3 );
  2872. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  2873. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  2874. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  2875. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  2876. __m128 z0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp2, 2 ), parz );
  2877. __m128 w0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp3, 2 ), parw );
  2878. __m128 z1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp2, 2 ), parz );
  2879. __m128 w1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp3, 2 ), parw );
  2880. __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
  2881. w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) );
  2882. __m128 rw0 = _mm_rcp32_ps( w0 );
  2883. z0 = _mm_mul_ps( z0, rw0 );
  2884. z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 );
  2885. __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
  2886. w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) );
  2887. __m128 rw1 = _mm_rcp32_ps( w1 );
  2888. z1 = _mm_mul_ps( z1, rw1 );
  2889. z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 );
  2890. __m128 minv = _mm_min_ps( z0, z1 );
  2891. __m128 maxv = _mm_max_ps( z0, z1 );
  2892. minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2893. minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2894. maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  2895. maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  2896. if ( windowSpace ) {
  2897. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2898. minv = _mm_madd_ps( minv, vector_float_half, vector_float_half );
  2899. maxv = _mm_madd_ps( maxv, vector_float_half, vector_float_half );
  2900. #endif
  2901. minv = _mm_max_ps( minv, vector_float_zero );
  2902. maxv = _mm_min_ps( maxv, vector_float_one );
  2903. }
  2904. _mm_store_ss( & min, minv );
  2905. _mm_store_ss( & max, maxv );
  2906. #else
  2907. float localMin = RENDER_MATRIX_INFINITY;
  2908. float localMax = - RENDER_MATRIX_INFINITY;
  2909. idVec3 v;
  2910. for ( int x = 0; x < 2; x++ ) {
  2911. v[0] = bounds[x][0];
  2912. for ( int y = 0; y < 2; y++ ) {
  2913. v[1] = bounds[y][1];
  2914. for ( int z = 0; z < 2; z++ ) {
  2915. v[2] = bounds[z][2];
  2916. float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
  2917. float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
  2918. if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) {
  2919. tz = tz / tw;
  2920. } else {
  2921. tz = -RENDER_MATRIX_INFINITY;
  2922. }
  2923. localMin = Min( localMin, tz );
  2924. localMax = Max( localMax, tz );
  2925. }
  2926. }
  2927. }
  2928. if ( windowSpace ) {
  2929. // convert to window coords
  2930. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  2931. min = localMin * 0.5f + 0.5f;
  2932. max = localMax * 0.5f + 0.5f;
  2933. #endif
  2934. // clamp to the [0, 1] range
  2935. min = Max( min, 0.0f );
  2936. max = Min( max, 1.0f );
  2937. }
  2938. #endif
  2939. }
  2940. /*
  2941. ========================
  2942. idRenderMatrix::DepthBoundsForExtrudedBounds
  2943. Calculates the depth bounds of the given extruded bounding box projected with the given Model View Projection (MVP) matrix.
  2944. The given bounding box is extruded in the 'extrudeDirection' up to the 'clipPlane'.
  2945. If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range.
  2946. The extruded bounding box is not clipped to the MVP so the depth bounds may not be as tight as possible.
  2947. ========================
  2948. */
  2949. void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, bool windowSpace ) {
  2950. assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL );
  2951. #ifdef ID_WIN_X86_SSE2_INTRIN
  2952. __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  2953. __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  2954. __m128 b0 = _mm_loadu_bounds_0( bounds );
  2955. __m128 b1 = _mm_loadu_bounds_1( bounds );
  2956. // take the four points on the X-Y plane
  2957. __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  2958. __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  2959. __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  2960. __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  2961. __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  2962. __m128 minv;
  2963. __m128 maxv;
  2964. // calculate the min/max depth values for the bounding box corners
  2965. {
  2966. // compute four partial Z,W values
  2967. __m128 parz = _mm_splat_ps( mvp2, 3 );
  2968. __m128 parw = _mm_splat_ps( mvp3, 3 );
  2969. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  2970. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  2971. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  2972. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  2973. __m128 z0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp2, 2 ), parz );
  2974. __m128 w0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp3, 2 ), parw );
  2975. __m128 z1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp2, 2 ), parz );
  2976. __m128 w1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp3, 2 ), parw );
  2977. __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
  2978. w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) );
  2979. __m128 rw0 = _mm_rcp32_ps( w0 );
  2980. z0 = _mm_mul_ps( z0, rw0 );
  2981. z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 );
  2982. __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
  2983. w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) );
  2984. __m128 rw1 = _mm_rcp32_ps( w1 );
  2985. z1 = _mm_mul_ps( z1, rw1 );
  2986. z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 );
  2987. minv = _mm_min_ps( z0, z1 );
  2988. maxv = _mm_max_ps( z0, z1 );
  2989. }
  2990. // calculate and include the min/max depth value for the extruded bounding box corners
  2991. {
  2992. __m128 clipX = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 0 ), 0 );
  2993. __m128 clipY = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 1 ), 0 );
  2994. __m128 clipZ = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 2 ), 0 );
  2995. __m128 clipW = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 3 ), 0 );
  2996. __m128 extrudeX = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 0 ), 0 );
  2997. __m128 extrudeY = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 1 ), 0 );
  2998. __m128 extrudeZ = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 2 ), 0 );
  2999. __m128 closing = _mm_madd_ps( clipX, extrudeX, _mm_madd_ps( clipY, extrudeY, _mm_mul_ps( clipZ, extrudeZ ) ) );
  3000. __m128 invClosing = _mm_rcp32_ps( closing );
  3001. invClosing = _mm_xor_ps( invClosing, vector_float_sign_bit );
  3002. __m128 dt = _mm_madd_ps( clipX, vx, _mm_madd_ps( clipY, vy, clipW ) );
  3003. __m128 d0 = _mm_madd_ps( clipZ, vz0, dt );
  3004. __m128 d1 = _mm_madd_ps( clipZ, vz1, dt );
  3005. d0 = _mm_mul_ps( d0, invClosing );
  3006. d1 = _mm_mul_ps( d1, invClosing );
  3007. __m128 vx0 = _mm_madd_ps( extrudeX, d0, vx );
  3008. __m128 vx1 = _mm_madd_ps( extrudeX, d1, vx );
  3009. __m128 vy0 = _mm_madd_ps( extrudeY, d0, vy );
  3010. __m128 vy1 = _mm_madd_ps( extrudeY, d1, vy );
  3011. vz0 = _mm_madd_ps( extrudeZ, d0, vz0 );
  3012. vz1 = _mm_madd_ps( extrudeZ, d1, vz1 );
  3013. __m128 mvp2X = _mm_splat_ps( mvp2, 0 );
  3014. __m128 mvp3X = _mm_splat_ps( mvp3, 0 );
  3015. __m128 mvp2W = _mm_splat_ps( mvp2, 3 );
  3016. __m128 mvp3W = _mm_splat_ps( mvp3, 3 );
  3017. __m128 z0 = _mm_madd_ps( vx0, mvp2X, mvp2W );
  3018. __m128 w0 = _mm_madd_ps( vx0, mvp3X, mvp3W );
  3019. __m128 z1 = _mm_madd_ps( vx1, mvp2X, mvp2W );
  3020. __m128 w1 = _mm_madd_ps( vx1, mvp3X, mvp3W );
  3021. __m128 mvp2Y = _mm_splat_ps( mvp2, 1 );
  3022. __m128 mvp3Y = _mm_splat_ps( mvp3, 1 );
  3023. z0 = _mm_madd_ps( vy0, mvp2Y, z0 );
  3024. w0 = _mm_madd_ps( vy0, mvp3Y, w0 );
  3025. z1 = _mm_madd_ps( vy1, mvp2Y, z1 );
  3026. w1 = _mm_madd_ps( vy1, mvp3Y, w1 );
  3027. __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  3028. __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  3029. z0 = _mm_madd_ps( vz0, mvp2Z, z0 );
  3030. w0 = _mm_madd_ps( vz0, mvp3Z, w0 );
  3031. z1 = _mm_madd_ps( vz1, mvp2Z, z1 );
  3032. w1 = _mm_madd_ps( vz1, mvp3Z, w1 );
  3033. __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
  3034. w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) );
  3035. __m128 rw0 = _mm_rcp32_ps( w0 );
  3036. z0 = _mm_mul_ps( z0, rw0 );
  3037. z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 );
  3038. __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
  3039. w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) );
  3040. __m128 rw1 = _mm_rcp32_ps( w1 );
  3041. z1 = _mm_mul_ps( z1, rw1 );
  3042. z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 );
  3043. minv = _mm_min_ps( minv, z0 );
  3044. maxv = _mm_max_ps( maxv, z0 );
  3045. minv = _mm_min_ps( minv, z1 );
  3046. maxv = _mm_max_ps( maxv, z1 );
  3047. }
  3048. minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  3049. minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  3050. maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  3051. maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  3052. if ( windowSpace ) {
  3053. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  3054. minv = _mm_madd_ps( minv, vector_float_half, vector_float_half );
  3055. maxv = _mm_madd_ps( maxv, vector_float_half, vector_float_half );
  3056. #endif
  3057. minv = _mm_max_ps( minv, vector_float_zero );
  3058. maxv = _mm_min_ps( maxv, vector_float_one );
  3059. }
  3060. _mm_store_ss( & min, minv );
  3061. _mm_store_ss( & max, maxv );
  3062. #else
  3063. const float closing = extrudeDirection * clipPlane.Normal();
  3064. const float invClosing = -1.0f / closing;
  3065. float localMin = RENDER_MATRIX_INFINITY;
  3066. float localMax = - RENDER_MATRIX_INFINITY;
  3067. idVec3 v;
  3068. for ( int x = 0; x < 2; x++ ) {
  3069. v[0] = bounds[x][0];
  3070. for ( int y = 0; y < 2; y++ ) {
  3071. v[1] = bounds[y][1];
  3072. for ( int z = 0; z < 2; z++ ) {
  3073. v[2] = bounds[z][2];
  3074. for ( int extrude = 0; extrude <= 1; extrude++ ) {
  3075. idVec3 test;
  3076. if ( extrude ) {
  3077. float extrudeDist = clipPlane.Distance( v ) * invClosing;
  3078. test = v + extrudeDirection * extrudeDist;
  3079. } else {
  3080. test = v;
  3081. }
  3082. float tz = test[0] * mvp[2][0] + test[1] * mvp[2][1] + test[2] * mvp[2][2] + mvp[2][3];
  3083. float tw = test[0] * mvp[3][0] + test[1] * mvp[3][1] + test[2] * mvp[3][2] + mvp[3][3];
  3084. if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) {
  3085. tz = tz / tw;
  3086. } else {
  3087. tz = -RENDER_MATRIX_INFINITY;
  3088. }
  3089. localMin = Min( localMin, tz );
  3090. localMax = Max( localMax, tz );
  3091. }
  3092. }
  3093. }
  3094. }
  3095. if ( windowSpace ) {
  3096. // convert to window coords
  3097. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  3098. min = localMin * 0.5f + 0.5f;
  3099. max = localMax * 0.5f + 0.5f;
  3100. #endif
  3101. // clamp to the [0, 1] range
  3102. min = Max( min, 0.0f );
  3103. max = Min( max, 1.0f );
  3104. }
  3105. #endif
  3106. }
  3107. /*
  3108. ========================
  3109. PointInsideInfiniteShadow
  3110. Returns true if the 'localPoint' is inside the infinite shadow volume cast
  3111. from the given occluder bounding box and the given light position.
  3112. ========================
  3113. */
  3114. static bool PointInsideInfiniteShadow( const idBounds & occluderBounds, const idVec3 & localLightOrigin, const idVec3 & localPoint, const float epsilon ) {
  3115. // Expand the bounds with an epsilon.
  3116. const idBounds expandedBounds = occluderBounds.Expand( epsilon );
  3117. // If the point is inside the bounding box then the point
  3118. // is also inside the shadow projection.
  3119. if ( expandedBounds.ContainsPoint( localPoint ) ) {
  3120. return true;
  3121. }
  3122. // If the light is inside the bounding box then the shadow is projected
  3123. // in all directions and any point is inside the infinte shadow projection.
  3124. if ( expandedBounds.ContainsPoint( localPoint ) ) {
  3125. return true;
  3126. }
  3127. // If the line from localLightOrigin to localPoint intersects the
  3128. // bounding box then the point is inside the infinite shadow projection.
  3129. if ( expandedBounds.LineIntersection( localLightOrigin, localPoint ) ) {
  3130. return true;
  3131. }
  3132. // The point is definitely not inside the projected shadow.
  3133. return false;
  3134. }
  3135. /*
  3136. ========================
  3137. idRenderMatrix::DepthBoundsForShadowBounds
  3138. Calculates the depth bounds of the infinite shadow volume projected with the given Model View Projection (MVP) matrix.
  3139. The infinite shadow volume is cast from the given occluder bounding box and the given light position.
  3140. If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range.
  3141. The infinite shadow volume is fully clipped to the MVP to get the tightest possible bounds.
  3142. Note that this code assumes the MVP matrix has an infinite far clipping plane. When the far plane is at
  3143. infinity the shadow volume is never far clipped and it is sufficient to test whether or not the center
  3144. of the near clip plane is inside the shadow volume to calculate the correct minimum Z. If the far plane
  3145. is not at infinity then this code would also have to test for the view frustum being completely contained
  3146. inside the shadow volume to also calculate the correct maximum Z. This could be done, for instance, by
  3147. testing if the center of the far clipping plane is contained inside the shadow volume.
  3148. ========================
  3149. */
  3150. void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & localLightOrigin, bool windowSpace ) {
  3151. #ifdef ID_WIN_X86_SSE2_INTRIN
  3152. const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
  3153. const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
  3154. const __m128 mvp2 = _mm_loadu_ps( mvp[2] );
  3155. const __m128 mvp3 = _mm_loadu_ps( mvp[3] );
  3156. const __m128 t0 = _mm_unpacklo_ps( mvp0, mvp2 ); // mvp[0][0], mvp[2][0], mvp[0][1], mvp[2][1]
  3157. const __m128 t1 = _mm_unpackhi_ps( mvp0, mvp2 ); // mvp[0][2], mvp[2][2], mvp[0][3], mvp[2][3]
  3158. const __m128 t2 = _mm_unpacklo_ps( mvp1, mvp3 ); // mvp[1][0], mvp[3][0], mvp[1][1], mvp[3][1]
  3159. const __m128 t3 = _mm_unpackhi_ps( mvp1, mvp3 ); // mvp[1][2], mvp[3][2], mvp[1][3], mvp[3][3]
  3160. const __m128 mvpX = _mm_unpacklo_ps( t0, t2 ); // mvp[0][0], mvp[1][0], mvp[2][0], mvp[3][0]
  3161. const __m128 mvpY = _mm_unpackhi_ps( t0, t2 ); // mvp[0][1], mvp[1][1], mvp[2][1], mvp[3][1]
  3162. const __m128 mvpZ = _mm_unpacklo_ps( t1, t3 ); // mvp[0][2], mvp[1][2], mvp[2][2], mvp[3][2]
  3163. const __m128 mvpW = _mm_unpackhi_ps( t1, t3 ); // mvp[0][3], mvp[1][3], mvp[2][3], mvp[3][3]
  3164. const __m128 b0 = _mm_loadu_bounds_0( bounds );
  3165. const __m128 b1 = _mm_loadu_bounds_1( bounds );
  3166. const __m128 lightOriginX = _mm_load_ss( localLightOrigin.ToFloatPtr() + 0 );
  3167. const __m128 lightOriginY = _mm_load_ss( localLightOrigin.ToFloatPtr() + 1 );
  3168. const __m128 lightOriginZ = _mm_load_ss( localLightOrigin.ToFloatPtr() + 2 );
  3169. const __m128 lightOrigin = _mm_unpacklo_ps( _mm_unpacklo_ps( lightOriginX, lightOriginZ ), lightOriginY );
  3170. // calculate the front facing polygon bits
  3171. int frontBits = GetBoxFrontBits_SSE2( b0, b1, lightOrigin );
  3172. const __m128 b0X = _mm_splat_ps( b0, 0 );
  3173. const __m128 b0Y = _mm_splat_ps( b0, 1 );
  3174. const __m128 b0Z = _mm_splat_ps( b0, 2 );
  3175. const __m128 b1X = _mm_splat_ps( b1, 0 );
  3176. const __m128 b1Y = _mm_splat_ps( b1, 1 );
  3177. const __m128 b1Z = _mm_splat_ps( b1, 2 );
  3178. // bounding box corners
  3179. const __m128 np0 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  3180. const __m128 np1 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  3181. const __m128 np2 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  3182. const __m128 np3 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
  3183. const __m128 np4 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  3184. const __m128 np5 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  3185. const __m128 np6 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  3186. const __m128 np7 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
  3187. ALIGNTYPE16 idVec4 projectedNearPoints[8];
  3188. _mm_store_ps( projectedNearPoints[0].ToFloatPtr(), np0 );
  3189. _mm_store_ps( projectedNearPoints[1].ToFloatPtr(), np1 );
  3190. _mm_store_ps( projectedNearPoints[2].ToFloatPtr(), np2 );
  3191. _mm_store_ps( projectedNearPoints[3].ToFloatPtr(), np3 );
  3192. _mm_store_ps( projectedNearPoints[4].ToFloatPtr(), np4 );
  3193. _mm_store_ps( projectedNearPoints[5].ToFloatPtr(), np5 );
  3194. _mm_store_ps( projectedNearPoints[6].ToFloatPtr(), np6 );
  3195. _mm_store_ps( projectedNearPoints[7].ToFloatPtr(), np7 );
  3196. // subtract the light position from the bounding box
  3197. const __m128 lightX = _mm_splat_ps( lightOriginX, 0 );
  3198. const __m128 lightY = _mm_splat_ps( lightOriginY, 0 );
  3199. const __m128 lightZ = _mm_splat_ps( lightOriginZ, 0 );
  3200. const __m128 d0X = _mm_sub_ps( b0X, lightX );
  3201. const __m128 d0Y = _mm_sub_ps( b0Y, lightY );
  3202. const __m128 d0Z = _mm_sub_ps( b0Z, lightZ );
  3203. const __m128 d1X = _mm_sub_ps( b1X, lightX );
  3204. const __m128 d1Y = _mm_sub_ps( b1Y, lightY );
  3205. const __m128 d1Z = _mm_sub_ps( b1Z, lightZ );
  3206. // bounding box corners projected to infinity from the light position
  3207. const __m128 fp0 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
  3208. const __m128 fp1 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
  3209. const __m128 fp2 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
  3210. const __m128 fp3 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
  3211. const __m128 fp4 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
  3212. const __m128 fp5 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
  3213. const __m128 fp6 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
  3214. const __m128 fp7 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
  3215. ALIGNTYPE16 idVec4 projectedFarPoints[8];
  3216. _mm_store_ps( projectedFarPoints[0].ToFloatPtr(), fp0 );
  3217. _mm_store_ps( projectedFarPoints[1].ToFloatPtr(), fp1 );
  3218. _mm_store_ps( projectedFarPoints[2].ToFloatPtr(), fp2 );
  3219. _mm_store_ps( projectedFarPoints[3].ToFloatPtr(), fp3 );
  3220. _mm_store_ps( projectedFarPoints[4].ToFloatPtr(), fp4 );
  3221. _mm_store_ps( projectedFarPoints[5].ToFloatPtr(), fp5 );
  3222. _mm_store_ps( projectedFarPoints[6].ToFloatPtr(), fp6 );
  3223. _mm_store_ps( projectedFarPoints[7].ToFloatPtr(), fp7 );
  3224. ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16];
  3225. int numClippedPoints = 0;
  3226. // clip the front facing bounding box polygons at the near cap
  3227. const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits];
  3228. for ( int i = 0; i < frontPolygons.count; i++ ) {
  3229. const int polygon = frontPolygons.indices[i];
  3230. _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][0]].ToFloatPtr() ) );
  3231. _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][1]].ToFloatPtr() ) );
  3232. _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][2]].ToFloatPtr() ) );
  3233. _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][3]].ToFloatPtr() ) );
  3234. numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
  3235. }
  3236. // clip the front facing bounding box polygons projected to the far cap
  3237. for ( int i = 0; i < frontPolygons.count; i++ ) {
  3238. const int polygon = frontPolygons.indices[i];
  3239. _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][0]].ToFloatPtr() ) );
  3240. _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][1]].ToFloatPtr() ) );
  3241. _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][2]].ToFloatPtr() ) );
  3242. _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][3]].ToFloatPtr() ) );
  3243. numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
  3244. }
  3245. // clip the silhouette edge polygons that stretch to infinity
  3246. const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits];
  3247. for ( int i = 0; i < silhouetteEdges.count; i++ ) {
  3248. const int edge = silhouetteEdges.indices[i];
  3249. _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxEdgeVertices[edge][0]].ToFloatPtr() ) );
  3250. _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxEdgeVertices[edge][1]].ToFloatPtr() ) );
  3251. _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxEdgeVertices[edge][1]].ToFloatPtr() ) );
  3252. _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxEdgeVertices[edge][0]].ToFloatPtr() ) );
  3253. numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
  3254. }
  3255. // repeat the first clipped point at the end to get a multiple of 4 clipped points
  3256. const __m128 point0 = _mm_load_ps( clippedPoints[0].ToFloatPtr() );
  3257. for ( int i = numClippedPoints; ( i & 3 ) != 0; i++ ) {
  3258. _mm_store_ps( clippedPoints[i].ToFloatPtr(), point0 );
  3259. }
  3260. // test if the center of the near clip plane is inside the infinite shadow volume
  3261. const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
  3262. const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON );
  3263. __m128 minZ = inside ? vector_float_neg_one : vector_float_pos_infinity;
  3264. __m128 maxZ = vector_float_neg_infinity;
  3265. for ( int i = 0; i < numClippedPoints; i += 4 ) {
  3266. const __m128 cp0 = _mm_load_ps( clippedPoints[i + 0].ToFloatPtr() );
  3267. const __m128 cp1 = _mm_load_ps( clippedPoints[i + 1].ToFloatPtr() );
  3268. const __m128 cp2 = _mm_load_ps( clippedPoints[i + 2].ToFloatPtr() );
  3269. const __m128 cp3 = _mm_load_ps( clippedPoints[i + 3].ToFloatPtr() );
  3270. const __m128 s1 = _mm_unpackhi_ps( cp0, cp2 ); // cp0[2], cp2[2], cp0[3], cp2[3]
  3271. const __m128 s3 = _mm_unpackhi_ps( cp1, cp3 ); // cp1[2], cp3[2], cp1[3], cp3[3]
  3272. const __m128 cpZ = _mm_unpacklo_ps( s1, s3 ); // cp0[2], cp1[2], cp2[2], cp3[2]
  3273. const __m128 cpW = _mm_unpackhi_ps( s1, s3 ); // cp0[3], cp1[3], cp2[3], cp3[3]
  3274. const __m128 rW = _mm_rcp32_ps( cpW );
  3275. const __m128 rZ = _mm_mul_ps( cpZ, rW );
  3276. minZ = _mm_min_ps( minZ, rZ );
  3277. maxZ = _mm_max_ps( maxZ, rZ );
  3278. }
  3279. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  3280. minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  3281. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  3282. maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  3283. if ( windowSpace ) {
  3284. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  3285. minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
  3286. maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
  3287. #endif
  3288. minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
  3289. maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
  3290. }
  3291. _mm_store_ss( & min, minZ );
  3292. _mm_store_ss( & max, maxZ );
  3293. #else
  3294. const idVec3 points[8] = {
  3295. idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
  3296. idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
  3297. idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
  3298. idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
  3299. idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
  3300. idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
  3301. idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
  3302. idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
  3303. };
  3304. // calculate the front facing polygon bits
  3305. int frontBits = GetBoxFrontBits_Generic( bounds, localLightOrigin );
  3306. // bounding box corners
  3307. ALIGNTYPE16 idVec4 projectedNearPoints[8];
  3308. for ( int i = 0; i < 8; i++ ) {
  3309. const idVec3 & v = points[i];
  3310. projectedNearPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
  3311. projectedNearPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
  3312. projectedNearPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
  3313. projectedNearPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
  3314. }
  3315. // bounding box corners projected to infinity from the light position
  3316. ALIGNTYPE16 idVec4 projectedFarPoints[8];
  3317. for ( int i = 0; i < 8; i++ ) {
  3318. const idVec3 v = points[i] - localLightOrigin;
  3319. projectedFarPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2];
  3320. projectedFarPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2];
  3321. projectedFarPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2];
  3322. projectedFarPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2];
  3323. }
  3324. ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16];
  3325. int numClippedPoints = 0;
  3326. // clip the front facing bounding box polygons at the near cap
  3327. const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits];
  3328. for ( int i = 0; i < frontPolygons.count; i++ ) {
  3329. const int polygon = frontPolygons.indices[i];
  3330. clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxPolygonVertices[polygon][0]];
  3331. clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxPolygonVertices[polygon][1]];
  3332. clippedPoints[numClippedPoints + 2] = projectedNearPoints[boxPolygonVertices[polygon][2]];
  3333. clippedPoints[numClippedPoints + 3] = projectedNearPoints[boxPolygonVertices[polygon][3]];
  3334. numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
  3335. }
  3336. // clip the front facing bounding box polygons projected to the far cap
  3337. for ( int i = 0; i < frontPolygons.count; i++ ) {
  3338. const int polygon = frontPolygons.indices[i];
  3339. clippedPoints[numClippedPoints + 0] = projectedFarPoints[boxPolygonVertices[polygon][0]];
  3340. clippedPoints[numClippedPoints + 1] = projectedFarPoints[boxPolygonVertices[polygon][1]];
  3341. clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxPolygonVertices[polygon][2]];
  3342. clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxPolygonVertices[polygon][3]];
  3343. numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
  3344. }
  3345. // clip the silhouette edge polygons that stretch to infinity
  3346. const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits];
  3347. for ( int i = 0; i < silhouetteEdges.count; i++ ) {
  3348. const int edge = silhouetteEdges.indices[i];
  3349. clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxEdgeVertices[edge][0]];
  3350. clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxEdgeVertices[edge][1]];
  3351. clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxEdgeVertices[edge][1]];
  3352. clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxEdgeVertices[edge][0]];
  3353. numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
  3354. }
  3355. // test if the center of the near clip plane is inside the infinite shadow volume
  3356. const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
  3357. const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON );
  3358. min = inside ? -1.0f : RENDER_MATRIX_INFINITY;
  3359. max = - RENDER_MATRIX_INFINITY;
  3360. for ( int i = 0; i < numClippedPoints; i++ ) {
  3361. const idVec4 & c = clippedPoints[i];
  3362. assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL );
  3363. const float rw = 1.0f / c.w;
  3364. const float pz = c.z * rw;
  3365. min = Min( min, pz );
  3366. max = Max( max, pz );
  3367. }
  3368. if ( windowSpace ) {
  3369. // convert to window coords
  3370. #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1]
  3371. min = min * 0.5f + 0.5f;
  3372. max = max * 0.5f + 0.5f;
  3373. #endif
  3374. // clamp to [0, 1] range
  3375. min = idMath::ClampFloat( 0.0f, 1.0f, min );
  3376. max = idMath::ClampFloat( 0.0f, 1.0f, max );
  3377. }
  3378. #endif
  3379. }
  3380. /*
  3381. ========================
  3382. idRenderMatrix::GetFrustumPlanes
  3383. Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
  3384. to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
  3385. ========================
  3386. */
  3387. void idRenderMatrix::GetFrustumPlanes( idPlane planes[6], const idRenderMatrix & frustum, bool zeroToOne, bool normalize ) {
  3388. // FIXME: need to know whether or not this is a D3D MVP.
  3389. // We cannot just assume that it's an D3D MVP matrix when
  3390. // zeroToOne = false and CLIP_SPACE_D3D is defined because
  3391. // this code may be called for non-MVP matrices.
  3392. const bool isZeroOneZ = false;
  3393. if ( zeroToOne ) {
  3394. // left: inside(p) = p * frustum[0] > 0
  3395. planes[0][0] = frustum[0][0];
  3396. planes[0][1] = frustum[0][1];
  3397. planes[0][2] = frustum[0][2];
  3398. planes[0][3] = frustum[0][3];
  3399. // bottom: inside(p) = p * frustum[1] > 0
  3400. planes[2][0] = frustum[1][0];
  3401. planes[2][1] = frustum[1][1];
  3402. planes[2][2] = frustum[1][2];
  3403. planes[2][3] = frustum[1][3];
  3404. // near: inside(p) = p * frustum[2] > 0
  3405. planes[4][0] = frustum[2][0];
  3406. planes[4][1] = frustum[2][1];
  3407. planes[4][2] = frustum[2][2];
  3408. planes[4][3] = frustum[2][3];
  3409. } else {
  3410. // left: inside(p) = p * frustum[0] > - ( p * frustum[3] )
  3411. planes[0][0] = frustum[3][0] + frustum[0][0];
  3412. planes[0][1] = frustum[3][1] + frustum[0][1];
  3413. planes[0][2] = frustum[3][2] + frustum[0][2];
  3414. planes[0][3] = frustum[3][3] + frustum[0][3];
  3415. // bottom: inside(p) = p * frustum[1] > -( p * frustum[3] )
  3416. planes[2][0] = frustum[3][0] + frustum[1][0];
  3417. planes[2][1] = frustum[3][1] + frustum[1][1];
  3418. planes[2][2] = frustum[3][2] + frustum[1][2];
  3419. planes[2][3] = frustum[3][3] + frustum[1][3];
  3420. // near: inside(p) = p * frustum[2] > -( p * frustum[3] )
  3421. planes[4][0] = isZeroOneZ ? ( frustum[2][0] ) : ( frustum[3][0] + frustum[2][0] );
  3422. planes[4][1] = isZeroOneZ ? ( frustum[2][1] ) : ( frustum[3][1] + frustum[2][1] );
  3423. planes[4][2] = isZeroOneZ ? ( frustum[2][2] ) : ( frustum[3][2] + frustum[2][2] );
  3424. planes[4][3] = isZeroOneZ ? ( frustum[2][3] ) : ( frustum[3][3] + frustum[2][3] );
  3425. }
  3426. // right: inside(p) = p * frustum[0] < p * frustum[3]
  3427. planes[1][0] = frustum[3][0] - frustum[0][0];
  3428. planes[1][1] = frustum[3][1] - frustum[0][1];
  3429. planes[1][2] = frustum[3][2] - frustum[0][2];
  3430. planes[1][3] = frustum[3][3] - frustum[0][3];
  3431. // top: inside(p) = p * frustum[1] < p * frustum[3]
  3432. planes[3][0] = frustum[3][0] - frustum[1][0];
  3433. planes[3][1] = frustum[3][1] - frustum[1][1];
  3434. planes[3][2] = frustum[3][2] - frustum[1][2];
  3435. planes[3][3] = frustum[3][3] - frustum[1][3];
  3436. // far: inside(p) = p * frustum[2] < p * frustum[3]
  3437. planes[5][0] = frustum[3][0] - frustum[2][0];
  3438. planes[5][1] = frustum[3][1] - frustum[2][1];
  3439. planes[5][2] = frustum[3][2] - frustum[2][2];
  3440. planes[5][3] = frustum[3][3] - frustum[2][3];
  3441. // optionally normalize the planes
  3442. if ( normalize ) {
  3443. for ( int i = 0; i < 6; i++ ) {
  3444. float s = idMath::InvSqrt( planes[i].Normal().LengthSqr() );
  3445. planes[i][0] *= s;
  3446. planes[i][1] *= s;
  3447. planes[i][2] *= s;
  3448. planes[i][3] *= s;
  3449. }
  3450. }
  3451. }
  3452. /*
  3453. ========================
  3454. idRenderMatrix::GetFrustumCorners
  3455. ========================
  3456. */
  3457. void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRenderMatrix & frustumTransform, const idBounds & frustumBounds ) {
  3458. assert_16_byte_aligned( &corners );
  3459. #ifdef ID_WIN_X86_SSE2_INTRIN
  3460. __m128 mvp0 = _mm_loadu_ps( frustumTransform[0] );
  3461. __m128 mvp1 = _mm_loadu_ps( frustumTransform[1] );
  3462. __m128 mvp2 = _mm_loadu_ps( frustumTransform[2] );
  3463. __m128 mvp3 = _mm_loadu_ps( frustumTransform[3] );
  3464. __m128 b0 = _mm_loadu_bounds_0( frustumBounds );
  3465. __m128 b1 = _mm_loadu_bounds_1( frustumBounds );
  3466. // take the four points on the X-Y plane
  3467. __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y
  3468. __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X
  3469. __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y
  3470. __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z
  3471. __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z
  3472. // compute four partial X,Y,Z,W values
  3473. __m128 parx = _mm_splat_ps( mvp0, 3 );
  3474. __m128 pary = _mm_splat_ps( mvp1, 3 );
  3475. __m128 parz = _mm_splat_ps( mvp2, 3 );
  3476. __m128 parw = _mm_splat_ps( mvp3, 3 );
  3477. parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
  3478. pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
  3479. parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
  3480. parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
  3481. parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
  3482. pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
  3483. parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
  3484. parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
  3485. __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
  3486. __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
  3487. __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
  3488. __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
  3489. __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );
  3490. __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
  3491. __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
  3492. __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
  3493. __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );
  3494. __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
  3495. __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
  3496. __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
  3497. __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
  3498. __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
  3499. w0 = _mm_sel_ps( w0, vector_float_one, s0 );
  3500. w1 = _mm_sel_ps( w1, vector_float_one, s1 );
  3501. __m128 rw0 = _mm_rcp32_ps( w0 );
  3502. __m128 rw1 = _mm_rcp32_ps( w1 );
  3503. x0 = _mm_mul_ps( x0, rw0 );
  3504. y0 = _mm_mul_ps( y0, rw0 );
  3505. z0 = _mm_mul_ps( z0, rw0 );
  3506. x1 = _mm_mul_ps( x1, rw1 );
  3507. y1 = _mm_mul_ps( y1, rw1 );
  3508. z1 = _mm_mul_ps( z1, rw1 );
  3509. _mm_store_ps( corners.x + 0, x0 );
  3510. _mm_store_ps( corners.x + 4, x1 );
  3511. _mm_store_ps( corners.y + 0, y0 );
  3512. _mm_store_ps( corners.y + 4, y1 );
  3513. _mm_store_ps( corners.z + 0, z0 );
  3514. _mm_store_ps( corners.z + 4, z1 );
  3515. #else
  3516. idVec3 v;
  3517. for ( int x = 0; x < 2; x++ ) {
  3518. v[0] = frustumBounds[x][0];
  3519. for ( int y = 0; y < 2; y++ ) {
  3520. v[1] = frustumBounds[y][1];
  3521. for ( int z = 0; z < 2; z++ ) {
  3522. v[2] = frustumBounds[z][2];
  3523. float tx = v[0] * frustumTransform[0][0] + v[1] * frustumTransform[0][1] + v[2] * frustumTransform[0][2] + frustumTransform[0][3];
  3524. float ty = v[0] * frustumTransform[1][0] + v[1] * frustumTransform[1][1] + v[2] * frustumTransform[1][2] + frustumTransform[1][3];
  3525. float tz = v[0] * frustumTransform[2][0] + v[1] * frustumTransform[2][1] + v[2] * frustumTransform[2][2] + frustumTransform[2][3];
  3526. float tw = v[0] * frustumTransform[3][0] + v[1] * frustumTransform[3][1] + v[2] * frustumTransform[3][2] + frustumTransform[3][3];
  3527. assert( tw > idMath::FLT_SMALLEST_NON_DENORMAL );
  3528. float rw = 1.0f / tw;
  3529. corners.x[(z<<2)|(y<<1)|(x<<0)] = tx * rw;
  3530. corners.y[(z<<2)|(y<<1)|(x<<0)] = ty * rw;
  3531. corners.z[(z<<2)|(y<<1)|(x<<0)] = tz * rw;
  3532. }
  3533. }
  3534. }
  3535. #endif
  3536. }
  3537. /*
  3538. ========================
  3539. idRenderMatrix::CullFrustumCornersToPlane
  3540. ========================
  3541. */
  3542. frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t & corners, const idPlane & plane ) {
  3543. assert_16_byte_aligned( &corners );
  3544. #ifdef ID_WIN_X86_SSE2_INTRIN
  3545. __m128 vp = _mm_loadu_ps( plane.ToFloatPtr() );
  3546. __m128 x0 = _mm_load_ps( corners.x + 0 );
  3547. __m128 y0 = _mm_load_ps( corners.y + 0 );
  3548. __m128 z0 = _mm_load_ps( corners.z + 0 );
  3549. __m128 x1 = _mm_load_ps( corners.x + 4 );
  3550. __m128 y1 = _mm_load_ps( corners.y + 4 );
  3551. __m128 z1 = _mm_load_ps( corners.z + 4 );
  3552. __m128 p0 = _mm_splat_ps( vp, 0 );
  3553. __m128 p1 = _mm_splat_ps( vp, 1 );
  3554. __m128 p2 = _mm_splat_ps( vp, 2 );
  3555. __m128 p3 = _mm_splat_ps( vp, 3 );
  3556. __m128 d0 = _mm_madd_ps( x0, p0, _mm_madd_ps( y0, p1, _mm_madd_ps( z0, p2, p3 ) ) );
  3557. __m128 d1 = _mm_madd_ps( x1, p0, _mm_madd_ps( y1, p1, _mm_madd_ps( z1, p2, p3 ) ) );
  3558. int b0 = _mm_movemask_ps( d0 );
  3559. int b1 = _mm_movemask_ps( d1 );
  3560. unsigned int front = ( (unsigned int) -( ( b0 & b1 ) ^ 15 ) ) >> 31;
  3561. unsigned int back = ( (unsigned int) -( b0 | b1 ) ) >> 31;
  3562. compile_time_assert( FRUSTUM_CULL_FRONT == 1 );
  3563. compile_time_assert( FRUSTUM_CULL_BACK == 2 );
  3564. compile_time_assert( FRUSTUM_CULL_CROSS == 3 );
  3565. return (frustumCull_t) ( front | ( back << 1 ) );
  3566. #else
  3567. bool front = false;
  3568. bool back = false;
  3569. for ( int i = 0; i < 8; i++ ) {
  3570. const float d = corners.x[i] * plane[0] + corners.y[i] * plane[1] + corners.z[i] * plane[2] + plane[3];
  3571. if ( d >= 0.0f ) {
  3572. front = true;
  3573. } else if ( d <= 0.0f ) {
  3574. back = true;
  3575. }
  3576. if ( back && front ) {
  3577. return FRUSTUM_CULL_CROSS;
  3578. }
  3579. }
  3580. if ( front ) {
  3581. return FRUSTUM_CULL_FRONT;
  3582. } else {
  3583. return FRUSTUM_CULL_BACK;
  3584. }
  3585. #endif
  3586. }