123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042 |
- /*
- ===========================================================================
- Doom 3 BFG Edition GPL Source Code
- Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
- Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #pragma hdrstop
- #include "../precompiled.h"
- // this file is full of intentional case fall throughs
- //lint -e616
- // the code is correct, it can't be a NULL pointer
- //lint -e613
- static idCVar lcp_showFailures( "lcp_showFailures", "0", CVAR_BOOL, "show LCP solver failures" );
- const float LCP_BOUND_EPSILON = 1e-5f;
- const float LCP_ACCEL_EPSILON = 1e-5f;
- const float LCP_DELTA_ACCEL_EPSILON = 1e-9f;
- const float LCP_DELTA_FORCE_EPSILON = 1e-9f;
- #define IGNORE_UNSATISFIABLE_VARIABLES
- #if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN )
- ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
- ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
- ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f };
- ALIGN16( const __m128 SIMD_SP_tiny ) = { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
- ALIGN16( const __m128 SIMD_SP_infinity ) = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
- ALIGN16( const __m128 SIMD_SP_LCP_DELTA_ACCEL_EPSILON ) = { LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON };
- ALIGN16( const __m128 SIMD_SP_LCP_DELTA_FORCE_EPSILON ) = { LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON };
- ALIGN16( const __m128 SIMD_SP_LCP_BOUND_EPSILON ) = { LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON };
- ALIGN16( const __m128 SIMD_SP_neg_LCP_BOUND_EPSILON ) = { -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON };
- ALIGN16( const unsigned int SIMD_SP_signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
- ALIGN16( const unsigned int SIMD_SP_absMask[4] ) = { ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK };
- ALIGN16( const unsigned int SIMD_SP_indexedStartMask[4][4] ) = { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0, 0xFFFFFFFF } };
- ALIGN16( const unsigned int SIMD_SP_indexedEndMask[4][4] ) = { { 0, 0, 0, 0 }, { 0xFFFFFFFF, 0, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 } };
- ALIGN16( const unsigned int SIMD_SP_clearLast1[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
- ALIGN16( const unsigned int SIMD_SP_clearLast2[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 };
- ALIGN16( const unsigned int SIMD_SP_clearLast3[4] ) = { 0xFFFFFFFF, 0, 0, 0 };
- ALIGN16( const unsigned int SIMD_DW_zero[4] ) = { 0, 0, 0, 0 };
- ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 };
- ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 };
- ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
- ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };
- #endif
- /*
- ========================
- Multiply_SIMD
- dst[i] = src0[i] * src1[i];
- Assumes the source and destination have the same memory alignment.
- ========================
- */
- static void Multiply_SIMD( float * dst, const float * src0, const float * src1, const int count ) {
- int i = 0;
- for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) {
- dst[i] = src0[i] * src1[i];
- }
- #ifdef ID_WIN_X86_SSE_INTRIN
- for ( ; i + 4 <= count; i += 4 ) {
- assert_16_byte_aligned( &dst[i] );
- assert_16_byte_aligned( &src0[i] );
- assert_16_byte_aligned( &src1[i] );
- __m128 s0 = _mm_load_ps( src0 + i );
- __m128 s1 = _mm_load_ps( src1 + i );
- s0 = _mm_mul_ps( s0, s1 );
- _mm_store_ps( dst + i, s0 );
- }
- #else
- for ( ; i + 4 <= count; i += 4 ) {
- assert_16_byte_aligned( &dst[i] );
- assert_16_byte_aligned( &src0[i] );
- assert_16_byte_aligned( &src1[i] );
- dst[i+0] = src0[i+0] * src1[i+0];
- dst[i+1] = src0[i+1] * src1[i+1];
- dst[i+2] = src0[i+2] * src1[i+2];
- dst[i+3] = src0[i+3] * src1[i+3];
- }
- #endif
- for ( ; i < count; i++ ) {
- dst[i] = src0[i] * src1[i];
- }
- }
- /*
- ========================
- MultiplyAdd_SIMD
- dst[i] += constant * src[i];
- Assumes the source and destination have the same memory alignment.
- ========================
- */
- static void MultiplyAdd_SIMD( float * dst, const float constant, const float * src, const int count ) {
- int i = 0;
- for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) {
- dst[i] += constant * src[i];
- }
- #ifdef ID_WIN_X86_SSE_INTRIN
- __m128 c = _mm_load1_ps( & constant );
- for ( ; i + 4 <= count; i += 4 ) {
- assert_16_byte_aligned( &dst[i] );
- assert_16_byte_aligned( &src[i] );
- __m128 s = _mm_load_ps( src + i );
- __m128 d = _mm_load_ps( dst + i );
- s = _mm_add_ps( _mm_mul_ps( s, c ), d );
- _mm_store_ps( dst + i, s );
- }
- #else
- for ( ; i + 4 <= count; i += 4 ) {
- assert_16_byte_aligned( &src[i] );
- assert_16_byte_aligned( &dst[i] );
- dst[i+0] += constant * src[i+0];
- dst[i+1] += constant * src[i+1];
- dst[i+2] += constant * src[i+2];
- dst[i+3] += constant * src[i+3];
- }
- #endif
- for ( ; i < count; i++ ) {
- dst[i] += constant * src[i];
- }
- }
- /*
- ========================
- DotProduct_SIMD
- dot = src0[0] * src1[0] + src0[1] * src1[1] + src0[2] * src1[2] + ...
- ========================
- */
- static float DotProduct_SIMD( const float * src0, const float * src1, const int count ) {
- assert_16_byte_aligned( src0 );
- assert_16_byte_aligned( src1 );
- #ifdef ID_WIN_X86_SSE_INTRIN
- __m128 sum = (__m128 &) SIMD_SP_zero;
- int i = 0;
- for ( ; i < count - 3; i += 4 ) {
- __m128 s0 = _mm_load_ps( src0 + i );
- __m128 s1 = _mm_load_ps( src1 + i );
- sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum );
- }
- __m128 mask = _mm_load_ps( (float *) &SIMD_SP_indexedEndMask[count & 3] );
- __m128 s0 = _mm_and_ps( _mm_load_ps( src0 + i ), mask );
- __m128 s1 = _mm_and_ps( _mm_load_ps( src1 + i ), mask );
- sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum );
- sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
- sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
- float dot;
- _mm_store_ss( & dot, sum );
- return dot;
- #else
- float s0 = 0.0f;
- float s1 = 0.0f;
- float s2 = 0.0f;
- float s3 = 0.0f;
- int i = 0;
- for ( ; i < count - 3; i += 4 ) {
- s0 += src0[i+4] * src1[i+4];
- s1 += src0[i+5] * src1[i+5];
- s2 += src0[i+6] * src1[i+6];
- s3 += src0[i+7] * src1[i+7];
- }
- switch( count - i ) {
- NODEFAULT;
- case 3: s0 += src0[i+2] * src1[i+2];
- case 2: s1 += src0[i+1] * src1[i+1];
- case 1: s2 += src0[i+0] * src1[i+0];
- case 0:
- break;
- }
- return s0 + s1 + s2 + s3;
- #endif
- }
- /*
- ========================
- LowerTriangularSolve_SIMD
- Solves x in Lx = b for the n * n sub-matrix of L.
- * if skip > 0 the first skip elements of x are assumed to be valid already
- * L has to be a lower triangular matrix with (implicit) ones on the diagonal
- * x == b is allowed
- ========================
- */
- static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float * b, const int n, int skip ) {
- if ( skip >= n ) {
- return;
- }
- const float *lptr = L.ToFloatPtr();
- int nc = L.GetNumColumns();
- assert( ( nc & 3 ) == 0 );
- // unrolled cases for n < 8
- if ( n < 8 ) {
- #define NSKIP( n, s ) ((n<<3)|(s&7))
- switch( NSKIP( n, skip ) ) {
- case NSKIP( 1, 0 ): x[0] = b[0];
- return;
- case NSKIP( 2, 0 ): x[0] = b[0];
- case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- return;
- case NSKIP( 3, 0 ): x[0] = b[0];
- case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- return;
- case NSKIP( 4, 0 ): x[0] = b[0];
- case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- return;
- case NSKIP( 5, 0 ): x[0] = b[0];
- case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- return;
- case NSKIP( 6, 0 ): x[0] = b[0];
- case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
- return;
- case NSKIP( 7, 0 ): x[0] = b[0];
- case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
- case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
- return;
- }
- #undef NSKIP
- return;
- }
- // process first 4 rows
- switch( skip ) {
- case 0: x[0] = b[0];
- case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
- case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- skip = 4;
- }
- lptr = L[skip];
- int i = skip;
- #ifdef ID_WIN_X86_SSE_INTRIN
- // work up to a multiple of 4 rows
- for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
- __m128 sum = _mm_load_ss( & b[i] );
- int j = 0;
- for ( ; j < i - 3; j += 4 ) {
- __m128 s0 = _mm_load_ps( lptr + j );
- __m128 s1 = _mm_load_ps( x + j );
- sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
- }
- __m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] );
- __m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask );
- __m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask );
- sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
- sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
- sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
- _mm_store_ss( & x[i], sum );
- lptr += nc;
- }
- for ( ; i + 3 < n; i += 4 ) {
- const float * lptr0 = &lptr[0*nc];
- const float * lptr1 = &lptr[1*nc];
- const float * lptr2 = &lptr[2*nc];
- const float * lptr3 = &lptr[3*nc];
- assert_16_byte_aligned( lptr0 );
- assert_16_byte_aligned( lptr1 );
- assert_16_byte_aligned( lptr2 );
- assert_16_byte_aligned( lptr3 );
- __m128 va = _mm_load_ss( & b[i+0] );
- __m128 vb = _mm_load_ss( & b[i+1] );
- __m128 vc = _mm_load_ss( & b[i+2] );
- __m128 vd = _mm_load_ss( & b[i+3] );
- __m128 x0 = _mm_load_ps( & x[0] );
- va = _mm_sub_ps( va, _mm_mul_ps( x0, _mm_load_ps( lptr0 + 0 ) ) );
- vb = _mm_sub_ps( vb, _mm_mul_ps( x0, _mm_load_ps( lptr1 + 0 ) ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( x0, _mm_load_ps( lptr2 + 0 ) ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( x0, _mm_load_ps( lptr3 + 0 ) ) );
- for ( int j = 4; j < i; j += 4 ) {
- __m128 xj = _mm_load_ps( &x[j] );
- va = _mm_sub_ps( va, _mm_mul_ps( xj, _mm_load_ps( lptr0 + j ) ) );
- vb = _mm_sub_ps( vb, _mm_mul_ps( xj, _mm_load_ps( lptr1 + j ) ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( xj, _mm_load_ps( lptr2 + j ) ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( xj, _mm_load_ps( lptr3 + j ) ) );
- }
- vb = _mm_sub_ps( vb, _mm_mul_ps( va, _mm_load1_ps( lptr1 + i + 0 ) ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( va, _mm_load1_ps( lptr2 + i + 0 ) ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( vb, _mm_load1_ps( lptr2 + i + 1 ) ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( va, _mm_load1_ps( lptr3 + i + 0 ) ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( vb, _mm_load1_ps( lptr3 + i + 1 ) ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( vc, _mm_load1_ps( lptr3 + i + 2 ) ) );
- __m128 ta = _mm_unpacklo_ps( va, vc ); // x0, z0, x1, z1
- __m128 tb = _mm_unpackhi_ps( va, vc ); // x2, z2, x3, z3
- __m128 tc = _mm_unpacklo_ps( vb, vd ); // y0, w0, y1, w1
- __m128 td = _mm_unpackhi_ps( vb, vd ); // y2, w2, y3, w3
- va = _mm_unpacklo_ps( ta, tc ); // x0, y0, z0, w0
- vb = _mm_unpackhi_ps( ta, tc ); // x1, y1, z1, w1
- vc = _mm_unpacklo_ps( tb, td ); // x2, y2, z2, w2
- vd = _mm_unpackhi_ps( tb, td ); // x3, y3, z3, w3
- va = _mm_add_ps( va, vb );
- vc = _mm_add_ps( vc, vd );
- va = _mm_add_ps( va, vc );
- _mm_store_ps( & x[i], va );
- lptr += 4 * nc;
- }
- // go through any remaining rows
- for ( ; i < n; i++ ) {
- __m128 sum = _mm_load_ss( & b[i] );
- int j = 0;
- for ( ; j < i - 3; j += 4 ) {
- __m128 s0 = _mm_load_ps( lptr + j );
- __m128 s1 = _mm_load_ps( x + j );
- sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
- }
- __m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] );
- __m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask );
- __m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask );
- sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
- sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
- sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
- _mm_store_ss( & x[i], sum );
- lptr += nc;
- }
- #else
- // work up to a multiple of 4 rows
- for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
- float sum = b[i];
- for ( int j = 0; j < i; j++ ) {
- sum -= lptr[j] * x[j];
- }
- x[i] = sum;
- lptr += nc;
- }
- assert_16_byte_aligned( x );
- for ( ; i + 3 < n; i += 4 ) {
- const float * lptr0 = &lptr[0*nc];
- const float * lptr1 = &lptr[1*nc];
- const float * lptr2 = &lptr[2*nc];
- const float * lptr3 = &lptr[3*nc];
- assert_16_byte_aligned( lptr0 );
- assert_16_byte_aligned( lptr1 );
- assert_16_byte_aligned( lptr2 );
- assert_16_byte_aligned( lptr3 );
- float a0 = - lptr0[0] * x[0] + b[i+0];
- float a1 = - lptr0[1] * x[1];
- float a2 = - lptr0[2] * x[2];
- float a3 = - lptr0[3] * x[3];
- float b0 = - lptr1[0] * x[0] + b[i+1];
- float b1 = - lptr1[1] * x[1];
- float b2 = - lptr1[2] * x[2];
- float b3 = - lptr1[3] * x[3];
- float c0 = - lptr2[0] * x[0] + b[i+2];
- float c1 = - lptr2[1] * x[1];
- float c2 = - lptr2[2] * x[2];
- float c3 = - lptr2[3] * x[3];
- float d0 = - lptr3[0] * x[0] + b[i+3];
- float d1 = - lptr3[1] * x[1];
- float d2 = - lptr3[2] * x[2];
- float d3 = - lptr3[3] * x[3];
- for ( int j = 4; j < i; j += 4 ) {
- a0 -= lptr0[j+0] * x[j+0];
- a1 -= lptr0[j+1] * x[j+1];
- a2 -= lptr0[j+2] * x[j+2];
- a3 -= lptr0[j+3] * x[j+3];
- b0 -= lptr1[j+0] * x[j+0];
- b1 -= lptr1[j+1] * x[j+1];
- b2 -= lptr1[j+2] * x[j+2];
- b3 -= lptr1[j+3] * x[j+3];
- c0 -= lptr2[j+0] * x[j+0];
- c1 -= lptr2[j+1] * x[j+1];
- c2 -= lptr2[j+2] * x[j+2];
- c3 -= lptr2[j+3] * x[j+3];
- d0 -= lptr3[j+0] * x[j+0];
- d1 -= lptr3[j+1] * x[j+1];
- d2 -= lptr3[j+2] * x[j+2];
- d3 -= lptr3[j+3] * x[j+3];
- }
- b0 -= lptr1[i+0] * a0;
- b1 -= lptr1[i+0] * a1;
- b2 -= lptr1[i+0] * a2;
- b3 -= lptr1[i+0] * a3;
- c0 -= lptr2[i+0] * a0;
- c1 -= lptr2[i+0] * a1;
- c2 -= lptr2[i+0] * a2;
- c3 -= lptr2[i+0] * a3;
- c0 -= lptr2[i+1] * b0;
- c1 -= lptr2[i+1] * b1;
- c2 -= lptr2[i+1] * b2;
- c3 -= lptr2[i+1] * b3;
- d0 -= lptr3[i+0] * a0;
- d1 -= lptr3[i+0] * a1;
- d2 -= lptr3[i+0] * a2;
- d3 -= lptr3[i+0] * a3;
- d0 -= lptr3[i+1] * b0;
- d1 -= lptr3[i+1] * b1;
- d2 -= lptr3[i+1] * b2;
- d3 -= lptr3[i+1] * b3;
- d0 -= lptr3[i+2] * c0;
- d1 -= lptr3[i+2] * c1;
- d2 -= lptr3[i+2] * c2;
- d3 -= lptr3[i+2] * c3;
- x[i+0] = a0 + a1 + a2 + a3;
- x[i+1] = b0 + b1 + b2 + b3;
- x[i+2] = c0 + c1 + c2 + c3;
- x[i+3] = d0 + d1 + d2 + d3;
- lptr += 4 * nc;
- }
- // go through any remaining rows
- for ( ; i < n; i++ ) {
- float sum = b[i];
- for ( int j = 0; j < i; j++ ) {
- sum -= lptr[j] * x[j];
- }
- x[i] = sum;
- lptr += nc;
- }
- #endif
- }
- /*
- ========================
- LowerTriangularSolveTranspose_SIMD
- Solves x in L'x = b for the n * n sub-matrix of L.
- * L has to be a lower triangular matrix with (implicit) ones on the diagonal
- * x == b is allowed
- ========================
- */
- static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, const float * b, const int n ) {
- int nc = L.GetNumColumns();
- assert( ( nc & 3 ) == 0 );
- int m = n;
- int r = n & 3;
- if ( ( m & 3 ) != 0 ) {
- const float * lptr = L.ToFloatPtr() + m * nc + m;
- if ( ( m & 3 ) == 1 ) {
- x[m-1] = b[m-1];
- m -= 1;
- } else if ( ( m & 3 ) == 2 ) {
- x[m-1] = b[m-1];
- x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1];
- m -= 2;
- } else {
- x[m-1] = b[m-1];
- x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1];
- x[m-3] = b[m-3] - lptr[-1*nc-3] * x[m-1] - lptr[-2*nc-3] * x[m-2];
- m -= 3;
- }
- }
- const float * lptr = L.ToFloatPtr() + m * nc + m - 4;
- float * xptr = x + m;
- #ifdef ID_WIN_X86_SSE2_INTRIN
- // process 4 rows at a time
- for ( int i = m; i >= 4; i -= 4 ) {
- assert_16_byte_aligned( b );
- assert_16_byte_aligned( xptr );
- assert_16_byte_aligned( lptr );
- __m128 s0 = _mm_load_ps( &b[i-4] );
- __m128 s1 = (__m128 &)SIMD_SP_zero;
- __m128 s2 = (__m128 &)SIMD_SP_zero;
- __m128 s3 = (__m128 &)SIMD_SP_zero;
- // process 4x4 blocks
- const float * xptr2 = xptr; // x + i;
- const float * lptr2 = lptr; // ptr = L[i] + i - 4;
- for ( int j = i; j < m; j += 4 ) {
- __m128 xj = _mm_load_ps( xptr2 );
- s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_splat_ps( xj, 0 ), _mm_load_ps( lptr2 + 0 * nc ) ) );
- s1 = _mm_sub_ps( s1, _mm_mul_ps( _mm_splat_ps( xj, 1 ), _mm_load_ps( lptr2 + 1 * nc ) ) );
- s2 = _mm_sub_ps( s2, _mm_mul_ps( _mm_splat_ps( xj, 2 ), _mm_load_ps( lptr2 + 2 * nc ) ) );
- s3 = _mm_sub_ps( s3, _mm_mul_ps( _mm_splat_ps( xj, 3 ), _mm_load_ps( lptr2 + 3 * nc ) ) );
- lptr2 += 4 * nc;
- xptr2 += 4;
- }
- for ( int j = 0; j < r; j++ ) {
- s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_load_ps( lptr2 ), _mm_load1_ps( &xptr2[j] ) ) );
- lptr2 += nc;
- }
- s0 = _mm_add_ps( s0, s1 );
- s2 = _mm_add_ps( s2, s3 );
- s0 = _mm_add_ps( s0, s2 );
- // process left over of the 4 rows
- lptr -= 4 * nc;
- __m128 t0 = _mm_and_ps( _mm_load_ps( lptr + 3 * nc ), (__m128 &)SIMD_SP_clearLast1 );
- __m128 t1 = _mm_and_ps( _mm_load_ps( lptr + 2 * nc ), (__m128 &)SIMD_SP_clearLast2 );
- __m128 t2 = _mm_load_ss( lptr + 1 * nc );
- s0 = _mm_sub_ps( s0, _mm_mul_ps( t0, _mm_splat_ps( s0, 3 ) ) );
- s0 = _mm_sub_ps( s0, _mm_mul_ps( t1, _mm_splat_ps( s0, 2 ) ) );
- s0 = _mm_sub_ps( s0, _mm_mul_ps( t2, _mm_splat_ps( s0, 1 ) ) );
- // store result
- _mm_store_ps( &xptr[-4], s0 );
- // update pointers for next four rows
- lptr -= 4;
- xptr -= 4;
- }
- #else
- // process 4 rows at a time
- for ( int i = m; i >= 4; i -= 4 ) {
- assert_16_byte_aligned( b );
- assert_16_byte_aligned( xptr );
- assert_16_byte_aligned( lptr );
- float s0 = b[i-4];
- float s1 = b[i-3];
- float s2 = b[i-2];
- float s3 = b[i-1];
- // process 4x4 blocks
- const float * xptr2 = xptr; // x + i;
- const float * lptr2 = lptr; // ptr = L[i] + i - 4;
- for ( int j = i; j < m; j += 4 ) {
- float t0 = xptr2[0];
- s0 -= lptr2[0] * t0;
- s1 -= lptr2[1] * t0;
- s2 -= lptr2[2] * t0;
- s3 -= lptr2[3] * t0;
- lptr2 += nc;
- float t1 = xptr2[1];
- s0 -= lptr2[0] * t1;
- s1 -= lptr2[1] * t1;
- s2 -= lptr2[2] * t1;
- s3 -= lptr2[3] * t1;
- lptr2 += nc;
- float t2 = xptr2[2];
- s0 -= lptr2[0] * t2;
- s1 -= lptr2[1] * t2;
- s2 -= lptr2[2] * t2;
- s3 -= lptr2[3] * t2;
- lptr2 += nc;
- float t3 = xptr2[3];
- s0 -= lptr2[0] * t3;
- s1 -= lptr2[1] * t3;
- s2 -= lptr2[2] * t3;
- s3 -= lptr2[3] * t3;
- lptr2 += nc;
- xptr2 += 4;
- }
- for ( int j = 0; j < r; j++ ) {
- float t = xptr2[j];
- s0 -= lptr2[0] * t;
- s1 -= lptr2[1] * t;
- s2 -= lptr2[2] * t;
- s3 -= lptr2[3] * t;
- lptr2 += nc;
- }
- // process left over of the 4 rows
- lptr -= nc;
- s0 -= lptr[0] * s3;
- s1 -= lptr[1] * s3;
- s2 -= lptr[2] * s3;
- lptr -= nc;
- s0 -= lptr[0] * s2;
- s1 -= lptr[1] * s2;
- lptr -= nc;
- s0 -= lptr[0] * s1;
- lptr -= nc;
- // store result
- xptr[-4] = s0;
- xptr[-3] = s1;
- xptr[-2] = s2;
- xptr[-1] = s3;
- // update pointers for next four rows
- lptr -= 4;
- xptr -= 4;
- }
- #endif
- }
- /*
- ========================
- UpperTriangularSolve_SIMD
- Solves x in Ux = b for the n * n sub-matrix of U.
- * U has to be a upper triangular matrix
- * invDiag is the reciprical of the diagonal of the upper triangular matrix.
- * x == b is allowed
- ========================
- */
- static void UpperTriangularSolve_SIMD( const idMatX & U, const float * invDiag, float * x, const float * b, const int n ) {
- for ( int i = n - 1; i >= 0; i-- ) {
- float sum = b[i];
- const float * uptr = U[i];
- for ( int j = i + 1; j < n; j++ ) {
- sum -= uptr[j] * x[j];
- }
- x[i] = sum * invDiag[i];
- }
- }
- /*
- ========================
- LU_Factor_SIMD
- In-place factorization LU of the n * n sub-matrix of mat. The reciprocal of the diagonal
- elements of U are stored in invDiag. No pivoting is used.
- ========================
- */
- static bool LU_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
- for ( int i = 0; i < n; i++ ) {
- float d1 = mat[i][i];
- if ( fabs( d1 ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- return false;
- }
- invDiag[i] = d1 = 1.0f / d1;
- float * ptr1 = mat[i];
- for ( int j = i + 1; j < n; j++ ) {
- float * ptr2 = mat[j];
- float d2 = ptr2[i] * d1;
- ptr2[i] = d2;
- int k;
- for ( k = n - 1; k > i + 15; k -= 16 ) {
- ptr2[k-0] -= d2 * ptr1[k-0];
- ptr2[k-1] -= d2 * ptr1[k-1];
- ptr2[k-2] -= d2 * ptr1[k-2];
- ptr2[k-3] -= d2 * ptr1[k-3];
- ptr2[k-4] -= d2 * ptr1[k-4];
- ptr2[k-5] -= d2 * ptr1[k-5];
- ptr2[k-6] -= d2 * ptr1[k-6];
- ptr2[k-7] -= d2 * ptr1[k-7];
- ptr2[k-8] -= d2 * ptr1[k-8];
- ptr2[k-9] -= d2 * ptr1[k-9];
- ptr2[k-10] -= d2 * ptr1[k-10];
- ptr2[k-11] -= d2 * ptr1[k-11];
- ptr2[k-12] -= d2 * ptr1[k-12];
- ptr2[k-13] -= d2 * ptr1[k-13];
- ptr2[k-14] -= d2 * ptr1[k-14];
- ptr2[k-15] -= d2 * ptr1[k-15];
- }
- switch( k - i ) {
- NODEFAULT;
- case 15: ptr2[k-14] -= d2 * ptr1[k-14];
- case 14: ptr2[k-13] -= d2 * ptr1[k-13];
- case 13: ptr2[k-12] -= d2 * ptr1[k-12];
- case 12: ptr2[k-11] -= d2 * ptr1[k-11];
- case 11: ptr2[k-10] -= d2 * ptr1[k-10];
- case 10: ptr2[k-9] -= d2 * ptr1[k-9];
- case 9: ptr2[k-8] -= d2 * ptr1[k-8];
- case 8: ptr2[k-7] -= d2 * ptr1[k-7];
- case 7: ptr2[k-6] -= d2 * ptr1[k-6];
- case 6: ptr2[k-5] -= d2 * ptr1[k-5];
- case 5: ptr2[k-4] -= d2 * ptr1[k-4];
- case 4: ptr2[k-3] -= d2 * ptr1[k-3];
- case 3: ptr2[k-2] -= d2 * ptr1[k-2];
- case 2: ptr2[k-1] -= d2 * ptr1[k-1];
- case 1: ptr2[k-0] -= d2 * ptr1[k-0];
- case 0: break;
- }
- }
- }
- return true;
- }
- /*
- ========================
- LDLT_Factor_SIMD
- In-place factorization LDL' of the n * n sub-matrix of mat. The reciprocal of the diagonal
- elements are stored in invDiag.
- NOTE: The number of columns of mat must be a multiple of 4.
- ========================
- */
- static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
- float s0, s1, s2, d;
- float * v = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) );
- float * diag = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) );
- float * invDiagPtr = invDiag.ToFloatPtr();
- int nc = mat.GetNumColumns();
- assert( ( nc & 3 ) == 0 );
- if ( n <= 0 ) {
- return true;
- }
- float * mptr = mat[0];
- float sum = mptr[0];
- if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- return false;
- }
- diag[0] = sum;
- invDiagPtr[0] = d = 1.0f / sum;
- if ( n <= 1 ) {
- return true;
- }
- mptr = mat[0];
- for ( int j = 1; j < n; j++ ) {
- mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
- }
- mptr = mat[1];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- sum = mptr[1] - s0;
- if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- return false;
- }
- mat[1][1] = sum;
- diag[1] = sum;
- invDiagPtr[1] = d = 1.0f / sum;
- if ( n <= 2 ) {
- return true;
- }
- mptr = mat[0];
- for ( int j = 2; j < n; j++ ) {
- mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
- }
- mptr = mat[2];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- sum = mptr[2] - s0 - s1;
- if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- return false;
- }
- mat[2][2] = sum;
- diag[2] = sum;
- invDiagPtr[2] = d = 1.0f / sum;
- if ( n <= 3 ) {
- return true;
- }
- mptr = mat[0];
- for ( int j = 3; j < n; j++ ) {
- mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
- }
- mptr = mat[3];
- v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
- v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
- v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
- sum = mptr[3] - s0 - s1 - s2;
- if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- return false;
- }
- mat[3][3] = sum;
- diag[3] = sum;
- invDiagPtr[3] = d = 1.0f / sum;
- if ( n <= 4 ) {
- return true;
- }
- mptr = mat[0];
- for ( int j = 4; j < n; j++ ) {
- mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
- }
- #ifdef ID_WIN_X86_SSE2_INTRIN
- __m128 vzero = _mm_setzero_ps();
- for ( int i = 4; i < n; i += 4 ) {
- _mm_store_ps( diag + i, vzero );
- }
- for ( int i = 4; i < n; i++ ) {
- mptr = mat[i];
- assert_16_byte_aligned( v );
- assert_16_byte_aligned( mptr );
- assert_16_byte_aligned( diag );
- __m128 m0 = _mm_load_ps( mptr + 0 );
- __m128 d0 = _mm_load_ps( diag + 0 );
- __m128 v0 = _mm_mul_ps( d0, m0 );
- __m128 t0 = _mm_load_ss( mptr + i );
- t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
- _mm_store_ps( v + 0, v0 );
- int k = 4;
- for ( ; k < i - 3; k += 4 ) {
- m0 = _mm_load_ps( mptr + k );
- d0 = _mm_load_ps( diag + k );
- v0 = _mm_mul_ps( d0, m0 );
- t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
- _mm_store_ps( v + k, v0 );
- }
- __m128 mask = (__m128 &) SIMD_SP_indexedEndMask[i & 3];
- m0 = _mm_and_ps( _mm_load_ps( mptr + k ), mask );
- d0 = _mm_load_ps( diag + k );
- v0 = _mm_mul_ps( d0, m0 );
- t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
- _mm_store_ps( v + k, v0 );
- t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
- t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
- __m128 tiny = _mm_and_ps( _mm_cmpeq_ps( t0, SIMD_SP_zero ), SIMD_SP_tiny );
- t0 = _mm_or_ps( t0, tiny );
- _mm_store_ss( mptr + i, t0 );
- _mm_store_ss( diag + i, t0 );
- __m128 d = _mm_rcp32_ps( t0 );
- _mm_store_ss( invDiagPtr + i, d );
- if ( i + 1 >= n ) {
- return true;
- }
- int j = i + 1;
- for ( ; j < n - 3; j += 4 ) {
- float * ra = mat[j+0];
- float * rb = mat[j+1];
- float * rc = mat[j+2];
- float * rd = mat[j+3];
- assert_16_byte_aligned( v );
- assert_16_byte_aligned( ra );
- assert_16_byte_aligned( rb );
- assert_16_byte_aligned( rc );
- assert_16_byte_aligned( rd );
- __m128 va = _mm_load_ss( ra + i );
- __m128 vb = _mm_load_ss( rb + i );
- __m128 vc = _mm_load_ss( rc + i );
- __m128 vd = _mm_load_ss( rd + i );
- __m128 v0 = _mm_load_ps( v + 0 );
- va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + 0 ), v0 ) );
- vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + 0 ), v0 ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + 0 ), v0 ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + 0 ), v0 ) );
- int k = 4;
- for ( ; k < i - 3; k += 4 ) {
- v0 = _mm_load_ps( v + k );
- va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + k ), v0 ) );
- vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + k ), v0 ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + k ), v0 ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + k ), v0 ) );
- }
- v0 = _mm_load_ps( v + k );
- va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( ra + k ), mask ), v0 ) );
- vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rb + k ), mask ), v0 ) );
- vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rc + k ), mask ), v0 ) );
- vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rd + k ), mask ), v0 ) );
- __m128 ta = _mm_unpacklo_ps( va, vc ); // x0, z0, x1, z1
- __m128 tb = _mm_unpackhi_ps( va, vc ); // x2, z2, x3, z3
- __m128 tc = _mm_unpacklo_ps( vb, vd ); // y0, w0, y1, w1
- __m128 td = _mm_unpackhi_ps( vb, vd ); // y2, w2, y3, w3
- va = _mm_unpacklo_ps( ta, tc ); // x0, y0, z0, w0
- vb = _mm_unpackhi_ps( ta, tc ); // x1, y1, z1, w1
- vc = _mm_unpacklo_ps( tb, td ); // x2, y2, z2, w2
- vd = _mm_unpackhi_ps( tb, td ); // x3, y3, z3, w3
- va = _mm_add_ps( va, vb );
- vc = _mm_add_ps( vc, vd );
- va = _mm_add_ps( va, vc );
- va = _mm_mul_ps( va, d );
- _mm_store_ss( ra + i, _mm_splat_ps( va, 0 ) );
- _mm_store_ss( rb + i, _mm_splat_ps( va, 1 ) );
- _mm_store_ss( rc + i, _mm_splat_ps( va, 2 ) );
- _mm_store_ss( rd + i, _mm_splat_ps( va, 3 ) );
- }
- for ( ; j < n; j++ ) {
- float * mptr = mat[j];
- assert_16_byte_aligned( v );
- assert_16_byte_aligned( mptr );
- __m128 va = _mm_load_ss( mptr + i );
- __m128 v0 = _mm_load_ps( v + 0 );
- va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + 0 ), v0 ) );
- int k = 4;
- for ( ; k < i - 3; k += 4 ) {
- v0 = _mm_load_ps( v + k );
- va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + k ), v0 ) );
- }
- v0 = _mm_load_ps( v + k );
- va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( mptr + k ), mask ), v0 ) );
- va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
- va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
- va = _mm_mul_ps( va, d );
- _mm_store_ss( mptr + i, va );
- }
- }
- return true;
- #else
- for ( int i = 4; i < n; i += 4 ) {
- diag[i+0] = 0.0f;
- diag[i+1] = 0.0f;
- diag[i+2] = 0.0f;
- diag[i+3] = 0.0f;
- }
- for ( int i = 4; i < n; i++ ) {
- mptr = mat[i];
- assert_16_byte_aligned( v );
- assert_16_byte_aligned( mptr );
- assert_16_byte_aligned( diag );
- v[0] = diag[0] * mptr[0];
- v[1] = diag[1] * mptr[1];
- v[2] = diag[2] * mptr[2];
- v[3] = diag[3] * mptr[3];
- float t0 = - mptr[0] * v[0] + mptr[i];
- float t1 = - mptr[1] * v[1];
- float t2 = - mptr[2] * v[2];
- float t3 = - mptr[3] * v[3];
- int k = 4;
- for ( ; k < i - 3; k += 4 ) {
- v[k+0] = diag[k+0] * mptr[k+0];
- v[k+1] = diag[k+1] * mptr[k+1];
- v[k+2] = diag[k+2] * mptr[k+2];
- v[k+3] = diag[k+3] * mptr[k+3];
- t0 -= mptr[k+0] * v[k+0];
- t1 -= mptr[k+1] * v[k+1];
- t2 -= mptr[k+2] * v[k+2];
- t3 -= mptr[k+3] * v[k+3];
- }
- float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f;
- float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f;
- float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f;
- float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f;
- v[k+0] = diag[k+0] * m0;
- v[k+1] = diag[k+1] * m1;
- v[k+2] = diag[k+2] * m2;
- v[k+3] = diag[k+3] * m3;
- t0 -= m0 * v[k+0];
- t1 -= m1 * v[k+1];
- t2 -= m2 * v[k+2];
- t3 -= m3 * v[k+3];
- sum = t0 + t1 + t2 + t3;
- if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- return false;
- }
- mat[i][i] = sum;
- diag[i] = sum;
- invDiagPtr[i] = d = 1.0f / sum;
- if ( i + 1 >= n ) {
- return true;
- }
- int j = i + 1;
- for ( ; j < n - 3; j += 4 ) {
- float * ra = mat[j+0];
- float * rb = mat[j+1];
- float * rc = mat[j+2];
- float * rd = mat[j+3];
- assert_16_byte_aligned( v );
- assert_16_byte_aligned( ra );
- assert_16_byte_aligned( rb );
- assert_16_byte_aligned( rc );
- assert_16_byte_aligned( rd );
- float a0 = - ra[0] * v[0] + ra[i];
- float a1 = - ra[1] * v[1];
- float a2 = - ra[2] * v[2];
- float a3 = - ra[3] * v[3];
- float b0 = - rb[0] * v[0] + rb[i];
- float b1 = - rb[1] * v[1];
- float b2 = - rb[2] * v[2];
- float b3 = - rb[3] * v[3];
- float c0 = - rc[0] * v[0] + rc[i];
- float c1 = - rc[1] * v[1];
- float c2 = - rc[2] * v[2];
- float c3 = - rc[3] * v[3];
- float d0 = - rd[0] * v[0] + rd[i];
- float d1 = - rd[1] * v[1];
- float d2 = - rd[2] * v[2];
- float d3 = - rd[3] * v[3];
- int k = 4;
- for ( ; k < i - 3; k += 4 ) {
- a0 -= ra[k+0] * v[k+0];
- a1 -= ra[k+1] * v[k+1];
- a2 -= ra[k+2] * v[k+2];
- a3 -= ra[k+3] * v[k+3];
- b0 -= rb[k+0] * v[k+0];
- b1 -= rb[k+1] * v[k+1];
- b2 -= rb[k+2] * v[k+2];
- b3 -= rb[k+3] * v[k+3];
- c0 -= rc[k+0] * v[k+0];
- c1 -= rc[k+1] * v[k+1];
- c2 -= rc[k+2] * v[k+2];
- c3 -= rc[k+3] * v[k+3];
- d0 -= rd[k+0] * v[k+0];
- d1 -= rd[k+1] * v[k+1];
- d2 -= rd[k+2] * v[k+2];
- d3 -= rd[k+3] * v[k+3];
- }
- float ra0 = ( i - k > 0 ) ? ra[k+0] : 0.0f;
- float ra1 = ( i - k > 1 ) ? ra[k+1] : 0.0f;
- float ra2 = ( i - k > 2 ) ? ra[k+2] : 0.0f;
- float ra3 = ( i - k > 3 ) ? ra[k+3] : 0.0f;
- float rb0 = ( i - k > 0 ) ? rb[k+0] : 0.0f;
- float rb1 = ( i - k > 1 ) ? rb[k+1] : 0.0f;
- float rb2 = ( i - k > 2 ) ? rb[k+2] : 0.0f;
- float rb3 = ( i - k > 3 ) ? rb[k+3] : 0.0f;
- float rc0 = ( i - k > 0 ) ? rc[k+0] : 0.0f;
- float rc1 = ( i - k > 1 ) ? rc[k+1] : 0.0f;
- float rc2 = ( i - k > 2 ) ? rc[k+2] : 0.0f;
- float rc3 = ( i - k > 3 ) ? rc[k+3] : 0.0f;
- float rd0 = ( i - k > 0 ) ? rd[k+0] : 0.0f;
- float rd1 = ( i - k > 1 ) ? rd[k+1] : 0.0f;
- float rd2 = ( i - k > 2 ) ? rd[k+2] : 0.0f;
- float rd3 = ( i - k > 3 ) ? rd[k+3] : 0.0f;
- a0 -= ra0 * v[k+0];
- a1 -= ra1 * v[k+1];
- a2 -= ra2 * v[k+2];
- a3 -= ra3 * v[k+3];
- b0 -= rb0 * v[k+0];
- b1 -= rb1 * v[k+1];
- b2 -= rb2 * v[k+2];
- b3 -= rb3 * v[k+3];
- c0 -= rc0 * v[k+0];
- c1 -= rc1 * v[k+1];
- c2 -= rc2 * v[k+2];
- c3 -= rc3 * v[k+3];
- d0 -= rd0 * v[k+0];
- d1 -= rd1 * v[k+1];
- d2 -= rd2 * v[k+2];
- d3 -= rd3 * v[k+3];
- ra[i] = ( a0 + a1 + a2 + a3 ) * d;
- rb[i] = ( b0 + b1 + b2 + b3 ) * d;
- rc[i] = ( c0 + c1 + c2 + c3 ) * d;
- rd[i] = ( d0 + d1 + d2 + d3 ) * d;
- }
- for ( ; j < n; j++ ) {
- mptr = mat[j];
- assert_16_byte_aligned( v );
- assert_16_byte_aligned( mptr );
- float a0 = - mptr[0] * v[0] + mptr[i];
- float a1 = - mptr[1] * v[1];
- float a2 = - mptr[2] * v[2];
- float a3 = - mptr[3] * v[3];
- int k = 4;
- for ( ; k < i - 3; k += 4 ) {
- a0 -= mptr[k+0] * v[k+0];
- a1 -= mptr[k+1] * v[k+1];
- a2 -= mptr[k+2] * v[k+2];
- a3 -= mptr[k+3] * v[k+3];
- }
- float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f;
- float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f;
- float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f;
- float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f;
- a0 -= m0 * v[k+0];
- a1 -= m1 * v[k+1];
- a2 -= m2 * v[k+2];
- a3 -= m3 * v[k+3];
- mptr[i] = ( a0 + a1 + a2 + a3 ) * d;
- }
- }
- return true;
- #endif
- }
- /*
- ========================
- GetMaxStep_SIMD
- ========================
- */
- static void GetMaxStep_SIMD( const float * f, const float * a, const float * delta_f, const float * delta_a,
- const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped,
- int d, float dir, float & maxStep, int & limit, int & limitSide ) {
- #ifdef ID_WIN_X86_SSE2_INTRIN
- __m128 vMaxStep;
- __m128i vLimit;
- __m128i vLimitSide;
- // default to a full step for the current variable
- {
- __m128 vNegAccel = _mm_xor_ps( _mm_load1_ps( a + d ), (__m128 &) SIMD_SP_signBit );
- __m128 vDeltaAccel = _mm_load1_ps( delta_a + d );
- __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaAccel, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
- __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
- vMaxStep = _mm_sel_ps( SIMD_SP_zero, vStep, vM0 );
- vLimit = _mm_shuffle_epi32( _mm_cvtsi32_si128( d ), 0 );
- vLimitSide = (__m128i &) SIMD_DW_zero;
- }
- // test the current variable
- {
- __m128 vDeltaForce = _mm_load1_ps( & dir );
- __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
- __m128 vForceLimit = _mm_sel_ps( _mm_load1_ps( hi + d ), _mm_load1_ps( lo + d ), vSign );
- __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load1_ps( f + d ) ), vDeltaForce );
- __m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
- __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
- __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
- __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
- __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
- vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
- vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
- }
- // test the clamped bounded variables
- {
- __m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numUnbounded & 3];
- __m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numUnbounded ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index );
- int i = numUnbounded & ~3;
- for ( ; i < numClamped - 3; i += 4 ) {
- __m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), mask );
- __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
- __m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign );
- __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
- __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) );
- __m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
- __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
- __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
- __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
- vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
- vLimit = _mm_sel_si128( vLimit, index, vM3 );
- vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
- mask = (__m128 &) SIMD_SP_indexedStartMask[0];
- index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four );
- }
- __m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[numClamped & 3] ) );
- __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
- __m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign );
- __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
- __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) );
- __m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
- __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
- __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
- __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
- vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
- vLimit = _mm_sel_si128( vLimit, index, vM3 );
- vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
- }
- // test the not clamped bounded variables
- {
- __m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numClamped & 3];
- __m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numClamped ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index );
- int i = numClamped & ~3;
- for ( ; i < d - 3; i += 4 ) {
- __m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit );
- __m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), mask );
- __m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) );
- __m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
- __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
- __m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) );
- __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
- __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
- vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
- vLimit = _mm_sel_si128( vLimit, index, vM3 );
- vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) );
- mask = (__m128 &) SIMD_SP_indexedStartMask[0];
- index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four );
- }
- __m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit );
- __m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[d & 3] ) );
- __m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) );
- __m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
- __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
- __m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) );
- __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
- __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
- vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
- vLimit = _mm_sel_si128( vLimit, index, vM3 );
- vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) );
- }
- {
- __m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 1, 0, 3, 2 ) );
- __m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 1, 0, 3, 2 ) );
- __m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 1, 0, 3, 2 ) );
- __m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep );
- vMaxStep = _mm_min_ps( vMaxStep, tMaxStep );
- vLimit = _mm_sel_si128( vLimit, tLimit, mask );
- vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask );
- }
- {
- __m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 2, 3, 0, 1 ) );
- __m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 2, 3, 0, 1 ) );
- __m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 2, 3, 0, 1 ) );
- __m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep );
- vMaxStep = _mm_min_ps( vMaxStep, tMaxStep );
- vLimit = _mm_sel_si128( vLimit, tLimit, mask );
- vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask );
- }
- _mm_store_ss( & maxStep, vMaxStep );
- limit = _mm_cvtsi128_si32( vLimit );
- limitSide = _mm_cvtsi128_si32( vLimitSide );
- #else
- // default to a full step for the current variable
- {
- float negAccel = -a[d];
- float deltaAccel = delta_a[d];
- int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON );
- float step = negAccel / ( m0 ? deltaAccel : 1.0f );
- maxStep = m0 ? step : 0.0f;
- limit = d;
- limitSide = 0;
- }
- // test the current variable
- {
- float deltaForce = dir;
- float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d];
- float step = ( forceLimit - f[d] ) / deltaForce;
- int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
- int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
- int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
- int m2 = ( step < maxStep );
- int m3 = ( m0 & m1 & m2 );
- maxStep = m3 ? step : maxStep;
- limit = m3 ? d : limit;
- limitSide = m3 ? setSide : limitSide;
- }
- // test the clamped bounded variables
- for ( int i = numUnbounded; i < numClamped; i++ ) {
- float deltaForce = delta_f[i];
- float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i];
- int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
- float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f );
- int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
- int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
- int m2 = ( step < maxStep );
- int m3 = ( m0 & m1 & m2 );
- maxStep = m3 ? step : maxStep;
- limit = m3 ? i : limit;
- limitSide = m3 ? setSide : limitSide;
- }
- // test the not clamped bounded variables
- for ( int i = numClamped; i < d; i++ ) {
- float negAccel = -a[i];
- float deltaAccel = delta_a[i];
- int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON );
- float step = negAccel / ( m0 ? deltaAccel : 1.0f );
- int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON );
- int m2 = ( step < maxStep );
- int m3 = ( m0 & m1 & m2 );
- maxStep = m3 ? step : maxStep;
- limit = m3 ? i : limit;
- limitSide = m3 ? 0 : limitSide;
- }
- #endif
- }
- /*
- ================================================================================================
- SIMD test code
- ================================================================================================
- */
- //#define ENABLE_TEST_CODE
- #ifdef ENABLE_TEST_CODE
- #define TEST_TRIANGULAR_SOLVE_SIMD_EPSILON 0.1f
- #define TEST_TRIANGULAR_SOLVE_SIZE 50
- #define TEST_FACTOR_SIMD_EPSILON 0.1f
- #define TEST_FACTOR_SOLVE_SIZE 50
- #define NUM_TESTS 50
- /*
- ========================
- PrintClocks
- ========================
- */
- static void PrintClocks( const char * string, int dataCount, int64 clocks, int64 otherClocks = 0 ) {
- idLib::Printf( string );
- for ( int i = idStr::LengthWithoutColors(string); i < 48; i++ ) {
- idLib::Printf(" ");
- }
- if ( clocks && otherClocks ) {
- int p = 0;
- if ( clocks <= otherClocks ) {
- p = idMath::Ftoi( (float) ( otherClocks - clocks ) * 100.0f / (float) otherClocks );
- } else {
- p = - idMath::Ftoi( (float) ( clocks - otherClocks ) * 100.0f / (float) clocks );
- }
- idLib::Printf( "c = %4d, clcks = %5lld, %d%%\n", dataCount, clocks, p );
- } else {
- idLib::Printf( "c = %4d, clcks = %5lld\n", dataCount, clocks );
- }
- }
- /*
- ========================
- DotProduct_Test
- ========================
- */
- static void DotProduct_Test() {
- ALIGN16( float fsrc0[TEST_TRIANGULAR_SOLVE_SIZE + 1]; )
- ALIGN16( float fsrc1[TEST_TRIANGULAR_SOLVE_SIZE + 1]; )
- idRandom srnd( 13 );
- for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
- fsrc0[i] = srnd.CRandomFloat() * 10.0f;
- fsrc1[i] = srnd.CRandomFloat() * 10.0f;
- }
- idTimer timer;
- for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
- float dot1 = DotProduct_Generic( fsrc0, fsrc1, i );
- int64 clocksGeneric = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j;
- timer.Clear();
- timer.Start();
- dot1 = DotProduct_Generic( fsrc0, fsrc1, i );
- timer.Stop();
- clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
- }
- PrintClocks( va( "DotProduct_Generic %d", i ), 1, clocksGeneric );
- float dot2 = DotProduct_SIMD( fsrc0, fsrc1, i );
- int64 clocksSIMD = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j;
- timer.Clear();
- timer.Start();
- dot2 = DotProduct_SIMD( fsrc0, fsrc1, i );
- timer.Stop();
- clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
- }
- const char * result = idMath::Fabs( dot1 - dot2 ) < 1e-4f ? "ok" : S_COLOR_RED"X";
- PrintClocks( va( "DotProduct_SIMD %d %s", i, result ), 1, clocksSIMD, clocksGeneric );
- }
- }
- /*
- ========================
- LowerTriangularSolve_Test
- ========================
- */
- static void LowerTriangularSolve_Test() {
- idMatX L;
- idVecX x, b, tst;
- int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3;
- L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
- x.SetSize( paddedSize );
- b.Random( paddedSize, 0, -1.0f, 1.0f );
- idTimer timer;
- const int skip = 0;
- for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
- x.Zero( i );
- LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
- int64 clocksGeneric = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- timer.Clear();
- timer.Start();
- LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
- timer.Stop();
- clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
- }
- tst = x;
- x.Zero();
- PrintClocks( va( "LowerTriangularSolve_Generic %dx%d", i, i ), 1, clocksGeneric );
- LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
- int64 clocksSIMD = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- timer.Clear();
- timer.Start();
- LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
- timer.Stop();
- clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
- }
- const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
- PrintClocks( va( "LowerTriangularSolve_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
- }
- }
- /*
- ========================
- LowerTriangularSolveTranspose_Test
- ========================
- */
- static void LowerTriangularSolveTranspose_Test() {
- idMatX L;
- idVecX x, b, tst;
- int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3;
- L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
- x.SetSize( paddedSize );
- b.Random( paddedSize, 0, -1.0f, 1.0f );
- idTimer timer;
- for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
- x.Zero( i );
- LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
- int64 clocksGeneric = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- timer.Clear();
- timer.Start();
- LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
- timer.Stop();
- clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
- }
- tst = x;
- x.Zero();
- PrintClocks( va( "LowerTriangularSolveTranspose_Generic %dx%d", i, i ), 1, clocksGeneric );
- LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
- int64 clocksSIMD = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- timer.Clear();
- timer.Start();
- LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
- timer.Stop();
- clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
- }
- const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
- PrintClocks( va( "LowerTriangularSolveTranspose_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
- }
- }
- /*
- ========================
- LDLT_Factor_Test
- ========================
- */
- static void LDLT_Factor_Test() {
- idMatX src, original, mat1, mat2;
- idVecX invDiag1, invDiag2;
- int paddedSize = ( TEST_FACTOR_SOLVE_SIZE + 3 ) & ~3;
- original.SetSize( paddedSize, paddedSize );
- src.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
- src.TransposeMultiply( original, src );
- idTimer timer;
- for ( int i = 1; i < TEST_FACTOR_SOLVE_SIZE; i++ ) {
- int64 clocksGeneric = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- mat1 = original;
- invDiag1.Zero( TEST_FACTOR_SOLVE_SIZE );
- timer.Clear();
- timer.Start();
- LDLT_Factor_Generic( mat1, invDiag1, i );
- timer.Stop();
- clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
- }
- PrintClocks( va( "LDLT_Factor_Generic %dx%d", i, i ), 1, clocksGeneric );
- int64 clocksSIMD = 0xFFFFFFFFFFFF;
- for ( int j = 0; j < NUM_TESTS; j++ ) {
- mat2 = original;
- invDiag2.Zero( TEST_FACTOR_SOLVE_SIZE );
- timer.Clear();
- timer.Start();
- LDLT_Factor_SIMD( mat2, invDiag2, i );
- timer.Stop();
- clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
- }
- const char * result = mat1.Compare( mat2, TEST_FACTOR_SIMD_EPSILON ) && invDiag1.Compare( invDiag2, TEST_FACTOR_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
- PrintClocks( va( "LDLT_Factor_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
- }
- }
- #endif
- #define Multiply Multiply_SIMD
- #define MultiplyAdd MultiplyAdd_SIMD
- #define BigDotProduct DotProduct_SIMD
- #define LowerTriangularSolve LowerTriangularSolve_SIMD
- #define LowerTriangularSolveTranspose LowerTriangularSolveTranspose_SIMD
- #define UpperTriangularSolve UpperTriangularSolve_SIMD
- #define LU_Factor LU_Factor_SIMD
- #define LDLT_Factor LDLT_Factor_SIMD
- #define GetMaxStep GetMaxStep_SIMD
- /*
- ================================================================================================
- idLCP_Square
- ================================================================================================
- */
- /*
- ================================================
- idLCP_Square
- ================================================
- */
- class idLCP_Square : public idLCP {
- public:
- virtual bool Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
- private:
- idMatX m; // original matrix
- idVecX b; // right hand side
- idVecX lo, hi; // low and high bounds
- idVecX f, a; // force and acceleration
- idVecX delta_f, delta_a; // delta force and delta acceleration
- idMatX clamped; // LU factored sub matrix for clamped variables
- idVecX diagonal; // reciprocal of diagonal of U of the LU factored sub matrix for clamped variables
- int numUnbounded; // number of unbounded variables
- int numClamped; // number of clamped variables
- float ** rowPtrs; // pointers to the rows of m
- int * boxIndex; // box index
- int * side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
- int * permuted; // index to keep track of the permutation
- bool padded; // set to true if the rows of the initial matrix are 16 byte padded
- bool FactorClamped();
- void SolveClamped( idVecX & x, const float * b );
- void Swap( int i, int j );
- void AddClamped( int r );
- void RemoveClamped( int r );
- void CalcForceDelta( int d, float dir );
- void CalcAccelDelta( int d );
- void ChangeForce( int d, float step );
- void ChangeAccel( int d, float step );
- };
- /*
- ========================
- idLCP_Square::FactorClamped
- ========================
- */
- bool idLCP_Square::FactorClamped() {
- for ( int i = 0; i < numClamped; i++ ) {
- memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
- }
- return LU_Factor( clamped, diagonal, numClamped );
- }
- /*
- ========================
- idLCP_Square::SolveClamped
- ========================
- */
- void idLCP_Square::SolveClamped( idVecX & x, const float * b ) {
- // solve L
- LowerTriangularSolve( clamped, x.ToFloatPtr(), b, numClamped, 0 );
- // solve U
- UpperTriangularSolve( clamped, diagonal.ToFloatPtr(), x.ToFloatPtr(), x.ToFloatPtr(), numClamped );
- }
- /*
- ========================
- idLCP_Square::Swap
- ========================
- */
- void idLCP_Square::Swap( int i, int j ) {
- if ( i == j ) {
- return;
- }
- SwapValues( rowPtrs[i], rowPtrs[j] );
- m.SwapColumns( i, j );
- b.SwapElements( i, j );
- lo.SwapElements( i, j );
- hi.SwapElements( i, j );
- a.SwapElements( i, j );
- f.SwapElements( i, j );
- if ( boxIndex != NULL ) {
- SwapValues( boxIndex[i], boxIndex[j] );
- }
- SwapValues( side[i], side[j] );
- SwapValues( permuted[i], permuted[j] );
- }
- /*
- ========================
- idLCP_Square::AddClamped
- ========================
- */
- void idLCP_Square::AddClamped( int r ) {
- assert( r >= numClamped );
- // add a row at the bottom and a column at the right of the factored
- // matrix for the clamped variables
- Swap( numClamped, r );
- // add row to L
- for ( int i = 0; i < numClamped; i++ ) {
- float sum = rowPtrs[numClamped][i];
- for ( int j = 0; j < i; j++ ) {
- sum -= clamped[numClamped][j] * clamped[j][i];
- }
- clamped[numClamped][i] = sum * diagonal[i];
- }
- // add column to U
- for ( int i = 0; i <= numClamped; i++ ) {
- float sum = rowPtrs[i][numClamped];
- for ( int j = 0; j < i; j++ ) {
- sum -= clamped[i][j] * clamped[j][numClamped];
- }
- clamped[i][numClamped] = sum;
- }
- diagonal[numClamped] = 1.0f / clamped[numClamped][numClamped];
- numClamped++;
- }
- /*
- ========================
- idLCP_Square::RemoveClamped
- ========================
- */
- void idLCP_Square::RemoveClamped( int r ) {
- if ( !verify( r < numClamped ) ) {
- // complete fail, most likely due to exceptional floating point values
- return;
- }
- numClamped--;
- // no need to swap and update the factored matrix when the last row and column are removed
- if ( r == numClamped ) {
- return;
- }
- float * y0 = (float *) _alloca16( numClamped * sizeof( float ) );
- float * z0 = (float *) _alloca16( numClamped * sizeof( float ) );
- float * y1 = (float *) _alloca16( numClamped * sizeof( float ) );
- float * z1 = (float *) _alloca16( numClamped * sizeof( float ) );
- // the row/column need to be subtracted from the factorization
- for ( int i = 0; i < numClamped; i++ ) {
- y0[i] = -rowPtrs[i][r];
- }
- memset( y1, 0, numClamped * sizeof( float ) );
- y1[r] = 1.0f;
- memset( z0, 0, numClamped * sizeof( float ) );
- z0[r] = 1.0f;
- for ( int i = 0; i < numClamped; i++ ) {
- z1[i] = -rowPtrs[r][i];
- }
- // swap the to be removed row/column with the last row/column
- Swap( r, numClamped );
- // the swapped last row/column need to be added to the factorization
- for ( int i = 0; i < numClamped; i++ ) {
- y0[i] += rowPtrs[i][r];
- }
- for ( int i = 0; i < numClamped; i++ ) {
- z1[i] += rowPtrs[r][i];
- }
- z1[r] = 0.0f;
- // update the beginning of the to be updated row and column
- for ( int i = 0; i < r; i++ ) {
- float p0 = y0[i];
- float beta1 = z1[i] * diagonal[i];
- clamped[i][r] += p0;
- for ( int j = i+1; j < numClamped; j++ ) {
- z1[j] -= beta1 * clamped[i][j];
- }
- for ( int j = i+1; j < numClamped; j++ ) {
- y0[j] -= p0 * clamped[j][i];
- }
- clamped[r][i] += beta1;
- }
- // update the lower right corner starting at r,r
- for ( int i = r; i < numClamped; i++ ) {
- float diag = clamped[i][i];
- float p0 = y0[i];
- float p1 = z0[i];
- diag += p0 * p1;
- if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
- diag = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- float beta0 = p1 / diag;
- float q0 = y1[i];
- float q1 = z1[i];
- diag += q0 * q1;
- if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
- diag = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- float d = 1.0f / diag;
- float beta1 = q1 * d;
- clamped[i][i] = diag;
- diagonal[i] = d;
- for ( int j = i+1; j < numClamped; j++ ) {
- d = clamped[i][j];
- d += p0 * z0[j];
- z0[j] -= beta0 * d;
- d += q0 * z1[j];
- z1[j] -= beta1 * d;
- clamped[i][j] = d;
- }
- for ( int j = i+1; j < numClamped; j++ ) {
- d = clamped[j][i];
- y0[j] -= p0 * d;
- d += beta0 * y0[j];
- y1[j] -= q0 * d;
- d += beta1 * y1[j];
- clamped[j][i] = d;
- }
- }
- return;
- }
- /*
- ========================
- idLCP_Square::CalcForceDelta
- Modifies this->delta_f.
- ========================
- */
- void idLCP_Square::CalcForceDelta( int d, float dir ) {
- delta_f[d] = dir;
- if ( numClamped <= 0 ) {
- return;
- }
- // get column d of matrix
- float * ptr = (float *) _alloca16( numClamped * sizeof( float ) );
- for ( int i = 0; i < numClamped; i++ ) {
- ptr[i] = rowPtrs[i][d];
- }
- // solve force delta
- SolveClamped( delta_f, ptr );
- // flip force delta based on direction
- if ( dir > 0.0f ) {
- ptr = delta_f.ToFloatPtr();
- for ( int i = 0; i < numClamped; i++ ) {
- ptr[i] = - ptr[i];
- }
- }
- }
- /*
- ========================
- idLCP_Square::CalcAccelDelta
- Modifies this->delta_a and uses this->delta_f.
- ========================
- */
- ID_INLINE void idLCP_Square::CalcAccelDelta( int d ) {
- // only the not clamped variables, including the current variable, can have a change in acceleration
- for ( int j = numClamped; j <= d; j++ ) {
- // only the clamped variables and the current variable have a force delta unequal zero
- float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
- delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
- }
- }
- /*
- ========================
- idLCP_Square::ChangeForce
- Modifies this->f and uses this->delta_f.
- ========================
- */
- ID_INLINE void idLCP_Square::ChangeForce( int d, float step ) {
- // only the clamped variables and current variable have a force delta unequal zero
- MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
- f[d] += step * delta_f[d];
- }
- /*
- ========================
- idLCP_Square::ChangeAccel
- Modifies this->a and uses this->delta_a.
- ========================
- */
- ID_INLINE void idLCP_Square::ChangeAccel( int d, float step ) {
- // only the not clamped variables, including the current variable, can have an acceleration unequal zero
- MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
- }
- /*
- ========================
- idLCP_Square::Solve
- ========================
- */
- bool idLCP_Square::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
- // true when the matrix rows are 16 byte padded
- padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
- assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
- assert( o_x.GetSize() == o_m.GetNumRows() );
- assert( o_b.GetSize() == o_m.GetNumRows() );
- assert( o_lo.GetSize() == o_m.GetNumRows() );
- assert( o_hi.GetSize() == o_m.GetNumRows() );
- // allocate memory for permuted input
- f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
- a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
- b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
- lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
- hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
- if ( o_boxIndex != NULL ) {
- boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
- memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
- } else {
- boxIndex = NULL;
- }
- // we override the const on o_m here but on exit the matrix is unchanged
- m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast<float *>(o_m[0]) );
- f.Zero();
- a.Zero();
- b = o_b;
- lo = o_lo;
- hi = o_hi;
- // pointers to the rows of m
- rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- rowPtrs[i] = m[i];
- }
- // tells if a variable is at the low boundary, high boundary or inbetween
- side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
- // index to keep track of the permutation
- permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- permuted[i] = i;
- }
- // permute input so all unbounded variables come first
- numUnbounded = 0;
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
- if ( numUnbounded != i ) {
- Swap( numUnbounded, i );
- }
- numUnbounded++;
- }
- }
- // permute input so all variables using the boxIndex come last
- int boxStartIndex = m.GetNumRows();
- if ( boxIndex ) {
- for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
- if ( boxIndex[i] >= 0 ) {
- boxStartIndex--;
- if ( boxStartIndex != i ) {
- Swap( boxStartIndex, i );
- }
- }
- }
- }
- // sub matrix for factorization
- clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) );
- diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- // all unbounded variables are clamped
- numClamped = numUnbounded;
- // if there are unbounded variables
- if ( numUnbounded ) {
- // factor and solve for unbounded variables
- if ( !FactorClamped() ) {
- idLib::Printf( "idLCP_Square::Solve: unbounded factorization failed\n" );
- return false;
- }
- SolveClamped( f, b.ToFloatPtr() );
- // if there are no bounded variables we are done
- if ( numUnbounded == m.GetNumRows() ) {
- o_x = f; // the vector is not permuted
- return true;
- }
- }
- int numIgnored = 0;
- // allocate for delta force and delta acceleration
- delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- // solve for bounded variables
- idStr failed;
- for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) {
- // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
- if ( i == boxStartIndex ) {
- for ( int j = 0; j < boxStartIndex; j++ ) {
- o_x[permuted[j]] = f[j];
- }
- for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) {
- float s = o_x[boxIndex[j]];
- if ( lo[j] != -idMath::INFINITY ) {
- lo[j] = - idMath::Fabs( lo[j] * s );
- }
- if ( hi[j] != idMath::INFINITY ) {
- hi[j] = idMath::Fabs( hi[j] * s );
- }
- }
- }
- // calculate acceleration for current variable
- float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i );
- a[i] = dot - b[i];
- // if already at the low boundary
- if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
- side[i] = -1;
- continue;
- }
- // if already at the high boundary
- if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
- side[i] = 1;
- continue;
- }
- // if inside the clamped region
- if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
- side[i] = 0;
- AddClamped( i );
- continue;
- }
- // drive the current variable into a valid region
- int n = 0;
- for ( ; n < maxIterations; n++ ) {
- // direction to move
- float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f;
- // calculate force delta
- CalcForceDelta( i, dir );
- // calculate acceleration delta: delta_a = m * delta_f;
- CalcAccelDelta( i );
- float maxStep;
- int limit;
- int limitSide;
- // maximum step we can take
- GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(),
- lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped,
- i, dir, maxStep, limit, limitSide );
- if ( maxStep <= 0.0f ) {
- #ifdef IGNORE_UNSATISFIABLE_VARIABLES
- // ignore the current variable completely
- lo[i] = hi[i] = 0.0f;
- f[i] = 0.0f;
- side[i] = -1;
- numIgnored++;
- #else
- failed.Format( "invalid step size %.4f", maxStep );
- for ( int j = i; j < m.GetNumRows(); j++ ) {
- f[j] = 0.0f;
- }
- numIgnored = m.GetNumRows() - i;
- #endif
- break;
- }
- // change force
- ChangeForce( i, maxStep );
- // change acceleration
- ChangeAccel( i, maxStep );
- // clamp/unclamp the variable that limited this step
- side[limit] = limitSide;
- if ( limitSide == 0 ) {
- a[limit] = 0.0f;
- AddClamped( limit );
- } else if ( limitSide == -1 ) {
- f[limit] = lo[limit];
- if ( limit != i ) {
- RemoveClamped( limit );
- }
- } else if ( limitSide == 1 ) {
- f[limit] = hi[limit];
- if ( limit != i ) {
- RemoveClamped( limit );
- }
- }
- // if the current variable limited the step we can continue with the next variable
- if ( limit == i ) {
- break;
- }
- }
- if ( n >= maxIterations ) {
- failed.Format( "max iterations %d", maxIterations );
- break;
- }
- if ( failed.Length() ) {
- break;
- }
- }
- #ifdef IGNORE_UNSATISFIABLE_VARIABLES
- if ( numIgnored ) {
- if ( lcp_showFailures.GetBool() ) {
- idLib::Printf( "idLCP_Square::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
- }
- }
- #endif
- // if failed clear remaining forces
- if ( failed.Length() ) {
- if ( lcp_showFailures.GetBool() ) {
- idLib::Printf( "idLCP_Square::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded );
- }
- }
- #if defined(_DEBUG) && 0
- if ( failed.Length() ) {
- // test whether or not the solution satisfies the complementarity conditions
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- a[i] = -b[i];
- for ( int j = 0; j < m.GetNumRows(); j++ ) {
- a[i] += rowPtrs[i][j] * f[j];
- }
- if ( f[i] == lo[i] ) {
- if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
- int bah1 = 1;
- }
- } else if ( f[i] == hi[i] ) {
- if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
- int bah2 = 1;
- }
- } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
- int bah3 = 1;
- }
- }
- }
- #endif
- // unpermute result
- for ( int i = 0; i < f.GetSize(); i++ ) {
- o_x[permuted[i]] = f[i];
- }
- return true;
- }
- /*
- ================================================================================================
- idLCP_Symmetric
- ================================================================================================
- */
- /*
- ================================================
- idLCP_Symmetric
- ================================================
- */
- class idLCP_Symmetric : public idLCP {
- public:
- virtual bool Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
- private:
- idMatX m; // original matrix
- idVecX b; // right hand side
- idVecX lo, hi; // low and high bounds
- idVecX f, a; // force and acceleration
- idVecX delta_f, delta_a; // delta force and delta acceleration
- idMatX clamped; // LDLt factored sub matrix for clamped variables
- idVecX diagonal; // reciprocal of diagonal of LDLt factored sub matrix for clamped variables
- idVecX solveCache1; // intermediate result cached in SolveClamped
- idVecX solveCache2; // "
- int numUnbounded; // number of unbounded variables
- int numClamped; // number of clamped variables
- int clampedChangeStart; // lowest row/column changed in the clamped matrix during an iteration
- float ** rowPtrs; // pointers to the rows of m
- int * boxIndex; // box index
- int * side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
- int * permuted; // index to keep track of the permutation
- bool padded; // set to true if the rows of the initial matrix are 16 byte padded
- bool FactorClamped();
- void SolveClamped( idVecX &x, const float *b );
- void Swap( int i, int j );
- void AddClamped( int r, bool useSolveCache );
- void RemoveClamped( int r );
- void CalcForceDelta( int d, float dir );
- void CalcAccelDelta( int d );
- void ChangeForce( int d, float step );
- void ChangeAccel( int d, float step );
- };
- /*
- ========================
- idLCP_Symmetric::FactorClamped
- ========================
- */
- bool idLCP_Symmetric::FactorClamped() {
- clampedChangeStart = 0;
- for ( int i = 0; i < numClamped; i++ ) {
- memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
- }
- return LDLT_Factor( clamped, diagonal, numClamped );
- }
- /*
- ========================
- idLCP_Symmetric::SolveClamped
- ========================
- */
- void idLCP_Symmetric::SolveClamped( idVecX &x, const float *b ) {
- // solve L
- LowerTriangularSolve( clamped, solveCache1.ToFloatPtr(), b, numClamped, clampedChangeStart );
- // scale with D
- Multiply( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), diagonal.ToFloatPtr(), numClamped );
- // solve Lt
- LowerTriangularSolveTranspose( clamped, x.ToFloatPtr(), solveCache2.ToFloatPtr(), numClamped );
- clampedChangeStart = numClamped;
- }
- /*
- ========================
- idLCP_Symmetric::Swap
- ========================
- */
- void idLCP_Symmetric::Swap( int i, int j ) {
- if ( i == j ) {
- return;
- }
- SwapValues( rowPtrs[i], rowPtrs[j] );
- m.SwapColumns( i, j );
- b.SwapElements( i, j );
- lo.SwapElements( i, j );
- hi.SwapElements( i, j );
- a.SwapElements( i, j );
- f.SwapElements( i, j );
- if ( boxIndex != NULL ) {
- SwapValues( boxIndex[i], boxIndex[j] );
- }
- SwapValues( side[i], side[j] );
- SwapValues( permuted[i], permuted[j] );
- }
- /*
- ========================
- idLCP_Symmetric::AddClamped
- ========================
- */
- void idLCP_Symmetric::AddClamped( int r, bool useSolveCache ) {
- assert( r >= numClamped );
- if ( numClamped < clampedChangeStart ) {
- clampedChangeStart = numClamped;
- }
- // add a row at the bottom and a column at the right of the factored
- // matrix for the clamped variables
- Swap( numClamped, r );
- // solve for v in L * v = rowPtr[numClamped]
- float dot;
- if ( useSolveCache ) {
- // the lower triangular solve was cached in SolveClamped called by CalcForceDelta
- memcpy( clamped[numClamped], solveCache2.ToFloatPtr(), numClamped * sizeof( float ) );
- // calculate row dot product
- dot = BigDotProduct( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), numClamped );
- } else {
- float *v = (float *) _alloca16( numClamped * sizeof( float ) );
- LowerTriangularSolve( clamped, v, rowPtrs[numClamped], numClamped, 0 );
- // add bottom row to L
- Multiply( clamped[numClamped], v, diagonal.ToFloatPtr(), numClamped );
- // calculate row dot product
- dot = BigDotProduct( clamped[numClamped], v, numClamped );
- }
- // update diagonal[numClamped]
- float d = rowPtrs[numClamped][numClamped] - dot;
- if ( fabs( d ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Symmetric::AddClamped: updating factorization failed\n" );
- d = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- clamped[numClamped][numClamped] = d;
- diagonal[numClamped] = 1.0f / d;
- numClamped++;
- }
- /*
- ========================
- idLCP_Symmetric::RemoveClamped
- ========================
- */
- void idLCP_Symmetric::RemoveClamped( int r ) {
- if ( !verify( r < numClamped ) ) {
- // complete fail, most likely due to exceptional floating point values
- return;
- }
- if ( r < clampedChangeStart ) {
- clampedChangeStart = r;
- }
- numClamped--;
- // no need to swap and update the factored matrix when the last row and column are removed
- if ( r == numClamped ) {
- return;
- }
- // swap the to be removed row/column with the last row/column
- Swap( r, numClamped );
- // update the factored matrix
- float * addSub = (float *) _alloca16( numClamped * sizeof( float ) );
- if ( r == 0 ) {
- if ( numClamped == 1 ) {
- float diag = rowPtrs[0][0];
- if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
- diag = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- clamped[0][0] = diag;
- diagonal[0] = 1.0f / diag;
- return;
- }
- // calculate the row/column to be added to the lower right sub matrix starting at (r, r)
- float * original = rowPtrs[numClamped];
- float * ptr = rowPtrs[r];
- addSub[0] = ptr[0] - original[numClamped];
- for ( int i = 1; i < numClamped; i++ ) {
- addSub[i] = ptr[i] - original[i];
- }
- } else {
- float * v = (float *) _alloca16( numClamped * sizeof( float ) );
- // solve for v in L * v = rowPtr[r]
- LowerTriangularSolve( clamped, v, rowPtrs[r], r, 0 );
- // update removed row
- Multiply( clamped[r], v, diagonal.ToFloatPtr(), r );
- // if the last row/column of the matrix is updated
- if ( r == numClamped - 1 ) {
- // only calculate new diagonal
- float dot = BigDotProduct( clamped[r], v, r );
- float diag = rowPtrs[r][r] - dot;
- if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
- diag = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- clamped[r][r] = diag;
- diagonal[r] = 1.0f / diag;
- return;
- }
- // calculate the row/column to be added to the lower right sub matrix starting at (r, r)
- for ( int i = 0; i < r; i++ ) {
- v[i] = clamped[r][i] * clamped[i][i];
- }
- for ( int i = r; i < numClamped; i++ ) {
- float sum;
- if ( i == r ) {
- sum = clamped[r][r];
- } else {
- sum = clamped[r][r] * clamped[i][r];
- }
- float * ptr = clamped[i];
- for ( int j = 0; j < r; j++ ) {
- sum += ptr[j] * v[j];
- }
- addSub[i] = rowPtrs[r][i] - sum;
- }
- }
- // add row/column to the lower right sub matrix starting at (r, r)
- float * v1 = (float *) _alloca16( numClamped * sizeof( float ) );
- float * v2 = (float *) _alloca16( numClamped * sizeof( float ) );
- float diag = idMath::SQRT_1OVER2;
- v1[r] = ( 0.5f * addSub[r] + 1.0f ) * diag;
- v2[r] = ( 0.5f * addSub[r] - 1.0f ) * diag;
- for ( int i = r+1; i < numClamped; i++ ) {
- v1[i] = v2[i] = addSub[i] * diag;
- }
- float alpha1 = 1.0f;
- float alpha2 = -1.0f;
- // simultaneous update/downdate of the sub matrix starting at (r, r)
- int n = clamped.GetNumColumns();
- for ( int i = r; i < numClamped; i++ ) {
- diag = clamped[i][i];
- float p1 = v1[i];
- float newDiag = diag + alpha1 * p1 * p1;
- if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
- newDiag = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- alpha1 /= newDiag;
- float beta1 = p1 * alpha1;
- alpha1 *= diag;
- diag = newDiag;
- float p2 = v2[i];
- newDiag = diag + alpha2 * p2 * p2;
- if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
- idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
- newDiag = idMath::FLT_SMALLEST_NON_DENORMAL;
- }
- clamped[i][i] = newDiag;
- float invNewDiag = 1.0f / newDiag;
- diagonal[i] = invNewDiag;
- alpha2 *= invNewDiag;
- float beta2 = p2 * alpha2;
- alpha2 *= diag;
- // update column below diagonal (i,i)
- float * ptr = clamped.ToFloatPtr() + i;
- int j;
- for ( j = i+1; j < numClamped - 1; j += 2 ) {
- float sum0 = ptr[(j+0)*n];
- float sum1 = ptr[(j+1)*n];
- v1[j+0] -= p1 * sum0;
- v1[j+1] -= p1 * sum1;
- sum0 += beta1 * v1[j+0];
- sum1 += beta1 * v1[j+1];
- v2[j+0] -= p2 * sum0;
- v2[j+1] -= p2 * sum1;
- sum0 += beta2 * v2[j+0];
- sum1 += beta2 * v2[j+1];
- ptr[(j+0)*n] = sum0;
- ptr[(j+1)*n] = sum1;
- }
- for ( ; j < numClamped; j++ ) {
- float sum = ptr[j*n];
- v1[j] -= p1 * sum;
- sum += beta1 * v1[j];
- v2[j] -= p2 * sum;
- sum += beta2 * v2[j];
- ptr[j*n] = sum;
- }
- }
- }
- /*
- ========================
- idLCP_Symmetric::CalcForceDelta
- Modifies this->delta_f.
- ========================
- */
- ID_INLINE void idLCP_Symmetric::CalcForceDelta( int d, float dir ) {
- delta_f[d] = dir;
- if ( numClamped == 0 ) {
- return;
- }
- // solve force delta
- SolveClamped( delta_f, rowPtrs[d] );
- // flip force delta based on direction
- if ( dir > 0.0f ) {
- float * ptr = delta_f.ToFloatPtr();
- for ( int i = 0; i < numClamped; i++ ) {
- ptr[i] = - ptr[i];
- }
- }
- }
- /*
- ========================
- idLCP_Symmetric::CalcAccelDelta
- Modifies this->delta_a and uses this->delta_f.
- ========================
- */
- ID_INLINE void idLCP_Symmetric::CalcAccelDelta( int d ) {
- // only the not clamped variables, including the current variable, can have a change in acceleration
- for ( int j = numClamped; j <= d; j++ ) {
- // only the clamped variables and the current variable have a force delta unequal zero
- float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
- delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
- }
- }
- /*
- ========================
- idLCP_Symmetric::ChangeForce
- Modifies this->f and uses this->delta_f.
- ========================
- */
- ID_INLINE void idLCP_Symmetric::ChangeForce( int d, float step ) {
- // only the clamped variables and current variable have a force delta unequal zero
- MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
- f[d] += step * delta_f[d];
- }
- /*
- ========================
- idLCP_Symmetric::ChangeAccel
- Modifies this->a and uses this->delta_a.
- ========================
- */
- ID_INLINE void idLCP_Symmetric::ChangeAccel( int d, float step ) {
- // only the not clamped variables, including the current variable, can have an acceleration unequal zero
- MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
- }
- /*
- ========================
- idLCP_Symmetric::Solve
- ========================
- */
- bool idLCP_Symmetric::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
- // true when the matrix rows are 16 byte padded
- padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
- assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
- assert( o_x.GetSize() == o_m.GetNumRows() );
- assert( o_b.GetSize() == o_m.GetNumRows() );
- assert( o_lo.GetSize() == o_m.GetNumRows() );
- assert( o_hi.GetSize() == o_m.GetNumRows() );
- // allocate memory for permuted input
- f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
- a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
- b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
- lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
- hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
- if ( o_boxIndex != NULL ) {
- boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
- memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
- } else {
- boxIndex = NULL;
- }
- // we override the const on o_m here but on exit the matrix is unchanged
- m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast< float * >( o_m[0] ) );
- f.Zero();
- a.Zero();
- b = o_b;
- lo = o_lo;
- hi = o_hi;
- // pointers to the rows of m
- rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- rowPtrs[i] = m[i];
- }
- // tells if a variable is at the low boundary, high boundary or inbetween
- side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
- // index to keep track of the permutation
- permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- permuted[i] = i;
- }
- // permute input so all unbounded variables come first
- numUnbounded = 0;
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
- if ( numUnbounded != i ) {
- Swap( numUnbounded, i );
- }
- numUnbounded++;
- }
- }
- // permute input so all variables using the boxIndex come last
- int boxStartIndex = m.GetNumRows();
- if ( boxIndex != NULL ) {
- for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
- if ( boxIndex[i] >= 0 ) {
- boxStartIndex--;
- if ( boxStartIndex != i ) {
- Swap( boxStartIndex, i );
- }
- }
- }
- }
- // sub matrix for factorization
- clamped.SetDataCacheLines( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA_CACHE_LINES( m.GetNumRows() * m.GetNumColumns() ), true );
- diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- solveCache1.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- solveCache2.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- // all unbounded variables are clamped
- numClamped = numUnbounded;
- // if there are unbounded variables
- if ( numUnbounded ) {
- // factor and solve for unbounded variables
- if ( !FactorClamped() ) {
- idLib::Printf( "idLCP_Symmetric::Solve: unbounded factorization failed\n" );
- return false;
- }
- SolveClamped( f, b.ToFloatPtr() );
- // if there are no bounded variables we are done
- if ( numUnbounded == m.GetNumRows() ) {
- o_x = f; // the vector is not permuted
- return true;
- }
- }
- int numIgnored = 0;
- // allocate for delta force and delta acceleration
- delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
- // solve for bounded variables
- idStr failed;
- for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) {
- clampedChangeStart = 0;
- // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
- if ( i == boxStartIndex ) {
- for ( int j = 0; j < boxStartIndex; j++ ) {
- o_x[permuted[j]] = f[j];
- }
- for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) {
- float s = o_x[boxIndex[j]];
- if ( lo[j] != -idMath::INFINITY ) {
- lo[j] = - idMath::Fabs( lo[j] * s );
- }
- if ( hi[j] != idMath::INFINITY ) {
- hi[j] = idMath::Fabs( hi[j] * s );
- }
- }
- }
- // calculate acceleration for current variable
- float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i );
- a[i] = dot - b[i];
- // if already at the low boundary
- if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
- side[i] = -1;
- continue;
- }
- // if already at the high boundary
- if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
- side[i] = 1;
- continue;
- }
- // if inside the clamped region
- if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
- side[i] = 0;
- AddClamped( i, false );
- continue;
- }
- // drive the current variable into a valid region
- int n = 0;
- for ( ; n < maxIterations; n++ ) {
- // direction to move
- float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f;
- // calculate force delta
- CalcForceDelta( i, dir );
- // calculate acceleration delta: delta_a = m * delta_f;
- CalcAccelDelta( i );
- float maxStep;
- int limit;
- int limitSide;
- // maximum step we can take
- GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(),
- lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped,
- i, dir, maxStep, limit, limitSide );
- if ( maxStep <= 0.0f ) {
- #ifdef IGNORE_UNSATISFIABLE_VARIABLES
- // ignore the current variable completely
- lo[i] = hi[i] = 0.0f;
- f[i] = 0.0f;
- side[i] = -1;
- numIgnored++;
- #else
- failed.Format( "invalid step size %.4f", maxStep );
- for ( int j = i; j < m.GetNumRows(); j++ ) {
- f[j] = 0.0f;
- }
- numIgnored = m.GetNumRows() - i;
- #endif
- break;
- }
- // change force
- ChangeForce( i, maxStep );
- // change acceleration
- ChangeAccel( i, maxStep );
- // clamp/unclamp the variable that limited this step
- side[limit] = limitSide;
- if ( limitSide == 0 ) {
- a[limit] = 0.0f;
- AddClamped( limit, ( limit == i ) );
- } else if ( limitSide == -1 ) {
- f[limit] = lo[limit];
- if ( limit != i ) {
- RemoveClamped( limit );
- }
- } else if ( limitSide == 1 ) {
- f[limit] = hi[limit];
- if ( limit != i ) {
- RemoveClamped( limit );
- }
- }
- // if the current variable limited the step we can continue with the next variable
- if ( limit == i ) {
- break;
- }
- }
- if ( n >= maxIterations ) {
- failed.Format( "max iterations %d", maxIterations );
- break;
- }
- if ( failed.Length() ) {
- break;
- }
- }
- #ifdef IGNORE_UNSATISFIABLE_VARIABLES
- if ( numIgnored ) {
- if ( lcp_showFailures.GetBool() ) {
- idLib::Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
- }
- }
- #endif
- // if failed clear remaining forces
- if ( failed.Length() ) {
- if ( lcp_showFailures.GetBool() ) {
- idLib::Printf( "idLCP_Symmetric::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded );
- }
- }
- #if defined(_DEBUG) && 0
- if ( failed.Length() ) {
- // test whether or not the solution satisfies the complementarity conditions
- for ( int i = 0; i < m.GetNumRows(); i++ ) {
- a[i] = -b[i];
- for ( j = 0; j < m.GetNumRows(); j++ ) {
- a[i] += rowPtrs[i][j] * f[j];
- }
- if ( f[i] == lo[i] ) {
- if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
- int bah1 = 1;
- }
- } else if ( f[i] == hi[i] ) {
- if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
- int bah2 = 1;
- }
- } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
- int bah3 = 1;
- }
- }
- }
- #endif
- // unpermute result
- for ( int i = 0; i < f.GetSize(); i++ ) {
- o_x[permuted[i]] = f[i];
- }
- return true;
- }
- /*
- ================================================================================================
- idLCP
- ================================================================================================
- */
- /*
- ========================
- idLCP::AllocSquare
- ========================
- */
- idLCP *idLCP::AllocSquare() {
- idLCP *lcp = new idLCP_Square;
- lcp->SetMaxIterations( 32 );
- return lcp;
- }
- /*
- ========================
- idLCP::AllocSymmetric
- ========================
- */
- idLCP *idLCP::AllocSymmetric() {
- idLCP *lcp = new idLCP_Symmetric;
- lcp->SetMaxIterations( 32 );
- return lcp;
- }
- /*
- ========================
- idLCP::~idLCP
- ========================
- */
- idLCP::~idLCP() {
- }
- /*
- ========================
- idLCP::SetMaxIterations
- ========================
- */
- void idLCP::SetMaxIterations( int max ) {
- maxIterations = max;
- }
- /*
- ========================
- idLCP::GetMaxIterations
- ========================
- */
- int idLCP::GetMaxIterations() {
- return maxIterations;
- }
- /*
- ========================
- idLCP::Test_f
- ========================
- */
- void idLCP::Test_f( const idCmdArgs &args ) {
- #ifdef ENABLE_TEST_CODE
- DotProduct_Test();
- LowerTriangularSolve_Test();
- LowerTriangularSolveTranspose_Test();
- LDLT_Factor_Test();
- #endif
- }
|