// Lcp.cpp — NOTE(review): an extraction artifact (a run of concatenated listing
// line numbers from the paste source) was removed here; it was not part of the code.
  1. /*
  2. ===========================================================================
  3. Doom 3 BFG Edition GPL Source Code
  4. Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
  6. Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
#pragma hdrstop
#include "../precompiled.h"

// this file is full of intentional case fall throughs
//lint -e616
// the code is correct, it can't be a NULL pointer
//lint -e613

// debug cvar: when enabled, the LCP solvers report cases where they fail to find a solution
static idCVar lcp_showFailures( "lcp_showFailures", "0", CVAR_BOOL, "show LCP solver failures" );

// numerical tolerances used by the LCP solvers
const float LCP_BOUND_EPSILON = 1e-5f;          // slack allowed when testing a force against its bounds
const float LCP_ACCEL_EPSILON = 1e-5f;          // threshold below which an acceleration is treated as zero
const float LCP_DELTA_ACCEL_EPSILON = 1e-9f;    // minimum meaningful change in acceleration per pivot step
const float LCP_DELTA_FORCE_EPSILON = 1e-9f;    // minimum meaningful change in force per pivot step

#define IGNORE_UNSATISFIABLE_VARIABLES

#if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN )

// 16-byte aligned SIMD constants: broadcast scalars, bit masks for sign/abs,
// and per-remainder lane masks used to handle counts that are not multiples of 4
ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f };
ALIGN16( const __m128 SIMD_SP_tiny ) = { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
ALIGN16( const __m128 SIMD_SP_infinity ) = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
ALIGN16( const __m128 SIMD_SP_LCP_DELTA_ACCEL_EPSILON ) = { LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON };
ALIGN16( const __m128 SIMD_SP_LCP_DELTA_FORCE_EPSILON ) = { LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON };
ALIGN16( const __m128 SIMD_SP_LCP_BOUND_EPSILON ) = { LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON };
ALIGN16( const __m128 SIMD_SP_neg_LCP_BOUND_EPSILON ) = { -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON };
ALIGN16( const unsigned int SIMD_SP_signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
ALIGN16( const unsigned int SIMD_SP_absMask[4] ) = { ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK };
// indexedStartMask[r] keeps lanes r..3; indexedEndMask[r] keeps lanes 0..r-1
ALIGN16( const unsigned int SIMD_SP_indexedStartMask[4][4] ) = { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0, 0xFFFFFFFF } };
ALIGN16( const unsigned int SIMD_SP_indexedEndMask[4][4] ) = { { 0, 0, 0, 0 }, { 0xFFFFFFFF, 0, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 } };
// clearLastN zeroes the last N lanes of a 4-lane vector
ALIGN16( const unsigned int SIMD_SP_clearLast1[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
ALIGN16( const unsigned int SIMD_SP_clearLast2[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 };
ALIGN16( const unsigned int SIMD_SP_clearLast3[4] ) = { 0xFFFFFFFF, 0, 0, 0 };
ALIGN16( const unsigned int SIMD_DW_zero[4] ) = { 0, 0, 0, 0 };
ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 };
ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 };
ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };

#endif
  55. /*
  56. ========================
  57. Multiply_SIMD
  58. dst[i] = src0[i] * src1[i];
  59. Assumes the source and destination have the same memory alignment.
  60. ========================
  61. */
  62. static void Multiply_SIMD( float * dst, const float * src0, const float * src1, const int count ) {
  63. int i = 0;
  64. for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) {
  65. dst[i] = src0[i] * src1[i];
  66. }
  67. #ifdef ID_WIN_X86_SSE_INTRIN
  68. for ( ; i + 4 <= count; i += 4 ) {
  69. assert_16_byte_aligned( &dst[i] );
  70. assert_16_byte_aligned( &src0[i] );
  71. assert_16_byte_aligned( &src1[i] );
  72. __m128 s0 = _mm_load_ps( src0 + i );
  73. __m128 s1 = _mm_load_ps( src1 + i );
  74. s0 = _mm_mul_ps( s0, s1 );
  75. _mm_store_ps( dst + i, s0 );
  76. }
  77. #else
  78. for ( ; i + 4 <= count; i += 4 ) {
  79. assert_16_byte_aligned( &dst[i] );
  80. assert_16_byte_aligned( &src0[i] );
  81. assert_16_byte_aligned( &src1[i] );
  82. dst[i+0] = src0[i+0] * src1[i+0];
  83. dst[i+1] = src0[i+1] * src1[i+1];
  84. dst[i+2] = src0[i+2] * src1[i+2];
  85. dst[i+3] = src0[i+3] * src1[i+3];
  86. }
  87. #endif
  88. for ( ; i < count; i++ ) {
  89. dst[i] = src0[i] * src1[i];
  90. }
  91. }
  92. /*
  93. ========================
  94. MultiplyAdd_SIMD
  95. dst[i] += constant * src[i];
  96. Assumes the source and destination have the same memory alignment.
  97. ========================
  98. */
  99. static void MultiplyAdd_SIMD( float * dst, const float constant, const float * src, const int count ) {
  100. int i = 0;
  101. for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) {
  102. dst[i] += constant * src[i];
  103. }
  104. #ifdef ID_WIN_X86_SSE_INTRIN
  105. __m128 c = _mm_load1_ps( & constant );
  106. for ( ; i + 4 <= count; i += 4 ) {
  107. assert_16_byte_aligned( &dst[i] );
  108. assert_16_byte_aligned( &src[i] );
  109. __m128 s = _mm_load_ps( src + i );
  110. __m128 d = _mm_load_ps( dst + i );
  111. s = _mm_add_ps( _mm_mul_ps( s, c ), d );
  112. _mm_store_ps( dst + i, s );
  113. }
  114. #else
  115. for ( ; i + 4 <= count; i += 4 ) {
  116. assert_16_byte_aligned( &src[i] );
  117. assert_16_byte_aligned( &dst[i] );
  118. dst[i+0] += constant * src[i+0];
  119. dst[i+1] += constant * src[i+1];
  120. dst[i+2] += constant * src[i+2];
  121. dst[i+3] += constant * src[i+3];
  122. }
  123. #endif
  124. for ( ; i < count; i++ ) {
  125. dst[i] += constant * src[i];
  126. }
  127. }
  128. /*
  129. ========================
  130. DotProduct_SIMD
  131. dot = src0[0] * src1[0] + src0[1] * src1[1] + src0[2] * src1[2] + ...
  132. ========================
  133. */
  134. static float DotProduct_SIMD( const float * src0, const float * src1, const int count ) {
  135. assert_16_byte_aligned( src0 );
  136. assert_16_byte_aligned( src1 );
  137. #ifdef ID_WIN_X86_SSE_INTRIN
  138. __m128 sum = (__m128 &) SIMD_SP_zero;
  139. int i = 0;
  140. for ( ; i < count - 3; i += 4 ) {
  141. __m128 s0 = _mm_load_ps( src0 + i );
  142. __m128 s1 = _mm_load_ps( src1 + i );
  143. sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum );
  144. }
  145. __m128 mask = _mm_load_ps( (float *) &SIMD_SP_indexedEndMask[count & 3] );
  146. __m128 s0 = _mm_and_ps( _mm_load_ps( src0 + i ), mask );
  147. __m128 s1 = _mm_and_ps( _mm_load_ps( src1 + i ), mask );
  148. sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum );
  149. sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  150. sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  151. float dot;
  152. _mm_store_ss( & dot, sum );
  153. return dot;
  154. #else
  155. float s0 = 0.0f;
  156. float s1 = 0.0f;
  157. float s2 = 0.0f;
  158. float s3 = 0.0f;
  159. int i = 0;
  160. for ( ; i < count - 3; i += 4 ) {
  161. s0 += src0[i+4] * src1[i+4];
  162. s1 += src0[i+5] * src1[i+5];
  163. s2 += src0[i+6] * src1[i+6];
  164. s3 += src0[i+7] * src1[i+7];
  165. }
  166. switch( count - i ) {
  167. NODEFAULT;
  168. case 3: s0 += src0[i+2] * src1[i+2];
  169. case 2: s1 += src0[i+1] * src1[i+1];
  170. case 1: s2 += src0[i+0] * src1[i+0];
  171. case 0:
  172. break;
  173. }
  174. return s0 + s1 + s2 + s3;
  175. #endif
  176. }
  177. /*
  178. ========================
  179. LowerTriangularSolve_SIMD
  180. Solves x in Lx = b for the n * n sub-matrix of L.
  181. * if skip > 0 the first skip elements of x are assumed to be valid already
  182. * L has to be a lower triangular matrix with (implicit) ones on the diagonal
  183. * x == b is allowed
  184. ========================
  185. */
static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float * b, const int n, int skip ) {
	// nothing left to solve if the already-valid prefix covers the sub-matrix
	if ( skip >= n ) {
		return;
	}
	const float *lptr = L.ToFloatPtr();
	int nc = L.GetNumColumns();
	// the row stride must be a multiple of 4 floats for the aligned SIMD loads below
	assert( ( nc & 3 ) == 0 );
	// unrolled cases for n < 8
	// NSKIP packs ( n, skip ) into one switch value; the intentional case fall
	// throughs make each entry point complete all remaining rows of the
	// forward substitution for that matrix size
#define NSKIP( n, s ) ((n<<3)|(s&7))
	switch( NSKIP( n, skip ) ) {
	case NSKIP( 1, 0 ): x[0] = b[0];
		return;
	case NSKIP( 2, 0 ): x[0] = b[0];
	case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
		return;
	case NSKIP( 3, 0 ): x[0] = b[0];
	case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
	case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		return;
	case NSKIP( 4, 0 ): x[0] = b[0];
	case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
	case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
	case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
		return;
	case NSKIP( 5, 0 ): x[0] = b[0];
	case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
	case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
	case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
	case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
		return;
	case NSKIP( 6, 0 ): x[0] = b[0];
	case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
	case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
	case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
	case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
	case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
		return;
	case NSKIP( 7, 0 ): x[0] = b[0];
	case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
	case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
	case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
	case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
	case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
	case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
		return;
	}
#undef NSKIP
	return;
	}
	// process first 4 rows
	// scalar forward substitution for rows skip..3; intentional fall throughs
	switch( skip ) {
	case 0: x[0] = b[0];
	case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
	case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
	case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
		skip = 4;
	}
	lptr = L[skip];
	int i = skip;
#ifdef ID_WIN_X86_SSE_INTRIN
	// work up to a multiple of 4 rows
	for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
		__m128 sum = _mm_load_ss( & b[i] );
		int j = 0;
		// sum -= L[i][j..j+3] * x[j..j+3], four columns at a time
		for ( ; j < i - 3; j += 4 ) {
			__m128 s0 = _mm_load_ps( lptr + j );
			__m128 s1 = _mm_load_ps( x + j );
			sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
		}
		// mask keeps only the (i & 3) columns left of the diagonal in this row
		__m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] );
		__m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask );
		__m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask );
		sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
		// horizontal add of the four partial sums
		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
		_mm_store_ss( & x[i], sum );
		lptr += nc;
	}
	// process 4 rows per iteration; i is 4-aligned here
	for ( ; i + 3 < n; i += 4 ) {
		const float * lptr0 = &lptr[0*nc];
		const float * lptr1 = &lptr[1*nc];
		const float * lptr2 = &lptr[2*nc];
		const float * lptr3 = &lptr[3*nc];
		assert_16_byte_aligned( lptr0 );
		assert_16_byte_aligned( lptr1 );
		assert_16_byte_aligned( lptr2 );
		assert_16_byte_aligned( lptr3 );
		// accumulate b[i+k] - sum_j L[i+k][j] * x[j] for the already-solved columns
		__m128 va = _mm_load_ss( & b[i+0] );
		__m128 vb = _mm_load_ss( & b[i+1] );
		__m128 vc = _mm_load_ss( & b[i+2] );
		__m128 vd = _mm_load_ss( & b[i+3] );
		__m128 x0 = _mm_load_ps( & x[0] );
		va = _mm_sub_ps( va, _mm_mul_ps( x0, _mm_load_ps( lptr0 + 0 ) ) );
		vb = _mm_sub_ps( vb, _mm_mul_ps( x0, _mm_load_ps( lptr1 + 0 ) ) );
		vc = _mm_sub_ps( vc, _mm_mul_ps( x0, _mm_load_ps( lptr2 + 0 ) ) );
		vd = _mm_sub_ps( vd, _mm_mul_ps( x0, _mm_load_ps( lptr3 + 0 ) ) );
		for ( int j = 4; j < i; j += 4 ) {
			__m128 xj = _mm_load_ps( &x[j] );
			va = _mm_sub_ps( va, _mm_mul_ps( xj, _mm_load_ps( lptr0 + j ) ) );
			vb = _mm_sub_ps( vb, _mm_mul_ps( xj, _mm_load_ps( lptr1 + j ) ) );
			vc = _mm_sub_ps( vc, _mm_mul_ps( xj, _mm_load_ps( lptr2 + j ) ) );
			vd = _mm_sub_ps( vd, _mm_mul_ps( xj, _mm_load_ps( lptr3 + j ) ) );
		}
		// resolve the dependencies among the 4 new rows (the lower-left 4x4
		// triangle of this row block); row k subtracts rows < k scaled by L
		vb = _mm_sub_ps( vb, _mm_mul_ps( va, _mm_load1_ps( lptr1 + i + 0 ) ) );
		vc = _mm_sub_ps( vc, _mm_mul_ps( va, _mm_load1_ps( lptr2 + i + 0 ) ) );
		vc = _mm_sub_ps( vc, _mm_mul_ps( vb, _mm_load1_ps( lptr2 + i + 1 ) ) );
		vd = _mm_sub_ps( vd, _mm_mul_ps( va, _mm_load1_ps( lptr3 + i + 0 ) ) );
		vd = _mm_sub_ps( vd, _mm_mul_ps( vb, _mm_load1_ps( lptr3 + i + 1 ) ) );
		vd = _mm_sub_ps( vd, _mm_mul_ps( vc, _mm_load1_ps( lptr3 + i + 2 ) ) );
		// 4x4 transpose so each result lane holds one row's partial sums
		__m128 ta = _mm_unpacklo_ps( va, vc );	// x0, z0, x1, z1
		__m128 tb = _mm_unpackhi_ps( va, vc );	// x2, z2, x3, z3
		__m128 tc = _mm_unpacklo_ps( vb, vd );	// y0, w0, y1, w1
		__m128 td = _mm_unpackhi_ps( vb, vd );	// y2, w2, y3, w3
		va = _mm_unpacklo_ps( ta, tc );	// x0, y0, z0, w0
		vb = _mm_unpackhi_ps( ta, tc );	// x1, y1, z1, w1
		vc = _mm_unpacklo_ps( tb, td );	// x2, y2, z2, w2
		vd = _mm_unpackhi_ps( tb, td );	// x3, y3, z3, w3
		va = _mm_add_ps( va, vb );
		vc = _mm_add_ps( vc, vd );
		va = _mm_add_ps( va, vc );
		_mm_store_ps( & x[i], va );
		lptr += 4 * nc;
	}
	// go through any remaining rows
	for ( ; i < n; i++ ) {
		__m128 sum = _mm_load_ss( & b[i] );
		int j = 0;
		for ( ; j < i - 3; j += 4 ) {
			__m128 s0 = _mm_load_ps( lptr + j );
			__m128 s1 = _mm_load_ps( x + j );
			sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
		}
		// mask keeps only the (i & 3) columns left of the diagonal in this row
		__m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] );
		__m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask );
		__m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask );
		sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
		_mm_store_ss( & x[i], sum );
		lptr += nc;
	}
#else
	// work up to a multiple of 4 rows
	for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
		float sum = b[i];
		for ( int j = 0; j < i; j++ ) {
			sum -= lptr[j] * x[j];
		}
		x[i] = sum;
		lptr += nc;
	}
	assert_16_byte_aligned( x );
	// process 4 rows per iteration with 4 partial sums per row
	for ( ; i + 3 < n; i += 4 ) {
		const float * lptr0 = &lptr[0*nc];
		const float * lptr1 = &lptr[1*nc];
		const float * lptr2 = &lptr[2*nc];
		const float * lptr3 = &lptr[3*nc];
		assert_16_byte_aligned( lptr0 );
		assert_16_byte_aligned( lptr1 );
		assert_16_byte_aligned( lptr2 );
		assert_16_byte_aligned( lptr3 );
		// initialize the 4x4 partial sums with the first 4 columns; the b term
		// is folded into the first partial of each row
		float a0 = - lptr0[0] * x[0] + b[i+0];
		float a1 = - lptr0[1] * x[1];
		float a2 = - lptr0[2] * x[2];
		float a3 = - lptr0[3] * x[3];
		float b0 = - lptr1[0] * x[0] + b[i+1];
		float b1 = - lptr1[1] * x[1];
		float b2 = - lptr1[2] * x[2];
		float b3 = - lptr1[3] * x[3];
		float c0 = - lptr2[0] * x[0] + b[i+2];
		float c1 = - lptr2[1] * x[1];
		float c2 = - lptr2[2] * x[2];
		float c3 = - lptr2[3] * x[3];
		float d0 = - lptr3[0] * x[0] + b[i+3];
		float d1 = - lptr3[1] * x[1];
		float d2 = - lptr3[2] * x[2];
		float d3 = - lptr3[3] * x[3];
		// accumulate the remaining already-solved columns
		for ( int j = 4; j < i; j += 4 ) {
			a0 -= lptr0[j+0] * x[j+0];
			a1 -= lptr0[j+1] * x[j+1];
			a2 -= lptr0[j+2] * x[j+2];
			a3 -= lptr0[j+3] * x[j+3];
			b0 -= lptr1[j+0] * x[j+0];
			b1 -= lptr1[j+1] * x[j+1];
			b2 -= lptr1[j+2] * x[j+2];
			b3 -= lptr1[j+3] * x[j+3];
			c0 -= lptr2[j+0] * x[j+0];
			c1 -= lptr2[j+1] * x[j+1];
			c2 -= lptr2[j+2] * x[j+2];
			c3 -= lptr2[j+3] * x[j+3];
			d0 -= lptr3[j+0] * x[j+0];
			d1 -= lptr3[j+1] * x[j+1];
			d2 -= lptr3[j+2] * x[j+2];
			d3 -= lptr3[j+3] * x[j+3];
		}
		// resolve the dependencies among the 4 new rows (lower-left 4x4 triangle)
		b0 -= lptr1[i+0] * a0;
		b1 -= lptr1[i+0] * a1;
		b2 -= lptr1[i+0] * a2;
		b3 -= lptr1[i+0] * a3;
		c0 -= lptr2[i+0] * a0;
		c1 -= lptr2[i+0] * a1;
		c2 -= lptr2[i+0] * a2;
		c3 -= lptr2[i+0] * a3;
		c0 -= lptr2[i+1] * b0;
		c1 -= lptr2[i+1] * b1;
		c2 -= lptr2[i+1] * b2;
		c3 -= lptr2[i+1] * b3;
		d0 -= lptr3[i+0] * a0;
		d1 -= lptr3[i+0] * a1;
		d2 -= lptr3[i+0] * a2;
		d3 -= lptr3[i+0] * a3;
		d0 -= lptr3[i+1] * b0;
		d1 -= lptr3[i+1] * b1;
		d2 -= lptr3[i+1] * b2;
		d3 -= lptr3[i+1] * b3;
		d0 -= lptr3[i+2] * c0;
		d1 -= lptr3[i+2] * c1;
		d2 -= lptr3[i+2] * c2;
		d3 -= lptr3[i+2] * c3;
		// collapse the partial sums into the 4 solved values
		x[i+0] = a0 + a1 + a2 + a3;
		x[i+1] = b0 + b1 + b2 + b3;
		x[i+2] = c0 + c1 + c2 + c3;
		x[i+3] = d0 + d1 + d2 + d3;
		lptr += 4 * nc;
	}
	// go through any remaining rows
	for ( ; i < n; i++ ) {
		float sum = b[i];
		for ( int j = 0; j < i; j++ ) {
			sum -= lptr[j] * x[j];
		}
		x[i] = sum;
		lptr += nc;
	}
#endif
}
  423. /*
  424. ========================
  425. LowerTriangularSolveTranspose_SIMD
  426. Solves x in L'x = b for the n * n sub-matrix of L.
  427. * L has to be a lower triangular matrix with (implicit) ones on the diagonal
  428. * x == b is allowed
  429. ========================
  430. */
static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, const float * b, const int n ) {
	// Solves L'x = b by backward substitution over the transpose of the lower
	// triangular matrix L. The unit diagonal is implicit, so no divisions are
	// needed. x may alias b.
	int nc = L.GetNumColumns();
	assert( ( nc & 3 ) == 0 );	// row stride must be a multiple of 4 floats for aligned loads
	int m = n;
	int r = n & 3;	// count of bottom rows that do not fill a 4-row block
	if ( ( m & 3 ) != 0 ) {
		// peel off the bottom 1..3 rows so the main loop below always works on
		// full 4-row blocks; these rows depend only on each other
		const float * lptr = L.ToFloatPtr() + m * nc + m;
		if ( ( m & 3 ) == 1 ) {
			x[m-1] = b[m-1];
			m -= 1;
		} else if ( ( m & 3 ) == 2 ) {
			x[m-1] = b[m-1];
			x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1];
			m -= 2;
		} else {
			x[m-1] = b[m-1];
			x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1];
			x[m-3] = b[m-3] - lptr[-1*nc-3] * x[m-1] - lptr[-2*nc-3] * x[m-2];
			m -= 3;
		}
	}
	// lptr points at the 4-wide column block just left of the diagonal of row m
	const float * lptr = L.ToFloatPtr() + m * nc + m - 4;
	float * xptr = x + m;
#ifdef ID_WIN_X86_SSE2_INTRIN
	// process 4 rows at a time
	for ( int i = m; i >= 4; i -= 4 ) {
		assert_16_byte_aligned( b );
		assert_16_byte_aligned( xptr );
		assert_16_byte_aligned( lptr );
		// four partial sums; s1..s3 start at zero and are folded into s0 later
		__m128 s0 = _mm_load_ps( &b[i-4] );
		__m128 s1 = (__m128 &)SIMD_SP_zero;
		__m128 s2 = (__m128 &)SIMD_SP_zero;
		__m128 s3 = (__m128 &)SIMD_SP_zero;
		// process 4x4 blocks: subtract the contribution of all already solved
		// x entries below the current 4 rows (columns of L' are rows of L)
		const float * xptr2 = xptr;	// x + i;
		const float * lptr2 = lptr;	// ptr = L[i] + i - 4;
		for ( int j = i; j < m; j += 4 ) {
			__m128 xj = _mm_load_ps( xptr2 );
			s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_splat_ps( xj, 0 ), _mm_load_ps( lptr2 + 0 * nc ) ) );
			s1 = _mm_sub_ps( s1, _mm_mul_ps( _mm_splat_ps( xj, 1 ), _mm_load_ps( lptr2 + 1 * nc ) ) );
			s2 = _mm_sub_ps( s2, _mm_mul_ps( _mm_splat_ps( xj, 2 ), _mm_load_ps( lptr2 + 2 * nc ) ) );
			s3 = _mm_sub_ps( s3, _mm_mul_ps( _mm_splat_ps( xj, 3 ), _mm_load_ps( lptr2 + 3 * nc ) ) );
			lptr2 += 4 * nc;
			xptr2 += 4;
		}
		// fold in the 1..3 peeled bottom rows handled before the main loop
		for ( int j = 0; j < r; j++ ) {
			s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_load_ps( lptr2 ), _mm_load1_ps( &xptr2[j] ) ) );
			lptr2 += nc;
		}
		s0 = _mm_add_ps( s0, s1 );
		s2 = _mm_add_ps( s2, s3 );
		s0 = _mm_add_ps( s0, s2 );
		// process left over of the 4 rows: solve the 4x4 triangle on the
		// diagonal itself, highest index first; the clearLast masks zero the
		// elements on and above the diagonal of each row
		lptr -= 4 * nc;
		__m128 t0 = _mm_and_ps( _mm_load_ps( lptr + 3 * nc ), (__m128 &)SIMD_SP_clearLast1 );
		__m128 t1 = _mm_and_ps( _mm_load_ps( lptr + 2 * nc ), (__m128 &)SIMD_SP_clearLast2 );
		__m128 t2 = _mm_load_ss( lptr + 1 * nc );
		s0 = _mm_sub_ps( s0, _mm_mul_ps( t0, _mm_splat_ps( s0, 3 ) ) );
		s0 = _mm_sub_ps( s0, _mm_mul_ps( t1, _mm_splat_ps( s0, 2 ) ) );
		s0 = _mm_sub_ps( s0, _mm_mul_ps( t2, _mm_splat_ps( s0, 1 ) ) );
		// store result
		_mm_store_ps( &xptr[-4], s0 );
		// update pointers for next four rows
		lptr -= 4;
		xptr -= 4;
	}
#else
	// scalar reference path: mirrors the SSE2 code above exactly
	// process 4 rows at a time
	for ( int i = m; i >= 4; i -= 4 ) {
		assert_16_byte_aligned( b );
		assert_16_byte_aligned( xptr );
		assert_16_byte_aligned( lptr );
		float s0 = b[i-4];
		float s1 = b[i-3];
		float s2 = b[i-2];
		float s3 = b[i-1];
		// process 4x4 blocks of already solved entries below these rows
		const float * xptr2 = xptr;	// x + i;
		const float * lptr2 = lptr;	// ptr = L[i] + i - 4;
		for ( int j = i; j < m; j += 4 ) {
			float t0 = xptr2[0];
			s0 -= lptr2[0] * t0;
			s1 -= lptr2[1] * t0;
			s2 -= lptr2[2] * t0;
			s3 -= lptr2[3] * t0;
			lptr2 += nc;
			float t1 = xptr2[1];
			s0 -= lptr2[0] * t1;
			s1 -= lptr2[1] * t1;
			s2 -= lptr2[2] * t1;
			s3 -= lptr2[3] * t1;
			lptr2 += nc;
			float t2 = xptr2[2];
			s0 -= lptr2[0] * t2;
			s1 -= lptr2[1] * t2;
			s2 -= lptr2[2] * t2;
			s3 -= lptr2[3] * t2;
			lptr2 += nc;
			float t3 = xptr2[3];
			s0 -= lptr2[0] * t3;
			s1 -= lptr2[1] * t3;
			s2 -= lptr2[2] * t3;
			s3 -= lptr2[3] * t3;
			lptr2 += nc;
			xptr2 += 4;
		}
		// fold in the 1..3 peeled bottom rows
		for ( int j = 0; j < r; j++ ) {
			float t = xptr2[j];
			s0 -= lptr2[0] * t;
			s1 -= lptr2[1] * t;
			s2 -= lptr2[2] * t;
			s3 -= lptr2[3] * t;
			lptr2 += nc;
		}
		// process left over of the 4 rows: solve the 4x4 diagonal triangle,
		// highest index first
		lptr -= nc;
		s0 -= lptr[0] * s3;
		s1 -= lptr[1] * s3;
		s2 -= lptr[2] * s3;
		lptr -= nc;
		s0 -= lptr[0] * s2;
		s1 -= lptr[1] * s2;
		lptr -= nc;
		s0 -= lptr[0] * s1;
		lptr -= nc;
		// store result
		xptr[-4] = s0;
		xptr[-3] = s1;
		xptr[-2] = s2;
		xptr[-1] = s3;
		// update pointers for next four rows
		lptr -= 4;
		xptr -= 4;
	}
#endif
}
  567. /*
  568. ========================
  569. UpperTriangularSolve_SIMD
  570. Solves x in Ux = b for the n * n sub-matrix of U.
* U has to be an upper triangular matrix
* invDiag is the reciprocal of the diagonal of the upper triangular matrix.
  573. * x == b is allowed
  574. ========================
  575. */
  576. static void UpperTriangularSolve_SIMD( const idMatX & U, const float * invDiag, float * x, const float * b, const int n ) {
  577. for ( int i = n - 1; i >= 0; i-- ) {
  578. float sum = b[i];
  579. const float * uptr = U[i];
  580. for ( int j = i + 1; j < n; j++ ) {
  581. sum -= uptr[j] * x[j];
  582. }
  583. x[i] = sum * invDiag[i];
  584. }
  585. }
  586. /*
  587. ========================
  588. LU_Factor_SIMD
In-place factorization LU of the n * n sub-matrix of mat. The reciprocals of the diagonal
elements of U are stored in invDiag. No pivoting is used.
  591. ========================
  592. */
static bool LU_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
	// In-place Doolittle LU factorization without pivoting. The elimination
	// multipliers (L, unit diagonal implicit) end up in the strictly lower
	// triangle, U in the upper triangle, and invDiag receives the reciprocals
	// of the diagonal of U. Returns false on a (near) zero pivot.
	for ( int i = 0; i < n; i++ ) {
		float d1 = mat[i][i];
		if ( fabs( d1 ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
			// pivot too close to zero: cannot continue without pivoting
			return false;
		}
		invDiag[i] = d1 = 1.0f / d1;	// d1 now holds the reciprocal of the pivot
		float * ptr1 = mat[i];
		for ( int j = i + 1; j < n; j++ ) {
			float * ptr2 = mat[j];
			float d2 = ptr2[i] * d1;	// elimination multiplier, stored as L[j][i]
			ptr2[i] = d2;
			int k;
			// subtract d2 times pivot row i from row j, 16 columns at a time,
			// walking from the last column down towards column i+1
			for ( k = n - 1; k > i + 15; k -= 16 ) {
				ptr2[k-0] -= d2 * ptr1[k-0];
				ptr2[k-1] -= d2 * ptr1[k-1];
				ptr2[k-2] -= d2 * ptr1[k-2];
				ptr2[k-3] -= d2 * ptr1[k-3];
				ptr2[k-4] -= d2 * ptr1[k-4];
				ptr2[k-5] -= d2 * ptr1[k-5];
				ptr2[k-6] -= d2 * ptr1[k-6];
				ptr2[k-7] -= d2 * ptr1[k-7];
				ptr2[k-8] -= d2 * ptr1[k-8];
				ptr2[k-9] -= d2 * ptr1[k-9];
				ptr2[k-10] -= d2 * ptr1[k-10];
				ptr2[k-11] -= d2 * ptr1[k-11];
				ptr2[k-12] -= d2 * ptr1[k-12];
				ptr2[k-13] -= d2 * ptr1[k-13];
				ptr2[k-14] -= d2 * ptr1[k-14];
				ptr2[k-15] -= d2 * ptr1[k-15];
			}
			// update the remaining ( k - i ) columns; the cases intentionally
			// fall through (Duff's-device style), and NODEFAULT asserts the
			// value is always in [0, 15]
			switch( k - i ) {
				NODEFAULT;
				case 15: ptr2[k-14] -= d2 * ptr1[k-14];
				case 14: ptr2[k-13] -= d2 * ptr1[k-13];
				case 13: ptr2[k-12] -= d2 * ptr1[k-12];
				case 12: ptr2[k-11] -= d2 * ptr1[k-11];
				case 11: ptr2[k-10] -= d2 * ptr1[k-10];
				case 10: ptr2[k-9] -= d2 * ptr1[k-9];
				case 9: ptr2[k-8] -= d2 * ptr1[k-8];
				case 8: ptr2[k-7] -= d2 * ptr1[k-7];
				case 7: ptr2[k-6] -= d2 * ptr1[k-6];
				case 6: ptr2[k-5] -= d2 * ptr1[k-5];
				case 5: ptr2[k-4] -= d2 * ptr1[k-4];
				case 4: ptr2[k-3] -= d2 * ptr1[k-3];
				case 3: ptr2[k-2] -= d2 * ptr1[k-2];
				case 2: ptr2[k-1] -= d2 * ptr1[k-1];
				case 1: ptr2[k-0] -= d2 * ptr1[k-0];
				case 0: break;
			}
		}
	}
	return true;
}
  647. /*
  648. ========================
  649. LDLT_Factor_SIMD
In-place factorization LDL' of the n * n sub-matrix of mat. The reciprocals of the diagonal
elements are stored in invDiag.
  652. NOTE: The number of columns of mat must be a multiple of 4.
  653. ========================
  654. */
static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
	// In-place LDL' factorization: the strictly lower triangle of mat receives
	// L (unit diagonal implicit), the diagonal receives D, and invDiag receives
	// 1/D. The scalar prologue handles rows 0..3 explicitly so the main loops
	// can always start at a 4-aligned column. Returns false on a (near) zero
	// diagonal element in the scalar paths.
	float s0, s1, s2, d;
	// scratch buffers, padded to a multiple of 4 floats for aligned SIMD access;
	// v holds D[k] * L[i][k] for the current row, diag holds D
	float * v = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) );
	float * diag = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) );
	float * invDiagPtr = invDiag.ToFloatPtr();
	int nc = mat.GetNumColumns();
	assert( ( nc & 3 ) == 0 );	// row stride must be a multiple of 4 floats
	if ( n <= 0 ) {
		return true;
	}
	// row/column 0
	float * mptr = mat[0];
	float sum = mptr[0];
	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
		return false;
	}
	diag[0] = sum;
	invDiagPtr[0] = d = 1.0f / sum;
	if ( n <= 1 ) {
		return true;
	}
	mptr = mat[0];
	for ( int j = 1; j < n; j++ ) {
		// scale column 0 below the diagonal by 1/D[0]
		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
	}
	// row/column 1
	mptr = mat[1];
	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	sum = mptr[1] - s0;
	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
		return false;
	}
	mat[1][1] = sum;
	diag[1] = sum;
	invDiagPtr[1] = d = 1.0f / sum;
	if ( n <= 2 ) {
		return true;
	}
	mptr = mat[0];
	for ( int j = 2; j < n; j++ ) {
		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
	}
	// row/column 2
	mptr = mat[2];
	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	sum = mptr[2] - s0 - s1;
	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
		return false;
	}
	mat[2][2] = sum;
	diag[2] = sum;
	invDiagPtr[2] = d = 1.0f / sum;
	if ( n <= 3 ) {
		return true;
	}
	mptr = mat[0];
	for ( int j = 3; j < n; j++ ) {
		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
	}
	// row/column 3
	mptr = mat[3];
	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	sum = mptr[3] - s0 - s1 - s2;
	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
		return false;
	}
	mat[3][3] = sum;
	diag[3] = sum;
	invDiagPtr[3] = d = 1.0f / sum;
	if ( n <= 4 ) {
		return true;
	}
	mptr = mat[0];
	for ( int j = 4; j < n; j++ ) {
		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
	}
#ifdef ID_WIN_X86_SSE2_INTRIN
	// zero-pad diag so masked quad loads past the factored prefix are safe
	__m128 vzero = _mm_setzero_ps();
	for ( int i = 4; i < n; i += 4 ) {
		_mm_store_ps( diag + i, vzero );
	}
	for ( int i = 4; i < n; i++ ) {
		// compute D[i] = mat[i][i] - sum_k( mat[i][k]^2 * D[k] ) while caching
		// v[k] = D[k] * mat[i][k] for the column update below
		mptr = mat[i];
		assert_16_byte_aligned( v );
		assert_16_byte_aligned( mptr );
		assert_16_byte_aligned( diag );
		__m128 m0 = _mm_load_ps( mptr + 0 );
		__m128 d0 = _mm_load_ps( diag + 0 );
		__m128 v0 = _mm_mul_ps( d0, m0 );
		__m128 t0 = _mm_load_ss( mptr + i );
		t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
		_mm_store_ps( v + 0, v0 );
		int k = 4;
		for ( ; k < i - 3; k += 4 ) {
			m0 = _mm_load_ps( mptr + k );
			d0 = _mm_load_ps( diag + k );
			v0 = _mm_mul_ps( d0, m0 );
			t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
			_mm_store_ps( v + k, v0 );
		}
		// final, possibly partial quad: mask off the 0..3 lanes at and beyond
		// column i
		__m128 mask = (__m128 &) SIMD_SP_indexedEndMask[i & 3];
		m0 = _mm_and_ps( _mm_load_ps( mptr + k ), mask );
		d0 = _mm_load_ps( diag + k );
		v0 = _mm_mul_ps( d0, m0 );
		t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
		_mm_store_ps( v + k, v0 );
		// horizontal add of the four partial sums
		t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
		t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
		// NOTE: unlike the scalar path, a zero diagonal does not fail here;
		// it is replaced with a tiny value so the reciprocal stays finite
		__m128 tiny = _mm_and_ps( _mm_cmpeq_ps( t0, SIMD_SP_zero ), SIMD_SP_tiny );
		t0 = _mm_or_ps( t0, tiny );
		_mm_store_ss( mptr + i, t0 );
		_mm_store_ss( diag + i, t0 );
		__m128 d = _mm_rcp32_ps( t0 );	// project helper: reciprocal of D[i] — presumably refined rcp, see its definition
		_mm_store_ss( invDiagPtr + i, d );
		if ( i + 1 >= n ) {
			return true;
		}
		// update column i of all rows below, 4 rows at a time
		int j = i + 1;
		for ( ; j < n - 3; j += 4 ) {
			float * ra = mat[j+0];
			float * rb = mat[j+1];
			float * rc = mat[j+2];
			float * rd = mat[j+3];
			assert_16_byte_aligned( v );
			assert_16_byte_aligned( ra );
			assert_16_byte_aligned( rb );
			assert_16_byte_aligned( rc );
			assert_16_byte_aligned( rd );
			__m128 va = _mm_load_ss( ra + i );
			__m128 vb = _mm_load_ss( rb + i );
			__m128 vc = _mm_load_ss( rc + i );
			__m128 vd = _mm_load_ss( rd + i );
			__m128 v0 = _mm_load_ps( v + 0 );
			va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + 0 ), v0 ) );
			vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + 0 ), v0 ) );
			vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + 0 ), v0 ) );
			vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + 0 ), v0 ) );
			int k = 4;
			for ( ; k < i - 3; k += 4 ) {
				v0 = _mm_load_ps( v + k );
				va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + k ), v0 ) );
				vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + k ), v0 ) );
				vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + k ), v0 ) );
				vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + k ), v0 ) );
			}
			// partial final quad, masked as above
			v0 = _mm_load_ps( v + k );
			va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( ra + k ), mask ), v0 ) );
			vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rb + k ), mask ), v0 ) );
			vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rc + k ), mask ), v0 ) );
			vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rd + k ), mask ), v0 ) );
			// 4x4 transpose so each row's partial sums land in one register lane
			__m128 ta = _mm_unpacklo_ps( va, vc );	// x0, z0, x1, z1
			__m128 tb = _mm_unpackhi_ps( va, vc );	// x2, z2, x3, z3
			__m128 tc = _mm_unpacklo_ps( vb, vd );	// y0, w0, y1, w1
			__m128 td = _mm_unpackhi_ps( vb, vd );	// y2, w2, y3, w3
			va = _mm_unpacklo_ps( ta, tc );	// x0, y0, z0, w0
			vb = _mm_unpackhi_ps( ta, tc );	// x1, y1, z1, w1
			vc = _mm_unpacklo_ps( tb, td );	// x2, y2, z2, w2
			vd = _mm_unpackhi_ps( tb, td );	// x3, y3, z3, w3
			va = _mm_add_ps( va, vb );
			vc = _mm_add_ps( vc, vd );
			va = _mm_add_ps( va, vc );
			va = _mm_mul_ps( va, d );	// multiply by 1/D[i]
			_mm_store_ss( ra + i, _mm_splat_ps( va, 0 ) );
			_mm_store_ss( rb + i, _mm_splat_ps( va, 1 ) );
			_mm_store_ss( rc + i, _mm_splat_ps( va, 2 ) );
			_mm_store_ss( rd + i, _mm_splat_ps( va, 3 ) );
		}
		// remaining 0..3 rows, one at a time
		for ( ; j < n; j++ ) {
			float * mptr = mat[j];
			assert_16_byte_aligned( v );
			assert_16_byte_aligned( mptr );
			__m128 va = _mm_load_ss( mptr + i );
			__m128 v0 = _mm_load_ps( v + 0 );
			va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + 0 ), v0 ) );
			int k = 4;
			for ( ; k < i - 3; k += 4 ) {
				v0 = _mm_load_ps( v + k );
				va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + k ), v0 ) );
			}
			v0 = _mm_load_ps( v + k );
			va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( mptr + k ), mask ), v0 ) );
			va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
			va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
			va = _mm_mul_ps( va, d );
			_mm_store_ss( mptr + i, va );
		}
	}
	return true;
#else
	// scalar reference path: mirrors the SSE2 code above
	// zero-pad diag so the masked tail reads below are well-defined
	for ( int i = 4; i < n; i += 4 ) {
		diag[i+0] = 0.0f;
		diag[i+1] = 0.0f;
		diag[i+2] = 0.0f;
		diag[i+3] = 0.0f;
	}
	for ( int i = 4; i < n; i++ ) {
		// compute D[i] and cache v[k] = D[k] * mat[i][k]
		mptr = mat[i];
		assert_16_byte_aligned( v );
		assert_16_byte_aligned( mptr );
		assert_16_byte_aligned( diag );
		v[0] = diag[0] * mptr[0];
		v[1] = diag[1] * mptr[1];
		v[2] = diag[2] * mptr[2];
		v[3] = diag[3] * mptr[3];
		float t0 = - mptr[0] * v[0] + mptr[i];
		float t1 = - mptr[1] * v[1];
		float t2 = - mptr[2] * v[2];
		float t3 = - mptr[3] * v[3];
		int k = 4;
		for ( ; k < i - 3; k += 4 ) {
			v[k+0] = diag[k+0] * mptr[k+0];
			v[k+1] = diag[k+1] * mptr[k+1];
			v[k+2] = diag[k+2] * mptr[k+2];
			v[k+3] = diag[k+3] * mptr[k+3];
			t0 -= mptr[k+0] * v[k+0];
			t1 -= mptr[k+1] * v[k+1];
			t2 -= mptr[k+2] * v[k+2];
			t3 -= mptr[k+3] * v[k+3];
		}
		// partial final quad: zero out the lanes at and beyond column i
		// (scalar equivalent of the SIMD_SP_indexedEndMask logic)
		float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f;
		float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f;
		float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f;
		float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f;
		v[k+0] = diag[k+0] * m0;
		v[k+1] = diag[k+1] * m1;
		v[k+2] = diag[k+2] * m2;
		v[k+3] = diag[k+3] * m3;
		t0 -= m0 * v[k+0];
		t1 -= m1 * v[k+1];
		t2 -= m2 * v[k+2];
		t3 -= m3 * v[k+3];
		sum = t0 + t1 + t2 + t3;
		if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
			return false;
		}
		mat[i][i] = sum;
		diag[i] = sum;
		invDiagPtr[i] = d = 1.0f / sum;
		if ( i + 1 >= n ) {
			return true;
		}
		// update column i of all rows below, 4 rows at a time
		int j = i + 1;
		for ( ; j < n - 3; j += 4 ) {
			float * ra = mat[j+0];
			float * rb = mat[j+1];
			float * rc = mat[j+2];
			float * rd = mat[j+3];
			assert_16_byte_aligned( v );
			assert_16_byte_aligned( ra );
			assert_16_byte_aligned( rb );
			assert_16_byte_aligned( rc );
			assert_16_byte_aligned( rd );
			float a0 = - ra[0] * v[0] + ra[i];
			float a1 = - ra[1] * v[1];
			float a2 = - ra[2] * v[2];
			float a3 = - ra[3] * v[3];
			float b0 = - rb[0] * v[0] + rb[i];
			float b1 = - rb[1] * v[1];
			float b2 = - rb[2] * v[2];
			float b3 = - rb[3] * v[3];
			float c0 = - rc[0] * v[0] + rc[i];
			float c1 = - rc[1] * v[1];
			float c2 = - rc[2] * v[2];
			float c3 = - rc[3] * v[3];
			float d0 = - rd[0] * v[0] + rd[i];
			float d1 = - rd[1] * v[1];
			float d2 = - rd[2] * v[2];
			float d3 = - rd[3] * v[3];
			int k = 4;
			for ( ; k < i - 3; k += 4 ) {
				a0 -= ra[k+0] * v[k+0];
				a1 -= ra[k+1] * v[k+1];
				a2 -= ra[k+2] * v[k+2];
				a3 -= ra[k+3] * v[k+3];
				b0 -= rb[k+0] * v[k+0];
				b1 -= rb[k+1] * v[k+1];
				b2 -= rb[k+2] * v[k+2];
				b3 -= rb[k+3] * v[k+3];
				c0 -= rc[k+0] * v[k+0];
				c1 -= rc[k+1] * v[k+1];
				c2 -= rc[k+2] * v[k+2];
				c3 -= rc[k+3] * v[k+3];
				d0 -= rd[k+0] * v[k+0];
				d1 -= rd[k+1] * v[k+1];
				d2 -= rd[k+2] * v[k+2];
				d3 -= rd[k+3] * v[k+3];
			}
			// masked tail, same pattern as above
			float ra0 = ( i - k > 0 ) ? ra[k+0] : 0.0f;
			float ra1 = ( i - k > 1 ) ? ra[k+1] : 0.0f;
			float ra2 = ( i - k > 2 ) ? ra[k+2] : 0.0f;
			float ra3 = ( i - k > 3 ) ? ra[k+3] : 0.0f;
			float rb0 = ( i - k > 0 ) ? rb[k+0] : 0.0f;
			float rb1 = ( i - k > 1 ) ? rb[k+1] : 0.0f;
			float rb2 = ( i - k > 2 ) ? rb[k+2] : 0.0f;
			float rb3 = ( i - k > 3 ) ? rb[k+3] : 0.0f;
			float rc0 = ( i - k > 0 ) ? rc[k+0] : 0.0f;
			float rc1 = ( i - k > 1 ) ? rc[k+1] : 0.0f;
			float rc2 = ( i - k > 2 ) ? rc[k+2] : 0.0f;
			float rc3 = ( i - k > 3 ) ? rc[k+3] : 0.0f;
			float rd0 = ( i - k > 0 ) ? rd[k+0] : 0.0f;
			float rd1 = ( i - k > 1 ) ? rd[k+1] : 0.0f;
			float rd2 = ( i - k > 2 ) ? rd[k+2] : 0.0f;
			float rd3 = ( i - k > 3 ) ? rd[k+3] : 0.0f;
			a0 -= ra0 * v[k+0];
			a1 -= ra1 * v[k+1];
			a2 -= ra2 * v[k+2];
			a3 -= ra3 * v[k+3];
			b0 -= rb0 * v[k+0];
			b1 -= rb1 * v[k+1];
			b2 -= rb2 * v[k+2];
			b3 -= rb3 * v[k+3];
			c0 -= rc0 * v[k+0];
			c1 -= rc1 * v[k+1];
			c2 -= rc2 * v[k+2];
			c3 -= rc3 * v[k+3];
			d0 -= rd0 * v[k+0];
			d1 -= rd1 * v[k+1];
			d2 -= rd2 * v[k+2];
			d3 -= rd3 * v[k+3];
			ra[i] = ( a0 + a1 + a2 + a3 ) * d;
			rb[i] = ( b0 + b1 + b2 + b3 ) * d;
			rc[i] = ( c0 + c1 + c2 + c3 ) * d;
			rd[i] = ( d0 + d1 + d2 + d3 ) * d;
		}
		// remaining 0..3 rows, one at a time
		for ( ; j < n; j++ ) {
			mptr = mat[j];
			assert_16_byte_aligned( v );
			assert_16_byte_aligned( mptr );
			float a0 = - mptr[0] * v[0] + mptr[i];
			float a1 = - mptr[1] * v[1];
			float a2 = - mptr[2] * v[2];
			float a3 = - mptr[3] * v[3];
			int k = 4;
			for ( ; k < i - 3; k += 4 ) {
				a0 -= mptr[k+0] * v[k+0];
				a1 -= mptr[k+1] * v[k+1];
				a2 -= mptr[k+2] * v[k+2];
				a3 -= mptr[k+3] * v[k+3];
			}
			float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f;
			float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f;
			float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f;
			float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f;
			a0 -= m0 * v[k+0];
			a1 -= m1 * v[k+1];
			a2 -= m2 * v[k+2];
			a3 -= m3 * v[k+3];
			mptr[i] = ( a0 + a1 + a2 + a3 ) * d;
		}
	}
	return true;
#endif
}
  1007. /*
  1008. ========================
  1009. GetMaxStep_SIMD
  1010. ========================
  1011. */
// Determines the maximum step we can take along the search direction before a
// variable hits one of its bounds or an unclamped variable's acceleration
// reaches zero. Outputs the step size in maxStep, the index of the limiting
// variable in limit, and which bound it hit in limitSide (-1 = low bound,
// +1 = high bound, 0 = acceleration reached zero).
static void GetMaxStep_SIMD( const float * f, const float * a, const float * delta_f, const float * delta_a,
							const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped,
							int d, float dir, float & maxStep, int & limit, int & limitSide ) {
#ifdef ID_WIN_X86_SSE2_INTRIN
	// Branchless SSE2 implementation: each candidate step is evaluated with
	// masked selects, and the limiting index/side ride along in integer
	// registers. The scalar code in the #else branch below is the reference
	// for what each section computes.
	__m128 vMaxStep;
	__m128i vLimit;
	__m128i vLimitSide;
	// default to a full step for the current variable
	{
		__m128 vNegAccel = _mm_xor_ps( _mm_load1_ps( a + d ), (__m128 &) SIMD_SP_signBit );
		__m128 vDeltaAccel = _mm_load1_ps( delta_a + d );
		// vM0: |delta_a[d]| > epsilon, so the divide is meaningful
		__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaAccel, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
		__m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
		vMaxStep = _mm_sel_ps( SIMD_SP_zero, vStep, vM0 );
		vLimit = _mm_shuffle_epi32( _mm_cvtsi32_si128( d ), 0 );
		vLimitSide = (__m128i &) SIMD_DW_zero;
	}
	// test the current variable
	{
		__m128 vDeltaForce = _mm_load1_ps( & dir );
		__m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
		// moving down selects the low bound, moving up the high bound
		__m128 vForceLimit = _mm_sel_ps( _mm_load1_ps( hi + d ), _mm_load1_ps( lo + d ), vSign );
		__m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load1_ps( f + d ) ), vDeltaForce );
		__m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );	// -1 or +1
		__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
		__m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );	// bound must be finite
		__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
		__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
		vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
		vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
	}
	// test the clamped bounded variables
	{
		// the start mask disables the lanes before numUnbounded in the first quad
		__m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numUnbounded & 3];
		__m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numUnbounded ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index );
		int i = numUnbounded & ~3;
		for ( ; i < numClamped - 3; i += 4 ) {
			__m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), mask );
			__m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
			__m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign );
			__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
			__m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) );
			__m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
			__m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
			__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
			__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
			vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
			vLimit = _mm_sel_si128( vLimit, index, vM3 );
			vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
			mask = (__m128 &) SIMD_SP_indexedStartMask[0];	// all lanes active from now on
			index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four );
		}
		// final, possibly partial quad up to numClamped
		__m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[numClamped & 3] ) );
		__m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
		__m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign );
		__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
		__m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) );
		__m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
		__m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
		__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
		__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
		vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
		vLimit = _mm_sel_si128( vLimit, index, vM3 );
		vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
	}
	// test the not clamped bounded variables
	{
		__m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numClamped & 3];
		__m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numClamped ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index );
		int i = numClamped & ~3;
		for ( ; i < d - 3; i += 4 ) {
			__m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit );
			__m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), mask );
			__m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) );
			// accept only if the acceleration moves towards zero for this side
			__m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
			__m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
			// ignore variables whose bounds are both (numerically) zero
			__m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) );
			__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
			__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
			vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
			vLimit = _mm_sel_si128( vLimit, index, vM3 );
			vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) );
			mask = (__m128 &) SIMD_SP_indexedStartMask[0];
			index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four );
		}
		// final, possibly partial quad up to d
		__m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit );
		__m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[d & 3] ) );
		__m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) );
		__m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
		__m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
		__m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) );
		__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
		__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
		vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
		vLimit = _mm_sel_si128( vLimit, index, vM3 );
		vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) );
	}
	// horizontal min-reduction across the four lanes, carrying the matching
	// limit index and side along with the winning step
	{
		__m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 1, 0, 3, 2 ) );
		__m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 1, 0, 3, 2 ) );
		__m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 1, 0, 3, 2 ) );
		__m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep );
		vMaxStep = _mm_min_ps( vMaxStep, tMaxStep );
		vLimit = _mm_sel_si128( vLimit, tLimit, mask );
		vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask );
	}
	{
		__m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 2, 3, 0, 1 ) );
		__m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 2, 3, 0, 1 ) );
		__m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 2, 3, 0, 1 ) );
		__m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep );
		vMaxStep = _mm_min_ps( vMaxStep, tMaxStep );
		vLimit = _mm_sel_si128( vLimit, tLimit, mask );
		vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask );
	}
	_mm_store_ss( & maxStep, vMaxStep );
	limit = _mm_cvtsi128_si32( vLimit );
	limitSide = _mm_cvtsi128_si32( vLimitSide );
#else
	// default to a full step for the current variable
	{
		float negAccel = -a[d];
		float deltaAccel = delta_a[d];
		int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON );
		float step = negAccel / ( m0 ? deltaAccel : 1.0f );	// guard divide when delta is ~0
		maxStep = m0 ? step : 0.0f;
		limit = d;
		limitSide = 0;
	}
	// test the current variable
	{
		float deltaForce = dir;
		float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d];	// bound in the direction of motion
		float step = ( forceLimit - f[d] ) / deltaForce;
		int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
		int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
		int m1 = ( fabs( forceLimit ) != idMath::INFINITY );	// infinite bound can never limit
		int m2 = ( step < maxStep );
		int m3 = ( m0 & m1 & m2 );
		maxStep = m3 ? step : maxStep;
		limit = m3 ? d : limit;
		limitSide = m3 ? setSide : limitSide;
	}
	// test the clamped bounded variables
	for ( int i = numUnbounded; i < numClamped; i++ ) {
		float deltaForce = delta_f[i];
		float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i];
		int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
		float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f );
		int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
		int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
		int m2 = ( step < maxStep );
		int m3 = ( m0 & m1 & m2 );
		maxStep = m3 ? step : maxStep;
		limit = m3 ? i : limit;
		limitSide = m3 ? setSide : limitSide;
	}
	// test the not clamped bounded variables
	for ( int i = numClamped; i < d; i++ ) {
		float negAccel = -a[i];
		float deltaAccel = delta_a[i];
		// only accept if the acceleration moves towards zero for this side
		int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON );
		float step = negAccel / ( m0 ? deltaAccel : 1.0f );
		// ignore variables whose bounds are both (numerically) zero
		int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON );
		int m2 = ( step < maxStep );
		int m3 = ( m0 & m1 & m2 );
		maxStep = m3 ? step : maxStep;
		limit = m3 ? i : limit;
		limitSide = m3 ? 0 : limitSide;
	}
#endif
}
  1184. /*
  1185. ================================================================================================
  1186. SIMD test code
  1187. ================================================================================================
  1188. */
  1189. //#define ENABLE_TEST_CODE
  1190. #ifdef ENABLE_TEST_CODE
  1191. #define TEST_TRIANGULAR_SOLVE_SIMD_EPSILON 0.1f
  1192. #define TEST_TRIANGULAR_SOLVE_SIZE 50
  1193. #define TEST_FACTOR_SIMD_EPSILON 0.1f
  1194. #define TEST_FACTOR_SOLVE_SIZE 50
  1195. #define NUM_TESTS 50
  1196. /*
  1197. ========================
  1198. PrintClocks
  1199. ========================
  1200. */
  1201. static void PrintClocks( const char * string, int dataCount, int64 clocks, int64 otherClocks = 0 ) {
  1202. idLib::Printf( string );
  1203. for ( int i = idStr::LengthWithoutColors(string); i < 48; i++ ) {
  1204. idLib::Printf(" ");
  1205. }
  1206. if ( clocks && otherClocks ) {
  1207. int p = 0;
  1208. if ( clocks <= otherClocks ) {
  1209. p = idMath::Ftoi( (float) ( otherClocks - clocks ) * 100.0f / (float) otherClocks );
  1210. } else {
  1211. p = - idMath::Ftoi( (float) ( clocks - otherClocks ) * 100.0f / (float) clocks );
  1212. }
  1213. idLib::Printf( "c = %4d, clcks = %5lld, %d%%\n", dataCount, clocks, p );
  1214. } else {
  1215. idLib::Printf( "c = %4d, clcks = %5lld\n", dataCount, clocks );
  1216. }
  1217. }
  1218. /*
  1219. ========================
  1220. DotProduct_Test
  1221. ========================
  1222. */
  1223. static void DotProduct_Test() {
  1224. ALIGN16( float fsrc0[TEST_TRIANGULAR_SOLVE_SIZE + 1]; )
  1225. ALIGN16( float fsrc1[TEST_TRIANGULAR_SOLVE_SIZE + 1]; )
  1226. idRandom srnd( 13 );
  1227. for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
  1228. fsrc0[i] = srnd.CRandomFloat() * 10.0f;
  1229. fsrc1[i] = srnd.CRandomFloat() * 10.0f;
  1230. }
  1231. idTimer timer;
  1232. for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
  1233. float dot1 = DotProduct_Generic( fsrc0, fsrc1, i );
  1234. int64 clocksGeneric = 0xFFFFFFFFFFFF;
  1235. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1236. fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j;
  1237. timer.Clear();
  1238. timer.Start();
  1239. dot1 = DotProduct_Generic( fsrc0, fsrc1, i );
  1240. timer.Stop();
  1241. clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
  1242. }
  1243. PrintClocks( va( "DotProduct_Generic %d", i ), 1, clocksGeneric );
  1244. float dot2 = DotProduct_SIMD( fsrc0, fsrc1, i );
  1245. int64 clocksSIMD = 0xFFFFFFFFFFFF;
  1246. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1247. fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j;
  1248. timer.Clear();
  1249. timer.Start();
  1250. dot2 = DotProduct_SIMD( fsrc0, fsrc1, i );
  1251. timer.Stop();
  1252. clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
  1253. }
  1254. const char * result = idMath::Fabs( dot1 - dot2 ) < 1e-4f ? "ok" : S_COLOR_RED"X";
  1255. PrintClocks( va( "DotProduct_SIMD %d %s", i, result ), 1, clocksSIMD, clocksGeneric );
  1256. }
  1257. }
  1258. /*
  1259. ========================
  1260. LowerTriangularSolve_Test
  1261. ========================
  1262. */
  1263. static void LowerTriangularSolve_Test() {
  1264. idMatX L;
  1265. idVecX x, b, tst;
  1266. int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3;
  1267. L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
  1268. x.SetSize( paddedSize );
  1269. b.Random( paddedSize, 0, -1.0f, 1.0f );
  1270. idTimer timer;
  1271. const int skip = 0;
  1272. for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
  1273. x.Zero( i );
  1274. LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
  1275. int64 clocksGeneric = 0xFFFFFFFFFFFF;
  1276. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1277. timer.Clear();
  1278. timer.Start();
  1279. LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
  1280. timer.Stop();
  1281. clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
  1282. }
  1283. tst = x;
  1284. x.Zero();
  1285. PrintClocks( va( "LowerTriangularSolve_Generic %dx%d", i, i ), 1, clocksGeneric );
  1286. LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
  1287. int64 clocksSIMD = 0xFFFFFFFFFFFF;
  1288. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1289. timer.Clear();
  1290. timer.Start();
  1291. LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
  1292. timer.Stop();
  1293. clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
  1294. }
  1295. const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
  1296. PrintClocks( va( "LowerTriangularSolve_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
  1297. }
  1298. }
  1299. /*
  1300. ========================
  1301. LowerTriangularSolveTranspose_Test
  1302. ========================
  1303. */
  1304. static void LowerTriangularSolveTranspose_Test() {
  1305. idMatX L;
  1306. idVecX x, b, tst;
  1307. int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3;
  1308. L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
  1309. x.SetSize( paddedSize );
  1310. b.Random( paddedSize, 0, -1.0f, 1.0f );
  1311. idTimer timer;
  1312. for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
  1313. x.Zero( i );
  1314. LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
  1315. int64 clocksGeneric = 0xFFFFFFFFFFFF;
  1316. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1317. timer.Clear();
  1318. timer.Start();
  1319. LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
  1320. timer.Stop();
  1321. clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
  1322. }
  1323. tst = x;
  1324. x.Zero();
  1325. PrintClocks( va( "LowerTriangularSolveTranspose_Generic %dx%d", i, i ), 1, clocksGeneric );
  1326. LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
  1327. int64 clocksSIMD = 0xFFFFFFFFFFFF;
  1328. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1329. timer.Clear();
  1330. timer.Start();
  1331. LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
  1332. timer.Stop();
  1333. clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
  1334. }
  1335. const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
  1336. PrintClocks( va( "LowerTriangularSolveTranspose_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
  1337. }
  1338. }
  1339. /*
  1340. ========================
  1341. LDLT_Factor_Test
  1342. ========================
  1343. */
  1344. static void LDLT_Factor_Test() {
  1345. idMatX src, original, mat1, mat2;
  1346. idVecX invDiag1, invDiag2;
  1347. int paddedSize = ( TEST_FACTOR_SOLVE_SIZE + 3 ) & ~3;
  1348. original.SetSize( paddedSize, paddedSize );
  1349. src.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
  1350. src.TransposeMultiply( original, src );
  1351. idTimer timer;
  1352. for ( int i = 1; i < TEST_FACTOR_SOLVE_SIZE; i++ ) {
  1353. int64 clocksGeneric = 0xFFFFFFFFFFFF;
  1354. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1355. mat1 = original;
  1356. invDiag1.Zero( TEST_FACTOR_SOLVE_SIZE );
  1357. timer.Clear();
  1358. timer.Start();
  1359. LDLT_Factor_Generic( mat1, invDiag1, i );
  1360. timer.Stop();
  1361. clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
  1362. }
  1363. PrintClocks( va( "LDLT_Factor_Generic %dx%d", i, i ), 1, clocksGeneric );
  1364. int64 clocksSIMD = 0xFFFFFFFFFFFF;
  1365. for ( int j = 0; j < NUM_TESTS; j++ ) {
  1366. mat2 = original;
  1367. invDiag2.Zero( TEST_FACTOR_SOLVE_SIZE );
  1368. timer.Clear();
  1369. timer.Start();
  1370. LDLT_Factor_SIMD( mat2, invDiag2, i );
  1371. timer.Stop();
  1372. clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
  1373. }
  1374. const char * result = mat1.Compare( mat2, TEST_FACTOR_SIMD_EPSILON ) && invDiag1.Compare( invDiag2, TEST_FACTOR_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
  1375. PrintClocks( va( "LDLT_Factor_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
  1376. }
  1377. }
  1378. #endif
  1379. #define Multiply Multiply_SIMD
  1380. #define MultiplyAdd MultiplyAdd_SIMD
  1381. #define BigDotProduct DotProduct_SIMD
  1382. #define LowerTriangularSolve LowerTriangularSolve_SIMD
  1383. #define LowerTriangularSolveTranspose LowerTriangularSolveTranspose_SIMD
  1384. #define UpperTriangularSolve UpperTriangularSolve_SIMD
  1385. #define LU_Factor LU_Factor_SIMD
  1386. #define LDLT_Factor LDLT_Factor_SIMD
  1387. #define GetMaxStep GetMaxStep_SIMD
  1388. /*
  1389. ================================================================================================
  1390. idLCP_Square
  1391. ================================================================================================
  1392. */
  1393. /*
  1394. ================================================
  1395. idLCP_Square
  1396. ================================================
  1397. */
  1398. class idLCP_Square : public idLCP {
  1399. public:
  1400. virtual bool Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
  1401. private:
  1402. idMatX m; // original matrix
  1403. idVecX b; // right hand side
  1404. idVecX lo, hi; // low and high bounds
  1405. idVecX f, a; // force and acceleration
  1406. idVecX delta_f, delta_a; // delta force and delta acceleration
  1407. idMatX clamped; // LU factored sub matrix for clamped variables
  1408. idVecX diagonal; // reciprocal of diagonal of U of the LU factored sub matrix for clamped variables
  1409. int numUnbounded; // number of unbounded variables
  1410. int numClamped; // number of clamped variables
  1411. float ** rowPtrs; // pointers to the rows of m
  1412. int * boxIndex; // box index
  1413. int * side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
  1414. int * permuted; // index to keep track of the permutation
  1415. bool padded; // set to true if the rows of the initial matrix are 16 byte padded
  1416. bool FactorClamped();
  1417. void SolveClamped( idVecX & x, const float * b );
  1418. void Swap( int i, int j );
  1419. void AddClamped( int r );
  1420. void RemoveClamped( int r );
  1421. void CalcForceDelta( int d, float dir );
  1422. void CalcAccelDelta( int d );
  1423. void ChangeForce( int d, float step );
  1424. void ChangeAccel( int d, float step );
  1425. };
  1426. /*
  1427. ========================
  1428. idLCP_Square::FactorClamped
  1429. ========================
  1430. */
  1431. bool idLCP_Square::FactorClamped() {
  1432. for ( int i = 0; i < numClamped; i++ ) {
  1433. memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
  1434. }
  1435. return LU_Factor( clamped, diagonal, numClamped );
  1436. }
  1437. /*
  1438. ========================
  1439. idLCP_Square::SolveClamped
  1440. ========================
  1441. */
  1442. void idLCP_Square::SolveClamped( idVecX & x, const float * b ) {
  1443. // solve L
  1444. LowerTriangularSolve( clamped, x.ToFloatPtr(), b, numClamped, 0 );
  1445. // solve U
  1446. UpperTriangularSolve( clamped, diagonal.ToFloatPtr(), x.ToFloatPtr(), x.ToFloatPtr(), numClamped );
  1447. }
  1448. /*
  1449. ========================
  1450. idLCP_Square::Swap
  1451. ========================
  1452. */
  1453. void idLCP_Square::Swap( int i, int j ) {
  1454. if ( i == j ) {
  1455. return;
  1456. }
  1457. SwapValues( rowPtrs[i], rowPtrs[j] );
  1458. m.SwapColumns( i, j );
  1459. b.SwapElements( i, j );
  1460. lo.SwapElements( i, j );
  1461. hi.SwapElements( i, j );
  1462. a.SwapElements( i, j );
  1463. f.SwapElements( i, j );
  1464. if ( boxIndex != NULL ) {
  1465. SwapValues( boxIndex[i], boxIndex[j] );
  1466. }
  1467. SwapValues( side[i], side[j] );
  1468. SwapValues( permuted[i], permuted[j] );
  1469. }
  1470. /*
  1471. ========================
  1472. idLCP_Square::AddClamped
  1473. ========================
  1474. */
  1475. void idLCP_Square::AddClamped( int r ) {
  1476. assert( r >= numClamped );
  1477. // add a row at the bottom and a column at the right of the factored
  1478. // matrix for the clamped variables
  1479. Swap( numClamped, r );
  1480. // add row to L
  1481. for ( int i = 0; i < numClamped; i++ ) {
  1482. float sum = rowPtrs[numClamped][i];
  1483. for ( int j = 0; j < i; j++ ) {
  1484. sum -= clamped[numClamped][j] * clamped[j][i];
  1485. }
  1486. clamped[numClamped][i] = sum * diagonal[i];
  1487. }
  1488. // add column to U
  1489. for ( int i = 0; i <= numClamped; i++ ) {
  1490. float sum = rowPtrs[i][numClamped];
  1491. for ( int j = 0; j < i; j++ ) {
  1492. sum -= clamped[i][j] * clamped[j][numClamped];
  1493. }
  1494. clamped[i][numClamped] = sum;
  1495. }
  1496. diagonal[numClamped] = 1.0f / clamped[numClamped][numClamped];
  1497. numClamped++;
  1498. }
  1499. /*
  1500. ========================
  1501. idLCP_Square::RemoveClamped
  1502. ========================
  1503. */
  1504. void idLCP_Square::RemoveClamped( int r ) {
  1505. if ( !verify( r < numClamped ) ) {
  1506. // complete fail, most likely due to exceptional floating point values
  1507. return;
  1508. }
  1509. numClamped--;
  1510. // no need to swap and update the factored matrix when the last row and column are removed
  1511. if ( r == numClamped ) {
  1512. return;
  1513. }
  1514. float * y0 = (float *) _alloca16( numClamped * sizeof( float ) );
  1515. float * z0 = (float *) _alloca16( numClamped * sizeof( float ) );
  1516. float * y1 = (float *) _alloca16( numClamped * sizeof( float ) );
  1517. float * z1 = (float *) _alloca16( numClamped * sizeof( float ) );
  1518. // the row/column need to be subtracted from the factorization
  1519. for ( int i = 0; i < numClamped; i++ ) {
  1520. y0[i] = -rowPtrs[i][r];
  1521. }
  1522. memset( y1, 0, numClamped * sizeof( float ) );
  1523. y1[r] = 1.0f;
  1524. memset( z0, 0, numClamped * sizeof( float ) );
  1525. z0[r] = 1.0f;
  1526. for ( int i = 0; i < numClamped; i++ ) {
  1527. z1[i] = -rowPtrs[r][i];
  1528. }
  1529. // swap the to be removed row/column with the last row/column
  1530. Swap( r, numClamped );
  1531. // the swapped last row/column need to be added to the factorization
  1532. for ( int i = 0; i < numClamped; i++ ) {
  1533. y0[i] += rowPtrs[i][r];
  1534. }
  1535. for ( int i = 0; i < numClamped; i++ ) {
  1536. z1[i] += rowPtrs[r][i];
  1537. }
  1538. z1[r] = 0.0f;
  1539. // update the beginning of the to be updated row and column
  1540. for ( int i = 0; i < r; i++ ) {
  1541. float p0 = y0[i];
  1542. float beta1 = z1[i] * diagonal[i];
  1543. clamped[i][r] += p0;
  1544. for ( int j = i+1; j < numClamped; j++ ) {
  1545. z1[j] -= beta1 * clamped[i][j];
  1546. }
  1547. for ( int j = i+1; j < numClamped; j++ ) {
  1548. y0[j] -= p0 * clamped[j][i];
  1549. }
  1550. clamped[r][i] += beta1;
  1551. }
  1552. // update the lower right corner starting at r,r
  1553. for ( int i = r; i < numClamped; i++ ) {
  1554. float diag = clamped[i][i];
  1555. float p0 = y0[i];
  1556. float p1 = z0[i];
  1557. diag += p0 * p1;
  1558. if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
  1559. idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
  1560. diag = idMath::FLT_SMALLEST_NON_DENORMAL;
  1561. }
  1562. float beta0 = p1 / diag;
  1563. float q0 = y1[i];
  1564. float q1 = z1[i];
  1565. diag += q0 * q1;
  1566. if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
  1567. idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
  1568. diag = idMath::FLT_SMALLEST_NON_DENORMAL;
  1569. }
  1570. float d = 1.0f / diag;
  1571. float beta1 = q1 * d;
  1572. clamped[i][i] = diag;
  1573. diagonal[i] = d;
  1574. for ( int j = i+1; j < numClamped; j++ ) {
  1575. d = clamped[i][j];
  1576. d += p0 * z0[j];
  1577. z0[j] -= beta0 * d;
  1578. d += q0 * z1[j];
  1579. z1[j] -= beta1 * d;
  1580. clamped[i][j] = d;
  1581. }
  1582. for ( int j = i+1; j < numClamped; j++ ) {
  1583. d = clamped[j][i];
  1584. y0[j] -= p0 * d;
  1585. d += beta0 * y0[j];
  1586. y1[j] -= q0 * d;
  1587. d += beta1 * y1[j];
  1588. clamped[j][i] = d;
  1589. }
  1590. }
  1591. return;
  1592. }
  1593. /*
  1594. ========================
  1595. idLCP_Square::CalcForceDelta
  1596. Modifies this->delta_f.
  1597. ========================
  1598. */
  1599. void idLCP_Square::CalcForceDelta( int d, float dir ) {
  1600. delta_f[d] = dir;
  1601. if ( numClamped <= 0 ) {
  1602. return;
  1603. }
  1604. // get column d of matrix
  1605. float * ptr = (float *) _alloca16( numClamped * sizeof( float ) );
  1606. for ( int i = 0; i < numClamped; i++ ) {
  1607. ptr[i] = rowPtrs[i][d];
  1608. }
  1609. // solve force delta
  1610. SolveClamped( delta_f, ptr );
  1611. // flip force delta based on direction
  1612. if ( dir > 0.0f ) {
  1613. ptr = delta_f.ToFloatPtr();
  1614. for ( int i = 0; i < numClamped; i++ ) {
  1615. ptr[i] = - ptr[i];
  1616. }
  1617. }
  1618. }
  1619. /*
  1620. ========================
  1621. idLCP_Square::CalcAccelDelta
  1622. Modifies this->delta_a and uses this->delta_f.
  1623. ========================
  1624. */
  1625. ID_INLINE void idLCP_Square::CalcAccelDelta( int d ) {
  1626. // only the not clamped variables, including the current variable, can have a change in acceleration
  1627. for ( int j = numClamped; j <= d; j++ ) {
  1628. // only the clamped variables and the current variable have a force delta unequal zero
  1629. float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
  1630. delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
  1631. }
  1632. }
  1633. /*
  1634. ========================
  1635. idLCP_Square::ChangeForce
  1636. Modifies this->f and uses this->delta_f.
  1637. ========================
  1638. */
  1639. ID_INLINE void idLCP_Square::ChangeForce( int d, float step ) {
  1640. // only the clamped variables and current variable have a force delta unequal zero
  1641. MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
  1642. f[d] += step * delta_f[d];
  1643. }
  1644. /*
  1645. ========================
  1646. idLCP_Square::ChangeAccel
  1647. Modifies this->a and uses this->delta_a.
  1648. ========================
  1649. */
  1650. ID_INLINE void idLCP_Square::ChangeAccel( int d, float step ) {
  1651. // only the not clamped variables, including the current variable, can have an acceleration unequal zero
  1652. MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
  1653. }
  1654. /*
  1655. ========================
  1656. idLCP_Square::Solve
  1657. ========================
  1658. */
  1659. bool idLCP_Square::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
  1660. // true when the matrix rows are 16 byte padded
  1661. padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
  1662. assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
  1663. assert( o_x.GetSize() == o_m.GetNumRows() );
  1664. assert( o_b.GetSize() == o_m.GetNumRows() );
  1665. assert( o_lo.GetSize() == o_m.GetNumRows() );
  1666. assert( o_hi.GetSize() == o_m.GetNumRows() );
  1667. // allocate memory for permuted input
  1668. f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
  1669. a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
  1670. b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
  1671. lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
  1672. hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
  1673. if ( o_boxIndex != NULL ) {
  1674. boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
  1675. memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
  1676. } else {
  1677. boxIndex = NULL;
  1678. }
  1679. // we override the const on o_m here but on exit the matrix is unchanged
  1680. m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast<float *>(o_m[0]) );
  1681. f.Zero();
  1682. a.Zero();
  1683. b = o_b;
  1684. lo = o_lo;
  1685. hi = o_hi;
  1686. // pointers to the rows of m
  1687. rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
  1688. for ( int i = 0; i < m.GetNumRows(); i++ ) {
  1689. rowPtrs[i] = m[i];
  1690. }
  1691. // tells if a variable is at the low boundary, high boundary or inbetween
  1692. side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
  1693. // index to keep track of the permutation
  1694. permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
  1695. for ( int i = 0; i < m.GetNumRows(); i++ ) {
  1696. permuted[i] = i;
  1697. }
  1698. // permute input so all unbounded variables come first
  1699. numUnbounded = 0;
  1700. for ( int i = 0; i < m.GetNumRows(); i++ ) {
  1701. if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
  1702. if ( numUnbounded != i ) {
  1703. Swap( numUnbounded, i );
  1704. }
  1705. numUnbounded++;
  1706. }
  1707. }
  1708. // permute input so all variables using the boxIndex come last
  1709. int boxStartIndex = m.GetNumRows();
  1710. if ( boxIndex ) {
  1711. for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
  1712. if ( boxIndex[i] >= 0 ) {
  1713. boxStartIndex--;
  1714. if ( boxStartIndex != i ) {
  1715. Swap( boxStartIndex, i );
  1716. }
  1717. }
  1718. }
  1719. }
  1720. // sub matrix for factorization
  1721. clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) );
  1722. diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
  1723. // all unbounded variables are clamped
  1724. numClamped = numUnbounded;
  1725. // if there are unbounded variables
  1726. if ( numUnbounded ) {
  1727. // factor and solve for unbounded variables
  1728. if ( !FactorClamped() ) {
  1729. idLib::Printf( "idLCP_Square::Solve: unbounded factorization failed\n" );
  1730. return false;
  1731. }
  1732. SolveClamped( f, b.ToFloatPtr() );
  1733. // if there are no bounded variables we are done
  1734. if ( numUnbounded == m.GetNumRows() ) {
  1735. o_x = f; // the vector is not permuted
  1736. return true;
  1737. }
  1738. }
  1739. int numIgnored = 0;
  1740. // allocate for delta force and delta acceleration
  1741. delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
  1742. delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
  1743. // solve for bounded variables
  1744. idStr failed;
  1745. for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) {
  1746. // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
  1747. if ( i == boxStartIndex ) {
  1748. for ( int j = 0; j < boxStartIndex; j++ ) {
  1749. o_x[permuted[j]] = f[j];
  1750. }
  1751. for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) {
  1752. float s = o_x[boxIndex[j]];
  1753. if ( lo[j] != -idMath::INFINITY ) {
  1754. lo[j] = - idMath::Fabs( lo[j] * s );
  1755. }
  1756. if ( hi[j] != idMath::INFINITY ) {
  1757. hi[j] = idMath::Fabs( hi[j] * s );
  1758. }
  1759. }
  1760. }
  1761. // calculate acceleration for current variable
  1762. float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i );
  1763. a[i] = dot - b[i];
  1764. // if already at the low boundary
  1765. if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
  1766. side[i] = -1;
  1767. continue;
  1768. }
  1769. // if already at the high boundary
  1770. if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
  1771. side[i] = 1;
  1772. continue;
  1773. }
  1774. // if inside the clamped region
  1775. if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
  1776. side[i] = 0;
  1777. AddClamped( i );
  1778. continue;
  1779. }
  1780. // drive the current variable into a valid region
  1781. int n = 0;
  1782. for ( ; n < maxIterations; n++ ) {
  1783. // direction to move
  1784. float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f;
  1785. // calculate force delta
  1786. CalcForceDelta( i, dir );
  1787. // calculate acceleration delta: delta_a = m * delta_f;
  1788. CalcAccelDelta( i );
  1789. float maxStep;
  1790. int limit;
  1791. int limitSide;
  1792. // maximum step we can take
  1793. GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(),
  1794. lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped,
  1795. i, dir, maxStep, limit, limitSide );
  1796. if ( maxStep <= 0.0f ) {
  1797. #ifdef IGNORE_UNSATISFIABLE_VARIABLES
  1798. // ignore the current variable completely
  1799. lo[i] = hi[i] = 0.0f;
  1800. f[i] = 0.0f;
  1801. side[i] = -1;
  1802. numIgnored++;
  1803. #else
  1804. failed.Format( "invalid step size %.4f", maxStep );
  1805. for ( int j = i; j < m.GetNumRows(); j++ ) {
  1806. f[j] = 0.0f;
  1807. }
  1808. numIgnored = m.GetNumRows() - i;
  1809. #endif
  1810. break;
  1811. }
  1812. // change force
  1813. ChangeForce( i, maxStep );
  1814. // change acceleration
  1815. ChangeAccel( i, maxStep );
  1816. // clamp/unclamp the variable that limited this step
  1817. side[limit] = limitSide;
  1818. if ( limitSide == 0 ) {
  1819. a[limit] = 0.0f;
  1820. AddClamped( limit );
  1821. } else if ( limitSide == -1 ) {
  1822. f[limit] = lo[limit];
  1823. if ( limit != i ) {
  1824. RemoveClamped( limit );
  1825. }
  1826. } else if ( limitSide == 1 ) {
  1827. f[limit] = hi[limit];
  1828. if ( limit != i ) {
  1829. RemoveClamped( limit );
  1830. }
  1831. }
  1832. // if the current variable limited the step we can continue with the next variable
  1833. if ( limit == i ) {
  1834. break;
  1835. }
  1836. }
  1837. if ( n >= maxIterations ) {
  1838. failed.Format( "max iterations %d", maxIterations );
  1839. break;
  1840. }
  1841. if ( failed.Length() ) {
  1842. break;
  1843. }
  1844. }
  1845. #ifdef IGNORE_UNSATISFIABLE_VARIABLES
  1846. if ( numIgnored ) {
  1847. if ( lcp_showFailures.GetBool() ) {
  1848. idLib::Printf( "idLCP_Square::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
  1849. }
  1850. }
  1851. #endif
  1852. // if failed clear remaining forces
  1853. if ( failed.Length() ) {
  1854. if ( lcp_showFailures.GetBool() ) {
  1855. idLib::Printf( "idLCP_Square::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded );
  1856. }
  1857. }
  1858. #if defined(_DEBUG) && 0
  1859. if ( failed.Length() ) {
  1860. // test whether or not the solution satisfies the complementarity conditions
  1861. for ( int i = 0; i < m.GetNumRows(); i++ ) {
  1862. a[i] = -b[i];
  1863. for ( int j = 0; j < m.GetNumRows(); j++ ) {
  1864. a[i] += rowPtrs[i][j] * f[j];
  1865. }
  1866. if ( f[i] == lo[i] ) {
  1867. if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
  1868. int bah1 = 1;
  1869. }
  1870. } else if ( f[i] == hi[i] ) {
  1871. if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
  1872. int bah2 = 1;
  1873. }
  1874. } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
  1875. int bah3 = 1;
  1876. }
  1877. }
  1878. }
  1879. #endif
  1880. // unpermute result
  1881. for ( int i = 0; i < f.GetSize(); i++ ) {
  1882. o_x[permuted[i]] = f[i];
  1883. }
  1884. return true;
  1885. }
  1886. /*
  1887. ================================================================================================
  1888. idLCP_Symmetric
  1889. ================================================================================================
  1890. */
  1891. /*
  1892. ================================================
  1893. idLCP_Symmetric
  1894. ================================================
  1895. */
  1896. class idLCP_Symmetric : public idLCP {
  1897. public:
  1898. virtual bool Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
  1899. private:
  1900. idMatX m; // original matrix
  1901. idVecX b; // right hand side
  1902. idVecX lo, hi; // low and high bounds
  1903. idVecX f, a; // force and acceleration
  1904. idVecX delta_f, delta_a; // delta force and delta acceleration
  1905. idMatX clamped; // LDLt factored sub matrix for clamped variables
  1906. idVecX diagonal; // reciprocal of diagonal of LDLt factored sub matrix for clamped variables
  1907. idVecX solveCache1; // intermediate result cached in SolveClamped
  1908. idVecX solveCache2; // "
  1909. int numUnbounded; // number of unbounded variables
  1910. int numClamped; // number of clamped variables
  1911. int clampedChangeStart; // lowest row/column changed in the clamped matrix during an iteration
  1912. float ** rowPtrs; // pointers to the rows of m
  1913. int * boxIndex; // box index
  1914. int * side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
  1915. int * permuted; // index to keep track of the permutation
  1916. bool padded; // set to true if the rows of the initial matrix are 16 byte padded
  1917. bool FactorClamped();
  1918. void SolveClamped( idVecX &x, const float *b );
  1919. void Swap( int i, int j );
  1920. void AddClamped( int r, bool useSolveCache );
  1921. void RemoveClamped( int r );
  1922. void CalcForceDelta( int d, float dir );
  1923. void CalcAccelDelta( int d );
  1924. void ChangeForce( int d, float step );
  1925. void ChangeAccel( int d, float step );
  1926. };
  1927. /*
  1928. ========================
  1929. idLCP_Symmetric::FactorClamped
  1930. ========================
  1931. */
  1932. bool idLCP_Symmetric::FactorClamped() {
  1933. clampedChangeStart = 0;
  1934. for ( int i = 0; i < numClamped; i++ ) {
  1935. memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
  1936. }
  1937. return LDLT_Factor( clamped, diagonal, numClamped );
  1938. }
  1939. /*
  1940. ========================
  1941. idLCP_Symmetric::SolveClamped
  1942. ========================
  1943. */
  1944. void idLCP_Symmetric::SolveClamped( idVecX &x, const float *b ) {
  1945. // solve L
  1946. LowerTriangularSolve( clamped, solveCache1.ToFloatPtr(), b, numClamped, clampedChangeStart );
  1947. // scale with D
  1948. Multiply( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), diagonal.ToFloatPtr(), numClamped );
  1949. // solve Lt
  1950. LowerTriangularSolveTranspose( clamped, x.ToFloatPtr(), solveCache2.ToFloatPtr(), numClamped );
  1951. clampedChangeStart = numClamped;
  1952. }
  1953. /*
  1954. ========================
  1955. idLCP_Symmetric::Swap
  1956. ========================
  1957. */
  1958. void idLCP_Symmetric::Swap( int i, int j ) {
  1959. if ( i == j ) {
  1960. return;
  1961. }
  1962. SwapValues( rowPtrs[i], rowPtrs[j] );
  1963. m.SwapColumns( i, j );
  1964. b.SwapElements( i, j );
  1965. lo.SwapElements( i, j );
  1966. hi.SwapElements( i, j );
  1967. a.SwapElements( i, j );
  1968. f.SwapElements( i, j );
  1969. if ( boxIndex != NULL ) {
  1970. SwapValues( boxIndex[i], boxIndex[j] );
  1971. }
  1972. SwapValues( side[i], side[j] );
  1973. SwapValues( permuted[i], permuted[j] );
  1974. }
  1975. /*
  1976. ========================
  1977. idLCP_Symmetric::AddClamped
  1978. ========================
  1979. */
  1980. void idLCP_Symmetric::AddClamped( int r, bool useSolveCache ) {
  1981. assert( r >= numClamped );
  1982. if ( numClamped < clampedChangeStart ) {
  1983. clampedChangeStart = numClamped;
  1984. }
  1985. // add a row at the bottom and a column at the right of the factored
  1986. // matrix for the clamped variables
  1987. Swap( numClamped, r );
  1988. // solve for v in L * v = rowPtr[numClamped]
  1989. float dot;
  1990. if ( useSolveCache ) {
  1991. // the lower triangular solve was cached in SolveClamped called by CalcForceDelta
  1992. memcpy( clamped[numClamped], solveCache2.ToFloatPtr(), numClamped * sizeof( float ) );
  1993. // calculate row dot product
  1994. dot = BigDotProduct( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), numClamped );
  1995. } else {
  1996. float *v = (float *) _alloca16( numClamped * sizeof( float ) );
  1997. LowerTriangularSolve( clamped, v, rowPtrs[numClamped], numClamped, 0 );
  1998. // add bottom row to L
  1999. Multiply( clamped[numClamped], v, diagonal.ToFloatPtr(), numClamped );
  2000. // calculate row dot product
  2001. dot = BigDotProduct( clamped[numClamped], v, numClamped );
  2002. }
  2003. // update diagonal[numClamped]
  2004. float d = rowPtrs[numClamped][numClamped] - dot;
  2005. if ( fabs( d ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
  2006. idLib::Printf( "idLCP_Symmetric::AddClamped: updating factorization failed\n" );
  2007. d = idMath::FLT_SMALLEST_NON_DENORMAL;
  2008. }
  2009. clamped[numClamped][numClamped] = d;
  2010. diagonal[numClamped] = 1.0f / d;
  2011. numClamped++;
  2012. }
/*
========================
idLCP_Symmetric::RemoveClamped

Removes the clamped variable with (permuted) index r from the factored sub
matrix. The row/column is swapped with the last clamped row/column and the
factorization stored in 'clamped' (lower triangular factor, with the
reciprocal diagonal cached in 'diagonal') is repaired in place with a
simultaneous rank-one update and downdate instead of refactoring from scratch.
========================
*/
void idLCP_Symmetric::RemoveClamped( int r ) {
	if ( !verify( r < numClamped ) ) {
		// complete fail, most likely due to exceptional floating point values
		return;
	}
	// everything from row/column r onwards will be modified
	if ( r < clampedChangeStart ) {
		clampedChangeStart = r;
	}
	numClamped--;
	// no need to swap and update the factored matrix when the last row and column are removed
	if ( r == numClamped ) {
		return;
	}
	// swap the to be removed row/column with the last row/column
	Swap( r, numClamped );
	// update the factored matrix
	float * addSub = (float *) _alloca16( numClamped * sizeof( float ) );
	if ( r == 0 ) {
		if ( numClamped == 1 ) {
			// only one clamped variable left: the factorization is just the (guarded) diagonal entry
			float diag = rowPtrs[0][0];
			if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
				idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
				diag = idMath::FLT_SMALLEST_NON_DENORMAL;
			}
			clamped[0][0] = diag;
			diagonal[0] = 1.0f / diag;
			return;
		}
		// calculate the row/column to be added to the lower right sub matrix starting at (r, r)
		float * original = rowPtrs[numClamped];
		float * ptr = rowPtrs[r];
		addSub[0] = ptr[0] - original[numClamped];
		for ( int i = 1; i < numClamped; i++ ) {
			addSub[i] = ptr[i] - original[i];
		}
	} else {
		float * v = (float *) _alloca16( numClamped * sizeof( float ) );
		// solve for v in L * v = rowPtr[r]
		LowerTriangularSolve( clamped, v, rowPtrs[r], r, 0 );
		// update removed row
		Multiply( clamped[r], v, diagonal.ToFloatPtr(), r );
		// if the last row/column of the matrix is updated
		if ( r == numClamped - 1 ) {
			// only calculate new diagonal
			float dot = BigDotProduct( clamped[r], v, r );
			float diag = rowPtrs[r][r] - dot;
			if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
				idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
				diag = idMath::FLT_SMALLEST_NON_DENORMAL;
			}
			clamped[r][r] = diag;
			diagonal[r] = 1.0f / diag;
			return;
		}
		// calculate the row/column to be added to the lower right sub matrix starting at (r, r)
		for ( int i = 0; i < r; i++ ) {
			v[i] = clamped[r][i] * clamped[i][i];
		}
		for ( int i = r; i < numClamped; i++ ) {
			float sum;
			if ( i == r ) {
				sum = clamped[r][r];
			} else {
				sum = clamped[r][r] * clamped[i][r];
			}
			float * ptr = clamped[i];
			for ( int j = 0; j < r; j++ ) {
				sum += ptr[j] * v[j];
			}
			addSub[i] = rowPtrs[r][i] - sum;
		}
	}
	// add row/column to the lower right sub matrix starting at (r, r)
	float * v1 = (float *) _alloca16( numClamped * sizeof( float ) );
	float * v2 = (float *) _alloca16( numClamped * sizeof( float ) );
	// split addSub into a rank-one update vector v1 (alpha1 = +1) and a
	// rank-one downdate vector v2 (alpha2 = -1) that are applied simultaneously below
	float diag = idMath::SQRT_1OVER2;
	v1[r] = ( 0.5f * addSub[r] + 1.0f ) * diag;
	v2[r] = ( 0.5f * addSub[r] - 1.0f ) * diag;
	for ( int i = r+1; i < numClamped; i++ ) {
		v1[i] = v2[i] = addSub[i] * diag;
	}
	float alpha1 = 1.0f;
	float alpha2 = -1.0f;
	// simultaneous update/downdate of the sub matrix starting at (r, r)
	int n = clamped.GetNumColumns();
	for ( int i = r; i < numClamped; i++ ) {
		diag = clamped[i][i];
		float p1 = v1[i];
		float newDiag = diag + alpha1 * p1 * p1;
		// guard against a vanishing pivot; clamp instead of dividing by ~zero
		if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
			idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
			newDiag = idMath::FLT_SMALLEST_NON_DENORMAL;
		}
		alpha1 /= newDiag;
		float beta1 = p1 * alpha1;
		alpha1 *= diag;
		diag = newDiag;
		float p2 = v2[i];
		newDiag = diag + alpha2 * p2 * p2;
		if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
			idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
			newDiag = idMath::FLT_SMALLEST_NON_DENORMAL;
		}
		clamped[i][i] = newDiag;
		float invNewDiag = 1.0f / newDiag;
		diagonal[i] = invNewDiag;
		alpha2 *= invNewDiag;
		float beta2 = p2 * alpha2;
		alpha2 *= diag;
		// update column below diagonal (i,i), two rows per iteration
		float * ptr = clamped.ToFloatPtr() + i;
		int j;
		for ( j = i+1; j < numClamped - 1; j += 2 ) {
			float sum0 = ptr[(j+0)*n];
			float sum1 = ptr[(j+1)*n];
			v1[j+0] -= p1 * sum0;
			v1[j+1] -= p1 * sum1;
			sum0 += beta1 * v1[j+0];
			sum1 += beta1 * v1[j+1];
			v2[j+0] -= p2 * sum0;
			v2[j+1] -= p2 * sum1;
			sum0 += beta2 * v2[j+0];
			sum1 += beta2 * v2[j+1];
			ptr[(j+0)*n] = sum0;
			ptr[(j+1)*n] = sum1;
		}
		// odd remainder row
		for ( ; j < numClamped; j++ ) {
			float sum = ptr[j*n];
			v1[j] -= p1 * sum;
			sum += beta1 * v1[j];
			v2[j] -= p2 * sum;
			sum += beta2 * v2[j];
			ptr[j*n] = sum;
		}
	}
}
  2154. /*
  2155. ========================
  2156. idLCP_Symmetric::CalcForceDelta
  2157. Modifies this->delta_f.
  2158. ========================
  2159. */
  2160. ID_INLINE void idLCP_Symmetric::CalcForceDelta( int d, float dir ) {
  2161. delta_f[d] = dir;
  2162. if ( numClamped == 0 ) {
  2163. return;
  2164. }
  2165. // solve force delta
  2166. SolveClamped( delta_f, rowPtrs[d] );
  2167. // flip force delta based on direction
  2168. if ( dir > 0.0f ) {
  2169. float * ptr = delta_f.ToFloatPtr();
  2170. for ( int i = 0; i < numClamped; i++ ) {
  2171. ptr[i] = - ptr[i];
  2172. }
  2173. }
  2174. }
  2175. /*
  2176. ========================
  2177. idLCP_Symmetric::CalcAccelDelta
  2178. Modifies this->delta_a and uses this->delta_f.
  2179. ========================
  2180. */
  2181. ID_INLINE void idLCP_Symmetric::CalcAccelDelta( int d ) {
  2182. // only the not clamped variables, including the current variable, can have a change in acceleration
  2183. for ( int j = numClamped; j <= d; j++ ) {
  2184. // only the clamped variables and the current variable have a force delta unequal zero
  2185. float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
  2186. delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
  2187. }
  2188. }
  2189. /*
  2190. ========================
  2191. idLCP_Symmetric::ChangeForce
  2192. Modifies this->f and uses this->delta_f.
  2193. ========================
  2194. */
  2195. ID_INLINE void idLCP_Symmetric::ChangeForce( int d, float step ) {
  2196. // only the clamped variables and current variable have a force delta unequal zero
  2197. MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
  2198. f[d] += step * delta_f[d];
  2199. }
  2200. /*
  2201. ========================
  2202. idLCP_Symmetric::ChangeAccel
  2203. Modifies this->a and uses this->delta_a.
  2204. ========================
  2205. */
  2206. ID_INLINE void idLCP_Symmetric::ChangeAccel( int d, float step ) {
  2207. // only the not clamped variables, including the current variable, can have an acceleration unequal zero
  2208. MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
  2209. }
/*
========================
idLCP_Symmetric::Solve

Solves the boxed Linear Complementarity Problem for the symmetric matrix o_m:
on success o_x holds forces such that for each variable either the
acceleration a = o_m * x - o_b is (near) zero with lo <= x <= hi, or x sits
on a bound. If o_boxIndex is non-NULL, entries with boxIndex[i] >= 0 have
their bounds scaled by the magnitude of the already-solved force of variable
boxIndex[i]. Unbounded variables (lo == -INF, hi == INF) are solved directly
via factorization; bounded variables are driven in one at a time.
Returns false only when the factorization of the unbounded variables fails;
a partial solution (with some variables ignored/zeroed) still returns true.
========================
*/
bool idLCP_Symmetric::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
	// true when the matrix rows are 16 byte padded
	padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();

	assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
	assert( o_x.GetSize() == o_m.GetNumRows() );
	assert( o_b.GetSize() == o_m.GetNumRows() );
	assert( o_lo.GetSize() == o_m.GetNumRows() );
	assert( o_hi.GetSize() == o_m.GetNumRows() );

	// allocate memory for permuted input
	f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
	a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
	b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
	lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
	hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
	if ( o_boxIndex != NULL ) {
		// local copy because the box indices are permuted along with the rows
		boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
		memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
	} else {
		boxIndex = NULL;
	}

	// we override the const on o_m here but on exit the matrix is unchanged
	m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast< float * >( o_m[0] ) );
	f.Zero();
	a.Zero();
	b = o_b;
	lo = o_lo;
	hi = o_hi;

	// pointers to the rows of m
	rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
	for ( int i = 0; i < m.GetNumRows(); i++ ) {
		rowPtrs[i] = m[i];
	}

	// tells if a variable is at the low boundary, high boundary or inbetween
	side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );

	// index to keep track of the permutation
	permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
	for ( int i = 0; i < m.GetNumRows(); i++ ) {
		permuted[i] = i;
	}

	// permute input so all unbounded variables come first
	numUnbounded = 0;
	for ( int i = 0; i < m.GetNumRows(); i++ ) {
		if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
			if ( numUnbounded != i ) {
				Swap( numUnbounded, i );
			}
			numUnbounded++;
		}
	}

	// permute input so all variables using the boxIndex come last
	int boxStartIndex = m.GetNumRows();
	if ( boxIndex != NULL ) {
		for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
			if ( boxIndex[i] >= 0 ) {
				boxStartIndex--;
				if ( boxStartIndex != i ) {
					Swap( boxStartIndex, i );
				}
			}
		}
	}

	// sub matrix for factorization
	clamped.SetDataCacheLines( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA_CACHE_LINES( m.GetNumRows() * m.GetNumColumns() ), true );
	diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
	solveCache1.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
	solveCache2.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );

	// all unbounded variables are clamped
	numClamped = numUnbounded;

	// if there are unbounded variables
	if ( numUnbounded ) {
		// factor and solve for unbounded variables
		if ( !FactorClamped() ) {
			idLib::Printf( "idLCP_Symmetric::Solve: unbounded factorization failed\n" );
			return false;
		}
		SolveClamped( f, b.ToFloatPtr() );

		// if there are no bounded variables we are done
		if ( numUnbounded == m.GetNumRows() ) {
			o_x = f;	// the vector is not permuted
			return true;
		}
	}

	int numIgnored = 0;

	// allocate for delta force and delta acceleration
	delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
	delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );

	// solve for bounded variables, driving them in one at a time
	idStr failed;
	for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) {

		clampedChangeStart = 0;

		// once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
		if ( i == boxStartIndex ) {
			// publish the forces solved so far; the box-indexed bounds depend on them
			for ( int j = 0; j < boxStartIndex; j++ ) {
				o_x[permuted[j]] = f[j];
			}
			for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) {
				float s = o_x[boxIndex[j]];
				if ( lo[j] != -idMath::INFINITY ) {
					lo[j] = - idMath::Fabs( lo[j] * s );
				}
				if ( hi[j] != idMath::INFINITY ) {
					hi[j] = idMath::Fabs( hi[j] * s );
				}
			}
		}

		// calculate acceleration for current variable
		float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i );
		a[i] = dot - b[i];

		// if already at the low boundary
		if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
			side[i] = -1;
			continue;
		}

		// if already at the high boundary
		if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
			side[i] = 1;
			continue;
		}

		// if inside the clamped region
		if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
			side[i] = 0;
			AddClamped( i, false );
			continue;
		}

		// drive the current variable into a valid region
		int n = 0;
		for ( ; n < maxIterations; n++ ) {

			// direction to move
			float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f;

			// calculate force delta
			CalcForceDelta( i, dir );

			// calculate acceleration delta: delta_a = m * delta_f;
			CalcAccelDelta( i );

			float maxStep;
			int limit;
			int limitSide;

			// maximum step we can take before some variable hits a bound or zero acceleration
			GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(),
						lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped,
						i, dir, maxStep, limit, limitSide );

			if ( maxStep <= 0.0f ) {
#ifdef IGNORE_UNSATISFIABLE_VARIABLES
				// ignore the current variable completely
				lo[i] = hi[i] = 0.0f;
				f[i] = 0.0f;
				side[i] = -1;
				numIgnored++;
#else
				failed.Format( "invalid step size %.4f", maxStep );
				for ( int j = i; j < m.GetNumRows(); j++ ) {
					f[j] = 0.0f;
				}
				numIgnored = m.GetNumRows() - i;
#endif
				break;
			}

			// change force
			ChangeForce( i, maxStep );

			// change acceleration
			ChangeAccel( i, maxStep );

			// clamp/unclamp the variable that limited this step
			side[limit] = limitSide;
			if ( limitSide == 0 ) {
				a[limit] = 0.0f;
				AddClamped( limit, ( limit == i ) );
			} else if ( limitSide == -1 ) {
				f[limit] = lo[limit];
				if ( limit != i ) {
					RemoveClamped( limit );
				}
			} else if ( limitSide == 1 ) {
				f[limit] = hi[limit];
				if ( limit != i ) {
					RemoveClamped( limit );
				}
			}

			// if the current variable limited the step we can continue with the next variable
			if ( limit == i ) {
				break;
			}
		}

		if ( n >= maxIterations ) {
			failed.Format( "max iterations %d", maxIterations );
			break;
		}

		if ( failed.Length() ) {
			break;
		}
	}

#ifdef IGNORE_UNSATISFIABLE_VARIABLES
	if ( numIgnored ) {
		if ( lcp_showFailures.GetBool() ) {
			idLib::Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
		}
	}
#endif

	// if failed clear remaining forces
	if ( failed.Length() ) {
		if ( lcp_showFailures.GetBool() ) {
			idLib::Printf( "idLCP_Symmetric::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded );
		}
	}

#if defined(_DEBUG) && 0
	if ( failed.Length() ) {
		// test whether or not the solution satisfies the complementarity conditions
		// NOTE(review): 'j' below is undeclared; this block is compiled out ("&& 0")
		// but needs 'int j' before the inner loop if it is ever enabled
		for ( int i = 0; i < m.GetNumRows(); i++ ) {
			a[i] = -b[i];
			for ( j = 0; j < m.GetNumRows(); j++ ) {
				a[i] += rowPtrs[i][j] * f[j];
			}
			if ( f[i] == lo[i] ) {
				if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
					int bah1 = 1;
				}
			} else if ( f[i] == hi[i] ) {
				if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
					int bah2 = 1;
				}
			} else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
				int bah3 = 1;
			}
		}
	}
#endif

	// unpermute result
	for ( int i = 0; i < f.GetSize(); i++ ) {
		o_x[permuted[i]] = f[i];
	}

	return true;
}
  2445. /*
  2446. ================================================================================================
  2447. idLCP
  2448. ================================================================================================
  2449. */
  2450. /*
  2451. ========================
  2452. idLCP::AllocSquare
  2453. ========================
  2454. */
  2455. idLCP *idLCP::AllocSquare() {
  2456. idLCP *lcp = new idLCP_Square;
  2457. lcp->SetMaxIterations( 32 );
  2458. return lcp;
  2459. }
  2460. /*
  2461. ========================
  2462. idLCP::AllocSymmetric
  2463. ========================
  2464. */
  2465. idLCP *idLCP::AllocSymmetric() {
  2466. idLCP *lcp = new idLCP_Symmetric;
  2467. lcp->SetMaxIterations( 32 );
  2468. return lcp;
  2469. }
/*
========================
idLCP::~idLCP

Empty by design: the solvers seen in this file keep no heap state between
Solve calls (working memory is stack-allocated with _alloca16/VECX_ALLOCA).
========================
*/
idLCP::~idLCP() {
}
  2477. /*
  2478. ========================
  2479. idLCP::SetMaxIterations
  2480. ========================
  2481. */
  2482. void idLCP::SetMaxIterations( int max ) {
  2483. maxIterations = max;
  2484. }
  2485. /*
  2486. ========================
  2487. idLCP::GetMaxIterations
  2488. ========================
  2489. */
  2490. int idLCP::GetMaxIterations() {
  2491. return maxIterations;
  2492. }
/*
========================
idLCP::Test_f

Console command callback: runs the self tests for the math routines used by
the LCP solvers. Compiles to a no-op unless ENABLE_TEST_CODE is defined.
========================
*/
void idLCP::Test_f( const idCmdArgs &args ) {
#ifdef ENABLE_TEST_CODE
	DotProduct_Test();
	LowerTriangularSolve_Test();
	LowerTriangularSolveTranspose_Test();
	LDLT_Factor_Test();
#endif
}