Simd_Generic.cpp 82 KB


  1. /*
  2. ===========================================================================
  3. Doom 3 GPL Source Code
  4. Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
  6. Doom 3 Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #include "../precompiled.h"
  21. #pragma hdrstop
  22. #include "Simd_Generic.h"
  23. //===============================================================
  24. //
  25. // Generic implementation of idSIMDProcessor
  26. //
  27. //===============================================================
  // Loop helpers used by the element-wise routines in this file.
  //
  // Each UNROLLn(Y) expects an integer named `count` in the enclosing scope and
  // expands to a braced block that evaluates `Y(index);` exactly once for every
  // index in [0, count), in ascending order. UNROLL2/4/8 process 2/4/8 indices
  // per main-loop iteration and finish the remainder with a shorter tail.
  // Y may be a function-like macro or any callable taking an int.
  #define UNROLL1(Y) { int _IX; for (_IX=0;_IX<count;_IX++) {Y(_IX);} }
  #define UNROLL2(Y) { int _IX, _NM = count&0xfffffffe; for (_IX=0;_IX<_NM;_IX+=2){Y(_IX+0);Y(_IX+1);} if (_IX < count) {Y(_IX);}}
  #define UNROLL4(Y) { int _IX, _NM = count&0xfffffffc; for (_IX=0;_IX<_NM;_IX+=4){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);}for(;_IX<count;_IX++){Y(_IX);}}
  #define UNROLL8(Y) { int _IX, _NM = count&0xfffffff8; for (_IX=0;_IX<_NM;_IX+=8){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);Y(_IX+4);Y(_IX+5);Y(_IX+6);Y(_IX+7);} _NM = count&0xfffffffe; for(;_IX<_NM;_IX+=2){Y(_IX); Y(_IX+1);} if (_IX < count) {Y(_IX);} }

  // NODEFAULT is written inside a switch whose default label must never be hit:
  //   - debug builds:   the default label asserts
  //   - MSVC release:   the default label is an optimizer hint (__assume)
  //   - everything else: expands to nothing
  // __assume is an MSVC intrinsic, so the release branch is keyed on _MSC_VER
  // rather than _WIN32; other Windows compilers do not provide it.
  #if defined( _DEBUG )
  #define NODEFAULT default: assert( 0 )
  #elif defined( _MSC_VER )
  #define NODEFAULT default: __assume( 0 )
  #else
  #define NODEFAULT
  #endif
  39. /*
  40. ============
  41. idSIMD_Generic::GetName
  42. ============
  43. */
  44. const char * idSIMD_Generic::GetName( void ) const {
  45. return "generic code";
  46. }
  47. /*
  48. ============
  49. idSIMD_Generic::Add
  50. dst[i] = constant + src[i];
  51. ============
  52. */
  53. void VPCALL idSIMD_Generic::Add( float *dst, const float constant, const float *src, const int count ) {
  54. #define OPER(X) dst[(X)] = src[(X)] + constant;
  55. UNROLL4(OPER)
  56. #undef OPER
  57. }
  58. /*
  59. ============
  60. idSIMD_Generic::Add
  61. dst[i] = src0[i] + src1[i];
  62. ============
  63. */
  64. void VPCALL idSIMD_Generic::Add( float *dst, const float *src0, const float *src1, const int count ) {
  65. #define OPER(X) dst[(X)] = src0[(X)] + src1[(X)];
  66. UNROLL4(OPER)
  67. #undef OPER
  68. }
  69. /*
  70. ============
  71. idSIMD_Generic::Sub
  72. dst[i] = constant - src[i];
  73. ============
  74. */
  75. void VPCALL idSIMD_Generic::Sub( float *dst, const float constant, const float *src, const int count ) {
  76. double c = constant;
  77. #define OPER(X) dst[(X)] = c - src[(X)];
  78. UNROLL4(OPER)
  79. #undef OPER
  80. }
  81. /*
  82. ============
  83. idSIMD_Generic::Sub
  84. dst[i] = src0[i] - src1[i];
  85. ============
  86. */
  87. void VPCALL idSIMD_Generic::Sub( float *dst, const float *src0, const float *src1, const int count ) {
  88. #define OPER(X) dst[(X)] = src0[(X)] - src1[(X)];
  89. UNROLL4(OPER)
  90. #undef OPER
  91. }
  92. /*
  93. ============
  94. idSIMD_Generic::Mul
  95. dst[i] = constant * src[i];
  96. ============
  97. */
  98. void VPCALL idSIMD_Generic::Mul( float *dst, const float constant, const float *src0, const int count) {
  99. double c = constant;
  100. #define OPER(X) (dst[(X)] = (c * src0[(X)]))
  101. UNROLL4(OPER)
  102. #undef OPER
  103. }
  104. /*
  105. ============
  106. idSIMD_Generic::Mul
  107. dst[i] = src0[i] * src1[i];
  108. ============
  109. */
  110. void VPCALL idSIMD_Generic::Mul( float *dst, const float *src0, const float *src1, const int count ) {
  111. #define OPER(X) (dst[(X)] = src0[(X)] * src1[(X)])
  112. UNROLL4(OPER)
  113. #undef OPER
  114. }
  115. /*
  116. ============
  117. idSIMD_Generic::Div
  118. dst[i] = constant / divisor[i];
  119. ============
  120. */
  121. void VPCALL idSIMD_Generic::Div( float *dst, const float constant, const float *divisor, const int count ) {
  122. double c = constant;
  123. #define OPER(X) (dst[(X)] = (c / divisor[(X)]))
  124. UNROLL4(OPER)
  125. #undef OPER
  126. }
  127. /*
  128. ============
  129. idSIMD_Generic::Div
  130. dst[i] = src0[i] / src1[i];
  131. ============
  132. */
  133. void VPCALL idSIMD_Generic::Div( float *dst, const float *src0, const float *src1, const int count ) {
  134. #define OPER(X) (dst[(X)] = src0[(X)] / src1[(X)])
  135. UNROLL4(OPER)
  136. #undef OPER
  137. }
  138. /*
  139. ============
  140. idSIMD_Generic::MulAdd
  141. dst[i] += constant * src[i];
  142. ============
  143. */
  144. void VPCALL idSIMD_Generic::MulAdd( float *dst, const float constant, const float *src, const int count ) {
  145. double c = constant;
  146. #define OPER(X) (dst[(X)] += c * src[(X)])
  147. UNROLL4(OPER)
  148. #undef OPER
  149. }
  150. /*
  151. ============
  152. idSIMD_Generic::MulAdd
  153. dst[i] += src0[i] * src1[i];
  154. ============
  155. */
  156. void VPCALL idSIMD_Generic::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
  157. #define OPER(X) (dst[(X)] += src0[(X)] * src1[(X)])
  158. UNROLL4(OPER)
  159. #undef OPER
  160. }
  161. /*
  162. ============
  163. idSIMD_Generic::MulSub
  164. dst[i] -= constant * src[i];
  165. ============
  166. */
  167. void VPCALL idSIMD_Generic::MulSub( float *dst, const float constant, const float *src, const int count ) {
  168. double c = constant;
  169. #define OPER(X) (dst[(X)] -= c * src[(X)])
  170. UNROLL4(OPER)
  171. #undef OPER
  172. }
  173. /*
  174. ============
  175. idSIMD_Generic::MulSub
  176. dst[i] -= src0[i] * src1[i];
  177. ============
  178. */
  179. void VPCALL idSIMD_Generic::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
  180. #define OPER(X) (dst[(X)] -= src0[(X)] * src1[(X)])
  181. UNROLL4(OPER)
  182. #undef OPER
  183. }
  184. /*
  185. ============
  186. idSIMD_Generic::Dot
  187. dst[i] = constant * src[i];
  188. ============
  189. */
  190. void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
  191. #define OPER(X) dst[(X)] = constant * src[(X)];
  192. UNROLL1(OPER)
  193. #undef OPER
  194. }
  195. /*
  196. ============
  197. idSIMD_Generic::Dot
  198. dst[i] = constant * src[i].Normal() + src[i][3];
  199. ============
  200. */
  201. void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
  202. #define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
  203. UNROLL1(OPER)
  204. #undef OPER
  205. }
  206. /*
  207. ============
  208. idSIMD_Generic::Dot
  209. dst[i] = constant * src[i].xyz;
  210. ============
  211. */
  212. void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
  213. #define OPER(X) dst[(X)] = constant * src[(X)].xyz;
  214. UNROLL1(OPER)
  215. #undef OPER
  216. }
  217. /*
  218. ============
  219. idSIMD_Generic::Dot
  220. dst[i] = constant.Normal() * src[i] + constant[3];
  221. ============
  222. */
  223. void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
  224. #define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
  225. UNROLL1(OPER)
  226. #undef OPER
  227. }
  228. /*
  229. ============
  230. idSIMD_Generic::Dot
  231. dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
  232. ============
  233. */
  234. void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
  235. #define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
  236. UNROLL1(OPER)
  237. #undef OPER
  238. }
  239. /*
  240. ============
  241. idSIMD_Generic::Dot
  242. dst[i] = constant.Normal() * src[i].xyz + constant[3];
  243. ============
  244. */
  245. void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
  246. #define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
  247. UNROLL1(OPER)
  248. #undef OPER
  249. }
  250. /*
  251. ============
  252. idSIMD_Generic::Dot
  253. dst[i] = src0[i] * src1[i];
  254. ============
  255. */
  256. void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
  257. #define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
  258. UNROLL1(OPER)
  259. #undef OPER
  260. }
  261. /*
  262. ============
  263. idSIMD_Generic::Dot
  264. dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
  265. ============
  266. */
  267. void VPCALL idSIMD_Generic::Dot( float &dot, const float *src1, const float *src2, const int count ) {
  268. #if 1
  269. switch( count ) {
  270. case 0: {
  271. dot = 0.0f;
  272. return;
  273. }
  274. case 1: {
  275. dot = src1[0] * src2[0];
  276. return;
  277. }
  278. case 2: {
  279. dot = src1[0] * src2[0] + src1[1] * src2[1];
  280. return;
  281. }
  282. case 3: {
  283. dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
  284. return;
  285. }
  286. default: {
  287. int i;
  288. double s0, s1, s2, s3;
  289. s0 = src1[0] * src2[0];
  290. s1 = src1[1] * src2[1];
  291. s2 = src1[2] * src2[2];
  292. s3 = src1[3] * src2[3];
  293. for ( i = 4; i < count-7; i += 8 ) {
  294. s0 += src1[i+0] * src2[i+0];
  295. s1 += src1[i+1] * src2[i+1];
  296. s2 += src1[i+2] * src2[i+2];
  297. s3 += src1[i+3] * src2[i+3];
  298. s0 += src1[i+4] * src2[i+4];
  299. s1 += src1[i+5] * src2[i+5];
  300. s2 += src1[i+6] * src2[i+6];
  301. s3 += src1[i+7] * src2[i+7];
  302. }
  303. switch( count - i ) {
  304. NODEFAULT;
  305. case 7: s0 += src1[i+6] * src2[i+6];
  306. case 6: s1 += src1[i+5] * src2[i+5];
  307. case 5: s2 += src1[i+4] * src2[i+4];
  308. case 4: s3 += src1[i+3] * src2[i+3];
  309. case 3: s0 += src1[i+2] * src2[i+2];
  310. case 2: s1 += src1[i+1] * src2[i+1];
  311. case 1: s2 += src1[i+0] * src2[i+0];
  312. case 0: break;
  313. }
  314. double sum;
  315. sum = s3;
  316. sum += s2;
  317. sum += s1;
  318. sum += s0;
  319. dot = sum;
  320. }
  321. }
  322. #else
  323. dot = 0.0f;
  324. for ( i = 0; i < count; i++ ) {
  325. dot += src1[i] * src2[i];
  326. }
  327. #endif
  328. }
  329. /*
  330. ============
  331. idSIMD_Generic::CmpGT
  332. dst[i] = src0[i] > constant;
  333. ============
  334. */
  335. void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
  336. #define OPER(X) dst[(X)] = src0[(X)] > constant;
  337. UNROLL4(OPER)
  338. #undef OPER
  339. }
  340. /*
  341. ============
  342. idSIMD_Generic::CmpGT
  343. dst[i] |= ( src0[i] > constant ) << bitNum;
  344. ============
  345. */
  346. void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  347. #define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
  348. UNROLL4(OPER)
  349. #undef OPER
  350. }
  351. /*
  352. ============
  353. idSIMD_Generic::CmpGE
  354. dst[i] = src0[i] >= constant;
  355. ============
  356. */
  357. void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
  358. #define OPER(X) dst[(X)] = src0[(X)] >= constant;
  359. UNROLL4(OPER)
  360. #undef OPER
  361. }
  362. /*
  363. ============
  364. idSIMD_Generic::CmpGE
  365. dst[i] |= ( src0[i] >= constant ) << bitNum;
  366. ============
  367. */
  368. void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  369. #define OPER(X) dst[(X)] |= ( src0[(X)] >= constant ) << bitNum;
  370. UNROLL4(OPER)
  371. #undef OPER
  372. }
  373. /*
  374. ============
  375. idSIMD_Generic::CmpLT
  376. dst[i] = src0[i] < constant;
  377. ============
  378. */
  379. void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
  380. #define OPER(X) dst[(X)] = src0[(X)] < constant;
  381. UNROLL4(OPER)
  382. #undef OPER
  383. }
  384. /*
  385. ============
  386. idSIMD_Generic::CmpLT
  387. dst[i] |= ( src0[i] < constant ) << bitNum;
  388. ============
  389. */
  390. void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  391. #define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
  392. UNROLL4(OPER)
  393. #undef OPER
  394. }
  395. /*
  396. ============
  397. idSIMD_Generic::CmpLE
  398. dst[i] = src0[i] <= constant;
  399. ============
  400. */
  401. void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
  402. #define OPER(X) dst[(X)] = src0[(X)] <= constant;
  403. UNROLL4(OPER)
  404. #undef OPER
  405. }
  406. /*
  407. ============
  408. idSIMD_Generic::CmpLE
  409. dst[i] |= ( src0[i] <= constant ) << bitNum;
  410. ============
  411. */
  412. void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  413. #define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
  414. UNROLL4(OPER)
  415. #undef OPER
  416. }
  417. /*
  418. ============
  419. idSIMD_Generic::MinMax
  420. ============
  421. */
  422. void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
  423. min = idMath::INFINITY; max = -idMath::INFINITY;
  424. #define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
  425. UNROLL1(OPER)
  426. #undef OPER
  427. }
  428. /*
  429. ============
  430. idSIMD_Generic::MinMax
  431. ============
  432. */
  433. void VPCALL idSIMD_Generic::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
  434. min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
  435. #define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
  436. UNROLL1(OPER)
  437. #undef OPER
  438. }
  439. /*
  440. ============
  441. idSIMD_Generic::MinMax
  442. ============
  443. */
  444. void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
  445. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  446. #define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
  447. UNROLL1(OPER)
  448. #undef OPER
  449. }
  450. /*
  451. ============
  452. idSIMD_Generic::MinMax
  453. ============
  454. */
  455. void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
  456. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  457. #define OPER(X) const idVec3 &v = src[(X)].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
  458. UNROLL1(OPER)
  459. #undef OPER
  460. }
  461. /*
  462. ============
  463. idSIMD_Generic::MinMax
  464. ============
  465. */
  466. void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
  467. min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
  468. #define OPER(X) const idVec3 &v = src[indexes[(X)]].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
  469. UNROLL1(OPER)
  470. #undef OPER
  471. }
  472. /*
  473. ============
  474. idSIMD_Generic::Clamp
  475. ============
  476. */
  477. void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
  478. #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
  479. UNROLL1(OPER)
  480. #undef OPER
  481. }
  482. /*
  483. ============
  484. idSIMD_Generic::ClampMin
  485. ============
  486. */
  487. void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) {
  488. #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
  489. UNROLL1(OPER)
  490. #undef OPER
  491. }
  492. /*
  493. ============
  494. idSIMD_Generic::ClampMax
  495. ============
  496. */
  497. void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) {
  498. #define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
  499. UNROLL1(OPER)
  500. #undef OPER
  501. }
  502. /*
  503. ================
  504. idSIMD_Generic::Memcpy
  505. ================
  506. */
  507. void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) {
  508. memcpy( dst, src, count );
  509. }
  510. /*
  511. ================
  512. idSIMD_Generic::Memset
  513. ================
  514. */
  515. void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) {
  516. memset( dst, val, count );
  517. }
  518. /*
  519. ============
  520. idSIMD_Generic::Zero16
  521. ============
  522. */
  523. void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) {
  524. memset( dst, 0, count * sizeof( float ) );
  525. }
  526. /*
  527. ============
  528. idSIMD_Generic::Negate16
  529. ============
  530. */
  531. void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) {
  532. unsigned int *ptr = reinterpret_cast<unsigned int *>(dst);
  533. #define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
  534. UNROLL1(OPER)
  535. #undef OPER
  536. }
  537. /*
  538. ============
  539. idSIMD_Generic::Copy16
  540. ============
  541. */
  542. void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) {
  543. #define OPER(X) dst[(X)] = src[(X)]
  544. UNROLL1(OPER)
  545. #undef OPER
  546. }
  547. /*
  548. ============
  549. idSIMD_Generic::Add16
  550. ============
  551. */
  552. void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
  553. #define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
  554. UNROLL1(OPER)
  555. #undef OPER
  556. }
  557. /*
  558. ============
  559. idSIMD_Generic::Sub16
  560. ============
  561. */
  562. void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
  563. #define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
  564. UNROLL1(OPER)
  565. #undef OPER
  566. }
  567. /*
  568. ============
  569. idSIMD_Generic::Mul16
  570. ============
  571. */
  572. void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
  573. #define OPER(X) dst[(X)] = src1[(X)] * constant
  574. UNROLL1(OPER)
  575. #undef OPER
  576. }
  577. /*
  578. ============
  579. idSIMD_Generic::AddAssign16
  580. ============
  581. */
  582. void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
  583. #define OPER(X) dst[(X)] += src[(X)]
  584. UNROLL1(OPER)
  585. #undef OPER
  586. }
  587. /*
  588. ============
  589. idSIMD_Generic::SubAssign16
  590. ============
  591. */
  592. void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
  593. #define OPER(X) dst[(X)] -= src[(X)]
  594. UNROLL1(OPER)
  595. #undef OPER
  596. }
  597. /*
  598. ============
  599. idSIMD_Generic::MulAssign16
  600. ============
  601. */
  602. void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
  603. #define OPER(X) dst[(X)] *= constant
  604. UNROLL1(OPER)
  605. #undef OPER
  606. }
  607. /*
  608. ============
  609. idSIMD_Generic::MatX_MultiplyVecX
  610. ============
  611. */
  612. void VPCALL idSIMD_Generic::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  613. int i, j, numRows;
  614. const float *mPtr, *vPtr;
  615. float *dstPtr;
  616. assert( vec.GetSize() >= mat.GetNumColumns() );
  617. assert( dst.GetSize() >= mat.GetNumRows() );
  618. mPtr = mat.ToFloatPtr();
  619. vPtr = vec.ToFloatPtr();
  620. dstPtr = dst.ToFloatPtr();
  621. numRows = mat.GetNumRows();
  622. switch( mat.GetNumColumns() ) {
  623. case 1:
  624. for ( i = 0; i < numRows; i++ ) {
  625. dstPtr[i] = mPtr[0] * vPtr[0];
  626. mPtr++;
  627. }
  628. break;
  629. case 2:
  630. for ( i = 0; i < numRows; i++ ) {
  631. dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
  632. mPtr += 2;
  633. }
  634. break;
  635. case 3:
  636. for ( i = 0; i < numRows; i++ ) {
  637. dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
  638. mPtr += 3;
  639. }
  640. break;
  641. case 4:
  642. for ( i = 0; i < numRows; i++ ) {
  643. dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  644. mPtr[3] * vPtr[3];
  645. mPtr += 4;
  646. }
  647. break;
  648. case 5:
  649. for ( i = 0; i < numRows; i++ ) {
  650. dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  651. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
  652. mPtr += 5;
  653. }
  654. break;
  655. case 6:
  656. for ( i = 0; i < numRows; i++ ) {
  657. dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  658. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
  659. mPtr += 6;
  660. }
  661. break;
  662. default:
  663. int numColumns = mat.GetNumColumns();
  664. for ( i = 0; i < numRows; i++ ) {
  665. float sum = mPtr[0] * vPtr[0];
  666. for ( j = 1; j < numColumns; j++ ) {
  667. sum += mPtr[j] * vPtr[j];
  668. }
  669. dstPtr[i] = sum;
  670. mPtr += numColumns;
  671. }
  672. break;
  673. }
  674. }
  675. /*
  676. ============
  677. idSIMD_Generic::MatX_MultiplyAddVecX
  678. ============
  679. */
  680. void VPCALL idSIMD_Generic::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  681. int i, j, numRows;
  682. const float *mPtr, *vPtr;
  683. float *dstPtr;
  684. assert( vec.GetSize() >= mat.GetNumColumns() );
  685. assert( dst.GetSize() >= mat.GetNumRows() );
  686. mPtr = mat.ToFloatPtr();
  687. vPtr = vec.ToFloatPtr();
  688. dstPtr = dst.ToFloatPtr();
  689. numRows = mat.GetNumRows();
  690. switch( mat.GetNumColumns() ) {
  691. case 1:
  692. for ( i = 0; i < numRows; i++ ) {
  693. dstPtr[i] += mPtr[0] * vPtr[0];
  694. mPtr++;
  695. }
  696. break;
  697. case 2:
  698. for ( i = 0; i < numRows; i++ ) {
  699. dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
  700. mPtr += 2;
  701. }
  702. break;
  703. case 3:
  704. for ( i = 0; i < numRows; i++ ) {
  705. dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
  706. mPtr += 3;
  707. }
  708. break;
  709. case 4:
  710. for ( i = 0; i < numRows; i++ ) {
  711. dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  712. mPtr[3] * vPtr[3];
  713. mPtr += 4;
  714. }
  715. break;
  716. case 5:
  717. for ( i = 0; i < numRows; i++ ) {
  718. dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  719. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
  720. mPtr += 5;
  721. }
  722. break;
  723. case 6:
  724. for ( i = 0; i < numRows; i++ ) {
  725. dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  726. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
  727. mPtr += 6;
  728. }
  729. break;
  730. default:
  731. int numColumns = mat.GetNumColumns();
  732. for ( i = 0; i < numRows; i++ ) {
  733. float sum = mPtr[0] * vPtr[0];
  734. for ( j = 1; j < numColumns; j++ ) {
  735. sum += mPtr[j] * vPtr[j];
  736. }
  737. dstPtr[i] += sum;
  738. mPtr += numColumns;
  739. }
  740. break;
  741. }
  742. }
  743. /*
  744. ============
  745. idSIMD_Generic::MatX_MultiplySubVecX
  746. ============
  747. */
  748. void VPCALL idSIMD_Generic::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  749. int i, j, numRows;
  750. const float *mPtr, *vPtr;
  751. float *dstPtr;
  752. assert( vec.GetSize() >= mat.GetNumColumns() );
  753. assert( dst.GetSize() >= mat.GetNumRows() );
  754. mPtr = mat.ToFloatPtr();
  755. vPtr = vec.ToFloatPtr();
  756. dstPtr = dst.ToFloatPtr();
  757. numRows = mat.GetNumRows();
  758. switch( mat.GetNumColumns() ) {
  759. case 1:
  760. for ( i = 0; i < numRows; i++ ) {
  761. dstPtr[i] -= mPtr[0] * vPtr[0];
  762. mPtr++;
  763. }
  764. break;
  765. case 2:
  766. for ( i = 0; i < numRows; i++ ) {
  767. dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
  768. mPtr += 2;
  769. }
  770. break;
  771. case 3:
  772. for ( i = 0; i < numRows; i++ ) {
  773. dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
  774. mPtr += 3;
  775. }
  776. break;
  777. case 4:
  778. for ( i = 0; i < numRows; i++ ) {
  779. dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  780. mPtr[3] * vPtr[3];
  781. mPtr += 4;
  782. }
  783. break;
  784. case 5:
  785. for ( i = 0; i < numRows; i++ ) {
  786. dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  787. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
  788. mPtr += 5;
  789. }
  790. break;
  791. case 6:
  792. for ( i = 0; i < numRows; i++ ) {
  793. dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
  794. mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
  795. mPtr += 6;
  796. }
  797. break;
  798. default:
  799. int numColumns = mat.GetNumColumns();
  800. for ( i = 0; i < numRows; i++ ) {
  801. float sum = mPtr[0] * vPtr[0];
  802. for ( j = 1; j < numColumns; j++ ) {
  803. sum += mPtr[j] * vPtr[j];
  804. }
  805. dstPtr[i] -= sum;
  806. mPtr += numColumns;
  807. }
  808. break;
  809. }
  810. }
  811. /*
  812. ============
  813. idSIMD_Generic::MatX_TransposeMultiplyVecX
  814. ============
  815. */
  816. void VPCALL idSIMD_Generic::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  817. int i, j, numColumns;
  818. const float *mPtr, *vPtr;
  819. float *dstPtr;
  820. assert( vec.GetSize() >= mat.GetNumRows() );
  821. assert( dst.GetSize() >= mat.GetNumColumns() );
  822. mPtr = mat.ToFloatPtr();
  823. vPtr = vec.ToFloatPtr();
  824. dstPtr = dst.ToFloatPtr();
  825. numColumns = mat.GetNumColumns();
  826. switch( mat.GetNumRows() ) {
  827. case 1:
  828. for ( i = 0; i < numColumns; i++ ) {
  829. dstPtr[i] = *(mPtr) * vPtr[0];
  830. mPtr++;
  831. }
  832. break;
  833. case 2:
  834. for ( i = 0; i < numColumns; i++ ) {
  835. dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
  836. mPtr++;
  837. }
  838. break;
  839. case 3:
  840. for ( i = 0; i < numColumns; i++ ) {
  841. dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
  842. mPtr++;
  843. }
  844. break;
  845. case 4:
  846. for ( i = 0; i < numColumns; i++ ) {
  847. dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  848. *(mPtr+3*numColumns) * vPtr[3];
  849. mPtr++;
  850. }
  851. break;
  852. case 5:
  853. for ( i = 0; i < numColumns; i++ ) {
  854. dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  855. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
  856. mPtr++;
  857. }
  858. break;
  859. case 6:
  860. for ( i = 0; i < numColumns; i++ ) {
  861. dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  862. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
  863. mPtr++;
  864. }
  865. break;
  866. default:
  867. int numRows = mat.GetNumRows();
  868. for ( i = 0; i < numColumns; i++ ) {
  869. mPtr = mat.ToFloatPtr() + i;
  870. float sum = mPtr[0] * vPtr[0];
  871. for ( j = 1; j < numRows; j++ ) {
  872. mPtr += numColumns;
  873. sum += mPtr[0] * vPtr[j];
  874. }
  875. dstPtr[i] = sum;
  876. }
  877. break;
  878. }
  879. }
  880. /*
  881. ============
  882. idSIMD_Generic::MatX_TransposeMultiplyAddVecX
  883. ============
  884. */
  885. void VPCALL idSIMD_Generic::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  886. int i, j, numColumns;
  887. const float *mPtr, *vPtr;
  888. float *dstPtr;
  889. assert( vec.GetSize() >= mat.GetNumRows() );
  890. assert( dst.GetSize() >= mat.GetNumColumns() );
  891. mPtr = mat.ToFloatPtr();
  892. vPtr = vec.ToFloatPtr();
  893. dstPtr = dst.ToFloatPtr();
  894. numColumns = mat.GetNumColumns();
  895. switch( mat.GetNumRows() ) {
  896. case 1:
  897. for ( i = 0; i < numColumns; i++ ) {
  898. dstPtr[i] += *(mPtr) * vPtr[0];
  899. mPtr++;
  900. }
  901. break;
  902. case 2:
  903. for ( i = 0; i < numColumns; i++ ) {
  904. dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
  905. mPtr++;
  906. }
  907. break;
  908. case 3:
  909. for ( i = 0; i < numColumns; i++ ) {
  910. dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
  911. mPtr++;
  912. }
  913. break;
  914. case 4:
  915. for ( i = 0; i < numColumns; i++ ) {
  916. dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  917. *(mPtr+3*numColumns) * vPtr[3];
  918. mPtr++;
  919. }
  920. break;
  921. case 5:
  922. for ( i = 0; i < numColumns; i++ ) {
  923. dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  924. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
  925. mPtr++;
  926. }
  927. break;
  928. case 6:
  929. for ( i = 0; i < numColumns; i++ ) {
  930. dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  931. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
  932. mPtr++;
  933. }
  934. break;
  935. default:
  936. int numRows = mat.GetNumRows();
  937. for ( i = 0; i < numColumns; i++ ) {
  938. mPtr = mat.ToFloatPtr() + i;
  939. float sum = mPtr[0] * vPtr[0];
  940. for ( j = 1; j < numRows; j++ ) {
  941. mPtr += numColumns;
  942. sum += mPtr[0] * vPtr[j];
  943. }
  944. dstPtr[i] += sum;
  945. }
  946. break;
  947. }
  948. }
  949. /*
  950. ============
  951. idSIMD_Generic::MatX_TransposeMultiplySubVecX
  952. ============
  953. */
  954. void VPCALL idSIMD_Generic::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
  955. int i, numColumns;
  956. const float *mPtr, *vPtr;
  957. float *dstPtr;
  958. assert( vec.GetSize() >= mat.GetNumRows() );
  959. assert( dst.GetSize() >= mat.GetNumColumns() );
  960. mPtr = mat.ToFloatPtr();
  961. vPtr = vec.ToFloatPtr();
  962. dstPtr = dst.ToFloatPtr();
  963. numColumns = mat.GetNumColumns();
  964. switch( mat.GetNumRows() ) {
  965. case 1:
  966. for ( i = 0; i < numColumns; i++ ) {
  967. dstPtr[i] -= *(mPtr) * vPtr[0];
  968. mPtr++;
  969. }
  970. break;
  971. case 2:
  972. for ( i = 0; i < numColumns; i++ ) {
  973. dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
  974. mPtr++;
  975. }
  976. break;
  977. case 3:
  978. for ( i = 0; i < numColumns; i++ ) {
  979. dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
  980. mPtr++;
  981. }
  982. break;
  983. case 4:
  984. for ( i = 0; i < numColumns; i++ ) {
  985. dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  986. *(mPtr+3*numColumns) * vPtr[3];
  987. mPtr++;
  988. }
  989. break;
  990. case 5:
  991. for ( i = 0; i < numColumns; i++ ) {
  992. dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  993. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
  994. mPtr++;
  995. }
  996. break;
  997. case 6:
  998. for ( i = 0; i < numColumns; i++ ) {
  999. dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
  1000. *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
  1001. mPtr++;
  1002. }
  1003. break;
  1004. default:
  1005. int numRows = mat.GetNumRows();
  1006. for ( i = 0; i < numColumns; i++ ) {
  1007. mPtr = mat.ToFloatPtr() + i;
  1008. float sum = mPtr[0] * vPtr[0];
  1009. for ( int j = 1; j < numRows; j++ ) {
  1010. mPtr += numColumns;
  1011. sum += mPtr[0] * vPtr[j];
  1012. }
  1013. dstPtr[i] -= sum;
  1014. }
  1015. break;
  1016. }
  1017. }
  1018. /*
  1019. ============
  1020. idSIMD_Generic::MatX_MultiplyMatX
  1021. optimizes the following matrix multiplications:
  1022. NxN * Nx6
  1023. 6xN * Nx6
  1024. Nx6 * 6xN
  1025. 6x6 * 6xN
  1026. with N in the range [1-6].
  1027. ============
  1028. */
  1029. void VPCALL idSIMD_Generic::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
  1030. int i, j, k, l, n;
  1031. float *dstPtr;
  1032. const float *m1Ptr, *m2Ptr;
  1033. double sum;
  1034. assert( m1.GetNumColumns() == m2.GetNumRows() );
  1035. dstPtr = dst.ToFloatPtr();
  1036. m1Ptr = m1.ToFloatPtr();
  1037. m2Ptr = m2.ToFloatPtr();
  1038. k = m1.GetNumRows();
  1039. l = m2.GetNumColumns();
  1040. switch( m1.GetNumColumns() ) {
  1041. case 1: {
  1042. if ( l == 6 ) {
  1043. for ( i = 0; i < k; i++ ) { // Nx1 * 1x6
  1044. *dstPtr++ = m1Ptr[i] * m2Ptr[0];
  1045. *dstPtr++ = m1Ptr[i] * m2Ptr[1];
  1046. *dstPtr++ = m1Ptr[i] * m2Ptr[2];
  1047. *dstPtr++ = m1Ptr[i] * m2Ptr[3];
  1048. *dstPtr++ = m1Ptr[i] * m2Ptr[4];
  1049. *dstPtr++ = m1Ptr[i] * m2Ptr[5];
  1050. }
  1051. return;
  1052. }
  1053. for ( i = 0; i < k; i++ ) {
  1054. m2Ptr = m2.ToFloatPtr();
  1055. for ( j = 0; j < l; j++ ) {
  1056. *dstPtr++ = m1Ptr[0] * m2Ptr[0];
  1057. m2Ptr++;
  1058. }
  1059. m1Ptr++;
  1060. }
  1061. break;
  1062. }
  1063. case 2: {
  1064. if ( l == 6 ) {
  1065. for ( i = 0; i < k; i++ ) { // Nx2 * 2x6
  1066. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6];
  1067. *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7];
  1068. *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8];
  1069. *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9];
  1070. *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10];
  1071. *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11];
  1072. m1Ptr += 2;
  1073. }
  1074. return;
  1075. }
  1076. for ( i = 0; i < k; i++ ) {
  1077. m2Ptr = m2.ToFloatPtr();
  1078. for ( j = 0; j < l; j++ ) {
  1079. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
  1080. m2Ptr++;
  1081. }
  1082. m1Ptr += 2;
  1083. }
  1084. break;
  1085. }
  1086. case 3: {
  1087. if ( l == 6 ) {
  1088. for ( i = 0; i < k; i++ ) { // Nx3 * 3x6
  1089. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12];
  1090. *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13];
  1091. *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14];
  1092. *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15];
  1093. *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16];
  1094. *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17];
  1095. m1Ptr += 3;
  1096. }
  1097. return;
  1098. }
  1099. for ( i = 0; i < k; i++ ) {
  1100. m2Ptr = m2.ToFloatPtr();
  1101. for ( j = 0; j < l; j++ ) {
  1102. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
  1103. m2Ptr++;
  1104. }
  1105. m1Ptr += 3;
  1106. }
  1107. break;
  1108. }
  1109. case 4: {
  1110. if ( l == 6 ) {
  1111. for ( i = 0; i < k; i++ ) { // Nx4 * 4x6
  1112. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18];
  1113. *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19];
  1114. *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20];
  1115. *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21];
  1116. *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22];
  1117. *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23];
  1118. m1Ptr += 4;
  1119. }
  1120. return;
  1121. }
  1122. for ( i = 0; i < k; i++ ) {
  1123. m2Ptr = m2.ToFloatPtr();
  1124. for ( j = 0; j < l; j++ ) {
  1125. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
  1126. m1Ptr[3] * m2Ptr[3*l];
  1127. m2Ptr++;
  1128. }
  1129. m1Ptr += 4;
  1130. }
  1131. break;
  1132. }
  1133. case 5: {
  1134. if ( l == 6 ) {
  1135. for ( i = 0; i < k; i++ ) { // Nx5 * 5x6
  1136. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24];
  1137. *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25];
  1138. *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26];
  1139. *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27];
  1140. *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28];
  1141. *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29];
  1142. m1Ptr += 5;
  1143. }
  1144. return;
  1145. }
  1146. for ( i = 0; i < k; i++ ) {
  1147. m2Ptr = m2.ToFloatPtr();
  1148. for ( j = 0; j < l; j++ ) {
  1149. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
  1150. m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
  1151. m2Ptr++;
  1152. }
  1153. m1Ptr += 5;
  1154. }
  1155. break;
  1156. }
  1157. case 6: {
  1158. switch( k ) {
  1159. case 1: {
  1160. if ( l == 1 ) { // 1x6 * 6x1
  1161. dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
  1162. m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
  1163. return;
  1164. }
  1165. break;
  1166. }
  1167. case 2: {
  1168. if ( l == 2 ) { // 2x6 * 6x2
  1169. for ( i = 0; i < 2; i++ ) {
  1170. for ( j = 0; j < 2; j++ ) {
  1171. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
  1172. + m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
  1173. + m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
  1174. + m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
  1175. + m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
  1176. + m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
  1177. dstPtr++;
  1178. }
  1179. m1Ptr += 6;
  1180. }
  1181. return;
  1182. }
  1183. break;
  1184. }
  1185. case 3: {
  1186. if ( l == 3 ) { // 3x6 * 6x3
  1187. for ( i = 0; i < 3; i++ ) {
  1188. for ( j = 0; j < 3; j++ ) {
  1189. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
  1190. + m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
  1191. + m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
  1192. + m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
  1193. + m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
  1194. + m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
  1195. dstPtr++;
  1196. }
  1197. m1Ptr += 6;
  1198. }
  1199. return;
  1200. }
  1201. break;
  1202. }
  1203. case 4: {
  1204. if ( l == 4 ) { // 4x6 * 6x4
  1205. for ( i = 0; i < 4; i++ ) {
  1206. for ( j = 0; j < 4; j++ ) {
  1207. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
  1208. + m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
  1209. + m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
  1210. + m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
  1211. + m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
  1212. + m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
  1213. dstPtr++;
  1214. }
  1215. m1Ptr += 6;
  1216. }
  1217. return;
  1218. }
  1219. }
  1220. case 5: {
  1221. if ( l == 5 ) { // 5x6 * 6x5
  1222. for ( i = 0; i < 5; i++ ) {
  1223. for ( j = 0; j < 5; j++ ) {
  1224. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
  1225. + m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
  1226. + m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
  1227. + m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
  1228. + m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
  1229. + m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
  1230. dstPtr++;
  1231. }
  1232. m1Ptr += 6;
  1233. }
  1234. return;
  1235. }
  1236. }
  1237. case 6: {
  1238. switch( l ) {
  1239. case 1: { // 6x6 * 6x1
  1240. for ( i = 0; i < 6; i++ ) {
  1241. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 1 ]
  1242. + m1Ptr[1] * m2Ptr[ 1 * 1 ]
  1243. + m1Ptr[2] * m2Ptr[ 2 * 1 ]
  1244. + m1Ptr[3] * m2Ptr[ 3 * 1 ]
  1245. + m1Ptr[4] * m2Ptr[ 4 * 1 ]
  1246. + m1Ptr[5] * m2Ptr[ 5 * 1 ];
  1247. dstPtr++;
  1248. m1Ptr += 6;
  1249. }
  1250. return;
  1251. }
  1252. case 2: { // 6x6 * 6x2
  1253. for ( i = 0; i < 6; i++ ) {
  1254. for ( j = 0; j < 2; j++ ) {
  1255. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
  1256. + m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
  1257. + m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
  1258. + m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
  1259. + m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
  1260. + m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
  1261. dstPtr++;
  1262. }
  1263. m1Ptr += 6;
  1264. }
  1265. return;
  1266. }
  1267. case 3: { // 6x6 * 6x3
  1268. for ( i = 0; i < 6; i++ ) {
  1269. for ( j = 0; j < 3; j++ ) {
  1270. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
  1271. + m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
  1272. + m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
  1273. + m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
  1274. + m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
  1275. + m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
  1276. dstPtr++;
  1277. }
  1278. m1Ptr += 6;
  1279. }
  1280. return;
  1281. }
  1282. case 4: { // 6x6 * 6x4
  1283. for ( i = 0; i < 6; i++ ) {
  1284. for ( j = 0; j < 4; j++ ) {
  1285. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
  1286. + m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
  1287. + m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
  1288. + m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
  1289. + m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
  1290. + m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
  1291. dstPtr++;
  1292. }
  1293. m1Ptr += 6;
  1294. }
  1295. return;
  1296. }
  1297. case 5: { // 6x6 * 6x5
  1298. for ( i = 0; i < 6; i++ ) {
  1299. for ( j = 0; j < 5; j++ ) {
  1300. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
  1301. + m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
  1302. + m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
  1303. + m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
  1304. + m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
  1305. + m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
  1306. dstPtr++;
  1307. }
  1308. m1Ptr += 6;
  1309. }
  1310. return;
  1311. }
  1312. case 6: { // 6x6 * 6x6
  1313. for ( i = 0; i < 6; i++ ) {
  1314. for ( j = 0; j < 6; j++ ) {
  1315. *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 6 + j ]
  1316. + m1Ptr[1] * m2Ptr[ 1 * 6 + j ]
  1317. + m1Ptr[2] * m2Ptr[ 2 * 6 + j ]
  1318. + m1Ptr[3] * m2Ptr[ 3 * 6 + j ]
  1319. + m1Ptr[4] * m2Ptr[ 4 * 6 + j ]
  1320. + m1Ptr[5] * m2Ptr[ 5 * 6 + j ];
  1321. dstPtr++;
  1322. }
  1323. m1Ptr += 6;
  1324. }
  1325. return;
  1326. }
  1327. }
  1328. }
  1329. }
  1330. for ( i = 0; i < k; i++ ) {
  1331. m2Ptr = m2.ToFloatPtr();
  1332. for ( j = 0; j < l; j++ ) {
  1333. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
  1334. m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
  1335. m2Ptr++;
  1336. }
  1337. m1Ptr += 6;
  1338. }
  1339. break;
  1340. }
  1341. default: {
  1342. for ( i = 0; i < k; i++ ) {
  1343. for ( j = 0; j < l; j++ ) {
  1344. m2Ptr = m2.ToFloatPtr() + j;
  1345. sum = m1Ptr[0] * m2Ptr[0];
  1346. for ( n = 1; n < m1.GetNumColumns(); n++ ) {
  1347. m2Ptr += l;
  1348. sum += m1Ptr[n] * m2Ptr[0];
  1349. }
  1350. *dstPtr++ = sum;
  1351. }
  1352. m1Ptr += m1.GetNumColumns();
  1353. }
  1354. break;
  1355. }
  1356. }
  1357. }
  1358. /*
  1359. ============
  1360. idSIMD_Generic::MatX_TransposeMultiplyMatX
  1361. optimizes the following tranpose matrix multiplications:
  1362. Nx6 * NxN
  1363. 6xN * 6x6
  1364. with N in the range [1-6].
  1365. ============
  1366. */
  1367. void VPCALL idSIMD_Generic::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
  1368. int i, j, k, l, n;
  1369. float *dstPtr;
  1370. const float *m1Ptr, *m2Ptr;
  1371. double sum;
  1372. assert( m1.GetNumRows() == m2.GetNumRows() );
  1373. m1Ptr = m1.ToFloatPtr();
  1374. m2Ptr = m2.ToFloatPtr();
  1375. dstPtr = dst.ToFloatPtr();
  1376. k = m1.GetNumColumns();
  1377. l = m2.GetNumColumns();
  1378. switch( m1.GetNumRows() ) {
  1379. case 1:
  1380. if ( k == 6 && l == 1 ) { // 1x6 * 1x1
  1381. for ( i = 0; i < 6; i++ ) {
  1382. *dstPtr++ = m1Ptr[0] * m2Ptr[0];
  1383. m1Ptr++;
  1384. }
  1385. return;
  1386. }
  1387. for ( i = 0; i < k; i++ ) {
  1388. m2Ptr = m2.ToFloatPtr();
  1389. for ( j = 0; j < l; j++ ) {
  1390. *dstPtr++ = m1Ptr[0] * m2Ptr[0];
  1391. m2Ptr++;
  1392. }
  1393. m1Ptr++;
  1394. }
  1395. break;
  1396. case 2:
  1397. if ( k == 6 && l == 2 ) { // 2x6 * 2x2
  1398. for ( i = 0; i < 6; i++ ) {
  1399. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+0] + m1Ptr[1*6] * m2Ptr[1*2+0];
  1400. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+1] + m1Ptr[1*6] * m2Ptr[1*2+1];
  1401. m1Ptr++;
  1402. }
  1403. return;
  1404. }
  1405. for ( i = 0; i < k; i++ ) {
  1406. m2Ptr = m2.ToFloatPtr();
  1407. for ( j = 0; j < l; j++ ) {
  1408. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
  1409. m2Ptr++;
  1410. }
  1411. m1Ptr++;
  1412. }
  1413. break;
  1414. case 3:
  1415. if ( k == 6 && l == 3 ) { // 3x6 * 3x3
  1416. for ( i = 0; i < 6; i++ ) {
  1417. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+0] + m1Ptr[1*6] * m2Ptr[1*3+0] + m1Ptr[2*6] * m2Ptr[2*3+0];
  1418. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+1] + m1Ptr[1*6] * m2Ptr[1*3+1] + m1Ptr[2*6] * m2Ptr[2*3+1];
  1419. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+2] + m1Ptr[1*6] * m2Ptr[1*3+2] + m1Ptr[2*6] * m2Ptr[2*3+2];
  1420. m1Ptr++;
  1421. }
  1422. return;
  1423. }
  1424. for ( i = 0; i < k; i++ ) {
  1425. m2Ptr = m2.ToFloatPtr();
  1426. for ( j = 0; j < l; j++ ) {
  1427. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
  1428. m2Ptr++;
  1429. }
  1430. m1Ptr++;
  1431. }
  1432. break;
  1433. case 4:
  1434. if ( k == 6 && l == 4 ) { // 4x6 * 4x4
  1435. for ( i = 0; i < 6; i++ ) {
  1436. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+0] + m1Ptr[1*6] * m2Ptr[1*4+0] + m1Ptr[2*6] * m2Ptr[2*4+0] + m1Ptr[3*6] * m2Ptr[3*4+0];
  1437. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+1] + m1Ptr[1*6] * m2Ptr[1*4+1] + m1Ptr[2*6] * m2Ptr[2*4+1] + m1Ptr[3*6] * m2Ptr[3*4+1];
  1438. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+2] + m1Ptr[1*6] * m2Ptr[1*4+2] + m1Ptr[2*6] * m2Ptr[2*4+2] + m1Ptr[3*6] * m2Ptr[3*4+2];
  1439. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+3] + m1Ptr[1*6] * m2Ptr[1*4+3] + m1Ptr[2*6] * m2Ptr[2*4+3] + m1Ptr[3*6] * m2Ptr[3*4+3];
  1440. m1Ptr++;
  1441. }
  1442. return;
  1443. }
  1444. for ( i = 0; i < k; i++ ) {
  1445. m2Ptr = m2.ToFloatPtr();
  1446. for ( j = 0; j < l; j++ ) {
  1447. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
  1448. m1Ptr[3*k] * m2Ptr[3*l];
  1449. m2Ptr++;
  1450. }
  1451. m1Ptr++;
  1452. }
  1453. break;
  1454. case 5:
  1455. if ( k == 6 && l == 5 ) { // 5x6 * 5x5
  1456. for ( i = 0; i < 6; i++ ) {
  1457. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+0] + m1Ptr[1*6] * m2Ptr[1*5+0] + m1Ptr[2*6] * m2Ptr[2*5+0] + m1Ptr[3*6] * m2Ptr[3*5+0] + m1Ptr[4*6] * m2Ptr[4*5+0];
  1458. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+1] + m1Ptr[1*6] * m2Ptr[1*5+1] + m1Ptr[2*6] * m2Ptr[2*5+1] + m1Ptr[3*6] * m2Ptr[3*5+1] + m1Ptr[4*6] * m2Ptr[4*5+1];
  1459. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+2] + m1Ptr[1*6] * m2Ptr[1*5+2] + m1Ptr[2*6] * m2Ptr[2*5+2] + m1Ptr[3*6] * m2Ptr[3*5+2] + m1Ptr[4*6] * m2Ptr[4*5+2];
  1460. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+3] + m1Ptr[1*6] * m2Ptr[1*5+3] + m1Ptr[2*6] * m2Ptr[2*5+3] + m1Ptr[3*6] * m2Ptr[3*5+3] + m1Ptr[4*6] * m2Ptr[4*5+3];
  1461. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+4] + m1Ptr[1*6] * m2Ptr[1*5+4] + m1Ptr[2*6] * m2Ptr[2*5+4] + m1Ptr[3*6] * m2Ptr[3*5+4] + m1Ptr[4*6] * m2Ptr[4*5+4];
  1462. m1Ptr++;
  1463. }
  1464. return;
  1465. }
  1466. for ( i = 0; i < k; i++ ) {
  1467. m2Ptr = m2.ToFloatPtr();
  1468. for ( j = 0; j < l; j++ ) {
  1469. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
  1470. m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
  1471. m2Ptr++;
  1472. }
  1473. m1Ptr++;
  1474. }
  1475. break;
  1476. case 6:
  1477. if ( l == 6 ) {
  1478. switch( k ) {
  1479. case 1: // 6x1 * 6x6
  1480. m2Ptr = m2.ToFloatPtr();
  1481. for ( j = 0; j < 6; j++ ) {
  1482. *dstPtr++ = m1Ptr[0*1] * m2Ptr[0*6] +
  1483. m1Ptr[1*1] * m2Ptr[1*6] +
  1484. m1Ptr[2*1] * m2Ptr[2*6] +
  1485. m1Ptr[3*1] * m2Ptr[3*6] +
  1486. m1Ptr[4*1] * m2Ptr[4*6] +
  1487. m1Ptr[5*1] * m2Ptr[5*6];
  1488. m2Ptr++;
  1489. }
  1490. return;
  1491. case 2: // 6x2 * 6x6
  1492. for ( i = 0; i < 2; i++ ) {
  1493. m2Ptr = m2.ToFloatPtr();
  1494. for ( j = 0; j < 6; j++ ) {
  1495. *dstPtr++ = m1Ptr[0*2] * m2Ptr[0*6] +
  1496. m1Ptr[1*2] * m2Ptr[1*6] +
  1497. m1Ptr[2*2] * m2Ptr[2*6] +
  1498. m1Ptr[3*2] * m2Ptr[3*6] +
  1499. m1Ptr[4*2] * m2Ptr[4*6] +
  1500. m1Ptr[5*2] * m2Ptr[5*6];
  1501. m2Ptr++;
  1502. }
  1503. m1Ptr++;
  1504. }
  1505. return;
  1506. case 3: // 6x3 * 6x6
  1507. for ( i = 0; i < 3; i++ ) {
  1508. m2Ptr = m2.ToFloatPtr();
  1509. for ( j = 0; j < 6; j++ ) {
  1510. *dstPtr++ = m1Ptr[0*3] * m2Ptr[0*6] +
  1511. m1Ptr[1*3] * m2Ptr[1*6] +
  1512. m1Ptr[2*3] * m2Ptr[2*6] +
  1513. m1Ptr[3*3] * m2Ptr[3*6] +
  1514. m1Ptr[4*3] * m2Ptr[4*6] +
  1515. m1Ptr[5*3] * m2Ptr[5*6];
  1516. m2Ptr++;
  1517. }
  1518. m1Ptr++;
  1519. }
  1520. return;
  1521. case 4: // 6x4 * 6x6
  1522. for ( i = 0; i < 4; i++ ) {
  1523. m2Ptr = m2.ToFloatPtr();
  1524. for ( j = 0; j < 6; j++ ) {
  1525. *dstPtr++ = m1Ptr[0*4] * m2Ptr[0*6] +
  1526. m1Ptr[1*4] * m2Ptr[1*6] +
  1527. m1Ptr[2*4] * m2Ptr[2*6] +
  1528. m1Ptr[3*4] * m2Ptr[3*6] +
  1529. m1Ptr[4*4] * m2Ptr[4*6] +
  1530. m1Ptr[5*4] * m2Ptr[5*6];
  1531. m2Ptr++;
  1532. }
  1533. m1Ptr++;
  1534. }
  1535. return;
  1536. case 5: // 6x5 * 6x6
  1537. for ( i = 0; i < 5; i++ ) {
  1538. m2Ptr = m2.ToFloatPtr();
  1539. for ( j = 0; j < 6; j++ ) {
  1540. *dstPtr++ = m1Ptr[0*5] * m2Ptr[0*6] +
  1541. m1Ptr[1*5] * m2Ptr[1*6] +
  1542. m1Ptr[2*5] * m2Ptr[2*6] +
  1543. m1Ptr[3*5] * m2Ptr[3*6] +
  1544. m1Ptr[4*5] * m2Ptr[4*6] +
  1545. m1Ptr[5*5] * m2Ptr[5*6];
  1546. m2Ptr++;
  1547. }
  1548. m1Ptr++;
  1549. }
  1550. return;
  1551. case 6: // 6x6 * 6x6
  1552. for ( i = 0; i < 6; i++ ) {
  1553. m2Ptr = m2.ToFloatPtr();
  1554. for ( j = 0; j < 6; j++ ) {
  1555. *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*6] +
  1556. m1Ptr[1*6] * m2Ptr[1*6] +
  1557. m1Ptr[2*6] * m2Ptr[2*6] +
  1558. m1Ptr[3*6] * m2Ptr[3*6] +
  1559. m1Ptr[4*6] * m2Ptr[4*6] +
  1560. m1Ptr[5*6] * m2Ptr[5*6];
  1561. m2Ptr++;
  1562. }
  1563. m1Ptr++;
  1564. }
  1565. return;
  1566. }
  1567. }
  1568. for ( i = 0; i < k; i++ ) {
  1569. m2Ptr = m2.ToFloatPtr();
  1570. for ( j = 0; j < l; j++ ) {
  1571. *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
  1572. m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
  1573. m2Ptr++;
  1574. }
  1575. m1Ptr++;
  1576. }
  1577. break;
  1578. default:
  1579. for ( i = 0; i < k; i++ ) {
  1580. for ( j = 0; j < l; j++ ) {
  1581. m1Ptr = m1.ToFloatPtr() + i;
  1582. m2Ptr = m2.ToFloatPtr() + j;
  1583. sum = m1Ptr[0] * m2Ptr[0];
  1584. for ( n = 1; n < m1.GetNumRows(); n++ ) {
  1585. m1Ptr += k;
  1586. m2Ptr += l;
  1587. sum += m1Ptr[0] * m2Ptr[0];
  1588. }
  1589. *dstPtr++ = sum;
  1590. }
  1591. }
  1592. break;
  1593. }
  1594. }
  1595. /*
  1596. ============
  1597. idSIMD_Generic::MatX_LowerTriangularSolve
  1598. solves x in Lx = b for the n * n sub-matrix of L
  1599. if skip > 0 the first skip elements of x are assumed to be valid already
  1600. L has to be a lower triangular matrix with (implicit) ones on the diagonal
  1601. x == b is allowed
  1602. ============
  1603. */
  1604. void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
  1605. #if 1
  1606. int nc;
  1607. const float *lptr;
  1608. if ( skip >= n ) {
  1609. return;
  1610. }
  1611. lptr = L.ToFloatPtr();
  1612. nc = L.GetNumColumns();
  1613. // unrolled cases for n < 8
  1614. if ( n < 8 ) {
  1615. #define NSKIP( n, s ) ((n<<3)|(s&7))
  1616. switch( NSKIP( n, skip ) ) {
  1617. case NSKIP( 1, 0 ): x[0] = b[0];
  1618. return;
  1619. case NSKIP( 2, 0 ): x[0] = b[0];
  1620. case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  1621. return;
  1622. case NSKIP( 3, 0 ): x[0] = b[0];
  1623. case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  1624. case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  1625. return;
  1626. case NSKIP( 4, 0 ): x[0] = b[0];
  1627. case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  1628. case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  1629. case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  1630. return;
  1631. case NSKIP( 5, 0 ): x[0] = b[0];
  1632. case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  1633. case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  1634. case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  1635. case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
  1636. return;
  1637. case NSKIP( 6, 0 ): x[0] = b[0];
  1638. case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  1639. case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  1640. case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  1641. case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
  1642. case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
  1643. return;
  1644. case NSKIP( 7, 0 ): x[0] = b[0];
  1645. case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  1646. case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  1647. case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  1648. case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
  1649. case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
  1650. case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
  1651. return;
  1652. }
  1653. return;
  1654. }
  1655. // process first 4 rows
  1656. switch( skip ) {
  1657. case 0: x[0] = b[0];
  1658. case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
  1659. case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  1660. case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  1661. skip = 4;
  1662. }
  1663. lptr = L[skip];
  1664. int i, j;
  1665. register double s0, s1, s2, s3;
  1666. for ( i = skip; i < n; i++ ) {
  1667. s0 = lptr[0] * x[0];
  1668. s1 = lptr[1] * x[1];
  1669. s2 = lptr[2] * x[2];
  1670. s3 = lptr[3] * x[3];
  1671. for ( j = 4; j < i-7; j += 8 ) {
  1672. s0 += lptr[j+0] * x[j+0];
  1673. s1 += lptr[j+1] * x[j+1];
  1674. s2 += lptr[j+2] * x[j+2];
  1675. s3 += lptr[j+3] * x[j+3];
  1676. s0 += lptr[j+4] * x[j+4];
  1677. s1 += lptr[j+5] * x[j+5];
  1678. s2 += lptr[j+6] * x[j+6];
  1679. s3 += lptr[j+7] * x[j+7];
  1680. }
  1681. switch( i - j ) {
  1682. NODEFAULT;
  1683. case 7: s0 += lptr[j+6] * x[j+6];
  1684. case 6: s1 += lptr[j+5] * x[j+5];
  1685. case 5: s2 += lptr[j+4] * x[j+4];
  1686. case 4: s3 += lptr[j+3] * x[j+3];
  1687. case 3: s0 += lptr[j+2] * x[j+2];
  1688. case 2: s1 += lptr[j+1] * x[j+1];
  1689. case 1: s2 += lptr[j+0] * x[j+0];
  1690. case 0: break;
  1691. }
  1692. double sum;
  1693. sum = s3;
  1694. sum += s2;
  1695. sum += s1;
  1696. sum += s0;
  1697. sum -= b[i];
  1698. x[i] = -sum;
  1699. lptr += nc;
  1700. }
  1701. #else
  1702. int i, j;
  1703. const float *lptr;
  1704. double sum;
  1705. for ( i = skip; i < n; i++ ) {
  1706. sum = b[i];
  1707. lptr = L[i];
  1708. for ( j = 0; j < i; j++ ) {
  1709. sum -= lptr[j] * x[j];
  1710. }
  1711. x[i] = sum;
  1712. }
  1713. #endif
  1714. }
  1715. /*
  1716. ============
  1717. idSIMD_Generic::MatX_LowerTriangularSolveTranspose
  1718. solves x in L'x = b for the n * n sub-matrix of L
  1719. L has to be a lower triangular matrix with (implicit) ones on the diagonal
  1720. x == b is allowed
  1721. ============
  1722. */
  1723. void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
  1724. #if 1
  1725. int nc;
  1726. const float *lptr;
  1727. lptr = L.ToFloatPtr();
  1728. nc = L.GetNumColumns();
  1729. // unrolled cases for n < 8
  1730. if ( n < 8 ) {
  1731. switch( n ) {
  1732. case 0:
  1733. return;
  1734. case 1:
  1735. x[0] = b[0];
  1736. return;
  1737. case 2:
  1738. x[1] = b[1];
  1739. x[0] = b[0] - lptr[1*nc+0] * x[1];
  1740. return;
  1741. case 3:
  1742. x[2] = b[2];
  1743. x[1] = b[1] - lptr[2*nc+1] * x[2];
  1744. x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  1745. return;
  1746. case 4:
  1747. x[3] = b[3];
  1748. x[2] = b[2] - lptr[3*nc+2] * x[3];
  1749. x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  1750. x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  1751. return;
  1752. case 5:
  1753. x[4] = b[4];
  1754. x[3] = b[3] - lptr[4*nc+3] * x[4];
  1755. x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
  1756. x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  1757. x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  1758. return;
  1759. case 6:
  1760. x[5] = b[5];
  1761. x[4] = b[4] - lptr[5*nc+4] * x[5];
  1762. x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
  1763. x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
  1764. x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  1765. x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  1766. return;
  1767. case 7:
  1768. x[6] = b[6];
  1769. x[5] = b[5] - lptr[6*nc+5] * x[6];
  1770. x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
  1771. x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
  1772. x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
  1773. x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  1774. x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  1775. return;
  1776. }
  1777. return;
  1778. }
  1779. int i, j;
  1780. register double s0, s1, s2, s3;
  1781. float *xptr;
  1782. lptr = L.ToFloatPtr() + n * nc + n - 4;
  1783. xptr = x + n;
  1784. // process 4 rows at a time
  1785. for ( i = n; i >= 4; i -= 4 ) {
  1786. s0 = b[i-4];
  1787. s1 = b[i-3];
  1788. s2 = b[i-2];
  1789. s3 = b[i-1];
  1790. // process 4x4 blocks
  1791. for ( j = 0; j < n-i; j += 4 ) {
  1792. s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
  1793. s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
  1794. s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
  1795. s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
  1796. s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
  1797. s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
  1798. s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
  1799. s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
  1800. s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
  1801. s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
  1802. s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
  1803. s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
  1804. s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
  1805. s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
  1806. s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
  1807. s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
  1808. }
  1809. // process left over of the 4 rows
  1810. s0 -= lptr[0-1*nc] * s3;
  1811. s1 -= lptr[1-1*nc] * s3;
  1812. s2 -= lptr[2-1*nc] * s3;
  1813. s0 -= lptr[0-2*nc] * s2;
  1814. s1 -= lptr[1-2*nc] * s2;
  1815. s0 -= lptr[0-3*nc] * s1;
  1816. // store result
  1817. xptr[-4] = s0;
  1818. xptr[-3] = s1;
  1819. xptr[-2] = s2;
  1820. xptr[-1] = s3;
  1821. // update pointers for next four rows
  1822. lptr -= 4 + 4 * nc;
  1823. xptr -= 4;
  1824. }
  1825. // process left over rows
  1826. for ( i--; i >= 0; i-- ) {
  1827. s0 = b[i];
  1828. lptr = L[0] + i;
  1829. for ( j = i + 1; j < n; j++ ) {
  1830. s0 -= lptr[j*nc] * x[j];
  1831. }
  1832. x[i] = s0;
  1833. }
  1834. #else
  1835. int i, j, nc;
  1836. const float *ptr;
  1837. double sum;
  1838. nc = L.GetNumColumns();
  1839. for ( i = n - 1; i >= 0; i-- ) {
  1840. sum = b[i];
  1841. ptr = L[0] + i;
  1842. for ( j = i + 1; j < n; j++ ) {
  1843. sum -= ptr[j*nc] * x[j];
  1844. }
  1845. x[i] = sum;
  1846. }
  1847. #endif
  1848. }
/*
============
idSIMD_Generic::MatX_LDLTFactor

  in-place factorization LDL' of the n * n sub-matrix of mat
  the reciprocals of the diagonal elements are stored in invDiag
============
*/
  1856. bool VPCALL idSIMD_Generic::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
  1857. #if 1
  1858. int i, j, k, nc;
  1859. float *v, *diag, *mptr;
  1860. double s0, s1, s2, s3, sum, d;
  1861. v = (float *) _alloca16( n * sizeof( float ) );
  1862. diag = (float *) _alloca16( n * sizeof( float ) );
  1863. nc = mat.GetNumColumns();
  1864. if ( n <= 0 ) {
  1865. return true;
  1866. }
  1867. mptr = mat[0];
  1868. sum = mptr[0];
  1869. if ( sum == 0.0f ) {
  1870. return false;
  1871. }
  1872. diag[0] = sum;
  1873. invDiag[0] = d = 1.0f / sum;
  1874. if ( n <= 1 ) {
  1875. return true;
  1876. }
  1877. mptr = mat[0];
  1878. for ( j = 1; j < n; j++ ) {
  1879. mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
  1880. }
  1881. mptr = mat[1];
  1882. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  1883. sum = mptr[1] - s0;
  1884. if ( sum == 0.0f ) {
  1885. return false;
  1886. }
  1887. mat[1][1] = sum;
  1888. diag[1] = sum;
  1889. invDiag[1] = d = 1.0f / sum;
  1890. if ( n <= 2 ) {
  1891. return true;
  1892. }
  1893. mptr = mat[0];
  1894. for ( j = 2; j < n; j++ ) {
  1895. mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
  1896. }
  1897. mptr = mat[2];
  1898. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  1899. v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
  1900. sum = mptr[2] - s0 - s1;
  1901. if ( sum == 0.0f ) {
  1902. return false;
  1903. }
  1904. mat[2][2] = sum;
  1905. diag[2] = sum;
  1906. invDiag[2] = d = 1.0f / sum;
  1907. if ( n <= 3 ) {
  1908. return true;
  1909. }
  1910. mptr = mat[0];
  1911. for ( j = 3; j < n; j++ ) {
  1912. mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
  1913. }
  1914. mptr = mat[3];
  1915. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  1916. v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
  1917. v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
  1918. sum = mptr[3] - s0 - s1 - s2;
  1919. if ( sum == 0.0f ) {
  1920. return false;
  1921. }
  1922. mat[3][3] = sum;
  1923. diag[3] = sum;
  1924. invDiag[3] = d = 1.0f / sum;
  1925. if ( n <= 4 ) {
  1926. return true;
  1927. }
  1928. mptr = mat[0];
  1929. for ( j = 4; j < n; j++ ) {
  1930. mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
  1931. }
  1932. for ( i = 4; i < n; i++ ) {
  1933. mptr = mat[i];
  1934. v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
  1935. v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
  1936. v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
  1937. v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
  1938. for ( k = 4; k < i-3; k += 4 ) {
  1939. v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
  1940. v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
  1941. v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
  1942. v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
  1943. }
  1944. switch( i - k ) {
  1945. NODEFAULT;
  1946. case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
  1947. case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
  1948. case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
  1949. case 0: break;
  1950. }
  1951. sum = s3;
  1952. sum += s2;
  1953. sum += s1;
  1954. sum += s0;
  1955. sum = mptr[i] - sum;
  1956. if ( sum == 0.0f ) {
  1957. return false;
  1958. }
  1959. mat[i][i] = sum;
  1960. diag[i] = sum;
  1961. invDiag[i] = d = 1.0f / sum;
  1962. if ( i + 1 >= n ) {
  1963. return true;
  1964. }
  1965. mptr = mat[i+1];
  1966. for ( j = i+1; j < n; j++ ) {
  1967. s0 = mptr[0] * v[0];
  1968. s1 = mptr[1] * v[1];
  1969. s2 = mptr[2] * v[2];
  1970. s3 = mptr[3] * v[3];
  1971. for ( k = 4; k < i-7; k += 8 ) {
  1972. s0 += mptr[k+0] * v[k+0];
  1973. s1 += mptr[k+1] * v[k+1];
  1974. s2 += mptr[k+2] * v[k+2];
  1975. s3 += mptr[k+3] * v[k+3];
  1976. s0 += mptr[k+4] * v[k+4];
  1977. s1 += mptr[k+5] * v[k+5];
  1978. s2 += mptr[k+6] * v[k+6];
  1979. s3 += mptr[k+7] * v[k+7];
  1980. }
  1981. switch( i - k ) {
  1982. NODEFAULT;
  1983. case 7: s0 += mptr[k+6] * v[k+6];
  1984. case 6: s1 += mptr[k+5] * v[k+5];
  1985. case 5: s2 += mptr[k+4] * v[k+4];
  1986. case 4: s3 += mptr[k+3] * v[k+3];
  1987. case 3: s0 += mptr[k+2] * v[k+2];
  1988. case 2: s1 += mptr[k+1] * v[k+1];
  1989. case 1: s2 += mptr[k+0] * v[k+0];
  1990. case 0: break;
  1991. }
  1992. sum = s3;
  1993. sum += s2;
  1994. sum += s1;
  1995. sum += s0;
  1996. mptr[i] = ( mptr[i] - sum ) * d;
  1997. mptr += nc;
  1998. }
  1999. }
  2000. return true;
  2001. #else
  2002. int i, j, k, nc;
  2003. float *v, *ptr, *diagPtr;
  2004. double d, sum;
  2005. v = (float *) _alloca16( n * sizeof( float ) );
  2006. nc = mat.GetNumColumns();
  2007. for ( i = 0; i < n; i++ ) {
  2008. ptr = mat[i];
  2009. diagPtr = mat[0];
  2010. sum = ptr[i];
  2011. for ( j = 0; j < i; j++ ) {
  2012. d = ptr[j];
  2013. v[j] = diagPtr[0] * d;
  2014. sum -= v[j] * d;
  2015. diagPtr += nc + 1;
  2016. }
  2017. if ( sum == 0.0f ) {
  2018. return false;
  2019. }
  2020. diagPtr[0] = sum;
  2021. invDiag[i] = d = 1.0f / sum;
  2022. if ( i + 1 >= n ) {
  2023. continue;
  2024. }
  2025. ptr = mat[i+1];
  2026. for ( j = i + 1; j < n; j++ ) {
  2027. sum = ptr[i];
  2028. for ( k = 0; k < i; k++ ) {
  2029. sum -= ptr[k] * v[k];
  2030. }
  2031. ptr[i] = sum * d;
  2032. ptr += nc;
  2033. }
  2034. }
  2035. return true;
  2036. #endif
  2037. }
  2038. /*
  2039. ============
  2040. idSIMD_Generic::BlendJoints
  2041. ============
  2042. */
  2043. void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
  2044. int i;
  2045. for ( i = 0; i < numJoints; i++ ) {
  2046. int j = index[i];
  2047. joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
  2048. joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
  2049. }
  2050. }
  2051. /*
  2052. ============
  2053. idSIMD_Generic::ConvertJointQuatsToJointMats
  2054. ============
  2055. */
  2056. void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
  2057. int i;
  2058. for ( i = 0; i < numJoints; i++ ) {
  2059. jointMats[i].SetRotation( jointQuats[i].q.ToMat3() );
  2060. jointMats[i].SetTranslation( jointQuats[i].t );
  2061. }
  2062. }
  2063. /*
  2064. ============
  2065. idSIMD_Generic::ConvertJointMatsToJointQuats
  2066. ============
  2067. */
  2068. void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
  2069. int i;
  2070. for ( i = 0; i < numJoints; i++ ) {
  2071. jointQuats[i] = jointMats[i].ToJointQuat();
  2072. }
  2073. }
  2074. /*
  2075. ============
  2076. idSIMD_Generic::TransformJoints
  2077. ============
  2078. */
  2079. void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
  2080. int i;
  2081. for( i = firstJoint; i <= lastJoint; i++ ) {
  2082. assert( parents[i] < i );
  2083. jointMats[i] *= jointMats[parents[i]];
  2084. }
  2085. }
  2086. /*
  2087. ============
  2088. idSIMD_Generic::UntransformJoints
  2089. ============
  2090. */
  2091. void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
  2092. int i;
  2093. for( i = lastJoint; i >= firstJoint; i-- ) {
  2094. assert( parents[i] < i );
  2095. jointMats[i] /= jointMats[parents[i]];
  2096. }
  2097. }
  2098. /*
  2099. ============
  2100. idSIMD_Generic::TransformVerts
  2101. ============
  2102. */
  2103. void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
  2104. int i, j;
  2105. const byte *jointsPtr = (byte *)joints;
  2106. for( j = i = 0; i < numVerts; i++ ) {
  2107. idVec3 v;
  2108. v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
  2109. while( index[j*2+1] == 0 ) {
  2110. j++;
  2111. v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
  2112. }
  2113. j++;
  2114. verts[i].xyz = v;
  2115. }
  2116. }
  2117. /*
  2118. ============
  2119. idSIMD_Generic::TracePointCull
  2120. ============
  2121. */
  2122. void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
  2123. int i;
  2124. byte tOr;
  2125. tOr = 0;
  2126. for ( i = 0; i < numVerts; i++ ) {
  2127. byte bits;
  2128. float d0, d1, d2, d3, t;
  2129. const idVec3 &v = verts[i].xyz;
  2130. d0 = planes[0].Distance( v );
  2131. d1 = planes[1].Distance( v );
  2132. d2 = planes[2].Distance( v );
  2133. d3 = planes[3].Distance( v );
  2134. t = d0 + radius;
  2135. bits = FLOATSIGNBITSET( t ) << 0;
  2136. t = d1 + radius;
  2137. bits |= FLOATSIGNBITSET( t ) << 1;
  2138. t = d2 + radius;
  2139. bits |= FLOATSIGNBITSET( t ) << 2;
  2140. t = d3 + radius;
  2141. bits |= FLOATSIGNBITSET( t ) << 3;
  2142. t = d0 - radius;
  2143. bits |= FLOATSIGNBITSET( t ) << 4;
  2144. t = d1 - radius;
  2145. bits |= FLOATSIGNBITSET( t ) << 5;
  2146. t = d2 - radius;
  2147. bits |= FLOATSIGNBITSET( t ) << 6;
  2148. t = d3 - radius;
  2149. bits |= FLOATSIGNBITSET( t ) << 7;
  2150. bits ^= 0x0F; // flip lower four bits
  2151. tOr |= bits;
  2152. cullBits[i] = bits;
  2153. }
  2154. totalOr = tOr;
  2155. }
  2156. /*
  2157. ============
  2158. idSIMD_Generic::DecalPointCull
  2159. ============
  2160. */
  2161. void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
  2162. int i;
  2163. for ( i = 0; i < numVerts; i++ ) {
  2164. byte bits;
  2165. float d0, d1, d2, d3, d4, d5;
  2166. const idVec3 &v = verts[i].xyz;
  2167. d0 = planes[0].Distance( v );
  2168. d1 = planes[1].Distance( v );
  2169. d2 = planes[2].Distance( v );
  2170. d3 = planes[3].Distance( v );
  2171. d4 = planes[4].Distance( v );
  2172. d5 = planes[5].Distance( v );
  2173. bits = FLOATSIGNBITSET( d0 ) << 0;
  2174. bits |= FLOATSIGNBITSET( d1 ) << 1;
  2175. bits |= FLOATSIGNBITSET( d2 ) << 2;
  2176. bits |= FLOATSIGNBITSET( d3 ) << 3;
  2177. bits |= FLOATSIGNBITSET( d4 ) << 4;
  2178. bits |= FLOATSIGNBITSET( d5 ) << 5;
  2179. cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
  2180. }
  2181. }
  2182. /*
  2183. ============
  2184. idSIMD_Generic::OverlayPointCull
  2185. ============
  2186. */
  2187. void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
  2188. int i;
  2189. for ( i = 0; i < numVerts; i++ ) {
  2190. byte bits;
  2191. float d0, d1;
  2192. const idVec3 &v = verts[i].xyz;
  2193. texCoords[i][0] = d0 = planes[0].Distance( v );
  2194. texCoords[i][1] = d1 = planes[1].Distance( v );
  2195. bits = FLOATSIGNBITSET( d0 ) << 0;
  2196. d0 = 1.0f - d0;
  2197. bits |= FLOATSIGNBITSET( d1 ) << 1;
  2198. d1 = 1.0f - d1;
  2199. bits |= FLOATSIGNBITSET( d0 ) << 2;
  2200. bits |= FLOATSIGNBITSET( d1 ) << 3;
  2201. cullBits[i] = bits;
  2202. }
  2203. }
  2204. /*
  2205. ============
  2206. idSIMD_Generic::DeriveTriPlanes
  2207. Derives a plane equation for each triangle.
  2208. ============
  2209. */
  2210. void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  2211. int i;
  2212. for ( i = 0; i < numIndexes; i += 3 ) {
  2213. const idDrawVert *a, *b, *c;
  2214. float d0[3], d1[3], f;
  2215. idVec3 n;
  2216. a = verts + indexes[i + 0];
  2217. b = verts + indexes[i + 1];
  2218. c = verts + indexes[i + 2];
  2219. d0[0] = b->xyz[0] - a->xyz[0];
  2220. d0[1] = b->xyz[1] - a->xyz[1];
  2221. d0[2] = b->xyz[2] - a->xyz[2];
  2222. d1[0] = c->xyz[0] - a->xyz[0];
  2223. d1[1] = c->xyz[1] - a->xyz[1];
  2224. d1[2] = c->xyz[2] - a->xyz[2];
  2225. n[0] = d1[1] * d0[2] - d1[2] * d0[1];
  2226. n[1] = d1[2] * d0[0] - d1[0] * d0[2];
  2227. n[2] = d1[0] * d0[1] - d1[1] * d0[0];
  2228. f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
  2229. n.x *= f;
  2230. n.y *= f;
  2231. n.z *= f;
  2232. planes->SetNormal( n );
  2233. planes->FitThroughPoint( a->xyz );
  2234. planes++;
  2235. }
  2236. }
  2237. /*
  2238. ============
  2239. idSIMD_Generic::DeriveTangents
  2240. Derives the normal and orthogonal tangent vectors for the triangle vertices.
  2241. For each vertex the normal and tangent vectors are derived from all triangles
  2242. using the vertex which results in smooth tangents across the mesh.
  2243. In the process the triangle planes are calculated as well.
  2244. ============
  2245. */
  2246. void VPCALL idSIMD_Generic::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  2247. int i;
  2248. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  2249. memset( used, 0, numVerts * sizeof( used[0] ) );
  2250. idPlane *planesPtr = planes;
  2251. for ( i = 0; i < numIndexes; i += 3 ) {
  2252. idDrawVert *a, *b, *c;
  2253. unsigned long signBit;
  2254. float d0[5], d1[5], f, area;
  2255. idVec3 n, t0, t1;
  2256. int v0 = indexes[i + 0];
  2257. int v1 = indexes[i + 1];
  2258. int v2 = indexes[i + 2];
  2259. a = verts + v0;
  2260. b = verts + v1;
  2261. c = verts + v2;
  2262. d0[0] = b->xyz[0] - a->xyz[0];
  2263. d0[1] = b->xyz[1] - a->xyz[1];
  2264. d0[2] = b->xyz[2] - a->xyz[2];
  2265. d0[3] = b->st[0] - a->st[0];
  2266. d0[4] = b->st[1] - a->st[1];
  2267. d1[0] = c->xyz[0] - a->xyz[0];
  2268. d1[1] = c->xyz[1] - a->xyz[1];
  2269. d1[2] = c->xyz[2] - a->xyz[2];
  2270. d1[3] = c->st[0] - a->st[0];
  2271. d1[4] = c->st[1] - a->st[1];
  2272. // normal
  2273. n[0] = d1[1] * d0[2] - d1[2] * d0[1];
  2274. n[1] = d1[2] * d0[0] - d1[0] * d0[2];
  2275. n[2] = d1[0] * d0[1] - d1[1] * d0[0];
  2276. f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
  2277. n.x *= f;
  2278. n.y *= f;
  2279. n.z *= f;
  2280. planesPtr->SetNormal( n );
  2281. planesPtr->FitThroughPoint( a->xyz );
  2282. planesPtr++;
  2283. // area sign bit
  2284. area = d0[3] * d1[4] - d0[4] * d1[3];
  2285. signBit = ( *(unsigned long *)&area ) & ( 1 << 31 );
  2286. // first tangent
  2287. t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
  2288. t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
  2289. t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
  2290. f = idMath::RSqrt( t0.x * t0.x + t0.y * t0.y + t0.z * t0.z );
  2291. *(unsigned long *)&f ^= signBit;
  2292. t0.x *= f;
  2293. t0.y *= f;
  2294. t0.z *= f;
  2295. // second tangent
  2296. t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
  2297. t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
  2298. t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
  2299. f = idMath::RSqrt( t1.x * t1.x + t1.y * t1.y + t1.z * t1.z );
  2300. *(unsigned long *)&f ^= signBit;
  2301. t1.x *= f;
  2302. t1.y *= f;
  2303. t1.z *= f;
  2304. if ( used[v0] ) {
  2305. a->normal += n;
  2306. a->tangents[0] += t0;
  2307. a->tangents[1] += t1;
  2308. } else {
  2309. a->normal = n;
  2310. a->tangents[0] = t0;
  2311. a->tangents[1] = t1;
  2312. used[v0] = true;
  2313. }
  2314. if ( used[v1] ) {
  2315. b->normal += n;
  2316. b->tangents[0] += t0;
  2317. b->tangents[1] += t1;
  2318. } else {
  2319. b->normal = n;
  2320. b->tangents[0] = t0;
  2321. b->tangents[1] = t1;
  2322. used[v1] = true;
  2323. }
  2324. if ( used[v2] ) {
  2325. c->normal += n;
  2326. c->tangents[0] += t0;
  2327. c->tangents[1] += t1;
  2328. } else {
  2329. c->normal = n;
  2330. c->tangents[0] = t0;
  2331. c->tangents[1] = t1;
  2332. used[v2] = true;
  2333. }
  2334. }
  2335. }
  2336. /*
  2337. ============
  2338. idSIMD_Generic::DeriveUnsmoothedTangents
  2339. Derives the normal and orthogonal tangent vectors for the triangle vertices.
  2340. For each vertex the normal and tangent vectors are derived from a single dominant triangle.
  2341. ============
  2342. */
  2343. #define DERIVE_UNSMOOTHED_BITANGENT
  2344. void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
  2345. int i;
  2346. for ( i = 0; i < numVerts; i++ ) {
  2347. idDrawVert *a, *b, *c;
  2348. float d0, d1, d2, d3, d4;
  2349. float d5, d6, d7, d8, d9;
  2350. float s0, s1, s2;
  2351. float n0, n1, n2;
  2352. float t0, t1, t2;
  2353. float t3, t4, t5;
  2354. const dominantTri_s &dt = dominantTris[i];
  2355. a = verts + i;
  2356. b = verts + dt.v2;
  2357. c = verts + dt.v3;
  2358. d0 = b->xyz[0] - a->xyz[0];
  2359. d1 = b->xyz[1] - a->xyz[1];
  2360. d2 = b->xyz[2] - a->xyz[2];
  2361. d3 = b->st[0] - a->st[0];
  2362. d4 = b->st[1] - a->st[1];
  2363. d5 = c->xyz[0] - a->xyz[0];
  2364. d6 = c->xyz[1] - a->xyz[1];
  2365. d7 = c->xyz[2] - a->xyz[2];
  2366. d8 = c->st[0] - a->st[0];
  2367. d9 = c->st[1] - a->st[1];
  2368. s0 = dt.normalizationScale[0];
  2369. s1 = dt.normalizationScale[1];
  2370. s2 = dt.normalizationScale[2];
  2371. n0 = s2 * ( d6 * d2 - d7 * d1 );
  2372. n1 = s2 * ( d7 * d0 - d5 * d2 );
  2373. n2 = s2 * ( d5 * d1 - d6 * d0 );
  2374. t0 = s0 * ( d0 * d9 - d4 * d5 );
  2375. t1 = s0 * ( d1 * d9 - d4 * d6 );
  2376. t2 = s0 * ( d2 * d9 - d4 * d7 );
  2377. #ifndef DERIVE_UNSMOOTHED_BITANGENT
  2378. t3 = s1 * ( d3 * d5 - d0 * d8 );
  2379. t4 = s1 * ( d3 * d6 - d1 * d8 );
  2380. t5 = s1 * ( d3 * d7 - d2 * d8 );
  2381. #else
  2382. t3 = s1 * ( n2 * t1 - n1 * t2 );
  2383. t4 = s1 * ( n0 * t2 - n2 * t0 );
  2384. t5 = s1 * ( n1 * t0 - n0 * t1 );
  2385. #endif
  2386. a->normal[0] = n0;
  2387. a->normal[1] = n1;
  2388. a->normal[2] = n2;
  2389. a->tangents[0][0] = t0;
  2390. a->tangents[0][1] = t1;
  2391. a->tangents[0][2] = t2;
  2392. a->tangents[1][0] = t3;
  2393. a->tangents[1][1] = t4;
  2394. a->tangents[1][2] = t5;
  2395. }
  2396. }
  2397. /*
  2398. ============
  2399. idSIMD_Generic::NormalizeTangents
  2400. Normalizes each vertex normal and projects and normalizes the
  2401. tangent vectors onto the plane orthogonal to the vertex normal.
  2402. ============
  2403. */
  2404. void VPCALL idSIMD_Generic::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
  2405. for ( int i = 0; i < numVerts; i++ ) {
  2406. idVec3 &v = verts[i].normal;
  2407. float f;
  2408. f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
  2409. v.x *= f; v.y *= f; v.z *= f;
  2410. for ( int j = 0; j < 2; j++ ) {
  2411. idVec3 &t = verts[i].tangents[j];
  2412. t -= ( t * v ) * v;
  2413. f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
  2414. t.x *= f; t.y *= f; t.z *= f;
  2415. }
  2416. }
  2417. }
  2418. /*
  2419. ============
  2420. idSIMD_Generic::CreateTextureSpaceLightVectors
  2421. Calculates light vectors in texture space for the given triangle vertices.
  2422. For each vertex the direction towards the light origin is projected onto texture space.
  2423. The light vectors are only calculated for the vertices referenced by the indexes.
  2424. ============
  2425. */
  2426. void VPCALL idSIMD_Generic::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  2427. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  2428. memset( used, 0, numVerts * sizeof( used[0] ) );
  2429. for ( int i = numIndexes - 1; i >= 0; i-- ) {
  2430. used[indexes[i]] = true;
  2431. }
  2432. for ( int i = 0; i < numVerts; i++ ) {
  2433. if ( !used[i] ) {
  2434. continue;
  2435. }
  2436. const idDrawVert *v = &verts[i];
  2437. idVec3 lightDir = lightOrigin - v->xyz;
  2438. lightVectors[i][0] = lightDir * v->tangents[0];
  2439. lightVectors[i][1] = lightDir * v->tangents[1];
  2440. lightVectors[i][2] = lightDir * v->normal;
  2441. }
  2442. }
  2443. /*
  2444. ============
  2445. idSIMD_Generic::CreateSpecularTextureCoords
  2446. Calculates specular texture coordinates for the given triangle vertices.
  2447. For each vertex the normalized direction towards the light origin is added to the
  2448. normalized direction towards the view origin and the result is projected onto texture space.
  2449. The texture coordinates are only calculated for the vertices referenced by the indexes.
  2450. ============
  2451. */
  2452. void VPCALL idSIMD_Generic::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
  2453. bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
  2454. memset( used, 0, numVerts * sizeof( used[0] ) );
  2455. for ( int i = numIndexes - 1; i >= 0; i-- ) {
  2456. used[indexes[i]] = true;
  2457. }
  2458. for ( int i = 0; i < numVerts; i++ ) {
  2459. if ( !used[i] ) {
  2460. continue;
  2461. }
  2462. const idDrawVert *v = &verts[i];
  2463. idVec3 lightDir = lightOrigin - v->xyz;
  2464. idVec3 viewDir = viewOrigin - v->xyz;
  2465. float ilength;
  2466. ilength = idMath::RSqrt( lightDir * lightDir );
  2467. lightDir[0] *= ilength;
  2468. lightDir[1] *= ilength;
  2469. lightDir[2] *= ilength;
  2470. ilength = idMath::RSqrt( viewDir * viewDir );
  2471. viewDir[0] *= ilength;
  2472. viewDir[1] *= ilength;
  2473. viewDir[2] *= ilength;
  2474. lightDir += viewDir;
  2475. texCoords[i][0] = lightDir * v->tangents[0];
  2476. texCoords[i][1] = lightDir * v->tangents[1];
  2477. texCoords[i][2] = lightDir * v->normal;
  2478. texCoords[i][3] = 1.0f;
  2479. }
  2480. }
  2481. /*
  2482. ============
  2483. idSIMD_Generic::CreateShadowCache
  2484. ============
  2485. */
  2486. int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
  2487. int outVerts = 0;
  2488. for ( int i = 0; i < numVerts; i++ ) {
  2489. if ( vertRemap[i] ) {
  2490. continue;
  2491. }
  2492. const float *v = verts[i].xyz.ToFloatPtr();
  2493. vertexCache[outVerts+0][0] = v[0];
  2494. vertexCache[outVerts+0][1] = v[1];
  2495. vertexCache[outVerts+0][2] = v[2];
  2496. vertexCache[outVerts+0][3] = 1.0f;
  2497. // R_SetupProjection() builds the projection matrix with a slight crunch
  2498. // for depth, which keeps this w=0 division from rasterizing right at the
  2499. // wrap around point and causing depth fighting with the rear caps
  2500. vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
  2501. vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
  2502. vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
  2503. vertexCache[outVerts+1][3] = 0.0f;
  2504. vertRemap[i] = outVerts;
  2505. outVerts += 2;
  2506. }
  2507. return outVerts;
  2508. }
  2509. /*
  2510. ============
  2511. idSIMD_Generic::CreateVertexProgramShadowCache
  2512. ============
  2513. */
  2514. int VPCALL idSIMD_Generic::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
  2515. for ( int i = 0; i < numVerts; i++ ) {
  2516. const float *v = verts[i].xyz.ToFloatPtr();
  2517. vertexCache[i*2+0][0] = v[0];
  2518. vertexCache[i*2+1][0] = v[0];
  2519. vertexCache[i*2+0][1] = v[1];
  2520. vertexCache[i*2+1][1] = v[1];
  2521. vertexCache[i*2+0][2] = v[2];
  2522. vertexCache[i*2+1][2] = v[2];
  2523. vertexCache[i*2+0][3] = 1.0f;
  2524. vertexCache[i*2+1][3] = 0.0f;
  2525. }
  2526. return numVerts * 2;
  2527. }
  2528. /*
  2529. ============
  2530. idSIMD_Generic::UpSamplePCMTo44kHz
  2531. Duplicate samples for 44kHz output.
  2532. ============
  2533. */
  2534. void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
  2535. if ( kHz == 11025 ) {
  2536. if ( numChannels == 1 ) {
  2537. for ( int i = 0; i < numSamples; i++ ) {
  2538. dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
  2539. }
  2540. } else {
  2541. for ( int i = 0; i < numSamples; i += 2 ) {
  2542. dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
  2543. dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
  2544. }
  2545. }
  2546. } else if ( kHz == 22050 ) {
  2547. if ( numChannels == 1 ) {
  2548. for ( int i = 0; i < numSamples; i++ ) {
  2549. dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
  2550. }
  2551. } else {
  2552. for ( int i = 0; i < numSamples; i += 2 ) {
  2553. dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
  2554. dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
  2555. }
  2556. }
  2557. } else if ( kHz == 44100 ) {
  2558. for ( int i = 0; i < numSamples; i++ ) {
  2559. dest[i] = (float) src[i];
  2560. }
  2561. } else {
  2562. assert( 0 );
  2563. }
  2564. }
  2565. /*
  2566. ============
  2567. idSIMD_Generic::UpSampleOGGTo44kHz
  2568. Duplicate samples for 44kHz output.
  2569. ============
  2570. */
  2571. void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
  2572. if ( kHz == 11025 ) {
  2573. if ( numChannels == 1 ) {
  2574. for ( int i = 0; i < numSamples; i++ ) {
  2575. dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
  2576. }
  2577. } else {
  2578. for ( int i = 0; i < numSamples >> 1; i++ ) {
  2579. dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
  2580. dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
  2581. }
  2582. }
  2583. } else if ( kHz == 22050 ) {
  2584. if ( numChannels == 1 ) {
  2585. for ( int i = 0; i < numSamples; i++ ) {
  2586. dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
  2587. }
  2588. } else {
  2589. for ( int i = 0; i < numSamples >> 1; i++ ) {
  2590. dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
  2591. dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
  2592. }
  2593. }
  2594. } else if ( kHz == 44100 ) {
  2595. if ( numChannels == 1 ) {
  2596. for ( int i = 0; i < numSamples; i++ ) {
  2597. dest[i*1+0] = ogg[0][i] * 32768.0f;
  2598. }
  2599. } else {
  2600. for ( int i = 0; i < numSamples >> 1; i++ ) {
  2601. dest[i*2+0] = ogg[0][i] * 32768.0f;
  2602. dest[i*2+1] = ogg[1][i] * 32768.0f;
  2603. }
  2604. }
  2605. } else {
  2606. assert( 0 );
  2607. }
  2608. }
  2609. /*
  2610. ============
  2611. idSIMD_Generic::MixSoundTwoSpeakerMono
  2612. ============
  2613. */
  2614. void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
  2615. float sL = lastV[0];
  2616. float sR = lastV[1];
  2617. float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  2618. float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  2619. assert( numSamples == MIXBUFFER_SAMPLES );
  2620. for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
  2621. mixBuffer[j*2+0] += samples[j] * sL;
  2622. mixBuffer[j*2+1] += samples[j] * sR;
  2623. sL += incL;
  2624. sR += incR;
  2625. }
  2626. }
  2627. /*
  2628. ============
  2629. idSIMD_Generic::MixSoundTwoSpeakerStereo
  2630. ============
  2631. */
  2632. void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
  2633. float sL = lastV[0];
  2634. float sR = lastV[1];
  2635. float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  2636. float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  2637. assert( numSamples == MIXBUFFER_SAMPLES );
  2638. for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
  2639. mixBuffer[j*2+0] += samples[j*2+0] * sL;
  2640. mixBuffer[j*2+1] += samples[j*2+1] * sR;
  2641. sL += incL;
  2642. sR += incR;
  2643. }
  2644. }
  2645. /*
  2646. ============
  2647. idSIMD_Generic::MixSoundSixSpeakerMono
  2648. ============
  2649. */
  2650. void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
  2651. float sL0 = lastV[0];
  2652. float sL1 = lastV[1];
  2653. float sL2 = lastV[2];
  2654. float sL3 = lastV[3];
  2655. float sL4 = lastV[4];
  2656. float sL5 = lastV[5];
  2657. float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  2658. float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  2659. float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
  2660. float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
  2661. float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
  2662. float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
  2663. assert( numSamples == MIXBUFFER_SAMPLES );
  2664. for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
  2665. mixBuffer[i*6+0] += samples[i] * sL0;
  2666. mixBuffer[i*6+1] += samples[i] * sL1;
  2667. mixBuffer[i*6+2] += samples[i] * sL2;
  2668. mixBuffer[i*6+3] += samples[i] * sL3;
  2669. mixBuffer[i*6+4] += samples[i] * sL4;
  2670. mixBuffer[i*6+5] += samples[i] * sL5;
  2671. sL0 += incL0;
  2672. sL1 += incL1;
  2673. sL2 += incL2;
  2674. sL3 += incL3;
  2675. sL4 += incL4;
  2676. sL5 += incL5;
  2677. }
  2678. }
  2679. /*
  2680. ============
  2681. idSIMD_Generic::MixSoundSixSpeakerStereo
  2682. ============
  2683. */
  2684. void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
  2685. float sL0 = lastV[0];
  2686. float sL1 = lastV[1];
  2687. float sL2 = lastV[2];
  2688. float sL3 = lastV[3];
  2689. float sL4 = lastV[4];
  2690. float sL5 = lastV[5];
  2691. float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
  2692. float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
  2693. float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
  2694. float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
  2695. float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
  2696. float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
  2697. assert( numSamples == MIXBUFFER_SAMPLES );
  2698. for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
  2699. mixBuffer[i*6+0] += samples[i*2+0] * sL0;
  2700. mixBuffer[i*6+1] += samples[i*2+1] * sL1;
  2701. mixBuffer[i*6+2] += samples[i*2+0] * sL2;
  2702. mixBuffer[i*6+3] += samples[i*2+0] * sL3;
  2703. mixBuffer[i*6+4] += samples[i*2+0] * sL4;
  2704. mixBuffer[i*6+5] += samples[i*2+1] * sL5;
  2705. sL0 += incL0;
  2706. sL1 += incL1;
  2707. sL2 += incL2;
  2708. sL3 += incL3;
  2709. sL4 += incL4;
  2710. sL5 += incL5;
  2711. }
  2712. }
  2713. /*
  2714. ============
  2715. idSIMD_Generic::MixedSoundToSamples
  2716. ============
  2717. */
  2718. void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
  2719. for ( int i = 0; i < numSamples; i++ ) {
  2720. if ( mixBuffer[i] <= -32768.0f ) {
  2721. samples[i] = -32768;
  2722. } else if ( mixBuffer[i] >= 32767.0f ) {
  2723. samples[i] = 32767;
  2724. } else {
  2725. samples[i] = (short) mixBuffer[i];
  2726. }
  2727. }
  2728. }