Simd_SSE2.cpp 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878
  1. /*
  2. ===========================================================================
  3. Doom 3 GPL Source Code
  4. Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
  6. Doom 3 Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #include "../precompiled.h"
  21. #pragma hdrstop
  22. #include "Simd_Generic.h"
  23. #include "Simd_MMX.h"
  24. #include "Simd_SSE.h"
  25. #include "Simd_SSE2.h"
  26. //===============================================================
  27. //
  28. // SSE2 implementation of idSIMDProcessor
  29. //
  30. //===============================================================
  31. #if defined(MACOS_X) && defined(__i386__)
  32. #include <xmmintrin.h>
  // Build the 8-bit immediate used by shufps / _mm_shuffle_ps from four 2-bit selectors.
  // SHUFFLEPS places x in the two highest bits and w in the two lowest bits;
  // R_SHUFFLEPS takes the same four selectors in reversed order (x in the two lowest bits).
  33. #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
  34. #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
  35. /*
  36. ============
  37. idSIMD_SSE2::GetName
  38. ============
  39. */
  40. const char * idSIMD_SSE2::GetName( void ) const {
  41. return "MMX & SSE & SSE2";
  42. }
  43. /*
  44. ============
  45. idSIMD_SSE::CmpLT
  46. dst[i] |= ( src0[i] < constant ) << bitNum;
  47. ============
  48. */
  49. void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
  50. int i, cnt, pre, post;
  51. float *aligned;
  52. __m128 xmm0, xmm1;
  53. __m128i xmm0i;
  54. int cnt_l;
  55. char *src0_p;
  56. char *constant_p;
  57. char *dst_p;
  58. int mask_l;
  59. int dst_l;
  60. /* if the float array is not aligned on a 4 byte boundary */
  61. if ( ((int) src0) & 3 ) {
  62. /* unaligned memory access */
  63. pre = 0;
  64. cnt = count >> 2;
  65. post = count - (cnt<<2);
  66. /*
  67. __asm mov edx, cnt
  68. __asm test edx, edx
  69. __asm je doneCmp
  70. */
  71. cnt_l = cnt;
  72. if(cnt_l != 0) {
  73. /*
  74. __asm push ebx
  75. __asm neg edx
  76. __asm mov esi, src0
  77. __asm prefetchnta [esi+64]
  78. __asm movss xmm1, constant
  79. __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  80. __asm mov edi, dst
  81. __asm mov cl, bitNum
  82. */
  83. cnt_l = -cnt_l;
  84. src0_p = (char *) src0;
  85. _mm_prefetch(src0_p+64, _MM_HINT_NTA);
  86. constant_p = (char *) &constant;
  87. xmm1 = _mm_load_ss((float *)constant_p);
  88. xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
  89. dst_p = (char *)dst;
  90. /*
  91. __asm loopNA:
  92. */
  93. do {
  94. /*
  95. __asm movups xmm0, [esi]
  96. __asm prefetchnta [esi+128]
  97. __asm cmpltps xmm0, xmm1
  98. __asm movmskps eax, xmm0 \
  99. __asm mov ah, al
  100. __asm shr ah, 1
  101. __asm mov bx, ax
  102. __asm shl ebx, 14
  103. __asm mov bx, ax
  104. __asm and ebx, 0x01010101
  105. __asm shl ebx, cl
  106. __asm or ebx, dword ptr [edi]
  107. __asm mov dword ptr [edi], ebx
  108. __asm add esi, 16
  109. __asm add edi, 4
  110. __asm inc edx
  111. __asm jl loopNA
  112. __asm pop ebx
  113. */
  114. xmm0 = _mm_loadu_ps((float *) src0_p);
  115. _mm_prefetch(src0_p+128, _MM_HINT_NTA);
  116. xmm0 = _mm_cmplt_ps(xmm0, xmm1);
  117. // Simplify using SSE2
  118. xmm0i = (__m128i) xmm0;
  119. xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
  120. xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
  121. mask_l = _mm_cvtsi128_si32(xmm0i);
  122. // End
  123. mask_l = mask_l & 0x01010101;
  124. mask_l = mask_l << bitNum;
  125. dst_l = *((int *) dst_p);
  126. mask_l = mask_l | dst_l;
  127. *((int *) dst_p) = mask_l;
  128. src0_p = src0_p + 16;
  129. dst_p = dst_p + 4;
  130. cnt_l = cnt_l + 1;
  131. } while (cnt_l < 0);
  132. }
  133. }
  134. else {
  135. /* aligned memory access */
  136. aligned = (float *) ((((int) src0) + 15) & ~15);
  137. if ( (int)aligned > ((int)src0) + count ) {
  138. pre = count;
  139. post = 0;
  140. }
  141. else {
  142. pre = aligned - src0;
  143. cnt = (count - pre) >> 2;
  144. post = count - pre - (cnt<<2);
  145. /*
  146. __asm mov edx, cnt
  147. __asm test edx, edx
  148. __asm je doneCmp
  149. */
  150. cnt_l = cnt;
  151. if(cnt_l != 0) {
  152. /*
  153. __asm push ebx
  154. __asm neg edx
  155. __asm mov esi, aligned
  156. __asm prefetchnta [esi+64]
  157. __asm movss xmm1, constant
  158. __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
  159. __asm mov edi, dst
  160. __asm add edi, pre
  161. __asm mov cl, bitNum
  162. */
  163. cnt_l = -cnt_l;
  164. src0_p = (char *) src0;
  165. _mm_prefetch(src0_p+64, _MM_HINT_NTA);
  166. constant_p = (char *) &constant;
  167. xmm1 = _mm_load_ss((float *)constant_p);
  168. xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
  169. dst_p = (char *)dst;
  170. dst_p = dst_p + pre;
  171. /*
  172. __asm loopA:
  173. */
  174. do {
  175. /*
  176. __asm movaps xmm0, [esi]
  177. __asm prefetchnta [esi+128]
  178. __asm cmpltps xmm0, xmm1
  179. __asm movmskps eax, xmm0 \
  180. __asm mov ah, al
  181. __asm shr ah, 1
  182. __asm mov bx, ax
  183. __asm shl ebx, 14
  184. __asm mov bx, ax
  185. __asm and ebx, 0x01010101
  186. __asm shl ebx, cl
  187. __asm or ebx, dword ptr [edi]
  188. __asm mov dword ptr [edi], ebx
  189. __asm add esi, 16
  190. __asm add edi, 4
  191. __asm inc edx
  192. __asm jl loopA
  193. __asm pop ebx
  194. */
  195. xmm0 = _mm_load_ps((float *) src0_p);
  196. _mm_prefetch(src0_p+128, _MM_HINT_NTA);
  197. xmm0 = _mm_cmplt_ps(xmm0, xmm1);
  198. // Simplify using SSE2
  199. xmm0i = (__m128i) xmm0;
  200. xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
  201. xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
  202. mask_l = _mm_cvtsi128_si32(xmm0i);
  203. // End
  204. mask_l = mask_l & 0x01010101;
  205. mask_l = mask_l << bitNum;
  206. dst_l = *((int *) dst_p);
  207. mask_l = mask_l | dst_l;
  208. *((int *) dst_p) = mask_l;
  209. src0_p = src0_p + 16;
  210. dst_p = dst_p + 4;
  211. cnt_l = cnt_l + 1;
  212. } while (cnt_l < 0);
  213. }
  214. }
  215. }
  216. /*
  217. doneCmp:
  218. */
  219. float c = constant;
  220. for ( i = 0; i < pre; i++ ) {
  221. dst[i] |= ( src0[i] < c ) << bitNum;
  222. }
  223. for ( i = count - post; i < count; i++ ) {
  224. dst[i] |= ( src0[i] < c ) << bitNum;
  225. }
  226. }
  227. #elif defined(_WIN32)
  228. #include <xmmintrin.h>
  // Build the 8-bit immediate for shufps from four 2-bit selectors.
  // SHUFFLEPS places x in the two highest bits; R_SHUFFLEPS takes the selectors in reversed order.
  229. #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
  230. #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
  // Same for shufpd: a 2-bit immediate from two 1-bit selectors (x in bit 1 for SHUFFLEPD, in bit 0 for R_SHUFFLEPD).
  231. #define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
  232. #define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
  // Declare a static array wrapped in ALIGN16 (defined outside this file; presumably 16 byte alignment).
  // X supplies both the element type and the variable name, e.g. "float SIMD_SP_zero".
  // ALIGN4_INIT1 / ALIGN8_INIT1 fill all 4 / 8 elements with INIT, ALIGN4_INIT4 takes four separate values.
  233. #define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
  234. #define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
  235. #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
  // 8 x 16 bit constants
  236. ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
  237. ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
  // 4 x 32 bit masks; bit 31 is the sign bit of a single precision float
  238. ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
  239. ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
  240. ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
  // NOTE(review): this value has every bit set except bit 23; confirm that this is the intended "infinity" mask
  241. ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
  // 4 x float constants, the same value in every element
  242. ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
  243. ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
  244. ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
  245. ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
  246. ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
  247. ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
  248. ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
  249. ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
  250. ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
  251. ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
  252. ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
  253. ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
  254. /*
  255. ============
  256. idSIMD_SSE2::GetName
  257. ============
  258. */
  259. const char * idSIMD_SSE2::GetName( void ) const {
  260. return "MMX & SSE & SSE2";
  261. }
  262. #if 0 // the SSE2 code is ungodly slow
  263. /*
  264. ============
  265. idSIMD_SSE2::MatX_LowerTriangularSolve
  266. solves x in Lx = b for the n * n sub-matrix of L
  267. if skip > 0 the first skip elements of x are assumed to be valid already
  268. L has to be a lower triangular matrix with (implicit) ones on the diagonal
  269. x == b is allowed
  270. ============
  271. */
  272. void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
  273. int nc;
  274. const float *lptr;
  275. if ( skip >= n ) {
  276. return;
  277. }
  278. lptr = L[skip];
  279. nc = L.GetNumColumns();
  280. // unrolled cases for n < 8
  281. if ( n < 8 ) {
  282. #define NSKIP( n, s ) ((n<<3)|(s&7))
  283. switch( NSKIP( n, skip ) ) {
  284. case NSKIP( 1, 0 ): x[0] = b[0];
  285. return;
  286. case NSKIP( 2, 0 ): x[0] = b[0];
  287. case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  288. return;
  289. case NSKIP( 3, 0 ): x[0] = b[0];
  290. case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  291. case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  292. return;
  293. case NSKIP( 4, 0 ): x[0] = b[0];
  294. case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  295. case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  296. case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  297. return;
  298. case NSKIP( 5, 0 ): x[0] = b[0];
  299. case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  300. case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  301. case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  302. case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
  303. return;
  304. case NSKIP( 6, 0 ): x[0] = b[0];
  305. case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  306. case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  307. case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  308. case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
  309. case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
  310. return;
  311. case NSKIP( 7, 0 ): x[0] = b[0];
  312. case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
  313. case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  314. case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  315. case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
  316. case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
  317. case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
  318. return;
  319. }
  320. return;
  321. }
  322. // process first 4 rows
  323. switch( skip ) {
  324. case 0: x[0] = b[0];
  325. case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
  326. case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
  327. case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
  328. skip = 4;
  329. }
  330. lptr = L[skip];
  331. __asm {
  332. push ebx
  333. mov eax, skip // eax = i
  334. shl eax, 2 // eax = i*4
  335. mov edx, n // edx = n
  336. shl edx, 2 // edx = n*4
  337. mov esi, x // esi = x
  338. mov edi, lptr // edi = lptr
  339. add esi, eax
  340. add edi, eax
  341. mov ebx, b // ebx = b
  342. // aligned
  343. looprow:
  344. mov ecx, eax
  345. neg ecx
  346. cvtps2pd xmm0, [esi+ecx]
  347. cvtps2pd xmm2, [edi+ecx]
  348. mulpd xmm0, xmm2
  349. cvtps2pd xmm1, [esi+ecx+8]
  350. cvtps2pd xmm3, [edi+ecx+8]
  351. mulpd xmm1, xmm3
  352. add ecx, 20*4
  353. jg donedot16
  354. dot16:
  355. cvtps2pd xmm2, [esi+ecx-(16*4)]
  356. cvtps2pd xmm3, [edi+ecx-(16*4)]
  357. cvtps2pd xmm4, [esi+ecx-(14*4)]
  358. mulpd xmm2, xmm3
  359. cvtps2pd xmm5, [edi+ecx-(14*4)]
  360. addpd xmm0, xmm2
  361. cvtps2pd xmm2, [esi+ecx-(12*4)]
  362. mulpd xmm4, xmm5
  363. cvtps2pd xmm3, [edi+ecx-(12*4)]
  364. addpd xmm1, xmm4
  365. cvtps2pd xmm4, [esi+ecx-(10*4)]
  366. mulpd xmm2, xmm3
  367. cvtps2pd xmm5, [edi+ecx-(10*4)]
  368. addpd xmm0, xmm2
  369. cvtps2pd xmm2, [esi+ecx-(8*4)]
  370. mulpd xmm4, xmm5
  371. cvtps2pd xmm3, [edi+ecx-(8*4)]
  372. addpd xmm1, xmm4
  373. cvtps2pd xmm4, [esi+ecx-(6*4)]
  374. mulpd xmm2, xmm3
  375. cvtps2pd xmm5, [edi+ecx-(6*4)]
  376. addpd xmm0, xmm2
  377. cvtps2pd xmm2, [esi+ecx-(4*4)]
  378. mulpd xmm4, xmm5
  379. cvtps2pd xmm3, [edi+ecx-(4*4)]
  380. addpd xmm1, xmm4
  381. cvtps2pd xmm4, [esi+ecx-(2*4)]
  382. mulpd xmm2, xmm3
  383. cvtps2pd xmm5, [edi+ecx-(2*4)]
  384. addpd xmm0, xmm2
  385. add ecx, 16*4
  386. mulpd xmm4, xmm5
  387. addpd xmm1, xmm4
  388. jle dot16
  389. donedot16:
  390. sub ecx, 8*4
  391. jg donedot8
  392. dot8:
  393. cvtps2pd xmm2, [esi+ecx-(8*4)]
  394. cvtps2pd xmm3, [edi+ecx-(8*4)]
  395. cvtps2pd xmm7, [esi+ecx-(6*4)]
  396. mulpd xmm2, xmm3
  397. cvtps2pd xmm5, [edi+ecx-(6*4)]
  398. addpd xmm0, xmm2
  399. cvtps2pd xmm6, [esi+ecx-(4*4)]
  400. mulpd xmm7, xmm5
  401. cvtps2pd xmm3, [edi+ecx-(4*4)]
  402. addpd xmm1, xmm7
  403. cvtps2pd xmm4, [esi+ecx-(2*4)]
  404. mulpd xmm6, xmm3
  405. cvtps2pd xmm7, [edi+ecx-(2*4)]
  406. addpd xmm0, xmm6
  407. add ecx, 8*4
  408. mulpd xmm4, xmm7
  409. addpd xmm1, xmm4
  410. donedot8:
  411. sub ecx, 4*4
  412. jg donedot4
  413. dot4:
  414. cvtps2pd xmm2, [esi+ecx-(4*4)]
  415. cvtps2pd xmm3, [edi+ecx-(4*4)]
  416. cvtps2pd xmm4, [esi+ecx-(2*4)]
  417. mulpd xmm2, xmm3
  418. cvtps2pd xmm5, [edi+ecx-(2*4)]
  419. addpd xmm0, xmm2
  420. add ecx, 4*4
  421. mulpd xmm4, xmm5
  422. addpd xmm1, xmm4
  423. donedot4:
  424. addpd xmm0, xmm1
  425. movaps xmm1, xmm0
  426. shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
  427. addsd xmm0, xmm1
  428. sub ecx, 4*4
  429. jz dot0
  430. add ecx, 4
  431. jz dot1
  432. add ecx, 4
  433. jz dot2
  434. //dot3:
  435. cvtss2sd xmm1, [esi-(3*4)]
  436. cvtss2sd xmm2, [edi-(3*4)]
  437. mulsd xmm1, xmm2
  438. addsd xmm0, xmm1
  439. dot2:
  440. cvtss2sd xmm3, [esi-(2*4)]
  441. cvtss2sd xmm4, [edi-(2*4)]
  442. mulsd xmm3, xmm4
  443. addsd xmm0, xmm3
  444. dot1:
  445. cvtss2sd xmm5, [esi-(1*4)]
  446. cvtss2sd xmm6, [edi-(1*4)]
  447. mulsd xmm5, xmm6
  448. addsd xmm0, xmm5
  449. dot0:
  450. cvtss2sd xmm1, [ebx+eax]
  451. subsd xmm1, xmm0
  452. cvtsd2ss xmm0, xmm1
  453. movss [esi], xmm0
  454. add eax, 4
  455. cmp eax, edx
  456. jge done
  457. add esi, 4
  458. mov ecx, nc
  459. shl ecx, 2
  460. add edi, ecx
  461. add edi, 4
  462. jmp looprow
  463. // done
  464. done:
  465. pop ebx
  466. }
  467. }
  468. /*
  469. ============
  470. idSIMD_SSE2::MatX_LowerTriangularSolveTranspose
  471. solves x in L'x = b for the n * n sub-matrix of L
  472. L has to be a lower triangular matrix with (implicit) ones on the diagonal
  473. x == b is allowed
  474. ============
  475. */
  476. void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
  477. int nc;
  478. const float *lptr;
  479. lptr = L.ToFloatPtr();
  480. nc = L.GetNumColumns();
  481. // unrolled cases for n < 8
  482. if ( n < 8 ) {
  483. switch( n ) {
  484. case 0:
  485. return;
  486. case 1:
  487. x[0] = b[0];
  488. return;
  489. case 2:
  490. x[1] = b[1];
  491. x[0] = b[0] - lptr[1*nc+0] * x[1];
  492. return;
  493. case 3:
  494. x[2] = b[2];
  495. x[1] = b[1] - lptr[2*nc+1] * x[2];
  496. x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  497. return;
  498. case 4:
  499. x[3] = b[3];
  500. x[2] = b[2] - lptr[3*nc+2] * x[3];
  501. x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  502. x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  503. return;
  504. case 5:
  505. x[4] = b[4];
  506. x[3] = b[3] - lptr[4*nc+3] * x[4];
  507. x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
  508. x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  509. x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  510. return;
  511. case 6:
  512. x[5] = b[5];
  513. x[4] = b[4] - lptr[5*nc+4] * x[5];
  514. x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
  515. x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
  516. x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  517. x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  518. return;
  519. case 7:
  520. x[6] = b[6];
  521. x[5] = b[5] - lptr[6*nc+5] * x[6];
  522. x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
  523. x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
  524. x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
  525. x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
  526. x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
  527. return;
  528. }
  529. return;
  530. }
  531. int i, j, m;
  532. float *xptr;
  533. double s0;
  534. // if the number of columns is not a multiple of 2 we're screwed for alignment.
  535. // however, if the number of columns is a multiple of 2 but the number of to be
  536. // processed rows is not a multiple of 2 we can still run 8 byte aligned
  537. m = n;
  538. if ( m & 1 ) {
  539. m--;
  540. x[m] = b[m];
  541. lptr = L[m] + m - 4;
  542. xptr = x + m;
  543. __asm {
  544. push ebx
  545. mov eax, m // eax = i
  546. mov esi, xptr // esi = xptr
  547. mov edi, lptr // edi = lptr
  548. mov ebx, b // ebx = b
  549. mov edx, nc // edx = nc*sizeof(float)
  550. shl edx, 2
  551. process4rows_1:
  552. cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
  553. cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
  554. xor ecx, ecx
  555. sub eax, m
  556. neg eax
  557. jz done4x4_1
  558. process4x4_1: // process 4x4 blocks
  559. cvtps2pd xmm3, [edi]
  560. cvtps2pd xmm4, [edi+8]
  561. add edi, edx
  562. cvtss2sd xmm5, [esi+4*ecx+0]
  563. shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
  564. mulpd xmm3, xmm5
  565. cvtps2pd xmm1, [edi]
  566. mulpd xmm4, xmm5
  567. cvtps2pd xmm6, [edi+8]
  568. subpd xmm0, xmm3
  569. subpd xmm2, xmm4
  570. add edi, edx
  571. cvtss2sd xmm7, [esi+4*ecx+4]
  572. shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
  573. mulpd xmm1, xmm7
  574. cvtps2pd xmm3, [edi]
  575. mulpd xmm6, xmm7
  576. cvtps2pd xmm4, [edi+8]
  577. subpd xmm0, xmm1
  578. subpd xmm2, xmm6
  579. add edi, edx
  580. cvtss2sd xmm5, [esi+4*ecx+8]
  581. shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
  582. mulpd xmm3, xmm5
  583. cvtps2pd xmm1, [edi]
  584. mulpd xmm4, xmm5
  585. cvtps2pd xmm6, [edi+8]
  586. subpd xmm0, xmm3
  587. subpd xmm2, xmm4
  588. add edi, edx
  589. cvtss2sd xmm7, [esi+4*ecx+12]
  590. shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
  591. mulpd xmm1, xmm7
  592. add ecx, 4
  593. mulpd xmm6, xmm7
  594. cmp ecx, eax
  595. subpd xmm0, xmm1
  596. subpd xmm2, xmm6
  597. jl process4x4_1
  598. done4x4_1: // process left over of the 4 rows
  599. cvtps2pd xmm3, [edi]
  600. cvtps2pd xmm4, [edi+8]
  601. cvtss2sd xmm5, [esi+4*ecx]
  602. shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
  603. mulpd xmm3, xmm5
  604. mulpd xmm4, xmm5
  605. subpd xmm0, xmm3
  606. subpd xmm2, xmm4
  607. imul ecx, edx
  608. sub edi, ecx
  609. neg eax
  610. add eax, m
  611. sub eax, 4
  612. movapd xmm1, xmm0
  613. shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
  614. movapd xmm3, xmm2
  615. shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
  616. sub edi, edx
  617. cvtsd2ss xmm7, xmm3
  618. movss [esi-4], xmm7 // xptr[-1] = s3
  619. movsd xmm4, xmm3
  620. movsd xmm5, xmm3
  621. cvtss2sd xmm7, [edi+8]
  622. mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
  623. cvtss2sd xmm7, [edi+4]
  624. mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
  625. cvtss2sd xmm7, [edi]
  626. mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
  627. subsd xmm2, xmm3
  628. cvtsd2ss xmm7, xmm2
  629. movss [esi-8], xmm7 // xptr[-2] = s2
  630. movsd xmm6, xmm2
  631. sub edi, edx
  632. subsd xmm0, xmm5
  633. subsd xmm1, xmm4
  634. cvtss2sd xmm7, [edi+4]
  635. mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
  636. cvtss2sd xmm7, [edi]
  637. mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
  638. subsd xmm1, xmm2
  639. cvtsd2ss xmm7, xmm1
  640. movss [esi-12], xmm7 // xptr[-3] = s1
  641. subsd xmm0, xmm6
  642. sub edi, edx
  643. cmp eax, 4
  644. cvtss2sd xmm7, [edi]
  645. mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
  646. subsd xmm0, xmm1
  647. cvtsd2ss xmm7, xmm0
  648. movss [esi-16], xmm7 // xptr[-4] = s0
  649. jl done4rows_1
  650. sub edi, edx
  651. sub edi, 16
  652. sub esi, 16
  653. jmp process4rows_1
  654. done4rows_1:
  655. pop ebx
  656. }
  657. }
  658. else {
  659. lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
  660. xptr = x + m;
  661. __asm {
  662. push ebx
  663. mov eax, m // eax = i
  664. mov esi, xptr // esi = xptr
  665. mov edi, lptr // edi = lptr
  666. mov ebx, b // ebx = b
  667. mov edx, nc // edx = nc*sizeof(float)
  668. shl edx, 2
  669. process4rows:
  670. cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
  671. cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
  672. sub eax, m
  673. jz done4x4
  674. neg eax
  675. xor ecx, ecx
  676. process4x4: // process 4x4 blocks
  677. cvtps2pd xmm3, [edi]
  678. cvtps2pd xmm4, [edi+8]
  679. add edi, edx
  680. cvtss2sd xmm5, [esi+4*ecx+0]
  681. shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
  682. mulpd xmm3, xmm5
  683. cvtps2pd xmm1, [edi]
  684. mulpd xmm4, xmm5
  685. cvtps2pd xmm6, [edi+8]
  686. subpd xmm0, xmm3
  687. subpd xmm2, xmm4
  688. add edi, edx
  689. cvtss2sd xmm7, [esi+4*ecx+4]
  690. shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
  691. mulpd xmm1, xmm7
  692. cvtps2pd xmm3, [edi]
  693. mulpd xmm6, xmm7
  694. cvtps2pd xmm4, [edi+8]
  695. subpd xmm0, xmm1
  696. subpd xmm2, xmm6
  697. add edi, edx
  698. cvtss2sd xmm5, [esi+4*ecx+8]
  699. shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
  700. mulpd xmm3, xmm5
  701. cvtps2pd xmm1, [edi]
  702. mulpd xmm4, xmm5
  703. cvtps2pd xmm6, [edi+8]
  704. subpd xmm0, xmm3
  705. subpd xmm2, xmm4
  706. add edi, edx
  707. cvtss2sd xmm7, [esi+4*ecx+12]
  708. shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
  709. mulpd xmm1, xmm7
  710. add ecx, 4
  711. mulpd xmm6, xmm7
  712. cmp ecx, eax
  713. subpd xmm0, xmm1
  714. subpd xmm2, xmm6
  715. jl process4x4
  716. imul ecx, edx
  717. sub edi, ecx
  718. neg eax
  719. done4x4: // process left over of the 4 rows
  720. add eax, m
  721. sub eax, 4
  722. movapd xmm1, xmm0
  723. shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
  724. movapd xmm3, xmm2
  725. shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
  726. sub edi, edx
  727. cvtsd2ss xmm7, xmm3
  728. movss [esi-4], xmm7 // xptr[-1] = s3
  729. movsd xmm4, xmm3
  730. movsd xmm5, xmm3
  731. cvtss2sd xmm7, [edi+8]
  732. mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
  733. cvtss2sd xmm7, [edi+4]
  734. mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
  735. cvtss2sd xmm7, [edi]
  736. mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
  737. subsd xmm2, xmm3
  738. cvtsd2ss xmm7, xmm2
  739. movss [esi-8], xmm7 // xptr[-2] = s2
  740. movsd xmm6, xmm2
  741. sub edi, edx
  742. subsd xmm0, xmm5
  743. subsd xmm1, xmm4
  744. cvtss2sd xmm7, [edi+4]
  745. mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
  746. cvtss2sd xmm7, [edi]
  747. mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
  748. subsd xmm1, xmm2
  749. cvtsd2ss xmm7, xmm1
  750. movss [esi-12], xmm7 // xptr[-3] = s1
  751. subsd xmm0, xmm6
  752. sub edi, edx
  753. cmp eax, 4
  754. cvtss2sd xmm7, [edi]
  755. mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
  756. subsd xmm0, xmm1
  757. cvtsd2ss xmm7, xmm0
  758. movss [esi-16], xmm7 // xptr[-4] = s0
  759. jl done4rows
  760. sub edi, edx
  761. sub edi, 16
  762. sub esi, 16
  763. jmp process4rows
  764. done4rows:
  765. pop ebx
  766. }
  767. }
  768. // process left over rows
  769. for ( i = (m&3)-1; i >= 0; i-- ) {
  770. s0 = b[i];
  771. lptr = L[i+1] + i;
  772. for ( j = i + 1; j < m; j++ ) {
  773. s0 -= lptr[0] * x[j];
  774. lptr += nc;
  775. }
  776. x[i] = s0;
  777. }
  778. }
  779. #endif
  780. /*
  781. ============
  782. idSIMD_SSE2::MixedSoundToSamples
  783. ============
  784. */
  785. void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
  786. assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
  787. __asm {
  788. mov eax, numSamples
  789. mov edi, mixBuffer
  790. mov esi, samples
  791. shl eax, 2
  792. add edi, eax
  793. neg eax
  794. loop16:
  795. movaps xmm0, [edi+eax+0*16]
  796. movaps xmm1, [edi+eax+1*16]
  797. movaps xmm2, [edi+eax+2*16]
  798. movaps xmm3, [edi+eax+3*16]
  799. add esi, 4*4*2
  800. cvtps2dq xmm4, xmm0
  801. cvtps2dq xmm5, xmm1
  802. cvtps2dq xmm6, xmm2
  803. cvtps2dq xmm7, xmm3
  804. prefetchnta [edi+eax+128]
  805. packssdw xmm4, xmm5
  806. packssdw xmm6, xmm7
  807. add eax, 4*16
  808. movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data
  809. movhps [esi-3*4*2], xmm4
  810. movlps [esi-2*4*2], xmm6
  811. movhps [esi-1*4*2], xmm6
  812. jl loop16
  813. }
  814. }
  815. #endif /* _WIN32 */