/*
===========================================================================
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/

#pragma hdrstop
#include "../precompiled.h"

#include "Simd_Generic.h"
#include "Simd_SSE.h"

//===============================================================
//                                                        M
//  SSE implementation of idSIMDProcessor                MrE
//                                                        E
//===============================================================

#include <xmmintrin.h>

#define M_PI	3.14159265358979323846f
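
// Note: _mm_madd_ps, _mm_nmsub_ps, _mm_perm_ps, _mm_splat_ps, _mm_sel_ps and the
// __m128c cast wrapper used below are not standard SSE intrinsics; they are idLib
// helper macros ( madd: a * b + c, nmsub: c - a * b, perm/splat: shuffle forms,
// sel: bitwise select ) that come in through precompiled.h ( sys_intrinsics.h in
// the BFG source tree ).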
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName() const {
	return "MMX & SSE";
}
/*
============
idSIMD_SSE::BlendJoints
============
*/
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	if ( lerp <= 0.0f ) {
		return;
	} else if ( lerp >= 1.0f ) {
		for ( int i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

	const __m128 vlerp = { lerp, lerp, lerp, lerp };

	const __m128 vector_float_one		= { 1.0f, 1.0f, 1.0f, 1.0f };
	const __m128 vector_float_sign_bit	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0	= { -3.0f, -3.0f, -3.0f, -3.0f };
	const __m128 vector_float_rsqrt_c1	= { -0.5f, -0.5f, -0.5f, -0.5f };
	const __m128 vector_float_tiny		= { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
	const __m128 vector_float_half_pi	= { M_PI*0.5f, M_PI*0.5f, M_PI*0.5f, M_PI*0.5f };

	const __m128 vector_float_sin_c0	= { -2.39e-08f, -2.39e-08f, -2.39e-08f, -2.39e-08f };
	const __m128 vector_float_sin_c1	= { 2.7526e-06f, 2.7526e-06f, 2.7526e-06f, 2.7526e-06f };
	const __m128 vector_float_sin_c2	= { -1.98409e-04f, -1.98409e-04f, -1.98409e-04f, -1.98409e-04f };
	const __m128 vector_float_sin_c3	= { 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f };
	const __m128 vector_float_sin_c4	= { -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f };

	const __m128 vector_float_atan_c0	= { 0.0028662257f, 0.0028662257f, 0.0028662257f, 0.0028662257f };
	const __m128 vector_float_atan_c1	= { -0.0161657367f, -0.0161657367f, -0.0161657367f, -0.0161657367f };
	const __m128 vector_float_atan_c2	= { 0.0429096138f, 0.0429096138f, 0.0429096138f, 0.0429096138f };
	const __m128 vector_float_atan_c3	= { -0.0752896400f, -0.0752896400f, -0.0752896400f, -0.0752896400f };
	const __m128 vector_float_atan_c4	= { 0.1065626393f, 0.1065626393f, 0.1065626393f, 0.1065626393f };
	const __m128 vector_float_atan_c5	= { -0.1420889944f, -0.1420889944f, -0.1420889944f, -0.1420889944f };
	const __m128 vector_float_atan_c6	= { 0.1999355085f, 0.1999355085f, 0.1999355085f, 0.1999355085f };
	const __m128 vector_float_atan_c7	= { -0.3333314528f, -0.3333314528f, -0.3333314528f, -0.3333314528f };
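
	// The constants above drive the branchless slerp below: rsqrt_c0/c1 are the
	// Newton-Raphson refinement terms for _mm_rsqrt_ps, and sin_c0..c4 / atan_c0..c7
	// are polynomial coefficients for the same sin and atan approximations that
	// idMath::Sin16 and idMath::ATan16 provide in the scalar tail of this function.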
	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const int n0 = index[i+0];
		const int n1 = index[i+1];
		const int n2 = index[i+2];
		const int n3 = index[i+3];

		__m128 jqa_0 = _mm_load_ps( joints[n0].q.ToFloatPtr() );
		__m128 jqb_0 = _mm_load_ps( joints[n1].q.ToFloatPtr() );
		__m128 jqc_0 = _mm_load_ps( joints[n2].q.ToFloatPtr() );
		__m128 jqd_0 = _mm_load_ps( joints[n3].q.ToFloatPtr() );

		__m128 jta_0 = _mm_load_ps( joints[n0].t.ToFloatPtr() );
		__m128 jtb_0 = _mm_load_ps( joints[n1].t.ToFloatPtr() );
		__m128 jtc_0 = _mm_load_ps( joints[n2].t.ToFloatPtr() );
		__m128 jtd_0 = _mm_load_ps( joints[n3].t.ToFloatPtr() );

		__m128 bqa_0 = _mm_load_ps( blendJoints[n0].q.ToFloatPtr() );
		__m128 bqb_0 = _mm_load_ps( blendJoints[n1].q.ToFloatPtr() );
		__m128 bqc_0 = _mm_load_ps( blendJoints[n2].q.ToFloatPtr() );
		__m128 bqd_0 = _mm_load_ps( blendJoints[n3].q.ToFloatPtr() );

		__m128 bta_0 = _mm_load_ps( blendJoints[n0].t.ToFloatPtr() );
		__m128 btb_0 = _mm_load_ps( blendJoints[n1].t.ToFloatPtr() );
		__m128 btc_0 = _mm_load_ps( blendJoints[n2].t.ToFloatPtr() );
		__m128 btd_0 = _mm_load_ps( blendJoints[n3].t.ToFloatPtr() );

		bta_0 = _mm_sub_ps( bta_0, jta_0 );
		btb_0 = _mm_sub_ps( btb_0, jtb_0 );
		btc_0 = _mm_sub_ps( btc_0, jtc_0 );
		btd_0 = _mm_sub_ps( btd_0, jtd_0 );

		jta_0 = _mm_madd_ps( vlerp, bta_0, jta_0 );
		jtb_0 = _mm_madd_ps( vlerp, btb_0, jtb_0 );
		jtc_0 = _mm_madd_ps( vlerp, btc_0, jtc_0 );
		jtd_0 = _mm_madd_ps( vlerp, btd_0, jtd_0 );

		_mm_store_ps( joints[n0].t.ToFloatPtr(), jta_0 );
		_mm_store_ps( joints[n1].t.ToFloatPtr(), jtb_0 );
		_mm_store_ps( joints[n2].t.ToFloatPtr(), jtc_0 );
		_mm_store_ps( joints[n3].t.ToFloatPtr(), jtd_0 );
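
		// The translations were blended above with a plain lerp. Now transpose the
		// four joint quaternions and the four blend quaternions from AoS to SoA:
		// jqx/jqy/jqz/jqw ( and bqx..bqw ) each hold one component for all four joints.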
		__m128 jqr_0 = _mm_unpacklo_ps( jqa_0, jqc_0 );
		__m128 jqs_0 = _mm_unpackhi_ps( jqa_0, jqc_0 );
		__m128 jqt_0 = _mm_unpacklo_ps( jqb_0, jqd_0 );
		__m128 jqu_0 = _mm_unpackhi_ps( jqb_0, jqd_0 );

		__m128 bqr_0 = _mm_unpacklo_ps( bqa_0, bqc_0 );
		__m128 bqs_0 = _mm_unpackhi_ps( bqa_0, bqc_0 );
		__m128 bqt_0 = _mm_unpacklo_ps( bqb_0, bqd_0 );
		__m128 bqu_0 = _mm_unpackhi_ps( bqb_0, bqd_0 );

		__m128 jqx_0 = _mm_unpacklo_ps( jqr_0, jqt_0 );
		__m128 jqy_0 = _mm_unpackhi_ps( jqr_0, jqt_0 );
		__m128 jqz_0 = _mm_unpacklo_ps( jqs_0, jqu_0 );
		__m128 jqw_0 = _mm_unpackhi_ps( jqs_0, jqu_0 );

		__m128 bqx_0 = _mm_unpacklo_ps( bqr_0, bqt_0 );
		__m128 bqy_0 = _mm_unpackhi_ps( bqr_0, bqt_0 );
		__m128 bqz_0 = _mm_unpacklo_ps( bqs_0, bqu_0 );
		__m128 bqw_0 = _mm_unpackhi_ps( bqs_0, bqu_0 );
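
		// Per-lane quaternion dot product: cosom = jq . bq. Work with |cosom| and
		// keep the sign so the interpolation runs along the shortest arc.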
		__m128 cosoma_0 = _mm_mul_ps( jqx_0, bqx_0 );
		__m128 cosomb_0 = _mm_mul_ps( jqy_0, bqy_0 );
		__m128 cosomc_0 = _mm_mul_ps( jqz_0, bqz_0 );
		__m128 cosomd_0 = _mm_mul_ps( jqw_0, bqw_0 );

		__m128 cosome_0 = _mm_add_ps( cosoma_0, cosomb_0 );
		__m128 cosomf_0 = _mm_add_ps( cosomc_0, cosomd_0 );
		__m128 cosomg_0 = _mm_add_ps( cosome_0, cosomf_0 );

		__m128 sign_0 = _mm_and_ps( cosomg_0, vector_float_sign_bit );
		__m128 cosom_0 = _mm_xor_ps( cosomg_0, sign_0 );
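
		// sinom = 1 / sqrt( 1 - cosom^2 ), clamped away from zero, via _mm_rsqrt_ps
		// plus one Newton-Raphson step; ss * sinom then recovers sin( omega ).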
		__m128 ss_0 = _mm_nmsub_ps( cosom_0, cosom_0, vector_float_one );
		ss_0 = _mm_max_ps( ss_0, vector_float_tiny );
		__m128 rs_0 = _mm_rsqrt_ps( ss_0 );
		__m128 sq_0 = _mm_mul_ps( rs_0, rs_0 );
		__m128 sh_0 = _mm_mul_ps( rs_0, vector_float_rsqrt_c1 );
		__m128 sx_0 = _mm_madd_ps( ss_0, sq_0, vector_float_rsqrt_c0 );
		__m128 sinom_0 = _mm_mul_ps( sh_0, sx_0 );			// sinom = 1 / sqrt( ss )
		ss_0 = _mm_mul_ps( ss_0, sinom_0 );
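
		// omega = atan2( sin( omega ), cos( omega ) ): divide the smaller of the two
		// by the larger using a refined _mm_rcp_ps, run the odd atan polynomial on
		// the ratio, and when the operands were swapped flip the sign and add PI/2.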
		__m128 min_0 = _mm_min_ps( ss_0, cosom_0 );
		__m128 max_0 = _mm_max_ps( ss_0, cosom_0 );
		__m128 mask_0 = _mm_cmpeq_ps( min_0, cosom_0 );
		__m128 masksign_0 = _mm_and_ps( mask_0, vector_float_sign_bit );
		__m128 maskPI_0 = _mm_and_ps( mask_0, vector_float_half_pi );

		__m128 rcpa_0 = _mm_rcp_ps( max_0 );
		__m128 rcpb_0 = _mm_mul_ps( max_0, rcpa_0 );
		__m128 rcpd_0 = _mm_add_ps( rcpa_0, rcpa_0 );
		__m128 rcp_0 = _mm_nmsub_ps( rcpb_0, rcpa_0, rcpd_0 );	// 1 / y or 1 / x
		__m128 ata_0 = _mm_mul_ps( min_0, rcp_0 );				// x / y or y / x
		__m128 atb_0 = _mm_xor_ps( ata_0, masksign_0 );			// -x / y or y / x
		__m128 atc_0 = _mm_mul_ps( atb_0, atb_0 );
		__m128 atd_0 = _mm_madd_ps( atc_0, vector_float_atan_c0, vector_float_atan_c1 );

		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c2 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c3 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c4 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c5 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c6 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c7 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_one );
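
		// Split omega into ( 1 - lerp ) * omega and lerp * omega, evaluate the sin
		// polynomial for both, and form the slerp weights sin( ( 1 - lerp ) * omega )
		// / sin( omega ) and sin( lerp * omega ) / sin( omega ), restoring the
		// shortest-arc sign on the second weight.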
		__m128 omega_a_0 = _mm_madd_ps( atd_0, atb_0, maskPI_0 );
		__m128 omega_b_0 = _mm_mul_ps( vlerp, omega_a_0 );
		omega_a_0 = _mm_sub_ps( omega_a_0, omega_b_0 );

		__m128 sinsa_0 = _mm_mul_ps( omega_a_0, omega_a_0 );
		__m128 sinsb_0 = _mm_mul_ps( omega_b_0, omega_b_0 );
		__m128 sina_0 = _mm_madd_ps( sinsa_0, vector_float_sin_c0, vector_float_sin_c1 );
		__m128 sinb_0 = _mm_madd_ps( sinsb_0, vector_float_sin_c0, vector_float_sin_c1 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c2 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c2 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c3 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c3 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c4 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c4 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_one );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_one );
		sina_0 = _mm_mul_ps( sina_0, omega_a_0 );
		sinb_0 = _mm_mul_ps( sinb_0, omega_b_0 );

		__m128 scalea_0 = _mm_mul_ps( sina_0, sinom_0 );
		__m128 scaleb_0 = _mm_mul_ps( sinb_0, sinom_0 );

		scaleb_0 = _mm_xor_ps( scaleb_0, sign_0 );

		jqx_0 = _mm_mul_ps( jqx_0, scalea_0 );
		jqy_0 = _mm_mul_ps( jqy_0, scalea_0 );
		jqz_0 = _mm_mul_ps( jqz_0, scalea_0 );
		jqw_0 = _mm_mul_ps( jqw_0, scalea_0 );

		jqx_0 = _mm_madd_ps( bqx_0, scaleb_0, jqx_0 );
		jqy_0 = _mm_madd_ps( bqy_0, scaleb_0, jqy_0 );
		jqz_0 = _mm_madd_ps( bqz_0, scaleb_0, jqz_0 );
		jqw_0 = _mm_madd_ps( bqw_0, scaleb_0, jqw_0 );
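
		// Transpose the blended quaternions back from SoA to AoS and store them.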
		__m128 tp0_0 = _mm_unpacklo_ps( jqx_0, jqz_0 );
		__m128 tp1_0 = _mm_unpackhi_ps( jqx_0, jqz_0 );
		__m128 tp2_0 = _mm_unpacklo_ps( jqy_0, jqw_0 );
		__m128 tp3_0 = _mm_unpackhi_ps( jqy_0, jqw_0 );

		__m128 p0_0 = _mm_unpacklo_ps( tp0_0, tp2_0 );
		__m128 p1_0 = _mm_unpackhi_ps( tp0_0, tp2_0 );
		__m128 p2_0 = _mm_unpacklo_ps( tp1_0, tp3_0 );
		__m128 p3_0 = _mm_unpackhi_ps( tp1_0, tp3_0 );

		_mm_store_ps( joints[n0].q.ToFloatPtr(), p0_0 );
		_mm_store_ps( joints[n1].q.ToFloatPtr(), p1_0 );
		_mm_store_ps( joints[n2].q.ToFloatPtr(), p2_0 );
		_mm_store_ps( joints[n3].q.ToFloatPtr(), p3_0 );
	}
	for ( ; i < numJoints; i++ ) {
		int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
		joints[n].w = 0.0f;

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float sinom;
		float omega;
		float scale0;
		float scale1;
		unsigned long signBit;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );

		(*(unsigned long *)&cosom) ^= signBit;

		scale0 = 1.0f - cosom * cosom;
		scale0 = ( scale0 <= 0.0f ) ? 1e-10f : scale0;
		sinom = idMath::InvSqrt( scale0 );
		omega = idMath::ATan16( scale0 * sinom, cosom );
		scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
		scale1 = idMath::Sin16( lerp * omega ) * sinom;

		(*(unsigned long *)&scale1) ^= signBit;

		jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
		jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
		jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
		jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
	}
}
/*
============
idSIMD_SSE::BlendJointsFast
============
*/
void VPCALL idSIMD_SSE::BlendJointsFast( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	assert_16_byte_aligned( joints );
	assert_16_byte_aligned( blendJoints );
	assert_16_byte_aligned( JOINTQUAT_Q_OFFSET );
	assert_16_byte_aligned( JOINTQUAT_T_OFFSET );
	assert_sizeof_16_byte_multiple( idJointQuat );

	if ( lerp <= 0.0f ) {
		return;
	} else if ( lerp >= 1.0f ) {
		for ( int i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

	const __m128 vector_float_sign_bit	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0	= { -3.0f, -3.0f, -3.0f, -3.0f };
	const __m128 vector_float_rsqrt_c1	= { -0.5f, -0.5f, -0.5f, -0.5f };

	const float scaledLerp = lerp / ( 1.0f - lerp );
	const __m128 vlerp = { lerp, lerp, lerp, lerp };
	const __m128 vscaledLerp = { scaledLerp, scaledLerp, scaledLerp, scaledLerp };
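
	// This "fast" path replaces the slerp of BlendJoints with a normalized lerp:
	// q' = normalize( q + scale * b ) with scale = lerp / ( 1 - lerp ), which is
	// proportional to ( 1 - lerp ) * q + lerp * b, so after renormalization the
	// result matches a linear blend without any trigonometry.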
	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const int n0 = index[i+0];
		const int n1 = index[i+1];
		const int n2 = index[i+2];
		const int n3 = index[i+3];

		__m128 jqa_0 = _mm_load_ps( joints[n0].q.ToFloatPtr() );
		__m128 jqb_0 = _mm_load_ps( joints[n1].q.ToFloatPtr() );
		__m128 jqc_0 = _mm_load_ps( joints[n2].q.ToFloatPtr() );
		__m128 jqd_0 = _mm_load_ps( joints[n3].q.ToFloatPtr() );

		__m128 jta_0 = _mm_load_ps( joints[n0].t.ToFloatPtr() );
		__m128 jtb_0 = _mm_load_ps( joints[n1].t.ToFloatPtr() );
		__m128 jtc_0 = _mm_load_ps( joints[n2].t.ToFloatPtr() );
		__m128 jtd_0 = _mm_load_ps( joints[n3].t.ToFloatPtr() );

		__m128 bqa_0 = _mm_load_ps( blendJoints[n0].q.ToFloatPtr() );
		__m128 bqb_0 = _mm_load_ps( blendJoints[n1].q.ToFloatPtr() );
		__m128 bqc_0 = _mm_load_ps( blendJoints[n2].q.ToFloatPtr() );
		__m128 bqd_0 = _mm_load_ps( blendJoints[n3].q.ToFloatPtr() );

		__m128 bta_0 = _mm_load_ps( blendJoints[n0].t.ToFloatPtr() );
		__m128 btb_0 = _mm_load_ps( blendJoints[n1].t.ToFloatPtr() );
		__m128 btc_0 = _mm_load_ps( blendJoints[n2].t.ToFloatPtr() );
		__m128 btd_0 = _mm_load_ps( blendJoints[n3].t.ToFloatPtr() );

		bta_0 = _mm_sub_ps( bta_0, jta_0 );
		btb_0 = _mm_sub_ps( btb_0, jtb_0 );
		btc_0 = _mm_sub_ps( btc_0, jtc_0 );
		btd_0 = _mm_sub_ps( btd_0, jtd_0 );

		jta_0 = _mm_madd_ps( vlerp, bta_0, jta_0 );
		jtb_0 = _mm_madd_ps( vlerp, btb_0, jtb_0 );
		jtc_0 = _mm_madd_ps( vlerp, btc_0, jtc_0 );
		jtd_0 = _mm_madd_ps( vlerp, btd_0, jtd_0 );

		_mm_store_ps( joints[n0].t.ToFloatPtr(), jta_0 );
		_mm_store_ps( joints[n1].t.ToFloatPtr(), jtb_0 );
		_mm_store_ps( joints[n2].t.ToFloatPtr(), jtc_0 );
		_mm_store_ps( joints[n3].t.ToFloatPtr(), jtd_0 );

		__m128 jqr_0 = _mm_unpacklo_ps( jqa_0, jqc_0 );
		__m128 jqs_0 = _mm_unpackhi_ps( jqa_0, jqc_0 );
		__m128 jqt_0 = _mm_unpacklo_ps( jqb_0, jqd_0 );
		__m128 jqu_0 = _mm_unpackhi_ps( jqb_0, jqd_0 );

		__m128 bqr_0 = _mm_unpacklo_ps( bqa_0, bqc_0 );
		__m128 bqs_0 = _mm_unpackhi_ps( bqa_0, bqc_0 );
		__m128 bqt_0 = _mm_unpacklo_ps( bqb_0, bqd_0 );
		__m128 bqu_0 = _mm_unpackhi_ps( bqb_0, bqd_0 );

		__m128 jqx_0 = _mm_unpacklo_ps( jqr_0, jqt_0 );
		__m128 jqy_0 = _mm_unpackhi_ps( jqr_0, jqt_0 );
		__m128 jqz_0 = _mm_unpacklo_ps( jqs_0, jqu_0 );
		__m128 jqw_0 = _mm_unpackhi_ps( jqs_0, jqu_0 );

		__m128 bqx_0 = _mm_unpacklo_ps( bqr_0, bqt_0 );
		__m128 bqy_0 = _mm_unpackhi_ps( bqr_0, bqt_0 );
		__m128 bqz_0 = _mm_unpacklo_ps( bqs_0, bqu_0 );
		__m128 bqw_0 = _mm_unpackhi_ps( bqs_0, bqu_0 );

		__m128 cosoma_0 = _mm_mul_ps( jqx_0, bqx_0 );
		__m128 cosomb_0 = _mm_mul_ps( jqy_0, bqy_0 );
		__m128 cosomc_0 = _mm_mul_ps( jqz_0, bqz_0 );
		__m128 cosomd_0 = _mm_mul_ps( jqw_0, bqw_0 );

		__m128 cosome_0 = _mm_add_ps( cosoma_0, cosomb_0 );
		__m128 cosomf_0 = _mm_add_ps( cosomc_0, cosomd_0 );
		__m128 cosom_0 = _mm_add_ps( cosome_0, cosomf_0 );
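
		// Apply the sign of the per-lane dot product to the blend weight so the
		// lerp runs along the shorter arc, then accumulate q += scale * b.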
		__m128 sign_0 = _mm_and_ps( cosom_0, vector_float_sign_bit );
		__m128 scale_0 = _mm_xor_ps( vscaledLerp, sign_0 );

		jqx_0 = _mm_madd_ps( scale_0, bqx_0, jqx_0 );
		jqy_0 = _mm_madd_ps( scale_0, bqy_0, jqy_0 );
		jqz_0 = _mm_madd_ps( scale_0, bqz_0, jqz_0 );
		jqw_0 = _mm_madd_ps( scale_0, bqw_0, jqw_0 );

		__m128 da_0 = _mm_mul_ps( jqx_0, jqx_0 );
		__m128 db_0 = _mm_mul_ps( jqy_0, jqy_0 );
		__m128 dc_0 = _mm_mul_ps( jqz_0, jqz_0 );
		__m128 dd_0 = _mm_mul_ps( jqw_0, jqw_0 );

		__m128 de_0 = _mm_add_ps( da_0, db_0 );
		__m128 df_0 = _mm_add_ps( dc_0, dd_0 );
		__m128 d_0 = _mm_add_ps( de_0, df_0 );
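
		// Renormalize the blended quaternions: squared length, then a Newton-Raphson
		// refined reciprocal square root.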
		__m128 rs_0 = _mm_rsqrt_ps( d_0 );
		__m128 sq_0 = _mm_mul_ps( rs_0, rs_0 );
		__m128 sh_0 = _mm_mul_ps( rs_0, vector_float_rsqrt_c1 );
		__m128 sx_0 = _mm_madd_ps( d_0, sq_0, vector_float_rsqrt_c0 );
		__m128 s_0 = _mm_mul_ps( sh_0, sx_0 );

		jqx_0 = _mm_mul_ps( jqx_0, s_0 );
		jqy_0 = _mm_mul_ps( jqy_0, s_0 );
		jqz_0 = _mm_mul_ps( jqz_0, s_0 );
		jqw_0 = _mm_mul_ps( jqw_0, s_0 );

		__m128 tp0_0 = _mm_unpacklo_ps( jqx_0, jqz_0 );
		__m128 tp1_0 = _mm_unpackhi_ps( jqx_0, jqz_0 );
		__m128 tp2_0 = _mm_unpacklo_ps( jqy_0, jqw_0 );
		__m128 tp3_0 = _mm_unpackhi_ps( jqy_0, jqw_0 );

		__m128 p0_0 = _mm_unpacklo_ps( tp0_0, tp2_0 );
		__m128 p1_0 = _mm_unpackhi_ps( tp0_0, tp2_0 );
		__m128 p2_0 = _mm_unpacklo_ps( tp1_0, tp3_0 );
		__m128 p3_0 = _mm_unpackhi_ps( tp1_0, tp3_0 );

		_mm_store_ps( joints[n0].q.ToFloatPtr(), p0_0 );
		_mm_store_ps( joints[n1].q.ToFloatPtr(), p1_0 );
		_mm_store_ps( joints[n2].q.ToFloatPtr(), p2_0 );
		_mm_store_ps( joints[n3].q.ToFloatPtr(), p3_0 );
	}
	for ( ; i < numJoints; i++ ) {
		const int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float scale;
		float s;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		scale = __fsels( cosom, scaledLerp, -scaledLerp );

		jointQuat.x += scale * blendQuat.x;
		jointQuat.y += scale * blendQuat.y;
		jointQuat.z += scale * blendQuat.z;
		jointQuat.w += scale * blendQuat.w;

		s = jointQuat.x * jointQuat.x + jointQuat.y * jointQuat.y + jointQuat.z * jointQuat.z + jointQuat.w * jointQuat.w;
		s = __frsqrts( s );

		jointQuat.x *= s;
		jointQuat.y *= s;
		jointQuat.z *= s;
		jointQuat.w *= s;
	}
}
/*
============
idSIMD_SSE::ConvertJointQuatsToJointMats
============
*/
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

	const float * jointQuatPtr = (float *)jointQuats;
	float * jointMatPtr = (float *)jointMats;

	const __m128 vector_float_first_sign_bit		= __m128c( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x80000000 ) );
	const __m128 vector_float_last_three_sign_bits	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x00000000 ) );
	const __m128 vector_float_first_pos_half		= {  0.5f,  0.0f,  0.0f,  0.0f };	// +.5 0 0 0
	const __m128 vector_float_first_neg_half		= { -0.5f,  0.0f,  0.0f,  0.0f };	// -.5 0 0 0
	const __m128 vector_float_quat2mat_mad1			= { -1.0f, -1.0f, +1.0f, -1.0f };	// - - + -
	const __m128 vector_float_quat2mat_mad2			= { -1.0f, +1.0f, -1.0f, -1.0f };	// - + - -
	const __m128 vector_float_quat2mat_mad3			= { +1.0f, -1.0f, -1.0f, +1.0f };	// + - - +
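
	// The loop below converts two quaternions per iteration using the standard
	// quaternion-to-matrix identities ( m00 = 1 - yy2 - zz2, m01 = xy2 - wz2, ...,
	// where x2 = 2x etc. ); the quat2mat_mad1..3 constants encode the sign patterns
	// of those terms so each rotation row is built with a single multiply-add.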
	int i = 0;
	for ( ; i + 1 < numJoints; i += 2 ) {
		__m128 q0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+0] );
		__m128 q1 = _mm_load_ps( &jointQuatPtr[i*8+1*8+0] );

		__m128 t0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+4] );
		__m128 t1 = _mm_load_ps( &jointQuatPtr[i*8+1*8+4] );

		__m128 d0 = _mm_add_ps( q0, q0 );
		__m128 d1 = _mm_add_ps( q1, q1 );

		__m128 sa0 = _mm_perm_ps( q0, _MM_SHUFFLE( 1, 0, 0, 1 ) );	// y, x, x, y
		__m128 sb0 = _mm_perm_ps( d0, _MM_SHUFFLE( 2, 2, 1, 1 ) );	// y2, y2, z2, z2
		__m128 sc0 = _mm_perm_ps( q0, _MM_SHUFFLE( 3, 3, 3, 2 ) );	// z, w, w, w
		__m128 sd0 = _mm_perm_ps( d0, _MM_SHUFFLE( 0, 1, 2, 2 ) );	// z2, z2, y2, x2
		__m128 sa1 = _mm_perm_ps( q1, _MM_SHUFFLE( 1, 0, 0, 1 ) );	// y, x, x, y
		__m128 sb1 = _mm_perm_ps( d1, _MM_SHUFFLE( 2, 2, 1, 1 ) );	// y2, y2, z2, z2
		__m128 sc1 = _mm_perm_ps( q1, _MM_SHUFFLE( 3, 3, 3, 2 ) );	// z, w, w, w
		__m128 sd1 = _mm_perm_ps( d1, _MM_SHUFFLE( 0, 1, 2, 2 ) );	// z2, z2, y2, x2

		sa0 = _mm_xor_ps( sa0, vector_float_first_sign_bit );
		sa1 = _mm_xor_ps( sa1, vector_float_first_sign_bit );

		sc0 = _mm_xor_ps( sc0, vector_float_last_three_sign_bits );	// flip stupid inverse quaternions
		sc1 = _mm_xor_ps( sc1, vector_float_last_three_sign_bits );	// flip stupid inverse quaternions

		__m128 ma0 = _mm_add_ps( _mm_mul_ps( sa0, sb0 ), vector_float_first_pos_half );	//  .5 - yy2,  xy2,  xz2,  yz2		// .5 0 0 0
		__m128 mb0 = _mm_add_ps( _mm_mul_ps( sc0, sd0 ), vector_float_first_neg_half );	// -.5 + zz2,  wz2,  wy2,  wx2		// -.5 0 0 0
		__m128 mc0 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q0, d0 ) );		//  .5 - xx2, -yy2, -zz2, -ww2		// .5 0 0 0
		__m128 ma1 = _mm_add_ps( _mm_mul_ps( sa1, sb1 ), vector_float_first_pos_half );	//  .5 - yy2,  xy2,  xz2,  yz2		// .5 0 0 0
		__m128 mb1 = _mm_add_ps( _mm_mul_ps( sc1, sd1 ), vector_float_first_neg_half );	// -.5 + zz2,  wz2,  wy2,  wx2		// -.5 0 0 0
		__m128 mc1 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q1, d1 ) );		//  .5 - xx2, -yy2, -zz2, -ww2		// .5 0 0 0

		__m128 mf0 = _mm_shuffle_ps( ma0, mc0, _MM_SHUFFLE( 0, 0, 1, 1 ) );	// xy2, xy2, .5 - xx2, .5 - xx2		// 01, 01, 10, 10
		__m128 md0 = _mm_shuffle_ps( mf0, ma0, _MM_SHUFFLE( 3, 2, 0, 2 ) );	// .5 - xx2, xy2, xz2, yz2			// 10, 01, 02, 03
		__m128 me0 = _mm_shuffle_ps( ma0, mb0, _MM_SHUFFLE( 3, 2, 1, 0 ) );	// .5 - yy2, xy2, wy2, wx2			// 00, 01, 12, 13
		__m128 mf1 = _mm_shuffle_ps( ma1, mc1, _MM_SHUFFLE( 0, 0, 1, 1 ) );	// xy2, xy2, .5 - xx2, .5 - xx2		// 01, 01, 10, 10
		__m128 md1 = _mm_shuffle_ps( mf1, ma1, _MM_SHUFFLE( 3, 2, 0, 2 ) );	// .5 - xx2, xy2, xz2, yz2			// 10, 01, 02, 03
		__m128 me1 = _mm_shuffle_ps( ma1, mb1, _MM_SHUFFLE( 3, 2, 1, 0 ) );	// .5 - yy2, xy2, wy2, wx2			// 00, 01, 12, 13

		__m128 ra0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad1 ), ma0 );	// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,		// - - + -
		__m128 rb0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad2 ), md0 );	// 1 - xx2 - zz2, xy2 + wz2, , yz2 - wx2	// - + - -
		__m128 rc0 = _mm_add_ps( _mm_mul_ps( me0, vector_float_quat2mat_mad3 ), md0 );	// 1 - xx2 - yy2, , xz2 - wy2, yz2 + wx2	// + - - +
		__m128 ra1 = _mm_add_ps( _mm_mul_ps( mb1, vector_float_quat2mat_mad1 ), ma1 );	// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,		// - - + -
		__m128 rb1 = _mm_add_ps( _mm_mul_ps( mb1, vector_float_quat2mat_mad2 ), md1 );	// 1 - xx2 - zz2, xy2 + wz2, , yz2 - wx2	// - + - -
		__m128 rc1 = _mm_add_ps( _mm_mul_ps( me1, vector_float_quat2mat_mad3 ), md1 );	// 1 - xx2 - yy2, , xz2 - wy2, yz2 + wx2	// + - - +
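
		// Merge each rotation row with its translation component to form the
		// 3x4 joint matrices before storing.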
		__m128 ta0 = _mm_shuffle_ps( ra0, t0, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb0 = _mm_shuffle_ps( rb0, t0, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc0 = _mm_shuffle_ps( rc0, t0, _MM_SHUFFLE( 2, 2, 0, 0 ) );
		__m128 ta1 = _mm_shuffle_ps( ra1, t1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb1 = _mm_shuffle_ps( rb1, t1, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc1 = _mm_shuffle_ps( rc1, t1, _MM_SHUFFLE( 2, 2, 0, 0 ) );

		ra0 = _mm_shuffle_ps( ra0, ta0, _MM_SHUFFLE( 2, 0, 1, 0 ) );	// 00 01 02 10
		rb0 = _mm_shuffle_ps( rb0, tb0, _MM_SHUFFLE( 2, 0, 0, 1 ) );	// 01 00 03 11
		rc0 = _mm_shuffle_ps( rc0, tc0, _MM_SHUFFLE( 2, 0, 3, 2 ) );	// 02 03 00 12
		ra1 = _mm_shuffle_ps( ra1, ta1, _MM_SHUFFLE( 2, 0, 1, 0 ) );	// 00 01 02 10
		rb1 = _mm_shuffle_ps( rb1, tb1, _MM_SHUFFLE( 2, 0, 0, 1 ) );	// 01 00 03 11
		rc1 = _mm_shuffle_ps( rc1, tc1, _MM_SHUFFLE( 2, 0, 3, 2 ) );	// 02 03 00 12

		_mm_store_ps( &jointMatPtr[i*12+0*12+0], ra0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+4], rb0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+8], rc0 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+0], ra1 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+4], rb1 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+8], rc1 );
	}
	for ( ; i < numJoints; i++ ) {
		__m128 q0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+0] );
		__m128 t0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+4] );

		__m128 d0 = _mm_add_ps( q0, q0 );

		__m128 sa0 = _mm_perm_ps( q0, _MM_SHUFFLE( 1, 0, 0, 1 ) );	// y, x, x, y
		__m128 sb0 = _mm_perm_ps( d0, _MM_SHUFFLE( 2, 2, 1, 1 ) );	// y2, y2, z2, z2
		__m128 sc0 = _mm_perm_ps( q0, _MM_SHUFFLE( 3, 3, 3, 2 ) );	// z, w, w, w
		__m128 sd0 = _mm_perm_ps( d0, _MM_SHUFFLE( 0, 1, 2, 2 ) );	// z2, z2, y2, x2

		sa0 = _mm_xor_ps( sa0, vector_float_first_sign_bit );
		sc0 = _mm_xor_ps( sc0, vector_float_last_three_sign_bits );	// flip stupid inverse quaternions

		__m128 ma0 = _mm_add_ps( _mm_mul_ps( sa0, sb0 ), vector_float_first_pos_half );	//  .5 - yy2,  xy2,  xz2,  yz2		// .5 0 0 0
		__m128 mb0 = _mm_add_ps( _mm_mul_ps( sc0, sd0 ), vector_float_first_neg_half );	// -.5 + zz2,  wz2,  wy2,  wx2		// -.5 0 0 0
		__m128 mc0 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q0, d0 ) );		//  .5 - xx2, -yy2, -zz2, -ww2		// .5 0 0 0

		__m128 mf0 = _mm_shuffle_ps( ma0, mc0, _MM_SHUFFLE( 0, 0, 1, 1 ) );	// xy2, xy2, .5 - xx2, .5 - xx2		// 01, 01, 10, 10
		__m128 md0 = _mm_shuffle_ps( mf0, ma0, _MM_SHUFFLE( 3, 2, 0, 2 ) );	// .5 - xx2, xy2, xz2, yz2			// 10, 01, 02, 03
		__m128 me0 = _mm_shuffle_ps( ma0, mb0, _MM_SHUFFLE( 3, 2, 1, 0 ) );	// .5 - yy2, xy2, wy2, wx2			// 00, 01, 12, 13

		__m128 ra0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad1 ), ma0 );	// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,		// - - + -
		__m128 rb0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad2 ), md0 );	// 1 - xx2 - zz2, xy2 + wz2, , yz2 - wx2	// - + - -
		__m128 rc0 = _mm_add_ps( _mm_mul_ps( me0, vector_float_quat2mat_mad3 ), md0 );	// 1 - xx2 - yy2, , xz2 - wy2, yz2 + wx2	// + - - +

		__m128 ta0 = _mm_shuffle_ps( ra0, t0, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb0 = _mm_shuffle_ps( rb0, t0, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc0 = _mm_shuffle_ps( rc0, t0, _MM_SHUFFLE( 2, 2, 0, 0 ) );

		ra0 = _mm_shuffle_ps( ra0, ta0, _MM_SHUFFLE( 2, 0, 1, 0 ) );	// 00 01 02 10
		rb0 = _mm_shuffle_ps( rb0, tb0, _MM_SHUFFLE( 2, 0, 0, 1 ) );	// 01 00 03 11
		rc0 = _mm_shuffle_ps( rc0, tc0, _MM_SHUFFLE( 2, 0, 3, 2 ) );	// 02 03 00 12

		_mm_store_ps( &jointMatPtr[i*12+0*12+0], ra0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+4], rb0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+8], rc0 );
	}
}
/*
============
idSIMD_SSE::ConvertJointMatsToJointQuats
============
*/
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

	const __m128 vector_float_zero		= _mm_setzero_ps();
	const __m128 vector_float_one		= { 1.0f, 1.0f, 1.0f, 1.0f };
	const __m128 vector_float_not		= __m128c( _mm_set_epi32( -1, -1, -1, -1 ) );
	const __m128 vector_float_sign_bit	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0	= { -3.0f, -3.0f, -3.0f, -3.0f };
	const __m128 vector_float_rsqrt_c2	= { -0.25f, -0.25f, -0.25f, -0.25f };
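
	// Branchless matrix-to-quaternion conversion: per lane, compare masks select
	// which of the four variants of the algorithm applies ( positive trace, or
	// m00 / m11 / m22 as the dominant diagonal element ), mirroring the branchy
	// scalar tail at the end of this function.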
	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const float *__restrict m = (float *)&jointMats[i];
		float *__restrict q = (float *)&jointQuats[i];

		__m128 ma0 = _mm_load_ps( &m[0*12+0] );
		__m128 ma1 = _mm_load_ps( &m[0*12+4] );
		__m128 ma2 = _mm_load_ps( &m[0*12+8] );

		__m128 mb0 = _mm_load_ps( &m[1*12+0] );
		__m128 mb1 = _mm_load_ps( &m[1*12+4] );
		__m128 mb2 = _mm_load_ps( &m[1*12+8] );

		__m128 mc0 = _mm_load_ps( &m[2*12+0] );
		__m128 mc1 = _mm_load_ps( &m[2*12+4] );
		__m128 mc2 = _mm_load_ps( &m[2*12+8] );

		__m128 md0 = _mm_load_ps( &m[3*12+0] );
		__m128 md1 = _mm_load_ps( &m[3*12+4] );
		__m128 md2 = _mm_load_ps( &m[3*12+8] );

		__m128 ta0 = _mm_unpacklo_ps( ma0, mc0 );	// a0, c0, a1, c1
		__m128 ta1 = _mm_unpackhi_ps( ma0, mc0 );	// a2, c2, a3, c3
		__m128 ta2 = _mm_unpacklo_ps( mb0, md0 );	// b0, d0, b1, b2
		__m128 ta3 = _mm_unpackhi_ps( mb0, md0 );	// b2, d2, b3, d3

		__m128 tb0 = _mm_unpacklo_ps( ma1, mc1 );	// a0, c0, a1, c1
		__m128 tb1 = _mm_unpackhi_ps( ma1, mc1 );	// a2, c2, a3, c3
		__m128 tb2 = _mm_unpacklo_ps( mb1, md1 );	// b0, d0, b1, b2
		__m128 tb3 = _mm_unpackhi_ps( mb1, md1 );	// b2, d2, b3, d3

		__m128 tc0 = _mm_unpacklo_ps( ma2, mc2 );	// a0, c0, a1, c1
		__m128 tc1 = _mm_unpackhi_ps( ma2, mc2 );	// a2, c2, a3, c3
		__m128 tc2 = _mm_unpacklo_ps( mb2, md2 );	// b0, d0, b1, b2
		__m128 tc3 = _mm_unpackhi_ps( mb2, md2 );	// b2, d2, b3, d3

		__m128 m00 = _mm_unpacklo_ps( ta0, ta2 );
		__m128 m01 = _mm_unpackhi_ps( ta0, ta2 );
		__m128 m02 = _mm_unpacklo_ps( ta1, ta3 );
		__m128 m03 = _mm_unpackhi_ps( ta1, ta3 );

		__m128 m10 = _mm_unpacklo_ps( tb0, tb2 );
		__m128 m11 = _mm_unpackhi_ps( tb0, tb2 );
		__m128 m12 = _mm_unpacklo_ps( tb1, tb3 );
		__m128 m13 = _mm_unpackhi_ps( tb1, tb3 );

		__m128 m20 = _mm_unpacklo_ps( tc0, tc2 );
		__m128 m21 = _mm_unpackhi_ps( tc0, tc2 );
		__m128 m22 = _mm_unpacklo_ps( tc1, tc3 );
		__m128 m23 = _mm_unpackhi_ps( tc1, tc3 );
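
		// b0: trace > 0; b1: m00 is the largest diagonal element; b2: m11 > m22.
		// m0..m3 become the mutually exclusive masks for the four cases, and
		// i0..i2 select the per-case sign flips applied to the matrix terms below.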
		__m128 b00 = _mm_add_ps( m00, m11 );
		__m128 b11 = _mm_cmpgt_ps( m00, m22 );
		__m128 b01 = _mm_add_ps( b00, m22 );
		__m128 b10 = _mm_cmpgt_ps( m00, m11 );
		__m128 b0 = _mm_cmpgt_ps( b01, vector_float_zero );
		__m128 b1 = _mm_and_ps( b10, b11 );
		__m128 b2 = _mm_cmpgt_ps( m11, m22 );

		__m128 m0 = b0;
		__m128 m1 = _mm_and_ps( _mm_xor_ps( b0, vector_float_not ), b1 );
		__m128 p1 = _mm_or_ps( b0, b1 );
		__m128 p2 = _mm_or_ps( p1, b2 );
		__m128 m2 = _mm_and_ps( _mm_xor_ps( p1, vector_float_not ), b2 );
		__m128 m3 = _mm_xor_ps( p2, vector_float_not );

		__m128 i0 = _mm_or_ps( m2, m3 );
		__m128 i1 = _mm_or_ps( m1, m3 );
		__m128 i2 = _mm_or_ps( m1, m2 );

		__m128 s0 = _mm_and_ps( i0, vector_float_sign_bit );
		__m128 s1 = _mm_and_ps( i1, vector_float_sign_bit );
		__m128 s2 = _mm_and_ps( i2, vector_float_sign_bit );

		m00 = _mm_xor_ps( m00, s0 );
		m11 = _mm_xor_ps( m11, s1 );
		m22 = _mm_xor_ps( m22, s2 );
		m21 = _mm_xor_ps( m21, s0 );
		m02 = _mm_xor_ps( m02, s1 );
		m10 = _mm_xor_ps( m10, s2 );

		__m128 t0 = _mm_add_ps( m00, m11 );
		__m128 t1 = _mm_add_ps( m22, vector_float_one );
		__m128 q0 = _mm_add_ps( t0, t1 );
		__m128 q1 = _mm_sub_ps( m01, m10 );
		__m128 q2 = _mm_sub_ps( m20, m02 );
		__m128 q3 = _mm_sub_ps( m12, m21 );
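
		// s = 0.5 / sqrt( q0 ): the -0.25 in rsqrt_c2 folds the usual 0.5 of the
		// Newton-Raphson rsqrt step together with the extra 0.5 of the quaternion
		// formula, so that q0 * s = 0.5 * sqrt( q0 ).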
		__m128 rs = _mm_rsqrt_ps( q0 );
		__m128 sq = _mm_mul_ps( rs, rs );
		__m128 sh = _mm_mul_ps( rs, vector_float_rsqrt_c2 );
		__m128 sx = _mm_madd_ps( q0, sq, vector_float_rsqrt_c0 );
		__m128 s = _mm_mul_ps( sh, sx );

		q0 = _mm_mul_ps( q0, s );
		q1 = _mm_mul_ps( q1, s );
		q2 = _mm_mul_ps( q2, s );
		q3 = _mm_mul_ps( q3, s );
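
		// Reorder ( q0, q1, q2, q3 ) per lane according to the case masks; these
		// selects are the branchless counterpart of the "reverse" and "rotate 2"
		// swaps in the scalar tail.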
		m0 = _mm_or_ps( m0, m2 );
		m2 = _mm_or_ps( m2, m3 );

		__m128 fq0 = _mm_sel_ps( q0, q3, m0 );
		__m128 fq1 = _mm_sel_ps( q1, q2, m0 );
		__m128 fq2 = _mm_sel_ps( q2, q1, m0 );
		__m128 fq3 = _mm_sel_ps( q3, q0, m0 );

		__m128 rq0 = _mm_sel_ps( fq0, fq2, m2 );
		__m128 rq1 = _mm_sel_ps( fq1, fq3, m2 );
		__m128 rq2 = _mm_sel_ps( fq2, fq0, m2 );
		__m128 rq3 = _mm_sel_ps( fq3, fq1, m2 );

		__m128 tq0 = _mm_unpacklo_ps( rq0, rq2 );
		__m128 tq1 = _mm_unpackhi_ps( rq0, rq2 );
		__m128 tq2 = _mm_unpacklo_ps( rq1, rq3 );
		__m128 tq3 = _mm_unpackhi_ps( rq1, rq3 );

		__m128 sq0 = _mm_unpacklo_ps( tq0, tq2 );
		__m128 sq1 = _mm_unpackhi_ps( tq0, tq2 );
		__m128 sq2 = _mm_unpacklo_ps( tq1, tq3 );
		__m128 sq3 = _mm_unpackhi_ps( tq1, tq3 );

		__m128 tt0 = _mm_unpacklo_ps( m03, m23 );
		__m128 tt1 = _mm_unpackhi_ps( m03, m23 );
		__m128 tt2 = _mm_unpacklo_ps( m13, vector_float_zero );
		__m128 tt3 = _mm_unpackhi_ps( m13, vector_float_zero );

		__m128 st0 = _mm_unpacklo_ps( tt0, tt2 );
		__m128 st1 = _mm_unpackhi_ps( tt0, tt2 );
		__m128 st2 = _mm_unpacklo_ps( tt1, tt3 );
		__m128 st3 = _mm_unpackhi_ps( tt1, tt3 );

		_mm_store_ps( &q[0*4], sq0 );
		_mm_store_ps( &q[1*4], st0 );
		_mm_store_ps( &q[2*4], sq1 );
		_mm_store_ps( &q[3*4], st1 );
		_mm_store_ps( &q[4*4], sq2 );
		_mm_store_ps( &q[5*4], st2 );
		_mm_store_ps( &q[6*4], sq3 );
		_mm_store_ps( &q[7*4], st3 );
	}
	float sign[2] = { 1.0f, -1.0f };
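
	// Scalar tail: the boolean case flags index into sign[] to pick +1.0f / -1.0f
	// without branching, and the two SwapValues blocks reorder the quaternion
	// components for the cases where the trace is not dominant.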
	for ( ; i < numJoints; i++ ) {
		const float *__restrict m = (float *)&jointMats[i];
		float *__restrict q = (float *)&jointQuats[i];

		int b0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f;
		int b1 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2];
		int b2 = m[1 * 4 + 1] > m[2 * 4 + 2];

		int m0 = b0;
		int m1 = ( !b0 ) & b1;
		int m2 = ( !( b0 | b1 ) ) & b2;
		int m3 = !( b0 | b1 | b2 );

		int i0 = ( m2 | m3 );
		int i1 = ( m1 | m3 );
		int i2 = ( m1 | m2 );

		float s0 = sign[i0];
		float s1 = sign[i1];
		float s2 = sign[i2];

		float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
		float s = __frsqrts( t );
		s = ( t * s * s + -3.0f ) * ( s * -0.25f );

		q[0] = t * s;
		q[1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
		q[2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
		q[3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		if ( m0 | m2 ) {
			// reverse
			SwapValues( q[0], q[3] );
			SwapValues( q[1], q[2] );
		}
		if ( m2 | m3 ) {
			// rotate 2
			SwapValues( q[0], q[2] );
			SwapValues( q[1], q[3] );
		}

		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
		q[7] = 0.0f;
	}
}
/*
============
idSIMD_SSE::TransformJoints
============
*/
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	const __m128 vector_float_mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );

	const float *__restrict firstMatrix = jointMats->ToFloatPtr() + ( firstJoint + firstJoint + firstJoint - 3 ) * 4;
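
	// Preload the matrix of joint firstJoint - 1 ( matrices are 12 floats apart ).
	// In the common case parents[joint] == joint - 1, so the previous iteration's
	// result stays in registers and is only reloaded when the parent chain breaks.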
	__m128 pma = _mm_load_ps( firstMatrix + 0 );
	__m128 pmb = _mm_load_ps( firstMatrix + 4 );
	__m128 pmc = _mm_load_ps( firstMatrix + 8 );

	for ( int joint = firstJoint; joint <= lastJoint; joint++ ) {
		const int parent = parents[joint];
		const float *__restrict parentMatrix = jointMats->ToFloatPtr() + ( parent + parent + parent ) * 4;
		float *__restrict childMatrix = jointMats->ToFloatPtr() + ( joint + joint + joint ) * 4;

		if ( parent != joint - 1 ) {
			pma = _mm_load_ps( parentMatrix + 0 );
			pmb = _mm_load_ps( parentMatrix + 4 );
			pmc = _mm_load_ps( parentMatrix + 8 );
		}

		__m128 cma = _mm_load_ps( childMatrix + 0 );
		__m128 cmb = _mm_load_ps( childMatrix + 4 );
		__m128 cmc = _mm_load_ps( childMatrix + 8 );

		__m128 ta = _mm_splat_ps( pma, 0 );
		__m128 tb = _mm_splat_ps( pmb, 0 );
		__m128 tc = _mm_splat_ps( pmc, 0 );

		__m128 td = _mm_splat_ps( pma, 1 );
		__m128 te = _mm_splat_ps( pmb, 1 );
		__m128 tf = _mm_splat_ps( pmc, 1 );

		__m128 tg = _mm_splat_ps( pma, 2 );
		__m128 th = _mm_splat_ps( pmb, 2 );
		__m128 ti = _mm_splat_ps( pmc, 2 );
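
		// child = parent * child for the 3x4 matrices: the mask keeps the parent
		// translation in the last column as the additive term of the first
		// multiply-add, and the splatted rotation entries accumulate the rest.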
		pma = _mm_madd_ps( ta, cma, _mm_and_ps( pma, vector_float_mask_keep_last ) );
		pmb = _mm_madd_ps( tb, cma, _mm_and_ps( pmb, vector_float_mask_keep_last ) );
		pmc = _mm_madd_ps( tc, cma, _mm_and_ps( pmc, vector_float_mask_keep_last ) );

		pma = _mm_madd_ps( td, cmb, pma );
		pmb = _mm_madd_ps( te, cmb, pmb );
		pmc = _mm_madd_ps( tf, cmb, pmc );

		pma = _mm_madd_ps( tg, cmc, pma );
		pmb = _mm_madd_ps( th, cmc, pmb );
		pmc = _mm_madd_ps( ti, cmc, pmc );

		_mm_store_ps( childMatrix + 0, pma );
		_mm_store_ps( childMatrix + 4, pmb );
		_mm_store_ps( childMatrix + 8, pmc );
	}
}
/*
============
idSIMD_SSE::UntransformJoints
============
*/
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	const __m128 vector_float_mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
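
	// Walk the joints in reverse, converting each child back into its parent's
	// local space: subtract the parent translation, then multiply by the transpose
	// ( the inverse ) of the parent rotation. Note that the splats read pma/pmb/pmc
	// column-wise here, whereas TransformJoints reads them row-wise.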
	for ( int joint = lastJoint; joint >= firstJoint; joint-- ) {
		assert( parents[joint] < joint );
		const int parent = parents[joint];
		const float *__restrict parentMatrix = jointMats->ToFloatPtr() + ( parent + parent + parent ) * 4;
		float *__restrict childMatrix = jointMats->ToFloatPtr() + ( joint + joint + joint ) * 4;

		__m128 pma = _mm_load_ps( parentMatrix + 0 );
		__m128 pmb = _mm_load_ps( parentMatrix + 4 );
		__m128 pmc = _mm_load_ps( parentMatrix + 8 );

		__m128 cma = _mm_load_ps( childMatrix + 0 );
		__m128 cmb = _mm_load_ps( childMatrix + 4 );
		__m128 cmc = _mm_load_ps( childMatrix + 8 );

		__m128 ta = _mm_splat_ps( pma, 0 );
		__m128 tb = _mm_splat_ps( pma, 1 );
		__m128 tc = _mm_splat_ps( pma, 2 );

		__m128 td = _mm_splat_ps( pmb, 0 );
		__m128 te = _mm_splat_ps( pmb, 1 );
		__m128 tf = _mm_splat_ps( pmb, 2 );

		__m128 tg = _mm_splat_ps( pmc, 0 );
		__m128 th = _mm_splat_ps( pmc, 1 );
		__m128 ti = _mm_splat_ps( pmc, 2 );

		cma = _mm_sub_ps( cma, _mm_and_ps( pma, vector_float_mask_keep_last ) );
		cmb = _mm_sub_ps( cmb, _mm_and_ps( pmb, vector_float_mask_keep_last ) );
		cmc = _mm_sub_ps( cmc, _mm_and_ps( pmc, vector_float_mask_keep_last ) );

		pma = _mm_mul_ps( ta, cma );
		pmb = _mm_mul_ps( tb, cma );
		pmc = _mm_mul_ps( tc, cma );

		pma = _mm_madd_ps( td, cmb, pma );
		pmb = _mm_madd_ps( te, cmb, pmb );
		pmc = _mm_madd_ps( tf, cmb, pmc );

		pma = _mm_madd_ps( tg, cmc, pma );
		pmb = _mm_madd_ps( th, cmc, pmb );
		pmc = _mm_madd_ps( ti, cmc, pmc );

		_mm_store_ps( childMatrix + 0, pma );
		_mm_store_ps( childMatrix + 4, pmb );
		_mm_store_ps( childMatrix + 8, pmc );
	}
}