DrawVert_intrinsics.h 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. /*
  2. ===========================================================================
  3. Doom 3 BFG Edition GPL Source Code
  4. Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
  6. Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #ifndef __DRAWVERT_INTRINSICS_H__
  21. #define __DRAWVERT_INTRINSICS_H__
  22. #ifdef ID_WIN_X86_SSE2_INTRIN
  23. static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
  24. static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
  25. static const __m128i vector_int_f32_mantissa_mask = _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );
  26. static const __m128i vector_int_f16_min_exponent = _mm_set1_epi32( 0 );
  27. static const __m128i vector_int_f16_max_exponent = _mm_set1_epi32( ( 30 << IEEE_FLT16_MANTISSA_BITS ) );
  28. static const __m128i vector_int_f16_min_mantissa = _mm_set1_epi32( 0 );
  29. static const __m128i vector_int_f16_max_mantissa = _mm_set1_epi32( ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 ) );
  30. static const __m128i vector_int_f32_to_f16_exponent_bias = _mm_set1_epi32( ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS );
  31. static const int f32_to_f16_sign_shift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
  32. static const int f32_to_f16_exponent_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
  33. static const int f32_to_f16_mantissa_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
  34. static const __m128i vector_int_zero = _mm_setzero_si128();
  35. static const __m128i vector_int_one = _mm_set_epi32( 1, 1, 1, 1 );
  36. static const __m128 vector_float_mask_clear_last = __m128c( _mm_set_epi32( 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ) );
  37. static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f };
  38. static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
  39. static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
  40. #endif
  41. /*
  42. ====================
  43. FastF32toF16
  44. ====================
  45. */
  46. #ifdef ID_WIN_X86_SSE2_INTRIN
  47. ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
  48. __m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
  49. __m128i f16_exponent = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_exponent_mask ), f32_to_f16_exponent_shift );
  50. __m128i f16_mantissa = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_mantissa_mask ), f32_to_f16_mantissa_shift );
  51. f16_exponent = _mm_sub_epi32( f16_exponent, vector_int_f32_to_f16_exponent_bias );
  52. const __m128i underflow = _mm_cmplt_epi32( f16_exponent, vector_int_f16_min_exponent );
  53. const __m128i overflow = _mm_cmpgt_epi32( f16_exponent, vector_int_f16_max_exponent );
  54. f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_min_exponent, underflow );
  55. f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_max_exponent, overflow );
  56. f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_min_mantissa, underflow );
  57. f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_max_mantissa, overflow );
  58. __m128i flt16 = _mm_or_si128( _mm_or_si128( f16_sign, f16_exponent ), f16_mantissa );
  59. return _mm_packs_epi32( flt16, flt16 );
  60. }
  61. #endif
  62. ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
  63. const int f32_sign_mask = 1U << IEEE_FLT_SIGN_BIT;
  64. const int f32_exponent_mask = ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS;
  65. const int f32_mantissa_mask = ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1;
  66. const int f16_min_exponent = 0;
  67. const int f16_max_exponent = ( 30 << IEEE_FLT16_MANTISSA_BITS );
  68. const int f16_min_mantissa = 0;
  69. const int f16_max_mantissa = ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 );
  70. const int f32_to_f16_sign_shift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
  71. const int f32_to_f16_exponent_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
  72. const int f32_to_f16_mantissa_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
  73. const int f32_to_f16_exponent_bias = ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS;
  74. int f32_bits = *(unsigned int *)&f32;
  75. int f16_sign = ( (unsigned int )( f32_bits & f32_sign_mask ) >> f32_to_f16_sign_shift );
  76. int f16_exponent = ( (unsigned int )( f32_bits & f32_exponent_mask ) >> f32_to_f16_exponent_shift );
  77. int f16_mantissa = ( (unsigned int )( f32_bits & f32_mantissa_mask ) >> f32_to_f16_mantissa_shift );
  78. f16_exponent -= f32_to_f16_exponent_bias;
  79. const bool underflow = ( f16_exponent < f16_min_exponent );
  80. const bool overflow = ( f16_exponent > f16_max_exponent );
  81. f16_exponent = underflow ? f16_min_exponent : f16_exponent;
  82. f16_exponent = overflow ? f16_max_exponent : f16_exponent;
  83. f16_mantissa = underflow ? f16_min_mantissa : f16_mantissa;
  84. f16_mantissa = overflow ? f16_max_mantissa : f16_mantissa;
  85. return (halfFloat_t)( f16_sign | f16_exponent | f16_mantissa );
  86. }
  87. /*
  88. ====================
  89. LoadSkinnedDrawVertPosition
  90. ====================
  91. */
  92. #ifdef ID_WIN_X86_SSE2_INTRIN
  93. ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) {
  94. const idJointMat & j0 = joints[base.color[0]];
  95. const idJointMat & j1 = joints[base.color[1]];
  96. const idJointMat & j2 = joints[base.color[2]];
  97. const idJointMat & j3 = joints[base.color[3]];
  98. __m128i weights_b = _mm_cvtsi32_si128( *(const unsigned int *)base.color2 );
  99. __m128i weights_s = _mm_unpacklo_epi8( weights_b, vector_int_zero );
  100. __m128i weights_i = _mm_unpacklo_epi16( weights_s, vector_int_zero );
  101. __m128 weights = _mm_cvtepi32_ps( weights_i );
  102. weights = _mm_mul_ps( weights, vector_float_1_over_255 );
  103. __m128 w0 = _mm_splat_ps( weights, 0 );
  104. __m128 w1 = _mm_splat_ps( weights, 1 );
  105. __m128 w2 = _mm_splat_ps( weights, 2 );
  106. __m128 w3 = _mm_splat_ps( weights, 3 );
  107. __m128 matX = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 0 * 4 ), w0 );
  108. __m128 matY = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 1 * 4 ), w0 );
  109. __m128 matZ = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 2 * 4 ), w0 );
  110. matX = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 0 * 4 ), w1, matX );
  111. matY = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 1 * 4 ), w1, matY );
  112. matZ = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 2 * 4 ), w1, matZ );
  113. matX = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 0 * 4 ), w2, matX );
  114. matY = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 1 * 4 ), w2, matY );
  115. matZ = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 2 * 4 ), w2, matZ );
  116. matX = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 0 * 4 ), w3, matX );
  117. matY = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 1 * 4 ), w3, matY );
  118. matZ = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 2 * 4 ), w3, matZ );
  119. __m128 v = _mm_load_ps( base.xyz.ToFloatPtr() );
  120. v = _mm_and_ps( v, vector_float_mask_clear_last );
  121. v = _mm_or_ps( v, vector_float_last_one );
  122. __m128 t0 = _mm_mul_ps( matX, v );
  123. __m128 t1 = _mm_mul_ps( matY, v );
  124. __m128 t2 = _mm_mul_ps( matZ, v );
  125. __m128 t3 = vector_float_1_over_4;
  126. __m128 s0 = _mm_unpacklo_ps( t0, t2 ); // x0, z0, x1, z1
  127. __m128 s1 = _mm_unpackhi_ps( t0, t2 ); // x2, z2, x3, z3
  128. __m128 s2 = _mm_unpacklo_ps( t1, t3 ); // y0, w0, y1, w1
  129. __m128 s3 = _mm_unpackhi_ps( t1, t3 ); // y2, w2, y3, w3
  130. __m128 r0 = _mm_unpacklo_ps( s0, s2 ); // x0, y0, z0, w0
  131. __m128 r1 = _mm_unpackhi_ps( s0, s2 ); // x1, y1, z1, w1
  132. __m128 r2 = _mm_unpacklo_ps( s1, s3 ); // x2, y2, z2, w2
  133. __m128 r3 = _mm_unpackhi_ps( s1, s3 ); // x3, y3, z3, w3
  134. r0 = _mm_add_ps( r0, r1 );
  135. r2 = _mm_add_ps( r2, r3 );
  136. r0 = _mm_add_ps( r0, r2 );
  137. return r0;
  138. }
  139. #endif
  140. ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) {
  141. const idJointMat & j0 = joints[vert.color[0]];
  142. const idJointMat & j1 = joints[vert.color[1]];
  143. const idJointMat & j2 = joints[vert.color[2]];
  144. const idJointMat & j3 = joints[vert.color[3]];
  145. const float w0 = vert.color2[0] * ( 1.0f / 255.0f );
  146. const float w1 = vert.color2[1] * ( 1.0f / 255.0f );
  147. const float w2 = vert.color2[2] * ( 1.0f / 255.0f );
  148. const float w3 = vert.color2[3] * ( 1.0f / 255.0f );
  149. idJointMat accum;
  150. idJointMat::Mul( accum, j0, w0 );
  151. idJointMat::Mad( accum, j1, w1 );
  152. idJointMat::Mad( accum, j2, w2 );
  153. idJointMat::Mad( accum, j3, w3 );
  154. return accum * idVec4( vert.xyz.x, vert.xyz.y, vert.xyz.z, 1.0f );
  155. }
  156. #endif /* !__DRAWVERT_INTRINSICS_H__ */