123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207 |
- /*
- ===========================================================================
- Doom 3 BFG Edition GPL Source Code
- Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
- Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #ifndef __DRAWVERT_INTRINSICS_H__
- #define __DRAWVERT_INTRINSICS_H__
#ifdef ID_WIN_X86_SSE2_INTRIN

// Masks that isolate the sign, exponent and mantissa fields of a 32-bit float, replicated in all four lanes.
static const __m128i vector_int_f32_sign_mask				= _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
static const __m128i vector_int_f32_exponent_mask			= _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
static const __m128i vector_int_f32_mantissa_mask			= _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );

// Lower and upper limits used by FastF32toF16 when clamping the half-float exponent and mantissa fields.
static const __m128i vector_int_f16_min_exponent			= _mm_setzero_si128();
static const __m128i vector_int_f16_max_exponent			= _mm_set1_epi32( 30 << IEEE_FLT16_MANTISSA_BITS );
static const __m128i vector_int_f16_min_mantissa			= _mm_setzero_si128();
static const __m128i vector_int_f16_max_mantissa			= _mm_set1_epi32( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 );

// Difference between the 32-bit and 16-bit exponent biases, positioned at the half-float exponent field.
static const __m128i vector_int_f32_to_f16_exponent_bias	= _mm_set1_epi32( ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS );

// Right-shift counts that move each 32-bit float field down to its half-float position.
static const int f32_to_f16_sign_shift						= IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
static const int f32_to_f16_exponent_shift					= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
static const int f32_to_f16_mantissa_shift					= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;

// General purpose integer vectors.
static const __m128i vector_int_zero						= _mm_setzero_si128();
static const __m128i vector_int_one							= _mm_set1_epi32( 1 );

// AND-mask that keeps the first three lanes and clears the fourth, and the vector ( 0, 0, 0, 1 ) to OR in afterwards.
static const __m128 vector_float_mask_clear_last			= __m128c( _mm_set_epi32( 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ) );
static const __m128 vector_float_last_one					= { 0.0f, 0.0f, 0.0f, 1.0f };

// Scalar factors replicated in all four lanes.
static const __m128 vector_float_1_over_255					= { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
static const __m128 vector_float_1_over_4					= { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };

#endif
/*
====================
FastF32toF16

Converts four 32-bit floats, passed as raw bit patterns, to 16-bit half floats.

The sign bit is kept, the exponent is re-biased and the mantissa is truncated to
the half-float width (no rounding). A lane whose re-biased exponent is negative is
flushed to a signed zero; a lane whose exponent is above 30 is clamped to
exponent 30 with an all-ones mantissa.

The four 16-bit results are returned twice: once in the low 64 bits and once in
the high 64 bits of the returned register.
====================
*/
#ifdef ID_WIN_X86_SSE2_INTRIN
ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
	// move each field of the 32-bit encoding down to its position in the 16-bit encoding
	__m128i f16_sign     = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask     ), f32_to_f16_sign_shift );
	__m128i f16_exponent = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_exponent_mask ), f32_to_f16_exponent_shift );
	__m128i f16_mantissa = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_mantissa_mask ), f32_to_f16_mantissa_shift );

	f16_exponent = _mm_sub_epi32( f16_exponent, vector_int_f32_to_f16_exponent_bias );

	// both masks are taken before any clamping so they describe the unclamped exponent
	const __m128i underflow = _mm_cmplt_epi32( f16_exponent, vector_int_f16_min_exponent );
	const __m128i overflow  = _mm_cmpgt_epi32( f16_exponent, vector_int_f16_max_exponent );

	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_min_exponent, underflow );
	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_max_exponent, overflow  );
	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_min_mantissa, underflow );
	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_max_mantissa, overflow  );

	__m128i flt16 = _mm_or_si128( _mm_or_si128( f16_sign, f16_exponent ), f16_mantissa );

	// _mm_packs_epi32 saturates to the signed 16-bit range, so a lane with bit 15 set
	// (every negative half float) would be read as a value above 32767 and collapse to 0x7FFF.
	// Sign-extend bit 15 through the upper half of each lane so the pack keeps the low 16 bits as they are.
	flt16 = _mm_srai_epi32( _mm_slli_epi32( flt16, 16 ), 16 );

	return _mm_packs_epi32( flt16, flt16 );
}
#endif
- ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
- const int f32_sign_mask = 1U << IEEE_FLT_SIGN_BIT;
- const int f32_exponent_mask = ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS;
- const int f32_mantissa_mask = ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1;
- const int f16_min_exponent = 0;
- const int f16_max_exponent = ( 30 << IEEE_FLT16_MANTISSA_BITS );
- const int f16_min_mantissa = 0;
- const int f16_max_mantissa = ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 );
- const int f32_to_f16_sign_shift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
- const int f32_to_f16_exponent_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
- const int f32_to_f16_mantissa_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
- const int f32_to_f16_exponent_bias = ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS;
- int f32_bits = *(unsigned int *)&f32;
- int f16_sign = ( (unsigned int )( f32_bits & f32_sign_mask ) >> f32_to_f16_sign_shift );
- int f16_exponent = ( (unsigned int )( f32_bits & f32_exponent_mask ) >> f32_to_f16_exponent_shift );
- int f16_mantissa = ( (unsigned int )( f32_bits & f32_mantissa_mask ) >> f32_to_f16_mantissa_shift );
- f16_exponent -= f32_to_f16_exponent_bias;
- const bool underflow = ( f16_exponent < f16_min_exponent );
- const bool overflow = ( f16_exponent > f16_max_exponent );
- f16_exponent = underflow ? f16_min_exponent : f16_exponent;
- f16_exponent = overflow ? f16_max_exponent : f16_exponent;
- f16_mantissa = underflow ? f16_min_mantissa : f16_mantissa;
- f16_mantissa = overflow ? f16_max_mantissa : f16_mantissa;
- return (halfFloat_t)( f16_sign | f16_exponent | f16_mantissa );
- }
/*
====================
LoadSkinnedDrawVertPosition

Returns the position of 'base' transformed by a weighted blend of the four joint
matrices it references. base.color[0..3] index into 'joints' and base.color2[0..3]
hold the matching weights as bytes, which are scaled by 1/255.

The first three lanes of the result hold the skinned x, y, z. The fourth lane is
the sum of four 0.25 terms and therefore comes out as 1.0.

All loads of the joint data and of base.xyz are aligned 16-byte loads.
====================
*/
#ifdef ID_WIN_X86_SSE2_INTRIN
ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) {
	const float * joint0 = joints[base.color[0]].ToFloatPtr();
	const float * joint1 = joints[base.color[1]].ToFloatPtr();
	const float * joint2 = joints[base.color[2]].ToFloatPtr();
	const float * joint3 = joints[base.color[3]].ToFloatPtr();

	// widen the four byte weights to 32-bit integers, convert to float and scale by 1/255
	const __m128i weightBytes  = _mm_cvtsi32_si128( *(const unsigned int *)base.color2 );
	const __m128i weightWords  = _mm_unpacklo_epi8( weightBytes, vector_int_zero );
	const __m128i weightDwords = _mm_unpacklo_epi16( weightWords, vector_int_zero );
	__m128 weights = _mm_mul_ps( _mm_cvtepi32_ps( weightDwords ), vector_float_1_over_255 );

	// broadcast each weight to all four lanes
	__m128 weight0 = _mm_splat_ps( weights, 0 );
	__m128 weight1 = _mm_splat_ps( weights, 1 );
	__m128 weight2 = _mm_splat_ps( weights, 2 );
	__m128 weight3 = _mm_splat_ps( weights, 3 );

	// accumulate each group of four floats across the joints, always in joint order 0..3
	__m128 rowX = _mm_mul_ps( _mm_load_ps( joint0 + 0 * 4 ), weight0 );
	rowX = _mm_madd_ps( _mm_load_ps( joint1 + 0 * 4 ), weight1, rowX );
	rowX = _mm_madd_ps( _mm_load_ps( joint2 + 0 * 4 ), weight2, rowX );
	rowX = _mm_madd_ps( _mm_load_ps( joint3 + 0 * 4 ), weight3, rowX );

	__m128 rowY = _mm_mul_ps( _mm_load_ps( joint0 + 1 * 4 ), weight0 );
	rowY = _mm_madd_ps( _mm_load_ps( joint1 + 1 * 4 ), weight1, rowY );
	rowY = _mm_madd_ps( _mm_load_ps( joint2 + 1 * 4 ), weight2, rowY );
	rowY = _mm_madd_ps( _mm_load_ps( joint3 + 1 * 4 ), weight3, rowY );

	__m128 rowZ = _mm_mul_ps( _mm_load_ps( joint0 + 2 * 4 ), weight0 );
	rowZ = _mm_madd_ps( _mm_load_ps( joint1 + 2 * 4 ), weight1, rowZ );
	rowZ = _mm_madd_ps( _mm_load_ps( joint2 + 2 * 4 ), weight2, rowZ );
	rowZ = _mm_madd_ps( _mm_load_ps( joint3 + 2 * 4 ), weight3, rowZ );

	// load four floats starting at xyz and replace the fourth lane with 1.0
	__m128 point = _mm_load_ps( base.xyz.ToFloatPtr() );
	point = _mm_and_ps( point, vector_float_mask_clear_last );
	point = _mm_or_ps( point, vector_float_last_one );

	// per-lane products; the horizontal sums are formed by the transpose and adds below
	const __m128 prodX = _mm_mul_ps( rowX, point );
	const __m128 prodY = _mm_mul_ps( rowY, point );
	const __m128 prodZ = _mm_mul_ps( rowZ, point );
	const __m128 prodW = vector_float_1_over_4;

	// 4x4 transpose
	const __m128 xzLo = _mm_unpacklo_ps( prodX, prodZ );	// x0, z0, x1, z1
	const __m128 xzHi = _mm_unpackhi_ps( prodX, prodZ );	// x2, z2, x3, z3
	const __m128 ywLo = _mm_unpacklo_ps( prodY, prodW );	// y0, w0, y1, w1
	const __m128 ywHi = _mm_unpackhi_ps( prodY, prodW );	// y2, w2, y3, w3

	const __m128 term0 = _mm_unpacklo_ps( xzLo, ywLo );		// x0, y0, z0, w0
	const __m128 term1 = _mm_unpackhi_ps( xzLo, ywLo );		// x1, y1, z1, w1
	const __m128 term2 = _mm_unpacklo_ps( xzHi, ywHi );		// x2, y2, z2, w2
	const __m128 term3 = _mm_unpackhi_ps( xzHi, ywHi );		// x3, y3, z3, w3

	// ( term0 + term1 ) + ( term2 + term3 )
	return _mm_add_ps( _mm_add_ps( term0, term1 ), _mm_add_ps( term2, term3 ) );
}
#endif
- ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) {
- const idJointMat & j0 = joints[vert.color[0]];
- const idJointMat & j1 = joints[vert.color[1]];
- const idJointMat & j2 = joints[vert.color[2]];
- const idJointMat & j3 = joints[vert.color[3]];
- const float w0 = vert.color2[0] * ( 1.0f / 255.0f );
- const float w1 = vert.color2[1] * ( 1.0f / 255.0f );
- const float w2 = vert.color2[2] * ( 1.0f / 255.0f );
- const float w3 = vert.color2[3] * ( 1.0f / 255.0f );
- idJointMat accum;
- idJointMat::Mul( accum, j0, w0 );
- idJointMat::Mad( accum, j1, w1 );
- idJointMat::Mad( accum, j2, w2 );
- idJointMat::Mad( accum, j3, w3 );
- return accum * idVec4( vert.xyz.x, vert.xyz.y, vert.xyz.z, 1.0f );
- }
- #endif /* !__DRAWVERT_INTRINSICS_H__ */
|