// sys_intrinsics.h
  1. /*
  2. ===========================================================================
  3. Doom 3 BFG Edition GPL Source Code
  4. Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
  6. Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #ifndef __SYS_INTRIINSICS_H__
  21. #define __SYS_INTRIINSICS_H__
  22. /*
  23. ================================================================================================
  24. Scalar single precision floating-point intrinsics
  25. ================================================================================================
  26. */
  27. ID_INLINE_EXTERN float __fmuls( float a, float b ) { return ( a * b ); }
  28. ID_INLINE_EXTERN float __fmadds( float a, float b, float c ) { return ( a * b + c ); }
  29. ID_INLINE_EXTERN float __fnmsubs( float a, float b, float c ) { return ( c - a * b ); }
  30. ID_INLINE_EXTERN float __fsels( float a, float b, float c ) { return ( a >= 0.0f ) ? b : c; }
  31. ID_INLINE_EXTERN float __frcps( float x ) { return ( 1.0f / x ); }
  32. ID_INLINE_EXTERN float __fdivs( float x, float y ) { return ( x / y ); }
  33. ID_INLINE_EXTERN float __frsqrts( float x ) { return ( 1.0f / sqrtf( x ) ); }
  34. ID_INLINE_EXTERN float __frcps16( float x ) { return ( 1.0f / x ); }
  35. ID_INLINE_EXTERN float __fdivs16( float x, float y ) { return ( x / y ); }
  36. ID_INLINE_EXTERN float __frsqrts16( float x ) { return ( 1.0f / sqrtf( x ) ); }
  37. ID_INLINE_EXTERN float __frndz( float x ) { return (float)( (int)( x ) ); }
  38. /*
  39. ================================================================================================
  40. Zero cache line and prefetch intrinsics
  41. ================================================================================================
  42. */
  43. #ifdef ID_WIN_X86_SSE2_INTRIN
  44. // The code below assumes that a cache line is 64 bytes.
  45. // We specify the cache line size as 128 here to make the code consistent with the consoles.
  46. #define CACHE_LINE_SIZE 128
// Prefetch hint for the two 64-byte hardware lines starting at ptr + offset.
// Deliberately a no-op here: the real prefetch sequence is kept below as
// commented-out code to document what it would issue. NOTE(review): the reason
// it is disabled is not visible in this file — presumably measured as not
// beneficial on PC; confirm before re-enabling.
ID_FORCE_INLINE void Prefetch( const void * ptr, int offset ) {
//	const char * bytePtr = ( (const char *) ptr ) + offset;
//	_mm_prefetch( bytePtr + 0, _MM_HINT_NTA );
//	_mm_prefetch( bytePtr + 64, _MM_HINT_NTA );
}
  52. ID_FORCE_INLINE void ZeroCacheLine( void * ptr, int offset ) {
  53. assert_128_byte_aligned( ptr );
  54. char * bytePtr = ( (char *) ptr ) + offset;
  55. __m128i zero = _mm_setzero_si128();
  56. _mm_store_si128( (__m128i *) ( bytePtr + 0*16 ), zero );
  57. _mm_store_si128( (__m128i *) ( bytePtr + 1*16 ), zero );
  58. _mm_store_si128( (__m128i *) ( bytePtr + 2*16 ), zero );
  59. _mm_store_si128( (__m128i *) ( bytePtr + 3*16 ), zero );
  60. _mm_store_si128( (__m128i *) ( bytePtr + 4*16 ), zero );
  61. _mm_store_si128( (__m128i *) ( bytePtr + 5*16 ), zero );
  62. _mm_store_si128( (__m128i *) ( bytePtr + 6*16 ), zero );
  63. _mm_store_si128( (__m128i *) ( bytePtr + 7*16 ), zero );
  64. }
  65. ID_FORCE_INLINE void FlushCacheLine( const void * ptr, int offset ) {
  66. const char * bytePtr = ( (const char *) ptr ) + offset;
  67. _mm_clflush( bytePtr + 0 );
  68. _mm_clflush( bytePtr + 64 );
  69. }
  70. /*
  71. ================================================
  72. Other
  73. ================================================
  74. */
  75. #else
  76. #define CACHE_LINE_SIZE 128
// Generic fallbacks for platforms without the SSE2 path above.
// Prefetching is purely an optimization hint, so it is safe as a no-op.
ID_INLINE void Prefetch( const void * ptr, int offset ) {}
// Zero the whole cache line containing ptr + offset. Note the address is
// rounded DOWN to a CACHE_LINE_SIZE boundary first, unlike the SSE2 version
// which requires ptr itself to be 128-byte aligned.
ID_INLINE void ZeroCacheLine( void * ptr, int offset ) {
byte * bytePtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) );
memset( bytePtr, 0, CACHE_LINE_SIZE );
}
// No explicit flush is performed on this path.
ID_INLINE void FlushCacheLine( const void * ptr, int offset ) {}
  83. #endif
  84. /*
  85. ================================================
  86. Block Clear Macros
  87. ================================================
  88. */
  89. // number of additional elements that are potentially cleared when clearing whole cache lines at a time
  90. ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size ) {
  91. if ( ( size & ( CACHE_LINE_SIZE - 1 ) ) == 0 ) {
  92. return 0;
  93. }
  94. if ( size > CACHE_LINE_SIZE ) {
  95. return 1;
  96. }
  97. return ( CACHE_LINE_SIZE / ( size & ( CACHE_LINE_SIZE - 1 ) ) );
  98. }
  99. // if the pointer is not on a cache line boundary this assumes the cache line the pointer starts in was already cleared
  100. #define CACHE_LINE_CLEAR_BLOCK( ptr, size ) \
  101. byte * startPtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ) ); \
  102. byte * endPtr = (byte *)( ( (UINT_PTR) ( ptr ) + ( size ) - 1 ) & ~( CACHE_LINE_SIZE - 1 ) ); \
  103. for ( ; startPtr <= endPtr; startPtr += CACHE_LINE_SIZE ) { \
  104. ZeroCacheLine( startPtr, 0 ); \
  105. }
  106. #define CACHE_LINE_CLEAR_BLOCK_AND_FLUSH( ptr, size ) \
  107. byte * startPtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ) ); \
  108. byte * endPtr = (byte *)( ( (UINT_PTR) ( ptr ) + ( size ) - 1 ) & ~( CACHE_LINE_SIZE - 1 ) ); \
  109. for ( ; startPtr <= endPtr; startPtr += CACHE_LINE_SIZE ) { \
  110. ZeroCacheLine( startPtr, 0 ); \
  111. FlushCacheLine( startPtr, 0 ); \
  112. }
  113. /*
  114. ================================================================================================
  115. Vector Intrinsics
  116. ================================================================================================
  117. */
  118. /*
  119. ================================================
  120. PC Windows
  121. ================================================
  122. */
// Build an SSE shuffle immediate from four 2-bit source-lane selectors:
// result lane 0 takes source lane x, lane 1 takes y, lane 2 takes z,
// lane 3 takes w (reverse of the _MM_SHUFFLE argument order).
#if !defined( R_SHUFFLE_D )
#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#endif
// make the intrinsics "type unsafe"
// Union that converts freely between the float ( __m128 ) and integer
// ( __m128i ) SSE register views, so bitwise tricks can mix the two without
// explicit casts at every call site. MSVC-specific: __declspec(intrin_type)
// and _CRT_ALIGN(16) keep the union register-mapped and 16-byte aligned.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128c {
	__m128c() {}
	__m128c( __m128 f ) { m128 = f; }		// implicit conversion from float vector
	__m128c( __m128i i ) { m128i = i; }		// implicit conversion from integer vector
	operator __m128() { return m128; }		// implicit conversion to float vector
	operator __m128i() { return m128i; }	// implicit conversion to integer vector
	__m128 m128;
	__m128i m128i;
} __m128c;
// AltiVec/PowerPC-style vector operations emulated with SSE2.
// NOTE(review): these are macros, so some arguments are evaluated more than
// once (e.g. 'c' in the sel macros) — avoid side effects in arguments.
#define _mm_madd_ps( a, b, c ) _mm_add_ps( _mm_mul_ps( (a), (b) ), (c) )
#define _mm_nmsub_ps( a, b, c ) _mm_sub_ps( (c), _mm_mul_ps( (a), (b) ) )
// broadcast lane i of x to all four lanes
#define _mm_splat_ps( x, i ) __m128c( _mm_shuffle_epi32( __m128c( x ), _MM_SHUFFLE( i, i, i, i ) ) )
// permute the lanes of x by the shuffle immediate 'perm'
#define _mm_perm_ps( x, perm ) __m128c( _mm_shuffle_epi32( __m128c( x ), perm ) )
// bitwise select: where mask c has 1 bits take b, elsewhere take a
#define _mm_sel_ps( a, b, c ) _mm_or_ps( _mm_andnot_ps( __m128c( c ), a ), _mm_and_ps( __m128c( c ), b ) )
#define _mm_sel_si128( a, b, c ) _mm_or_si128( _mm_andnot_si128( __m128c( c ), a ), _mm_and_si128( __m128c( c ), b ) )
// byte funnel: low bytes come from x >> imm, high bytes from y << ( 16 - imm )
#define _mm_sld_ps( x, y, imm ) __m128c( _mm_or_si128( _mm_srli_si128( __m128c( x ), imm ), _mm_slli_si128( __m128c( y ), 16 - imm ) ) )
#define _mm_sld_si128( x, y, imm ) _mm_or_si128( _mm_srli_si128( x, imm ), _mm_slli_si128( y, 16 - imm ) )
  144. ID_FORCE_INLINE_EXTERN __m128 _mm_msum3_ps( __m128 a, __m128 b ) {
  145. __m128 c = _mm_mul_ps( a, b );
  146. return _mm_add_ps( _mm_splat_ps( c, 0 ), _mm_add_ps( _mm_splat_ps( c, 1 ), _mm_splat_ps( c, 2 ) ) );
  147. }
  148. ID_FORCE_INLINE_EXTERN __m128 _mm_msum4_ps( __m128 a, __m128 b ) {
  149. __m128 c = _mm_mul_ps( a, b );
  150. c = _mm_add_ps( c, _mm_perm_ps( c, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
  151. c = _mm_add_ps( c, _mm_perm_ps( c, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
  152. return c;
  153. }
  154. #define _mm_shufmix_epi32( x, y, perm ) __m128c( _mm_shuffle_ps( __m128c( x ), __m128c( y ), perm ) )
  155. #define _mm_loadh_epi64( x, address ) __m128c( _mm_loadh_pi( __m128c( x ), (__m64 *)address ) )
  156. #define _mm_storeh_epi64( address, x ) _mm_storeh_pi( (__m64 *)address, __m128c( x ) )
  157. // floating-point reciprocal with close to full precision
  158. ID_FORCE_INLINE_EXTERN __m128 _mm_rcp32_ps( __m128 x ) {
  159. __m128 r = _mm_rcp_ps( x ); // _mm_rcp_ps() has 12 bits of precision
  160. r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
  161. r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
  162. return r;
  163. }
  164. // floating-point reciprocal with at least 16 bits precision
  165. ID_FORCE_INLINE_EXTERN __m128 _mm_rcp16_ps( __m128 x ) {
  166. __m128 r = _mm_rcp_ps( x ); // _mm_rcp_ps() has 12 bits of precision
  167. r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
  168. return r;
  169. }
  170. // floating-point divide with close to full precision
  171. ID_FORCE_INLINE_EXTERN __m128 _mm_div32_ps( __m128 x, __m128 y ) {
  172. return _mm_mul_ps( x, _mm_rcp32_ps( y ) );
  173. }
  174. // floating-point divide with at least 16 bits precision
  175. ID_FORCE_INLINE_EXTERN __m128 _mm_div16_ps( __m128 x, __m128 y ) {
  176. return _mm_mul_ps( x, _mm_rcp16_ps( y ) );
  177. }
// load idBounds::GetMins()
// Gathers the three mins floats without requiring 16-byte alignment:
// _mm_load_ss reads .x into lane 0, _mm_loadh_pi reads .y/.z into lanes 2-3,
// and the permute produces ( x, y, z, 0 ). NOTE(review): assumes y and z are
// laid out contiguously right after x — verify against the idVec3 definition.
#define _mm_loadu_bounds_0( bounds ) _mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[0].x ), (__m64 *) & bounds[0].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
// load idBounds::GetMaxs()
// Same unaligned gather for the maxs corner: result is ( x, y, z, 0 ).
#define _mm_loadu_bounds_1( bounds ) _mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[1].x ), (__m64 *) & bounds[1].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
  182. #endif // !__SYS_INTRIINSICS_H__