123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368 |
- /*
- ===========================================================================
- Doom 3 GPL Source Code
- Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
- Doom 3 Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #include "../precompiled.h"
- #pragma hdrstop
- #include "Simd_Generic.h"
- #include "Simd_MMX.h"
- #include "Simd_SSE.h"
- #include "Simd_SSE2.h"
- #include "Simd_SSE3.h"
- //===============================================================
- //
- // SSE3 implementation of idSIMDProcessor
- //
- //===============================================================
- #if defined(MACOS_X) && defined(__i386__)
- /*
- ============
- idSIMD_SSE3::GetName
-
- Returns the fixed, NUL-terminated label that lists the instruction sets
- this processor class is named after. The string has static storage, so
- the caller must neither modify nor free it.
- ============
- */
- const char * idSIMD_SSE3::GetName( void ) const {
- 	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
- 	return processorName;
- }
- #elif defined(_WIN32)
- #include <xmmintrin.h>
- #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 )) // packs four 2-bit selectors into one byte: x in bits 7-6, y in 5-4, z in 3-2, w in 1-0
- #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) // same packing with the argument order reversed: x in bits 1-0, y in 3-2, z in 5-4, w in 7-6
- #define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 )) // packs two 1-bit selectors: x in bit 1, y in bit 0
- #define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 )) // reversed order: x in bit 0, y in bit 1 (presumably shuffle immediates; none of the four is used in the visible part of this file)
- /*
- The first argument of an instruction macro is the destination
- and the second argument is the source operand. The destination
- operand can be _xmm0 to _xmm7 only. The source operand can be
- any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,
- _ebp, _ebx, _esi, or _edi that contains the effective address.
- For instance: haddps xmm0, xmm1
- becomes: haddps( _xmm0, _xmm1 )
- and: haddps xmm0, [esi]
- becomes: haddps( _xmm0, _esi )
- The ADDRESS_ADDC macro can be used when the effective source address
- is formed by adding a constant to a general purpose register.
- For instance: haddps xmm0, [esi+48]
- becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
- The ADDRESS_ADDR macro can be used when the effective source address
- is formed by adding two general purpose registers.
- For instance: haddps xmm0, [esi+eax]
- becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
- The ADDRESS_ADDRC macro can be used when the effective source address
- is formed by adding two general purpose registers and a constant.
- The constant must be in the range [-128, 127].
- For instance: haddps xmm0, [esi+eax+48]
- becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
- The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
- by adding a scaled general purpose register to another general purpose register.
- The scale must be either 1, 2, 4 or 8.
- For instance: haddps xmm0, [esi+eax*4]
- becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
- The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
- by adding a scaled general purpose register to another general purpose register and
- also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
- be in the range [-128, 127].
- For instance: haddps xmm0, [esi+eax*4+64]
- becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
-
- How the definitions below fit together:
- The register constants are the values OR-ed into the operand byte that every
- instruction macro emits last: _eax.._edi are 0x00..0x07 (memory operand through
- that register) and _xmm0.._xmm7 are 0xC0..0xC7 (register operand).
- Each ADDRESS_* macro expands to that operand-byte value followed by one or two
- further "_asm _emit" statements. It is therefore only meaningful as the src
- argument of an instruction macro, where src is the final token of the expansion.
- ADDRESS_ADDC emits exactly one byte for its constant, so the [-128, 127] limit
- stated for ADDRESS_ADDRC and ADDRESS_SCALEADDRC applies to it as well.
- RSCALE maps a scale of 1, 2, 4, 8 to 0x00, 0x40, 0x80, 0xC0. Its expansion has no
- outer parentheses and none of these macros parenthesize their arguments, so pass
- plain constants and register names only.
- NOTE(review): in the standard x86 ModRM/SIB encoding a bare _esp source selects a
- SIB byte and a bare _ebp source selects a 32-bit displacement; likewise _esp as
- reg0 of ADDRESS_ADDC, _ebp as reg0 of ADDRESS_ADDR / ADDRESS_SCALEADDR, and _esp
- as reg1 (index) of any two-register form do not encode the address described
- above. Confirm against the Intel SDM before using those combinations.
- */
- #define _eax 0x00
- #define _ecx 0x01
- #define _edx 0x02
- #define _ebx 0x03
- #define _esp 0x04
- #define _ebp 0x05
- #define _esi 0x06
- #define _edi 0x07
- #define _xmm0 0xC0
- #define _xmm1 0xC1
- #define _xmm2 0xC2
- #define _xmm3 0xC3
- #define _xmm4 0xC4
- #define _xmm5 0xC5
- #define _xmm6 0xC6
- #define _xmm7 0xC7
- #define RSCALE( s ) ( (s&2)<<5 ) | ( (s&4)<<5 ) | ( (s&8)<<3 ) | ( (s&8)<<4 )
- #define ADDRESS_ADDC( reg0, constant ) 0x40 | ( reg0 & 7 ) \
- _asm _emit constant
- #define ADDRESS_ADDR( reg0, reg1 ) 0x04 \
- _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )
- #define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \
- _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) \
- _asm _emit constant
- #define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \
- _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )
- #define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \
- _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale ) \
- _asm _emit constant
- /* Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
- Intel defines ADDSUBPS as subtracting the even-numbered elements and adding the odd-numbered ones.
- Emits the bytes F2 0F D0 followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define addsubps( dst, src ) \
- _asm _emit 0xF2 \
- _asm _emit 0x0F \
- _asm _emit 0xD0 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
- Intel defines ADDSUBPD as subtracting the low element and adding the high element.
- Emits the bytes 66 0F D0 followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define addsubpd( dst, src ) \
- _asm _emit 0x66 \
- _asm _emit 0x0F \
- _asm _emit 0xD0 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
- Emits the bytes F2 0F 7C followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define haddps( dst, src ) \
- _asm _emit 0xF2 \
- _asm _emit 0x0F \
- _asm _emit 0x7C \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
- Emits the bytes 66 0F 7C followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define haddpd( dst, src ) \
- _asm _emit 0x66 \
- _asm _emit 0x0F \
- _asm _emit 0x7C \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
- Emits the bytes F2 0F 7D followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define hsubps( dst, src ) \
- _asm _emit 0xF2 \
- _asm _emit 0x0F \
- _asm _emit 0x7D \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
- Emits the bytes 66 0F 7D followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define hsubpd( dst, src ) \
- _asm _emit 0x66 \
- _asm _emit 0x0F \
- _asm _emit 0x7D \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
- Emits the bytes F3 0F 12 followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define movsldup( dst, src ) \
- _asm _emit 0xF3 \
- _asm _emit 0x0F \
- _asm _emit 0x12 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
- Emits the bytes F2 0F 12 followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion.
- NOTE(review): this byte sequence appears to be the instruction Intel documents under the mnemonic MOVDDUP -- confirm. */
- #define movdldup( dst, src ) \
- _asm _emit 0xF2 \
- _asm _emit 0x0F \
- _asm _emit 0x12 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
- Emits the bytes F3 0F 16 followed by the operand byte; dst is _xmm0.._xmm7, src is a register constant or an ADDRESS_* expansion. */
- #define movshdup( dst, src ) \
- _asm _emit 0xF3 \
- _asm _emit 0x0F \
- _asm _emit 0x16 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
- Emits the bytes F2 0F 16 followed by the operand byte.
- NOTE(review): the SSE3 duplicate instructions documented by Intel are MOVDDUP, MOVSLDUP and MOVSHDUP; there does not
- appear to be a "high" double-precision duplicate, so F2 0F 16 may not be a defined instruction. The macro is not used in
- the visible part of this file -- verify against the Intel SDM before relying on it. */
- #define movdhdup( dst, src ) \
- _asm _emit 0xF2 \
- _asm _emit 0x0F \
- _asm _emit 0x16 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- /* Load Unaligned Integer 128 bits
- Emits the bytes F2 0F F0 followed by the operand byte; dst is _xmm0.._xmm7.
- NOTE(review): as a load, src is presumably always a memory operand (a general purpose register constant or an ADDRESS_* expansion) -- confirm. */
- #define lddqu( dst, src ) \
- _asm _emit 0xF2 \
- _asm _emit 0x0F \
- _asm _emit 0xF0 \
- _asm _emit ( ( dst & 7 ) << 3 ) | src
- #define DRAWVERT_SIZE 60 // sizeof( idDrawVert ) in bytes; asserted in TransformVerts and used as the vertex stride in its assembly
- #define DRAWVERT_XYZ_OFFSET (0*4) // byte offset of idDrawVert::xyz; asserted in TransformVerts
- #define DRAWVERT_ST_OFFSET (3*4) // presumably the byte offset of the texture coordinates -- not checked or used in the visible code
- #define DRAWVERT_NORMAL_OFFSET (5*4) // presumably the byte offset of the normal -- not checked or used in the visible code
- #define DRAWVERT_TANGENT0_OFFSET (8*4) // presumably the byte offset of the first tangent -- not checked or used in the visible code
- #define DRAWVERT_TANGENT1_OFFSET (11*4) // presumably the byte offset of the second tangent -- not checked or used in the visible code
- #define DRAWVERT_COLOR_OFFSET (14*4) // presumably the byte offset of the color -- not checked or used in the visible code
- #define JOINTQUAT_SIZE (7*4) // presumably the size of a joint quaternion record (7 floats) -- not used in the visible code
- #define JOINTMAT_SIZE (4*3*4) // sizeof( idJointMat ) in bytes (3 rows of 4 floats); asserted in TransformVerts
- #define JOINTWEIGHT_SIZE (4*4) // sizeof( idVec4 ) in bytes, the stride of the weights array; asserted in TransformVerts
- /*
- ============
- SSE3_Dot
-
- Returns the four-component dot product of v1 and v2.
- The vectors are multiplied element-wise and the four products are then
- reduced with two horizontal adds: after the first haddps every lane pair
- holds a partial sum, after the second every lane holds the full sum, and
- the low lane is stored to the result.
- NOTE(review): movaps and the memory operand of mulps fault on addresses
- that are not 16-byte aligned; presumably callers only pass aligned
- idVec4 objects -- confirm.
- ============
- */
- float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
- float d;
- __asm {
- mov esi, v1 // esi = address of the first vector
- mov edi, v2 // edi = address of the second vector
- movaps xmm0, [esi] // xmm0 = v1 (aligned load)
- mulps xmm0, [edi] // xmm0 = element-wise products of v1 and v2
- haddps( _xmm0, _xmm0 )
- haddps( _xmm0, _xmm0 )
- movss d, xmm0 // store the low lane, which now holds the sum of all four products
- }
- return d;
- }
- /*
- ============
- idSIMD_SSE3::GetName
-
- Returns the fixed, NUL-terminated label that lists the instruction sets
- this processor class is named after. The string has static storage, so
- the caller must neither modify nor free it.
- ============
- */
- const char * idSIMD_SSE3::GetName( void ) const {
- 	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
- 	return processorName;
- }
- /*
- ============
- idSIMD_SSE3::TransformVerts
-
- Computes the position of every vertex as the sum, over that vertex's weights,
- of ( joint matrix * weight ) and stores it in verts[i].xyz. No other member
- of idDrawVert is written.
-
- verts      - output vertices, numVerts entries
- numVerts   - number of vertices to produce
- joints     - base address of the joint matrices; addressed by byte offset
- weights    - one idVec4 per weight, consumed sequentially
- index      - two ints per weight: [0] is the byte offset of the joint matrix
-              from joints, [1] is zero while more weights of the same vertex
-              follow and non-zero on the last weight of a vertex
- numWeights - not read by either implementation
-
- The "#if 1" branch is the SSE3 assembly; the "#else" branch is the equivalent
- C++ kept for reference. In the assembly ecx points one past the last vertex and
- eax is a negative byte offset that counts up to zero. Each matrix row is
- multiplied element-wise with the weight and accumulated per row in xmm0..xmm2;
- the haddps sequence at doneWeight then collapses each row to one component.
- The conditional jumps after the addps and haddps/movhps/movss groups rely on
- those SSE instructions leaving the flags of the preceding cmp / add intact.
- NOTE(review): movaps and the mulps memory operands require 16-byte aligned
- weights and joints; presumably guaranteed by the callers -- confirm.
- NOTE(review): only numVerts == 0 is skipped; with a negative count the assembly
- would still run one pass of the loop whereas the C++ loop runs none -- confirm
- callers never pass a negative count.
- ============
- */
- void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
- #if 1
- assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
- assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
- assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
- assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
- __asm
- {
- mov eax, numVerts
- test eax, eax
- jz done // nothing to do for a vertex count of zero
- imul eax, DRAWVERT_SIZE // eax = size of the vertex array in bytes
- mov ecx, verts
- mov edx, index
- mov esi, weights
- mov edi, joints
- add ecx, eax // ecx = address one past the last vertex
- neg eax // eax = negative byte offset of the current vertex relative to ecx
- loopVert:
- mov ebx, [edx] // ebx = byte offset of the joint used by the first weight
- movaps xmm2, [esi] // xmm2 = first weight of this vertex
- add edx, 8 // step to the next pair of index ints
- movaps xmm0, xmm2
- add esi, JOINTWEIGHT_SIZE // step to the next weight
- movaps xmm1, xmm2
- mulps xmm0, [edi+ebx+ 0] // xmm0 = weight * matrix row 0 ( m0, m1, m2, t0 )
- mulps xmm1, [edi+ebx+16] // xmm1 = weight * matrix row 1 ( m3, m4, m5, t1 )
- mulps xmm2, [edi+ebx+32] // xmm2 = weight * matrix row 2 ( m6, m7, m8, t2 )
- cmp dword ptr [edx-4], 0 // second int of the pair just consumed
- jne doneWeight // non-zero: that was the only weight of this vertex
- loopWeight:
- mov ebx, [edx] // ebx = byte offset of the joint used by the next weight
- movaps xmm5, [esi] // xmm5 = next weight
- add edx, 8 // step to the next pair of index ints
- movaps xmm3, xmm5
- add esi, JOINTWEIGHT_SIZE // step to the next weight
- movaps xmm4, xmm5
- mulps xmm3, [edi+ebx+ 0] // xmm3 = weight * matrix row 0 ( m0, m1, m2, t0 )
- mulps xmm4, [edi+ebx+16] // xmm4 = weight * matrix row 1 ( m3, m4, m5, t1 )
- mulps xmm5, [edi+ebx+32] // xmm5 = weight * matrix row 2 ( m6, m7, m8, t2 )
- cmp dword ptr [edx-4], 0 // result is consumed by the je below, after the accumulation
- addps xmm0, xmm3 // accumulate row 0 products
- addps xmm1, xmm4 // accumulate row 1 products
- addps xmm2, xmm5 // accumulate row 2 products
- je loopWeight // zero: more weights follow for this vertex
- doneWeight:
- add eax, DRAWVERT_SIZE // advance to the next vertex; sets the flags tested by jl below
- haddps( _xmm0, _xmm1 )
- haddps( _xmm2, _xmm0 )
- movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2 // high half of xmm2 = ( x, y ) of the current vertex
- haddps( _xmm2, _xmm2 )
- movss [ecx+eax-DRAWVERT_SIZE+8], xmm2 // low lane of xmm2 = z of the current vertex
- jl loopVert // offset still negative: more vertices remain
- done:
- }
- #else
- int i, j;
- const byte *jointsPtr = (byte *)joints; // joints are addressed by byte offset
- for( j = i = 0; i < numVerts; i++ ) {
- idVec3 v;
- v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j]; // first weight of this vertex
- while( index[j*2+1] == 0 ) { // zero means another weight follows for this vertex
- j++;
- v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
- }
- j++; // move past the last weight of this vertex
- verts[i].xyz = v;
- }
- #endif
- }
- #endif /* _WIN32 */
|