123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368 |
- /*
- ===========================================================================
- Doom 3 GPL Source Code
- Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
- Doom 3 Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #include "../precompiled.h"
- #pragma hdrstop
- #include "Simd_Generic.h"
- #include "Simd_MMX.h"
- //===============================================================
- //
- // MMX implementation of idSIMDProcessor
- //
- //===============================================================
- #if defined(MACOS_X) && defined(__i386__)
- /*
- ============
- idSIMD_MMX::GetName
-
- Returns the constant string "MMX" that names this implementation.
- ============
- */
- const char * idSIMD_MMX::GetName( void ) const {
- const char *name = "MMX";
- return name;
- }
- #elif defined(_WIN32)
- #define EMMS_INSTRUCTION __asm emms // MSVC inline-asm EMMS; placed after the MMX register code in the routines below
- /*
- ============
- idSIMD_MMX::GetName
-
- Returns the constant string "MMX" that names this implementation.
- ============
- */
- const char * idSIMD_MMX::GetName( void ) const {
- const char *name = "MMX";
- return name;
- }
- /*
- ================
- MMX_Memcpy8B
-
- Copies ( count / 8 ) blocks of 8 bytes from src to dest, one MMX
- register load and one non-temporal store (MOVNTQ) per block.
- The remaining ( count % 8 ) bytes are NOT copied; the caller has to
- handle them. No SFENCE is issued here; idSIMD_MMX::Memcpy issues it
- after calling this function.
-
- dest  - destination buffer
- src   - source buffer
- count - number of bytes requested; only whole 8 byte blocks are copied
-
- Does nothing when count is smaller than 8 (including negative values).
- The copy loop only tests its counter after the first block, so entering
- it with a zero block count would wrap the counter and copy far past the
- end of both buffers.
- ================
- */
- void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
- if ( count < 8 ) {
- return; // no complete block to copy, never enter the do-while style loop
- }
- _asm {
- mov esi, src
- mov edi, dest
- mov ecx, count
- shr ecx, 3 // 8 bytes per iteration
- loop1:
- movq mm1, 0[ESI] // Read in source data
- movntq 0[EDI], mm1 // Non-temporal stores
- add esi, 8
- add edi, 8
- dec ecx // one block done
- jnz loop1 // repeat until the block counter reaches zero
- }
- EMMS_INSTRUCTION
- }
- /*
- ================
- MMX_Memcpy64B
- 165MB/sec
-
- Copies ( count / 64 ) blocks of 64 bytes from src to dest. Each block
- is loaded into the eight MMX registers and written back with
- non-temporal stores (MOVNTQ), with PREFETCHNTA hints issued for the
- source data of the following block.
- The remaining ( count % 64 ) bytes are NOT copied; the caller has to
- handle them. No SFENCE is issued here; idSIMD_MMX::Memcpy issues it
- after calling this function.
-
- dest  - destination buffer
- src   - source buffer
- count - number of bytes requested; only whole 64 byte blocks are copied
-
- Does nothing when count is smaller than 64 (including negative values).
- The copy loop only tests its counter after the first block, so entering
- it with a zero block count would wrap the counter and copy far past the
- end of both buffers.
- ================
- */
- void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
- if ( count < 64 ) {
- return; // no complete block to copy, never enter the do-while style loop
- }
- _asm {
- mov esi, src
- mov edi, dest
- mov ecx, count
- shr ecx, 6 // 64 bytes per iteration
- loop1:
- prefetchnta 64[ESI] // Prefetch next loop, non-temporal
- prefetchnta 96[ESI]
- movq mm1, 0[ESI] // Read in source data
- movq mm2, 8[ESI]
- movq mm3, 16[ESI]
- movq mm4, 24[ESI]
- movq mm5, 32[ESI]
- movq mm6, 40[ESI]
- movq mm7, 48[ESI]
- movq mm0, 56[ESI]
- movntq 0[EDI], mm1 // Non-temporal stores
- movntq 8[EDI], mm2
- movntq 16[EDI], mm3
- movntq 24[EDI], mm4
- movntq 32[EDI], mm5
- movntq 40[EDI], mm6
- movntq 48[EDI], mm7
- movntq 56[EDI], mm0
- add esi, 64
- add edi, 64
- dec ecx // one block done
- jnz loop1 // repeat until the block counter reaches zero
- }
- EMMS_INSTRUCTION
- }
- /*
- ================
- MMX_Memcpy2kB
- 240MB/sec
-
- Copies ( count / 2048 ) blocks of 2048 bytes from src to dest in two
- passes per block: first the block is read from src (with PREFETCHNTA
- hints) into a 2048 byte temporary buffer obtained from _alloca16, then
- the temporary buffer is written to dest with non-temporal stores
- (MOVNTQ). Each pass runs 32 iterations of 64 bytes.
- The remaining ( count % 2048 ) bytes are NOT copied; the caller has to
- handle them. No SFENCE is issued here; idSIMD_MMX::Memcpy issues it
- after calling this function.
-
- dest  - destination buffer
- src   - source buffer
- count - number of bytes requested; only whole 2048 byte blocks are copied
-
- Does nothing when count is smaller than 2048 (including negative
- values). The outer loop only tests its counter after the first block, so
- entering it with a zero block count would wrap the counter and copy far
- past the end of both buffers.
- ================
- */
- void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
- if ( count < 2048 ) {
- return; // no complete block to copy, never enter the do-while style loop
- }
- byte *tbuf = (byte *)_alloca16(2048);
- __asm {
- push ebx // ebx is used as the 2k block counter and restored at the end
- mov esi, src
- mov ebx, count
- shr ebx, 11 // 2048 bytes at a time
- mov edi, dest
- loop2k:
- push edi // copy 2k into temporary buffer
- mov edi, tbuf
- mov ecx, 32 // 32 iterations * 64 bytes = 2048 bytes
- loopMemToL1:
- prefetchnta 64[ESI] // Prefetch next loop, non-temporal
- prefetchnta 96[ESI]
- movq mm1, 0[ESI] // Read in source data
- movq mm2, 8[ESI]
- movq mm3, 16[ESI]
- movq mm4, 24[ESI]
- movq mm5, 32[ESI]
- movq mm6, 40[ESI]
- movq mm7, 48[ESI]
- movq mm0, 56[ESI]
- movq 0[EDI], mm1 // Store into L1
- movq 8[EDI], mm2
- movq 16[EDI], mm3
- movq 24[EDI], mm4
- movq 32[EDI], mm5
- movq 40[EDI], mm6
- movq 48[EDI], mm7
- movq 56[EDI], mm0
- add esi, 64
- add edi, 64
- dec ecx
- jnz loopMemToL1
- pop edi // Now copy from L1 to system memory
- push esi // save the advanced source pointer while esi walks the temporary buffer
- mov esi, tbuf
- mov ecx, 32 // 32 iterations * 64 bytes = 2048 bytes
- loopL1ToMem:
- movq mm1, 0[ESI] // Read in source data from L1
- movq mm2, 8[ESI]
- movq mm3, 16[ESI]
- movq mm4, 24[ESI]
- movq mm5, 32[ESI]
- movq mm6, 40[ESI]
- movq mm7, 48[ESI]
- movq mm0, 56[ESI]
- movntq 0[EDI], mm1 // Non-temporal stores
- movntq 8[EDI], mm2
- movntq 16[EDI], mm3
- movntq 24[EDI], mm4
- movntq 32[EDI], mm5
- movntq 40[EDI], mm6
- movntq 48[EDI], mm7
- movntq 56[EDI], mm0
- add esi, 64
- add edi, 64
- dec ecx
- jnz loopL1ToMem
- pop esi // Do next 2k block
- dec ebx
- jnz loop2k
- pop ebx
- }
- EMMS_INSTRUCTION
- }
- /*
- ================
- idSIMD_MMX::Memcpy
- optimized memory copy routine that handles all alignment cases and block sizes efficiently
-
- dest0  - destination buffer
- src0   - source buffer
- count0 - number of bytes to copy
-
- When more than 16 bytes are copied and dest0 and src0 have the same
- offset within an 8 byte boundary (equal low three address bits), the
- copy is split up:
-   1. a head of 0-7 bytes copied with memcpy so that both pointers
-      become 8 byte aligned,
-   2. MMX_Memcpy2kB when at least 4096 bytes remain,
-   3. MMX_Memcpy64B for the remaining whole 64 byte blocks,
-   4. MMX_Memcpy8B for the remaining whole 8 byte blocks,
-   5. memcpy for the last 0-7 bytes.
- In every other case the whole copy is done by memcpy.
- An SFENCE is executed before returning on both paths.
-
- NOTE: the pointers are cast to int to inspect their low bits; this code
- is only built inside the _WIN32 section with x86 inline assembly.
- ================
- */
- void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
- // if copying more than 16 bytes and we can copy 8 byte aligned
- if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
- byte *dest = (byte *)dest0;
- byte *src = (byte *)src0;
- // copy up to the first 8 byte aligned boundary
- // the distance to that boundary is ( 8 - misalignment ) & 7 bytes; copying
- // 'misalignment' bytes only lands on a boundary for misalignments of 0 and 4.
- // src has the same low three address bits, so it becomes aligned as well.
- int count = ( 8 - ( ((int)dest) & 7 ) ) & 7;
- memcpy( dest, src, count );
- dest += count;
- src += count;
- count = count0 - count; // at least 10 bytes remain because count0 > 16
- // if there are multiple blocks of 2kB
- if ( count & ~4095 ) {
- MMX_Memcpy2kB( dest, src, count );
- src += (count & ~2047);
- dest += (count & ~2047);
- count &= 2047;
- }
- // if there are blocks of 64 bytes
- if ( count & ~63 ) {
- MMX_Memcpy64B( dest, src, count );
- src += (count & ~63);
- dest += (count & ~63);
- count &= 63;
- }
- // if there are blocks of 8 bytes
- if ( count & ~7 ) {
- MMX_Memcpy8B( dest, src, count );
- src += (count & ~7);
- dest += (count & ~7);
- count &= 7;
- }
- // copy any remaining bytes
- memcpy( dest, src, count );
- } else {
- // use the regular one if we cannot copy 8 byte aligned
- memcpy( dest0, src0, count0 );
- }
- // the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
- __asm {
- sfence
- }
- }
- /*
- ================
- idSIMD_MMX::Memset
-
- Fills count0 bytes starting at dest0 with val (stored through a byte
- pointer, so only the byte-sized part of val is written).
-
- dest0  - destination buffer
- val    - fill value
- count0 - number of bytes to fill
-
- The fill is done in four stages:
-   1. single byte stores until dest is 8 byte aligned (or count runs out),
-   2. 64 bytes per iteration with eight non-temporal MMX stores (MOVNTQ),
-   3. 8 bytes per iteration with one non-temporal MMX store,
-   4. single byte stores for the last 0-7 bytes.
- Returns right after stage 1 when nothing is left to fill; otherwise EMMS
- and SFENCE are executed before returning.
- ================
- */
- void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
- union {
- byte bytes[8];
- word words[4];
- dword dwords[2];
- } dat; // 8 byte fill pattern that is loaded into the MMX registers below
- byte *dest = (byte *)dest0;
- int count = count0;
- while ( count > 0 && (((int)dest) & 7) ) { // byte stores until dest is 8 byte aligned
- *dest = val;
- dest++;
- count--;
- }
- if ( !count ) { // everything was written by the alignment loop, no MMX code has run yet
- return;
- }
- dat.bytes[0] = val; // build the pattern by doubling: 2 bytes, then 4, then 8
- dat.bytes[1] = val;
- dat.words[1] = dat.words[0]; // presumably word is 16 bit, which makes this bytes 2-3
- dat.dwords[1] = dat.dwords[0]; // presumably dword is 32 bit, which makes this bytes 4-7
- if ( count >= 64 ) {
- __asm {
- mov edi, dest
- mov ecx, count
- shr ecx, 6 // 64 bytes per iteration
- movq mm1, dat // Read in source data
- movq mm2, mm1
- movq mm3, mm1
- movq mm4, mm1
- movq mm5, mm1
- movq mm6, mm1
- movq mm7, mm1
- movq mm0, mm1
- loop1:
- movntq 0[EDI], mm1 // Non-temporal stores
- movntq 8[EDI], mm2
- movntq 16[EDI], mm3
- movntq 24[EDI], mm4
- movntq 32[EDI], mm5
- movntq 40[EDI], mm6
- movntq 48[EDI], mm7
- movntq 56[EDI], mm0
- add edi, 64
- dec ecx
- jnz loop1
- }
- dest += ( count & ~63 ); // skip the bytes written by the asm loop above
- count &= 63;
- }
- if ( count >= 8 ) {
- __asm {
- mov edi, dest
- mov ecx, count
- shr ecx, 3 // 8 bytes per iteration
- movq mm1, dat // Read in source data
- loop2:
- movntq 0[EDI], mm1 // Non-temporal stores
- add edi, 8
- dec ecx
- jnz loop2
- }
- dest += (count & ~7); // skip the bytes written by the asm loop above
- count &= 7;
- }
- while ( count > 0 ) { // last 0-7 bytes
- *dest = val;
- dest++;
- count--;
- }
- EMMS_INSTRUCTION
- // the fill loops above use MOVNTQ, issue a fence operation
- __asm {
- sfence
- }
- }
- #endif /* _WIN32 */
|