Simd_MMX.cpp 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. ===========================================================================
  3. Doom 3 GPL Source Code
  4. Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
  6. Doom 3 Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #include "../precompiled.h"
  21. #pragma hdrstop
  22. #include "Simd_Generic.h"
  23. #include "Simd_MMX.h"
  24. //===============================================================
  25. //
  26. // MMX implementation of idSIMDProcessor
  27. //
  28. //===============================================================
  29. #if defined(MACOS_X) && defined(__i386__)
  30. /*
  31. ============
  32. idSIMD_MMX::GetName
  33. ============
  34. */
  35. const char * idSIMD_MMX::GetName( void ) const {
  36. return "MMX";
  37. }
  38. #elif defined(_WIN32)
  39. #define EMMS_INSTRUCTION __asm emms
  40. /*
  41. ============
  42. idSIMD_MMX::GetName
  43. ============
  44. */
  45. const char * idSIMD_MMX::GetName( void ) const {
  46. return "MMX";
  47. }
  48. /*
  49. ================
  50. MMX_Memcpy8B
  51. ================
  52. */
  53. void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
  54. _asm {
  55. mov esi, src
  56. mov edi, dest
  57. mov ecx, count
  58. shr ecx, 3 // 8 bytes per iteration
  59. loop1:
  60. movq mm1, 0[ESI] // Read in source data
  61. movntq 0[EDI], mm1 // Non-temporal stores
  62. add esi, 8
  63. add edi, 8
  64. dec ecx
  65. jnz loop1
  66. }
  67. EMMS_INSTRUCTION
  68. }
  69. /*
  70. ================
  71. MMX_Memcpy64B
  72. 165MB/sec
  73. ================
  74. */
  75. void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
  76. _asm {
  77. mov esi, src
  78. mov edi, dest
  79. mov ecx, count
  80. shr ecx, 6 // 64 bytes per iteration
  81. loop1:
  82. prefetchnta 64[ESI] // Prefetch next loop, non-temporal
  83. prefetchnta 96[ESI]
  84. movq mm1, 0[ESI] // Read in source data
  85. movq mm2, 8[ESI]
  86. movq mm3, 16[ESI]
  87. movq mm4, 24[ESI]
  88. movq mm5, 32[ESI]
  89. movq mm6, 40[ESI]
  90. movq mm7, 48[ESI]
  91. movq mm0, 56[ESI]
  92. movntq 0[EDI], mm1 // Non-temporal stores
  93. movntq 8[EDI], mm2
  94. movntq 16[EDI], mm3
  95. movntq 24[EDI], mm4
  96. movntq 32[EDI], mm5
  97. movntq 40[EDI], mm6
  98. movntq 48[EDI], mm7
  99. movntq 56[EDI], mm0
  100. add esi, 64
  101. add edi, 64
  102. dec ecx
  103. jnz loop1
  104. }
  105. EMMS_INSTRUCTION
  106. }
  107. /*
  108. ================
  109. MMX_Memcpy2kB
  110. 240MB/sec
  111. ================
  112. */
  113. void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
  114. byte *tbuf = (byte *)_alloca16(2048);
  115. __asm {
  116. push ebx
  117. mov esi, src
  118. mov ebx, count
  119. shr ebx, 11 // 2048 bytes at a time
  120. mov edi, dest
  121. loop2k:
  122. push edi // copy 2k into temporary buffer
  123. mov edi, tbuf
  124. mov ecx, 32
  125. loopMemToL1:
  126. prefetchnta 64[ESI] // Prefetch next loop, non-temporal
  127. prefetchnta 96[ESI]
  128. movq mm1, 0[ESI] // Read in source data
  129. movq mm2, 8[ESI]
  130. movq mm3, 16[ESI]
  131. movq mm4, 24[ESI]
  132. movq mm5, 32[ESI]
  133. movq mm6, 40[ESI]
  134. movq mm7, 48[ESI]
  135. movq mm0, 56[ESI]
  136. movq 0[EDI], mm1 // Store into L1
  137. movq 8[EDI], mm2
  138. movq 16[EDI], mm3
  139. movq 24[EDI], mm4
  140. movq 32[EDI], mm5
  141. movq 40[EDI], mm6
  142. movq 48[EDI], mm7
  143. movq 56[EDI], mm0
  144. add esi, 64
  145. add edi, 64
  146. dec ecx
  147. jnz loopMemToL1
  148. pop edi // Now copy from L1 to system memory
  149. push esi
  150. mov esi, tbuf
  151. mov ecx, 32
  152. loopL1ToMem:
  153. movq mm1, 0[ESI] // Read in source data from L1
  154. movq mm2, 8[ESI]
  155. movq mm3, 16[ESI]
  156. movq mm4, 24[ESI]
  157. movq mm5, 32[ESI]
  158. movq mm6, 40[ESI]
  159. movq mm7, 48[ESI]
  160. movq mm0, 56[ESI]
  161. movntq 0[EDI], mm1 // Non-temporal stores
  162. movntq 8[EDI], mm2
  163. movntq 16[EDI], mm3
  164. movntq 24[EDI], mm4
  165. movntq 32[EDI], mm5
  166. movntq 40[EDI], mm6
  167. movntq 48[EDI], mm7
  168. movntq 56[EDI], mm0
  169. add esi, 64
  170. add edi, 64
  171. dec ecx
  172. jnz loopL1ToMem
  173. pop esi // Do next 2k block
  174. dec ebx
  175. jnz loop2k
  176. pop ebx
  177. }
  178. EMMS_INSTRUCTION
  179. }
  180. /*
  181. ================
  182. idSIMD_MMX::Memcpy
  183. optimized memory copy routine that handles all alignment cases and block sizes efficiently
  184. ================
  185. */
  186. void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
  187. // if copying more than 16 bytes and we can copy 8 byte aligned
  188. if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
  189. byte *dest = (byte *)dest0;
  190. byte *src = (byte *)src0;
  191. // copy up to the first 8 byte aligned boundary
  192. int count = ((int)dest) & 7;
  193. memcpy( dest, src, count );
  194. dest += count;
  195. src += count;
  196. count = count0 - count;
  197. // if there are multiple blocks of 2kB
  198. if ( count & ~4095 ) {
  199. MMX_Memcpy2kB( dest, src, count );
  200. src += (count & ~2047);
  201. dest += (count & ~2047);
  202. count &= 2047;
  203. }
  204. // if there are blocks of 64 bytes
  205. if ( count & ~63 ) {
  206. MMX_Memcpy64B( dest, src, count );
  207. src += (count & ~63);
  208. dest += (count & ~63);
  209. count &= 63;
  210. }
  211. // if there are blocks of 8 bytes
  212. if ( count & ~7 ) {
  213. MMX_Memcpy8B( dest, src, count );
  214. src += (count & ~7);
  215. dest += (count & ~7);
  216. count &= 7;
  217. }
  218. // copy any remaining bytes
  219. memcpy( dest, src, count );
  220. } else {
  221. // use the regular one if we cannot copy 8 byte aligned
  222. memcpy( dest0, src0, count0 );
  223. }
  224. // the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
  225. __asm {
  226. sfence
  227. }
  228. }
  229. /*
  230. ================
  231. idSIMD_MMX::Memset
  232. ================
  233. */
  234. void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
  235. union {
  236. byte bytes[8];
  237. word words[4];
  238. dword dwords[2];
  239. } dat;
  240. byte *dest = (byte *)dest0;
  241. int count = count0;
  242. while ( count > 0 && (((int)dest) & 7) ) {
  243. *dest = val;
  244. dest++;
  245. count--;
  246. }
  247. if ( !count ) {
  248. return;
  249. }
  250. dat.bytes[0] = val;
  251. dat.bytes[1] = val;
  252. dat.words[1] = dat.words[0];
  253. dat.dwords[1] = dat.dwords[0];
  254. if ( count >= 64 ) {
  255. __asm {
  256. mov edi, dest
  257. mov ecx, count
  258. shr ecx, 6 // 64 bytes per iteration
  259. movq mm1, dat // Read in source data
  260. movq mm2, mm1
  261. movq mm3, mm1
  262. movq mm4, mm1
  263. movq mm5, mm1
  264. movq mm6, mm1
  265. movq mm7, mm1
  266. movq mm0, mm1
  267. loop1:
  268. movntq 0[EDI], mm1 // Non-temporal stores
  269. movntq 8[EDI], mm2
  270. movntq 16[EDI], mm3
  271. movntq 24[EDI], mm4
  272. movntq 32[EDI], mm5
  273. movntq 40[EDI], mm6
  274. movntq 48[EDI], mm7
  275. movntq 56[EDI], mm0
  276. add edi, 64
  277. dec ecx
  278. jnz loop1
  279. }
  280. dest += ( count & ~63 );
  281. count &= 63;
  282. }
  283. if ( count >= 8 ) {
  284. __asm {
  285. mov edi, dest
  286. mov ecx, count
  287. shr ecx, 3 // 8 bytes per iteration
  288. movq mm1, dat // Read in source data
  289. loop2:
  290. movntq 0[EDI], mm1 // Non-temporal stores
  291. add edi, 8
  292. dec ecx
  293. jnz loop2
  294. }
  295. dest += (count & ~7);
  296. count &= 7;
  297. }
  298. while ( count > 0 ) {
  299. *dest = val;
  300. dest++;
  301. count--;
  302. }
  303. EMMS_INSTRUCTION
  304. // the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
  305. __asm {
  306. sfence
  307. }
  308. }
  309. #endif /* _WIN32 */