Simd_SSE3.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. ===========================================================================
  3. Doom 3 GPL Source Code
  4. Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
  5. This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
  6. Doom 3 Source Code is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. Doom 3 Source Code is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
  16. In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
  17. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  18. ===========================================================================
  19. */
  20. #include "../precompiled.h"
  21. #pragma hdrstop
  22. #include "Simd_Generic.h"
  23. #include "Simd_MMX.h"
  24. #include "Simd_SSE.h"
  25. #include "Simd_SSE2.h"
  26. #include "Simd_SSE3.h"
  27. //===============================================================
  28. //
  29. // SSE3 implementation of idSIMDProcessor
  30. //
  31. //===============================================================
  32. #if defined(MACOS_X) && defined(__i386__)
  33. /*
  34. ============
  35. idSIMD_SSE3::GetName
  36. ============
  37. */
  38. const char * idSIMD_SSE3::GetName( void ) const {
  39. return "MMX & SSE & SSE2 & SSE3";
  40. }
  41. #elif defined(_WIN32)
  42. #include <xmmintrin.h>
  // Build the 8-bit immediate of a four-lane shuffle from four 2-bit selectors.
  // SHUFFLEPS places its first argument in the top two bits, R_SHUFFLEPS in the bottom two.
  #define SHUFFLEPS( x, y, z, w )     ( ( (w) & 3 ) | ( ( (z) & 3 ) << 2 ) | ( ( (y) & 3 ) << 4 ) | ( ( (x) & 3 ) << 6 ) )
  #define R_SHUFFLEPS( x, y, z, w )   ( ( (x) & 3 ) | ( ( (y) & 3 ) << 2 ) | ( ( (z) & 3 ) << 4 ) | ( ( (w) & 3 ) << 6 ) )
  // Build the 2-bit immediate of a two-lane shuffle from two 1-bit selectors.
  // SHUFFLEPD places its first argument in bit 1, R_SHUFFLEPD in bit 0.
  #define SHUFFLEPD( x, y )           ( ( (y) & 1 ) | ( ( (x) & 1 ) << 1 ) )
  #define R_SHUFFLEPD( x, y )         ( ( (x) & 1 ) | ( ( (y) & 1 ) << 1 ) )
  47. /*
  48. The first argument of an instruction macro is the destination
  49. and the second argument is the source operand. The destination
  50. operand can be _xmm0 to _xmm7 only. The source operand can be
  51. any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,
  52. _ebp, _ebx, _esi, or _edi that contains the effective address.
  53. For instance: haddps xmm0, xmm1
  54. becomes: haddps( _xmm0, _xmm1 )
  55. and: haddps xmm0, [esi]
  56. becomes: haddps( _xmm0, _esi )
  57. The ADDRESS_ADDC macro can be used when the effective source address
  58. is formed by adding a constant to a general purpose register.
  59. For instance: haddps xmm0, [esi+48]
  60. becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
  61. The ADDRESS_ADDR macro can be used when the effective source address
  62. is formed by adding two general purpose registers.
  63. For instance: haddps xmm0, [esi+eax]
  64. becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
  65. The ADDRESS_ADDRC macro can be used when the effective source address
  66. is formed by adding two general purpose registers and a constant.
  67. The constant must be in the range [-128, 127].
  68. For instance: haddps xmm0, [esi+eax+48]
  69. becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
  70. The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
  71. by adding a scaled general purpose register to another general purpose register.
  72. The scale must be either 1, 2, 4 or 8.
  73. For instance: haddps xmm0, [esi+eax*4]
  74. becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
  75. The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
  76. by adding a scaled general purpose register to another general purpose register and
  77. also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
  78. be in the range [-128, 127].
  79. For instance: haddps xmm0, [esi+eax*4+64]
  80. becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
  81. */
  // ModRM r/m codes of the general purpose registers, used when the source operand is the memory at [reg].
  // NOTE(review): in 32-bit ModRM encoding r/m=4 (_esp) announces a SIB byte and r/m=5 (_ebp) with mod=00
  // means a bare disp32, so passing _esp or _ebp directly as 'src' does not encode [esp] / [ebp].
  #define _eax    0x00
  #define _ecx    0x01
  #define _edx    0x02
  #define _ebx    0x03
  #define _esp    0x04
  #define _ebp    0x05
  #define _esi    0x06
  #define _edi    0x07
  // Register-direct ModRM patterns: mod=11 in the top two bits, xmm register number in the low three.
  // As a destination only the low three bits are used.
  #define _xmm0   0xC0
  #define _xmm1   0xC1
  #define _xmm2   0xC2
  #define _xmm3   0xC3
  #define _xmm4   0xC4
  #define _xmm5   0xC5
  #define _xmm6   0xC6
  #define _xmm7   0xC7
  // SIB scale field ( bits 7..6 ) for a scale of 1, 2, 4 or 8: 0x00, 0x40, 0x80, 0xC0.
  // Both the argument and the whole expansion are parenthesized so that expression
  // arguments ( e.g. 1+1 ) and use inside larger expressions evaluate correctly.
  #define RSCALE( s ) ( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )
  // The ADDRESS_* macros expand to the tail of a ModRM byte followed by extra _asm _emit
  // statements ( SIB byte and/or 8-bit displacement ). They are only meaningful as the 'src'
  // argument of an instruction macro and therefore must stay unparenthesized as a whole.
  // NOTE(review): the SIB forms cannot use _esp as reg1 ( index 4 means "no index" ), and the
  // forms without a constant cannot use _ebp as reg0 ( base 5 with mod=00 means disp32 ).
  // [reg0 + constant]
  #define ADDRESS_ADDC( reg0, constant ) 0x40 | ( (reg0) & 7 ) \
      _asm _emit constant
  // [reg0 + reg1]
  #define ADDRESS_ADDR( reg0, reg1 ) 0x04 \
      _asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 )
  // [reg0 + reg1 + constant]
  #define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \
      _asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) \
      _asm _emit constant
  // [reg0 + reg1 * scale]
  #define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \
      _asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) | RSCALE( scale )
  // [reg0 + reg1 * scale + constant]
  #define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \
      _asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) | RSCALE( scale ) \
      _asm _emit constant
  /*
  Each macro below hand-assembles one SSE3 instruction as four bytes through _asm _emit:
  a mandatory prefix ( 0xF2, 0xF3 or 0x66 ), the 0x0F escape byte, the opcode and a ModRM byte.
  'dst' supplies the ModRM reg field ( low three bits of an _xmmN code ). 'src' is pasted in
  unparenthesized on purpose: it is either a register code or an ADDRESS_* macro whose
  expansion appends further _asm _emit statements.
  */
  // Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
  #define addsubps( dst, src ) \
      _asm _emit 0xF2 \
      _asm _emit 0x0F \
      _asm _emit 0xD0 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
  #define addsubpd( dst, src ) \
      _asm _emit 0x66 \
      _asm _emit 0x0F \
      _asm _emit 0xD0 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
  #define haddps( dst, src ) \
      _asm _emit 0xF2 \
      _asm _emit 0x0F \
      _asm _emit 0x7C \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
  #define haddpd( dst, src ) \
      _asm _emit 0x66 \
      _asm _emit 0x0F \
      _asm _emit 0x7C \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
  #define hsubps( dst, src ) \
      _asm _emit 0xF2 \
      _asm _emit 0x0F \
      _asm _emit 0x7D \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
  #define hsubpd( dst, src ) \
      _asm _emit 0x66 \
      _asm _emit 0x0F \
      _asm _emit 0x7D \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
  #define movsldup( dst, src ) \
      _asm _emit 0xF3 \
      _asm _emit 0x0F \
      _asm _emit 0x12 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
  // This is the encoding of the instruction Intel documents as MOVDDUP.
  #define movdldup( dst, src ) \
      _asm _emit 0xF2 \
      _asm _emit 0x0F \
      _asm _emit 0x12 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
  #define movshdup( dst, src ) \
      _asm _emit 0xF3 \
      _asm _emit 0x0F \
      _asm _emit 0x16 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
  // NOTE(review): F2 0F 16 is not among the SSE3 encodings in the Intel instruction set reference
  // ( MOVSHDUP is F3 0F 16, MOVDDUP is F2 0F 12 ); confirm this opcode is valid before using the macro.
  #define movdhdup( dst, src ) \
      _asm _emit 0xF2 \
      _asm _emit 0x0F \
      _asm _emit 0x16 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Load Unaligned Integer 128 bits
  // The instruction only has a memory-source form, so 'src' must be an address, not an _xmmN register.
  #define lddqu( dst, src ) \
      _asm _emit 0xF2 \
      _asm _emit 0x0F \
      _asm _emit 0xF0 \
      _asm _emit ( ( (dst) & 7 ) << 3 ) | src
  // Byte sizes and offsets hard-coded for the assembly in this file.
  // TransformVerts asserts DRAWVERT_SIZE, DRAWVERT_XYZ_OFFSET, JOINTWEIGHT_SIZE and
  // JOINTMAT_SIZE against idDrawVert, idVec4 and idJointMat; the remaining values are
  // presumably the matching member offsets / sizes - verify against the type declarations.
  #define DRAWVERT_SIZE               60      // 15 * 4
  #define DRAWVERT_XYZ_OFFSET         0       //  0 * 4
  #define DRAWVERT_ST_OFFSET          12      //  3 * 4
  #define DRAWVERT_NORMAL_OFFSET      20      //  5 * 4
  #define DRAWVERT_TANGENT0_OFFSET    32      //  8 * 4
  #define DRAWVERT_TANGENT1_OFFSET    44      // 11 * 4
  #define DRAWVERT_COLOR_OFFSET       56      // 14 * 4
  #define JOINTQUAT_SIZE              28      //  7 * 4
  #define JOINTMAT_SIZE               48      //  4 * 3 * 4
  #define JOINTWEIGHT_SIZE            16      //  4 * 4
  187. /*
  188. ============
  189. SSE3_Dot
  190. ============
  191. */
  192. float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
  193. float d;
  194. __asm {
  195. mov esi, v1
  196. mov edi, v2
  197. movaps xmm0, [esi]
  198. mulps xmm0, [edi]
  199. haddps( _xmm0, _xmm0 )
  200. haddps( _xmm0, _xmm0 )
  201. movss d, xmm0
  202. }
  203. return d;
  204. }
  205. /*
  206. ============
  207. idSIMD_SSE3::GetName
  208. ============
  209. */
  210. const char * idSIMD_SSE3::GetName( void ) const {
  211. return "MMX & SSE & SSE2 & SSE3";
  212. }
  213. /*
  214. ============
  215. idSIMD_SSE3::TransformVerts
  216. ============
  217. */
  218. void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
  219. #if 1
  220. assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
  221. assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
  222. assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
  223. assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
  224. __asm
  225. {
  226. mov eax, numVerts
  227. test eax, eax
  228. jz done
  229. imul eax, DRAWVERT_SIZE
  230. mov ecx, verts
  231. mov edx, index
  232. mov esi, weights
  233. mov edi, joints
  234. add ecx, eax
  235. neg eax
  236. loopVert:
  237. mov ebx, [edx]
  238. movaps xmm2, [esi]
  239. add edx, 8
  240. movaps xmm0, xmm2
  241. add esi, JOINTWEIGHT_SIZE
  242. movaps xmm1, xmm2
  243. mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
  244. mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
  245. mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
  246. cmp dword ptr [edx-4], 0
  247. jne doneWeight
  248. loopWeight:
  249. mov ebx, [edx]
  250. movaps xmm5, [esi]
  251. add edx, 8
  252. movaps xmm3, xmm5
  253. add esi, JOINTWEIGHT_SIZE
  254. movaps xmm4, xmm5
  255. mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
  256. mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
  257. mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
  258. cmp dword ptr [edx-4], 0
  259. addps xmm0, xmm3
  260. addps xmm1, xmm4
  261. addps xmm2, xmm5
  262. je loopWeight
  263. doneWeight:
  264. add eax, DRAWVERT_SIZE
  265. haddps( _xmm0, _xmm1 )
  266. haddps( _xmm2, _xmm0 )
  267. movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2
  268. haddps( _xmm2, _xmm2 )
  269. movss [ecx+eax-DRAWVERT_SIZE+8], xmm2
  270. jl loopVert
  271. done:
  272. }
  273. #else
  274. int i, j;
  275. const byte *jointsPtr = (byte *)joints;
  276. for( j = i = 0; i < numVerts; i++ ) {
  277. idVec3 v;
  278. v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
  279. while( index[j*2+1] == 0 ) {
  280. j++;
  281. v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
  282. }
  283. j++;
  284. verts[i].xyz = v;
  285. }
  286. #endif
  287. }
  288. #endif /* _WIN32 */