MaskedOcclusionCulling.cpp

////////////////////////////////////////////////////////////////////////////////
// Copyright 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
////////////////////////////////////////////////////////////////////////////////

#include <vector>
#include <string.h>
#include <assert.h>
#include <float.h>
#include "MaskedOcclusionCulling.h"
#include "CompilerSpecific.inl"

#if MOC_RECORDER_ENABLE
#include "FrameRecorder.h"
#endif

#if defined(__AVX__) || defined(__AVX2__)
// For performance reasons, the MaskedOcclusionCullingAVX2/512.cpp files should be compiled with VEX encoding for SSE instructions (to avoid
// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, this file
// _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to use the lowest supported target platform
// (/arch:SSE2) as the project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
#error MaskedOcclusionCulling.cpp must be compiled with the lowest supported target platform, e.g. /arch:SSE2
#endif
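// Illustrative (assumed) equivalent for gcc/clang builds: keep AVX code generation off for this translation
// unit and enable it only for the elevated files, e.g.
//
//   g++ -c -msse2 MaskedOcclusionCulling.cpp
//   g++ -c -mavx2 -mfma MaskedOcclusionCullingAVX2.cpp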
static MaskedOcclusionCulling::Implementation DetectCPUFeatures(MaskedOcclusionCulling::pfnAlignedAlloc alignedAlloc, MaskedOcclusionCulling::pfnAlignedFree alignedFree)
{
    struct CpuInfo { int regs[4]; };

    // Get regular CPUID values
    int regs[4];
    __cpuidex(regs, 0, 0);

    // MOCVectorAllocator<CpuInfo> mocalloc( alignedAlloc, alignedFree );
    // std::vector<CpuInfo, MOCVectorAllocator<CpuInfo>> cpuId( mocalloc ), cpuIdEx( mocalloc );
    // cpuId.resize( regs[0] );
    size_t cpuIdCount = regs[0];
    CpuInfo * cpuId = (CpuInfo*)alignedAlloc( 64, sizeof(CpuInfo) * cpuIdCount );
    for (size_t i = 0; i < cpuIdCount; ++i)
        __cpuidex(cpuId[i].regs, (int)i, 0);

    // Get extended CPUID values
    __cpuidex(regs, 0x80000000, 0);

    //cpuIdEx.resize(regs[0] - 0x80000000);
    size_t cpuIdExCount = regs[0] - 0x80000000;
    CpuInfo * cpuIdEx = (CpuInfo*)alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdExCount );
    for (size_t i = 0; i < cpuIdExCount; ++i)
        __cpuidex(cpuIdEx[i].regs, 0x80000000 + (int)i, 0);

    #define TEST_BITS(A, B) (((A) & (B)) == (B))
    #define TEST_FMA_MOVE_OXSAVE (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 12) | (1 << 22) | (1 << 27)))
    #define TEST_LZCNT (cpuIdExCount >= 1 && TEST_BITS(cpuIdEx[1].regs[2], 0x20))
    #define TEST_SSE41 (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 19)))
    #define TEST_XMM_YMM (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 2) | (1 << 1)))
    #define TEST_OPMASK_ZMM (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 7) | (1 << 6) | (1 << 5)))
    #define TEST_BMI1_BMI2_AVX2 (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 3) | (1 << 5) | (1 << 8)))
    #define TEST_AVX512_F_BW_DQ (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 16) | (1 << 17) | (1 << 30)))
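    // The bits tested above are the standard CPUID/XCR0 feature flags:
    //  - leaf 1, ECX:            bit 12 = FMA, bit 22 = MOVBE, bit 27 = OSXSAVE, bit 19 = SSE4.1
    //  - leaf 0x80000001, ECX:   bit 5 = LZCNT
    //  - leaf 7 (ECX=0), EBX:    bit 3 = BMI1, bit 5 = AVX2, bit 8 = BMI2,
    //                            bit 16 = AVX-512F, bit 17 = AVX-512DQ, bit 30 = AVX-512BW
    //  - XCR0 via _xgetbv(0):    bit 1 = XMM state, bit 2 = YMM state, bits 5-7 = opmask/ZMM state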
    MaskedOcclusionCulling::Implementation retVal = MaskedOcclusionCulling::SSE2;
    if (TEST_FMA_MOVE_OXSAVE && TEST_LZCNT && TEST_SSE41)
    {
        if (TEST_XMM_YMM && TEST_OPMASK_ZMM && TEST_BMI1_BMI2_AVX2 && TEST_AVX512_F_BW_DQ)
            retVal = MaskedOcclusionCulling::AVX512;
        else if (TEST_XMM_YMM && TEST_BMI1_BMI2_AVX2)
            retVal = MaskedOcclusionCulling::AVX2;
    }
    else if (TEST_SSE41)
        retVal = MaskedOcclusionCulling::SSE41;

    alignedFree( cpuId );
    alignedFree( cpuIdEx );

    return retVal;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Utility functions (not directly related to the algorithm/rasterizer)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

void MaskedOcclusionCulling::TransformVertices(const float *mtx, const float *inVtx, float *xfVtx, unsigned int nVtx, const VertexLayout &vtxLayout)
{
    // This function is pretty slow, about 10-20% slower than if the vertices are stored in aligned SOA form.
    if (nVtx == 0)
        return;

    // Load matrix and swizzle out the z component. For post-multiplication (OGL), the matrix is assumed to be column
    // major, with one column per SSE register. For pre-multiplication (DX), the matrix is assumed to be row major.
    __m128 mtxCol0 = _mm_loadu_ps(mtx);
    __m128 mtxCol1 = _mm_loadu_ps(mtx + 4);
    __m128 mtxCol2 = _mm_loadu_ps(mtx + 8);
    __m128 mtxCol3 = _mm_loadu_ps(mtx + 12);

    int stride = vtxLayout.mStride;
    const char *vPtr = (const char *)inVtx;
    float *outPtr = xfVtx;

    // Iterate through all vertices and transform
    for (unsigned int vtx = 0; vtx < nVtx; ++vtx)
    {
        __m128 xVal = _mm_load1_ps((float*)(vPtr));
        __m128 yVal = _mm_load1_ps((float*)(vPtr + vtxLayout.mOffsetY));
        __m128 zVal = _mm_load1_ps((float*)(vPtr + vtxLayout.mOffsetZ));

        __m128 xform = _mm_add_ps(_mm_mul_ps(mtxCol0, xVal), _mm_add_ps(_mm_mul_ps(mtxCol1, yVal), _mm_add_ps(_mm_mul_ps(mtxCol2, zVal), mtxCol3)));
        _mm_storeu_ps(outPtr, xform);
        vPtr += stride;
        outPtr += 4;
    }
}
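// Usage sketch (illustrative only; the buffer/variable names and layout values below are assumptions,
// not part of the API): transforming an interleaved buffer of { float x, y, z; } positions with a
// row-major (DX-style) matrix into 4-component clip-space output.
//
//   MaskedOcclusionCulling::VertexLayout layout;
//   layout.mStride  = 12;    // sizeof(float) * 3
//   layout.mOffsetY = 4;     // byte offset of y within a vertex
//   layout.mOffsetZ = 8;     // byte offset of z within a vertex
//
//   std::vector<float> clipSpaceVerts(numVerts * 4);
//   MaskedOcclusionCulling::TransformVertices(viewProj, &positions[0].x, clipSpaceVerts.data(), numVerts, layout);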
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Typedefs
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common SSE2/SSE4.1 defines
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define SIMD_LANES 4
#define TILE_HEIGHT_SHIFT 2

#define SIMD_LANE_IDX _mm_setr_epi32(0, 1, 2, 3)

#define SIMD_SUB_TILE_COL_OFFSET _mm_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
#define SIMD_SUB_TILE_ROW_OFFSET _mm_setzero_si128()
#define SIMD_SUB_TILE_COL_OFFSET_F _mm_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
#define SIMD_SUB_TILE_ROW_OFFSET_F _mm_setzero_ps()

#define SIMD_LANE_YCOORD_I _mm_setr_epi32(128, 384, 640, 896)
#define SIMD_LANE_YCOORD_F _mm_setr_ps(128.0f, 384.0f, 640.0f, 896.0f)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common SSE2/SSE4.1 functions
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

typedef __m128 __mw;
typedef __m128i __mwi;

#define _mmw_set1_ps _mm_set1_ps
#define _mmw_setzero_ps _mm_setzero_ps
#define _mmw_and_ps _mm_and_ps
#define _mmw_or_ps _mm_or_ps
#define _mmw_xor_ps _mm_xor_ps
#define _mmw_not_ps(a) _mm_xor_ps((a), _mm_castsi128_ps(_mm_set1_epi32(~0)))
#define _mmw_andnot_ps _mm_andnot_ps
#define _mmw_neg_ps(a) _mm_xor_ps((a), _mm_set1_ps(-0.0f))
#define _mmw_abs_ps(a) _mm_and_ps((a), _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)))
#define _mmw_add_ps _mm_add_ps
#define _mmw_sub_ps _mm_sub_ps
#define _mmw_mul_ps _mm_mul_ps
#define _mmw_div_ps _mm_div_ps
#define _mmw_min_ps _mm_min_ps
#define _mmw_max_ps _mm_max_ps
#define _mmw_movemask_ps _mm_movemask_ps
#define _mmw_cmpge_ps(a,b) _mm_cmpge_ps(a, b)
#define _mmw_cmpgt_ps(a,b) _mm_cmpgt_ps(a, b)
#define _mmw_cmpeq_ps(a,b) _mm_cmpeq_ps(a, b)
#define _mmw_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
#define _mmw_fmsub_ps(a,b,c) _mm_sub_ps(_mm_mul_ps(a,b), c)
#define _mmw_shuffle_ps _mm_shuffle_ps
#define _mmw_insertf32x4_ps(a,b,c) (b)
#define _mmw_cvtepi32_ps _mm_cvtepi32_ps
#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))

#define _mmw_set1_epi32 _mm_set1_epi32
#define _mmw_setzero_epi32 _mm_setzero_si128
#define _mmw_and_epi32 _mm_and_si128
#define _mmw_or_epi32 _mm_or_si128
#define _mmw_xor_epi32 _mm_xor_si128
#define _mmw_not_epi32(a) _mm_xor_si128((a), _mm_set1_epi32(~0))
#define _mmw_andnot_epi32 _mm_andnot_si128
#define _mmw_neg_epi32(a) _mm_sub_epi32(_mm_set1_epi32(0), (a))
#define _mmw_add_epi32 _mm_add_epi32
#define _mmw_sub_epi32 _mm_sub_epi32
#define _mmw_subs_epu16 _mm_subs_epu16
#define _mmw_cmpeq_epi32 _mm_cmpeq_epi32
#define _mmw_cmpgt_epi32 _mm_cmpgt_epi32
#define _mmw_srai_epi32 _mm_srai_epi32
#define _mmw_srli_epi32 _mm_srli_epi32
#define _mmw_slli_epi32 _mm_slli_epi32
#define _mmw_cvtps_epi32 _mm_cvtps_epi32
#define _mmw_cvttps_epi32 _mm_cvttps_epi32

#define _mmx_fmadd_ps _mmw_fmadd_ps
#define _mmx_max_epi32 _mmw_max_epi32
#define _mmx_min_epi32 _mmw_min_epi32
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMD casting functions
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
template<> FORCE_INLINE __m128  simd_cast<__m128>(float A)    { return _mm_set1_ps(A); }
template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A)  { return _mm_castsi128_ps(A); }
template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A)   { return A; }
template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A)     { return _mm_set1_epi32(A); }
template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A)  { return _mm_castps_si128(A); }
template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }

#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
    FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
        union accessor { simd_type m_native; base_type m_array[elements]; }; \
        is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
        return acs->m_array; \
    }

MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
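// Example (illustrative): the accessors view a SIMD register as a small array, so individual lanes can be
// read or written with plain indexing, e.g.
//
//   __m128 v = _mm_setzero_ps();
//   simd_f32(v)[2] = 1.0f;    // write lane 2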
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Specialized SSE input assembly function for general vertex gather
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

FORCE_INLINE void GatherVertices(__m128 *vtxX, __m128 *vtxY, __m128 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
{
    for (int lane = 0; lane < numLanes; lane++)
    {
        for (int i = 0; i < 3; i++)
        {
            char *vPtrX = (char *)inVtx + inTrisPtr[lane * 3 + i] * vtxLayout.mStride;
            char *vPtrY = vPtrX + vtxLayout.mOffsetY;
            char *vPtrW = vPtrX + vtxLayout.mOffsetW;

            simd_f32(vtxX[i])[lane] = *((float*)vPtrX);
            simd_f32(vtxY[i])[lane] = *((float*)vPtrY);
            simd_f32(vtxW[i])[lane] = *((float*)vPtrW);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// SSE4.1 version
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace MaskedOcclusionCullingSSE41
{
    FORCE_INLINE __m128i _mmw_mullo_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); }
    FORCE_INLINE __m128i _mmw_min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); }
    FORCE_INLINE __m128i _mmw_max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); }
    FORCE_INLINE __m128i _mmw_abs_epi32(const __m128i &a) { return _mm_abs_epi32(a); }
    FORCE_INLINE __m128 _mmw_blendv_ps(const __m128 &a, const __m128 &b, const __m128 &c) { return _mm_blendv_ps(a, b, c); }
    FORCE_INLINE int _mmw_testz_epi32(const __m128i &a, const __m128i &b) { return _mm_testz_si128(a, b); }
    FORCE_INLINE __m128 _mmx_dp4_ps(const __m128 &a, const __m128 &b) { return _mm_dp_ps(a, b, 0xFF); }
    FORCE_INLINE __m128 _mmw_floor_ps(const __m128 &a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
    FORCE_INLINE __m128 _mmw_ceil_ps(const __m128 &a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }

    FORCE_INLINE __m128i _mmw_transpose_epi8(const __m128i &a)
    {
        const __m128i shuff = _mm_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
        return _mm_shuffle_epi8(a, shuff);
    }
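    // Worked example: viewing the 16 bytes as a 4x4 matrix (one row per 32-bit lane), the shuffle above
    // gathers byte 0 of every lane first, then byte 1, and so on, i.e. it transposes the matrix:
    //
    //   input  bytes: a0 a1 a2 a3 | b0 b1 b2 b3 | c0 c1 c2 c3 | d0 d1 d2 d3
    //   output bytes: a0 b0 c0 d0 | a1 b1 c1 d1 | a2 b2 c2 d2 | a3 b3 c3 d3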
    FORCE_INLINE __m128i _mmw_sllv_ones(const __m128i &ishift)
    {
        __m128i shift = _mm_min_epi32(ishift, _mm_set1_epi32(32));

        // Uses lookup tables and _mm_shuffle_epi8 to perform _mm_sllv_epi32(~0, shift)
        const __m128i byteShiftLUT = _mm_setr_epi8((char)0xFF, (char)0xFE, (char)0xFC, (char)0xF8, (char)0xF0, (char)0xE0, (char)0xC0, (char)0x80, 0, 0, 0, 0, 0, 0, 0, 0);
        const __m128i byteShiftOffset = _mm_setr_epi8(0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24);
        const __m128i byteShiftShuffle = _mm_setr_epi8(0x0, 0x0, 0x0, 0x0, 0x4, 0x4, 0x4, 0x4, 0x8, 0x8, 0x8, 0x8, 0xC, 0xC, 0xC, 0xC);

        __m128i byteShift = _mm_shuffle_epi8(shift, byteShiftShuffle);
        byteShift = _mm_min_epi8(_mm_subs_epu8(byteShift, byteShiftOffset), _mm_set1_epi8(8));
        __m128i retMask = _mm_shuffle_epi8(byteShiftLUT, byteShift);

        return retMask;
    }
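    // Worked example: each 32-bit lane is handled one byte at a time; the per-byte shift amount is reduced
    // to the range [0, 8] and looked up in byteShiftLUT. For shift = {0, 4, 31, 32} the returned masks are
    // {0xFFFFFFFF, 0xFFFFFFF0, 0x80000000, 0x00000000} - the same result as _mm_sllv_epi32(~0, shift).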
    static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE41;

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Include common algorithm implementation (general, SIMD independent code)
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    #include "MaskedOcclusionCullingCommon.inl"

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Utility function to create a new object using the allocator callbacks
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
    {
        MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
        new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
        return object;
    }
};
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// SSE2 version
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace MaskedOcclusionCullingSSE2
{
    FORCE_INLINE __m128i _mmw_mullo_epi32(const __m128i &a, const __m128i &b)
    {
        // Do products for even / odd lanes & merge the result
        __m128i even = _mm_and_si128(_mm_mul_epu32(a, b), _mm_setr_epi32(~0, 0, ~0, 0));
        __m128i odd = _mm_slli_epi64(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 32);
        return _mm_or_si128(even, odd);
    }

    FORCE_INLINE __m128i _mmw_min_epi32(const __m128i &a, const __m128i &b)
    {
        __m128i cond = _mm_cmpgt_epi32(a, b);
        return _mm_or_si128(_mm_andnot_si128(cond, a), _mm_and_si128(cond, b));
    }

    FORCE_INLINE __m128i _mmw_max_epi32(const __m128i &a, const __m128i &b)
    {
        __m128i cond = _mm_cmpgt_epi32(b, a);
        return _mm_or_si128(_mm_andnot_si128(cond, a), _mm_and_si128(cond, b));
    }

    FORCE_INLINE __m128i _mmw_abs_epi32(const __m128i &a)
    {
        __m128i mask = _mm_cmplt_epi32(a, _mm_setzero_si128());
        return _mm_add_epi32(_mm_xor_si128(a, mask), _mm_srli_epi32(mask, 31));
    }

    FORCE_INLINE int _mmw_testz_epi32(const __m128i &a, const __m128i &b)
    {
        return _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(a, b), _mm_setzero_si128())) == 0xFFFF;
    }

    FORCE_INLINE __m128 _mmw_blendv_ps(const __m128 &a, const __m128 &b, const __m128 &c)
    {
        __m128 cond = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(c), 31));
        return _mm_or_ps(_mm_andnot_ps(cond, a), _mm_and_ps(cond, b));
    }

    FORCE_INLINE __m128 _mmx_dp4_ps(const __m128 &a, const __m128 &b)
    {
        // Product and two shuffle/add pairs (similar to hadd_ps)
        __m128 prod = _mm_mul_ps(a, b);
        __m128 dp = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)));
        dp = _mm_add_ps(dp, _mm_shuffle_ps(dp, dp, _MM_SHUFFLE(0, 1, 2, 3)));
        return dp;
    }
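    // Reduction, step by step: with prod = {p0, p1, p2, p3}, the first shuffle/add produces
    // {p0+p1, p1+p0, p2+p3, p3+p2}; adding the lane-reversed vector then leaves p0+p1+p2+p3 in every
    // lane, matching _mm_dp_ps(a, b, 0xFF) in the SSE4.1 path.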
    FORCE_INLINE __m128 _mmw_floor_ps(const __m128 &a)
    {
        int originalMode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
        __m128 rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
        _MM_SET_ROUNDING_MODE(originalMode);
        return rounded;
    }

    FORCE_INLINE __m128 _mmw_ceil_ps(const __m128 &a)
    {
        int originalMode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
        __m128 rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
        _MM_SET_ROUNDING_MODE(originalMode);
        return rounded;
    }

    FORCE_INLINE __m128i _mmw_transpose_epi8(const __m128i &a)
    {
        // Perform transpose through two 16->8 bit pack and byte shifts
        __m128i res = a;
        const __m128i mask = _mm_setr_epi8(~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0);
        res = _mm_packus_epi16(_mm_and_si128(res, mask), _mm_srli_epi16(res, 8));
        res = _mm_packus_epi16(_mm_and_si128(res, mask), _mm_srli_epi16(res, 8));
        return res;
    }

    FORCE_INLINE __m128i _mmw_sllv_ones(const __m128i &ishift)
    {
        __m128i shift = _mmw_min_epi32(ishift, _mm_set1_epi32(32));

        // Uses scalar approach to perform _mm_sllv_epi32(~0, shift)
        static const unsigned int maskLUT[33] = {
            ~0U << 0, ~0U << 1, ~0U << 2, ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6, ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10, ~0U << 11, ~0U << 12, ~0U << 13, ~0U << 14, ~0U << 15,
            ~0U << 16, ~0U << 17, ~0U << 18, ~0U << 19, ~0U << 20, ~0U << 21, ~0U << 22, ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26, ~0U << 27, ~0U << 28, ~0U << 29, ~0U << 30, ~0U << 31,
            0U };

        __m128i retMask;
        simd_i32(retMask)[0] = (int)maskLUT[simd_i32(shift)[0]];
        simd_i32(retMask)[1] = (int)maskLUT[simd_i32(shift)[1]];
        simd_i32(retMask)[2] = (int)maskLUT[simd_i32(shift)[2]];
        simd_i32(retMask)[3] = (int)maskLUT[simd_i32(shift)[3]];
        return retMask;
    }
    static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE2;

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Include common algorithm implementation (general, SIMD independent code)
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    #include "MaskedOcclusionCullingCommon.inl"

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Utility function to create a new object using the allocator callbacks
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
    {
        MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
        new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
        return object;
    }
};
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Object construction and allocation
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace MaskedOcclusionCullingAVX512
{
    extern MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
}

namespace MaskedOcclusionCullingAVX2
{
    extern MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
}

MaskedOcclusionCulling *MaskedOcclusionCulling::Create(Implementation RequestedSIMD)
{
    return Create(RequestedSIMD, aligned_alloc, aligned_free);
}

MaskedOcclusionCulling *MaskedOcclusionCulling::Create(Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
{
    MaskedOcclusionCulling *object = nullptr;

    MaskedOcclusionCulling::Implementation impl = DetectCPUFeatures(alignedAlloc, alignedFree);
    if (RequestedSIMD < impl)
        impl = RequestedSIMD;

    // Return best supported version
    if (object == nullptr && impl >= AVX512)
        object = MaskedOcclusionCullingAVX512::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use AVX512 version
    if (object == nullptr && impl >= AVX2)
        object = MaskedOcclusionCullingAVX2::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree);   // Use AVX2 version
    if (object == nullptr && impl >= SSE41)
        object = MaskedOcclusionCullingSSE41::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree);  // Use SSE4.1 version
    if (object == nullptr)
        object = MaskedOcclusionCullingSSE2::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree);   // Use SSE2 (slow) version

    return object;
}

void MaskedOcclusionCulling::Destroy(MaskedOcclusionCulling *moc)
{
    pfnAlignedFree alignedFreeCallback = moc->mAlignedFreeCallback;
    moc->~MaskedOcclusionCulling();
    alignedFreeCallback(moc);
}
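
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Example usage (illustrative sketch only, not part of this translation unit)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
//   // Request the highest SIMD level; Create() clamps it to what DetectCPUFeatures() reports
//   MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(MaskedOcclusionCulling::AVX512);
//
//   // ... render occluders / test occludees through the returned interface ...
//
//   // Destroy() (rather than delete) releases the object through the same aligned-free
//   // callback it was allocated with
//   MaskedOcclusionCulling::Destroy(moc);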