tessellation_cache.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
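
  /* Global counters used to gather cache access/hit/miss/flush statistics for
     debugging; they are only updated where code is wrapped in CACHE_STATS(). */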
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };
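
  /* resize or reset the single global shared tessellation cache */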
  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
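
  /* Per-thread work state: the atomic counter acts as a read lock on the cache
     (incremented while a thread accesses cached data), 'next' links the states
     into a list, and 'allocated' distinguishes dynamically created states from
     the preallocated pool. */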
  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };
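
  /* Shared lazy tessellation cache: one global allocator of 64-byte blocks,
     split into NUM_CACHE_SEGMENTS logical segments. Cached entries are tagged
     with the time at which they were built and become invalid once the cache
     has cycled through the remaining segments since then. */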
  class __aligned(64) SharedLazyTessellationCache
  {
  public:
    static const size_t NUM_CACHE_SEGMENTS              = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;

    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }
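
    /* A Tag packs the byte offset of a cache entry (relative to the start of the
       cache data array, masked by REF_TAG_MASK) into the lower bits and the commit
       time into the bits above COMMIT_INDEX_SHIFT; a value of 0 marks an empty entry. */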
    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }
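
    /* A cache entry pairs a tag with a spinlock that serializes lazy
       construction of the referenced data. */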
    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;                       /* cache memory, organized as 64-byte blocks */
    bool hugepages;                    /* whether the cache memory is backed by huge pages */
    size_t size;                       /* total cache size in bytes */
    size_t maxBlocks;                  /* total number of 64-byte blocks */
    ThreadWorkState *threadWorkState;  /* preallocated per-thread work states */

    __aligned(64) std::atomic<size_t> localTime;               /* local time counter used for cache time stamps */
    __aligned(64) std::atomic<size_t> next_block;              /* index of the next free block */
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;  /* block index at which the current segment ends */
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }
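
    /* Read locking: each render thread increments the counter of its own work
       state while it accesses cached data. Counter values of
       THREAD_BLOCK_ATOMIC_ADD or above indicate that a cache flush/sync phase
       is in progress, in which case lockThreadLoop() backs off and waits. */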
    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed, wait until sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }
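
    /* Looks up a cache entry: if the tag is non-empty and its commit time is
       still valid for the given global time, returns a pointer into the cache
       data array; otherwise returns nullptr. */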
    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }
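
    /* Lookup that lazily builds missing data: the calling thread holds its read
       lock while 'constructor' runs, and the entry's mutex ensures only one
       thread constructs the data. The read lock is still held when this
       function returns, so the caller releases it with unlock() once done with
       the data. Usage sketch (illustrative only; 'entry', 'bytes' and the fill
       code are placeholders, not part of this header):

         float* grid = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () {
           float* ptr = (float*) SharedLazyTessellationCache::malloc(bytes);
           // ... fill ptr with tessellated data ...
           return ptr;
         });
    */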
    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret);
            /* this should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
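
    /* A cache index is valid as long as fewer than NUM_CACHE_SEGMENTS segment
       switches have happened since it was written (or, with FORCE_SIMPLE_FLUSH,
       only if it matches the current time exactly). */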
    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }
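
    /* blocks until the user counter of the given thread work state has dropped
       to 'users' or below */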
    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);
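
    /* allocates 'blocks' 64-byte blocks from the current segment; returns
       (size_t)-1 if the segment does not have enough space left */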
    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }
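
    /* allocates 'bytes' rounded up to whole 64-byte blocks; while the current
       segment is full, the thread drops its read lock, lets allocNextSegment()
       advance the cache, re-acquires the lock and retries */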
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }
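
    /* translates a block index into a pointer into the cache data array
       (one block = 64 bytes = 16 floats) */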
    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16];
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}