alloc.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "primref.h"

#if defined(APPLE) && defined(__aarch64__)
#include <mutex>
#endif

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }
      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char*  ptr;            //!< pointer to memory block
      size_t cur;            //!< current location of the allocator
      size_t end;            //!< end of the memory block
      size_t allocBlockSize; //!< block size for allocations
      size_t bytesUsed;      //!< number of total bytes allocated
      size_t bytesWasted;    //!< number of bytes wasted
    };
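    /* Illustrative note (not part of the original header): the bump allocation
     * above rounds the current offset up to the requested alignment. For
     * example, with cur = 10 and align = 16, ofs = (16 - 10) & 15 = 6, so the
     * returned pointer ends up at offset 16 and the 6 skipped bytes are
     * accounted as bytesWasted; when cur is already aligned, ofs is 0. */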
    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
#if defined(APPLE) && defined(__aarch64__)
        std::scoped_lock lock(mutex);
#else
        Lock<SpinLock> lock(mutex);
#endif
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
#if defined(APPLE) && defined(__aarch64__)
        std::scoped_lock lock(mutex);
#else
        Lock<SpinLock> lock(mutex);
#endif
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
#if defined(APPLE) && defined(__aarch64__)
      std::mutex mutex;
#else
      SpinLock mutex;                    //!< required as unbind is called from other threads
#endif
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };
    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
        primrefarray(device,0)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
#if defined(APPLE) && defined(__aarch64__)
        std::scoped_lock lock(s_thread_local_allocators_lock);
#else
        Lock<SpinLock> lock(s_thread_local_allocators_lock);
#endif
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
#if defined(APPLE) && defined(__aarch64__)
      std::scoped_lock lock(s_thread_local_allocators_lock);
#else
      Lock<SpinLock> lock(thread_local_allocators_lock);
#endif
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }
    private:
      FastAllocator* allocator;
    };
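    /* Usage sketch (illustrative, not part of the original header): a build
     * thread typically fetches a CachedAllocator once and reuses it for many
     * small allocations; malloc0/malloc1 target the two thread-local blocks
     * (or the same block in single mode). The byte counts below are made up.
     *
     *   FastAllocator::CachedAllocator alloc = fastAllocator.getCachedAllocator();
     *   void* node = alloc.malloc0(128, 64);  // from the first thread-local block
     *   void* leaf = alloc.malloc1(64, 16);   // from the second thread-local block
     */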
    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
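    /* Worked example (illustrative, all numbers assumed): with two thread-local
     * allocators (single_mode_factor = 2), a thread-local block size of 4096
     * bytes and threadLocalAllocOverhead = 20, singleThreadBytes = 2*20*4096 =
     * 163840. A scene estimated at 1 MB for 16384 primitives (64 bytes per
     * primitive) on a 16-thread machine gives ceil(1048576/163840) = 7 < 16, so
     * parallelism is limited: with branchingFactor = 4 the returned threshold
     * is ceil(4*163840/64) = 10240 primitives per single-threaded subtree. */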
    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead */
      slotMask = 0x0;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scenes we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }
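    /* Worked example (illustrative, numbers assumed): for a dynamic build
     * (fast = true, mainAllocOverhead = 8) and bytesEstimated = 8 MB,
     * blockSize = alignSize(8388608/8) = 1048576, so growSize = maxGrowSize =
     * 1 MB. Since 8 MB is not larger than 2*8*1 MB, slotMask stays 0x0 and all
     * threads share one block slot; an estimate of roughly 100 MB (where
     * growSize is clamped to maxAllocationSize) would enable four slots
     * (slotMask = 0x3). */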
    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
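    /* Illustrative note: successive calls return 2, 4, 8, ... (capped at 2^16),
     * so the fallback path in malloc() below requests growSize*2, growSize*4,
     * and so on, until the min() with maxGrowSize caps the block size. */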
    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
#if defined(APPLE) && defined(__aarch64__)
          std::scoped_lock lock(slotMutex[slot]);
#else
          Lock<SpinLock> lock(slotMutex[slot]);
#endif
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
#if defined(APPLE) && defined(__aarch64__)
          std::scoped_lock lock(mutex);
#else
          Lock<SpinLock> lock(mutex);
#endif
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
#if defined(APPLE) && defined(__aarch64__)
      std::scoped_lock lock(mutex);
#else
      Lock<SpinLock> lock(mutex);
#endif
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }

    /* special allocation, used only by the Morton builder, a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }
    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed   += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree   += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree   += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };
    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)
        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)
        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K    : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M    : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }
  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }
        else /* if (atype == SHARED) */ {
        }
      }
      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };
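    /* Layout note (illustrative, assuming a typical LP64 target): the header
     * members occupy 3*sizeof(std::atomic<size_t>) + sizeof(Block*) +
     * sizeof(size_t) = 40 bytes, plus 4 bytes for the enum and 1 for the bool,
     * so align[] adds 64 - 40 - 4 - 1 = 19 padding bytes and data[] starts at
     * a 64-byte offset, which the Block constructor asserts. */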
  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
#if defined(APPLE) && defined(__aarch64__)
    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
#else
    PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
#endif

    bool use_single_mode;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;

    static __thread ThreadLocal2* thread_local_allocator2;
    static SpinLock s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;

#if defined(APPLE) && defined(__aarch64__)
    std::mutex thread_local_allocators_lock;
#else
    SpinLock thread_local_allocators_lock;
#endif
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}
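
// Usage sketch (illustrative, not part of the original header; the surrounding
// builder code and byte counts are assumptions). A builder typically sizes the
// allocator from an estimate, hands per-thread CachedAllocators to its build
// tasks, and releases the thread-local bookkeeping afterwards:
//
//   FastAllocator allocator(device, false /* use alignedMalloc */);
//   allocator.init_estimate(numPrimitives*64);        // rough bytes-per-primitive guess
//   FastAllocator::Create create(&allocator);
//   FastAllocator::CachedAllocator alloc = create();  // called on each build thread
//   void* node = alloc.malloc0(128, 64);              // 64-byte aligned node allocation
//   allocator.cleanup();                              // after the build finishes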