bvh_builder_morton.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "../common/builder.h"
  5. #include "../../common/algorithms/parallel_reduce.h"
  6. namespace embree
  7. {
  8. namespace isa
  9. {
  10. struct BVHBuilderMorton
  11. {
  12. static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
  13. static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
  14. /*! settings for morton builder */
  15. struct Settings
  16. {
  17. /*! default settings */
  18. Settings ()
  19. : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
  20. /*! initialize settings from API settings */
  21. Settings (const RTCBuildArguments& settings)
  22. : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
  23. {
  24. if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
  25. if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
  26. if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
  27. if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
  28. minLeafSize = min(minLeafSize,maxLeafSize);
  29. }
  30. Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
  31. : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
  32. {
  33. minLeafSize = min(minLeafSize,maxLeafSize);
  34. }
  35. public:
  36. size_t branchingFactor; //!< branching factor of BVH to build
  37. size_t maxDepth; //!< maximum depth of BVH to build
  38. size_t minLeafSize; //!< minimum size of a leaf
  39. size_t maxLeafSize; //!< maximum size of a leaf
  40. size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
  41. };
  42. /*! Build primitive consisting of morton code and primitive ID. */
  43. struct __aligned(8) BuildPrim
  44. {
  45. union {
  46. struct {
  47. unsigned int code; //!< morton code
  48. unsigned int index; //!< i'th primitive
  49. };
  50. uint64_t t;
  51. };
  52. /*! interface for radix sort */
  53. __forceinline operator unsigned() const { return code; }
  54. /*! interface for standard sort */
  55. __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
  56. };
  57. /*! maps bounding box to morton code */
  58. struct MortonCodeMapping
  59. {
  60. static const size_t LATTICE_BITS_PER_DIM = 10;
  61. static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
  62. vfloat4 base;
  63. vfloat4 scale;
  64. __forceinline MortonCodeMapping(const BBox3fa& bounds)
  65. {
  66. base = (vfloat4)bounds.lower;
  67. const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
  68. scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
  69. }
  70. __forceinline const vint4 bin (const BBox3fa& box) const
  71. {
  72. const vfloat4 lower = (vfloat4)box.lower;
  73. const vfloat4 upper = (vfloat4)box.upper;
  74. const vfloat4 centroid = lower+upper;
  75. return vint4((centroid-base)*scale);
  76. }
  77. __forceinline unsigned int code (const BBox3fa& box) const
  78. {
  79. const vint4 binID = bin(box);
  80. const unsigned int x = extract<0>(binID);
  81. const unsigned int y = extract<1>(binID);
  82. const unsigned int z = extract<2>(binID);
  83. const unsigned int xyz = bitInterleave(x,y,z);
  84. return xyz;
  85. }
  86. };
  87. #if defined (__AVX2__)
  88. /*! for AVX2 there is a fast scalar bitInterleave */
  89. struct MortonCodeGenerator
  90. {
  91. __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
  92. : mapping(mapping), dest(dest) {}
  93. __forceinline void operator() (const BBox3fa& b, const unsigned index)
  94. {
  95. dest->index = index;
  96. dest->code = mapping.code(b);
  97. dest++;
  98. }
  99. public:
  100. const MortonCodeMapping mapping;
  101. BuildPrim* dest;
  102. size_t currentID;
  103. };
  104. #else
  105. /*! before AVX2 is it better to use the SSE version of bitInterleave */
  106. struct MortonCodeGenerator
  107. {
  108. __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
  109. : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
  110. __forceinline ~MortonCodeGenerator()
  111. {
  112. if (slots != 0)
  113. {
  114. const vint4 code = bitInterleave(ax,ay,az);
  115. for (size_t i=0; i<slots; i++) {
  116. dest[currentID-slots+i].index = ai[i];
  117. dest[currentID-slots+i].code = code[i];
  118. }
  119. }
  120. }
  121. __forceinline void operator() (const BBox3fa& b, const unsigned index)
  122. {
  123. const vint4 binID = mapping.bin(b);
  124. ax[slots] = extract<0>(binID);
  125. ay[slots] = extract<1>(binID);
  126. az[slots] = extract<2>(binID);
  127. ai[slots] = index;
  128. slots++;
  129. currentID++;
  130. if (slots == 4)
  131. {
  132. const vint4 code = bitInterleave(ax,ay,az);
  133. vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
  134. vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
  135. slots = 0;
  136. }
  137. }
  138. public:
  139. const MortonCodeMapping mapping;
  140. BuildPrim* dest;
  141. size_t currentID;
  142. size_t slots;
  143. vint4 ax, ay, az, ai;
  144. };
  145. #endif
  146. template<
  147. typename ReductionTy,
  148. typename Allocator,
  149. typename CreateAllocator,
  150. typename CreateNodeFunc,
  151. typename SetNodeBoundsFunc,
  152. typename CreateLeafFunc,
  153. typename CalculateBounds,
  154. typename ProgressMonitor>
  155. class BuilderT : private Settings
  156. {
  157. ALIGNED_CLASS_(16);
  158. public:
  159. BuilderT (CreateAllocator& createAllocator,
  160. CreateNodeFunc& createNode,
  161. SetNodeBoundsFunc& setBounds,
  162. CreateLeafFunc& createLeaf,
  163. CalculateBounds& calculateBounds,
  164. ProgressMonitor& progressMonitor,
  165. const Settings& settings)
  166. : Settings(settings),
  167. createAllocator(createAllocator),
  168. createNode(createNode),
  169. setBounds(setBounds),
  170. createLeaf(createLeaf),
  171. calculateBounds(calculateBounds),
  172. progressMonitor(progressMonitor),
  173. morton(nullptr) {}
  174. ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
  175. {
  176. /* this should never occur but is a fatal error */
  177. if (depth > maxDepth)
  178. throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
  179. /* create leaf for few primitives */
  180. if (current.size() <= maxLeafSize)
  181. return createLeaf(current,alloc);
  182. /* fill all children by always splitting the largest one */
  183. range<unsigned> children[MAX_BRANCHING_FACTOR];
  184. size_t numChildren = 1;
  185. children[0] = current;
  186. do {
  187. /* find best child with largest number of primitives */
  188. size_t bestChild = -1;
  189. size_t bestSize = 0;
  190. for (size_t i=0; i<numChildren; i++)
  191. {
  192. /* ignore leaves as they cannot get split */
  193. if (children[i].size() <= maxLeafSize)
  194. continue;
  195. /* remember child with largest size */
  196. if (children[i].size() > bestSize) {
  197. bestSize = children[i].size();
  198. bestChild = i;
  199. }
  200. }
  201. if (bestChild == size_t(-1)) break;
  202. /*! split best child into left and right child */
  203. auto split = children[bestChild].split();
  204. /* add new children left and right */
  205. children[bestChild] = children[numChildren-1];
  206. children[numChildren-1] = split.first;
  207. children[numChildren+0] = split.second;
  208. numChildren++;
  209. } while (numChildren < branchingFactor);
  210. /* create node */
  211. auto node = createNode(alloc,numChildren);
  212. /* recurse into each child */
  213. ReductionTy bounds[MAX_BRANCHING_FACTOR];
  214. for (size_t i=0; i<numChildren; i++)
  215. bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
  216. return setBounds(node,bounds,numChildren);
  217. }
  218. /*! recreates morton codes when reaching a region where all codes are identical */
  219. __noinline void recreateMortonCodes(const range<unsigned>& current) const
  220. {
  221. /* fast path for small ranges */
  222. if (likely(current.size() < 1024))
  223. {
  224. /*! recalculate centroid bounds */
  225. BBox3fa centBounds(empty);
  226. for (size_t i=current.begin(); i<current.end(); i++)
  227. centBounds.extend(center2(calculateBounds(morton[i])));
  228. /* recalculate morton codes */
  229. MortonCodeMapping mapping(centBounds);
  230. for (size_t i=current.begin(); i<current.end(); i++)
  231. morton[i].code = mapping.code(calculateBounds(morton[i]));
  232. /* sort morton codes */
  233. std::sort(morton+current.begin(),morton+current.end());
  234. }
  235. else
  236. {
  237. /*! recalculate centroid bounds */
  238. auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
  239. BBox3fa centBounds = empty;
  240. for (size_t i=r.begin(); i<r.end(); i++)
  241. centBounds.extend(center2(calculateBounds(morton[i])));
  242. return centBounds;
  243. };
  244. const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
  245. BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
  246. /* recalculate morton codes */
  247. MortonCodeMapping mapping(centBounds);
  248. parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
  249. for (size_t i=r.begin(); i<r.end(); i++) {
  250. morton[i].code = mapping.code(calculateBounds(morton[i]));
  251. }
  252. });
  253. /*! sort morton codes */
  254. #if defined(TASKING_TBB)
  255. tbb::parallel_sort(morton+current.begin(),morton+current.end());
  256. #else
  257. radixsort32(morton+current.begin(),current.size());
  258. #endif
  259. }
  260. }
  261. __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
  262. {
  263. const unsigned int code_start = morton[current.begin()].code;
  264. const unsigned int code_end = morton[current.end()-1].code;
  265. unsigned int bitpos = lzcnt(code_start^code_end);
  266. /* if all items mapped to same morton code, then re-create new morton codes for the items */
  267. if (unlikely(bitpos == 32))
  268. {
  269. recreateMortonCodes(current);
  270. const unsigned int code_start = morton[current.begin()].code;
  271. const unsigned int code_end = morton[current.end()-1].code;
  272. bitpos = lzcnt(code_start^code_end);
  273. /* if the morton code is still the same, goto fall back split */
  274. if (unlikely(bitpos == 32)) {
  275. current.split(left,right);
  276. return;
  277. }
  278. }
  279. /* split the items at the topmost different morton code bit */
  280. const unsigned int bitpos_diff = 31-bitpos;
  281. const unsigned int bitmask = 1 << bitpos_diff;
  282. /* find location where bit differs using binary search */
  283. unsigned begin = current.begin();
  284. unsigned end = current.end();
  285. while (begin + 1 != end) {
  286. const unsigned mid = (begin+end)/2;
  287. const unsigned bit = morton[mid].code & bitmask;
  288. if (bit == 0) begin = mid; else end = mid;
  289. }
  290. unsigned center = end;
  291. #if defined(DEBUG)
  292. for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);
  293. for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
  294. #endif
  295. left = make_range(current.begin(),center);
  296. right = make_range(center,current.end());
  297. }
  298. ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
  299. {
  300. /* get thread local allocator */
  301. if (!alloc)
  302. alloc = createAllocator();
  303. /* call memory monitor function to signal progress */
  304. if (toplevel && current.size() <= singleThreadThreshold)
  305. progressMonitor(current.size());
  306. /* create leaf node */
  307. if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
  308. return createLargeLeaf(depth,current,alloc);
  309. /* fill all children by always splitting the one with the largest surface area */
  310. range<unsigned> children[MAX_BRANCHING_FACTOR];
  311. split(current,children[0],children[1]);
  312. size_t numChildren = 2;
  313. while (numChildren < branchingFactor)
  314. {
  315. /* find best child with largest number of primitives */
  316. int bestChild = -1;
  317. unsigned bestItems = 0;
  318. for (unsigned int i=0; i<numChildren; i++)
  319. {
  320. /* ignore leaves as they cannot get split */
  321. if (children[i].size() <= minLeafSize)
  322. continue;
  323. /* remember child with largest area */
  324. if (children[i].size() > bestItems) {
  325. bestItems = children[i].size();
  326. bestChild = i;
  327. }
  328. }
  329. if (bestChild == -1) break;
  330. /*! split best child into left and right child */
  331. range<unsigned> left, right;
  332. split(children[bestChild],left,right);
  333. /* add new children left and right */
  334. children[bestChild] = children[numChildren-1];
  335. children[numChildren-1] = left;
  336. children[numChildren+0] = right;
  337. numChildren++;
  338. }
  339. /* create leaf node if no split is possible */
  340. if (unlikely(numChildren == 1))
  341. return createLeaf(current,alloc);
  342. /* allocate node */
  343. auto node = createNode(alloc,numChildren);
  344. /* process top parts of tree parallel */
  345. ReductionTy bounds[MAX_BRANCHING_FACTOR];
  346. if (current.size() > singleThreadThreshold)
  347. {
  348. /*! parallel_for is faster than spawning sub-tasks */
  349. parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
  350. for (size_t i=r.begin(); i<r.end(); i++) {
  351. bounds[i] = recurse(depth+1,children[i],nullptr,true);
  352. _mm_mfence(); // to allow non-temporal stores during build
  353. }
  354. });
  355. }
  356. /* finish tree sequentially */
  357. else
  358. {
  359. for (size_t i=0; i<numChildren; i++)
  360. bounds[i] = recurse(depth+1,children[i],alloc,false);
  361. }
  362. return setBounds(node,bounds,numChildren);
  363. }
  364. /* build function */
  365. ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
  366. {
  367. /* sort morton codes */
  368. morton = src;
  369. radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
  370. /* build BVH */
  371. const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
  372. _mm_mfence(); // to allow non-temporal stores during build
  373. return root;
  374. }
  375. public:
  376. CreateAllocator& createAllocator;
  377. CreateNodeFunc& createNode;
  378. SetNodeBoundsFunc& setBounds;
  379. CreateLeafFunc& createLeaf;
  380. CalculateBounds& calculateBounds;
  381. ProgressMonitor& progressMonitor;
  382. public:
  383. BuildPrim* morton;
  384. };
  385. template<
  386. typename ReductionTy,
  387. typename CreateAllocFunc,
  388. typename CreateNodeFunc,
  389. typename SetBoundsFunc,
  390. typename CreateLeafFunc,
  391. typename CalculateBoundsFunc,
  392. typename ProgressMonitor>
  393. static ReductionTy build(CreateAllocFunc createAllocator,
  394. CreateNodeFunc createNode,
  395. SetBoundsFunc setBounds,
  396. CreateLeafFunc createLeaf,
  397. CalculateBoundsFunc calculateBounds,
  398. ProgressMonitor progressMonitor,
  399. BuildPrim* src,
  400. BuildPrim* tmp,
  401. size_t numPrimitives,
  402. const Settings& settings)
  403. {
  404. typedef BuilderT<
  405. ReductionTy,
  406. decltype(createAllocator()),
  407. CreateAllocFunc,
  408. CreateNodeFunc,
  409. SetBoundsFunc,
  410. CreateLeafFunc,
  411. CalculateBoundsFunc,
  412. ProgressMonitor> Builder;
  413. Builder builder(createAllocator,
  414. createNode,
  415. setBounds,
  416. createLeaf,
  417. calculateBounds,
  418. progressMonitor,
  419. settings);
  420. return builder.build(src,tmp,numPrimitives);
  421. }
  422. };
  423. }
  424. }