123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502 |
- // Copyright 2009-2021 Intel Corporation
- // SPDX-License-Identifier: Apache-2.0
- #pragma once
- #include "../common/builder.h"
- #include "../../common/algorithms/parallel_reduce.h"
- namespace embree
- {
- namespace isa
- {
- struct BVHBuilderMorton
- {
- static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
- static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
- /*! settings for morton builder */
- struct Settings
- {
- /*! default settings */
- Settings ()
- : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
- /*! initialize settings from API settings */
- Settings (const RTCBuildArguments& settings)
- : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
- {
- if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
- if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
- if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
- if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
- minLeafSize = min(minLeafSize,maxLeafSize);
- }
- Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
- : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
- {
- minLeafSize = min(minLeafSize,maxLeafSize);
- }
- public:
- size_t branchingFactor; //!< branching factor of BVH to build
- size_t maxDepth; //!< maximum depth of BVH to build
- size_t minLeafSize; //!< minimum size of a leaf
- size_t maxLeafSize; //!< maximum size of a leaf
- size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
- };
- /*! Build primitive consisting of morton code and primitive ID. */
- struct __aligned(8) BuildPrim
- {
- union {
- struct {
- unsigned int code; //!< morton code
- unsigned int index; //!< i'th primitive
- };
- uint64_t t;
- };
- /*! interface for radix sort */
- __forceinline operator unsigned() const { return code; }
- /*! interface for standard sort */
- __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
- };
- /*! maps bounding box to morton code */
- struct MortonCodeMapping
- {
- static const size_t LATTICE_BITS_PER_DIM = 10;
- static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
- vfloat4 base;
- vfloat4 scale;
- __forceinline MortonCodeMapping(const BBox3fa& bounds)
- {
- base = (vfloat4)bounds.lower;
- const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
- scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
- }
- __forceinline const vint4 bin (const BBox3fa& box) const
- {
- const vfloat4 lower = (vfloat4)box.lower;
- const vfloat4 upper = (vfloat4)box.upper;
- const vfloat4 centroid = lower+upper;
- return vint4((centroid-base)*scale);
- }
- __forceinline unsigned int code (const BBox3fa& box) const
- {
- const vint4 binID = bin(box);
- const unsigned int x = extract<0>(binID);
- const unsigned int y = extract<1>(binID);
- const unsigned int z = extract<2>(binID);
- const unsigned int xyz = bitInterleave(x,y,z);
- return xyz;
- }
- };
- #if defined (__AVX2__)
- /*! for AVX2 there is a fast scalar bitInterleave */
- struct MortonCodeGenerator
- {
- __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
- : mapping(mapping), dest(dest) {}
- __forceinline void operator() (const BBox3fa& b, const unsigned index)
- {
- dest->index = index;
- dest->code = mapping.code(b);
- dest++;
- }
- public:
- const MortonCodeMapping mapping;
- BuildPrim* dest;
- size_t currentID;
- };
- #else
- /*! before AVX2 is it better to use the SSE version of bitInterleave */
- struct MortonCodeGenerator
- {
- __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
- : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
- __forceinline ~MortonCodeGenerator()
- {
- if (slots != 0)
- {
- const vint4 code = bitInterleave(ax,ay,az);
- for (size_t i=0; i<slots; i++) {
- dest[currentID-slots+i].index = ai[i];
- dest[currentID-slots+i].code = code[i];
- }
- }
- }
- __forceinline void operator() (const BBox3fa& b, const unsigned index)
- {
- const vint4 binID = mapping.bin(b);
- ax[slots] = extract<0>(binID);
- ay[slots] = extract<1>(binID);
- az[slots] = extract<2>(binID);
- ai[slots] = index;
- slots++;
- currentID++;
- if (slots == 4)
- {
- const vint4 code = bitInterleave(ax,ay,az);
- vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
- vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
- slots = 0;
- }
- }
- public:
- const MortonCodeMapping mapping;
- BuildPrim* dest;
- size_t currentID;
- size_t slots;
- vint4 ax, ay, az, ai;
- };
- #endif
- template<
- typename ReductionTy,
- typename Allocator,
- typename CreateAllocator,
- typename CreateNodeFunc,
- typename SetNodeBoundsFunc,
- typename CreateLeafFunc,
- typename CalculateBounds,
- typename ProgressMonitor>
- class BuilderT : private Settings
- {
- ALIGNED_CLASS_(16);
- public:
- BuilderT (CreateAllocator& createAllocator,
- CreateNodeFunc& createNode,
- SetNodeBoundsFunc& setBounds,
- CreateLeafFunc& createLeaf,
- CalculateBounds& calculateBounds,
- ProgressMonitor& progressMonitor,
- const Settings& settings)
- : Settings(settings),
- createAllocator(createAllocator),
- createNode(createNode),
- setBounds(setBounds),
- createLeaf(createLeaf),
- calculateBounds(calculateBounds),
- progressMonitor(progressMonitor),
- morton(nullptr) {}
- ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
- {
- /* this should never occur but is a fatal error */
- if (depth > maxDepth)
- throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
- /* create leaf for few primitives */
- if (current.size() <= maxLeafSize)
- return createLeaf(current,alloc);
- /* fill all children by always splitting the largest one */
- range<unsigned> children[MAX_BRANCHING_FACTOR];
- size_t numChildren = 1;
- children[0] = current;
- do {
- /* find best child with largest number of primitives */
- size_t bestChild = -1;
- size_t bestSize = 0;
- for (size_t i=0; i<numChildren; i++)
- {
- /* ignore leaves as they cannot get split */
- if (children[i].size() <= maxLeafSize)
- continue;
- /* remember child with largest size */
- if (children[i].size() > bestSize) {
- bestSize = children[i].size();
- bestChild = i;
- }
- }
- if (bestChild == size_t(-1)) break;
- /*! split best child into left and right child */
- auto split = children[bestChild].split();
- /* add new children left and right */
- children[bestChild] = children[numChildren-1];
- children[numChildren-1] = split.first;
- children[numChildren+0] = split.second;
- numChildren++;
- } while (numChildren < branchingFactor);
- /* create node */
- auto node = createNode(alloc,numChildren);
- /* recurse into each child */
- ReductionTy bounds[MAX_BRANCHING_FACTOR];
- for (size_t i=0; i<numChildren; i++)
- bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
- return setBounds(node,bounds,numChildren);
- }
- /*! recreates morton codes when reaching a region where all codes are identical */
- __noinline void recreateMortonCodes(const range<unsigned>& current) const
- {
- /* fast path for small ranges */
- if (likely(current.size() < 1024))
- {
- /*! recalculate centroid bounds */
- BBox3fa centBounds(empty);
- for (size_t i=current.begin(); i<current.end(); i++)
- centBounds.extend(center2(calculateBounds(morton[i])));
- /* recalculate morton codes */
- MortonCodeMapping mapping(centBounds);
- for (size_t i=current.begin(); i<current.end(); i++)
- morton[i].code = mapping.code(calculateBounds(morton[i]));
- /* sort morton codes */
- std::sort(morton+current.begin(),morton+current.end());
- }
- else
- {
- /*! recalculate centroid bounds */
- auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
- BBox3fa centBounds = empty;
- for (size_t i=r.begin(); i<r.end(); i++)
- centBounds.extend(center2(calculateBounds(morton[i])));
- return centBounds;
- };
- const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
- BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
- /* recalculate morton codes */
- MortonCodeMapping mapping(centBounds);
- parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
- for (size_t i=r.begin(); i<r.end(); i++) {
- morton[i].code = mapping.code(calculateBounds(morton[i]));
- }
- });
- /*! sort morton codes */
- #if defined(TASKING_TBB)
- tbb::parallel_sort(morton+current.begin(),morton+current.end());
- #else
- radixsort32(morton+current.begin(),current.size());
- #endif
- }
- }
- __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
- {
- const unsigned int code_start = morton[current.begin()].code;
- const unsigned int code_end = morton[current.end()-1].code;
- unsigned int bitpos = lzcnt(code_start^code_end);
- /* if all items mapped to same morton code, then re-create new morton codes for the items */
- if (unlikely(bitpos == 32))
- {
- recreateMortonCodes(current);
- const unsigned int code_start = morton[current.begin()].code;
- const unsigned int code_end = morton[current.end()-1].code;
- bitpos = lzcnt(code_start^code_end);
- /* if the morton code is still the same, goto fall back split */
- if (unlikely(bitpos == 32)) {
- current.split(left,right);
- return;
- }
- }
- /* split the items at the topmost different morton code bit */
- const unsigned int bitpos_diff = 31-bitpos;
- const unsigned int bitmask = 1 << bitpos_diff;
- /* find location where bit differs using binary search */
- unsigned begin = current.begin();
- unsigned end = current.end();
- while (begin + 1 != end) {
- const unsigned mid = (begin+end)/2;
- const unsigned bit = morton[mid].code & bitmask;
- if (bit == 0) begin = mid; else end = mid;
- }
- unsigned center = end;
- #if defined(DEBUG)
- for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);
- for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
- #endif
- left = make_range(current.begin(),center);
- right = make_range(center,current.end());
- }
- ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
- {
- /* get thread local allocator */
- if (!alloc)
- alloc = createAllocator();
- /* call memory monitor function to signal progress */
- if (toplevel && current.size() <= singleThreadThreshold)
- progressMonitor(current.size());
- /* create leaf node */
- if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
- return createLargeLeaf(depth,current,alloc);
- /* fill all children by always splitting the one with the largest surface area */
- range<unsigned> children[MAX_BRANCHING_FACTOR];
- split(current,children[0],children[1]);
- size_t numChildren = 2;
- while (numChildren < branchingFactor)
- {
- /* find best child with largest number of primitives */
- int bestChild = -1;
- unsigned bestItems = 0;
- for (unsigned int i=0; i<numChildren; i++)
- {
- /* ignore leaves as they cannot get split */
- if (children[i].size() <= minLeafSize)
- continue;
- /* remember child with largest area */
- if (children[i].size() > bestItems) {
- bestItems = children[i].size();
- bestChild = i;
- }
- }
- if (bestChild == -1) break;
- /*! split best child into left and right child */
- range<unsigned> left, right;
- split(children[bestChild],left,right);
- /* add new children left and right */
- children[bestChild] = children[numChildren-1];
- children[numChildren-1] = left;
- children[numChildren+0] = right;
- numChildren++;
- }
- /* create leaf node if no split is possible */
- if (unlikely(numChildren == 1))
- return createLeaf(current,alloc);
- /* allocate node */
- auto node = createNode(alloc,numChildren);
- /* process top parts of tree parallel */
- ReductionTy bounds[MAX_BRANCHING_FACTOR];
- if (current.size() > singleThreadThreshold)
- {
- /*! parallel_for is faster than spawning sub-tasks */
- parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
- for (size_t i=r.begin(); i<r.end(); i++) {
- bounds[i] = recurse(depth+1,children[i],nullptr,true);
- _mm_mfence(); // to allow non-temporal stores during build
- }
- });
- }
- /* finish tree sequentially */
- else
- {
- for (size_t i=0; i<numChildren; i++)
- bounds[i] = recurse(depth+1,children[i],alloc,false);
- }
- return setBounds(node,bounds,numChildren);
- }
- /* build function */
- ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
- {
- /* sort morton codes */
- morton = src;
- radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
- /* build BVH */
- const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
- _mm_mfence(); // to allow non-temporal stores during build
- return root;
- }
- public:
- CreateAllocator& createAllocator;
- CreateNodeFunc& createNode;
- SetNodeBoundsFunc& setBounds;
- CreateLeafFunc& createLeaf;
- CalculateBounds& calculateBounds;
- ProgressMonitor& progressMonitor;
- public:
- BuildPrim* morton;
- };
- template<
- typename ReductionTy,
- typename CreateAllocFunc,
- typename CreateNodeFunc,
- typename SetBoundsFunc,
- typename CreateLeafFunc,
- typename CalculateBoundsFunc,
- typename ProgressMonitor>
- static ReductionTy build(CreateAllocFunc createAllocator,
- CreateNodeFunc createNode,
- SetBoundsFunc setBounds,
- CreateLeafFunc createLeaf,
- CalculateBoundsFunc calculateBounds,
- ProgressMonitor progressMonitor,
- BuildPrim* src,
- BuildPrim* tmp,
- size_t numPrimitives,
- const Settings& settings)
- {
- typedef BuilderT<
- ReductionTy,
- decltype(createAllocator()),
- CreateAllocFunc,
- CreateNodeFunc,
- SetBoundsFunc,
- CreateLeafFunc,
- CalculateBoundsFunc,
- ProgressMonitor> Builder;
- Builder builder(createAllocator,
- createNode,
- setBounds,
- createLeaf,
- calculateBounds,
- progressMonitor,
- settings);
- return builder.build(src,tmp,numPrimitives);
- }
- };
- }
- }
|