parallel_sort.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"

#include <algorithm>

namespace embree
{
  /* in-place insertion sort, ascending order */
  template<class T>
  __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }
  /* in-place insertion sort, descending order */
  template<class T>
  __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }
  /* recursive quicksort, ascending order */
  template<class T>
  void quicksort_ascending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left  = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left] < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      /* keep the pivot index in ssize_t to avoid narrowing on very large ranges */
      const ssize_t pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }
  /* recursive quicksort, descending order */
  template<class T>
  void quicksort_decending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left  = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left] > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      /* keep the pivot index in ssize_t to avoid narrowing on very large ranges */
      const ssize_t pivot = right;
      quicksort_decending(t, begin, pivot);
      quicksort_decending(t, pivot + 1, end);
    }
  }
  /* hybrid quicksort, ascending order: ranges of at most THRESHOLD elements
     fall back to insertion sort */
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_ascending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left  = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left] < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }
  /* hybrid quicksort, descending order: ranges of at most THRESHOLD elements
     fall back to insertion sort */
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_decending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_decending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left  = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left] > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }
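  /* example usage of the hybrid sort above: both index bounds are inclusive,
     and the threshold of 16 is an illustrative choice, not a value required
     by these templates.

       float data[1024];
       // ... fill data ...
       quicksort_insertionsort_ascending<float,16>(data, 0, 1023);
  */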
  /* single-threaded in-place radix sort over the low 32 bits of the keys,
     8 bits per pass, most significant byte first */
  template<typename T>
  static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;

    /* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0;i<num;i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
    assert(tail[BUCKETS-1] == num);

    /* in-place swap */
    for (size_t i=0;i<BUCKETS;i++)
    {
      /* process bucket */
      while(head[i] < tail[i])
      {
        T v = morton[head[i]];
        while(1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v,morton[head[b]++]);
        }
        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }

    if (shift == 0) return;

    size_t offset = 0;
    for (size_t i=0;i<BUCKETS;i++)
      if (count[i])
      {
        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        /* small buckets are finished with insertion sort, larger ones recurse
           on the next lower byte */
        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }
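  /* illustrative call of radixsort32 on raw 32-bit Morton codes; the default
     shift of 3*8 starts at the most significant byte:

       std::vector<uint32_t> codes = { ... };
       radixsort32(codes.data(), codes.size());
  */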
  /* parallel radix sort over a user-provided key type Key, processing 8 bits
     per pass from least to most significant */
  template<typename Ty, typename Key>
  class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
    static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement

  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single threaded sort for small N */
      if (N<=blockSize) // handles also special case of 0!
      {
        /* do inplace sort inside destination array */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }

    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }
  private:

    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* count how many items go into the buckets */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over src array and count buckets */
      unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }
    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket */
      __aligned(64) unsigned int total[BUCKETS];
      /*
      for (size_t i=0; i<BUCKETS; i++)
        total[i] = 0;
      */
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      for (size_t i=0; i<threadCount; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          total[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
      }

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread */
      for (size_t i=0; i<threadIndex; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          offset[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
      }

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }
    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }
    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }
  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };
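  /* convenience wrappers: 'src' holds the N input elements and receives the
     sorted result, 'tmp' is scratch space of at least N elements, and Ty must
     be convertible to Key; when N exceeds blockSize the sort runs as parallel
     8-bit radix passes (4 passes for 32-bit keys, 8 for 64-bit keys), otherwise
     it falls back to std::sort */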
  template<typename Ty>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
  void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
  void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
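  /* illustrative sketch of sorting a struct by a 32-bit key through its
     conversion operator; the 'Item' type below is hypothetical and not defined
     in this header.

       struct Item {
         uint32_t code; void* ptr;
         operator uint32_t() const { return code; }   // key extraction used by the sorter
       };
       std::vector<Item> items(N), temp(N);
       radix_sort_u32(items.data(), temp.data(), N);  // sorted result ends up in 'items'
  */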
}