// parallel_prefix_sum.h
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "parallel_for.h"
  5. namespace embree
  6. {
  7. template<typename Value>
  8. struct ParallelPrefixSumState
  9. {
  10. enum { MAX_TASKS = 64 };
  11. Value counts[MAX_TASKS];
  12. Value sums [MAX_TASKS];
  13. };
  14. template<typename Index, typename Value, typename Func, typename Reduction>
  15. __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
  16. {
  17. /* calculate number of tasks to use */
  18. const size_t numThreads = TaskScheduler::threadCount();
  19. const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
  20. const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
  21. /* perform parallel prefix sum */
  22. parallel_for(taskCount, [&](const size_t taskIndex)
  23. {
  24. const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
  25. const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
  26. state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
  27. });
  28. /* calculate prefix sum */
  29. Value sum=identity;
  30. for (size_t i=0; i<taskCount; i++)
  31. {
  32. const Value c = state.counts[i];
  33. state.sums[i] = sum;
  34. sum=reduction(sum,c);
  35. }
  36. return sum;
  37. }
  38. /*! parallel calculation of prefix sums */
  39. template<typename SrcArray, typename DstArray, typename Value, typename Add>
  40. __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
  41. {
  42. /* perform single threaded prefix operation for small N */
  43. if (N < SINGLE_THREAD_THRESHOLD)
  44. {
  45. Value sum=identity;
  46. for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
  47. return sum;
  48. }
  49. /* perform parallel prefix operation for large N */
  50. else
  51. {
  52. ParallelPrefixSumState<Value> state;
  53. /* initial run just sets up start values for subtasks */
  54. parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
  55. Value s = identity;
  56. for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
  57. return s;
  58. }, add);
  59. /* final run calculates prefix sum */
  60. return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
  61. Value s = identity;
  62. for (size_t i=r.begin(); i<r.end(); i++) {
  63. dst[i] = add(sum,s);
  64. s = add(s,src[i]);
  65. }
  66. return s;
  67. }, add);
  68. }
  69. }
  70. }