// b3RadixSort32CL.h (2.0 KB)
  1. #ifndef B3_RADIXSORT32_H
  2. #define B3_RADIXSORT32_H
  3. #include "b3OpenCLArray.h"
  /// One element handled by the sort: two unsigned int fields stored back to back.
  ///
  /// Each field sits in its own anonymous union, so it can be accessed under two
  /// names that alias the same storage:
  ///   - first field:  m_key   or x
  ///   - second field: m_value or y
  ///
  /// NOTE(review): the x/y aliases presumably mirror a two-component unsigned
  /// vector layout used on the device side -- confirm against the kernels.
  struct b3SortData
  {
      /// First field; m_key and x name the same unsigned int.
      union {
          unsigned int m_key;
          unsigned int x;
      };
      /// Second field; m_value and y name the same unsigned int.
      union {
          unsigned int m_value;
          unsigned int y;
      };
  };
  15. #include "b3BufferInfoCL.h"
  16. class b3RadixSort32CL
  17. {
  18. b3OpenCLArray<unsigned int>* m_workBuffer1;
  19. b3OpenCLArray<unsigned int>* m_workBuffer2;
  20. b3OpenCLArray<b3SortData>* m_workBuffer3;
  21. b3OpenCLArray<b3SortData>* m_workBuffer4;
  22. b3OpenCLArray<unsigned int>* m_workBuffer3a;
  23. b3OpenCLArray<unsigned int>* m_workBuffer4a;
  24. cl_command_queue m_commandQueue;
  25. cl_kernel m_streamCountSortDataKernel;
  26. cl_kernel m_streamCountKernel;
  27. cl_kernel m_prefixScanKernel;
  28. cl_kernel m_sortAndScatterSortDataKernel;
  29. cl_kernel m_sortAndScatterKernel;
  30. bool m_deviceCPU;
  31. class b3PrefixScanCL* m_scan;
  32. class b3FillCL* m_fill;
  33. public:
  34. struct b3ConstData
  35. {
  36. int m_n;
  37. int m_nWGs;
  38. int m_startBit;
  39. int m_nBlocksPerWG;
  40. };
  41. enum
  42. {
  43. DATA_ALIGNMENT = 256,
  44. WG_SIZE = 64,
  45. BLOCK_SIZE = 256,
  46. ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
  47. BITS_PER_PASS = 4,
  48. NUM_BUCKET = (1 << BITS_PER_PASS),
  49. // if you change this, change nPerWI in kernel as well
  50. NUM_WGS = 20 * 6, // cypress
  51. // NUM_WGS = 24*6, // cayman
  52. // NUM_WGS = 32*4, // nv
  53. };
  54. private:
  55. public:
  56. b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
  57. virtual ~b3RadixSort32CL();
  58. void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
  59. b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
  60. ///keys only
  61. void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
  62. void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
  63. void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
  64. void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
  65. };
  66. #endif //B3_RADIXSORT32_H