b3RadixSort32CL.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647
  1. #include "b3RadixSort32CL.h"
  2. #include "b3LauncherCL.h"
  3. #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
  4. #include "b3PrefixScanCL.h"
  5. #include "b3FillCL.h"
  6. #define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
  7. #include "kernels/RadixSort32KernelsCL.h"
  8. b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
  9. : m_commandQueue(queue)
  10. {
  11. b3OpenCLDeviceInfo info;
  12. b3OpenCLUtils::getDeviceInfo(device, &info);
  13. m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
  14. m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
  15. m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
  16. m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
  17. m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
  18. m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
  19. m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
  20. if (initialCapacity > 0)
  21. {
  22. m_workBuffer1->resize(initialCapacity);
  23. m_workBuffer3->resize(initialCapacity);
  24. m_workBuffer3a->resize(initialCapacity);
  25. m_workBuffer4->resize(initialCapacity);
  26. m_workBuffer4a->resize(initialCapacity);
  27. }
  28. m_scan = new b3PrefixScanCL(ctx, device, queue);
  29. m_fill = new b3FillCL(ctx, device, queue);
  30. const char* additionalMacros = "";
  31. cl_int pErrNum;
  32. const char* kernelSource = radixSort32KernelsCL;
  33. cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
  34. b3Assert(sortProg);
  35. m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
  36. b3Assert(m_streamCountSortDataKernel);
  37. m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
  38. b3Assert(m_streamCountKernel);
  39. if (m_deviceCPU)
  40. {
  41. m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
  42. b3Assert(m_sortAndScatterSortDataKernel);
  43. m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
  44. b3Assert(m_sortAndScatterKernel);
  45. }
  46. else
  47. {
  48. m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
  49. b3Assert(m_sortAndScatterSortDataKernel);
  50. m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
  51. b3Assert(m_sortAndScatterKernel);
  52. }
  53. m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
  54. b3Assert(m_prefixScanKernel);
  55. }
  56. b3RadixSort32CL::~b3RadixSort32CL()
  57. {
  58. delete m_scan;
  59. delete m_fill;
  60. delete m_workBuffer1;
  61. delete m_workBuffer2;
  62. delete m_workBuffer3;
  63. delete m_workBuffer3a;
  64. delete m_workBuffer4;
  65. delete m_workBuffer4a;
  66. clReleaseKernel(m_streamCountSortDataKernel);
  67. clReleaseKernel(m_streamCountKernel);
  68. clReleaseKernel(m_sortAndScatterSortDataKernel);
  69. clReleaseKernel(m_sortAndScatterKernel);
  70. clReleaseKernel(m_prefixScanKernel);
  71. }
  72. void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
  73. {
  74. int n = inout.size();
  75. const int BITS_PER_PASS = 8;
  76. const int NUM_TABLES = (1 << BITS_PER_PASS);
  77. int tables[NUM_TABLES];
  78. int counter[NUM_TABLES];
  79. b3SortData* src = &inout[0];
  80. b3AlignedObjectArray<b3SortData> workbuffer;
  81. workbuffer.resize(inout.size());
  82. b3SortData* dst = &workbuffer[0];
  83. int count = 0;
  84. for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
  85. {
  86. for (int i = 0; i < NUM_TABLES; i++)
  87. {
  88. tables[i] = 0;
  89. }
  90. for (int i = 0; i < n; i++)
  91. {
  92. int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
  93. tables[tableIdx]++;
  94. }
  95. //#define TEST
  96. #ifdef TEST
  97. printf("histogram size=%d\n", NUM_TABLES);
  98. for (int i = 0; i < NUM_TABLES; i++)
  99. {
  100. if (tables[i] != 0)
  101. {
  102. printf("tables[%d]=%d]\n", i, tables[i]);
  103. }
  104. }
  105. #endif //TEST \
  106. // prefix scan
  107. int sum = 0;
  108. for (int i = 0; i < NUM_TABLES; i++)
  109. {
  110. int iData = tables[i];
  111. tables[i] = sum;
  112. sum += iData;
  113. counter[i] = 0;
  114. }
  115. // distribute
  116. for (int i = 0; i < n; i++)
  117. {
  118. int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
  119. dst[tables[tableIdx] + counter[tableIdx]] = src[i];
  120. counter[tableIdx]++;
  121. }
  122. b3Swap(src, dst);
  123. count++;
  124. }
  125. if (count & 1)
  126. {
  127. b3Assert(0); //need to copy
  128. }
  129. }
  130. void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
  131. {
  132. b3AlignedObjectArray<b3SortData> inout;
  133. keyValuesInOut.copyToHost(inout);
  134. executeHost(inout, sortBits);
  135. keyValuesInOut.copyFromHost(inout);
  136. }
  137. void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
  138. b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
  139. {
  140. }
  141. //#define DEBUG_RADIXSORT
  142. //#define DEBUG_RADIXSORT2
  143. void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
  144. {
  145. int originalSize = keyValuesInOut.size();
  146. int workingSize = originalSize;
  147. int dataAlignment = DATA_ALIGNMENT;
  148. #ifdef DEBUG_RADIXSORT2
  149. b3AlignedObjectArray<b3SortData> test2;
  150. keyValuesInOut.copyToHost(test2);
  151. printf("numElem = %d\n", test2.size());
  152. for (int i = 0; i < test2.size(); i++)
  153. {
  154. printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
  155. printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
  156. }
  157. #endif //DEBUG_RADIXSORT2
  158. b3OpenCLArray<b3SortData>* src = 0;
  159. if (workingSize % dataAlignment)
  160. {
  161. workingSize += dataAlignment - (workingSize % dataAlignment);
  162. m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
  163. m_workBuffer4->resize(workingSize);
  164. b3SortData fillValue;
  165. fillValue.m_key = 0xffffffff;
  166. fillValue.m_value = 0xffffffff;
  167. #define USE_BTFILL
  168. #ifdef USE_BTFILL
  169. m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
  170. #else
  171. //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
  172. for (int i = originalSize; i < workingSize; i++)
  173. {
  174. m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
  175. }
  176. #endif //USE_BTFILL
  177. src = m_workBuffer4;
  178. }
  179. else
  180. {
  181. src = &keyValuesInOut;
  182. m_workBuffer4->resize(0);
  183. }
  184. b3Assert(workingSize % DATA_ALIGNMENT == 0);
  185. int minCap = NUM_BUCKET * NUM_WGS;
  186. int n = workingSize;
  187. m_workBuffer1->resize(minCap);
  188. m_workBuffer3->resize(workingSize);
  189. // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
  190. b3Assert(BITS_PER_PASS == 4);
  191. b3Assert(WG_SIZE == 64);
  192. b3Assert((sortBits & 0x3) == 0);
  193. b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
  194. b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
  195. b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
  196. int nWGs = NUM_WGS;
  197. b3ConstData cdata;
  198. {
  199. int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
  200. int nBlocks = (n + blockSize - 1) / (blockSize);
  201. cdata.m_n = n;
  202. cdata.m_nWGs = NUM_WGS;
  203. cdata.m_startBit = 0;
  204. cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
  205. if (nBlocks < NUM_WGS)
  206. {
  207. cdata.m_nBlocksPerWG = 1;
  208. nWGs = nBlocks;
  209. }
  210. }
  211. int count = 0;
  212. for (int ib = 0; ib < sortBits; ib += 4)
  213. {
  214. #ifdef DEBUG_RADIXSORT2
  215. keyValuesInOut.copyToHost(test2);
  216. printf("numElem = %d\n", test2.size());
  217. for (int i = 0; i < test2.size(); i++)
  218. {
  219. if (test2[i].m_key != test2[i].m_value)
  220. {
  221. printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
  222. printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
  223. }
  224. }
  225. #endif //DEBUG_RADIXSORT2
  226. cdata.m_startBit = ib;
  227. if (src->size())
  228. {
  229. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
  230. b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
  231. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  232. launcher.setConst(cdata);
  233. int num = NUM_WGS * WG_SIZE;
  234. launcher.launch1D(num, WG_SIZE);
  235. }
  236. #ifdef DEBUG_RADIXSORT
  237. b3AlignedObjectArray<unsigned int> testHist;
  238. srcHisto->copyToHost(testHist);
  239. printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
  240. for (int i = 0; i < testHist.size(); i++)
  241. {
  242. if (testHist[i] != 0)
  243. printf("testHist[%d]=%d\n", i, testHist[i]);
  244. }
  245. #endif //DEBUG_RADIXSORT
  246. //fast prefix scan is not working properly on Mac OSX yet
  247. #ifdef __APPLE__
  248. bool fastScan = false;
  249. #else
  250. bool fastScan = !m_deviceCPU; //only use fast scan on GPU
  251. #endif
  252. if (fastScan)
  253. { // prefix scan group histogram
  254. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
  255. b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
  256. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  257. launcher.setConst(cdata);
  258. launcher.launch1D(128, 128);
  259. destHisto = srcHisto;
  260. }
  261. else
  262. {
  263. //unsigned int sum; //for debugging
  264. m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
  265. }
  266. #ifdef DEBUG_RADIXSORT
  267. destHisto->copyToHost(testHist);
  268. printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
  269. for (int i = 0; i < testHist.size(); i++)
  270. {
  271. if (testHist[i] != 0)
  272. printf("testHist[%d]=%d\n", i, testHist[i]);
  273. }
  274. for (int i = 0; i < testHist.size(); i += NUM_WGS)
  275. {
  276. printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
  277. }
  278. #endif //DEBUG_RADIXSORT
  279. #define USE_GPU
  280. #ifdef USE_GPU
  281. if (src->size())
  282. { // local sort and distribute
  283. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
  284. b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
  285. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  286. launcher.setConst(cdata);
  287. launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
  288. }
  289. #else
  290. {
  291. #define NUM_TABLES 16
  292. //#define SEQUENTIAL
  293. #ifdef SEQUENTIAL
  294. int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  295. int tables[NUM_TABLES];
  296. int startBit = ib;
  297. destHisto->copyToHost(testHist);
  298. b3AlignedObjectArray<b3SortData> srcHost;
  299. b3AlignedObjectArray<b3SortData> dstHost;
  300. dstHost.resize(src->size());
  301. src->copyToHost(srcHost);
  302. for (int i = 0; i < NUM_TABLES; i++)
  303. {
  304. tables[i] = testHist[i * NUM_WGS];
  305. }
  306. // distribute
  307. for (int i = 0; i < n; i++)
  308. {
  309. int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
  310. dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
  311. counter2[tableIdx]++;
  312. }
  313. #else
  314. int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  315. int tables[NUM_TABLES];
  316. b3AlignedObjectArray<b3SortData> dstHostOK;
  317. dstHostOK.resize(src->size());
  318. destHisto->copyToHost(testHist);
  319. b3AlignedObjectArray<b3SortData> srcHost;
  320. src->copyToHost(srcHost);
  321. int blockSize = 256;
  322. int nBlocksPerWG = cdata.m_nBlocksPerWG;
  323. int startBit = ib;
  324. {
  325. for (int i = 0; i < NUM_TABLES; i++)
  326. {
  327. tables[i] = testHist[i * NUM_WGS];
  328. }
  329. // distribute
  330. for (int i = 0; i < n; i++)
  331. {
  332. int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
  333. dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
  334. counter2[tableIdx]++;
  335. }
  336. }
  337. b3AlignedObjectArray<b3SortData> dstHost;
  338. dstHost.resize(src->size());
  339. int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  340. for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
  341. {
  342. int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  343. int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
  344. for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
  345. {
  346. for (int lIdx = 0; lIdx < 64; lIdx++)
  347. {
  348. int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
  349. // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
  350. // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
  351. // AMD: AtomInc performs better while NV prefers ++
  352. for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
  353. {
  354. if (addr + j < n)
  355. {
  356. // printf ("addr+j=%d\n", addr+j);
  357. int i = addr + j;
  358. int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
  359. int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
  360. b3SortData ok = dstHostOK[destIndex];
  361. if (ok.m_key != srcHost[i].m_key)
  362. {
  363. printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
  364. printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
  365. }
  366. if (ok.m_value != srcHost[i].m_value)
  367. {
  368. printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
  369. printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
  370. }
  371. dstHost[destIndex] = srcHost[i];
  372. counter[tableIdx]++;
  373. }
  374. }
  375. }
  376. }
  377. }
  378. #endif //SEQUENTIAL
  379. dst->copyFromHost(dstHost);
  380. }
  381. #endif //USE_GPU
  382. #ifdef DEBUG_RADIXSORT
  383. destHisto->copyToHost(testHist);
  384. printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
  385. for (int i = 0; i < testHist.size(); i++)
  386. {
  387. if (testHist[i] != 0)
  388. printf("testHist[%d]=%d\n", i, testHist[i]);
  389. }
  390. #endif //DEBUG_RADIXSORT
  391. b3Swap(src, dst);
  392. b3Swap(srcHisto, destHisto);
  393. #ifdef DEBUG_RADIXSORT2
  394. keyValuesInOut.copyToHost(test2);
  395. printf("numElem = %d\n", test2.size());
  396. for (int i = 0; i < test2.size(); i++)
  397. {
  398. if (test2[i].m_key != test2[i].m_value)
  399. {
  400. printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
  401. printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
  402. }
  403. }
  404. #endif //DEBUG_RADIXSORT2
  405. count++;
  406. }
  407. if (count & 1)
  408. {
  409. b3Assert(0); //need to copy from workbuffer to keyValuesInOut
  410. }
  411. if (m_workBuffer4->size())
  412. {
  413. m_workBuffer4->resize(originalSize);
  414. keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
  415. }
  416. #ifdef DEBUG_RADIXSORT
  417. keyValuesInOut.copyToHost(test2);
  418. printf("numElem = %d\n", test2.size());
  419. for (int i = 0; i < test2.size(); i++)
  420. {
  421. printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
  422. printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
  423. }
  424. #endif
  425. }
  426. void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
  427. {
  428. int originalSize = keysInOut.size();
  429. int workingSize = originalSize;
  430. int dataAlignment = DATA_ALIGNMENT;
  431. b3OpenCLArray<unsigned int>* src = 0;
  432. if (workingSize % dataAlignment)
  433. {
  434. workingSize += dataAlignment - (workingSize % dataAlignment);
  435. m_workBuffer4a->copyFromOpenCLArray(keysInOut);
  436. m_workBuffer4a->resize(workingSize);
  437. unsigned int fillValue = 0xffffffff;
  438. m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
  439. src = m_workBuffer4a;
  440. }
  441. else
  442. {
  443. src = &keysInOut;
  444. m_workBuffer4a->resize(0);
  445. }
  446. b3Assert(workingSize % DATA_ALIGNMENT == 0);
  447. int minCap = NUM_BUCKET * NUM_WGS;
  448. int n = workingSize;
  449. m_workBuffer1->resize(minCap);
  450. m_workBuffer3->resize(workingSize);
  451. m_workBuffer3a->resize(workingSize);
  452. // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
  453. b3Assert(BITS_PER_PASS == 4);
  454. b3Assert(WG_SIZE == 64);
  455. b3Assert((sortBits & 0x3) == 0);
  456. b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
  457. b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
  458. b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
  459. int nWGs = NUM_WGS;
  460. b3ConstData cdata;
  461. {
  462. int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
  463. int nBlocks = (n + blockSize - 1) / (blockSize);
  464. cdata.m_n = n;
  465. cdata.m_nWGs = NUM_WGS;
  466. cdata.m_startBit = 0;
  467. cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
  468. if (nBlocks < NUM_WGS)
  469. {
  470. cdata.m_nBlocksPerWG = 1;
  471. nWGs = nBlocks;
  472. }
  473. }
  474. int count = 0;
  475. for (int ib = 0; ib < sortBits; ib += 4)
  476. {
  477. cdata.m_startBit = ib;
  478. if (src->size())
  479. {
  480. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
  481. b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
  482. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  483. launcher.setConst(cdata);
  484. int num = NUM_WGS * WG_SIZE;
  485. launcher.launch1D(num, WG_SIZE);
  486. }
  487. //fast prefix scan is not working properly on Mac OSX yet
  488. #ifdef __APPLE__
  489. bool fastScan = false;
  490. #else
  491. bool fastScan = !m_deviceCPU;
  492. #endif
  493. if (fastScan)
  494. { // prefix scan group histogram
  495. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
  496. b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
  497. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  498. launcher.setConst(cdata);
  499. launcher.launch1D(128, 128);
  500. destHisto = srcHisto;
  501. }
  502. else
  503. {
  504. //unsigned int sum; //for debugging
  505. m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
  506. }
  507. if (src->size())
  508. { // local sort and distribute
  509. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
  510. b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
  511. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  512. launcher.setConst(cdata);
  513. launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
  514. }
  515. b3Swap(src, dst);
  516. b3Swap(srcHisto, destHisto);
  517. count++;
  518. }
  519. if (count & 1)
  520. {
  521. b3Assert(0); //need to copy from workbuffer to keyValuesInOut
  522. }
  523. if (m_workBuffer4a->size())
  524. {
  525. m_workBuffer4a->resize(originalSize);
  526. keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
  527. }
  528. }