b3GpuPgsContactSolver.cpp 49 KB


  // File-level switches that select between GPU and host (CPU) implementations
  // of individual solver stages. All of them default to the GPU path.

  // When false, solveContacts() takes the cell-based batching path.
  bool gUseLargeBatches = false;

  // Host fallbacks consulted by solveContacts():
  bool gCpuSetSortData = false;              // compute cell sort keys with SetSortDataCPU()
  bool gCpuRadixSort = false;                // sort the key/value pairs with a host quickSort
  bool gUseScanHost = false;                 // run bound search and prefix scan on the host
  bool gReorderContactsOnCpu = false;        // reorder contacts by cell on the host
  bool gCpuSortContactsDeterminism = false;  // do the determinism sort on the host

  // NOTE(review): the readers of the next three flags are outside the code
  // reviewed here; judging by the names they select host versions of the
  // batching, solving and constraint-copy stages -- confirm before relying on it.
  bool gCpuBatchContacts = false;
  bool gCpuSolveConstraint = false;
  bool gUseCpuCopyConstraints = false;

  // When true, solveContacts() sorts the incoming contacts before solving.
  bool optionalSortContactsDeterminism = true;
  11. #include "b3GpuPgsContactSolver.h"
  12. #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
  13. #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
  14. #include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
  15. #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
  16. #include <string.h>
  17. #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
  18. #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
  19. #include "b3Solver.h"
  20. #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
  21. #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
  22. #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
  23. #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
  24. #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
  25. #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
  26. #include "kernels/solverSetup.h"
  27. #include "kernels/solverSetup2.h"
  28. #include "kernels/solveContact.h"
  29. #include "kernels/solveFriction.h"
  30. #include "kernels/batchingKernels.h"
  31. #include "kernels/batchingKernelsNew.h"
  32. struct b3GpuBatchingPgsSolverInternalData
  33. {
  34. cl_context m_context;
  35. cl_device_id m_device;
  36. cl_command_queue m_queue;
  37. int m_pairCapacity;
  38. int m_nIterations;
  39. b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU;
  40. b3OpenCLArray<unsigned int>* m_numConstraints;
  41. b3OpenCLArray<unsigned int>* m_offsets;
  42. b3Solver* m_solverGPU;
  43. cl_kernel m_batchingKernel;
  44. cl_kernel m_batchingKernelNew;
  45. cl_kernel m_solveContactKernel;
  46. cl_kernel m_solveSingleContactKernel;
  47. cl_kernel m_solveSingleFrictionKernel;
  48. cl_kernel m_solveFrictionKernel;
  49. cl_kernel m_contactToConstraintKernel;
  50. cl_kernel m_setSortDataKernel;
  51. cl_kernel m_reorderContactKernel;
  52. cl_kernel m_copyConstraintKernel;
  53. cl_kernel m_setDeterminismSortDataBodyAKernel;
  54. cl_kernel m_setDeterminismSortDataBodyBKernel;
  55. cl_kernel m_setDeterminismSortDataChildShapeAKernel;
  56. cl_kernel m_setDeterminismSortDataChildShapeBKernel;
  57. class b3RadixSort32CL* m_sort32;
  58. class b3BoundSearchCL* m_search;
  59. class b3PrefixScanCL* m_scan;
  60. b3OpenCLArray<b3SortData>* m_sortDataBuffer;
  61. b3OpenCLArray<b3Contact4>* m_contactBuffer;
  62. b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
  63. b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
  64. b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
  65. b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
  66. b3OpenCLArray<b3SortData>* m_contactKeyValues;
  67. b3AlignedObjectArray<unsigned int> m_idxBuffer;
  68. b3AlignedObjectArray<b3SortData> m_sortData;
  69. b3AlignedObjectArray<b3Contact4> m_old;
  70. b3AlignedObjectArray<int> m_batchSizes;
  71. b3OpenCLArray<int>* m_batchSizesGpu;
  72. };
  73. b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity)
  74. {
  75. m_debugOutput = 0;
  76. m_data = new b3GpuBatchingPgsSolverInternalData;
  77. m_data->m_context = ctx;
  78. m_data->m_device = device;
  79. m_data->m_queue = q;
  80. m_data->m_pairCapacity = pairCapacity;
  81. m_data->m_nIterations = 4;
  82. m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q);
  83. m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q);
  84. m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q);
  85. m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q);
  86. m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q);
  87. m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q);
  88. m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024);
  89. m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue);
  90. m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
  91. m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
  92. const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
  93. m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize);
  94. m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue);
  95. m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
  96. m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
  97. m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity);
  98. m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
  99. m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
  100. const char* additionalMacros = "";
  101. //const char* srcFileNameForCaching="";
  102. cl_int pErrNum;
  103. const char* batchKernelSource = batchingKernelsCL;
  104. const char* batchKernelNewSource = batchingKernelsNewCL;
  105. const char* solverSetupSource = solverSetupCL;
  106. const char* solverSetup2Source = solverSetup2CL;
  107. const char* solveContactSource = solveContactCL;
  108. const char* solveFrictionSource = solveFrictionCL;
  109. {
  110. cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
  111. b3Assert(solveContactProg);
  112. cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
  113. b3Assert(solveFrictionProg);
  114. cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
  115. b3Assert(solverSetup2Prog);
  116. cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
  117. b3Assert(solverSetupProg);
  118. m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
  119. b3Assert(m_data->m_solveFrictionKernel);
  120. m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
  121. b3Assert(m_data->m_solveContactKernel);
  122. m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros);
  123. b3Assert(m_data->m_solveSingleContactKernel);
  124. m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros);
  125. b3Assert(m_data->m_solveSingleFrictionKernel);
  126. m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
  127. b3Assert(m_data->m_contactToConstraintKernel);
  128. m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
  129. b3Assert(m_data->m_setSortDataKernel);
  130. m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros);
  131. b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
  132. m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros);
  133. b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
  134. m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros);
  135. b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
  136. m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros);
  137. b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
  138. m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
  139. b3Assert(m_data->m_reorderContactKernel);
  140. m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
  141. b3Assert(m_data->m_copyConstraintKernel);
  142. }
  143. {
  144. cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
  145. b3Assert(batchingProg);
  146. m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
  147. b3Assert(m_data->m_batchingKernel);
  148. }
  149. {
  150. cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
  151. b3Assert(batchingNewProg);
  152. m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
  153. b3Assert(m_data->m_batchingKernelNew);
  154. }
  155. }
  156. b3GpuPgsContactSolver::~b3GpuPgsContactSolver()
  157. {
  158. delete m_data->m_batchSizesGpu;
  159. delete m_data->m_bodyBufferGPU;
  160. delete m_data->m_inertiaBufferGPU;
  161. delete m_data->m_pBufContactOutGPU;
  162. delete m_data->m_pBufContactOutGPUCopy;
  163. delete m_data->m_contactKeyValues;
  164. delete m_data->m_contactCGPU;
  165. delete m_data->m_numConstraints;
  166. delete m_data->m_offsets;
  167. delete m_data->m_sortDataBuffer;
  168. delete m_data->m_contactBuffer;
  169. delete m_data->m_sort32;
  170. delete m_data->m_scan;
  171. delete m_data->m_search;
  172. delete m_data->m_solverGPU;
  173. clReleaseKernel(m_data->m_batchingKernel);
  174. clReleaseKernel(m_data->m_batchingKernelNew);
  175. clReleaseKernel(m_data->m_solveSingleContactKernel);
  176. clReleaseKernel(m_data->m_solveSingleFrictionKernel);
  177. clReleaseKernel(m_data->m_solveContactKernel);
  178. clReleaseKernel(m_data->m_solveFrictionKernel);
  179. clReleaseKernel(m_data->m_contactToConstraintKernel);
  180. clReleaseKernel(m_data->m_setSortDataKernel);
  181. clReleaseKernel(m_data->m_reorderContactKernel);
  182. clReleaseKernel(m_data->m_copyConstraintKernel);
  183. clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
  184. clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
  185. clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
  186. clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
  187. delete m_data;
  188. }
  /// Tuning parameters handed to the contact-constraint setup and batching code.
  struct b3ConstraintCfg
  {
      /// @param dt Value stored in m_dt (default 0).
      ///
      /// Every member is given a defined value. m_enableParallelSolve and
      /// m_batchCellSize used to be left indeterminate; their defaults are the
      /// values solveContacts() assigns (true and 6).
      b3ConstraintCfg(float dt = 0.f)
          : m_positionDrift(0.005f),
            m_positionConstraintCoeff(0.2f),
            m_dt(dt),
            m_enableParallelSolve(true),
            m_batchCellSize(6.f),
            m_staticIdx(0)
      {
      }

      float m_positionDrift;
      float m_positionConstraintCoeff;
      float m_dt;
      bool m_enableParallelSolve;  // gates the cell-sort/batching stage in solveContacts()
      float m_batchCellSize;       // its reciprocal is the position-to-cell scale
      int m_staticIdx;             // body index that is treated as static
  };
  199. void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
  200. b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //const b3OpenCLArray<int>* gpuBatchSizes)
  201. {
  202. B3_PROFILE("solveContactConstraintBatchSizes");
  203. int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES;
  204. for (int iter = 0; iter < numIterations; iter++)
  205. {
  206. for (int cellId = 0; cellId < numBatches; cellId++)
  207. {
  208. int offset = 0;
  209. for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
  210. {
  211. int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
  212. if (!numInBatch)
  213. break;
  214. {
  215. b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel");
  216. launcher.setBuffer(bodyBuf->getBufferCL());
  217. launcher.setBuffer(shapeBuf->getBufferCL());
  218. launcher.setBuffer(constraint->getBufferCL());
  219. launcher.setConst(cellId);
  220. launcher.setConst(offset);
  221. launcher.setConst(numInBatch);
  222. launcher.launch1D(numInBatch);
  223. offset += numInBatch;
  224. }
  225. }
  226. }
  227. }
  228. for (int iter = 0; iter < numIterations; iter++)
  229. {
  230. for (int cellId = 0; cellId < numBatches; cellId++)
  231. {
  232. int offset = 0;
  233. for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
  234. {
  235. int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
  236. if (!numInBatch)
  237. break;
  238. {
  239. b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel");
  240. launcher.setBuffer(bodyBuf->getBufferCL());
  241. launcher.setBuffer(shapeBuf->getBufferCL());
  242. launcher.setBuffer(constraint->getBufferCL());
  243. launcher.setConst(cellId);
  244. launcher.setConst(offset);
  245. launcher.setConst(numInBatch);
  246. launcher.launch1D(numInBatch);
  247. offset += numInBatch;
  248. }
  249. }
  250. }
  251. }
  252. }
  253. void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
  254. b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //,const b3OpenCLArray<int>* gpuBatchSizes)
  255. {
  256. //sort the contacts
  257. b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
  258. {
  259. const int nn = B3_SOLVER_N_CELLS;
  260. cdata.x = 0;
  261. cdata.y = maxNumBatches; //250;
  262. int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
  263. #ifdef DEBUG_ME
  264. SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
  265. adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
  266. #endif
  267. {
  268. B3_PROFILE("m_batchSolveKernel iterations");
  269. for (int iter = 0; iter < numIterations; iter++)
  270. {
  271. for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
  272. {
  273. #ifdef DEBUG_ME
  274. memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
  275. gpuDebugInfo.write(debugInfo, numWorkItems);
  276. #endif
  277. cdata.z = ib;
  278. b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
  279. #if 1
  280. b3BufferInfoCL bInfo[] = {
  281. b3BufferInfoCL(bodyBuf->getBufferCL()),
  282. b3BufferInfoCL(shapeBuf->getBufferCL()),
  283. b3BufferInfoCL(constraint->getBufferCL()),
  284. b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
  285. b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
  286. #ifdef DEBUG_ME
  287. ,
  288. b3BufferInfoCL(&gpuDebugInfo)
  289. #endif
  290. };
  291. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  292. launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
  293. //launcher.setConst( cdata.x );
  294. launcher.setConst(cdata.y);
  295. launcher.setConst(cdata.z);
  296. b3Int4 nSplit;
  297. nSplit.x = B3_SOLVER_N_SPLIT_X;
  298. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  299. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  300. launcher.setConst(nSplit);
  301. launcher.launch1D(numWorkItems, 64);
  302. #else
  303. const char* fileName = "m_batchSolveKernel.bin";
  304. FILE* f = fopen(fileName, "rb");
  305. if (f)
  306. {
  307. int sizeInBytes = 0;
  308. if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
  309. {
  310. printf("error, cannot get file size\n");
  311. exit(0);
  312. }
  313. unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
  314. fread(buf, sizeInBytes, 1, f);
  315. int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
  316. int num = *(int*)&buf[serializedBytes];
  317. launcher.launch1D(num);
  318. //this clFinish is for testing on errors
  319. clFinish(m_queue);
  320. }
  321. #endif
  322. #ifdef DEBUG_ME
  323. clFinish(m_queue);
  324. gpuDebugInfo.read(debugInfo, numWorkItems);
  325. clFinish(m_queue);
  326. for (int i = 0; i < numWorkItems; i++)
  327. {
  328. if (debugInfo[i].m_valInt2 > 0)
  329. {
  330. printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
  331. }
  332. if (debugInfo[i].m_valInt3 > 0)
  333. {
  334. printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
  335. }
  336. }
  337. #endif //DEBUG_ME
  338. }
  339. }
  340. clFinish(m_data->m_queue);
  341. }
  342. cdata.x = 1;
  343. bool applyFriction = true;
  344. if (applyFriction)
  345. {
  346. B3_PROFILE("m_batchSolveKernel iterations2");
  347. for (int iter = 0; iter < numIterations; iter++)
  348. {
  349. for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
  350. {
  351. cdata.z = ib;
  352. b3BufferInfoCL bInfo[] = {
  353. b3BufferInfoCL(bodyBuf->getBufferCL()),
  354. b3BufferInfoCL(shapeBuf->getBufferCL()),
  355. b3BufferInfoCL(constraint->getBufferCL()),
  356. b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
  357. b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
  358. #ifdef DEBUG_ME
  359. ,
  360. b3BufferInfoCL(&gpuDebugInfo)
  361. #endif //DEBUG_ME
  362. };
  363. b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
  364. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  365. launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
  366. //launcher.setConst( cdata.x );
  367. launcher.setConst(cdata.y);
  368. launcher.setConst(cdata.z);
  369. b3Int4 nSplit;
  370. nSplit.x = B3_SOLVER_N_SPLIT_X;
  371. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  372. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  373. launcher.setConst(nSplit);
  374. launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
  375. }
  376. }
  377. clFinish(m_data->m_queue);
  378. }
  379. #ifdef DEBUG_ME
  380. delete[] debugInfo;
  381. #endif //DEBUG_ME
  382. }
  383. }
  384. static bool sortfnc(const b3SortData& a, const b3SortData& b)
  385. {
  386. return (a.m_key < b.m_key);
  387. }
  388. static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
  389. {
  390. return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) ||
  391. ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) ||
  392. ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
  393. ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
  394. ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB));
  395. }
  // Sort-key selection for SetSortDataCPU():
  //   USE_SPATIAL_BATCHING != 0  -> key derived from the body position.
  //   otherwise                  -> key looked up in a grid table indexed by
  //                                 the low bits of the two body indices
  //                                 (4x4 when USE_4x4_GRID != 0, else 8x8).
  #define USE_SPATIAL_BATCHING 1
  #define USE_4x4_GRID 1

  // The tables are guarded by the macro's value, matching the
  // "#if USE_SPATIAL_BATCHING" test in SetSortDataCPU(). The previous #ifndef
  // guard hid them when the macro was defined as 0, although that setting
  // selects the code path that reads them.
  #if !USE_SPATIAL_BATCHING
  static const int gridTable4x4[] =
      {
          0, 1, 17, 16,
          1, 2, 18, 19,
          17, 18, 32, 3,
          16, 19, 3, 34};

  static const int gridTable8x8[] =
      {
          0, 2, 3, 16, 17, 18, 19, 1,
          66, 64, 80, 67, 82, 81, 65, 83,
          131, 144, 128, 130, 147, 129, 145, 146,
          208, 195, 194, 192, 193, 211, 210, 209,
          21, 22, 23, 5, 4, 6, 7, 20,
          86, 85, 69, 87, 70, 68, 84, 71,
          151, 133, 149, 150, 135, 148, 132, 134,
          197, 27, 214, 213, 212, 199, 198, 196};
  #endif
  417. void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx)
  418. {
  419. for (int gIdx = 0; gIdx < nContacts; gIdx++)
  420. {
  421. if (gIdx < nContacts)
  422. {
  423. int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
  424. int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
  425. int aIdx = abs(aPtrAndSignBit);
  426. int bIdx = abs(bPtrAndSignBit);
  427. bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx);
  428. #if USE_SPATIAL_BATCHING
  429. int idx = (aStatic) ? bIdx : aIdx;
  430. b3Vector3 p = gBodies[idx].m_pos;
  431. int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1);
  432. int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1);
  433. int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1);
  434. int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y);
  435. #else //USE_SPATIAL_BATCHING
  436. bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx);
  437. #if USE_4x4_GRID
  438. int aa = aIdx & 3;
  439. int bb = bIdx & 3;
  440. if (aStatic)
  441. aa = bb;
  442. if (bStatic)
  443. bb = aa;
  444. int gridIndex = aa + bb * 4;
  445. int newIndex = gridTable4x4[gridIndex];
  446. #else //USE_4x4_GRID
  447. int aa = aIdx & 7;
  448. int bb = bIdx & 7;
  449. if (aStatic)
  450. aa = bb;
  451. if (bStatic)
  452. bb = aa;
  453. int gridIndex = aa + bb * 8;
  454. int newIndex = gridTable8x8[gridIndex];
  455. #endif //USE_4x4_GRID
  456. #endif //USE_SPATIAL_BATCHING
  457. gSortDataOut[gIdx].x = newIndex;
  458. gSortDataOut[gIdx].y = gIdx;
  459. }
  460. else
  461. {
  462. gSortDataOut[gIdx].x = 0xffffffff;
  463. }
  464. }
  465. }
  466. void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
  467. {
  468. B3_PROFILE("solveContacts");
  469. m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies);
  470. m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies);
  471. m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts);
  472. if (optionalSortContactsDeterminism)
  473. {
  474. if (!gCpuSortContactsDeterminism)
  475. {
  476. B3_PROFILE("GPU Sort contact constraints (determinism)");
  477. m_data->m_pBufContactOutGPUCopy->resize(numContacts);
  478. m_data->m_contactKeyValues->resize(numContacts);
  479. m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0);
  480. {
  481. b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel");
  482. launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
  483. launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
  484. launcher.setConst(numContacts);
  485. launcher.launch1D(numContacts, 64);
  486. }
  487. m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
  488. {
  489. b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel");
  490. launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
  491. launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
  492. launcher.setConst(numContacts);
  493. launcher.launch1D(numContacts, 64);
  494. }
  495. m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
  496. {
  497. b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel");
  498. launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
  499. launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
  500. launcher.setConst(numContacts);
  501. launcher.launch1D(numContacts, 64);
  502. }
  503. m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
  504. {
  505. b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel");
  506. launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
  507. launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
  508. launcher.setConst(numContacts);
  509. launcher.launch1D(numContacts, 64);
  510. }
  511. m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
  512. {
  513. B3_PROFILE("gpu reorderContactKernel (determinism)");
  514. b3Int4 cdata;
  515. cdata.x = numContacts;
  516. //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
  517. // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
  518. b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
  519. launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
  520. launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
  521. launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
  522. launcher.setConst(cdata);
  523. launcher.launch1D(numContacts, 64);
  524. }
  525. }
  526. else
  527. {
  528. B3_PROFILE("CPU Sort contact constraints (determinism)");
  529. b3AlignedObjectArray<b3Contact4> cpuConstraints;
  530. m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
  531. bool sort = true;
  532. if (sort)
  533. {
  534. cpuConstraints.quickSort(b3ContactCmp);
  535. for (int i = 0; i < cpuConstraints.size(); i++)
  536. {
  537. cpuConstraints[i].m_batchIdx = i;
  538. }
  539. }
  540. m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
  541. if (m_debugOutput == 100)
  542. {
  543. for (int i = 0; i < cpuConstraints.size(); i++)
  544. {
  545. printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
  546. }
  547. }
  548. m_debugOutput++;
  549. }
  550. }
  551. int nContactOut = m_data->m_pBufContactOutGPU->size();
  552. bool useSolver = true;
  553. if (useSolver)
  554. {
  555. float dt = 1. / 60.;
  556. b3ConstraintCfg csCfg(dt);
  557. csCfg.m_enableParallelSolve = true;
  558. csCfg.m_batchCellSize = 6;
  559. csCfg.m_staticIdx = static0Index;
  560. b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU;
  561. void* additionalData = 0; //m_data->m_frictionCGPU;
  562. const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
  563. b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
  564. int nContacts = nContactOut;
  565. int maxNumBatches = 0;
  566. if (!gUseLargeBatches)
  567. {
  568. if (m_data->m_solverGPU->m_contactBuffer2)
  569. {
  570. m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
  571. }
  572. if (m_data->m_solverGPU->m_contactBuffer2 == 0)
  573. {
  574. m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts);
  575. m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
  576. }
  577. //clFinish(m_data->m_queue);
  578. {
  579. B3_PROFILE("batching");
  580. //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
  581. //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
  582. {
  583. //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
  584. //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
  585. const int sortAlignment = 512; // todo. get this out of sort
  586. if (csCfg.m_enableParallelSolve)
  587. {
  588. int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment);
  589. b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
  590. b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
  591. if (!gCpuSetSortData)
  592. { // 2. set cell idx
  593. B3_PROFILE("GPU set cell idx");
  594. struct CB
  595. {
  596. int m_nContacts;
  597. int m_staticIdx;
  598. float m_scale;
  599. b3Int4 m_nSplit;
  600. };
  601. b3Assert(sortSize % 64 == 0);
  602. CB cdata;
  603. cdata.m_nContacts = nContacts;
  604. cdata.m_staticIdx = csCfg.m_staticIdx;
  605. cdata.m_scale = 1.f / csCfg.m_batchCellSize;
  606. cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
  607. cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
  608. cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
  609. m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
  610. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
  611. b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel");
  612. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  613. launcher.setConst(cdata.m_nContacts);
  614. launcher.setConst(cdata.m_scale);
  615. launcher.setConst(cdata.m_nSplit);
  616. launcher.setConst(cdata.m_staticIdx);
  617. launcher.launch1D(sortSize, 64);
  618. }
  619. else
  620. {
  621. m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
  622. b3AlignedObjectArray<b3SortData> sortDataCPU;
  623. m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
  624. b3AlignedObjectArray<b3Contact4> contactCPU;
  625. m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
  626. b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
  627. bodyBuf->copyToHost(bodiesCPU);
  628. float scale = 1.f / csCfg.m_batchCellSize;
  629. b3Int4 nSplit;
  630. nSplit.x = B3_SOLVER_N_SPLIT_X;
  631. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  632. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  633. SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx);
  634. m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
  635. }
  636. if (!gCpuRadixSort)
  637. { // 3. sort by cell idx
  638. B3_PROFILE("gpuRadixSort");
  639. //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
  640. //int sortBit = 32;
  641. //if( n <= 0xffff ) sortBit = 16;
  642. //if( n <= 0xff ) sortBit = 8;
  643. //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
  644. //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
  645. b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
  646. this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
  647. }
  648. else
  649. {
  650. b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
  651. b3AlignedObjectArray<b3SortData> hostValues;
  652. keyValuesInOut.copyToHost(hostValues);
  653. hostValues.quickSort(sortfnc);
  654. keyValuesInOut.copyFromHost(hostValues);
  655. }
  656. if (gUseScanHost)
  657. {
  658. // 4. find entries
  659. B3_PROFILE("cpuBoundSearch");
  660. b3AlignedObjectArray<unsigned int> countsHost;
  661. countsNative->copyToHost(countsHost);
  662. b3AlignedObjectArray<b3SortData> sortDataHost;
  663. m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
  664. //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
  665. m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
  666. countsNative->copyFromHost(countsHost);
  667. //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
  668. // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
  669. //unsigned int sum;
  670. //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
  671. b3AlignedObjectArray<unsigned int> offsetsHost;
  672. offsetsHost.resize(offsetsNative->size());
  673. m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS); //,&sum );
  674. offsetsNative->copyFromHost(offsetsHost);
  675. //printf("sum = %d\n",sum);
  676. }
  677. else
  678. {
  679. // 4. find entries
  680. B3_PROFILE("gpuBoundSearch");
  681. m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
  682. m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS); //,&sum );
  683. }
  684. if (nContacts)
  685. { // 5. sort constraints by cellIdx
  686. if (gReorderContactsOnCpu)
  687. {
  688. B3_PROFILE("cpu m_reorderContactKernel");
  689. b3AlignedObjectArray<b3SortData> sortDataHost;
  690. m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
  691. b3AlignedObjectArray<b3Contact4> inContacts;
  692. b3AlignedObjectArray<b3Contact4> outContacts;
  693. m_data->m_pBufContactOutGPU->copyToHost(inContacts);
  694. outContacts.resize(inContacts.size());
  695. for (int i = 0; i < nContacts; i++)
  696. {
  697. int srcIdx = sortDataHost[i].y;
  698. outContacts[i] = inContacts[srcIdx];
  699. }
  700. m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
  701. /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
  702. "{\n"
  703. " int nContacts = cb.x;\n"
  704. " int gIdx = GET_GLOBAL_IDX;\n"
  705. " if( gIdx < nContacts )\n"
  706. " {\n"
  707. " int srcIdx = sortData[gIdx].y;\n"
  708. " out[gIdx] = in[srcIdx];\n"
  709. " }\n"
  710. "}\n"
  711. */
  712. }
  713. else
  714. {
  715. B3_PROFILE("gpu m_reorderContactKernel");
  716. b3Int4 cdata;
  717. cdata.x = nContacts;
  718. b3BufferInfoCL bInfo[] = {
  719. b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()),
  720. b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
  721. b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
  722. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  723. launcher.setConst(cdata);
  724. launcher.launch1D(nContacts, 64);
  725. }
  726. }
  727. }
  728. }
  729. //clFinish(m_data->m_queue);
  730. // {
  731. // b3AlignedObjectArray<unsigned int> histogram;
  732. // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
  733. // printf(",,,\n");
  734. // }
  735. if (nContacts)
  736. {
  737. if (gUseCpuCopyConstraints)
  738. {
  739. for (int i = 0; i < nContacts; i++)
  740. {
  741. m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
  742. // m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
  743. // m_data->m_pBufContactOutGPU->getBufferCL()
  744. }
  745. }
  746. else
  747. {
  748. B3_PROFILE("gpu m_copyConstraintKernel");
  749. b3Int4 cdata;
  750. cdata.x = nContacts;
  751. b3BufferInfoCL bInfo[] = {
  752. b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()),
  753. b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())};
  754. b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel");
  755. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  756. launcher.setConst(cdata);
  757. launcher.launch1D(nContacts, 64);
  758. //we use the clFinish for proper benchmark/profile
  759. clFinish(m_data->m_queue);
  760. }
  761. }
  762. // bool compareGPU = false;
  763. if (nContacts)
  764. {
  765. if (!gCpuBatchContacts)
  766. {
  767. B3_PROFILE("gpu batchContacts");
  768. maxNumBatches = 250; //250;
  769. m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx);
  770. clFinish(m_data->m_queue);
  771. }
  772. else
  773. {
  774. B3_PROFILE("cpu batchContacts");
  775. static b3AlignedObjectArray<b3Contact4> cpuContacts;
  776. b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
  777. {
  778. B3_PROFILE("copyToHost");
  779. contactsIn->copyToHost(cpuContacts);
  780. }
  781. b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
  782. b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
  783. b3AlignedObjectArray<unsigned int> nNativeHost;
  784. b3AlignedObjectArray<unsigned int> offsetsNativeHost;
  785. {
  786. B3_PROFILE("countsNative/offsetsNative copyToHost");
  787. countsNative->copyToHost(nNativeHost);
  788. offsetsNative->copyToHost(offsetsNativeHost);
  789. }
  790. int numNonzeroGrid = 0;
  791. if (gUseLargeBatches)
  792. {
  793. m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
  794. int totalNumConstraints = cpuContacts.size();
  795. //int simdWidth =numBodies+1;//-1;//64;//-1;//32;
  796. int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU
  797. maxNumBatches = b3Max(numBatches, maxNumBatches);
  798. static int globalMaxBatch = 0;
  799. if (maxNumBatches > globalMaxBatch)
  800. {
  801. globalMaxBatch = maxNumBatches;
  802. b3Printf("maxNumBatches = %d\n", maxNumBatches);
  803. }
  804. }
  805. else
  806. {
  807. m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES);
  808. B3_PROFILE("cpu batch grid");
  809. for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
  810. {
  811. int n = (nNativeHost)[i];
  812. int offset = (offsetsNativeHost)[i];
  813. if (n)
  814. {
  815. numNonzeroGrid++;
  816. int simdWidth = numBodies + 1; //-1;//64;//-1;//32;
  817. int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * B3_MAX_NUM_BATCHES]); // on GPU
  818. maxNumBatches = b3Max(numBatches, maxNumBatches);
  819. static int globalMaxBatch = 0;
  820. if (maxNumBatches > globalMaxBatch)
  821. {
  822. globalMaxBatch = maxNumBatches;
  823. b3Printf("maxNumBatches = %d\n", maxNumBatches);
  824. }
  825. //we use the clFinish for proper benchmark/profile
  826. }
  827. }
  828. //clFinish(m_data->m_queue);
  829. }
  830. {
  831. B3_PROFILE("m_contactBuffer->copyFromHost");
  832. m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
  833. }
  834. }
  835. }
  836. }
  837. }
  838. //printf("maxNumBatches = %d\n", maxNumBatches);
  839. if (gUseLargeBatches)
  840. {
  841. if (nContacts)
  842. {
  843. B3_PROFILE("cpu batchContacts");
  844. static b3AlignedObjectArray<b3Contact4> cpuContacts;
  845. // b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
  846. {
  847. B3_PROFILE("copyToHost");
  848. m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
  849. }
  850. // b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
  851. // b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
  852. // int numNonzeroGrid=0;
  853. {
  854. m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
  855. int totalNumConstraints = cpuContacts.size();
  856. // int simdWidth =numBodies+1;//-1;//64;//-1;//32;
  857. int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU
  858. maxNumBatches = b3Max(numBatches, maxNumBatches);
  859. static int globalMaxBatch = 0;
  860. if (maxNumBatches > globalMaxBatch)
  861. {
  862. globalMaxBatch = maxNumBatches;
  863. b3Printf("maxNumBatches = %d\n", maxNumBatches);
  864. }
  865. }
  866. {
  867. B3_PROFILE("m_contactBuffer->copyFromHost");
  868. m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
  869. }
  870. }
  871. }
  872. if (nContacts)
  873. {
  874. B3_PROFILE("gpu convertToConstraints");
  875. m_data->m_solverGPU->convertToConstraints(bodyBuf,
  876. shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
  877. contactConstraintOut,
  878. additionalData, nContacts,
  879. (b3SolverBase::ConstraintCfg&)csCfg);
  880. clFinish(m_data->m_queue);
  881. }
  882. if (1)
  883. {
  884. int numIter = 4;
  885. m_data->m_solverGPU->m_nIterations = numIter; //10
  886. if (!gCpuSolveConstraint)
  887. {
  888. B3_PROFILE("GPU solveContactConstraint");
  889. /*m_data->m_solverGPU->solveContactConstraint(
  890. m_data->m_bodyBufferGPU,
  891. m_data->m_inertiaBufferGPU,
  892. m_data->m_contactCGPU,0,
  893. nContactOut ,
  894. maxNumBatches);
  895. */
  896. //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
  897. if (gUseLargeBatches)
  898. {
  899. solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU,
  900. m_data->m_inertiaBufferGPU,
  901. m_data->m_contactCGPU, 0,
  902. nContactOut,
  903. maxNumBatches, numIter, &m_data->m_batchSizes);
  904. }
  905. else
  906. {
  907. solveContactConstraint(
  908. m_data->m_bodyBufferGPU,
  909. m_data->m_inertiaBufferGPU,
  910. m_data->m_contactCGPU, 0,
  911. nContactOut,
  912. maxNumBatches, numIter, &m_data->m_batchSizes); //m_data->m_batchSizesGpu);
  913. }
  914. }
  915. else
  916. {
  917. B3_PROFILE("Host solveContactConstraint");
  918. m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes);
  919. }
  920. }
  921. #if 0
  922. if (0)
  923. {
  924. B3_PROFILE("read body velocities back to CPU");
  925. //read body updated linear/angular velocities back to CPU
  926. m_data->m_bodyBufferGPU->read(
  927. m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
  928. adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
  929. }
  930. #endif
  931. }
  932. }
  933. void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx)
  934. {
  935. }
  936. b3AlignedObjectArray<unsigned int> idxBuffer;
  937. b3AlignedObjectArray<b3SortData> sortData;
  938. b3AlignedObjectArray<b3Contact4> old;
  939. inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
  940. {
  941. B3_PROFILE("sortConstraintByBatch");
  942. int numIter = 0;
  943. sortData.resize(n);
  944. idxBuffer.resize(n);
  945. old.resize(n);
  946. unsigned int* idxSrc = &idxBuffer[0];
  947. unsigned int* idxDst = &idxBuffer[0];
  948. int nIdxSrc, nIdxDst;
  949. const int N_FLG = 256;
  950. const int FLG_MASK = N_FLG - 1;
  951. unsigned int flg[N_FLG / 32];
  952. #if defined(_DEBUG)
  953. for (int i = 0; i < n; i++)
  954. cs[i].getBatchIdx() = -1;
  955. #endif
  956. for (int i = 0; i < n; i++)
  957. idxSrc[i] = i;
  958. nIdxSrc = n;
  959. int batchIdx = 0;
  960. {
  961. B3_PROFILE("cpu batch innerloop");
  962. while (nIdxSrc)
  963. {
  964. numIter++;
  965. nIdxDst = 0;
  966. int nCurrentBatch = 0;
  967. // clear flag
  968. for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
  969. for (int i = 0; i < nIdxSrc; i++)
  970. {
  971. int idx = idxSrc[i];
  972. b3Assert(idx < n);
  973. // check if it can go
  974. int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
  975. int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
  976. int bodyA = abs(bodyAS);
  977. int bodyB = abs(bodyBS);
  978. int aIdx = bodyA & FLG_MASK;
  979. int bIdx = bodyB & FLG_MASK;
  980. unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
  981. unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));
  982. bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
  983. bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
  984. //use inv_mass!
  985. aUnavailable = !aIsStatic ? aUnavailable : 0; //
  986. bUnavailable = !bIsStatic ? bUnavailable : 0;
  987. if (aUnavailable == 0 && bUnavailable == 0) // ok
  988. {
  989. if (!aIsStatic)
  990. flg[aIdx / 32] |= (1 << (aIdx & 31));
  991. if (!bIsStatic)
  992. flg[bIdx / 32] |= (1 << (bIdx & 31));
  993. cs[idx].getBatchIdx() = batchIdx;
  994. sortData[idx].m_key = batchIdx;
  995. sortData[idx].m_value = idx;
  996. {
  997. nCurrentBatch++;
  998. if (nCurrentBatch == simdWidth)
  999. {
  1000. nCurrentBatch = 0;
  1001. for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
  1002. }
  1003. }
  1004. }
  1005. else
  1006. {
  1007. idxDst[nIdxDst++] = idx;
  1008. }
  1009. }
  1010. b3Swap(idxSrc, idxDst);
  1011. b3Swap(nIdxSrc, nIdxDst);
  1012. batchIdx++;
  1013. }
  1014. }
  1015. {
  1016. B3_PROFILE("quickSort");
  1017. sortData.quickSort(sortfnc);
  1018. }
  1019. {
  1020. B3_PROFILE("reorder");
  1021. // reorder
  1022. memcpy(&old[0], cs, sizeof(b3Contact4) * n);
  1023. for (int i = 0; i < n; i++)
  1024. {
  1025. int idx = sortData[i].m_value;
  1026. cs[i] = old[idx];
  1027. }
  1028. }
  1029. #if defined(_DEBUG)
  1030. // debugPrintf( "nBatches: %d\n", batchIdx );
  1031. for (int i = 0; i < n; i++)
  1032. {
  1033. b3Assert(cs[i].getBatchIdx() != -1);
  1034. }
  1035. #endif
  1036. return batchIdx;
  1037. }
  1038. b3AlignedObjectArray<int> bodyUsed2;
  1039. inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
  1040. {
  1041. B3_PROFILE("sortConstraintByBatch2");
  1042. bodyUsed2.resize(2 * simdWidth);
  1043. for (int q = 0; q < 2 * simdWidth; q++)
  1044. bodyUsed2[q] = 0;
  1045. int curBodyUsed = 0;
  1046. int numIter = 0;
  1047. m_data->m_sortData.resize(numConstraints);
  1048. m_data->m_idxBuffer.resize(numConstraints);
  1049. m_data->m_old.resize(numConstraints);
  1050. unsigned int* idxSrc = &m_data->m_idxBuffer[0];
  1051. #if defined(_DEBUG)
  1052. for (int i = 0; i < numConstraints; i++)
  1053. cs[i].getBatchIdx() = -1;
  1054. #endif
  1055. for (int i = 0; i < numConstraints; i++)
  1056. idxSrc[i] = i;
  1057. int numValidConstraints = 0;
  1058. // int unprocessedConstraintIndex = 0;
  1059. int batchIdx = 0;
  1060. {
  1061. B3_PROFILE("cpu batch innerloop");
  1062. while (numValidConstraints < numConstraints)
  1063. {
  1064. numIter++;
  1065. int nCurrentBatch = 0;
  1066. // clear flag
  1067. for (int i = 0; i < curBodyUsed; i++)
  1068. bodyUsed2[i] = 0;
  1069. curBodyUsed = 0;
  1070. for (int i = numValidConstraints; i < numConstraints; i++)
  1071. {
  1072. int idx = idxSrc[i];
  1073. b3Assert(idx < numConstraints);
  1074. // check if it can go
  1075. int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
  1076. int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
  1077. int bodyA = abs(bodyAS);
  1078. int bodyB = abs(bodyBS);
  1079. bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
  1080. bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
  1081. int aUnavailable = 0;
  1082. int bUnavailable = 0;
  1083. if (!aIsStatic)
  1084. {
  1085. for (int j = 0; j < curBodyUsed; j++)
  1086. {
  1087. if (bodyA == bodyUsed2[j])
  1088. {
  1089. aUnavailable = 1;
  1090. break;
  1091. }
  1092. }
  1093. }
  1094. if (!aUnavailable)
  1095. if (!bIsStatic)
  1096. {
  1097. for (int j = 0; j < curBodyUsed; j++)
  1098. {
  1099. if (bodyB == bodyUsed2[j])
  1100. {
  1101. bUnavailable = 1;
  1102. break;
  1103. }
  1104. }
  1105. }
  1106. if (aUnavailable == 0 && bUnavailable == 0) // ok
  1107. {
  1108. if (!aIsStatic)
  1109. {
  1110. bodyUsed2[curBodyUsed++] = bodyA;
  1111. }
  1112. if (!bIsStatic)
  1113. {
  1114. bodyUsed2[curBodyUsed++] = bodyB;
  1115. }
  1116. cs[idx].getBatchIdx() = batchIdx;
  1117. m_data->m_sortData[idx].m_key = batchIdx;
  1118. m_data->m_sortData[idx].m_value = idx;
  1119. if (i != numValidConstraints)
  1120. {
  1121. b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
  1122. }
  1123. numValidConstraints++;
  1124. {
  1125. nCurrentBatch++;
  1126. if (nCurrentBatch == simdWidth)
  1127. {
  1128. nCurrentBatch = 0;
  1129. for (int i = 0; i < curBodyUsed; i++)
  1130. bodyUsed2[i] = 0;
  1131. curBodyUsed = 0;
  1132. }
  1133. }
  1134. }
  1135. }
  1136. batchIdx++;
  1137. }
  1138. }
  1139. {
  1140. B3_PROFILE("quickSort");
  1141. //m_data->m_sortData.quickSort(sortfnc);
  1142. }
  1143. {
  1144. B3_PROFILE("reorder");
  1145. // reorder
  1146. memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);
  1147. for (int i = 0; i < numConstraints; i++)
  1148. {
  1149. b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
  1150. int idx = m_data->m_sortData[idxSrc[i]].m_value;
  1151. cs[i] = m_data->m_old[idx];
  1152. }
  1153. }
  1154. #if defined(_DEBUG)
  1155. // debugPrintf( "nBatches: %d\n", batchIdx );
  1156. for (int i = 0; i < numConstraints; i++)
  1157. {
  1158. b3Assert(cs[i].getBatchIdx() != -1);
  1159. }
  1160. #endif
  1161. return batchIdx;
  1162. }
  1163. b3AlignedObjectArray<int> bodyUsed;
  1164. b3AlignedObjectArray<int> curUsed;
  1165. inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes)
  1166. {
  1167. B3_PROFILE("sortConstraintByBatch3");
  1168. static int maxSwaps = 0;
  1169. int numSwaps = 0;
  1170. curUsed.resize(2 * simdWidth);
  1171. static int maxNumConstraints = 0;
  1172. if (maxNumConstraints < numConstraints)
  1173. {
  1174. maxNumConstraints = numConstraints;
  1175. //printf("maxNumConstraints = %d\n",maxNumConstraints );
  1176. }
  1177. int numUsedArray = numBodies / 32 + 1;
  1178. bodyUsed.resize(numUsedArray);
  1179. for (int q = 0; q < numUsedArray; q++)
  1180. bodyUsed[q] = 0;
  1181. int curBodyUsed = 0;
  1182. int numIter = 0;
  1183. m_data->m_sortData.resize(0);
  1184. m_data->m_idxBuffer.resize(0);
  1185. m_data->m_old.resize(0);
  1186. #if defined(_DEBUG)
  1187. for (int i = 0; i < numConstraints; i++)
  1188. cs[i].getBatchIdx() = -1;
  1189. #endif
  1190. int numValidConstraints = 0;
  1191. // int unprocessedConstraintIndex = 0;
  1192. int batchIdx = 0;
  1193. {
  1194. B3_PROFILE("cpu batch innerloop");
  1195. while (numValidConstraints < numConstraints)
  1196. {
  1197. numIter++;
  1198. int nCurrentBatch = 0;
  1199. batchSizes[batchIdx] = 0;
  1200. // clear flag
  1201. for (int i = 0; i < curBodyUsed; i++)
  1202. bodyUsed[curUsed[i] / 32] = 0;
  1203. curBodyUsed = 0;
  1204. for (int i = numValidConstraints; i < numConstraints; i++)
  1205. {
  1206. int idx = i;
  1207. b3Assert(idx < numConstraints);
  1208. // check if it can go
  1209. int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
  1210. int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
  1211. int bodyA = abs(bodyAS);
  1212. int bodyB = abs(bodyBS);
  1213. bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
  1214. bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
  1215. int aUnavailable = 0;
  1216. int bUnavailable = 0;
  1217. if (!aIsStatic)
  1218. {
  1219. aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
  1220. }
  1221. if (!aUnavailable)
  1222. if (!bIsStatic)
  1223. {
  1224. bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
  1225. }
  1226. if (aUnavailable == 0 && bUnavailable == 0) // ok
  1227. {
  1228. if (!aIsStatic)
  1229. {
  1230. bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
  1231. curUsed[curBodyUsed++] = bodyA;
  1232. }
  1233. if (!bIsStatic)
  1234. {
  1235. bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
  1236. curUsed[curBodyUsed++] = bodyB;
  1237. }
  1238. cs[idx].getBatchIdx() = batchIdx;
  1239. if (i != numValidConstraints)
  1240. {
  1241. b3Swap(cs[i], cs[numValidConstraints]);
  1242. numSwaps++;
  1243. }
  1244. numValidConstraints++;
  1245. {
  1246. nCurrentBatch++;
  1247. if (nCurrentBatch == simdWidth)
  1248. {
  1249. batchSizes[batchIdx] += simdWidth;
  1250. nCurrentBatch = 0;
  1251. for (int i = 0; i < curBodyUsed; i++)
  1252. bodyUsed[curUsed[i] / 32] = 0;
  1253. curBodyUsed = 0;
  1254. }
  1255. }
  1256. }
  1257. }
  1258. if (batchIdx >= B3_MAX_NUM_BATCHES)
  1259. {
  1260. b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
  1261. b3Assert(0);
  1262. break;
  1263. }
  1264. batchSizes[batchIdx] += nCurrentBatch;
  1265. batchIdx++;
  1266. }
  1267. }
  1268. #if defined(_DEBUG)
  1269. // debugPrintf( "nBatches: %d\n", batchIdx );
  1270. for (int i = 0; i < numConstraints; i++)
  1271. {
  1272. b3Assert(cs[i].getBatchIdx() != -1);
  1273. }
  1274. #endif
  1275. batchSizes[batchIdx] = 0;
  1276. if (maxSwaps < numSwaps)
  1277. {
  1278. maxSwaps = numSwaps;
  1279. //printf("maxSwaps = %d\n", maxSwaps);
  1280. }
  1281. return batchIdx;
  1282. }