clusterizer.cpp 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978
  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
  2. #include "meshoptimizer.h"
  3. #include <assert.h>
  4. #include <float.h>
  5. #include <math.h>
  6. #include <string.h>
  7. // This work is based on:
  8. // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
  9. // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
  10. // Jack Ritter. An Efficient Bounding Sphere. 1990
  11. namespace meshopt
  12. {
  // Hard cap on max_vertices: this must be <= 255 since index 0xff is used internally to indicate a vertex that doesn't belong to a meshlet
  const size_t kMeshletMaxVertices = 255;

  // Hard cap on max_triangles; a reasonable limit is around 2*max_vertices or less
  const size_t kMeshletMaxTriangles = 512;
  // Vertex -> triangle adjacency stored as three flat arrays:
  // the triangles touching vertex v are data[offsets[v]] .. data[offsets[v] + counts[v] - 1]
  struct TriangleAdjacency2
  {
      unsigned int* counts;  // per vertex: number of triangle corners that reference it
      unsigned int* offsets; // per vertex: start of its triangle list inside data
      unsigned int* data;    // triangle indices, grouped by vertex
  };
  23. static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
  24. {
  25. size_t face_count = index_count / 3;
  26. // allocate arrays
  27. adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
  28. adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
  29. adjacency.data = allocator.allocate<unsigned int>(index_count);
  30. // fill triangle counts
  31. memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
  32. for (size_t i = 0; i < index_count; ++i)
  33. {
  34. assert(indices[i] < vertex_count);
  35. adjacency.counts[indices[i]]++;
  36. }
  37. // fill offset table
  38. unsigned int offset = 0;
  39. for (size_t i = 0; i < vertex_count; ++i)
  40. {
  41. adjacency.offsets[i] = offset;
  42. offset += adjacency.counts[i];
  43. }
  44. assert(offset == index_count);
  45. // fill triangle data
  46. for (size_t i = 0; i < face_count; ++i)
  47. {
  48. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  49. adjacency.data[adjacency.offsets[a]++] = unsigned(i);
  50. adjacency.data[adjacency.offsets[b]++] = unsigned(i);
  51. adjacency.data[adjacency.offsets[c]++] = unsigned(i);
  52. }
  53. // fix offsets that have been disturbed by the previous pass
  54. for (size_t i = 0; i < vertex_count; ++i)
  55. {
  56. assert(adjacency.offsets[i] >= adjacency.counts[i]);
  57. adjacency.offsets[i] -= adjacency.counts[i];
  58. }
  59. }
  // Computes an approximate bounding sphere of 'count' points (count must be > 0).
  // Writes the center to result[0..2] and the radius to result[3].
  static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
  {
      assert(count > 0);

      // for every axis, remember which point has the smallest / largest coordinate
      size_t lo[3] = {0, 0, 0};
      size_t hi[3] = {0, 0, 0};

      for (size_t i = 0; i < count; ++i)
      {
          for (int axis = 0; axis < 3; ++axis)
          {
              float v = points[i][axis];

              if (v < points[lo[axis]][axis])
                  lo[axis] = i;

              if (v > points[hi[axis]][axis])
                  hi[axis] = i;
          }
      }

      // among the three extremal pairs, pick the one that is farthest apart
      float best_d2 = 0;
      int best_axis = 0;

      for (int axis = 0; axis < 3; ++axis)
      {
          const float* a = points[lo[axis]];
          const float* b = points[hi[axis]];

          float dx = b[0] - a[0];
          float dy = b[1] - a[1];
          float dz = b[2] - a[2];
          float d2 = dx * dx + dy * dy + dz * dz;

          if (d2 > best_d2)
          {
              best_d2 = d2;
              best_axis = axis;
          }
      }

      // the initial sphere has that pair as its diameter
      const float* a = points[lo[best_axis]];
      const float* b = points[hi[best_axis]];

      float cx = (a[0] + b[0]) / 2;
      float cy = (a[1] + b[1]) / 2;
      float cz = (a[2] + b[2]) / 2;
      float radius = sqrtf(best_d2) / 2;

      // grow the sphere just enough to include every point that is still outside
      for (size_t i = 0; i < count; ++i)
      {
          const float* p = points[i];

          float dx = p[0] - cx;
          float dy = p[1] - cy;
          float dz = p[2] - cz;
          float d2 = dx * dx + dy * dy + dz * dz;

          // written as a negated '>' so that NaN distances are skipped
          if (!(d2 > radius * radius))
              continue;

          float d = sqrtf(d2);
          assert(d > 0);

          // move the center towards p and enlarge the radius so that p ends up on the surface
          float k = 0.5f + (radius / d) / 2;

          cx = cx * k + p[0] * (1 - k);
          cy = cy * k + p[1] * (1 - k);
          cz = cz * k + p[2] * (1 - k);
          radius = (radius + d) / 2;
      }

      result[0] = cx;
      result[1] = cy;
      result[2] = cz;
      result[3] = radius;
  }
  // Position + direction pair; holds a triangle's centroid and unit normal,
  // and is also used to accumulate / average these over a meshlet
  struct Cone
  {
      // position
      float px;
      float py;
      float pz;

      // direction
      float nx;
      float ny;
      float nz;
  };
  // Scores a candidate triangle against the current meshlet; lower is better.
  // distance2 is the squared centroid distance, spread is the dot product between the
  // triangle normal and the meshlet axis, cone_weight blends distance against orientation.
  static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
  {
      // orientation term, kept strictly positive so it never cancels the distance term
      float cone = 1.f - spread * cone_weight;

      if (cone < 1e-3f)
          cone = 1e-3f;

      // distance term, normalized by the expected meshlet radius
      float distance_term = 1 + sqrtf(distance2) / expected_radius * (1 - cone_weight);

      return distance_term * cone;
  }
  126. static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
  127. {
  128. Cone result = acc;
  129. float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);
  130. result.px *= center_scale;
  131. result.py *= center_scale;
  132. result.pz *= center_scale;
  133. float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;
  134. float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);
  135. result.nx *= axis_scale;
  136. result.ny *= axis_scale;
  137. result.nz *= axis_scale;
  138. return result;
  139. }
  140. static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
  141. {
  142. (void)vertex_count;
  143. size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
  144. size_t face_count = index_count / 3;
  145. float mesh_area = 0;
  146. for (size_t i = 0; i < face_count; ++i)
  147. {
  148. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  149. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  150. const float* p0 = vertex_positions + vertex_stride_float * a;
  151. const float* p1 = vertex_positions + vertex_stride_float * b;
  152. const float* p2 = vertex_positions + vertex_stride_float * c;
  153. float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
  154. float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
  155. float normalx = p10[1] * p20[2] - p10[2] * p20[1];
  156. float normaly = p10[2] * p20[0] - p10[0] * p20[2];
  157. float normalz = p10[0] * p20[1] - p10[1] * p20[0];
  158. float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
  159. float invarea = (area == 0.f) ? 0.f : 1.f / area;
  160. triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
  161. triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
  162. triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;
  163. triangles[i].nx = normalx * invarea;
  164. triangles[i].ny = normaly * invarea;
  165. triangles[i].nz = normalz * invarea;
  166. mesh_area += area;
  167. }
  168. return mesh_area;
  169. }
  170. static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
  171. {
  172. size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
  173. // fill 4b padding with 0
  174. while (offset & 3)
  175. meshlet_triangles[offset++] = 0;
  176. }
  177. static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
  178. {
  179. unsigned char& av = used[a];
  180. unsigned char& bv = used[b];
  181. unsigned char& cv = used[c];
  182. bool result = false;
  183. int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
  184. if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
  185. {
  186. meshlets[meshlet_offset] = meshlet;
  187. for (size_t j = 0; j < meshlet.vertex_count; ++j)
  188. used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
  189. finishMeshlet(meshlet, meshlet_triangles);
  190. meshlet.vertex_offset += meshlet.vertex_count;
  191. meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
  192. meshlet.vertex_count = 0;
  193. meshlet.triangle_count = 0;
  194. result = true;
  195. }
  196. if (av == 0xff)
  197. {
  198. av = (unsigned char)meshlet.vertex_count;
  199. meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
  200. }
  201. if (bv == 0xff)
  202. {
  203. bv = (unsigned char)meshlet.vertex_count;
  204. meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
  205. }
  206. if (cv == 0xff)
  207. {
  208. cv = (unsigned char)meshlet.vertex_count;
  209. meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
  210. }
  211. meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
  212. meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
  213. meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
  214. meshlet.triangle_count++;
  215. return result;
  216. }
  217. static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight)
  218. {
  219. unsigned int best_triangle = ~0u;
  220. int best_priority = 5;
  221. float best_score = FLT_MAX;
  222. for (size_t i = 0; i < meshlet.vertex_count; ++i)
  223. {
  224. unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
  225. unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
  226. size_t neighbors_size = adjacency.counts[index];
  227. for (size_t j = 0; j < neighbors_size; ++j)
  228. {
  229. unsigned int triangle = neighbors[j];
  230. unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
  231. int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
  232. assert(extra <= 2);
  233. int priority = -1;
  234. // triangles that don't add new vertices to meshlets are max. priority
  235. if (extra == 0)
  236. priority = 0;
  237. // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
  238. else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
  239. priority = 1;
  240. // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
  241. else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
  242. priority = 1 + extra;
  243. // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
  244. else
  245. priority = 2 + extra;
  246. // since topology-based priority is always more important than the score, we can skip scoring in some cases
  247. if (priority > best_priority)
  248. continue;
  249. float score = 0;
  250. // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
  251. if (meshlet_cone)
  252. {
  253. const Cone& tri_cone = triangles[triangle];
  254. float distance2 =
  255. (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
  256. (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
  257. (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
  258. float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
  259. score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
  260. }
  261. else
  262. {
  263. // each live_triangles entry is >= 1 since it includes the current triangle we're processing
  264. score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
  265. }
  266. // note that topology-based priority is always more important than the score
  267. // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
  268. if (priority < best_priority || score < best_score)
  269. {
  270. best_triangle = triangle;
  271. best_priority = priority;
  272. best_score = score;
  273. }
  274. }
  275. }
  276. return best_triangle;
  277. }
  // Node of a kd-tree stored as a flat array in depth-first order
  struct KDNode
  {
      union
      {
          float split;        // branches: coordinate of the splitting plane along 'axis'
          unsigned int index; // leaves: index of the point stored in this node
      };

      // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
      // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
      unsigned int axis : 2;
      unsigned int children : 30;
  };
  // Reorders indices[0..count) so that points whose 'axis' coordinate is < pivot come first.
  // Returns the number of such points, i.e. the position of the first element that is >= pivot.
  static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
  {
      // everything in [0, boundary) is < pivot, everything in [boundary, i) is >= pivot
      size_t boundary = 0;

      for (size_t i = 0; i < count; ++i)
      {
          unsigned int current = indices[i];
          float value = points[current * stride + axis];

          // exchange the current element with the one at the boundary regardless of the comparison
          indices[i] = indices[boundary];
          indices[boundary] = current;

          // only a smaller element moves the boundary past it
          if (value < pivot)
              ++boundary;
      }

      return boundary;
  }
  306. static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
  307. {
  308. assert(offset + count <= node_count);
  309. (void)node_count;
  310. KDNode& result = nodes[offset];
  311. result.index = indices[0];
  312. result.axis = 3;
  313. result.children = unsigned(count - 1);
  314. // all remaining points are stored in nodes immediately following the leaf
  315. for (size_t i = 1; i < count; ++i)
  316. {
  317. KDNode& tail = nodes[offset + i];
  318. tail.index = indices[i];
  319. tail.axis = 3;
  320. tail.children = ~0u >> 2; // bogus value to prevent misuse
  321. }
  322. return offset + count;
  323. }
  324. static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
  325. {
  326. assert(count > 0);
  327. assert(offset < node_count);
  328. if (count <= leaf_size)
  329. return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
  330. float mean[3] = {};
  331. float vars[3] = {};
  332. float runc = 1, runs = 1;
  333. // gather statistics on the points in the subtree using Welford's algorithm
  334. for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
  335. {
  336. const float* point = points + indices[i] * stride;
  337. for (int k = 0; k < 3; ++k)
  338. {
  339. float delta = point[k] - mean[k];
  340. mean[k] += delta * runs;
  341. vars[k] += delta * (point[k] - mean[k]);
  342. }
  343. }
  344. // split axis is one where the variance is largest
  345. unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
  346. float split = mean[axis];
  347. size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
  348. // when the partition is degenerate simply consolidate the points into a single node
  349. if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
  350. return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
  351. KDNode& result = nodes[offset];
  352. result.split = split;
  353. result.axis = axis;
  354. // left subtree is right after our node
  355. size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
  356. // distance to the right subtree is represented explicitly
  357. result.children = unsigned(next_offset - offset - 1);
  358. return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
  359. }
  360. static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
  361. {
  362. const KDNode& node = nodes[root];
  363. if (node.axis == 3)
  364. {
  365. // leaf
  366. for (unsigned int i = 0; i <= node.children; ++i)
  367. {
  368. unsigned int index = nodes[root + i].index;
  369. if (emitted_flags[index])
  370. continue;
  371. const float* point = points + index * stride;
  372. float distance2 =
  373. (point[0] - position[0]) * (point[0] - position[0]) +
  374. (point[1] - position[1]) * (point[1] - position[1]) +
  375. (point[2] - position[2]) * (point[2] - position[2]);
  376. float distance = sqrtf(distance2);
  377. if (distance < limit)
  378. {
  379. result = index;
  380. limit = distance;
  381. }
  382. }
  383. }
  384. else
  385. {
  386. // branch; we order recursion to process the node that search position is in first
  387. float delta = position[node.axis] - node.split;
  388. unsigned int first = (delta <= 0) ? 0 : node.children;
  389. unsigned int second = first ^ node.children;
  390. kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
  391. // only process the other node if it can have a match based on closest distance so far
  392. if (fabsf(delta) <= limit)
  393. kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
  394. }
  395. }
  396. } // namespace meshopt
  397. size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
  398. {
  399. using namespace meshopt;
  400. assert(index_count % 3 == 0);
  401. assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
  402. assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
  403. assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
  404. (void)kMeshletMaxVertices;
  405. (void)kMeshletMaxTriangles;
  406. // meshlet construction is limited by max vertices and max triangles per meshlet
  407. // the worst case is that the input is an unindexed stream since this equally stresses both limits
  408. // note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
  409. size_t max_vertices_conservative = max_vertices - 2;
  410. size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
  411. size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
  412. return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
  413. }
  414. size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
  415. {
  416. using namespace meshopt;
  417. assert(index_count % 3 == 0);
  418. assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
  419. assert(vertex_positions_stride % sizeof(float) == 0);
  420. assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
  421. assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
  422. assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
  423. assert(cone_weight >= 0 && cone_weight <= 1);
  424. meshopt_Allocator allocator;
  425. TriangleAdjacency2 adjacency = {};
  426. buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
  427. unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
  428. memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
  429. size_t face_count = index_count / 3;
  430. unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
  431. memset(emitted_flags, 0, face_count);
  432. // for each triangle, precompute centroid & normal to use for scoring
  433. Cone* triangles = allocator.allocate<Cone>(face_count);
  434. float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
  435. // assuming each meshlet is a square patch, expected radius is sqrt(expected area)
  436. float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
  437. float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;
  438. // build a kd-tree for nearest neighbor lookup
  439. unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
  440. for (size_t i = 0; i < face_count; ++i)
  441. kdindices[i] = unsigned(i);
  442. KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
  443. kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
  444. // index of the vertex in the meshlet, 0xff if the vertex isn't used
  445. unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
  446. memset(used, -1, vertex_count);
  447. meshopt_Meshlet meshlet = {};
  448. size_t meshlet_offset = 0;
  449. Cone meshlet_cone_acc = {};
  450. for (;;)
  451. {
  452. Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
  453. unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
  454. int best_extra = best_triangle == ~0u ? -1 : (used[indices[best_triangle * 3 + 0]] == 0xff) + (used[indices[best_triangle * 3 + 1]] == 0xff) + (used[indices[best_triangle * 3 + 2]] == 0xff);
  455. // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
  456. if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
  457. {
  458. best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f);
  459. }
  460. // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
  461. if (best_triangle == ~0u)
  462. {
  463. float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
  464. unsigned int index = ~0u;
  465. float limit = FLT_MAX;
  466. kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
  467. best_triangle = index;
  468. }
  469. if (best_triangle == ~0u)
  470. break;
  471. unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
  472. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  473. // add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
  474. if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
  475. {
  476. meshlet_offset++;
  477. memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
  478. }
  479. live_triangles[a]--;
  480. live_triangles[b]--;
  481. live_triangles[c]--;
  482. // remove emitted triangle from adjacency data
  483. // this makes sure that we spend less time traversing these lists on subsequent iterations
  484. for (size_t k = 0; k < 3; ++k)
  485. {
  486. unsigned int index = indices[best_triangle * 3 + k];
  487. unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
  488. size_t neighbors_size = adjacency.counts[index];
  489. for (size_t i = 0; i < neighbors_size; ++i)
  490. {
  491. unsigned int tri = neighbors[i];
  492. if (tri == best_triangle)
  493. {
  494. neighbors[i] = neighbors[neighbors_size - 1];
  495. adjacency.counts[index]--;
  496. break;
  497. }
  498. }
  499. }
  500. // update aggregated meshlet cone data for scoring subsequent triangles
  501. meshlet_cone_acc.px += triangles[best_triangle].px;
  502. meshlet_cone_acc.py += triangles[best_triangle].py;
  503. meshlet_cone_acc.pz += triangles[best_triangle].pz;
  504. meshlet_cone_acc.nx += triangles[best_triangle].nx;
  505. meshlet_cone_acc.ny += triangles[best_triangle].ny;
  506. meshlet_cone_acc.nz += triangles[best_triangle].nz;
  507. emitted_flags[best_triangle] = 1;
  508. }
  509. if (meshlet.triangle_count)
  510. {
  511. finishMeshlet(meshlet, meshlet_triangles);
  512. meshlets[meshlet_offset++] = meshlet;
  513. }
  514. assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
  515. return meshlet_offset;
  516. }
  517. size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
  518. {
  519. using namespace meshopt;
  520. assert(index_count % 3 == 0);
  521. assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
  522. assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
  523. assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
  524. meshopt_Allocator allocator;
  525. // index of the vertex in the meshlet, 0xff if the vertex isn't used
  526. unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
  527. memset(used, -1, vertex_count);
  528. meshopt_Meshlet meshlet = {};
  529. size_t meshlet_offset = 0;
  530. for (size_t i = 0; i < index_count; i += 3)
  531. {
  532. unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
  533. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  534. // appends triangle to the meshlet and writes previous meshlet to the output if full
  535. meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
  536. }
  537. if (meshlet.triangle_count)
  538. {
  539. finishMeshlet(meshlet, meshlet_triangles);
  540. meshlets[meshlet_offset++] = meshlet;
  541. }
  542. assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
  543. return meshlet_offset;
  544. }
  545. meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
  546. {
  547. using namespace meshopt;
  548. assert(index_count % 3 == 0);
  549. assert(index_count / 3 <= kMeshletMaxTriangles);
  550. assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
  551. assert(vertex_positions_stride % sizeof(float) == 0);
  552. (void)vertex_count;
  553. size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
  554. // compute triangle normals and gather triangle corners
  555. float normals[kMeshletMaxTriangles][3];
  556. float corners[kMeshletMaxTriangles][3][3];
  557. size_t triangles = 0;
  558. for (size_t i = 0; i < index_count; i += 3)
  559. {
  560. unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
  561. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  562. const float* p0 = vertex_positions + vertex_stride_float * a;
  563. const float* p1 = vertex_positions + vertex_stride_float * b;
  564. const float* p2 = vertex_positions + vertex_stride_float * c;
  565. float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
  566. float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
  567. float normalx = p10[1] * p20[2] - p10[2] * p20[1];
  568. float normaly = p10[2] * p20[0] - p10[0] * p20[2];
  569. float normalz = p10[0] * p20[1] - p10[1] * p20[0];
  570. float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
  571. // no need to include degenerate triangles - they will be invisible anyway
  572. if (area == 0.f)
  573. continue;
  574. // record triangle normals & corners for future use; normal and corner 0 define a plane equation
  575. normals[triangles][0] = normalx / area;
  576. normals[triangles][1] = normaly / area;
  577. normals[triangles][2] = normalz / area;
  578. memcpy(corners[triangles][0], p0, 3 * sizeof(float));
  579. memcpy(corners[triangles][1], p1, 3 * sizeof(float));
  580. memcpy(corners[triangles][2], p2, 3 * sizeof(float));
  581. triangles++;
  582. }
  583. meshopt_Bounds bounds = {};
  584. // degenerate cluster, no valid triangles => trivial reject (cone data is 0)
  585. if (triangles == 0)
  586. return bounds;
  587. // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
  588. float psphere[4] = {};
  589. computeBoundingSphere(psphere, corners[0], triangles * 3);
  590. float center[3] = {psphere[0], psphere[1], psphere[2]};
  591. // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
  592. float nsphere[4] = {};
  593. computeBoundingSphere(nsphere, normals, triangles);
  594. float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
  595. float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
  596. float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;
  597. axis[0] *= invaxislength;
  598. axis[1] *= invaxislength;
  599. axis[2] *= invaxislength;
  600. // compute a tight cone around all normals, mindp = cos(angle/2)
  601. float mindp = 1.f;
  602. for (size_t i = 0; i < triangles; ++i)
  603. {
  604. float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
  605. mindp = (dp < mindp) ? dp : mindp;
  606. }
  607. // fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
  608. bounds.center[0] = center[0];
  609. bounds.center[1] = center[1];
  610. bounds.center[2] = center[2];
  611. bounds.radius = psphere[3];
  612. // degenerate cluster, normal cone is larger than a hemisphere => trivial accept
  613. // note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
  614. // we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
  615. if (mindp <= 0.1f)
  616. {
  617. bounds.cone_cutoff = 1;
  618. bounds.cone_cutoff_s8 = 127;
  619. return bounds;
  620. }
  621. float maxt = 0;
  622. // we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
  623. for (size_t i = 0; i < triangles; ++i)
  624. {
  625. // dot(center-t*axis-corner, trinormal) = 0
  626. // dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
  627. float cx = center[0] - corners[i][0][0];
  628. float cy = center[1] - corners[i][0][1];
  629. float cz = center[2] - corners[i][0][2];
  630. float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
  631. float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
  632. // dn should be larger than mindp cutoff above
  633. assert(dn > 0.f);
  634. float t = dc / dn;
  635. maxt = (t > maxt) ? t : maxt;
  636. }
  637. // cone apex should be in the negative half-space of all cluster triangles by construction
  638. bounds.cone_apex[0] = center[0] - axis[0] * maxt;
  639. bounds.cone_apex[1] = center[1] - axis[1] * maxt;
  640. bounds.cone_apex[2] = center[2] - axis[2] * maxt;
  641. // note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
  642. bounds.cone_axis[0] = axis[0];
  643. bounds.cone_axis[1] = axis[1];
  644. bounds.cone_axis[2] = axis[2];
  645. // cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
  646. // which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
  647. bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
  648. // quantize axis & cutoff to 8-bit SNORM format
  649. bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
  650. bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
  651. bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
  652. // for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
  653. float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
  654. float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
  655. float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
  656. // note that we need to round this up instead of rounding to nearest, hence +1
  657. int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
  658. bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);
  659. return bounds;
  660. }
  661. meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
  662. {
  663. using namespace meshopt;
  664. assert(triangle_count <= kMeshletMaxTriangles);
  665. assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
  666. assert(vertex_positions_stride % sizeof(float) == 0);
  667. unsigned int indices[kMeshletMaxTriangles * 3];
  668. for (size_t i = 0; i < triangle_count * 3; ++i)
  669. {
  670. unsigned int index = meshlet_vertices[meshlet_triangles[i]];
  671. assert(index < vertex_count);
  672. indices[i] = index;
  673. }
  674. return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
  675. }
  676. void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
  677. {
  678. using namespace meshopt;
  679. assert(triangle_count <= kMeshletMaxTriangles);
  680. assert(vertex_count <= kMeshletMaxVertices);
  681. unsigned char* indices = meshlet_triangles;
  682. unsigned int* vertices = meshlet_vertices;
  683. // cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed)
  684. unsigned char cache[kMeshletMaxVertices];
  685. memset(cache, 0, vertex_count);
  686. // note that we start from a value that means all vertices aren't in cache
  687. unsigned char cache_last = 128;
  688. const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse
  689. for (size_t i = 0; i < triangle_count; ++i)
  690. {
  691. int next = -1;
  692. int next_match = -1;
  693. for (size_t j = i; j < triangle_count; ++j)
  694. {
  695. unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2];
  696. assert(a < vertex_count && b < vertex_count && c < vertex_count);
  697. // score each triangle by how many vertices are in cache
  698. // note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully
  699. int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff;
  700. int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff;
  701. int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff;
  702. if (aok + bok + cok > next_match)
  703. {
  704. next = (int)j;
  705. next_match = aok + bok + cok;
  706. // note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal
  707. if (next_match >= 2)
  708. break;
  709. }
  710. }
  711. assert(next >= 0);
  712. unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2];
  713. // shift triangles before the next one forward so that we always keep an ordered partition
  714. // note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence
  715. memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char));
  716. indices[i * 3 + 0] = a;
  717. indices[i * 3 + 1] = b;
  718. indices[i * 3 + 2] = c;
  719. // cache timestamp is the same between all vertices of each triangle to reduce overflow
  720. cache_last++;
  721. cache[a] = cache_last;
  722. cache[b] = cache_last;
  723. cache[c] = cache_last;
  724. }
  725. // reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
  726. unsigned int order[kMeshletMaxVertices];
  727. unsigned char remap[kMeshletMaxVertices];
  728. memset(remap, -1, vertex_count);
  729. size_t vertex_offset = 0;
  730. for (size_t i = 0; i < triangle_count * 3; ++i)
  731. {
  732. unsigned char& r = remap[indices[i]];
  733. if (r == 0xff)
  734. {
  735. r = (unsigned char)(vertex_offset);
  736. order[vertex_offset] = vertices[indices[i]];
  737. vertex_offset++;
  738. }
  739. indices[i] = r;
  740. }
  741. assert(vertex_offset <= vertex_count);
  742. memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
  743. }