// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <math.h>
#include <string.h>

// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{
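
// Computes one sort value per cluster: the dot product between the cluster's accumulated normal
// and the vector from the mesh centroid to the area-weighted cluster centroid. Outward-facing
// clusters produce larger values and end up earlier in the draw order, since they tend to occlude
// the rest of the mesh (calculateSortOrderRadix flips the sign when building sort keys).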
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
    size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

    float mesh_centroid[3] = {};

    for (size_t i = 0; i < index_count; ++i)
    {
        const float* p = vertex_positions + vertex_stride_float * indices[i];

        mesh_centroid[0] += p[0];
        mesh_centroid[1] += p[1];
        mesh_centroid[2] += p[2];
    }

    mesh_centroid[0] /= index_count;
    mesh_centroid[1] /= index_count;
    mesh_centroid[2] /= index_count;

    for (size_t cluster = 0; cluster < cluster_count; ++cluster)
    {
        size_t cluster_begin = clusters[cluster] * 3;
        size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
        assert(cluster_begin < cluster_end);

        float cluster_area = 0;
        float cluster_centroid[3] = {};
        float cluster_normal[3] = {};

        for (size_t i = cluster_begin; i < cluster_end; i += 3)
        {
            const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
            const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
            const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

            float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
            float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

            float normalx = p10[1] * p20[2] - p10[2] * p20[1];
            float normaly = p10[2] * p20[0] - p10[0] * p20[2];
            float normalz = p10[0] * p20[1] - p10[1] * p20[0];

            float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

            cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
            cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
            cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
            cluster_normal[0] += normalx;
            cluster_normal[1] += normaly;
            cluster_normal[2] += normalz;
            cluster_area += area;
        }

        float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

        cluster_centroid[0] *= inv_cluster_area;
        cluster_centroid[1] *= inv_cluster_area;
        cluster_centroid[2] *= inv_cluster_area;

        float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
        float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

        cluster_normal[0] *= inv_cluster_normal_length;
        cluster_normal[1] *= inv_cluster_normal_length;
        cluster_normal[2] *= inv_cluster_normal_length;

        float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

        sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
    }
}
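
// Produces sort_order such that clusters are visited in order of decreasing sort value.
// Keys are quantized to 11 bits and ordered with a single counting-sort pass, which keeps
// the sort stable and linear in the cluster count.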
static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
    // compute sort data bounds and renormalize, using fixed point snorm
    float sort_data_max = 1e-3f;

    for (size_t i = 0; i < cluster_count; ++i)
    {
        float dpa = fabsf(sort_data[i]);

        sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
    }

    const int sort_bits = 11;

    for (size_t i = 0; i < cluster_count; ++i)
    {
        // note that we flip distribution since high dot product should come first
        float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);

        sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
    }

    // fill histogram for counting sort
    unsigned int histogram[1 << sort_bits];
    memset(histogram, 0, sizeof(histogram));

    for (size_t i = 0; i < cluster_count; ++i)
    {
        histogram[sort_keys[i]]++;
    }

    // compute offsets based on histogram data
    size_t histogram_sum = 0;

    for (size_t i = 0; i < 1 << sort_bits; ++i)
    {
        size_t count = histogram[i];
        histogram[i] = unsigned(histogram_sum);
        histogram_sum += count;
    }

    assert(histogram_sum == cluster_count);

    // compute sort order based on offsets
    for (size_t i = 0; i < cluster_count; ++i)
    {
        sort_order[histogram[sort_keys[i]]++] = unsigned(i);
    }
}
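
// Simulates a FIFO vertex cache using per-vertex timestamps: a vertex whose insertion timestamp
// is more than cache_size ticks old is treated as evicted, and timestamps are only updated on
// insertion. Returns the number of cache misses (0..3) incurred by the triangle (a, b, c).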
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
    unsigned int cache_misses = 0;

    // if vertex is not in cache, put it in cache
    if (timestamp - cache_timestamps[a] > cache_size)
    {
        cache_timestamps[a] = timestamp++;
        cache_misses++;
    }

    if (timestamp - cache_timestamps[b] > cache_size)
    {
        cache_timestamps[b] = timestamp++;
        cache_misses++;
    }

    if (timestamp - cache_timestamps[c] > cache_size)
    {
        cache_timestamps[c] = timestamp++;
        cache_misses++;
    }

    return cache_misses;
}
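
// Splits the index sequence into "hard" clusters: a new cluster starts whenever a triangle
// misses the simulated vertex cache on all three vertices, which usually indicates the start
// of a disjoint patch in the cache-optimized ordering. Writes cluster start positions (in
// triangles) to destination and returns the cluster count.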
static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
    memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

    unsigned int timestamp = cache_size + 1;

    size_t face_count = index_count / 3;

    size_t result = 0;

    for (size_t i = 0; i < face_count; ++i)
    {
        unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

        // when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
        // that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
        // suggests an inefficiency in the vertex cache optimization algorithm
        // usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
        if (i == 0 || m == 3)
        {
            destination[result++] = unsigned(i);
        }
    }

    assert(result <= index_count / 3);

    return result;
}
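
// Subdivides each hard cluster into "soft" clusters by re-simulating the vertex cache within the
// cluster: a new boundary is emitted whenever the running ACMR (cache misses per triangle) drops
// to threshold times the cluster's overall ACMR, giving the overdraw sort smaller units to reorder
// while bounding the loss in vertex cache efficiency.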
static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
    memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

    unsigned int timestamp = 0;

    size_t result = 0;

    for (size_t it = 0; it < cluster_count; ++it)
    {
        size_t start = clusters[it];
        size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
        assert(start < end);

        // reset cache
        timestamp += cache_size + 1;

        // measure cluster ACMR
        unsigned int cluster_misses = 0;

        for (size_t i = start; i < end; ++i)
        {
            unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

            cluster_misses += m;
        }

        float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));

        // first cluster always starts from the hard cluster boundary
        destination[result++] = unsigned(start);

        // reset cache
        timestamp += cache_size + 1;

        unsigned int running_misses = 0;
        unsigned int running_faces = 0;

        for (size_t i = start; i < end; ++i)
        {
            unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

            running_misses += m;
            running_faces += 1;

            if (float(running_misses) / float(running_faces) <= cluster_threshold)
            {
                // we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
                // note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
                // cluster is empty; however, the 'pop_back' after the loop will clean it up
                destination[result++] = unsigned(i + 1);

                // reset cache
                timestamp += cache_size + 1;

                running_misses = 0;
                running_faces = 0;
            }
        }

        // each time we reach the target ACMR we flush the cluster
        // this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
        // in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
        // thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
        // there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
        // to the cluster boundary array which we need to remove anyway - this code will do that automatically
        if (destination[result - 1] != start)
        {
            result--;
        }
    }

    assert(result >= cluster_count);
    assert(result <= index_count / 3);

    return result;
}
} // namespace meshopt
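
// Reorders indices to reduce overdraw: the index sequence is split into cache-friendly clusters,
// clusters are sorted by their occlusion potential (see calculateSortData), and the reordered
// triangles are written to destination. 'threshold' controls how much vertex cache efficiency is
// allowed to degrade in exchange for better ordering (e.g. 1.05 allows ACMR to get ~5% worse).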
void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
    using namespace meshopt;

    assert(index_count % 3 == 0);
    assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
    assert(vertex_positions_stride % sizeof(float) == 0);

    meshopt_Allocator allocator;

    // guard for empty meshes
    if (index_count == 0 || vertex_count == 0)
        return;

    // support in-place optimization
    if (destination == indices)
    {
        unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
        memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
        indices = indices_copy;
    }

    unsigned int cache_size = 16;

    unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);

    // generate hard boundaries from full-triangle cache misses
    unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
    size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);

    // generate soft boundaries
    unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
    size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);

    const unsigned int* clusters = soft_clusters;
    size_t cluster_count = soft_cluster_count;

    // fill sort data
    float* sort_data = allocator.allocate<float>(cluster_count);
    calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);

    // sort clusters using sort data
    unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
    unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
    calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);

    // fill output buffer
    size_t offset = 0;

    for (size_t it = 0; it < cluster_count; ++it)
    {
        unsigned int cluster = sort_order[it];
        assert(cluster < cluster_count);

        size_t cluster_begin = clusters[cluster] * 3;
        size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
        assert(cluster_begin < cluster_end);

        memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
        offset += cluster_end - cluster_begin;
    }

    assert(offset == index_count);
}
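
// Example usage (illustrative sketch, not part of this file): overdraw optimization is typically
// run after meshopt_optimizeVertexCache and before meshopt_optimizeVertexFetch. 'Vertex' is a
// hypothetical caller-side struct whose first three floats are the position.
//
//   meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
//   meshopt_optimizeOverdraw(indices, indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), 1.05f);
//   meshopt_optimizeVertexFetch(vertices, indices, index_count, vertices, vertex_count, sizeof(Vertex));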