From 3b870eaa2ef09f6011f720cd3e6cb888c88f3599 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 09:56:30 -0800 Subject: [PATCH 1/7] Rename clusterizer.cpp to meshletutils.cpp This is an auxiliary commit needed to trick git into preserving blame post-file split. --- src/{clusterizer.cpp => meshletutils.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{clusterizer.cpp => meshletutils.cpp} (100%) diff --git a/src/clusterizer.cpp b/src/meshletutils.cpp similarity index 100% rename from src/clusterizer.cpp rename to src/meshletutils.cpp From 208d4c8358a912fa27c32fb446d682f6c16d5fd0 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 10:02:12 -0800 Subject: [PATCH 2/7] meshletutils: Remove parts of original source This change still keeps some of the clusterizer helper functions, because doing a complete cleanup in one commit confuses git diff. --- src/meshletutils.cpp | 499 ------------------------------------------- 1 file changed, 499 deletions(-) diff --git a/src/meshletutils.cpp b/src/meshletutils.cpp index f1c8c0820..546ea39a4 100644 --- a/src/meshletutils.cpp +++ b/src/meshletutils.cpp @@ -6,17 +6,6 @@ #include #include -// The block below auto-detects SIMD ISA that can be used on the target platform -#ifndef MESHOPTIMIZER_NO_SIMD -#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64)) -#define SIMD_SSE -#include -#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922) -#define SIMD_NEON -#include -#endif -#endif // !MESHOPTIMIZER_NO_SIMD - // This work is based on: // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 @@ -39,140 +28,6 @@ const size_t kMeshletAddSeeds = 4; // To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree const int kMeshletMaxTreeDepth = 50; -struct TriangleAdjacency2 -{ - unsigned int* counts; - unsigned int* offsets; - unsigned int* data; -}; - -static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) -{ - size_t face_count = index_count / 3; - - // allocate arrays - adjacency.counts = allocator.allocate(vertex_count); - adjacency.offsets = allocator.allocate(vertex_count); - adjacency.data = allocator.allocate(index_count); - - // fill triangle counts - memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int)); - - for (size_t i = 0; i < index_count; ++i) - { - assert(indices[i] < vertex_count); - - adjacency.counts[indices[i]]++; - } - - // fill offset table - unsigned int offset = 0; - - for (size_t i = 0; i < vertex_count; ++i) - { - adjacency.offsets[i] = offset; - offset += adjacency.counts[i]; - } - - assert(offset == index_count); - - // fill triangle data - for (size_t i = 0; i < face_count; ++i) - { - unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; - - adjacency.data[adjacency.offsets[a]++] = unsigned(i); - adjacency.data[adjacency.offsets[b]++] = unsigned(i); - adjacency.data[adjacency.offsets[c]++] = unsigned(i); - } - - // fix offsets that have been disturbed by the previous pass - for (size_t i = 0; i < vertex_count; ++i) - { - assert(adjacency.offsets[i] >= adjacency.counts[i]); - adjacency.offsets[i] -= adjacency.counts[i]; - } -} - -static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) -{ - size_t face_count = index_count / 3; - - // sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices - const unsigned int sparse_seen = 1u << 31; - assert(index_count < sparse_seen); - - // allocate arrays - adjacency.counts = allocator.allocate(vertex_count); - adjacency.offsets = allocator.allocate(vertex_count); - adjacency.data = allocator.allocate(index_count); - - // fill triangle counts - for (size_t i = 0; i < index_count; ++i) - assert(indices[i] < vertex_count); - - for (size_t i = 0; i < index_count; ++i) - adjacency.counts[indices[i]] = 0; - - for (size_t i = 0; i < index_count; ++i) - adjacency.counts[indices[i]]++; - - // fill offset table; uses sparse_seen bit to tag visited vertices - unsigned int offset = 0; - - for (size_t i = 0; i < index_count; ++i) - { - unsigned int v = indices[i]; - - if ((adjacency.counts[v] & sparse_seen) == 0) - { - adjacency.offsets[v] = offset; - offset += adjacency.counts[v]; - adjacency.counts[v] |= sparse_seen; - } - } - - assert(offset == index_count); - - // fill triangle data - for (size_t i = 0; i < face_count; ++i) - { - unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; - - adjacency.data[adjacency.offsets[a]++] = unsigned(i); - adjacency.data[adjacency.offsets[b]++] = unsigned(i); - adjacency.data[adjacency.offsets[c]++] = unsigned(i); - } - - // fix offsets that have been disturbed by the previous pass - // also fix counts (that were marked with sparse_seen by the first pass) - for (size_t i = 0; i < index_count; ++i) - { - unsigned int v = indices[i]; - - if (adjacency.counts[v] & sparse_seen) - { - adjacency.counts[v] &= ~sparse_seen; - - assert(adjacency.offsets[v] >= adjacency.counts[v]); - adjacency.offsets[v] -= adjacency.counts[v]; - } - } -} - -static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count) -{ - // for sparse inputs, it's faster to only clear vertices referenced by the index buffer - if (vertex_count <= index_count) - memset(used, -1, vertex_count * sizeof(short)); - else - for (size_t i = 0; i < index_count; ++i) - { - assert(indices[i] < vertex_count); - used[indices[i]] = -1; - } -} - static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) { static const float kAxes[7][3] = { @@ -1125,357 +980,6 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or } // namespace meshopt -size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles) -{ - using namespace meshopt; - - assert(index_count % 3 == 0); - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - - (void)kMeshletMaxVertices; - (void)kMeshletMaxTriangles; - - // meshlet construction is limited by max vertices and max triangles per meshlet - // the worst case is that the input is an unindexed stream since this equally stresses both limits - // note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle - size_t max_vertices_conservative = max_vertices - 2; - size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative; - size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles; - - return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles; -} - -size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor) -{ - using namespace meshopt; - - assert(index_count % 3 == 0); - assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); - - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); - - assert(cone_weight >= 0 && cone_weight <= 1); - assert(split_factor >= 0); - - if (index_count == 0) - return 0; - - meshopt_Allocator allocator; - - TriangleAdjacency2 adjacency = {}; - if (vertex_count > index_count && index_count < (1u << 31)) - buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator); - else - buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); - - // live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match - unsigned int* live_triangles = adjacency.counts; - - size_t face_count = index_count / 3; - - unsigned char* emitted_flags = allocator.allocate(face_count); - memset(emitted_flags, 0, face_count); - - // for each triangle, precompute centroid & normal to use for scoring - Cone* triangles = allocator.allocate(face_count); - float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); - - // assuming each meshlet is a square patch, expected radius is sqrt(expected area) - float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f; - float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f; - - // build a kd-tree for nearest neighbor lookup - unsigned int* kdindices = allocator.allocate(face_count); - for (size_t i = 0; i < face_count; ++i) - kdindices[i] = unsigned(i); - - KDNode* nodes = allocator.allocate(face_count * 2); - kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0); - - // find a specific corner of the mesh to use as a starting point for meshlet flow - float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX; - - for (size_t i = 0; i < face_count; ++i) - { - const Cone& tri = triangles[i]; - - cornerx = cornerx > tri.px ? tri.px : cornerx; - cornery = cornery > tri.py ? tri.py : cornery; - cornerz = cornerz > tri.pz ? tri.pz : cornerz; - } - - // index of the vertex in the meshlet, -1 if the vertex isn't used - short* used = allocator.allocate(vertex_count); - clearUsed(used, vertex_count, indices, index_count); - - // initial seed triangle is the one closest to the corner - unsigned int initial_seed = ~0u; - float initial_score = FLT_MAX; - - for (size_t i = 0; i < face_count; ++i) - { - const Cone& tri = triangles[i]; - - float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz; - float score = sqrtf(dx * dx + dy * dy + dz * dz); - - if (initial_seed == ~0u || score < initial_score) - { - initial_seed = unsigned(i); - initial_score = score; - } - } - - // seed triangles to continue meshlet flow - unsigned int seeds[kMeshletMaxSeeds] = {}; - size_t seed_count = 0; - - meshopt_Meshlet meshlet = {}; - size_t meshlet_offset = 0; - - Cone meshlet_cone_acc = {}; - - for (;;) - { - Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - - unsigned int best_triangle = ~0u; - - // for the first triangle, we don't have a meshlet cone yet, so we use the initial seed - // to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring - if (meshlet_offset == 0 && meshlet.triangle_count == 0) - best_triangle = initial_seed; - else - best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight); - - bool split = false; - - // when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity - if (best_triangle == ~0u) - { - float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz}; - unsigned int index = ~0u; - float distance = FLT_MAX; - - kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance); - - best_triangle = index; - split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor; - } - - if (best_triangle == ~0u) - break; - - int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0); - - // if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow - if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles)) - { - seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags); - seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds; - seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz); - - unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz); - - // we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency - best_triangle = best_seed != ~0u ? best_seed : best_triangle; - } - - unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - // add meshlet to the output; when the current meshlet is full we reset the accumulated bounds - if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split)) - { - meshlet_offset++; - memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc)); - } - - // remove emitted triangle from adjacency data - // this makes sure that we spend less time traversing these lists on subsequent iterations - // live triangle counts are updated as a byproduct of these adjustments - for (size_t k = 0; k < 3; ++k) - { - unsigned int index = indices[best_triangle * 3 + k]; - - unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbors_size = adjacency.counts[index]; - - for (size_t i = 0; i < neighbors_size; ++i) - { - unsigned int tri = neighbors[i]; - - if (tri == best_triangle) - { - neighbors[i] = neighbors[neighbors_size - 1]; - adjacency.counts[index]--; - break; - } - } - } - - // update aggregated meshlet cone data for scoring subsequent triangles - meshlet_cone_acc.px += triangles[best_triangle].px; - meshlet_cone_acc.py += triangles[best_triangle].py; - meshlet_cone_acc.pz += triangles[best_triangle].pz; - meshlet_cone_acc.nx += triangles[best_triangle].nx; - meshlet_cone_acc.ny += triangles[best_triangle].ny; - meshlet_cone_acc.nz += triangles[best_triangle].nz; - - assert(!emitted_flags[best_triangle]); - emitted_flags[best_triangle] = 1; - } - - if (meshlet.triangle_count) - meshlets[meshlet_offset++] = meshlet; - - assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles)); - assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); - return meshlet_offset; -} - -size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) -{ - return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f); -} - -size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) -{ - using namespace meshopt; - - assert(index_count % 3 == 0); - - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - - meshopt_Allocator allocator; - - // index of the vertex in the meshlet, -1 if the vertex isn't used - short* used = allocator.allocate(vertex_count); - clearUsed(used, vertex_count, indices, index_count); - - meshopt_Meshlet meshlet = {}; - size_t meshlet_offset = 0; - - for (size_t i = 0; i < index_count; i += 3) - { - unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - // appends triangle to the meshlet and writes previous meshlet to the output if full - meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles); - } - - if (meshlet.triangle_count) - meshlets[meshlet_offset++] = meshlet; - - assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); - assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); - return meshlet_offset; -} - -size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) -{ - using namespace meshopt; - - assert(index_count % 3 == 0); - assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); - - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); - - if (index_count == 0) - return 0; - - size_t face_count = index_count / 3; - size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - - meshopt_Allocator allocator; - - // 3 floats plus 1 uint for sorting, or - // 2 floats plus 1 uint for pivoting, or - // 1 uint plus 1 byte for partitioning - float* scratch = allocator.allocate(face_count * 4); - - // compute bounding boxes and centroids for sorting - BVHBox* boxes = allocator.allocate(face_count + 1); // padding for SIMD - bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float); - memset(boxes + face_count, 0, sizeof(BVHBox)); - - unsigned int* axes = allocator.allocate(face_count * 3); - unsigned int* temp = reinterpret_cast(scratch) + face_count * 3; - - for (int k = 0; k < 3; ++k) - { - unsigned int* order = axes + k * face_count; - const float* keys = scratch + k * face_count; - - unsigned int hist[1024][3]; - computeHistogram(hist, keys, face_count); - - // 3-pass radix sort computes the resulting order into axes - for (size_t i = 0; i < face_count; ++i) - temp[i] = unsigned(i); - - radixPass(order, temp, keys, face_count, hist, 0); - radixPass(temp, order, keys, face_count, hist, 1); - radixPass(order, temp, keys, face_count, hist, 2); - } - - // index of the vertex in the meshlet, -1 if the vertex isn't used - short* used = allocator.allocate(vertex_count); - clearUsed(used, vertex_count, indices, index_count); - - unsigned char* boundary = allocator.allocate(face_count); - - bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); - - // compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound - size_t meshlet_count = 0; - for (size_t i = 0; i < face_count; ++i) - { - assert(boundary[i] <= 1); - meshlet_count += boundary[i]; - } - - size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles); - - // pack triangles into meshlets according to the order and boundaries marked by bvhSplit - meshopt_Meshlet meshlet = {}; - size_t meshlet_offset = 0; - size_t meshlet_pending = meshlet_count; - - for (size_t i = 0; i < face_count; ++i) - { - assert(boundary[i] <= 1); - bool split = i > 0 && boundary[i] == 1; - - // while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space - if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound) - split = false; - - unsigned int index = axes[i]; - assert(index < face_count); - - unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; - - // appends triangle to the meshlet and writes previous meshlet to the output if full - meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split); - meshlet_pending -= boundary[i]; - } - - if (meshlet.triangle_count) - meshlets[meshlet_offset++] = meshlet; - - assert(meshlet_offset <= meshlet_bound); - assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); - return meshlet_offset; -} - meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { using namespace meshopt; @@ -1817,6 +1321,3 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh assert(vertex_offset <= vertex_count); memcpy(vertices, order, vertex_offset * sizeof(unsigned int)); } - -#undef SIMD_SSE -#undef SIMD_NEON From 4b93449ff8cdc816246c7a0628d1233bc09d4ee2 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 10:03:30 -0800 Subject: [PATCH 3/7] meshletutils: Remove the rest of clusterizer source The file is now self-contained and only contains meshlet utilities: bounding sphere and bounds computation as well as meshlet optimizer. --- src/meshletutils.cpp | 847 ------------------------------------------- 1 file changed, 847 deletions(-) diff --git a/src/meshletutils.cpp b/src/meshletutils.cpp index 546ea39a4..12c4d8405 100644 --- a/src/meshletutils.cpp +++ b/src/meshletutils.cpp @@ -21,13 +21,6 @@ const size_t kMeshletMaxVertices = 256; // A reasonable limit is around 2*max_vertices or less const size_t kMeshletMaxTriangles = 512; -// We keep a limited number of seed triangles and add a few triangles per finished meshlet -const size_t kMeshletMaxSeeds = 256; -const size_t kMeshletAddSeeds = 4; - -// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree -const int kMeshletMaxTreeDepth = 50; - static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) { static const float kAxes[7][3] = { @@ -138,846 +131,6 @@ static void computeBoundingSphere(float result[4], const float* points, size_t c result[3] = radius; } -struct Cone -{ - float px, py, pz; - float nx, ny, nz; -}; - -static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius) -{ - float cone = 1.f - spread * cone_weight; - float cone_clamped = cone < 1e-3f ? 1e-3f : cone; - - return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped; -} - -static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count) -{ - Cone result = acc; - - float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count); - - result.px *= center_scale; - result.py *= center_scale; - result.pz *= center_scale; - - float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz; - float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length); - - result.nx *= axis_scale; - result.ny *= axis_scale; - result.nz *= axis_scale; - - return result; -} - -static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) -{ - (void)vertex_count; - - size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - size_t face_count = index_count / 3; - - float mesh_area = 0; - - for (size_t i = 0; i < face_count; ++i) - { - unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - const float* p0 = vertex_positions + vertex_stride_float * a; - const float* p1 = vertex_positions + vertex_stride_float * b; - const float* p2 = vertex_positions + vertex_stride_float * c; - - float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; - float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; - - float normalx = p10[1] * p20[2] - p10[2] * p20[1]; - float normaly = p10[2] * p20[0] - p10[0] * p20[2]; - float normalz = p10[0] * p20[1] - p10[1] * p20[0]; - - float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); - float invarea = (area == 0.f) ? 0.f : 1.f / area; - - triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f; - triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f; - triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f; - - triangles[i].nx = normalx * invarea; - triangles[i].ny = normaly * invarea; - triangles[i].nz = normalz * invarea; - - mesh_area += area; - } - - return mesh_area; -} - -static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false) -{ - short& av = used[a]; - short& bv = used[b]; - short& cv = used[c]; - - bool result = false; - - int used_extra = (av < 0) + (bv < 0) + (cv < 0); - - if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split) - { - meshlets[meshlet_offset] = meshlet; - - for (size_t j = 0; j < meshlet.vertex_count; ++j) - used[meshlet_vertices[meshlet.vertex_offset + j]] = -1; - - meshlet.vertex_offset += meshlet.vertex_count; - meshlet.triangle_offset += meshlet.triangle_count * 3; - meshlet.vertex_count = 0; - meshlet.triangle_count = 0; - - result = true; - } - - if (av < 0) - { - av = short(meshlet.vertex_count); - meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a; - } - - if (bv < 0) - { - bv = short(meshlet.vertex_count); - meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b; - } - - if (cv < 0) - { - cv = short(meshlet.vertex_count); - meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c; - } - - meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av; - meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv; - meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv; - meshlet.triangle_count++; - - return result; -} - -static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight) -{ - unsigned int best_triangle = ~0u; - int best_priority = 5; - float best_score = FLT_MAX; - - for (size_t i = 0; i < meshlet.vertex_count; ++i) - { - unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; - - unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbors_size = adjacency.counts[index]; - - for (size_t j = 0; j < neighbors_size; ++j) - { - unsigned int triangle = neighbors[j]; - unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - - int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0); - assert(extra <= 2); - - int priority = -1; - - // triangles that don't add new vertices to meshlets are max. priority - if (extra == 0) - priority = 0; - // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets - else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) - priority = 1; - // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow - else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2) - priority = 1 + extra; - // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count - else - priority = 2 + extra; - - // since topology-based priority is always more important than the score, we can skip scoring in some cases - if (priority > best_priority) - continue; - - const Cone& tri_cone = triangles[triangle]; - - float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz; - float distance = sqrtf(dx * dx + dy * dy + dz * dz); - float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; - - float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius); - - // note that topology-based priority is always more important than the score - // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost - if (priority < best_priority || score < best_score) - { - best_triangle = triangle; - best_priority = priority; - best_score = score; - } - } - } - - return best_triangle; -} - -static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz) -{ - unsigned int best_seeds[kMeshletAddSeeds]; - unsigned int best_live[kMeshletAddSeeds]; - float best_score[kMeshletAddSeeds]; - - for (size_t i = 0; i < kMeshletAddSeeds; ++i) - { - best_seeds[i] = ~0u; - best_live[i] = ~0u; - best_score[i] = FLT_MAX; - } - - for (size_t i = 0; i < meshlet.vertex_count; ++i) - { - unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; - - unsigned int best_neighbor = ~0u; - unsigned int best_neighbor_live = ~0u; - - // find the neighbor with the smallest live metric - unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbors_size = adjacency.counts[index]; - - for (size_t j = 0; j < neighbors_size; ++j) - { - unsigned int triangle = neighbors[j]; - unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - - unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c]; - - if (live < best_neighbor_live) - { - best_neighbor = triangle; - best_neighbor_live = live; - } - } - - // add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate - if (best_neighbor == ~0u) - continue; - - float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz; - float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz); - - for (size_t j = 0; j < kMeshletAddSeeds; ++j) - { - // non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices) - if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j])) - { - best_seeds[j] = best_neighbor; - best_live[j] = best_neighbor_live; - best_score[j] = best_neighbor_score; - break; - } - } - } - - // add surviving seeds to the meshlet - size_t seed_count = 0; - - for (size_t i = 0; i < kMeshletAddSeeds; ++i) - if (best_seeds[i] != ~0u) - seeds[seed_count++] = best_seeds[i]; - - return seed_count; -} - -static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags) -{ - size_t result = 0; - - for (size_t i = 0; i < seed_count; ++i) - { - unsigned int index = seeds[i]; - - seeds[result] = index; - result += emitted_flags[index] == 0; - } - - return result; -} - -static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz) -{ - unsigned int best_seed = ~0u; - unsigned int best_live = ~0u; - float best_score = FLT_MAX; - - for (size_t i = 0; i < seed_count; ++i) - { - unsigned int index = seeds[i]; - unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; - - unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c]; - float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz; - float score = sqrtf(dx * dx + dy * dy + dz * dz); - - if (live < best_live || (live == best_live && score < best_score)) - { - best_seed = index; - best_live = live; - best_score = score; - } - } - - return best_seed; -} - -struct KDNode -{ - union - { - float split; - unsigned int index; - }; - - // leaves: axis = 3, children = number of points including this one - // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children - unsigned int axis : 2; - unsigned int children : 30; -}; - -static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot) -{ - size_t m = 0; - - // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot - for (size_t i = 0; i < count; ++i) - { - float v = points[indices[i] * stride + axis]; - - // swap(m, i) unconditionally - unsigned int t = indices[m]; - indices[m] = indices[i]; - indices[i] = t; - - // when v >= pivot, we swap i with m without advancing it, preserving invariants - m += v < pivot; - } - - return m; -} - -static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count) -{ - assert(offset + count <= node_count); - (void)node_count; - - KDNode& result = nodes[offset]; - - result.index = indices[0]; - result.axis = 3; - result.children = unsigned(count); - - // all remaining points are stored in nodes immediately following the leaf - for (size_t i = 1; i < count; ++i) - { - KDNode& tail = nodes[offset + i]; - - tail.index = indices[i]; - tail.axis = 3; - tail.children = ~0u >> 2; // bogus value to prevent misuse - } - - return offset + count; -} - -static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth) -{ - assert(count > 0); - assert(offset < node_count); - - if (count <= leaf_size) - return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); - - float mean[3] = {}; - float vars[3] = {}; - float runc = 1, runs = 1; - - // gather statistics on the points in the subtree using Welford's algorithm - for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) - { - const float* point = points + indices[i] * stride; - - for (int k = 0; k < 3; ++k) - { - float delta = point[k] - mean[k]; - mean[k] += delta * runs; - vars[k] += delta * (point[k] - mean[k]); - } - } - - // split axis is one where the variance is largest - int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); - - float split = mean[axis]; - size_t middle = kdtreePartition(indices, count, points, stride, axis, split); - - // when the partition is degenerate simply consolidate the points into a single node - // this also ensures recursion depth is bounded on pathological inputs - if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth) - return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); - - KDNode& result = nodes[offset]; - - result.split = split; - result.axis = axis; - - // left subtree is right after our node - size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1); - - // distance to the right subtree is represented explicitly - assert(next_offset - offset > 1); - result.children = unsigned(next_offset - offset - 1); - - return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1); -} - -static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit) -{ - const KDNode& node = nodes[root]; - - if (node.children == 0) - return; - - if (node.axis == 3) - { - // leaf - bool inactive = true; - - for (unsigned int i = 0; i < node.children; ++i) - { - unsigned int index = nodes[root + i].index; - - if (emitted_flags[index]) - continue; - - inactive = false; - - const float* point = points + index * stride; - - float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2]; - float distance = sqrtf(dx * dx + dy * dy + dz * dz); - - if (distance < limit) - { - result = index; - limit = distance; - } - } - - // deactivate leaves that no longer have items to emit - if (inactive) - nodes[root].children = 0; - } - else - { - // branch; we order recursion to process the node that search position is in first - float delta = position[node.axis] - node.split; - unsigned int first = (delta <= 0) ? 0 : node.children; - unsigned int second = first ^ node.children; - - // deactivate branches that no longer have items to emit to accelerate traversal - // note that we do this *before* recursing which delays deactivation but keeps tail calls - if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0) - nodes[root].children = 0; - - // recursion depth is bounded by tree depth (which is limited by construction) - kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit); - - // only process the other node if it can have a match based on closest distance so far - if (fabsf(delta) <= limit) - kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit); - } -} - -struct BVHBoxT -{ - float min[4]; - float max[4]; -}; - -struct BVHBox -{ - float min[3]; - float max[3]; -}; - -#if defined(SIMD_SSE) -static float boxMerge(BVHBoxT& box, const BVHBox& other) -{ - __m128 min = _mm_loadu_ps(box.min); - __m128 max = _mm_loadu_ps(box.max); - - // note: over-read is safe because BVHBox array is allocated with padding - min = _mm_min_ps(min, _mm_loadu_ps(other.min)); - max = _mm_max_ps(max, _mm_loadu_ps(other.max)); - - _mm_storeu_ps(box.min, min); - _mm_storeu_ps(box.max, max); - - __m128 size = _mm_sub_ps(max, min); - __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1)); - __m128 mul = _mm_mul_ps(size, size_yzx); - __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1))); - __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2))); - - return _mm_cvtss_f32(sum_xyz); -} -#elif defined(SIMD_NEON) -static float boxMerge(BVHBoxT& box, const BVHBox& other) -{ - float32x4_t min = vld1q_f32(box.min); - float32x4_t max = vld1q_f32(box.max); - - // note: over-read is safe because BVHBox array is allocated with padding - min = vminq_f32(min, vld1q_f32(other.min)); - max = vmaxq_f32(max, vld1q_f32(other.max)); - - vst1q_f32(box.min, min); - vst1q_f32(box.max, max); - - float32x4_t size = vsubq_f32(max, min); - float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2); - float32x4_t mul = vmulq_f32(size, size_yzx); - float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1); - float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2); - - return sum_xyz; -} -#else -static float boxMerge(BVHBoxT& box, const BVHBox& other) -{ - for (int k = 0; k < 3; ++k) - { - box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k]; - box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k]; - } - - float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2]; - return sx * sy + sx * sz + sy * sz; -} -#endif - -inline unsigned int radixFloat(unsigned int v) -{ - // if sign bit is 0, flip sign bit - // if sign bit is 1, flip everything - unsigned int mask = (int(v) >> 31) | 0x80000000; - return v ^ mask; -} - -static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count) -{ - memset(hist, 0, sizeof(hist)); - - const unsigned int* bits = reinterpret_cast(data); - - // compute 3 10-bit histograms in parallel (dropping 2 LSB) - for (size_t i = 0; i < count; ++i) - { - unsigned int id = radixFloat(bits[i]); - - hist[(id >> 2) & 1023][0]++; - hist[(id >> 12) & 1023][1]++; - hist[(id >> 22) & 1023][2]++; - } - - unsigned int sum0 = 0, sum1 = 0, sum2 = 0; - - // replace histogram data with prefix histogram sums in-place - for (int i = 0; i < 1024; ++i) - { - unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2]; - - hist[i][0] = sum0; - hist[i][1] = sum1; - hist[i][2] = sum2; - - sum0 += hx; - sum1 += hy; - sum2 += hz; - } - - assert(sum0 == count && sum1 == count && sum2 == count); -} - -static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass) -{ - const unsigned int* bits = reinterpret_cast(keys); - int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes - - for (size_t i = 0; i < count; ++i) - { - unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023; - - destination[hist[id][pass]++] = source[i]; - } -} - -static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float) -{ - (void)vertex_count; - - for (size_t i = 0; i < face_count; ++i) - { - unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - const float* va = vertex_positions + vertex_stride_float * a; - const float* vb = vertex_positions + vertex_stride_float * b; - const float* vc = vertex_positions + vertex_stride_float * c; - - BVHBox& box = boxes[i]; - - for (int k = 0; k < 3; ++k) - { - box.min[k] = va[k] < vb[k] ? va[k] : vb[k]; - box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k]; - - box.max[k] = va[k] > vb[k] ? va[k] : vb[k]; - box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k]; - - centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f; - } - } -} - -static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL) -{ - // count number of unique vertices - size_t used_vertices = 0; - for (size_t i = 0; i < count; ++i) - { - unsigned int index = order[i]; - unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; - - used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0); - used[a] = used[b] = used[c] = 1; - - if (out) - out[i] = unsigned(used_vertices); - } - - // reset used[] for future invocations - for (size_t i = 0; i < count; ++i) - { - unsigned int index = order[i]; - unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; - - used[a] = used[b] = used[c] = -1; - } - - return used_vertices; -} - -static void bvhPackLeaf(unsigned char* boundary, size_t count) -{ - // mark meshlet boundary for future reassembly - assert(count > 0); - - boundary[0] = 1; - memset(boundary + 1, 0, count - 1); -} - -static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles) -{ - for (size_t i = 0; i < count;) - { - size_t chunk = i + max_triangles <= count ? max_triangles : count - i; - - if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices) - { - bvhPackLeaf(boundary + i, chunk); - i += chunk; - continue; - } - - // chunk is vertex bound, split it into smaller meshlets - assert(chunk > max_vertices / 3); - - bvhPackLeaf(boundary + i, max_vertices / 3); - i += max_vertices / 3; - } -} - -static bool bvhDivisible(size_t count, size_t min, size_t max) -{ - // count is representable as a sum of values in [min..max] if if it in range of [k*min..k*min+k*(max-min)] - // equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder) - // we avoid expensive integer divisions in the common case where min is <= max/2 - return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min); -} - -static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count) -{ - BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}}; - BVHBoxT accumr = accuml; - - for (size_t i = 0; i < count; ++i) - { - float larea = boxMerge(accuml, boxes[order[i]]); - float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]); - - areas[i] = larea; - areas[i + count] = rarea; - } -} - -static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost) -{ - bool aligned = count >= min * 2 && bvhDivisible(count, min, max); - size_t end = aligned ? count - min : count - 1; - - float rmaxfill = 1.f / float(int(maxfill)); - - // find best split that minimizes SAH - size_t bestsplit = 0; - float bestcost = FLT_MAX; - - for (size_t i = min - 1; i < end; i += step) - { - size_t lsplit = i + 1, rsplit = count - (i + 1); - - if (!bvhDivisible(lsplit, min, max)) - continue; - if (aligned && !bvhDivisible(rsplit, min, max)) - continue; - - // areas[x] = inclusive surface area of boxes[0..x] - // areas[count-1-x] = inclusive surface area of boxes[x..count-1] - float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count]; - float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit)); - - if (cost > bestcost) - continue; - - // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count - // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice - size_t lfill = vertices ? vertices[i] : lsplit; - size_t rfill = vertices ? vertices[i] : rsplit; - - // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo - int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill); - int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill); - - cost += fill * (float(lrest) * larea + float(rrest) * rarea); - - if (cost < bestcost) - { - bestcost = cost; - bestsplit = i + 1; - } - } - - *out_cost = bestcost; - return bestsplit; -} - -static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count) -{ - size_t l = 0, r = split; - - for (size_t i = 0; i < count; ++i) - { - unsigned char side = sides[order[i]]; - target[side ? r : l] = order[i]; - l += 1; - l -= side; - r += side; - } - - assert(l == split && r == count); -} - -static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) -{ - if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices) - return bvhPackLeaf(boundary, count); - - unsigned int* axes[3] = {orderx, ordery, orderz}; - - // we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max - size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1; - - // if we could not pack the meshlet, we must be vertex bound - size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles; - size_t maxfill = count <= max_triangles ? max_vertices : max_triangles; - - // find best split that minimizes SAH - int bestk = -1; - size_t bestsplit = 0; - float bestcost = FLT_MAX; - - for (int k = 0; k < 3; ++k) - { - float* areas = static_cast(scratch); - unsigned int* vertices = NULL; - - bvhComputeArea(areas, boxes, axes[k], count); - - if (count <= max_triangles) - { - // for vertex bound clusters, count number of unique vertices for each split - vertices = reinterpret_cast(areas + 2 * count); - bvhCountVertices(axes[k], count, used, indices, vertices); - } - - float axiscost = FLT_MAX; - size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost); - - if (axissplit && axiscost < bestcost) - { - bestk = k; - bestcost = axiscost; - bestsplit = axissplit; - } - } - - // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs - if (bestk < 0 || depth >= kMeshletMaxTreeDepth) - return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); - - // mark sides of split for partitioning - unsigned char* sides = static_cast(scratch) + count * sizeof(unsigned int); - - for (size_t i = 0; i < bestsplit; ++i) - sides[axes[bestk][i]] = 0; - - for (size_t i = bestsplit; i < count; ++i) - sides[axes[bestk][i]] = 1; - - // partition all axes into two sides, maintaining order - unsigned int* temp = static_cast(scratch); - - for (int k = 0; k < 3; ++k) - { - if (k == bestk) - continue; - - unsigned int* axis = axes[k]; - memcpy(temp, axis, sizeof(unsigned int) * count); - bvhPartition(axis, temp, sides, bestsplit, count); - } - - // recursion depth is bounded due to max depth check above - bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); - bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); -} - } // namespace meshopt meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) From f4cb741f8738f047a6f041bf91b75c53888354c7 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 10:04:54 -0800 Subject: [PATCH 4/7] clusterizer: Remove meshlet utilities (moved to meshletutils) All of the bounds utility functions as well as meshlet optimizer are going to be moved to meshletutils.cpp; the commits can't be atomic because of complications with preserving git blame history. --- src/clusterizer.cpp | 452 -------------------------------------------- 1 file changed, 452 deletions(-) diff --git a/src/clusterizer.cpp b/src/clusterizer.cpp index f1c8c0820..6de8e9b70 100644 --- a/src/clusterizer.cpp +++ b/src/clusterizer.cpp @@ -173,116 +173,6 @@ static void clearUsed(short* used, size_t vertex_count, const unsigned int* indi } } -static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) -{ - static const float kAxes[7][3] = { - // X, Y, Z - {1, 0, 0}, - {0, 1, 0}, - {0, 0, 1}, - - // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length - {0.57735026f, 0.57735026f, 0.57735026f}, - {-0.57735026f, 0.57735026f, 0.57735026f}, - {0.57735026f, -0.57735026f, 0.57735026f}, - {0.57735026f, 0.57735026f, -0.57735026f}, - }; - - assert(count > 0); - assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0])); - - size_t points_stride_float = points_stride / sizeof(float); - size_t radii_stride_float = radii_stride / sizeof(float); - - // find extremum points along all axes; for each axis we get a pair of points with min/max coordinates - size_t pmin[7], pmax[7]; - float tmin[7], tmax[7]; - - for (size_t axis = 0; axis < axis_count; ++axis) - { - pmin[axis] = pmax[axis] = 0; - tmin[axis] = FLT_MAX; - tmax[axis] = -FLT_MAX; - } - - for (size_t i = 0; i < count; ++i) - { - const float* p = points + i * points_stride_float; - float r = radii[i * radii_stride_float]; - - for (size_t axis = 0; axis < axis_count; ++axis) - { - const float* ax = kAxes[axis]; - - float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2]; - float tpmin = tp - r, tpmax = tp + r; - - pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis]; - pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis]; - tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis]; - tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis]; - } - } - - // find the pair of points with largest distance - size_t paxis = 0; - float paxisdr = 0; - - for (size_t axis = 0; axis < axis_count; ++axis) - { - const float* p1 = points + pmin[axis] * points_stride_float; - const float* p2 = points + pmax[axis] * points_stride_float; - float r1 = radii[pmin[axis] * radii_stride_float]; - float r2 = radii[pmax[axis] * radii_stride_float]; - - float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]); - float dr = sqrtf(d2) + r1 + r2; - - if (dr > paxisdr) - { - paxisdr = dr; - paxis = axis; - } - } - - // use the longest segment as the initial sphere diameter - const float* p1 = points + pmin[paxis] * points_stride_float; - const float* p2 = points + pmax[paxis] * points_stride_float; - float r1 = radii[pmin[paxis] * radii_stride_float]; - float r2 = radii[pmax[paxis] * radii_stride_float]; - - float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2])); - float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f; - - float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk}; - float radius = paxisdr / 2; - - // iteratively adjust the sphere up until all points fit - for (size_t i = 0; i < count; ++i) - { - const float* p = points + i * points_stride_float; - float r = radii[i * radii_stride_float]; - - float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); - float d = sqrtf(d2); - - if (d + r > radius) - { - float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f; - - center[0] += k * (p[0] - center[0]); - center[1] += k * (p[1] - center[1]); - center[2] += k * (p[2] - center[2]); - radius = (radius + d + r) / 2; - } - } - - result[0] = center[0]; - result[1] = center[1]; - result[2] = center[2]; - result[3] = radius; -} - struct Cone { float px, py, pz; @@ -1476,347 +1366,5 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i return meshlet_offset; } -meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) -{ - using namespace meshopt; - - assert(index_count % 3 == 0); - assert(index_count / 3 <= kMeshletMaxTriangles); - assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); - - (void)vertex_count; - - size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - - // compute triangle normals and gather triangle corners - float normals[kMeshletMaxTriangles][3]; - float corners[kMeshletMaxTriangles][3][3]; - size_t triangles = 0; - - for (size_t i = 0; i < index_count; i += 3) - { - unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - const float* p0 = vertex_positions + vertex_stride_float * a; - const float* p1 = vertex_positions + vertex_stride_float * b; - const float* p2 = vertex_positions + vertex_stride_float * c; - - float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; - float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; - - float normalx = p10[1] * p20[2] - p10[2] * p20[1]; - float normaly = p10[2] * p20[0] - p10[0] * p20[2]; - float normalz = p10[0] * p20[1] - p10[1] * p20[0]; - - float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); - - // no need to include degenerate triangles - they will be invisible anyway - if (area == 0.f) - continue; - - // record triangle normals & corners for future use; normal and corner 0 define a plane equation - normals[triangles][0] = normalx / area; - normals[triangles][1] = normaly / area; - normals[triangles][2] = normalz / area; - memcpy(corners[triangles][0], p0, 3 * sizeof(float)); - memcpy(corners[triangles][1], p1, 3 * sizeof(float)); - memcpy(corners[triangles][2], p2, 3 * sizeof(float)); - triangles++; - } - - meshopt_Bounds bounds = {}; - - // degenerate cluster, no valid triangles => trivial reject (cone data is 0) - if (triangles == 0) - return bounds; - - const float rzero = 0.f; - - // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well - float psphere[4] = {}; - computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7); - - float center[3] = {psphere[0], psphere[1], psphere[2]}; - - // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis - float nsphere[4] = {}; - computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3); - - float axis[3] = {nsphere[0], nsphere[1], nsphere[2]}; - float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]); - float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength; - - axis[0] *= invaxislength; - axis[1] *= invaxislength; - axis[2] *= invaxislength; - - // compute a tight cone around all normals, mindp = cos(angle/2) - float mindp = 1.f; - - for (size_t i = 0; i < triangles; ++i) - { - float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2]; - - mindp = (dp < mindp) ? dp : mindp; - } - - // fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones - bounds.center[0] = center[0]; - bounds.center[1] = center[1]; - bounds.center[2] = center[2]; - bounds.radius = psphere[3]; - - // degenerate cluster, normal cone is larger than a hemisphere => trivial accept - // note that if mindp is positive but close to 0, the triangle intersection code below gets less stable - // we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful - if (mindp <= 0.1f) - { - bounds.cone_cutoff = 1; - bounds.cone_cutoff_s8 = 127; - return bounds; - } - - float maxt = 0; - - // we need to find the point on center-t*axis ray that lies in negative half-space of all triangles - for (size_t i = 0; i < triangles; ++i) - { - // dot(center-t*axis-corner, trinormal) = 0 - // dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0 - float cx = center[0] - corners[i][0][0]; - float cy = center[1] - corners[i][0][1]; - float cz = center[2] - corners[i][0][2]; - - float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2]; - float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2]; - - // dn should be larger than mindp cutoff above - assert(dn > 0.f); - float t = dc / dn; - - maxt = (t > maxt) ? t : maxt; - } - - // cone apex should be in the negative half-space of all cluster triangles by construction - bounds.cone_apex[0] = center[0] - axis[0] * maxt; - bounds.cone_apex[1] = center[1] - axis[1] * maxt; - bounds.cone_apex[2] = center[2] - axis[2] * maxt; - - // note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis - bounds.cone_axis[0] = axis[0]; - bounds.cone_axis[1] = axis[1]; - bounds.cone_axis[2] = axis[2]; - - // cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone - // which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a)) - bounds.cone_cutoff = sqrtf(1 - mindp * mindp); - - // quantize axis & cutoff to 8-bit SNORM format - bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8)); - bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8)); - bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8)); - - // for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error - float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]); - float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]); - float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]); - - // note that we need to round this up instead of rounding to nearest, hence +1 - int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1); - - bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8); - - return bounds; -} - -meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) -{ - using namespace meshopt; - - assert(triangle_count <= kMeshletMaxTriangles); - assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); - - unsigned int indices[kMeshletMaxTriangles * 3]; - - for (size_t i = 0; i < triangle_count * 3; ++i) - { - unsigned int index = meshlet_vertices[meshlet_triangles[i]]; - assert(index < vertex_count); - - indices[i] = index; - } - - return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); -} - -meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride) -{ - using namespace meshopt; - - assert(positions_stride >= 12 && positions_stride <= 256); - assert(positions_stride % sizeof(float) == 0); - assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL); - assert(radii_stride % sizeof(float) == 0); - - meshopt_Bounds bounds = {}; - - if (count == 0) - return bounds; - - const float rzero = 0.f; - - float psphere[4] = {}; - computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7); - - bounds.center[0] = psphere[0]; - bounds.center[1] = psphere[1]; - bounds.center[2] = psphere[2]; - bounds.radius = psphere[3]; - - return bounds; -} - -void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count) -{ - using namespace meshopt; - - assert(triangle_count <= kMeshletMaxTriangles); - assert(vertex_count <= kMeshletMaxVertices); - - unsigned char* indices = meshlet_triangles; - unsigned int* vertices = meshlet_vertices; - - // cache tracks vertex timestamps (corresponding to triangle index! all 3 vertices are added at the same time and never removed) - unsigned char cache[kMeshletMaxVertices]; - memset(cache, 0, vertex_count); - - // note that we start from a value that means all vertices aren't in cache - unsigned char cache_last = 128; - const unsigned char cache_cutoff = 3; // 3 triangles = ~5..9 vertices depending on reuse - - for (size_t i = 0; i < triangle_count; ++i) - { - int next = -1; - int next_match = -1; - - for (size_t j = i; j < triangle_count; ++j) - { - unsigned char a = indices[j * 3 + 0], b = indices[j * 3 + 1], c = indices[j * 3 + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - // score each triangle by how many vertices are in cache - // note: the distance is computed using unsigned 8-bit values, so cache timestamp overflow is handled gracefully - int aok = (unsigned char)(cache_last - cache[a]) < cache_cutoff; - int bok = (unsigned char)(cache_last - cache[b]) < cache_cutoff; - int cok = (unsigned char)(cache_last - cache[c]) < cache_cutoff; - - if (aok + bok + cok > next_match) - { - next = (int)j; - next_match = aok + bok + cok; - - // note that we could end up with all 3 vertices in the cache, but 2 is enough for ~strip traversal - if (next_match >= 2) - break; - } - } - - assert(next >= 0); - - unsigned char a = indices[next * 3 + 0], b = indices[next * 3 + 1], c = indices[next * 3 + 2]; - - // shift triangles before the next one forward so that we always keep an ordered partition - // note: this could have swapped triangles [i] and [next] but that distorts the order and may skew the output sequence - memmove(indices + (i + 1) * 3, indices + i * 3, (next - i) * 3 * sizeof(unsigned char)); - - indices[i * 3 + 0] = a; - indices[i * 3 + 1] = b; - indices[i * 3 + 2] = c; - - // cache timestamp is the same between all vertices of each triangle to reduce overflow - cache_last++; - cache[a] = cache_last; - cache[b] = cache_last; - cache[c] = cache_last; - } - - // rotate triangles to maximize compressibility - memset(cache, 0, vertex_count); - - for (size_t i = 0; i < triangle_count; ++i) - { - unsigned char a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; - - // if only the middle vertex has been used, rotate triangle to ensure new vertices are always sequential - if (!cache[a] && cache[b] && !cache[c]) - { - // abc -> bca - unsigned char t = a; - a = b, b = c, c = t; - } - else if (!cache[a] && !cache[b] && !cache[c]) - { - // out of three edges, the edge ab can not be reused by subsequent triangles in some encodings - // if subsequent triangles don't share edges ca or bc, we can rotate the triangle to fix this - bool needab = false, needbc = false, needca = false; - - for (size_t j = i + 1; j < triangle_count && j <= i + cache_cutoff; ++j) - { - unsigned char oa = indices[j * 3 + 0], ob = indices[j * 3 + 1], oc = indices[j * 3 + 2]; - - // note: edge comparisons are reversed as reused edges are flipped - needab |= (oa == b && ob == a) || (ob == b && oc == a) || (oc == b && oa == a); - needbc |= (oa == c && ob == b) || (ob == c && oc == b) || (oc == c && oa == b); - needca |= (oa == a && ob == c) || (ob == a && oc == c) || (oc == a && oa == c); - } - - if (needab && !needbc) - { - // abc -> bca - unsigned char t = a; - a = b, b = c, c = t; - } - else if (needab && !needca) - { - // abc -> cab - unsigned char t = c; - c = b, b = a, a = t; - } - } - - indices[i * 3 + 0] = a, indices[i * 3 + 1] = b, indices[i * 3 + 2] = c; - - cache[a] = cache[b] = cache[c] = 1; - } - - // reorder meshlet vertices for access locality assuming index buffer is scanned sequentially - unsigned int order[kMeshletMaxVertices]; - - short remap[kMeshletMaxVertices]; - memset(remap, -1, vertex_count * sizeof(short)); - - size_t vertex_offset = 0; - - for (size_t i = 0; i < triangle_count * 3; ++i) - { - short& r = remap[indices[i]]; - - if (r < 0) - { - r = short(vertex_offset); - order[vertex_offset] = vertices[indices[i]]; - vertex_offset++; - } - - indices[i] = (unsigned char)r; - } - - assert(vertex_offset <= vertex_count); - memcpy(vertices, order, vertex_offset * sizeof(unsigned int)); -} - #undef SIMD_SSE #undef SIMD_NEON From 91f30959a0f413e44c4699b407cf9e7569a535a0 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 10:09:53 -0800 Subject: [PATCH 5/7] clusterizer: Remove redundant constants and references kMeshletMax* constants are only used for sanity checking here; we also can't keep them as is because amalgamated build would result in symbol conflicts with meshletutils.cpp. Also remove references that were only relevant for cluster bounds computation. --- src/clusterizer.cpp | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/src/clusterizer.cpp b/src/clusterizer.cpp index 6de8e9b70..c2ebd8c5a 100644 --- a/src/clusterizer.cpp +++ b/src/clusterizer.cpp @@ -19,19 +19,10 @@ // This work is based on: // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 -// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 -// Jack Ritter. An Efficient Bounding Sphere. 1990 -// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008 // Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006 namespace meshopt { -// This must be <= 256 since meshlet indices are stored as bytes -const size_t kMeshletMaxVertices = 256; - -// A reasonable limit is around 2*max_vertices or less -const size_t kMeshletMaxTriangles = 512; - // We keep a limited number of seed triangles and add a few triangles per finished meshlet const size_t kMeshletMaxSeeds = 256; const size_t kMeshletAddSeeds = 4; @@ -1020,11 +1011,8 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ using namespace meshopt; assert(index_count % 3 == 0); - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - - (void)kMeshletMaxVertices; - (void)kMeshletMaxTriangles; + assert(max_vertices >= 3 && max_vertices <= 256); + assert(max_triangles >= 1 && max_triangles <= 512); // meshlet construction is limited by max vertices and max triangles per meshlet // the worst case is that the input is an unindexed stream since this equally stresses both limits @@ -1044,8 +1032,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); + assert(max_vertices >= 3 && max_vertices <= 256); + assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= 512); assert(cone_weight >= 0 && cone_weight <= 1); assert(split_factor >= 0); @@ -1238,8 +1226,8 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle assert(index_count % 3 == 0); - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_vertices >= 3 && max_vertices <= 256); + assert(max_triangles >= 1 && max_triangles <= 512); meshopt_Allocator allocator; @@ -1275,8 +1263,8 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); - assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); + assert(max_vertices >= 3 && max_vertices <= 256); + assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= 512); if (index_count == 0) return 0; From 7a430fb31297176c726d3c9481ae0f0a6962c948 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 10:29:57 -0800 Subject: [PATCH 6/7] Fix js/meshopt_clusterizer and clusterfuzz builds clusterfuzz build was actually broken a few months ago; clusterfuzz now needs partition.cpp to function. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8cc54f3be..9dabf7183 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ WASM_ENCODER_EXPORTS=meshopt_encodeVertexBuffer meshopt_encodeVertexBufferBound WASM_SIMPLIFIER_SOURCES=src/simplifier.cpp src/vfetchoptimizer.cpp src/indexgenerator.cpp tools/wasmstubs.cpp WASM_SIMPLIFIER_EXPORTS=meshopt_simplify meshopt_simplifyWithAttributes meshopt_simplifyWithUpdate meshopt_simplifyScale meshopt_simplifyPoints meshopt_simplifySloppy meshopt_simplifyPrune meshopt_optimizeVertexFetchRemap meshopt_generatePositionRemap sbrk __wasm_call_ctors -WASM_CLUSTERIZER_SOURCES=src/clusterizer.cpp tools/wasmstubs.cpp +WASM_CLUSTERIZER_SOURCES=src/clusterizer.cpp src/meshletutils.cpp tools/wasmstubs.cpp WASM_CLUSTERIZER_EXPORTS=meshopt_buildMeshletsBound meshopt_buildMeshletsFlex meshopt_buildMeshletsSpatial meshopt_computeClusterBounds meshopt_computeMeshletBounds meshopt_computeSphereBounds meshopt_optimizeMeshlet sbrk __wasm_call_ctors ifneq ($(werror),) @@ -234,7 +234,7 @@ codectest: tools/codectest.cpp $(LIBRARY) codecfuzz: tools/codecfuzz.cpp src/vertexcodec.cpp src/indexcodec.cpp src/meshletcodec.cpp $(CXX) $^ -fsanitize=fuzzer,address,undefined -O1 -g -o $@ -clusterfuzz: tools/clusterfuzz.cpp src/clusterizer.cpp +clusterfuzz: tools/clusterfuzz.cpp src/clusterizer.cpp src/partition.cpp $(CXX) $^ -fsanitize=fuzzer,address,undefined -O1 -g -o $@ simplifyfuzz: tools/simplifyfuzz.cpp src/simplifier.cpp From fdfc8520c35449b711864bd9ca6118ca01e2b0b3 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 19 Feb 2026 10:49:07 -0800 Subject: [PATCH 7/7] meshletutils: Remove stale references Also rename kAxes inside the function, as we use regular variable syntax for function-scoped statics and this was the last place left using kName. --- src/meshletutils.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/meshletutils.cpp b/src/meshletutils.cpp index 12c4d8405..2f30eeb5f 100644 --- a/src/meshletutils.cpp +++ b/src/meshletutils.cpp @@ -7,11 +7,9 @@ #include // This work is based on: -// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 // Jack Ritter. An Efficient Bounding Sphere. 1990 // Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008 -// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006 namespace meshopt { @@ -23,7 +21,7 @@ const size_t kMeshletMaxTriangles = 512; static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) { - static const float kAxes[7][3] = { + static const float axes[7][3] = { // X, Y, Z {1, 0, 0}, {0, 1, 0}, @@ -37,7 +35,7 @@ static void computeBoundingSphere(float result[4], const float* points, size_t c }; assert(count > 0); - assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0])); + assert(axis_count <= sizeof(axes) / sizeof(axes[0])); size_t points_stride_float = points_stride / sizeof(float); size_t radii_stride_float = radii_stride / sizeof(float); @@ -60,7 +58,7 @@ static void computeBoundingSphere(float result[4], const float* points, size_t c for (size_t axis = 0; axis < axis_count; ++axis) { - const float* ax = kAxes[axis]; + const float* ax = axes[axis]; float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2]; float tpmin = tp - r, tpmax = tp + r;