Merge branch 'move-opt' of github.com:lighttransport/tinyusdz into move-opt

This commit is contained in:
Syoyo Fujita
2025-09-11 09:39:20 +09:00
35 changed files with 15098 additions and 0 deletions


@@ -32,6 +32,7 @@ if (EMSCRIPTEN)
set(TINYUSDZ_DEFAULT_WITH_USDA_PARSER Off)
set(TINYUSDZ_DEFAULT_WITH_USDC_PARSER Off)
set(TINYUSDZ_DEFAULT_WITH_COROUTINE On)
set(TINYUSDZ_DEFAULT_WITH_MESHOPT On)
elseif(NOT PROJECT_IS_TOP_LEVEL)
# assume tinyusdz is added from add_subdirectory()
# disable tools, tests and examples build by default.
@@ -46,6 +47,7 @@ elseif(NOT PROJECT_IS_TOP_LEVEL)
set(TINYUSDZ_DEFAULT_WITH_USDA_PARSER Off)
set(TINYUSDZ_DEFAULT_WITH_USDC_PARSER Off)
set(TINYUSDZ_DEFAULT_WITH_COROUTINE Off)
set(TINYUSDZ_DEFAULT_WITH_MESHOPT Off)
else()
set(TINYUSDZ_DEFAULT_NO_WERROR OFF)
set(TINYUSDZ_DEFAULT_PRODUCTION_BUILD Off)
@@ -58,6 +60,7 @@ else()
set(TINYUSDZ_DEFAULT_WITH_USDA_PARSER Off)
set(TINYUSDZ_DEFAULT_WITH_USDC_PARSER Off)
set(TINYUSDZ_DEFAULT_WITH_COROUTINE Off)
set(TINYUSDZ_DEFAULT_WITH_MESHOPT Off)
# For Visual Studio
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
@@ -249,6 +252,9 @@ option(TINYUSDZ_WITH_EXR "Build with EXR HDR texture support" ON)
# -- ColorIO --
option(TINYUSDZ_WITH_COLORIO
"Build with Color IO Baked LUT support(through tinycolorio)" ON)
option(TINYUSDZ_WITH_MESHOPT
"Build with meshoptimizer support for mesh optimization" ${TINYUSDZ_DEFAULT_WITH_MESHOPT})
# ---------
# -- optional tool --
@@ -869,6 +875,30 @@ if(TINYUSDZ_WITH_WAMR)
endif(TINYUSDZ_WITH_WAMR)
if(TINYUSDZ_WITH_MESHOPT)
# meshoptimizer source files
set(MESHOPTIMIZER_SOURCES
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/allocator.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/clusterizer.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/indexanalyzer.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/indexcodec.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/indexgenerator.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/overdrawoptimizer.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/partition.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/quantization.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/rasterizer.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/simplifier.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/spatialorder.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/stripifier.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/vcacheoptimizer.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/vertexcodec.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/vertexfilter.cpp
${PROJECT_SOURCE_DIR}/src/external/meshoptimizer/vfetchoptimizer.cpp
)
list(APPEND TINYUSDZ_DEP_SOURCES ${MESHOPTIMIZER_SOURCES})
endif(TINYUSDZ_WITH_MESHOPT)
if(TINYUSDZ_WITH_TIFF OR TINYUSDZ_WITH_EXR)
if(TINYUSDZ_USE_SYSTEM_ZLIB)
list(APPEND TINYUSDZ_EXT_LIBRARIES ZLIB::ZLIB)
@@ -1350,6 +1380,11 @@ foreach(TINYUSDZ_LIB_TARGET ${TINYUSDZ_LIBS})
PRIVATE "TINYUSDZ_WITH_COROUTINE")
endif(TINYUSDZ_WITH_COROUTINE)
if(TINYUSDZ_WITH_MESHOPT)
target_compile_definitions(${TINYUSDZ_LIB_TARGET}
PRIVATE "TINYUSDZ_WITH_MESHOPT")
endif(TINYUSDZ_WITH_MESHOPT)
if(NOT TINYUSDZ_CXX_EXCEPTIONS)
if(MSVC)
target_compile_options(${TINYUSDZ_LIB_TARGET} PRIVATE /EHs-c-)


@@ -575,3 +575,4 @@ Some helper code is licensed under MIT license.
* civetweb: MIT license. https://github.com/civetweb/civetweb
* libsais: Apache 2.0 license. https://github.com/IlyaGrebnov/libsais
* quickjs-ng: MIT license: https://github.com/quickjs-ng/quickjs
* meshoptimizer: MIT license: https://github.com/zeux/meshoptimizer


@@ -0,0 +1,49 @@
#!/bin/bash
# Build WASM with different malloc implementations to test memory reuse
echo "Building with different malloc implementations..."
# dlmalloc (default/general-purpose)
echo "Building with dlmalloc..."
em++ memory_test.cpp \
-o memory_test_dlmalloc.js \
--bind \
-s MALLOC=dlmalloc \
-s ALLOW_MEMORY_GROWTH=1 \
-s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap"]' \
-s ENVIRONMENT=node \
-s MODULARIZE=1 \
-s EXPORT_NAME='MemoryTestModule' \
-O2
# emmalloc (simple and compact)
echo "Building with emmalloc..."
em++ memory_test.cpp \
-o memory_test_emmalloc.js \
--bind \
-s MALLOC=emmalloc \
-s ALLOW_MEMORY_GROWTH=1 \
-s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap"]' \
-s ENVIRONMENT=node \
-s MODULARIZE=1 \
-s EXPORT_NAME='MemoryTestModule' \
-O2
# mimalloc (multithreaded allocator)
echo "Building with mimalloc..."
em++ memory_test.cpp \
-o memory_test_mimalloc.js \
--bind \
-s MALLOC=mimalloc \
-s ALLOW_MEMORY_GROWTH=1 \
-s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap"]' \
-s ENVIRONMENT=node \
-s MODULARIZE=1 \
-s EXPORT_NAME='MemoryTestModule' \
-O2
echo "All malloc variants built successfully:"
echo " dlmalloc: memory_test_dlmalloc.js/.wasm"
echo " emmalloc: memory_test_emmalloc.js/.wasm"
echo " mimalloc: memory_test_mimalloc.js/.wasm"

sandbox/wasm-heap/build-pool.sh (executable)

@@ -0,0 +1,17 @@
#!/bin/bash
# Build WASM module with memory pool
em++ memory_pool.cpp \
-o memory_pool.js \
--bind \
-s ALLOW_MEMORY_GROWTH=1 \
-s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap"]' \
-s ENVIRONMENT=node \
-s MODULARIZE=1 \
-s EXPORT_NAME='MemoryPoolModule' \
-O2
echo "Memory pool build complete. Generated files:"
echo " memory_pool.js"
echo " memory_pool.wasm"

sandbox/wasm-heap/build.sh (executable)

@@ -0,0 +1,17 @@
#!/bin/bash
# Build WASM module with embind and allow_memory_growth
em++ memory_test.cpp \
-o memory_test.js \
--bind \
-s ALLOW_MEMORY_GROWTH=1 \
-s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap"]' \
-s ENVIRONMENT=node \
-s MODULARIZE=1 \
-s EXPORT_NAME='MemoryTestModule' \
-O2
echo "Build complete. Generated files:"
echo " memory_test.js"
echo " memory_test.wasm"


@@ -0,0 +1,171 @@
#include <vector>
#include <cstdint>
#include <algorithm>
#include <emscripten/bind.h>
#include <emscripten/emscripten.h>
class MemoryPool {
private:
std::vector<uint8_t> pool_memory;
struct Block {
size_t offset;
size_t size;
bool is_free;
Block(size_t off, size_t sz, bool free) : offset(off), size(sz), is_free(free) {}
};
std::vector<Block> blocks;
public:
size_t create_pool(size_t mb_size) {
const size_t size = mb_size * 1024 * 1024;
pool_memory.resize(size);
blocks.clear();
blocks.emplace_back(0, size, true);
return pool_memory.size();
}
int allocate_from_pool(size_t mb_size) {
const size_t size = mb_size * 1024 * 1024;
// Find first free block that fits
for (size_t i = 0; i < blocks.size(); ++i) {
Block& block = blocks[i];
if (block.is_free && block.size >= size) {
// Mark as used
block.is_free = false;
// If block is larger, split it
if (block.size > size) {
Block new_free_block(block.offset + size, block.size - size, true);
blocks.insert(blocks.begin() + i + 1, new_free_block);
block.size = size;
}
// Fill with pattern for verification
std::fill(pool_memory.begin() + block.offset,
pool_memory.begin() + block.offset + size,
static_cast<uint8_t>(0xAA));
return static_cast<int>(i);
}
}
return -1; // No suitable block found
}
bool free_block(int block_id) {
if (block_id < 0 || block_id >= static_cast<int>(blocks.size())) {
return false;
}
Block& block = blocks[block_id];
if (block.is_free) {
return false; // Already free
}
block.is_free = true;
// Clear memory for verification
std::fill(pool_memory.begin() + block.offset,
pool_memory.begin() + block.offset + block.size,
static_cast<uint8_t>(0x00));
// Merge with adjacent free blocks
merge_free_blocks();
return true;
}
void merge_free_blocks() {
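// Note: erasing merged blocks below shifts the indices of later entries in
// the blocks vector, so block IDs returned earlier by allocate_from_pool may
// no longer refer to the same block after a free.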
// Sort blocks by offset
std::sort(blocks.begin(), blocks.end(),
[](const Block& a, const Block& b) { return a.offset < b.offset; });
// Merge adjacent free blocks
for (size_t i = 0; i + 1 < blocks.size(); ) { // i + 1 avoids size_t underflow on an empty vector
Block& current = blocks[i];
Block& next = blocks[i + 1];
if (current.is_free && next.is_free &&
current.offset + current.size == next.offset) {
current.size += next.size;
blocks.erase(blocks.begin() + i + 1);
} else {
++i;
}
}
}
size_t get_pool_size() const {
return pool_memory.size();
}
size_t get_total_allocated() const {
size_t total = 0;
for (const auto& block : blocks) {
if (!block.is_free) {
total += block.size;
}
}
return total;
}
size_t get_total_free() const {
size_t total = 0;
for (const auto& block : blocks) {
if (block.is_free) {
total += block.size;
}
}
return total;
}
size_t get_block_count() const {
return blocks.size();
}
size_t get_largest_free_block() const {
size_t largest = 0;
for (const auto& block : blocks) {
if (block.is_free && block.size > largest) {
largest = block.size;
}
}
return largest;
}
bool is_block_allocated(int block_id) const {
if (block_id < 0 || block_id >= static_cast<int>(blocks.size())) {
return false;
}
return !blocks[block_id].is_free;
}
size_t get_block_size(int block_id) const {
if (block_id < 0 || block_id >= static_cast<int>(blocks.size())) {
return 0;
}
return blocks[block_id].size;
}
void clear_pool() {
pool_memory.clear();
blocks.clear();
}
};
EMSCRIPTEN_BINDINGS(memory_pool) {
emscripten::class_<MemoryPool>("MemoryPool")
.constructor<>()
.function("create_pool", &MemoryPool::create_pool)
.function("allocate_from_pool", &MemoryPool::allocate_from_pool)
.function("free_block", &MemoryPool::free_block)
.function("get_pool_size", &MemoryPool::get_pool_size)
.function("get_total_allocated", &MemoryPool::get_total_allocated)
.function("get_total_free", &MemoryPool::get_total_free)
.function("get_block_count", &MemoryPool::get_block_count)
.function("get_largest_free_block", &MemoryPool::get_largest_free_block)
.function("is_block_allocated", &MemoryPool::is_block_allocated)
.function("get_block_size", &MemoryPool::get_block_size)
.function("clear_pool", &MemoryPool::clear_pool);
}


@@ -0,0 +1,103 @@
#include <vector>
#include <cstdint>
#include <algorithm>
#include <emscripten/bind.h>
#include <emscripten/emscripten.h>
class MemoryAllocator {
private:
std::vector<std::vector<uint8_t>> allocated_chunks;
std::vector<uint8_t> reserved_space;
public:
size_t allocate_100mb() {
const size_t size = 100 * 1024 * 1024; // 100MB
allocated_chunks.emplace_back(size, 0);
return allocated_chunks.size();
}
size_t allocate_105mb() {
const size_t size = 105 * 1024 * 1024; // 105MB
allocated_chunks.emplace_back(size, 0);
return allocated_chunks.size();
}
size_t allocate_20mb() {
const size_t size = 20 * 1024 * 1024; // 20MB
allocated_chunks.emplace_back(size, 0);
return allocated_chunks.size();
}
size_t reserve_space(size_t mb_size) {
const size_t size = mb_size * 1024 * 1024;
reserved_space.reserve(size);
reserved_space.resize(size, 0);
return reserved_space.size();
}
void clear_reserve() {
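// Swap with an empty vector so the capacity is actually released;
// clear()/resize(0) alone would keep the allocation alive.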
std::vector<uint8_t> empty_vector;
reserved_space.swap(empty_vector);
}
size_t get_reserved_size() const {
return reserved_space.size();
}
void clear_all() {
allocated_chunks.clear();
clear_reserve();
}
bool release_chunk(size_t index) {
if (index >= allocated_chunks.size()) {
return false;
}
std::vector<uint8_t> empty_vector;
allocated_chunks[index].swap(empty_vector);
return true;
}
void compact_chunks() {
allocated_chunks.erase(
std::remove_if(allocated_chunks.begin(), allocated_chunks.end(),
[](const std::vector<uint8_t>& chunk) { return chunk.empty(); }),
allocated_chunks.end());
}
size_t get_total_allocated() const {
size_t total = 0;
for (const auto& chunk : allocated_chunks) {
total += chunk.size();
}
return total;
}
size_t get_chunk_count() const {
return allocated_chunks.size();
}
size_t get_chunk_size(size_t index) const {
if (index >= allocated_chunks.size()) {
return 0;
}
return allocated_chunks[index].size();
}
};
EMSCRIPTEN_BINDINGS(memory_test) {
emscripten::class_<MemoryAllocator>("MemoryAllocator")
.constructor<>()
.function("allocate_100mb", &MemoryAllocator::allocate_100mb)
.function("allocate_105mb", &MemoryAllocator::allocate_105mb)
.function("allocate_20mb", &MemoryAllocator::allocate_20mb)
.function("reserve_space", &MemoryAllocator::reserve_space)
.function("clear_reserve", &MemoryAllocator::clear_reserve)
.function("get_reserved_size", &MemoryAllocator::get_reserved_size)
.function("clear_all", &MemoryAllocator::clear_all)
.function("release_chunk", &MemoryAllocator::release_chunk)
.function("compact_chunks", &MemoryAllocator::compact_chunks)
.function("get_total_allocated", &MemoryAllocator::get_total_allocated)
.function("get_chunk_count", &MemoryAllocator::get_chunk_count)
.function("get_chunk_size", &MemoryAllocator::get_chunk_size);
}


@@ -0,0 +1,14 @@
{
"name": "wasm-heap-test",
"version": "1.0.0",
"description": "WASM memory allocation test with std::vector",
"main": "test.js",
"scripts": {
"build": "./build.sh",
"test": "node test.js",
"all": "npm run build && npm run test"
},
"engines": {
"node": ">=14.0.0"
}
}


@@ -0,0 +1,87 @@
async function comparePoolVsMalloc() {
console.log('MEMORY POOL vs MALLOC COMPARISON');
console.log('Test sequence: 100MB → 20MB → free 100MB → 105MB');
console.log('='.repeat(60));
// Load both modules
const MemoryTestModule = require('./memory_test_emmalloc.js');
const MemoryPoolModule = require('./memory_pool.js');
console.log('\n1. TESTING EMMALLOC (std::vector with swap)');
console.log('-'.repeat(50));
const mallocModule = await MemoryTestModule();
const allocator = new mallocModule.MemoryAllocator();
const mallocInitial = process.memoryUsage().rss;
allocator.allocate_100mb();
allocator.allocate_20mb();
const mallocPeak = process.memoryUsage().rss;
allocator.release_chunk(0);
const mallocAfterFree = process.memoryUsage().rss;
allocator.allocate_105mb();
const mallocFinal = process.memoryUsage().rss;
console.log(`Initial RSS: ${(mallocInitial / 1024 / 1024).toFixed(2)} MB`);
console.log(`Peak RSS (120MB allocated): ${(mallocPeak / 1024 / 1024).toFixed(2)} MB`);
console.log(`After free RSS: ${(mallocAfterFree / 1024 / 1024).toFixed(2)} MB`);
console.log(`Final RSS (125MB allocated): ${(mallocFinal / 1024 / 1024).toFixed(2)} MB`);
const mallocGrowth = mallocFinal - mallocInitial;
const mallocReuse = (mallocPeak + 25*1024*1024 - mallocFinal) / (105*1024*1024) * 100;
console.log(`\n2. TESTING CUSTOM MEMORY POOL`);
console.log('-'.repeat(50));
const poolModule = await MemoryPoolModule();
const pool = new poolModule.MemoryPool();
const poolInitial = process.memoryUsage().rss;
pool.create_pool(150);
const poolAfterCreation = process.memoryUsage().rss;
const block1 = pool.allocate_from_pool(100);
const block2 = pool.allocate_from_pool(20);
const poolPeak = process.memoryUsage().rss;
pool.free_block(block1);
const poolAfterFree = process.memoryUsage().rss;
const block3 = pool.allocate_from_pool(105);
const poolFinal = process.memoryUsage().rss;
console.log(`Initial RSS: ${(poolInitial / 1024 / 1024).toFixed(2)} MB`);
console.log(`After pool creation: ${(poolAfterCreation / 1024 / 1024).toFixed(2)} MB`);
console.log(`Peak RSS (120MB allocated): ${(poolPeak / 1024 / 1024).toFixed(2)} MB`);
console.log(`After free RSS: ${(poolAfterFree / 1024 / 1024).toFixed(2)} MB`);
console.log(`Final RSS (125MB allocated): ${(poolFinal / 1024 / 1024).toFixed(2)} MB`);
const poolGrowth = poolFinal - poolInitial;
const poolSucceeded = block3 >= 0;
console.log(`\n3. COMPARISON RESULTS`);
console.log('='.repeat(60));
console.log('Approach\t\tTotal Growth\tMemory Reuse');
console.log('-'.repeat(50));
console.log(`emmalloc\t\t${(mallocGrowth / 1024 / 1024).toFixed(1)} MB\t\t${mallocReuse.toFixed(1)}%`);
console.log(`Memory Pool\t\t${(poolGrowth / 1024 / 1024).toFixed(1)} MB\t\t${poolSucceeded ? 'SUCCESS' : 'FAILED'}`);
console.log(`\n4. KEY INSIGHTS`);
console.log('-'.repeat(50));
if (poolSucceeded) {
const efficiency = (1 - (poolGrowth - 150*1024*1024) / (150*1024*1024)) * 100;
console.log(`✓ Memory pool achieved ${efficiency.toFixed(1)}% efficiency`);
console.log(`✓ 105MB allocation reused freed 100MB space`);
console.log(`✓ RSS stayed constant after pool creation`);
console.log(`✓ No heap fragmentation or growth after initial pool`);
} else {
console.log(`✗ Memory pool allocation failed`);
}
console.log(`✗ emmalloc showed ${Math.abs(mallocReuse).toFixed(1)}% negative efficiency`);
console.log(`✗ emmalloc had ${((mallocFinal - mallocPeak) / 1024 / 1024).toFixed(1)} MB additional growth`);
const improvement = ((mallocGrowth - poolGrowth) / mallocGrowth) * 100;
console.log(`\n📊 Memory pool reduces total memory usage by ${improvement.toFixed(1)}%`);
}
comparePoolVsMalloc().catch(console.error);


@@ -0,0 +1,116 @@
function formatBytes(bytes) {
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
}
function printMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`\n=== ${label} ===`);
console.log(`RSS: ${formatBytes(usage.rss)}`);
console.log(`Heap Total: ${formatBytes(usage.heapTotal)}`);
console.log(`Heap Used: ${formatBytes(usage.heapUsed)}`);
console.log(`External: ${formatBytes(usage.external)}`);
}
function printAllocatorStatus(allocator, label) {
console.log(`\n--- ${label} ---`);
console.log(`Chunks: ${allocator.get_chunk_count()}`);
console.log(`Total allocated: ${formatBytes(allocator.get_total_allocated())}`);
}
async function testMallocImplementation(moduleName, jsFile) {
console.log(`\n${'='.repeat(60)}`);
console.log(`TESTING ${moduleName.toUpperCase()}`);
console.log(`${'='.repeat(60)}`);
const Module = await require(jsFile)();
const allocator = new Module.MemoryAllocator();
printMemoryUsage(`${moduleName} - Initial`);
// Test sequence: 100MB → 20MB → free 100MB → 105MB
console.log('\n1. Allocate 100MB');
allocator.allocate_100mb();
printAllocatorStatus(allocator, `${moduleName} - After 100MB`);
const usage1 = process.memoryUsage();
console.log('\n2. Allocate 20MB');
allocator.allocate_20mb();
printAllocatorStatus(allocator, `${moduleName} - After 20MB (120MB total)`);
const usage2 = process.memoryUsage();
console.log('\n3. Free 100MB chunk');
const released = allocator.release_chunk(0);
console.log(`Release successful: ${released}`);
printAllocatorStatus(allocator, `${moduleName} - After freeing 100MB`);
const usage3 = process.memoryUsage();
console.log('\n4. Allocate 105MB');
allocator.allocate_105mb();
printAllocatorStatus(allocator, `${moduleName} - After 105MB (125MB total)`);
const usage4 = process.memoryUsage();
printMemoryUsage(`${moduleName} - Final`);
// Calculate memory growth for analysis
const growth1 = usage1.rss - 50.5 * 1024 * 1024; // Subtract baseline
const growth4 = usage4.rss - 50.5 * 1024 * 1024;
const reuse_efficiency = (growth1 + 25*1024*1024 - growth4) / (105*1024*1024) * 100; // How much of 105MB was reused
console.log(`\n--- ${moduleName} SUMMARY ---`);
console.log(`Peak RSS (step 2): ${formatBytes(usage2.rss)}`);
console.log(`Final RSS (step 4): ${formatBytes(usage4.rss)}`);
console.log(`Memory reuse efficiency: ${reuse_efficiency.toFixed(1)}%`);
return {
name: moduleName,
peakRSS: usage2.rss,
finalRSS: usage4.rss,
reuseEfficiency: reuse_efficiency
};
}
async function runMallocComparison() {
console.log('MALLOC IMPLEMENTATION COMPARISON');
console.log('Test sequence: 100MB → 20MB → free 100MB → 105MB');
const results = [];
try {
results.push(await testMallocImplementation('dlmalloc', './memory_test_dlmalloc.js'));
} catch (e) {
console.log('dlmalloc test failed:', e.message);
}
try {
results.push(await testMallocImplementation('emmalloc', './memory_test_emmalloc.js'));
} catch (e) {
console.log('emmalloc test failed:', e.message);
}
try {
results.push(await testMallocImplementation('mimalloc', './memory_test_mimalloc.js'));
} catch (e) {
console.log('mimalloc test failed:', e.message);
}
// Final comparison
console.log(`\n${'='.repeat(60)}`);
console.log('FINAL COMPARISON');
console.log(`${'='.repeat(60)}`);
console.log('Malloc\t\tPeak RSS\tFinal RSS\tReuse Eff.');
console.log('-'.repeat(50));
results.forEach(result => {
console.log(`${result.name}\t\t${formatBytes(result.peakRSS)}\t\t${formatBytes(result.finalRSS)}\t\t${result.reuseEfficiency.toFixed(1)}%`);
});
// Find best performer
const bestReuse = results.reduce((best, current) =>
current.reuseEfficiency > best.reuseEfficiency ? current : best
);
console.log(`\nBest for memory reuse: ${bestReuse.name} (${bestReuse.reuseEfficiency.toFixed(1)}% efficiency)`);
}
runMallocComparison().catch(console.error);


@@ -0,0 +1,108 @@
const MemoryPoolModule = require('./memory_pool.js');
function formatBytes(bytes) {
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
}
function printMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`\n=== ${label} ===`);
console.log(`RSS: ${formatBytes(usage.rss)}`);
console.log(`Heap Total: ${formatBytes(usage.heapTotal)}`);
console.log(`Heap Used: ${formatBytes(usage.heapUsed)}`);
console.log(`External: ${formatBytes(usage.external)}`);
}
function printPoolStatus(pool, label) {
console.log(`\n--- ${label} ---`);
console.log(`Pool size: ${formatBytes(pool.get_pool_size())}`);
console.log(`Total allocated: ${formatBytes(pool.get_total_allocated())}`);
console.log(`Total free: ${formatBytes(pool.get_total_free())}`);
console.log(`Largest free block: ${formatBytes(pool.get_largest_free_block())}`);
console.log(`Block count: ${pool.get_block_count()}`);
}
async function runPoolTest() {
console.log('CUSTOM MEMORY POOL TEST');
console.log('Test sequence: Create 150MB pool → 100MB → 20MB → free 100MB → 105MB');
console.log('='.repeat(70));
const Module = await MemoryPoolModule();
const pool = new Module.MemoryPool();
printMemoryUsage('Initial Memory Usage');
// Step 0: Create 150MB pool
console.log('\n=== STEP 0: Create 150MB Pool ===');
const poolSize = pool.create_pool(150);
console.log(`Pool created: ${formatBytes(poolSize)}`);
printPoolStatus(pool, 'After pool creation');
printMemoryUsage('Memory after pool creation');
// Step 1: Allocate 100MB from pool
console.log('\n=== STEP 1: Allocate 100MB from pool ===');
const block1 = pool.allocate_from_pool(100);
console.log(`Block ID: ${block1}`);
if (block1 >= 0) {
console.log(`Block size: ${formatBytes(pool.get_block_size(block1))}`);
console.log(`Block allocated: ${pool.is_block_allocated(block1)}`);
}
printPoolStatus(pool, 'After 100MB allocation');
printMemoryUsage('Memory after 100MB allocation');
// Step 2: Allocate 20MB from pool
console.log('\n=== STEP 2: Allocate 20MB from pool ===');
const block2 = pool.allocate_from_pool(20);
console.log(`Block ID: ${block2}`);
if (block2 >= 0) {
console.log(`Block size: ${formatBytes(pool.get_block_size(block2))}`);
console.log(`Block allocated: ${pool.is_block_allocated(block2)}`);
}
printPoolStatus(pool, 'After 20MB allocation (120MB total used)');
printMemoryUsage('Memory after 20MB allocation');
// Step 3: Free the 100MB block
console.log('\n=== STEP 3: Free 100MB block ===');
const freed = pool.free_block(block1);
console.log(`Free successful: ${freed}`);
if (block1 >= 0) {
console.log(`Block allocated: ${pool.is_block_allocated(block1)}`);
}
printPoolStatus(pool, 'After freeing 100MB block');
printMemoryUsage('Memory after freeing 100MB');
// Step 4: Allocate 105MB from pool (should reuse freed space)
console.log('\n=== STEP 4: Allocate 105MB from pool ===');
const block3 = pool.allocate_from_pool(105);
console.log(`Block ID: ${block3}`);
if (block3 >= 0) {
console.log(`Block size: ${formatBytes(pool.get_block_size(block3))}`);
console.log(`Block allocated: ${pool.is_block_allocated(block3)}`);
} else {
console.log('Allocation failed - not enough free space');
}
printPoolStatus(pool, 'After 105MB allocation');
printMemoryUsage('Memory after 105MB allocation');
console.log('\n=== ANALYSIS ===');
const finalUsage = process.memoryUsage();
const initialRSS = 50.5 * 1024 * 1024; // Approximate baseline
const totalGrowth = finalUsage.rss - initialRSS;
const expectedGrowth = 150 * 1024 * 1024; // Just the pool size
const efficiency = (1 - (totalGrowth - expectedGrowth) / expectedGrowth) * 100;
console.log(`Expected RSS growth: ${formatBytes(expectedGrowth)} (pool only)`);
console.log(`Actual RSS growth: ${formatBytes(totalGrowth)}`);
console.log(`Memory efficiency: ${efficiency.toFixed(1)}%`);
if (block3 >= 0) {
console.log(`✓ 105MB allocation succeeded - memory was reused!`);
console.log(`✓ Pool manages ${formatBytes(pool.get_pool_size())} with perfect reuse`);
} else {
console.log(`✗ 105MB allocation failed - insufficient free space`);
}
console.log('\nPool test completed!');
}
runPoolTest().catch(console.error);


@@ -0,0 +1,80 @@
const MemoryTestModule = require('./memory_test.js');
function formatBytes(bytes) {
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
}
function printMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`\n=== ${label} ===`);
console.log(`RSS (Resident Set Size): ${formatBytes(usage.rss)}`);
console.log(`Heap Total: ${formatBytes(usage.heapTotal)}`);
console.log(`Heap Used: ${formatBytes(usage.heapUsed)}`);
console.log(`External: ${formatBytes(usage.external)}`);
}
function printAllocatorStatus(allocator, label) {
console.log(`\n--- ${label} ---`);
console.log(`Reserved: ${formatBytes(allocator.get_reserved_size())}`);
console.log(`Chunks: ${allocator.get_chunk_count()}`);
console.log(`Total allocated: ${formatBytes(allocator.get_total_allocated())}`);
for (let i = 0; i < allocator.get_chunk_count(); i++) {
console.log(` Chunk ${i}: ${formatBytes(allocator.get_chunk_size(i))}`);
}
}
async function runReserveTest() {
console.log('Testing with 150MB reserve: reserve 150MB → 100MB → 20MB → free 100MB → 105MB');
console.log('Loading WASM module...');
const Module = await MemoryTestModule();
printMemoryUsage('Initial Memory Usage');
const allocator = new Module.MemoryAllocator();
// Step 0: Reserve 150MB
console.log('\n=== STEP 0: Reserve 150MB ===');
const reserved = allocator.reserve_space(150);
console.log(`Reserved: ${formatBytes(reserved)}`);
printAllocatorStatus(allocator, 'After 150MB reserve');
printMemoryUsage('Memory after 150MB reserve');
// Step 1: Allocate 100MB
console.log('\n=== STEP 1: Allocate 100MB ===');
allocator.allocate_100mb();
printAllocatorStatus(allocator, 'After 100MB allocation');
printMemoryUsage('Memory after 100MB (within reserved space)');
// Step 2: Allocate 20MB
console.log('\n=== STEP 2: Allocate 20MB ===');
allocator.allocate_20mb();
printAllocatorStatus(allocator, 'After 20MB allocation (total: 120MB + 150MB reserve)');
printMemoryUsage('Memory after 20MB (within reserved space)');
// Step 3: Free first chunk (100MB)
console.log('\n=== STEP 3: Free 100MB (chunk 0) ===');
const released = allocator.release_chunk(0);
console.log(`Release successful: ${released}`);
printAllocatorStatus(allocator, 'After freeing 100MB chunk');
printMemoryUsage('Memory after freeing 100MB');
// Step 4: Allocate 105MB
console.log('\n=== STEP 4: Allocate 105MB ===');
allocator.allocate_105mb();
printAllocatorStatus(allocator, 'After 105MB allocation');
printMemoryUsage('Memory after 105MB (should fit in reserved space)');
// Step 5: Clear reserve to see memory behavior
console.log('\n=== STEP 5: Clear reserve space ===');
allocator.clear_reserve();
printAllocatorStatus(allocator, 'After clearing reserve');
printMemoryUsage('Memory after clearing reserve');
console.log('\n=== SUMMARY ===');
console.log('Expected behavior: With 150MB pre-reserved, all allocations should fit');
console.log('without additional heap growth, reducing fragmentation.');
console.log('\nReserve test completed!');
}
runReserveTest().catch(console.error);


@@ -0,0 +1,66 @@
const MemoryTestModule = require('./memory_test.js');
function formatBytes(bytes) {
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
}
function printMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`\n=== ${label} ===`);
console.log(`RSS (Resident Set Size): ${formatBytes(usage.rss)}`);
console.log(`Heap Total: ${formatBytes(usage.heapTotal)}`);
console.log(`Heap Used: ${formatBytes(usage.heapUsed)}`);
console.log(`External: ${formatBytes(usage.external)}`);
}
function printAllocatorStatus(allocator, label) {
console.log(`\n--- ${label} ---`);
console.log(`Chunks: ${allocator.get_chunk_count()}`);
console.log(`Total allocated: ${formatBytes(allocator.get_total_allocated())}`);
for (let i = 0; i < allocator.get_chunk_count(); i++) {
console.log(` Chunk ${i}: ${formatBytes(allocator.get_chunk_size(i))}`);
}
}
async function runSequenceTest() {
console.log('Testing sequence: 100MB → 20MB → free 100MB → 105MB');
console.log('Loading WASM module...');
const Module = await MemoryTestModule();
printMemoryUsage('Initial Memory Usage');
const allocator = new Module.MemoryAllocator();
// Step 1: Allocate 100MB
console.log('\n=== STEP 1: Allocate 100MB ===');
allocator.allocate_100mb();
printAllocatorStatus(allocator, 'After 100MB allocation');
printMemoryUsage('Memory after 100MB');
// Step 2: Allocate 20MB
console.log('\n=== STEP 2: Allocate 20MB ===');
allocator.allocate_20mb();
printAllocatorStatus(allocator, 'After 20MB allocation (total: 120MB)');
printMemoryUsage('Memory after 20MB (120MB total)');
// Step 3: Free first chunk (100MB)
console.log('\n=== STEP 3: Free 100MB (chunk 0) ===');
const released = allocator.release_chunk(0);
console.log(`Release successful: ${released}`);
printAllocatorStatus(allocator, 'After freeing 100MB chunk');
printMemoryUsage('Memory after freeing 100MB');
// Step 4: Allocate 105MB
console.log('\n=== STEP 4: Allocate 105MB ===');
allocator.allocate_105mb();
printAllocatorStatus(allocator, 'After 105MB allocation');
printMemoryUsage('Memory after 105MB (125MB total: 20MB + 105MB)');
console.log('\n=== SUMMARY ===');
console.log('Final state: 20MB + 105MB = 125MB total allocated');
console.log('Peak was 120MB (100MB + 20MB), then down to 20MB, then up to 125MB');
console.log('\nSequence test completed!');
}
runSequenceTest().catch(console.error);


@@ -0,0 +1,70 @@
const MemoryTestModule = require('./memory_test.js');
function formatBytes(bytes) {
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
}
function printMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`\n=== ${label} ===`);
console.log(`RSS (Resident Set Size): ${formatBytes(usage.rss)}`);
console.log(`Heap Total: ${formatBytes(usage.heapTotal)}`);
console.log(`Heap Used: ${formatBytes(usage.heapUsed)}`);
console.log(`External: ${formatBytes(usage.external)}`);
}
function printAllocatorStatus(allocator, label) {
console.log(`\n--- ${label} ---`);
console.log(`Chunks: ${allocator.get_chunk_count()}`);
console.log(`Total allocated: ${formatBytes(allocator.get_total_allocated())}`);
for (let i = 0; i < allocator.get_chunk_count(); i++) {
console.log(` Chunk ${i}: ${formatBytes(allocator.get_chunk_size(i))}`);
}
}
async function runSwapTest() {
console.log('Loading WASM module...');
const Module = await MemoryTestModule();
printMemoryUsage('Initial Memory Usage');
console.log('\nCreating MemoryAllocator instance...');
const allocator = new Module.MemoryAllocator();
printMemoryUsage('After Creating Allocator');
// Step 1: Allocate 100MB
console.log('\n=== STEP 1: Allocating 100MB ===');
allocator.allocate_100mb();
printAllocatorStatus(allocator, 'After 100MB Allocation');
printMemoryUsage('Memory After 100MB Allocation');
// Step 2: Release first chunk using swap
console.log('\n=== STEP 2: Releasing first chunk (100MB) using swap ===');
const released = allocator.release_chunk(0);
console.log(`Release successful: ${released}`);
printAllocatorStatus(allocator, 'After Swap Release (before compact)');
printMemoryUsage('Memory After Swap Release');
// Step 3: Compact to remove empty chunks
console.log('\n=== STEP 3: Compacting chunks ===');
allocator.compact_chunks();
printAllocatorStatus(allocator, 'After Compacting');
printMemoryUsage('Memory After Compacting');
// Step 4: Allocate 105MB
console.log('\n=== STEP 4: Allocating 105MB ===');
allocator.allocate_105mb();
printAllocatorStatus(allocator, 'After 105MB Allocation');
printMemoryUsage('Memory After 105MB Allocation');
// Step 5: Final cleanup
console.log('\n=== STEP 5: Final cleanup ===');
allocator.clear_all();
printAllocatorStatus(allocator, 'After Clear All');
printMemoryUsage('Final Memory State');
console.log('\nSwap test completed!');
}
runSwapTest().catch(console.error);

sandbox/wasm-heap/test.js

@@ -0,0 +1,51 @@
const MemoryTestModule = require('./memory_test.js');
function formatBytes(bytes) {
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
}
function printMemoryUsage(label) {
const usage = process.memoryUsage();
console.log(`\n=== ${label} ===`);
console.log(`RSS (Resident Set Size): ${formatBytes(usage.rss)}`);
console.log(`Heap Total: ${formatBytes(usage.heapTotal)}`);
console.log(`Heap Used: ${formatBytes(usage.heapUsed)}`);
console.log(`External: ${formatBytes(usage.external)}`);
}
async function runTest() {
console.log('Loading WASM module...');
const Module = await MemoryTestModule();
printMemoryUsage('Initial Memory Usage');
console.log('\nCreating MemoryAllocator instance...');
const allocator = new Module.MemoryAllocator();
printMemoryUsage('After Creating Allocator');
console.log('\nAllocating 100MB...');
const chunks1 = allocator.allocate_100mb();
console.log(`Chunks allocated: ${chunks1}`);
console.log(`Total allocated by C++: ${formatBytes(allocator.get_total_allocated())}`);
printMemoryUsage('After 100MB Allocation');
console.log('\nAllocating 105MB...');
const chunks2 = allocator.allocate_105mb();
console.log(`Chunks allocated: ${chunks2}`);
console.log(`Total allocated by C++: ${formatBytes(allocator.get_total_allocated())}`);
printMemoryUsage('After 105MB Allocation (Total: ~205MB)');
console.log('\nClearing all allocations...');
allocator.clear_all();
console.log(`Chunks remaining: ${allocator.get_chunk_count()}`);
console.log(`Total allocated by C++: ${formatBytes(allocator.get_total_allocated())}`);
printMemoryUsage('After Clearing Allocations');
console.log('\nTest completed!');
}
runTest().catch(console.error);

src/external/meshoptimizer/LICENSE.md (vendored)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2016-2025 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

src/external/meshoptimizer/README.md (vendored)

@@ -0,0 +1,730 @@
# 🐇 meshoptimizer [![Actions Status](https://github.com/zeux/meshoptimizer/workflows/build/badge.svg)](https://github.com/zeux/meshoptimizer/actions) [![codecov.io](https://codecov.io/github/zeux/meshoptimizer/coverage.svg?branch=master)](https://codecov.io/github/zeux/meshoptimizer?branch=master) [![MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.md) [![GitHub](https://img.shields.io/badge/repo-github-green.svg)](https://github.com/zeux/meshoptimizer)
## Purpose
When a GPU renders triangle meshes, various stages of the GPU pipeline have to process vertex and index data. The efficiency of these stages depends on the data you feed to them; this library provides algorithms to help optimize meshes for these stages, as well as algorithms to reduce the mesh complexity and storage overhead.
The library provides a C and C++ interface for all algorithms; you can use it from C/C++ or from other languages via FFI (such as P/Invoke). If you want to use this library from Rust, you should use the [meshopt crate](https://crates.io/crates/meshopt). A JavaScript interface for some algorithms is available through [meshoptimizer.js](https://www.npmjs.com/package/meshoptimizer).
[gltfpack](./gltf/README.md), which is a tool that can automatically optimize glTF files, is developed and distributed alongside the library.
## Installing
meshoptimizer is hosted on GitHub; you can download the latest release using git:
```
git clone -b v0.25 https://github.com/zeux/meshoptimizer.git
```
Alternatively you can [download the .zip archive from GitHub](https://github.com/zeux/meshoptimizer/archive/v0.25.zip).
The library is also available as a Linux package in several distributions ([ArchLinux](https://aur.archlinux.org/packages/meshoptimizer/), [Debian](https://packages.debian.org/libmeshoptimizer), [FreeBSD](https://www.freshports.org/misc/meshoptimizer/), [Nix](https://mynixos.com/nixpkgs/package/meshoptimizer), [Ubuntu](https://packages.ubuntu.com/libmeshoptimizer)), as well as a [Vcpkg port](https://github.com/microsoft/vcpkg/tree/master/ports/meshoptimizer) (see [installation instructions](https://learn.microsoft.com/en-us/vcpkg/get_started/get-started)) and a [Conan package](https://conan.io/center/recipes/meshoptimizer).
[gltfpack](./gltf/README.md) is available as a pre-built binary on the [Releases page](https://github.com/zeux/meshoptimizer/releases) or via an [npm package](https://www.npmjs.com/package/gltfpack). Native binaries are recommended since they are more efficient and support texture compression.
## Building
meshoptimizer is distributed as a C/C++ header (`src/meshoptimizer.h`) and a set of C++ source files (`src/*.cpp`). To include it in your project, you can use one of two options:
* Use CMake to build the library (either as a standalone project or as part of your project)
* Add source files to your project's build system
The source files are organized in such a way that you don't need to change your build-system settings, and you only need to add the source files for the algorithms you use. They should build without warnings or special compilation options on all major compilers. If you prefer amalgamated builds, you can also concatenate the source files into a single `.cpp` file and build that instead.
To use meshoptimizer functions, simply `#include` the header `meshoptimizer.h`; the library source is C++, but the header is C-compatible.
## Core pipeline
When optimizing a mesh, you should typically feed it through a set of optimizations to maximize rendering efficiency (the order is important!):
1. Indexing
2. Vertex cache optimization
3. (optional) Overdraw optimization
4. Vertex fetch optimization
5. Vertex quantization
6. (optional) Shadow indexing
### Indexing
Most algorithms in this library assume that a mesh has a vertex buffer and an index buffer. For the algorithms to work well, and for the GPU to render your mesh efficiently, the vertex buffer has to have no redundant vertices; you can generate an index buffer from an unindexed vertex buffer or reindex an existing (potentially redundant) index buffer as follows:
> Note: meshoptimizer generally works with 32-bit (`unsigned int`) indices; however, when using C++ APIs you can use any integer type for index data via the provided template overloads. By convention, remap tables always use `unsigned int`.
First, generate a remap table from your existing vertex (and, optionally, index) data:
```c++
size_t index_count = face_count * 3;
size_t unindexed_vertex_count = face_count * 3;
std::vector<unsigned int> remap(unindexed_vertex_count); // temporary remap table
size_t vertex_count = meshopt_generateVertexRemap(&remap[0], NULL, index_count,
&unindexed_vertices[0], unindexed_vertex_count, sizeof(Vertex));
```
Note that in this case we only have an unindexed vertex buffer; when the input mesh has an index buffer, it needs to be passed to `meshopt_generateVertexRemap` instead of `NULL`, along with the correct source vertex count. In either case, the remap table is generated based on binary equivalence of the input vertices, so the resulting mesh will render the same way. Binary equivalence considers all input bytes, including padding, which should be zero-initialized if the vertex structure has gaps.
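For reference, a minimal sketch of the indexed variant (hypothetical `indices`/`vertices` buffers, same `Vertex` type as above):

```c++
// Reindex an existing, potentially redundant, indexed mesh: pass the source
// index buffer instead of NULL, along with the source vertex count.
std::vector<unsigned int> remap(vertex_count);
size_t unique_vertex_count = meshopt_generateVertexRemap(&remap[0], &indices[0], index_count,
    &vertices[0], vertex_count, sizeof(Vertex));
```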
After generating the remap table, you can allocate space for the target vertex buffer (`vertex_count` elements) and index buffer (`index_count` elements) and generate them:
```c++
meshopt_remapIndexBuffer(indices, NULL, index_count, &remap[0]);
meshopt_remapVertexBuffer(vertices, &unindexed_vertices[0], unindexed_vertex_count, sizeof(Vertex), &remap[0]);
```
You can then further optimize the resulting buffers by calling the other functions on them in-place.
`meshopt_generateVertexRemap` uses binary equivalence of vertex data, which is generally a reasonable default; however, in some cases some attributes may have floating point drift causing extra vertices to be generated. For such cases, it may be necessary to quantize some attributes (most importantly, normals and tangents) before generating the remap, or use `meshopt_generateVertexRemapCustom` algorithm that allows comparing individual attributes with tolerance by providing a custom comparison function:
```c++
size_t vertex_count = meshopt_generateVertexRemapCustom(&remap[0], NULL, index_count,
&unindexed_vertices[0].px, unindexed_vertex_count, sizeof(Vertex),
[&](unsigned int lhs, unsigned int rhs) -> bool {
const Vertex& lv = unindexed_vertices[lhs];
const Vertex& rv = unindexed_vertices[rhs];
return fabsf(lv.tx - rv.tx) < 1e-3f && fabsf(lv.ty - rv.ty) < 1e-3f;
});
```
### Vertex cache optimization
When the GPU renders the mesh, it has to run the vertex shader for each vertex; usually GPUs have a built-in fixed size cache that stores the transformed vertices (the result of running the vertex shader), and uses this cache to reduce the number of vertex shader invocations. This cache is usually small, 16-32 vertices, and can have different replacement policies; to use this cache efficiently, you have to reorder your triangles to maximize the locality of reused vertex references like so:
```c++
meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
```
The details of vertex reuse vary between different GPU architectures, so vertex cache optimization uses an adaptive algorithm that produces a triangle sequence with good locality that works well across different GPUs. Alternatively, you can use an algorithm that optimizes specifically for fixed-size FIFO caches: `meshopt_optimizeVertexCacheFifo` (with a recommended cache size of 16). While it generally produces less performant results on most GPUs, it runs ~2x faster, which may benefit rapid content iteration.
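For example, a one-line sketch of the FIFO variant (same `indices`/`vertex_count` as above, with the recommended cache size):

```c++
// FIFO-specific vertex cache optimization; 16 is the recommended cache size
meshopt_optimizeVertexCacheFifo(indices, indices, index_count, vertex_count, 16);
```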
### Overdraw optimization
After transforming the vertices, the GPU sends the triangles for rasterization, which generates pixels that are usually first run through the depth test; pixels that pass it get the pixel shader executed to generate the final color. As pixel shaders get more expensive, it becomes more and more important to reduce overdraw. While in general improving overdraw requires view-dependent operations, this library provides an algorithm to reorder triangles to minimize the overdraw from all directions, which you can run after vertex cache optimization like this:
```c++
meshopt_optimizeOverdraw(indices, indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), 1.05f);
```
The overdraw optimizer needs to read vertex positions as a float3 from the vertex; the code snippet above assumes that the vertex stores position as `float x, y, z`.
When performing the overdraw optimization you have to specify a floating-point threshold parameter. The algorithm tries to maintain a balance between vertex cache efficiency and overdraw; the threshold determines how much the algorithm can compromise the vertex cache hit ratio, with 1.05 meaning that the resulting ratio should be at most 5% worse than before the optimization.
Note that depending on the renderer structure and target hardware, the optimization may or may not be beneficial; for example, mobile GPUs with tiled deferred rendering (PowerVR, Apple) would not benefit from this optimization. For vertex heavy scenes it's recommended to measure the performance impact to ensure that the reduced vertex cache efficiency is outweighed by the reduced overdraw.
### Vertex fetch optimization
After the final triangle order has been established, we can still optimize the vertex buffer for memory efficiency. Before running the vertex shader, the GPU has to fetch the vertex attributes from the vertex buffer; the fetch is usually backed by a memory cache, so optimizing the data for locality of memory access is important. You can do this by running this code:
```c++
meshopt_optimizeVertexFetch(vertices, indices, index_count, vertices, vertex_count, sizeof(Vertex));
```
This will reorder the vertices in the vertex buffer to try to improve the locality of reference, and rewrite the indices in place to match; if the vertex data is stored using multiple streams, you should use `meshopt_optimizeVertexFetchRemap` instead. This optimization has to be performed on the final index buffer since the optimal vertex order depends on the triangle order.
Note that the algorithm does not try to model cache replacement precisely and instead just orders vertices in the order of use, which generally produces results that are close to optimal.
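For multi-stream vertex data, a hedged sketch of the remap-based variant (the `positions`/`texcoords` streams and their element types are illustrative assumptions):

```c++
// Compute a fetch-optimized remap from the final index buffer, then apply it
// to the index buffer and to each vertex stream; unused vertices are dropped.
std::vector<unsigned int> remap(vertex_count);
size_t unique_vertex_count = meshopt_optimizeVertexFetchRemap(&remap[0], indices, index_count, vertex_count);

meshopt_remapIndexBuffer(indices, indices, index_count, &remap[0]);
meshopt_remapVertexBuffer(positions, positions, vertex_count, sizeof(Position), &remap[0]);
meshopt_remapVertexBuffer(texcoords, texcoords, vertex_count, sizeof(TexCoord), &remap[0]);
```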
### Vertex quantization
To optimize memory bandwidth when fetching the vertex data even further, and to reduce the amount of memory required to store the mesh, it is often beneficial to quantize the vertex attributes to smaller types. While this optimization can technically run at any part of the pipeline (and sometimes doing quantization as the first step can improve indexing by merging almost identical vertices), it generally is easier to run this after all other optimizations since some of them require access to float3 positions.
Quantization is usually domain specific; it's common to quantize normals using 3 8-bit integers but you can use higher-precision quantization (for example using 10 bits per component in a 10_10_10_2 format), or a different encoding to use just 2 components. For positions and texture coordinate data the two most common storage formats are half precision floats, and 16-bit normalized integers that encode the position relative to the AABB of the mesh or the UV bounding rectangle.
The number of possible combinations here is very large but this library does provide the building blocks, specifically functions to quantize floating point values to normalized integers, as well as half-precision floats. For example, here's how you can quantize a normal:
```c++
unsigned int normal =
(meshopt_quantizeUnorm(v.nx, 10) << 20) |
(meshopt_quantizeUnorm(v.ny, 10) << 10) |
meshopt_quantizeUnorm(v.nz, 10);
```
and here's how you can quantize a position:
```c++
unsigned short px = meshopt_quantizeHalf(v.x);
unsigned short py = meshopt_quantizeHalf(v.y);
unsigned short pz = meshopt_quantizeHalf(v.z);
```
Since quantized vertex attributes often need to remain in their compact representations for efficient transfer and storage, they are usually dequantized during vertex processing by configuring the GPU vertex input to expect normalized integers or half-precision floats, which often needs no or minimal changes to the shader code. When CPU dequantization is required instead, `meshopt_dequantizeHalf` can be used to convert half-precision values back to single precision; for normalized integer formats, dequantization just requires dividing by 2^N-1 for unorm and 2^(N-1)-1 for snorm variants. For example, manually reversing `meshopt_quantizeUnorm(v, 10)` can be done by dividing by 1023.
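As an illustration, a short sketch reversing the two quantization examples above (variable names follow the earlier snippets):

```c++
// Half-precision position back to single precision
float x = meshopt_dequantizeHalf(px);

// 10-bit unorm normal components back to [0, 1] floats (divide by 2^10-1 = 1023)
float nx = float((normal >> 20) & 1023) / 1023.f;
float ny = float((normal >> 10) & 1023) / 1023.f;
float nz = float(normal & 1023) / 1023.f;
```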
### Shadow indexing
Many rendering pipelines require meshes to be rendered to depth-only targets, such as shadow maps or during a depth pre-pass, in addition to color/G-buffer targets. While using the same geometry data for both cases is possible, reducing the number of unique vertices for depth-only rendering can be beneficial, especially when the source geometry has many attribute seams due to faceted shading or lightmap texture seams.
To achieve this, this library provides the `meshopt_generateShadowIndexBuffer` algorithm, which generates a second (shadow) index buffer that can be used with the original vertex data:
```c++
std::vector<unsigned int> shadow_indices(index_count);
// note: this assumes Vertex starts with float3 positions and should be adjusted accordingly for quantized positions
meshopt_generateShadowIndexBuffer(&shadow_indices[0], indices, index_count, &vertices[0].x, vertex_count, sizeof(float) * 3, sizeof(Vertex));
```
Because the vertex data is shared, shadow indexing should be done after other optimizations of the vertex/index data. However, it's possible (and recommended) to optimize the resulting shadow index buffer for vertex cache:
```c++
meshopt_optimizeVertexCache(&shadow_indices[0], &shadow_indices[0], index_count, vertex_count);
```
In some cases, it may be beneficial to split the vertex positions into a separate buffer to maximize efficiency for depth-only rendering. Note that the example above assumes only positions are relevant for shadow rendering, but more complex materials may require adding texture coordinates (for alpha testing) or skinning data to the vertex portion used as a key. `meshopt_generateShadowIndexBufferMulti` can be useful for these cases if the relevant data is not contiguous.
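A sketch of the multi-stream variant, assuming a hypothetical `Vertex` layout where positions (`px/py/pz`) and texture coordinates (`u/v`) form the depth-only key:

```c++
// Each stream describes one attribute range to compare: pointer, element size, stride
meshopt_Stream streams[] = {
    {&vertices[0].px, sizeof(float) * 3, sizeof(Vertex)}, // positions
    {&vertices[0].u, sizeof(float) * 2, sizeof(Vertex)},  // UVs, e.g. for alpha testing
};

meshopt_generateShadowIndexBufferMulti(&shadow_indices[0], indices, index_count, vertex_count,
    streams, sizeof(streams) / sizeof(streams[0]));
```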
Note that for meshes with optimal indexing and few attribute seams, the shadow index buffer will be very similar to the original index buffer, so it may not always be worth generating a separate shadow index buffer even if the rendering pipeline relies on depth-only passes.
## Clusterization
While traditionally meshes have served as a unit of rendering, new approaches to rendering and raytracing are starting to use a smaller unit of work, such as clusters or meshlets. This allows more freedom in how the geometry is processed, and can lead to better performance and more efficient use of GPU hardware. This section describes algorithms designed to work with meshes as sets of clusters.
### Mesh shading
Modern GPUs are beginning to deviate from the traditional rasterization model. NVidia GPUs starting from Turing and AMD GPUs starting from RDNA2 provide a new programmable geometry pipeline that, instead of being built around index buffers and vertex shaders, is built around mesh shaders - a new shader type that allows providing a batch of work to the rasterizer.
Using mesh shaders in the context of traditional mesh rendering provides an opportunity to use a variety of optimization techniques, from more efficient vertex reuse to various forms of culling (e.g. cluster frustum or occlusion culling) and in-memory compression, to maximize the utilization of GPU hardware. Beyond traditional rendering, mesh shaders provide a richer programming model that can synthesize new geometry more efficiently than common alternatives such as geometry shaders. Mesh shading can be accessed via Vulkan or Direct3D 12 APIs; please refer to [Introduction to Turing Mesh Shaders](https://developer.nvidia.com/blog/introduction-turing-mesh-shaders/) and [Mesh Shaders and Amplification Shaders: Reinventing the Geometry Pipeline](https://devblogs.microsoft.com/directx/coming-to-directx-12-mesh-shaders-and-amplification-shaders-reinventing-the-geometry-pipeline/) for additional information.
To use mesh shaders for conventional rendering efficiently, geometry needs to be converted into a series of meshlets; each meshlet represents a small subset of the original mesh and comes with a small set of vertices and a separate micro-index buffer that references vertices in the meshlet. This information can be directly fed to the rasterizer from the mesh shader. This library provides algorithms to create meshlet data for a mesh, and - assuming geometry is static - can compute bounding information that can be used to perform cluster culling, a technique that can reject a meshlet if it's invisible on screen.
To generate meshlet data, this library provides `meshopt_buildMeshlets` algorithm, which tries to balance topological efficiency (by maximizing vertex reuse inside meshlets) with culling efficiency (by minimizing meshlet radius and triangle direction divergence) and produces GPU-friendly data. As an alternative (that can be useful for load-time processing), `meshopt_buildMeshletsScan` can create the meshlet data using a vertex cache-optimized index buffer as a starting point by greedily aggregating consecutive triangles until they go over the meshlet limits. `meshopt_buildMeshlets` is recommended for offline data processing even if cone culling is not used.
```c++
const size_t max_vertices = 64;
const size_t max_triangles = 126;
const float cone_weight = 0.0f;
size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles);
std::vector<meshopt_Meshlet> meshlets(max_meshlets);
std::vector<unsigned int> meshlet_vertices(indices.size());
std::vector<unsigned char> meshlet_triangles(indices.size());
size_t meshlet_count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(), indices.data(),
indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex), max_vertices, max_triangles, cone_weight);
```
To generate the meshlet data, `max_vertices` and `max_triangles` need to be set within limits supported by the hardware; for NVidia the values of 64 and 126 are recommended. `cone_weight` should be left as 0 if cluster cone culling is not used, and set to a value between 0 and 1 to balance cone culling efficiency with other forms of culling like frustum or occlusion culling (`0.25` is a reasonable default).
> Note that for earlier AMD GPUs, the best configurations tend to use the same limits for `max_vertices` and `max_triangles`, such as 64 and 64, or 128 and 128. Additionally, while NVidia recommends 64/126 as a good configuration, consider using a different configuration like `max_vertices 64, max_triangles 96`, to provide more realistic limits that are achievable on real-world meshes, and to reduce the overhead on other GPUs.
Each resulting meshlet refers to a portion of `meshlet_vertices` and `meshlet_triangles` arrays; the arrays are overallocated for the worst case so it's recommended to trim them before saving them as an asset / uploading them to the GPU:
```c++
const meshopt_Meshlet& last = meshlets[meshlet_count - 1];
meshlet_vertices.resize(last.vertex_offset + last.vertex_count);
meshlet_triangles.resize(last.triangle_offset + last.triangle_count * 3);
meshlets.resize(meshlet_count);
```
Depending on the application, other strategies of storing the data can be useful; for example, `meshlet_vertices` serves as indices into the original vertex buffer but it might be worthwhile to generate a mini vertex buffer for each meshlet to remove the extra indirection when accessing vertex data, or it might be desirable to compress vertex data as vertices in each meshlet are likely to be very spatially coherent.
For optimal rasterization performance, it is recommended to further optimize each meshlet in isolation for better triangle and vertex locality by calling `meshopt_optimizeMeshlet` on vertex and index data like so:
```c++
meshopt_optimizeMeshlet(&meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset], m.triangle_count, m.vertex_count);
```
Different applications will choose different strategies for rendering meshlets; on a GPU capable of mesh shading, meshlets can be rendered directly; for example, a basic GLSL shader for `VK_EXT_mesh_shader` extension could look like this (parts omitted for brevity):
```glsl
layout(binding = 0) readonly buffer Meshlets { Meshlet meshlets[]; };
layout(binding = 1) readonly buffer MeshletVertices { uint meshlet_vertices[]; };
layout(binding = 2) readonly buffer MeshletTriangles { uint8_t meshlet_triangles[]; };
void main() {
Meshlet meshlet = meshlets[gl_WorkGroupID.x];
SetMeshOutputsEXT(meshlet.vertex_count, meshlet.triangle_count);
for (uint i = gl_LocalInvocationIndex; i < meshlet.vertex_count; i += gl_WorkGroupSize.x) {
uint index = meshlet_vertices[meshlet.vertex_offset + i];
gl_MeshVerticesEXT[i].gl_Position = world_view_projection * vec4(vertex_positions[index], 1);
}
for (uint i = gl_LocalInvocationIndex; i < meshlet.triangle_count; i += gl_WorkGroupSize.x) {
uint offset = meshlet.triangle_offset + i * 3;
gl_PrimitiveTriangleIndicesEXT[i] = uvec3(
meshlet_triangles[offset], meshlet_triangles[offset + 1], meshlet_triangles[offset + 2]);
}
}
```
After generating the meshlet data, it's possible to generate extra data for each meshlet that can be saved and used at runtime to perform cluster culling, where each meshlet can be discarded if it's guaranteed to be invisible. To generate the data, `meshopt_computeMeshletBounds` can be used:
```c++
meshopt_Bounds bounds = meshopt_computeMeshletBounds(&meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset],
    m.triangle_count, &vertices[0].x, vertices.size(), sizeof(Vertex));
```
The resulting `bounds` values can be used to perform frustum or occlusion culling using the bounding sphere, or cone culling using the cone axis/angle (which will reject the entire meshlet if all triangles are guaranteed to be back-facing from the camera point of view):
```c++
if (dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff) reject();
```
Cluster culling should ideally run at a lower frequency than mesh shading, either using amplification/task shaders, or using a separate compute dispatch.
By default, the meshlet builder tries to form complete meshlets even if that requires merging disconnected regions of the mesh into a single meshlet. In some cases, such as hierarchical level of detail, or when advanced culling is used, it may be beneficial to prioritize spatial locality of triangles in a meshlet even if that results in partially filled meshlets. To that end, the `meshopt_buildMeshletsFlex` function can be used instead of `meshopt_buildMeshlets`; it provides two triangle limits, `min_triangles` and `max_triangles`, and uses an additional configuration parameter, `split_factor` (recommended value is 2.0), to decide whether increasing the meshlet radius is worth it to fit more triangles in the meshlet, as shown in the sketch below. When using this function, the worst case bound for the number of meshlets has to be computed using `meshopt_buildMeshletsBound` with the `min_triangles` parameter instead of `max_triangles`.
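For illustration, a minimal sketch of a `meshopt_buildMeshletsFlex` call (the limit values below are assumptions chosen for illustration, not library recommendations):

```c++
const size_t max_vertices = 64;
const size_t min_triangles = 24; // allow partially filled meshlets
const size_t max_triangles = 96;
const float cone_weight = 0.0f;  // no cone culling data needed
const float split_factor = 2.0f; // recommended starting point

// note: the worst case bound must be computed with min_triangles
size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, min_triangles);
std::vector<meshopt_Meshlet> meshlets(max_meshlets);
std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

size_t meshlet_count = meshopt_buildMeshletsFlex(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
    indices.data(), indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex),
    max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
```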
### Clustered raytracing
In addition to rasterization, meshlets can also be used for ray tracing. NVidia GPUs starting from Turing with recent drivers provide support for cluster acceleration structures (via `VK_NV_cluster_acceleration_structure` extension / NVAPI); instead of building a traditional BLAS, a cluster acceleration structure can be built for each meshlet and combined into a single clustered BLAS. While this currently results in reduced ray tracing performance for static geometry (for which a traditional BLAS may be more suitable), it allows updating the individual clusters without having to rebuild or refit the entire BLAS, which can be useful for mesh deformation or hierarchical level of detail.
When using meshlets for raytracing, the performance characteristics that matter differ from those of rasterization: for raytracing, clusters with good spatial partitioning that minimize ray-triangle intersection tests are preferred, while for rasterization, clusters with the maximum triangle count within vertex limits are ideal.
To generate meshlets optimized for raytracing, this library provides the `meshopt_buildMeshletsSpatial` algorithm, which builds clusters using a surface area heuristic (SAH) to produce raytracing-friendly cluster distributions:
```c++
const size_t max_vertices = 64;
const size_t min_triangles = 16;
const size_t max_triangles = 64;
const float fill_weight = 0.5f;
size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, min_triangles); // note: use min_triangles to compute worst case bound
std::vector<meshopt_Meshlet> meshlets(max_meshlets);
std::vector<unsigned int> meshlet_vertices(indices.size());
std::vector<unsigned char> meshlet_triangles(indices.size());
size_t meshlet_count = meshopt_buildMeshletsSpatial(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(), indices.data(),
    indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex), max_vertices, min_triangles, max_triangles, fill_weight);
```
The algorithm recursively subdivides the triangles into a BVH-like hierarchy using SAH for optimal spatial partitioning while balancing cluster size; this results in clusters that are significantly more efficient to raytrace compared to clusters generated by `meshopt_buildMeshlets`, but can still be used for rasterization (for example, to build visibility buffers or G-buffers).
The `min_triangles` and `max_triangles` parameters control the allowed range of triangles per cluster. For optimal raytracing performance, `min_triangles` should be at most `max_triangles/2` (or, ideally, `max_triangles/4`) to give the algorithm enough freedom to produce high-quality spatial partitioning. For meshes with few seams due to normal or UV discontinuities, using `max_vertices` equal to `max_triangles` is recommended when rasterization performance is a concern; for meshes with many seams or for renderers that primarily use meshlets for ray tracing, a higher `max_vertices` value should be used as it ensures that more clusters can fully utilize the triangle limit.
The `fill_weight` parameter (typically between 0 and 1, although values higher than 1 could be used to prioritize cluster fill even more) controls the trade-off between pure SAH optimization and triangle utilization. A value of 0 will optimize purely for SAH, resulting in best raytracing performance but potentially smaller clusters. Values between 0.5 and 0.75 typically provide a good balance of SAH quality vs triangle count.
### Point cloud clusterization
Both of the meshlet algorithms above are designed to work with triangle meshes. In some cases, splitting a point cloud into fixed-size clusters can be useful; the resulting point clusters could be rendered via mesh or compute shaders, or the resulting subdivision can be used to parallelize point processing while maintaining locality of points. To that end, this library provides the `meshopt_spatialClusterPoints` algorithm:
```c++
const size_t cluster_size = 256;
std::vector<unsigned int> index(mesh.vertices.size());
meshopt_spatialClusterPoints(&index[0], &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), cluster_size);
```
The resulting index buffer could be used to process the points directly, or reorganize the point data into flat contiguous arrays. Every consecutive chunk of `cluster_size` points in the index buffer refers to a single cluster, with just the last cluster containing fewer points if the total number of points is not a multiple of `cluster_size`. Note that the index buffer is not a remap table, so `meshopt_remapVertexBuffer` can't be used to flatten the point data.
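For example, a simple gather loop can produce a flat, cluster-ordered copy of the point data (a sketch; `index` and `mesh.vertices` follow the example above):

```c++
std::vector<Vertex> clustered(mesh.vertices.size());

// index[i] selects the source point for slot i; every cluster_size consecutive slots form one cluster
for (size_t i = 0; i < index.size(); ++i)
    clustered[i] = mesh.vertices[index[i]];
```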
### Cluster partitioning
When working with clustered geometry, it can be beneficial to organize clusters into larger groups (partitions) for more efficient processing or workload distribution. This library provides an algorithm to partition clusters into groups of similar size while prioritizing locality:
```c++
const size_t partition_size = 32;
std::vector<unsigned int> cluster_partitions(cluster_count);
size_t partition_count = meshopt_partitionClusters(&cluster_partitions[0], &cluster_indices[0], total_index_count,
    &cluster_index_counts[0], cluster_count, &vertices[0].x, vertex_count, sizeof(Vertex), partition_size);
```
The algorithm assigns each cluster to a partition, aiming for a target partition size while prioritizing topological locality (sharing vertices) and spatial locality. The resulting partitions can be used for more efficient batched processing of clusters, or for hierarchical simplification schemes similar to Nanite.
If vertex positions are specified (not NULL), spatial locality will influence priority of merging clusters; otherwise, the algorithm will rely solely on topological connections.
After partitioning, each element in the destination array contains the partition ID (ranging from 0 to the returned partition count minus 1) for the corresponding cluster. Note that the partitions may be both smaller and larger than the target size.
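For batched processing it can be convenient to invert this mapping; a minimal sketch, using `cluster_partitions` and `partition_count` from the example above:

```c++
// collect the list of clusters that belong to each partition
std::vector<std::vector<unsigned int> > partitions(partition_count);

for (size_t i = 0; i < cluster_count; ++i)
    partitions[cluster_partitions[i]].push_back((unsigned int)i);
```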
## Mesh compression
In case storage size or transmission bandwidth is of importance, you might want to additionally compress vertex and index data. While several mesh compression libraries, like Google Draco, are available, they are typically designed to maximize the compression ratio at the cost of disturbing the vertex/index order (which makes the meshes inefficient to render on the GPU) or decompression performance. They also frequently don't support custom game-ready quantized vertex formats, and thus require re-quantizing the data after loading it, introducing extra quantization error and making decoding slower.
Alternatively, you can use general-purpose compression libraries like zstd or Oodle to compress vertex/index data; however, these compressors aren't designed to exploit redundancies in vertex/index data, and as such the compression rates can be unsatisfactory.
To that end, this library provides algorithms to "encode" vertex and index data. The result of the encoding is generally significantly smaller than initial data, and remains compressible with general purpose compressors - so you can either store encoded data directly (for modest compression ratios and maximum decoding performance), or further compress it with LZ4/zstd/Oodle to maximize compression ratio.
> Note: this compression scheme is available as a glTF extension [EXT_meshopt_compression](https://github.com/KhronosGroup/glTF/blob/main/extensions/2.0/Vendor/EXT_meshopt_compression/README.md).
### Vertex compression
This library provides a lossless algorithm to encode/decode vertex data. To encode vertices, you need to allocate a target buffer (using the worst case bound) and call the encoding function:
```c++
std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
vbuf.resize(meshopt_encodeVertexBuffer(&vbuf[0], vbuf.size(), vertices, vertex_count, sizeof(Vertex)));
```
To decode the data at runtime, call the decoding function:
```c++
int res = meshopt_decodeVertexBuffer(vertices, vertex_count, sizeof(Vertex), &vbuf[0], vbuf.size());
assert(res == 0);
```
Note that vertex encoding assumes that the vertex buffer was optimized for vertex fetch, and that the vertices are quantized. Feeding unoptimized data into the encoder may produce poor compression ratios. The codec itself is lossless - the only lossy steps are quantization, reordering, or filters that you may apply before encoding. Additionally, if the vertex data contains padding bytes, they should be zero-initialized to ensure that the encoder does not need to store uninitialized data.
The decoder is heavily optimized and can directly target write-combined memory; you can expect it to run at 3-6 GB/s on modern desktop CPUs. The compression ratio depends on the data; for vertex data it is typically around 2-4x (compared to already quantized and optimally packed data). General purpose lossless compressors can further improve the compression ratio at some cost to decoding performance.
The vertex codec tries to take advantage of the inherent locality of sequential vertices and identify bit patterns that repeat in consecutive vertices. Typically, vertex cache + vertex fetch provides a reasonably local vertex traversal order; without an index buffer, it is recommended to sort vertices spatially (via `meshopt_spatialSortRemap`) to improve the compression ratio.
It is crucial to correctly specify the stride when encoding vertex data; however, for compression ratio it does not matter whether the vertices are interleaved or deinterleaved, as the codecs perform full byte deinterleaving internally. The stride of each stream must be a multiple of 4 bytes.
For optimal compression results, the values should be quantized to small integers. It can be valuable to use bit counts that are not multiples of 8. For example, instead of using 16 bits to represent texture coordinates, use 12-bit integers and divide by 4095 in the shader. Alternatively, using half-precision floats can often achieve good results.
For single-precision floating-point data, it's recommended to use `meshopt_quantizeFloat` to remove entropy from the lower bits of the mantissa; consider keeping 15 mantissa bits, or 7 bits for extreme compression.
For normal or tangent vectors, using octahedral encoding is recommended over three components as it reduces redundancy; similarly, consider using 10-12 bits per component instead of 16.
When data is bit packed, specifying compression level 3 (via `meshopt_encodeVertexBufferLevel`) can improve the compression further by redistributing bits between components.
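As an illustration, a vertex could be prepared for encoding using the quantization helpers provided by the library; the packed layout below is a sketch (assuming a `Vertex` with `px/py/pz` position and `tx/ty` texture coordinates, as in other examples), not a prescribed format:

```c++
struct PackedVertex {
    unsigned short px, py, pz, pw; // half-precision position; pw is zero-initialized padding
    unsigned short tu, tv;         // 12-bit texture coordinates stored in 16-bit fields
}; // 12 bytes per vertex; the stride is a multiple of 4 as the codec requires

PackedVertex pack(const Vertex& v) {
    PackedVertex r = {};
    r.px = meshopt_quantizeHalf(v.px);
    r.py = meshopt_quantizeHalf(v.py);
    r.pz = meshopt_quantizeHalf(v.pz);
    r.tu = (unsigned short)meshopt_quantizeUnorm(v.tx, 12); // reconstruct by dividing by 4095
    r.tv = (unsigned short)meshopt_quantizeUnorm(v.ty, 12);
    return r;
}
```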
### Index compression
This library also provides algorithms to encode/decode index data. To encode triangle indices, you need to allocate a target buffer (using the worst case bound) and call the encoding function:
```c++
std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(index_count, vertex_count));
ibuf.resize(meshopt_encodeIndexBuffer(&ibuf[0], ibuf.size(), indices, index_count));
```
To decode the data at runtime, call the decoding function:
```c++
int res = meshopt_decodeIndexBuffer(indices, index_count, &ibuf[0], ibuf.size());
assert(res == 0);
```
Note that index encoding assumes that the index buffer was optimized for vertex cache and vertex fetch. Feeding unoptimized data into the encoder will produce poor compression ratios. The codec preserves the order of triangles, but it can rotate each triangle to improve the compression ratio (which means the provoking vertex may change).
The decoder is heavily optimized and can directly target write-combined memory; you can expect it to run at 3-6 GB/s on modern desktop CPUs.
The index codec targets 1 byte per triangle as a best case (6x smaller than raw 16-bit index data); on real-world meshes, it's typical to achieve 1-1.2 bytes per triangle. To reach this, the index data needs to be optimized for vertex cache and vertex fetch. Optimizations that do not disrupt triangle locality (such as overdraw) are safe to use in between.
To reduce the data size further, it's possible to use `meshopt_optimizeVertexCacheStrip` instead of `meshopt_optimizeVertexCache` when optimizing for vertex cache. This trades off some efficiency in vertex transform for smaller index (and sometimes vertex) data.
When referenced vertex indices are not sequential, the index codec will use around 2 bytes per index. This can happen when the referenced vertices are a sparse subset of the vertex buffer, such as when encoding LODs. General-purpose compression can be especially helpful in this case.
Index buffer codec only supports triangle list topology; when encoding triangle strips or line lists, use `meshopt_encodeIndexSequence`/`meshopt_decodeIndexSequence` instead. This codec typically encodes indices into ~1 byte per index, but compressing the results further with a general purpose compressor can improve the results to 1-3 bits per index.
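A minimal sketch of the sequence codec, mirroring the triangle list example above:

```c++
std::vector<unsigned char> sbuf(meshopt_encodeIndexSequenceBound(index_count, vertex_count));
sbuf.resize(meshopt_encodeIndexSequence(&sbuf[0], sbuf.size(), indices, index_count));

// decoding restores the indices exactly, in the original order
int res = meshopt_decodeIndexSequence(indices, index_count, &sbuf[0], sbuf.size());
assert(res == 0);
```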
### Point cloud compression
The vertex encoding algorithms can be used to compress arbitrary streams of attribute data; one other use case besides triangle meshes is point cloud data. Typically point clouds come with position, color and possibly other attributes but don't have an implied point order.
To compress point clouds efficiently, it's recommended to first preprocess the points by sorting them using the spatial sort algorithm:
```c++
std::vector<unsigned int> remap(point_count);
meshopt_spatialSortRemap(&remap[0], positions, point_count, sizeof(vec3));
// for each attribute stream
meshopt_remapVertexBuffer(positions, positions, point_count, sizeof(vec3), &remap[0]);
```
After this, the resulting arrays should be quantized (e.g. using 16-bit fixed point numbers for positions and 8-bit color components), and the result can be compressed using `meshopt_encodeVertexBuffer` as described in the previous section. To decompress, `meshopt_decodeVertexBuffer` will recover the quantized data, which can be used directly or converted back to the original floating-point representation. The compression ratio depends on the nature of the source data; for colored points it's typical to get 35-40 bits per point.
### Vertex filters
To further leverage the inherent structure of some vertex data, it's possible to use filters that encode and decode the data in a lossy manner. This is similar to quantization but can be used without having to change the shader code. After decoding, the filter transformation needs to be reversed. For native game engine pipelines, it is usually more optimal to carefully prequantize and pretransform the vertex data, but sometimes (for example when serializing data in glTF format) this is not a practical option and filters are more convenient. This library provides four filters:
- Octahedral filter (`meshopt_encodeFilterOct`/`meshopt_decodeFilterOct`) encodes quantized (snorm) normal or tangent vectors using octahedral encoding. Any number of bits <= 16 can be used with 4 bytes or 8 bytes per vector.
- Quaternion filter (`meshopt_encodeFilterQuat`/`meshopt_decodeFilterQuat`) encodes quantized (snorm) quaternion vectors; this can be used to encode rotations or tangent frames. Any number of bits between 4 and 16 can be used with 8 bytes per vector.
- Exponential filter (`meshopt_encodeFilterExp`/`meshopt_decodeFilterExp`) encodes single-precision floating-point vectors; this can be used to encode arbitrary floating-point data more efficiently. In addition to an arbitrary bit count (<= 24), the filter takes a "mode" parameter that allows specifying how the exponent sharing is performed to trade off compression ratio and quality:
- `meshopt_EncodeExpSeparate` does not share exponents and results in the largest output
- `meshopt_EncodeExpSharedVector` shares exponents between different components of the same vector
- `meshopt_EncodeExpSharedComponent` shares exponents between the same component in different vectors
- `meshopt_EncodeExpClamped` does not share exponents but clamps the exponent range to reduce exponent entropy
- Color filter (`meshopt_encodeFilterColor`/`meshopt_decodeFilterColor`) encodes quantized (unorm) RGBA colors using YCoCg encoding. Any number of bits <= 16 can be used with 4 bytes or 8 bytes per vector.
Note that all filters are lossy and require the data to be deinterleaved with one attribute per stream; this facilitates efficient SIMD implementation of filter decoders, which decodes at 5-10 GB/s on modern desktop CPUs, allowing the overall decompression speed to be closer to that of the raw vertex codec.
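For example, normals could be run through the octahedral filter before vertex encoding; this is a sketch assuming `normal_data` points to `normal_count` unit vectors stored as 4 floats each (the filter preserves the fourth component):

```c++
// encode each normal into 4 bytes (8 bits per component)
std::vector<unsigned char> filtered(normal_count * 4);
meshopt_encodeFilterOct(filtered.data(), normal_count, /* stride= */ 4, /* bits= */ 8, normal_data);

// `filtered` can now be compressed with meshopt_encodeVertexBuffer; after decompression,
// reverse the filter in place to recover unit vectors stored as 8-bit normalized integers
meshopt_decodeFilterOct(filtered.data(), normal_count, /* stride= */ 4);
```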
### Versioning and compatibility
The following guarantees on data compatibility are provided for point releases (*no* guarantees are given for the development branch):
- Data encoded with older versions of the library can always be decoded with newer versions;
- Data encoded with newer versions of the library can be decoded with older versions, provided that encoding versions are set correctly; if binary stability of encoded data is important, use `meshopt_encodeVertexVersion` and `meshopt_encodeIndexVersion` to 'pin' the data versions (or `version` argument of `meshopt_encodeVertexBufferLevel`).
By default, vertex data is encoded for format version 1 (compatible with meshoptimizer v0.23+), and index data is encoded for format version 1 (compatible with meshoptimizer v0.14+). When decoding the data, the decoder will automatically detect the version from the data header.
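For example, to keep assets decodable by older runtimes, the encoded versions could be pinned before calling the encoders (a sketch; the versions to pin depend on what the deployed decoders support):

```c++
meshopt_encodeVertexVersion(0); // vertex format v0, decodable by releases before v0.23
meshopt_encodeIndexVersion(1);  // index format v1 (meshoptimizer v0.14+)
```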
## Simplification
None of the algorithms presented so far affect visual appearance, with the exception of quantization, which has a minimal, controlled impact. However, fundamentally the most effective way to reduce the rendering or transmission cost of a mesh is to reduce the number of triangles in the mesh.
### Basic simplification
This library provides a simplification algorithm, `meshopt_simplify`, that reduces the number of triangles in the mesh. Given a vertex and an index buffer, it generates a second index buffer that uses existing vertices in the vertex buffer. This index buffer can be used directly for rendering with the original vertex buffer (preferably after vertex cache optimization using `meshopt_optimizeVertexCache`), or a new compact vertex/index buffer can be generated using `meshopt_optimizeVertexFetch` that uses the optimal number and order of vertices.
```c++
float threshold = 0.2f;
size_t target_index_count = size_t(index_count * threshold);
float target_error = 1e-2f;
std::vector<unsigned int> lod(index_count);
float lod_error = 0.f;
lod.resize(meshopt_simplify(&lod[0], indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex),
    target_index_count, target_error, /* options= */ 0, &lod_error));
```
Target error is an approximate measure of the deviation from the original mesh using distance normalized to `[0..1]` range (e.g. `1e-2f` means that simplifier will try to maintain the error to be below 1% of the mesh extents). Note that the simplifier attempts to produce the requested number of indices at minimal error, but because of topological restrictions and error limit it is not guaranteed to reach the target index count and can stop earlier.
To disable the error limit, `target_error` can be set to `FLT_MAX`. This makes it more likely that the simplifier will reach the target index count, but it may produce a mesh that looks significantly different from the original, so using the resulting error to control viewing distance would be required. Conversely, setting `target_index_count` to 0 will simplify the input mesh as much as possible within the specified error limit; this can be useful for generating LODs that should look good at a given viewing distance.
The algorithm follows the topology of the original mesh in an attempt to preserve attribute seams, borders and overall appearance. For meshes with inconsistent topology or many seams, such as faceted meshes, it can result in simplifier getting "stuck" and not being able to simplify the mesh fully. Therefore it's critical that identical vertices are "welded" together, that is, the input vertex buffer does not contain duplicates. Additionally, it may be worthwhile to weld the vertices without taking into account vertex attributes that aren't critical and can be rebuilt later, or use "permissive" mode described below.
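One way to weld by position only is to build a shadow index buffer that treats vertices with identical positions as the same vertex and feed that to the simplifier instead of the original indices; a sketch, assuming the position is the first attribute in `Vertex`:

```c++
std::vector<unsigned int> shadow(index_count);
meshopt_generateShadowIndexBuffer(&shadow[0], indices, index_count,
    &vertices[0].x, vertex_count, sizeof(float) * 3, sizeof(Vertex));
// shadow can now be simplified; the resulting indices still refer to the original vertex buffer
```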
Alternatively, the library provides another simplification algorithm, `meshopt_simplifySloppy`, which doesn't follow the topology of the original mesh. This means that it doesn't preserve attribute seams or borders, but it can collapse internal details that are too small to matter because it can merge mesh features that are topologically disjoint but spatially close. In general, this algorithm produces meshes with worse geometric quality and poor attribute quality compared to `meshopt_simplify`.
The algorithm can also return the resulting normalized deviation that can be used to choose the correct level of detail based on screen size or solid angle; the error can be converted to object space by multiplying by the scaling factor returned by `meshopt_simplifyScale`. For example, given a mesh with a precomputed LOD and a prescaled error, the screen-space normalized error can be computed and used for LOD selection:
```c++
// lod_factor can be 1 or can be adjusted for more or less aggressive LOD selection
float d = max(0, distance(camera_position, mesh_center) - mesh_radius);
float e = d * (tan(camera_fovy / 2) * 2 / screen_height); // 1px in mesh space
bool lod_ok = e * lod_factor >= lod_error;
```
When a sequence of LOD meshes is generated that all use the original vertex buffer, care must be taken to order vertices optimally so as not to penalize mobile GPU architectures that are only capable of transforming a sequential vertex buffer range. It's recommended in this case to first optimize each LOD for vertex cache, then assemble all LODs into one large index buffer starting from the coarsest LOD (the one with the fewest triangles), and call `meshopt_optimizeVertexFetch` on the final large index buffer. This makes sure that coarser LODs require a smaller vertex range and are efficient with respect to vertex fetch and transform.
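A sketch of this flow, assuming `lods` is an array of `lod_count` per-LOD index buffers, with `lods[0]` being the most detailed:

```c++
// optimize each LOD for vertex cache independently
for (size_t i = 0; i < lod_count; ++i)
    meshopt_optimizeVertexCache(lods[i].data(), lods[i].data(), lods[i].size(), vertex_count);

// concatenate all LODs, coarsest first
std::vector<unsigned int> combined;
for (size_t i = lod_count; i > 0; --i)
    combined.insert(combined.end(), lods[i - 1].begin(), lods[i - 1].end());

// reorder vertices once for the whole chain; indices are rewritten in place,
// so per-LOD ranges can be recomputed from the individual LOD sizes
meshopt_optimizeVertexFetch(vertices.data(), combined.data(), combined.size(),
    vertices.data(), vertex_count, sizeof(Vertex));
```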
### Attribute-aware simplification
While `meshopt_simplify` is aware of attribute discontinuities by default (and infers them through the supplied index buffer) and tries to preserve them, it can be useful to provide information about attribute values. This allows the simplifier to take attribute error into account which can improve shading (by using vertex normals), texture deformation (by using texture coordinates), and may be necessary to preserve vertex colors when textures are not used in the first place. This can be done by using a variant of the simplification function that takes attribute values and weight factors, `meshopt_simplifyWithAttributes`:
```c++
const float nrm_weight = 0.5f;
const float attr_weights[3] = {nrm_weight, nrm_weight, nrm_weight};
std::vector<unsigned int> lod(index_count);
float lod_error = 0.f;
lod.resize(meshopt_simplifyWithAttributes(&lod[0], indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex),
    &vertices[0].nx, sizeof(Vertex), attr_weights, 3, /* vertex_lock= */ NULL,
    target_index_count, target_error, /* options= */ 0, &lod_error));
```
The attributes are passed as a separate buffer (in the example above it's a subset of the same vertex buffer) and should be stored as consecutive floats; attribute weights are used to control the importance of each attribute in the simplification process. For normalized attributes like normals and vertex colors, a weight around 1.0 is usually appropriate; internally, a change of `1/weight` in attribute value over a distance `d` is approximately equivalent to a change of `d` in position. Using higher weights may be appropriate to preserve attribute quality at the cost of position quality. If the attribute has a different scale (e.g. unnormalized vertex colors in [0..255] range), the weight should be divided by the scaling factor (1/255 in this example).
Both the target error and the resulting error combine positional error and attribute error, so the error can be used to control the LOD while taking attribute quality into account, assuming carefully chosen weights.
### Permissive simplification
By default, `meshopt_simplify` preserves attribute discontinuities inferred from the supplied index buffer. For meshes with many seams, the simplifier can get "stuck" and fail to fully simplify the mesh, as it cannot collapse vertices across attribute seams. This is especially problematic for meshes with faceted normals (flat shading), as the simplifier may not be able to reduce the triangle count at all. The `meshopt_SimplifyPermissive` option relaxes these restrictions, allowing the simplifier to collapse vertices across attribute discontinuities when the resulting error is acceptable:
```c++
std::vector<unsigned int> lod(index_count);
float lod_error = 0.f;
lod.resize(meshopt_simplifyWithAttributes(&lod[0], indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex),
    &vertices[0].nx, sizeof(Vertex), attr_weights, 3, /* vertex_lock= */ NULL,
    target_index_count, target_error, /* options= */ meshopt_SimplifyPermissive, &lod_error));
```
To maintain appearance, it's highly recommended to use this option together with attribute-aware simplification, as shown above, as it allows the simplifier to maintain attribute appearance. In this mode, it is often desirable to selectively preserve certain attribute seams, such as UV seams or sharp creases. This can be achieved by using the `vertex_lock` array with flag `meshopt_SimplifyVertex_Protect` set for individual vertices to protect specific discontinuities. To fill this array, use `meshopt_generatePositionRemap` to create a mapping table for vertices with identical positions, and then compare each vertex to the remapped vertex to determine which attributes are different:
```c++
std::vector<unsigned int> remap(vertices.size());
meshopt_generatePositionRemap(&remap[0], &vertices[0].px, vertices.size(), sizeof(Vertex));
std::vector<unsigned char> locks(vertices.size());
for (size_t i = 0; i < vertices.size(); ++i) {
    unsigned int r = remap[i];

    if (r != i && (vertices[r].tx != vertices[i].tx || vertices[r].ty != vertices[i].ty))
        locks[i] |= meshopt_SimplifyVertex_Protect; // protect UV seams
    if (r != i && (vertices[r].nx * vertices[i].nx + vertices[r].ny * vertices[i].ny + vertices[r].nz * vertices[i].nz < 0.25f))
        locks[i] |= meshopt_SimplifyVertex_Protect; // protect sharp normal creases
}
```
This approach provides fine-grained control over which discontinuities to preserve. The permissive mode combined with selective locking provides a balance between simplification quality and attribute preservation, and usually results in higher quality LODs for the same target triangle count (and dramatically higher quality compared to `meshopt_simplifySloppy`).
> Note: this functionality is currently experimental and is subject to future improvements. Certain collapses are restricted to protect the overall topology, and attribute quality may occasionally regress.
### Simplification with vertex update
All simplification functions described so far reuse the original vertex buffer and only produce a new index buffer. This means that the resulting mesh will have the same vertex positions and attributes as the original mesh; this is optimal for minimizing the memory consumption and for highly detailed meshes often provides good quality. However, for more aggressive simplification to retain visual quality, it may be necessary to adjust vertex data for optimal appearance. This can be done by using a variant of the simplification function that updates vertex positions and attributes, `meshopt_simplifyWithUpdate`:
```c++
indices.resize(meshopt_simplifyWithUpdate(&indices[0], indices.size(), &vertices[0].px, vertices.size(), sizeof(Vertex),
    &vertices[0].nx, sizeof(Vertex), attr_weights, 3, /* vertex_lock= */ NULL,
    target_index_count, target_error, /* options= */ 0, &result_error));
```
Unlike `meshopt_simplify`/`meshopt_simplifyWithAttributes`, this function updates the index buffer as well as vertex positions and attributes in place. The resulting indices still refer to the original vertex buffer; any attributes that are not passed to the simplifier can be left unchanged. However, since the original contents of `vertices` are no longer valid for rendering the original mesh, a new compact vertex/index buffer should be generated using `meshopt_optimizeVertexFetch` (after optimizing the index data with `meshopt_optimizeVertexCache`). If the original data is important, it should be copied before calling this function.
Since the vertex positions are updated, this may require updating some attributes that could previously be left as-is when using the original vertex buffer. Notably, texture coordinates need to be updated to avoid texture distortion; thus it's highly recommended to include texture coordinates in the attribute data passed to the simplifier. For attributes to be updated, the corresponding attribute weight must not be zero; for texture coordinates, a weight of 1.0 is usually sufficient in this case (although a higher or mesh dependent weight could be used with this function or other functions to reduce UV stretching).
Attributes that have specific constraints like normals and colors should be renormalized or clamped after the function returns new data. Attributes like bone indices/weights don't need to be updated for reasonable results (but regularization via `meshopt_SimplifyRegularize` may still be helpful to maintain deformation quality).
Using unique vertex data for each LOD in a chain can improve visual quality, but it comes at a cost of ~doubling vertex memory used (if each LOD is using half the triangles of the previous LOD). To reduce the memory footprint, it is possible to use shared vertices with `meshopt_simplifyWithAttributes` for the first one or two LODs in the chain, and only switch to `meshopt_simplifyWithUpdate` for the remainder. In that case, similarly to the use of `meshopt_simplify` described earlier, care must be taken to optimally arrange the vertices in the original vertex buffer.
### Advanced simplification
`meshopt_simplify*` functions expose additional options and parameters that can be used to control the simplification process in more detail.
For basic customization, a number of options can be passed via `options` bitmask that adjust the behavior of the simplifier:
- `meshopt_SimplifyLockBorder` restricts the simplifier from collapsing edges that are on the border of the mesh. This can be useful for simplifying mesh subsets independently, so that the LODs can be combined without introducing cracks.
- `meshopt_SimplifyErrorAbsolute` changes the error metric from relative to absolute both for the input error limit as well as for the resulting error. This can be used instead of `meshopt_simplifyScale`.
- `meshopt_SimplifySparse` improves simplification performance assuming input indices are a sparse subset of the mesh. This can be useful when simplifying small mesh subsets independently, and is intended to be used for meshlet simplification. For consistency, it is recommended to use absolute errors when sparse simplification is desired, as this flag changes the meaning of the relative errors.
- `meshopt_SimplifyPrune` allows the simplifier to remove isolated components regardless of the topological restrictions inside the component. This is generally recommended for full-mesh simplification as it can improve quality and reduce triangle count; note that with this option, triangles connected to locked vertices may be removed as part of their component.
- `meshopt_SimplifyRegularize` produces more regular triangle sizes and shapes during simplification, at some cost to geometric quality. This can improve geometric quality under deformation such as skinning.
- `meshopt_SimplifyPermissive` allows collapses across attribute discontinuities, except for vertices that are tagged with `meshopt_SimplifyVertex_Protect` via `vertex_lock`.
When using `meshopt_simplifyWithAttributes`, it is also possible to lock certain vertices by providing a `vertex_lock` array that contains a value for each vertex in the mesh, with `meshopt_SimplifyVertex_Lock` set for vertices that should not be collapsed. This can be useful to preserve certain vertices, such as the boundary of the mesh, with more control than `meshopt_SimplifyLockBorder` option provides. When using `meshopt_simplifyWithUpdate`, locking vertices (whether via `vertex_lock` or `meshopt_SimplifyLockBorder`) will also prevent the simplifier from updating their positions and attributes; this can be useful together with `meshopt_SimplifySparse` for meshlet simplification, as meshlets at one level of hierarchy can be simplified together without excessive data copying.
In addition to the `meshopt_SimplifyPrune` flag, you can explicitly prune isolated components by calling the `meshopt_simplifyPrune` function. This can be done before regular simplification or as the only step, which is useful for scenarios like isosurface cleanup. Similar to other simplification functions, the `target_error` argument controls the cutoff of component radius and is specified in relative units (e.g., `1e-2f` will remove components under 1%). If an absolute cutoff is desired, divide the parameter by the factor returned by `meshopt_simplifyScale`.
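For example (a sketch, following the conventions of the earlier examples):

```c++
std::vector<unsigned int> pruned(index_count);
pruned.resize(meshopt_simplifyPrune(&pruned[0], indices, index_count,
    &vertices[0].x, vertex_count, sizeof(Vertex), /* target_error= */ 1e-2f));
```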
Simplification currently assumes that the input mesh is using the same material for all triangles. If the mesh uses multiple materials, it is possible to split the mesh into subsets based on the material and simplify each subset independently, using `meshopt_SimplifyLockBorder` or `vertex_lock` to preserve material boundaries; however, this limits the collapses and may reduce the resulting quality. An alternative approach is to encode information about the material into the vertex buffer, ensuring that all three vertices referencing the same triangle have the same material ID; this may require duplicating vertices on the boundary between materials. After this, simplification can be performed as usual, and after simplification per-triangle material information can be computed from the vertex material IDs. There is no need to inform the simplifier of the value of the material ID: the implicit boundaries created by duplicating vertices with conflicting material IDs will be preserved automatically (unless permissive simplification is used, in which case material boundaries should be protected via `vertex_lock`).
When generating a LOD chain, you can either re-simplify each LOD from the original mesh or use the previous LOD as the starting point for the next level. The latter approach is more efficient and produces smoother visual transitions between LOD levels while preserving mesh attributes better. With this method, resulting error values from previous levels should be accumulated for LOD selection. Additionally, consider using `meshopt_SimplifySparse` to improve performance when generating deep LOD chains.
### Point cloud simplification
In addition to triangle mesh simplification, this library provides a function to simplify point clouds. The algorithm reduces the point cloud to a specified number of points while preserving the overall appearance, and can optionally take per-point colors into account:
```c++
const float color_weight = 1;
std::vector<unsigned int> indices(target_count);
indices.resize(meshopt_simplifyPoints(&indices[0], &points[0].x, points.size(), sizeof(Point),
    &points[0].r, sizeof(Point), color_weight, target_count));
```
The resulting indices can be used to render the simplified point cloud; to reduce the memory footprint, the point cloud can be reindexed to create an array of points from the indices.
## Efficiency analyzers
While the only way to get precise performance data is to measure performance on the target GPU, it can be valuable to measure the impact of these optimizations in a GPU-independent manner. To this end, the library provides analyzers for all three major optimization routines. For each optimization there is a corresponding analyze function, like `meshopt_analyzeOverdraw`, that returns a struct with statistics.
`meshopt_analyzeVertexCache` returns vertex cache statistics. The common metric to use is ACMR - average cache miss ratio, which is the ratio of the total number of vertex invocations to the triangle count. The worst-case ACMR is 3 (the GPU has to process 3 vertices for each triangle); on regular grids the optimal ACMR approaches 0.5. On real meshes it is usually in the [0.5..1.5] range depending on the number of vertex splits. Another useful metric is ATVR - average transformed vertex ratio - which represents the ratio of vertex shader invocations to the total vertex count, and has a best case of 1.0 regardless of mesh topology (each vertex is transformed exactly once).
`meshopt_analyzeVertexFetch` returns vertex fetch statistics. The main metric it uses is overfetch - the ratio between the number of bytes read from the vertex buffer and the total number of bytes in the vertex buffer. Assuming non-redundant vertex buffers, the best case is 1.0 - each byte is fetched once.
`meshopt_analyzeOverdraw` returns overdraw statistics. The main metric it uses is overdraw - the ratio between the number of pixel shader invocations and the total number of covered pixels, as measured from several different orthographic cameras. The best case for overdraw is 1.0 - each pixel is shaded once.
`meshopt_analyzeCoverage` returns coverage statistics: the ratio of covered pixels to the viewport extent from each cardinal axis. This is not an efficiency measure per se, but it can be used to measure silhouette change after simplification as well as more precise distance based culling, where the amount of view dependent coverage can be estimated by computing a dot product between the view direction and the coverage vector.
Note that all analyzers use approximate models for the relevant GPU units, so the numbers you will get as the result are only a rough approximation of the actual performance.
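As an example, vertex cache statistics might be gathered as follows (a sketch; 16 is a commonly used model cache size, and passing 0 disables warp and primitive group modeling):

```c++
meshopt_VertexCacheStatistics vcs = meshopt_analyzeVertexCache(indices, index_count, vertex_count, 16, 0, 0);

printf("ACMR: %f, ATVR: %f\n", vcs.acmr, vcs.atvr);
```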
## Deinterleaved geometry
All of the examples above assume that geometry is represented as a single vertex buffer and a single index buffer. This requires storing all vertex attributes - position, normal, texture coordinate, skinning weights etc. - in a single contiguous struct. However, in some cases using multiple vertex streams may be preferable. In particular, if some passes require only positional data - such as depth pre-pass or shadow map - then it may be beneficial to split it from the rest of the vertex attributes to make sure the bandwidth use during these passes is optimal. On some mobile GPUs a position-only attribute stream also improves efficiency of tiling algorithms.
Most of the functions in this library either only need the index buffer (such as vertex cache optimization) or only need positional information (such as overdraw optimization). However, several tasks require knowledge about all vertex attributes.
For indexing, `meshopt_generateVertexRemap` assumes that there's just one vertex stream; when multiple vertex streams are used, it's necessary to use `meshopt_generateVertexRemapMulti` as follows:
```c++
meshopt_Stream streams[] = {
    {&unindexed_pos[0], sizeof(float) * 3, sizeof(float) * 3},
    {&unindexed_nrm[0], sizeof(float) * 3, sizeof(float) * 3},
    {&unindexed_uv[0], sizeof(float) * 2, sizeof(float) * 2},
};
std::vector<unsigned int> remap(index_count);
size_t vertex_count = meshopt_generateVertexRemapMulti(&remap[0], NULL, index_count, index_count, streams, sizeof(streams) / sizeof(streams[0]));
```
After this, `meshopt_remapVertexBuffer` needs to be called once for each vertex stream to produce the correctly reindexed stream. For shadow indexing, `meshopt_generateShadowIndexBufferMulti` is similarly available as a replacement.
Instead of calling `meshopt_optimizeVertexFetch` to reorder vertices in a single vertex buffer, it's recommended to call `meshopt_optimizeVertexFetchRemap` and then `meshopt_remapVertexBuffer` for each stream, as shown below.
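A sketch of this flow with two deinterleaved streams (the stream names are illustrative):

```c++
std::vector<unsigned int> remap(vertex_count);
meshopt_optimizeVertexFetchRemap(&remap[0], indices, index_count, vertex_count);

meshopt_remapIndexBuffer(indices, indices, index_count, &remap[0]);
meshopt_remapVertexBuffer(positions, positions, vertex_count, sizeof(float) * 3, &remap[0]);
meshopt_remapVertexBuffer(normals, normals, vertex_count, sizeof(float) * 3, &remap[0]);
```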
Finally, when compressing vertex data, `meshopt_encodeVertexBuffer` should be used on each vertex stream separately - this allows the encoder to best utilize correlation between attribute values for different vertices.
## Specialized processing
In addition to the core optimization techniques, the library provides several specialized algorithms for specific rendering techniques and pipeline optimizations that require a particular configuration of vertex and index data.
### Triangle strip conversion
On most hardware, indexed triangle lists are the most efficient way to drive the GPU. However, in some cases triangle strips might prove beneficial:
- On some older GPUs, triangle strips may be a bit more efficient to render
- On extremely memory constrained systems, index buffers for triangle strips could save a bit of memory
This library provides an algorithm for converting a vertex cache optimized triangle list to a triangle strip:
```c++
std::vector<unsigned int> strip(meshopt_stripifyBound(index_count));
unsigned int restart_index = ~0u;
size_t strip_size = meshopt_stripify(&strip[0], indices, index_count, vertex_count, restart_index);
```
Typically you should expect triangle strips to use ~50-60% of the indices of an equivalent triangle list (~1.5-1.8 indices per triangle) and to have ~5% worse ACMR.
Note that triangle strips can be stitched with or without restart index support. Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance.
To reduce the triangle strip size further, it's recommended to use `meshopt_optimizeVertexCacheStrip` instead of `meshopt_optimizeVertexCache` when optimizing for vertex cache. This trades off some efficiency in vertex transform for smaller index buffers.
### Geometry shader adjacency
For algorithms that use geometry shaders and require adjacency information, this library can generate an index buffer with adjacency data:
```c++
std::vector<unsigned int> adjacency(indices.size() * 2);
meshopt_generateAdjacencyIndexBuffer(&adjacency[0], &indices[0], indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex));
```
This creates an index buffer suitable for rendering with triangle-with-adjacency topology, providing 3 extra vertices per triangle that represent vertices opposite to each triangle's edge. This data can be used to compute silhouettes and perform other types of local geometric processing in geometry shaders. To render the mesh with adjacency data, the index buffer should be used with `D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST_ADJ`/`VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY`/`GL_TRIANGLES_ADJACENCY` topology.
Note that the use of geometry shaders may have a performance impact on some GPUs; in some cases alternative implementation strategies may be more efficient.
### Tessellation with displacement mapping
For hardware tessellation with crack-free displacement mapping, this library can generate a special index buffer that supports PN-AEN tessellation:
```c++
std::vector<unsigned int> tess(indices.size() * 4);
meshopt_generateTessellationIndexBuffer(&tess[0], &indices[0], indices.size(), &vertices[0].x, vertices.size(), sizeof(Vertex));
```
This generates a 12-vertex patch for each input triangle with the following layout:
- 0, 1, 2: original triangle vertices
- 3, 4: opposing edge for edge 0, 1
- 5, 6: opposing edge for edge 1, 2
- 7, 8: opposing edge for edge 2, 0
- 9, 10, 11: dominant vertices for corners 0, 1, 2
This allows the use of hardware tessellation to implement PN-AEN and/or displacement mapping without cracks along UV seams or normal discontinuities. To render the mesh, the index buffer should be used with `D3D_PRIMITIVE_TOPOLOGY_12_CONTROL_POINT_PATCHLIST`/`VK_PRIMITIVE_TOPOLOGY_PATCH_LIST` (`patchControlPoints=12`) topology. For more details please refer to the following papers: [Crack-Free Point-Normal Triangles using Adjacent Edge Normals](https://developer.download.nvidia.com/whitepapers/2010/PN-AEN-Triangles-Whitepaper.pdf), [Tessellation on Any Budget](https://www.nvidia.com/content/pdf/gdc2011/john_mcdonald.pdf) and [My Tessellation Has Cracks!](https://developer.download.nvidia.com/assets/gamedev/files/gdc12/GDC12_DUDASH_MyTessellationHasCracks.pdf).
### Visibility buffers
To render geometry into visibility buffers, access to primitive index in fragment shader is required. While it is possible to use `SV_PrimitiveID`/`gl_PrimitiveID` in the fragment shader, this can result in suboptimal performance on some GPUs (notably, AMD RDNA1 and all NVidia GPUs), and may not be supported on mobile or console hardware. Using mesh shaders to generate primitive IDs is efficient but requires hardware support that is not universally available. To work around these limitations, this library provides a way to generate a special index buffer that uses provoking vertex to encode primitive IDs:
```c++
std::vector<unsigned int> provoke(indices.size());
std::vector<unsigned int> reorder(vertices.size() + indices.size() / 3);
reorder.resize(meshopt_generateProvokingIndexBuffer(&provoke[0], &reorder[0], &indices[0], indices.size(), vertices.size()));
```
This generates a special index buffer along with a reorder table that satisfies two constraints:
- `provoke[3 * tri] == tri`
- `reorder[provoke[x]]` refers to the original triangle vertices
To render the mesh with provoking vertex data, the application should use `provoke` as an index buffer and a vertex shader that passes vertex index (`SV_VertexID`/`gl_VertexIndex`) via a `flat`/`nointerpolation` attribute to the fragment shader as a primitive index, and loads vertex data manually by computing the real vertex index based on `reorder` table (`reorder[gl_VertexIndex]`). For more details please refer to [Variable Rate Shading with Visibility Buffer Rendering](https://advances.realtimerendering.com/s2024/content/Hable/Advances_SIGGRAPH_2024_VisibilityVRS-SIGGRAPH_Advances_2024.pptx); naturally, this technique does not require VRS.
> Note: This assumes the provoking vertex is the first vertex of a triangle, which is true for all graphics APIs except OpenGL/WebGL. For OpenGL/WebGL, you may need to rotate each triangle (abc -> bca) in the resulting index buffer, or use the `glProvokingVertex` function (OpenGL 3.2+) or `WEBGL_provoking_vertex` extension (WebGL2) to change the provoking vertex convention. For WebGL2, this is highly recommended to avoid a variety of emulation slowdowns that happen by default if `flat` attributes are used, such as an implicit use of geometry shaders.
Because the order of indices in the resulting index buffer must be preserved exactly for the technique to work, all optimizations that reorder indices (such as vertex cache optimization) must be applied before generating the provoking index buffer. Additionally, if index compression is used, `meshopt_encodeIndexSequence` should be used instead of `meshopt_encodeIndexBuffer` to ensure that the triangles are not rotated during encoding.
## Memory management
Many algorithms allocate temporary memory to store intermediate results or accelerate processing. The amount of memory allocated is a function of various input parameters such as vertex count and index count. By default memory is allocated using `operator new` and `operator delete`; if these operators are overloaded by the application, the overloads will be used instead. Alternatively it's possible to specify custom allocation/deallocation functions using `meshopt_setAllocator`, e.g.
```c++
meshopt_setAllocator(malloc, free);
```
> Note that the library expects the allocation function to either throw in case of out-of-memory (in which case the exception will propagate to the caller) or abort, so technically the use of `malloc` above isn't safe. If you want to handle out-of-memory errors without using C++ exceptions, you can use `setjmp`/`longjmp` instead.
Vertex and index decoders (`meshopt_decodeVertexBuffer`, `meshopt_decodeIndexBuffer`, `meshopt_decodeIndexSequence`) do not allocate memory and work completely within the buffer space provided via arguments.
All functions have bounded stack usage that does not exceed 32 KB for any algorithm.
## Experimental APIs
Several algorithms provided by this library are marked as "experimental"; this status is reflected in the comments as well as the annotation `MESHOPTIMIZER_EXPERIMENTAL` for each function.
APIs that are not experimental (annotated with `MESHOPTIMIZER_API`) are considered stable, which means that library updates will not break compatibility: existing calls should compile (API compatibility), existing binaries should link (ABI compatibility), and existing behavior should not change significantly (for example, floating point parameters will have similar behavior). This does not mean that the output of the algorithms will be identical: future versions may improve the algorithms and produce different results.
APIs that *are* experimental may have their interface change, both in ways that will cause existing calls to not compile, and in ways that may compile but have significantly different behavior (e.g., changes in parameter order, meaning, valid ranges). Experimental APIs may also, in rare cases, be removed from future library versions. It is recommended to carefully read release notes when updating the library if experimental APIs are in use. Some experimental APIs may also lack documentation in this README.
Applications may configure the library to change the attributes of experimental APIs, for example defining `MESHOPTIMIZER_EXPERIMENTAL` as `__attribute__((deprecated))` will emit compiler warnings when experimental APIs are used. When building a shared library with CMake, `MESHOPT_STABLE_EXPORTS` option can be set to only export stable APIs; this produces an ABI-stable shared library that can be updated without recompiling the application code.
Currently, the following APIs are experimental:
- `meshopt_buildMeshletsFlex`
- `meshopt_buildMeshletsSpatial`
- `meshopt_decodeFilterColor`
- `meshopt_encodeFilterColor`
- `meshopt_generatePositionRemap`
- `meshopt_simplifySloppy`
- `meshopt_simplifyWithUpdate`
- `meshopt_SimplifyRegularize` flag for `meshopt_simplify*` functions
- `meshopt_SimplifyPermissive` mode for `meshopt_simplify*` functions (and associated `meshopt_SimplifyVertex_*` flags)
## License
This library is available to anybody free of charge, under the terms of [MIT License](LICENSE.md).
To honor the license agreement, please include attribution into the user-facing product documentation and/or credits, for example using this or similar text:
> Uses meshoptimizer. Copyright (c) 2016-2025, Arseny Kapoulkine

View File

@@ -0,0 +1,17 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#ifdef MESHOPTIMIZER_ALLOC_EXPORT
meshopt_Allocator::Storage& meshopt_Allocator::storage()
{
    static Storage s = {::operator new, ::operator delete};
    return s;
}
#endif

void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
{
    meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
    s.allocate = allocate;
    s.deallocate = deallocate;
}

src/external/meshoptimizer/clusterizer.cpp vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,126 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
{
    assert(index_count % 3 == 0);
    assert(cache_size >= 3);
    assert(warp_size == 0 || warp_size >= 3);

    meshopt_Allocator allocator;

    meshopt_VertexCacheStatistics result = {};

    unsigned int warp_offset = 0;
    unsigned int primgroup_offset = 0;

    unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
    memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

    unsigned int timestamp = cache_size + 1;

    for (size_t i = 0; i < index_count; i += 3)
    {
        unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
        assert(a < vertex_count && b < vertex_count && c < vertex_count);

        bool ac = (timestamp - cache_timestamps[a]) > cache_size;
        bool bc = (timestamp - cache_timestamps[b]) > cache_size;
        bool cc = (timestamp - cache_timestamps[c]) > cache_size;

        // flush cache if triangle doesn't fit into warp or into the primitive buffer
        if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
        {
            result.warps_executed += warp_offset > 0;

            warp_offset = 0;
            primgroup_offset = 0;

            // reset cache
            timestamp += cache_size + 1;
        }

        // update cache and add vertices to warp
        for (int j = 0; j < 3; ++j)
        {
            unsigned int index = indices[i + j];

            if (timestamp - cache_timestamps[index] > cache_size)
            {
                cache_timestamps[index] = timestamp++;
                result.vertices_transformed++;
                warp_offset++;
            }
        }

        primgroup_offset++;
    }

    size_t unique_vertex_count = 0;

    for (size_t i = 0; i < vertex_count; ++i)
        unique_vertex_count += cache_timestamps[i] > 0;

    result.warps_executed += warp_offset > 0;

    result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3);
    result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count);

    return result;
}

meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
    assert(index_count % 3 == 0);
    assert(vertex_size > 0 && vertex_size <= 256);

    meshopt_Allocator allocator;

    meshopt_VertexFetchStatistics result = {};

    unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
    memset(vertex_visited, 0, vertex_count);

    const size_t kCacheLine = 64;
    const size_t kCacheSize = 128 * 1024;

    // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
    size_t cache[kCacheSize / kCacheLine] = {};

    for (size_t i = 0; i < index_count; ++i)
    {
        unsigned int index = indices[i];
        assert(index < vertex_count);

        vertex_visited[index] = 1;

        size_t start_address = index * vertex_size;
        size_t end_address = start_address + vertex_size;

        size_t start_tag = start_address / kCacheLine;
        size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;

        assert(start_tag < end_tag);

        for (size_t tag = start_tag; tag < end_tag; ++tag)
        {
            size_t line = tag % (sizeof(cache) / sizeof(cache[0]));

            // we store +1 since cache is filled with 0 by default
            result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
            cache[line] = tag + 1;
        }
    }

    size_t unique_vertex_count = 0;

    for (size_t i = 0; i < vertex_count; ++i)
        unique_vertex_count += vertex_visited[i];

    result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);

    return result;
}

View File

@@ -0,0 +1,688 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// This work is based on:
// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
namespace meshopt
{
const unsigned char kIndexHeader = 0xe0;
const unsigned char kSequenceHeader = 0xd0;
static int gEncodeIndexVersion = 1;
const int kDecodeIndexVersion = 1;
typedef unsigned int VertexFifo[16];
typedef unsigned int EdgeFifo[16][2];
static const unsigned int kTriangleIndexOrder[3][3] = {
{0, 1, 2},
{1, 2, 0},
{2, 0, 1},
};
static const unsigned char kCodeAuxEncodingTable[16] = {
0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
0, 0, // last two entries aren't used for encoding
};
static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
{
(void)a;
return (b == next) ? 1 : (c == next ? 2 : 0);
}
static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
{
for (int i = 0; i < 16; ++i)
{
size_t index = (offset - 1 - i) & 15;
unsigned int e0 = fifo[index][0];
unsigned int e1 = fifo[index][1];
if (e0 == a && e1 == b)
return (i << 2) | 0;
if (e0 == b && e1 == c)
return (i << 2) | 1;
if (e0 == c && e1 == a)
return (i << 2) | 2;
}
return -1;
}
static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
{
fifo[offset][0] = a;
fifo[offset][1] = b;
offset = (offset + 1) & 15;
}
static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
{
for (int i = 0; i < 16; ++i)
{
size_t index = (offset - 1 - i) & 15;
if (fifo[index] == v)
return i;
}
return -1;
}
static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
{
fifo[offset] = v;
offset = (offset + cond) & 15;
}
static void encodeVByte(unsigned char*& data, unsigned int v)
{
// encode 32-bit value in up to 5 7-bit groups
do
{
*data++ = (v & 127) | (v > 127 ? 128 : 0);
v >>= 7;
} while (v);
}
static unsigned int decodeVByte(const unsigned char*& data)
{
unsigned char lead = *data++;
// fast path: single byte
if (lead < 128)
return lead;
// slow path: up to 4 extra bytes
// note that this loop always terminates, which is important for malformed data
unsigned int result = lead & 127;
unsigned int shift = 7;
for (int i = 0; i < 4; ++i)
{
unsigned char group = *data++;
result |= unsigned(group & 127) << shift;
shift += 7;
if (group < 128)
break;
}
return result;
}
static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
{
unsigned int d = index - last;
unsigned int v = (d << 1) ^ (int(d) >> 31);
encodeVByte(data, v);
}
static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
{
unsigned int v = decodeVByte(data);
unsigned int d = (v >> 1) ^ -int(v & 1);
return last + d;
}
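// Worked example of the delta+zigzag scheme above: with last=10 and index=8,
// d = -2 and v = (d << 1) ^ (int(d) >> 31) = 3, which fits in a single varint byte (0x03).
// Decoding reverses it: d = (3 >> 1) ^ -int(3 & 1) = 1 ^ 0xffffffff = -2, so index = 10 - 2 = 8.
// Zigzag interleaves signed deltas 0, -1, 1, -2, 2 as 0, 1, 2, 3, 4 so that small
// deltas of either sign stay small when varint-encoded.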
static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
{
for (int i = 0; i < 16; ++i)
if (table[i] == v)
return i;
return -1;
}
static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
{
if (index_size == 2)
{
static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
}
else
{
static_cast<unsigned int*>(destination)[offset + 0] = a;
static_cast<unsigned int*>(destination)[offset + 1] = b;
static_cast<unsigned int*>(destination)[offset + 2] = c;
}
}
} // namespace meshopt
size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
{
using namespace meshopt;
assert(index_count % 3 == 0);
// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
if (buffer_size < 1 + index_count / 3 + 16)
return 0;
int version = gEncodeIndexVersion;
buffer[0] = (unsigned char)(kIndexHeader | version);
EdgeFifo edgefifo;
memset(edgefifo, -1, sizeof(edgefifo));
VertexFifo vertexfifo;
memset(vertexfifo, -1, sizeof(vertexfifo));
size_t edgefifooffset = 0;
size_t vertexfifooffset = 0;
unsigned int next = 0;
unsigned int last = 0;
unsigned char* code = buffer + 1;
unsigned char* data = code + index_count / 3;
unsigned char* data_safe_end = buffer + buffer_size - 16;
int fecmax = version >= 1 ? 13 : 15;
// use static encoding table; it's possible to pack the result and then build an optimal table and repack
// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
const unsigned char* codeaux_table = kCodeAuxEncodingTable;
for (size_t i = 0; i < index_count; i += 3)
{
// make sure we have enough space to write a triangle
// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
// after this we can be sure we can write without extra bounds checks
if (data > data_safe_end)
return 0;
int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);
if (fer >= 0 && (fer >> 2) < 15)
{
// note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge
const unsigned int* order = kTriangleIndexOrder[fer & 3];
unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
// encode edge index and vertex fifo index, next or free index
int fe = fer >> 2;
int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next ? (next++, 0) : 15);
if (fec == 15 && version >= 1)
{
// encode last-1 and last+1 to optimize strip-like sequences
if (c + 1 == last)
fec = 13, last = c;
if (c == last + 1)
fec = 14, last = c;
}
*code++ = (unsigned char)((fe << 4) | fec);
// note that we need to update the last index since free indices are delta-encoded
if (fec == 15)
encodeIndex(data, c, last), last = c;
// we only need to push third vertex since first two are likely already in the vertex fifo
if (fec == 0 || fec >= fecmax)
pushVertexFifo(vertexfifo, c, vertexfifooffset);
// we only need to push two new edges to edge fifo since the third one is already there
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
else
{
int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
const unsigned int* order = kTriangleIndexOrder[rotation];
unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
// if a/b/c are 0/1/2, we emit a reset code
bool reset = false;
if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
{
reset = true;
next = 0;
// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
// this makes sure next continues to get incremented instead of being stuck
memset(vertexfifo, -1, sizeof(vertexfifo));
}
int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
// note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation
int fea = (a == next) ? (next++, 0) : 15;
int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
unsigned char codeaux = (unsigned char)((feb << 4) | fec);
int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
{
*code++ = (unsigned char)((15 << 4) | codeauxindex);
}
else
{
*code++ = (unsigned char)((15 << 4) | 14 | fea);
*data++ = codeaux;
}
// note that we need to update the last index since free indices are delta-encoded
if (fea == 15)
encodeIndex(data, a, last), last = a;
if (feb == 15)
encodeIndex(data, b, last), last = b;
if (fec == 15)
encodeIndex(data, c, last), last = c;
// only push vertices that weren't already in fifo
if (fea == 0 || fea == 15)
pushVertexFifo(vertexfifo, a, vertexfifooffset);
if (feb == 0 || feb == 15)
pushVertexFifo(vertexfifo, b, vertexfifooffset);
if (fec == 0 || fec == 15)
pushVertexFifo(vertexfifo, c, vertexfifooffset);
// all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
pushEdgeFifo(edgefifo, b, a, edgefifooffset);
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
}
// make sure we have enough space to write codeaux table
if (data > data_safe_end)
return 0;
// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
for (size_t i = 0; i < 16; ++i)
{
// decoder assumes that table entries never refer to separately encoded indices
assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);
*data++ = codeaux_table[i];
}
// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
assert(codeaux_table[0] == 0);
assert(data >= buffer + index_count / 3 + 16);
assert(data <= buffer + buffer_size);
return data - buffer;
}
size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
{
assert(index_count % 3 == 0);
// compute number of bits required for each index
unsigned int vertex_bits = 1;
while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
vertex_bits++;
// worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7;
return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16;
}
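// Usage sketch (illustrative): encoding with a worst-case sized buffer. For example,
// vertex_count=1000 gives vertex_bits=10, so each free index costs at most
// (10 + 1 + 6) / 7 = 2 varint groups and the bound is 1 + (index_count / 3) * (2 + 3 * 2) + 16.
// A zero return from the encoder means the buffer was too small.
#if 0
#include <vector>
std::vector<unsigned char> compressIndices(const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(index_count, vertex_count));
	buffer.resize(meshopt_encodeIndexBuffer(buffer.data(), buffer.size(), indices, index_count));
	return buffer; // empty on failure
}
#endif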
void meshopt_encodeIndexVersion(int version)
{
assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));
meshopt::gEncodeIndexVersion = version;
}
int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
{
if (buffer_size < 1)
return -1;
unsigned char header = buffer[0];
if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
return -1;
int version = header & 0x0f;
if (version > meshopt::kDecodeIndexVersion)
return -1;
return version;
}
int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(index_size == 2 || index_size == 4);
// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
if (buffer_size < 1 + index_count / 3 + 16)
return -2;
if ((buffer[0] & 0xf0) != kIndexHeader)
return -1;
int version = buffer[0] & 0x0f;
if (version > kDecodeIndexVersion)
return -1;
EdgeFifo edgefifo;
memset(edgefifo, -1, sizeof(edgefifo));
VertexFifo vertexfifo;
memset(vertexfifo, -1, sizeof(vertexfifo));
size_t edgefifooffset = 0;
size_t vertexfifooffset = 0;
unsigned int next = 0;
unsigned int last = 0;
int fecmax = version >= 1 ? 13 : 15;
// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
const unsigned char* code = buffer + 1;
const unsigned char* data = code + index_count / 3;
const unsigned char* data_safe_end = buffer + buffer_size - 16;
const unsigned char* codeaux_table = data_safe_end;
for (size_t i = 0; i < index_count; i += 3)
{
// make sure we have enough data to read for a triangle
// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
// after this we can be sure we can read without extra bounds checks
if (data > data_safe_end)
return -2;
unsigned char codetri = *code++;
if (codetri < 0xf0)
{
int fe = codetri >> 4;
// fifo reads are wrapped around 16 entry buffer
unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
unsigned int c = 0;
int fec = codetri & 15;
// note: this is the most common path in the entire decoder
// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
if (fec < fecmax)
{
// fifo reads are wrapped around 16 entry buffer
unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
c = (fec == 0) ? next : cf;
int fec0 = fec == 0;
next += fec0;
// push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
}
else
{
// fec - (fec ^ 3) decodes 13, 14 into -1, 1
// note that we need to update the last index since free indices are delta-encoded
last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset);
}
// push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
// output triangle
writeTriangle(destination, i, index_size, a, b, c);
}
else
{
// fast path: read codeaux from the table
if (codetri < 0xfe)
{
unsigned char codeaux = codeaux_table[codetri & 15];
// note: table can't contain feb/fec=15
int feb = codeaux >> 4;
int fec = codeaux & 15;
// fifo reads are wrapped around 16 entry buffer
// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
unsigned int a = next++;
unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
unsigned int b = (feb == 0) ? next : bf;
int feb0 = feb == 0;
next += feb0;
unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
unsigned int c = (fec == 0) ? next : cf;
int fec0 = fec == 0;
next += fec0;
// output triangle
writeTriangle(destination, i, index_size, a, b, c);
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, a, vertexfifooffset);
pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
pushEdgeFifo(edgefifo, b, a, edgefifooffset);
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
else
{
// slow path: read a full byte for codeaux instead of using a table lookup
unsigned char codeaux = *data++;
int fea = codetri == 0xfe ? 0 : 15;
int feb = codeaux >> 4;
int fec = codeaux & 15;
// reset: codeaux is 0 but encoded as not-a-table
if (codeaux == 0)
next = 0;
// fifo reads are wrapped around 16 entry buffer
// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
unsigned int a = (fea == 0) ? next++ : 0;
unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
// note that we need to update the last index since free indices are delta-encoded
if (fea == 15)
last = a = decodeIndex(data, last);
if (feb == 15)
last = b = decodeIndex(data, last);
if (fec == 15)
last = c = decodeIndex(data, last);
// output triangle
writeTriangle(destination, i, index_size, a, b, c);
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, a, vertexfifooffset);
pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
pushEdgeFifo(edgefifo, b, a, edgefifooffset);
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
}
}
// we should've read all data bytes and stopped at the boundary between data and codeaux table
if (data != data_safe_end)
return -3;
return 0;
}
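// Usage sketch (illustrative): decoding with validation. The decoder returns 0 on
// success, -1 for an unrecognized header or unsupported version, -2 for a truncated
// buffer, and -3 when the stream does not end exactly at the codeaux table, so
// untrusted input can be rejected cleanly.
#if 0
bool decompressIndices(unsigned int* destination, size_t index_count, const unsigned char* data, size_t size)
{
	return meshopt_decodeIndexBuffer(destination, index_count, sizeof(unsigned int), data, size) == 0;
}
#endif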
size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
{
using namespace meshopt;
// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
if (buffer_size < 1 + index_count + 4)
return 0;
int version = gEncodeIndexVersion;
buffer[0] = (unsigned char)(kSequenceHeader | version);
unsigned int last[2] = {};
unsigned int current = 0;
unsigned char* data = buffer + 1;
unsigned char* data_safe_end = buffer + buffer_size - 4;
for (size_t i = 0; i < index_count; ++i)
{
// make sure we have enough data to write
// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
// after this we can be sure we can write without extra bounds checks
if (data >= data_safe_end)
return 0;
unsigned int index = indices[i];
// this is a heuristic that switches between baselines when the delta grows too large
// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
int cd = int(index - last[current]);
current ^= ((cd < 0 ? -cd : cd) >= 30);
// encode delta from the last index
unsigned int d = index - last[current];
unsigned int v = (d << 1) ^ (int(d) >> 31);
// note: low bit encodes the index of the last baseline which will be used for reconstruction
encodeVByte(data, (v << 1) | current);
// update last for the next iteration that uses it
last[current] = index;
}
// make sure we have enough space to write tail
if (data > data_safe_end)
return 0;
for (int k = 0; k < 4; ++k)
*data++ = 0;
return data - buffer;
}
size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
{
// compute number of bits required for each index
unsigned int vertex_bits = 1;
while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
vertex_bits++;
// worst-case encoding is one varint-7 encoded index delta covering a K-bit value plus a sign bit and a baseline-selector bit
unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7;
return 1 + index_count * vertex_groups + 4;
}
int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
using namespace meshopt;
// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
if (buffer_size < 1 + index_count + 4)
return -2;
if ((buffer[0] & 0xf0) != kSequenceHeader)
return -1;
int version = buffer[0] & 0x0f;
if (version > kDecodeIndexVersion)
return -1;
const unsigned char* data = buffer + 1;
const unsigned char* data_safe_end = buffer + buffer_size - 4;
unsigned int last[2] = {};
for (size_t i = 0; i < index_count; ++i)
{
// make sure we have enough data to read
// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
// after this we can be sure we can read without extra bounds checks
if (data >= data_safe_end)
return -2;
unsigned int v = decodeVByte(data);
// decode the index of the last baseline
unsigned int current = v & 1;
v >>= 1;
// reconstruct index as a delta
unsigned int d = (v >> 1) ^ -int(v & 1);
unsigned int index = last[current] + d;
// update last for the next iteration that uses it
last[current] = index;
if (index_size == 2)
{
static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
}
else
{
static_cast<unsigned int*>(destination)[i] = index;
}
}
// we should've read all data bytes and stopped at the boundary between data and tail
if (data != data_safe_end)
return -3;
return 0;
}
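// Usage sketch (illustrative): round-tripping an index sequence. Unlike the triangle
// codec above, the sequence codec makes no connectivity assumptions, so it suits
// line/point topology or any stream where index_count is not a multiple of 3.
#if 0
size_t roundTripSequence(unsigned int* indices, size_t index_count, size_t vertex_count,
    unsigned char* scratch, size_t scratch_size)
{
	if (scratch_size < meshopt_encodeIndexSequenceBound(index_count, vertex_count))
		return 0;
	size_t written = meshopt_encodeIndexSequence(scratch, scratch_size, indices, index_count);
	if (written == 0 || meshopt_decodeIndexSequence(indices, index_count, sizeof(unsigned int), scratch, written) != 0)
		return 0;
	return written;
}
#endif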

src/external/meshoptimizer/indexgenerator.cpp vendored Normal file
View File

@@ -0,0 +1,704 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// This work is based on:
// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
namespace meshopt
{
static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
{
// MurmurHash2
const unsigned int m = 0x5bd1e995;
const int r = 24;
while (len >= 4)
{
unsigned int k = *reinterpret_cast<const unsigned int*>(key);
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
key += 4;
len -= 4;
}
return h;
}
struct VertexHasher
{
const unsigned char* vertices;
size_t vertex_size;
size_t vertex_stride;
size_t hash(unsigned int index) const
{
return hashUpdate4(0, vertices + index * vertex_stride, vertex_size);
}
bool equal(unsigned int lhs, unsigned int rhs) const
{
return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0;
}
};
struct VertexStreamHasher
{
const meshopt_Stream* streams;
size_t stream_count;
size_t hash(unsigned int index) const
{
unsigned int h = 0;
for (size_t i = 0; i < stream_count; ++i)
{
const meshopt_Stream& s = streams[i];
const unsigned char* data = static_cast<const unsigned char*>(s.data);
h = hashUpdate4(h, data + index * s.stride, s.size);
}
return h;
}
bool equal(unsigned int lhs, unsigned int rhs) const
{
for (size_t i = 0; i < stream_count; ++i)
{
const meshopt_Stream& s = streams[i];
const unsigned char* data = static_cast<const unsigned char*>(s.data);
if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
return false;
}
return true;
}
};
struct VertexCustomHasher
{
const float* vertex_positions;
size_t vertex_stride_float;
int (*callback)(void*, unsigned int, unsigned int);
void* context;
size_t hash(unsigned int index) const
{
const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float);
unsigned int x = key[0], y = key[1], z = key[2];
// replace negative zero with zero
x = (x == 0x80000000) ? 0 : x;
y = (y == 0x80000000) ? 0 : y;
z = (z == 0x80000000) ? 0 : z;
// scramble bits to make sure that integer coordinates have entropy in lower bits
x ^= x >> 17;
y ^= y >> 17;
z ^= z >> 17;
// Optimized Spatial Hashing for Collision Detection of Deformable Objects
return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
}
bool equal(unsigned int lhs, unsigned int rhs) const
{
const float* lp = vertex_positions + lhs * vertex_stride_float;
const float* rp = vertex_positions + rhs * vertex_stride_float;
if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2])
return false;
return callback ? callback(context, lhs, rhs) : true;
}
};
struct EdgeHasher
{
const unsigned int* remap;
size_t hash(unsigned long long edge) const
{
unsigned int e0 = unsigned(edge >> 32);
unsigned int e1 = unsigned(edge);
unsigned int h1 = remap[e0];
unsigned int h2 = remap[e1];
const unsigned int m = 0x5bd1e995;
// MurmurHash64B finalizer
h1 ^= h2 >> 18;
h1 *= m;
h2 ^= h1 >> 22;
h2 *= m;
h1 ^= h2 >> 17;
h1 *= m;
h2 ^= h1 >> 19;
h2 *= m;
return h2;
}
bool equal(unsigned long long lhs, unsigned long long rhs) const
{
unsigned int l0 = unsigned(lhs >> 32);
unsigned int l1 = unsigned(lhs);
unsigned int r0 = unsigned(rhs >> 32);
unsigned int r1 = unsigned(rhs);
return remap[l0] == remap[r0] && remap[l1] == remap[r1];
}
};
static size_t hashBuckets(size_t count)
{
size_t buckets = 1;
while (buckets < count + count / 4)
buckets *= 2;
return buckets;
}
template <typename T, typename Hash>
static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
{
assert(buckets > 0);
assert((buckets & (buckets - 1)) == 0);
size_t hashmod = buckets - 1;
size_t bucket = hash.hash(key) & hashmod;
for (size_t probe = 0; probe <= hashmod; ++probe)
{
T& item = table[bucket];
if (item == empty)
return &item;
if (hash.equal(item, key))
return &item;
// hash collision, quadratic probing
bucket = (bucket + probe + 1) & hashmod;
}
assert(false && "Hash table is full"); // unreachable
return NULL;
}
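// Note on the probe sequence above: in a power-of-two table, bucket advances by
// probe + 1 each iteration, visiting offsets 0, 1, 3, 6, ... (triangular numbers).
// Triangular numbers modulo 2^n enumerate every slot exactly once, so the loop always
// finds either the key or an empty slot as long as the table has free entries -
// which hashBuckets guarantees by sizing the table at least 25% larger than the
// element count.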
static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
{
VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride};
size_t vertex_table_size = hashBuckets(vertex_count);
unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size);
memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int));
for (size_t i = 0; i < vertex_count; ++i)
{
unsigned int index = unsigned(i);
unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u);
if (*entry == ~0u)
*entry = index;
remap[index] = *entry;
}
allocator.deallocate(vertex_table);
}
template <typename Hash>
static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
{
memset(remap, -1, vertex_count * sizeof(unsigned int));
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
unsigned int next_vertex = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices ? indices[i] : unsigned(i);
assert(index < vertex_count);
if (remap[index] != ~0u)
continue;
unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
if (*entry == ~0u)
{
*entry = index;
remap[index] = next_vertex++;
}
else
{
assert(remap[*entry] != ~0u);
remap[index] = remap[*entry];
}
}
assert(next_vertex <= vertex_count);
return next_vertex;
}
template <size_t BlockSize>
static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
size_t block_size = BlockSize == 0 ? vertex_size : BlockSize;
assert(block_size == vertex_size);
for (size_t i = 0; i < vertex_count; ++i)
if (remap[i] != ~0u)
{
assert(remap[i] < vertex_count);
memcpy(static_cast<unsigned char*>(destination) + remap[i] * block_size, static_cast<const unsigned char*>(vertices) + i * block_size, block_size);
}
}
template <typename Hash>
static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
{
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
memset(remap, -1, vertex_count * sizeof(unsigned int));
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
if (remap[index] == ~0u)
{
unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
if (*entry == ~0u)
*entry = index;
remap[index] = *entry;
}
destination[i] = remap[index];
}
}
} // namespace meshopt
size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
{
using namespace meshopt;
assert(indices || index_count == vertex_count);
assert(!indices || index_count % 3 == 0);
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
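// Usage sketch (illustrative): the canonical reindexing flow - build a remap table
// from raw, possibly duplicated vertex data, then compact the vertex and index
// buffers through it. The Vertex layout is hypothetical; any tightly packed struct works.
#if 0
#include <vector>
struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; };
void reindexMesh(std::vector<Vertex>& vertices, std::vector<unsigned int>& indices)
{
	std::vector<unsigned int> remap(vertices.size());
	size_t unique = meshopt_generateVertexRemap(remap.data(), indices.data(), indices.size(),
	    vertices.data(), vertices.size(), sizeof(Vertex));
	// both remaps below are done in-place, which the functions support
	meshopt_remapVertexBuffer(vertices.data(), vertices.data(), vertices.size(), sizeof(Vertex), remap.data());
	vertices.resize(unique);
	meshopt_remapIndexBuffer(indices.data(), indices.data(), indices.size(), remap.data());
}
#endif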
size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
{
using namespace meshopt;
assert(indices || index_count == vertex_count);
assert(index_count % 3 == 0);
assert(stream_count > 0 && stream_count <= 16);
for (size_t i = 0; i < stream_count; ++i)
{
assert(streams[i].size > 0 && streams[i].size <= 256);
assert(streams[i].size <= streams[i].stride);
}
meshopt_Allocator allocator;
VertexStreamHasher hasher = {streams, stream_count};
return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context)
{
using namespace meshopt;
assert(indices || index_count == vertex_count);
assert(!indices || index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context};
return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
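// Usage sketch (illustrative; the attribute layout is hypothetical): the callback
// extends bitwise position equality with a caller-defined test and is only invoked
// for vertex pairs whose positions already compare equal. Returning nonzero merges
// the two vertices.
#if 0
struct NormalContext { const float* normals; }; // 3 floats per vertex
static int normalsNearlyEqual(void* context, unsigned int a, unsigned int b)
{
	const float* n = static_cast<NormalContext*>(context)->normals;
	float dot = n[a * 3 + 0] * n[b * 3 + 0] + n[a * 3 + 1] * n[b * 3 + 1] + n[a * 3 + 2] * n[b * 3 + 2];
	return dot > 0.99f; // merge only near-parallel normals
}
#endif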
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
using namespace meshopt;
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
// support in-place remap
if (destination == vertices)
{
unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
memcpy(vertices_copy, vertices, vertex_count * vertex_size);
vertices = vertices_copy;
}
// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
switch (vertex_size)
{
case 4:
return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);
case 8:
return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
case 12:
return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
case 16:
return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
default:
return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
}
}
void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
{
assert(index_count % 3 == 0);
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices ? indices[i] : unsigned(i);
assert(remap[index] != ~0u);
destination[i] = remap[index];
}
}
void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
{
using namespace meshopt;
assert(indices);
assert(index_count % 3 == 0);
assert(vertex_size > 0 && vertex_size <= 256);
assert(vertex_size <= vertex_stride);
meshopt_Allocator allocator;
VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
{
using namespace meshopt;
assert(indices);
assert(index_count % 3 == 0);
assert(stream_count > 0 && stream_count <= 16);
for (size_t i = 0; i < stream_count; ++i)
{
assert(streams[i].size > 0 && streams[i].size <= 256);
assert(streams[i].size <= streams[i].stride);
}
meshopt_Allocator allocator;
VertexStreamHasher hasher = {streams, stream_count};
generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL};
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
for (size_t i = 0; i < vertex_count; ++i)
{
unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u);
if (*entry == ~0u)
*entry = unsigned(i);
destination[i] = *entry;
}
}
void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
static const int next[4] = {1, 2, 0, 1};
// build position remap: for each vertex, which other (canonical) vertex does it map to?
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
// build edge set; this stores all triangle edges but we can look these up by any other wedge
EdgeHasher edge_hasher = {remap};
size_t edge_table_size = hashBuckets(index_count);
unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size);
memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int));
for (size_t i = 0; i < index_count; i += 3)
{
for (int e = 0; e < 3; ++e)
{
unsigned int i0 = indices[i + e];
unsigned int i1 = indices[i + next[e]];
unsigned int i2 = indices[i + next[e + 1]];
assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count);
unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
if (*entry == ~0ull)
{
*entry = edge;
// store vertex opposite to the edge
edge_vertex_table[entry - edge_table] = i2;
}
}
}
// build resulting index buffer: 6 indices for each input triangle
for (size_t i = 0; i < index_count; i += 3)
{
unsigned int patch[6];
for (int e = 0; e < 3; ++e)
{
unsigned int i0 = indices[i + e];
unsigned int i1 = indices[i + next[e]];
assert(i0 < vertex_count && i1 < vertex_count);
// note: this refers to the opposite edge!
unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
patch[e * 2 + 0] = i0;
patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table];
}
memcpy(destination + i * 2, patch, sizeof(patch));
}
}
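// Usage sketch (illustrative): the output holds index_count * 2 indices, six per
// input triangle, in the even/odd layout expected by GL_TRIANGLES_ADJACENCY-style
// geometry shader inputs (even = triangle corners, odd = opposite vertices of the
// adjacent triangles, or the corner itself on borders).
#if 0
#include <vector>
std::vector<unsigned int> buildAdjacency(const std::vector<unsigned int>& indices,
    const float* positions, size_t vertex_count)
{
	std::vector<unsigned int> adjacency(indices.size() * 2);
	meshopt_generateAdjacencyIndexBuffer(adjacency.data(), indices.data(), indices.size(),
	    positions, vertex_count, sizeof(float) * 3);
	return adjacency;
}
#endif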
void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
static const int next[3] = {1, 2, 0};
// build position remap: for each vertex, which other (canonical) vertex does it map to?
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
// build edge set; this stores all triangle edges but we can look these up by any other wedge
EdgeHasher edge_hasher = {remap};
size_t edge_table_size = hashBuckets(index_count);
unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
for (size_t i = 0; i < index_count; i += 3)
{
for (int e = 0; e < 3; ++e)
{
unsigned int i0 = indices[i + e];
unsigned int i1 = indices[i + next[e]];
assert(i0 < vertex_count && i1 < vertex_count);
unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
if (*entry == ~0ull)
*entry = edge;
}
}
// build resulting index buffer: 12 indices for each input triangle
for (size_t i = 0; i < index_count; i += 3)
{
unsigned int patch[12];
for (int e = 0; e < 3; ++e)
{
unsigned int i0 = indices[i + e];
unsigned int i1 = indices[i + next[e]];
assert(i0 < vertex_count && i1 < vertex_count);
// note: this refers to the opposite edge!
unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
// use the same edge if opposite edge doesn't exist (border)
oppe = (oppe == ~0ull) ? edge : oppe;
// triangle index (0, 1, 2)
patch[e] = i0;
// opposite edge (3, 4; 5, 6; 7, 8)
patch[3 + e * 2 + 0] = unsigned(oppe);
patch[3 + e * 2 + 1] = unsigned(oppe >> 32);
// dominant vertex (9, 10, 11)
patch[9 + e] = remap[i0];
}
memcpy(destination + i * 4, patch, sizeof(patch));
}
}
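// Usage sketch (illustrative): each input triangle becomes a 12-index patch
// (3 corners, 3 opposite-edge vertex pairs, 3 dominant-position vertices) suitable
// for a tessellation control shader implementing crack-free displacement in the
// style of the PN-AEN technique referenced above.
#if 0
#include <vector>
std::vector<unsigned int> buildTessellationPatches(const std::vector<unsigned int>& indices,
    const float* positions, size_t vertex_count)
{
	std::vector<unsigned int> patches(indices.size() * 4); // 12 indices per input triangle
	meshopt_generateTessellationIndexBuffer(patches.data(), indices.data(), indices.size(),
	    positions, vertex_count, sizeof(float) * 3);
	return patches;
}
#endif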
size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
assert(index_count % 3 == 0);
meshopt_Allocator allocator;
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
memset(remap, -1, vertex_count * sizeof(unsigned int));
// compute vertex valence; this is used to prioritize least used corner
// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
memset(valence, 0, vertex_count);
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
valence[index]++;
}
unsigned int reorder_offset = 0;
// assign provoking vertices; leave the rest for the next pass
for (size_t i = 0; i < index_count; i += 3)
{
unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
assert(a < vertex_count && b < vertex_count && c < vertex_count);
// try to rotate triangle such that provoking vertex hasn't been seen before
// if multiple vertices are new, prioritize the one with least valence
// this reduces the risk that a future triangle will have all three vertices seen
unsigned int va = remap[a] == ~0u ? valence[a] : ~0u;
unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
if (vb != ~0u && vb <= va && vb <= vc)
{
// abc -> bca
unsigned int t = a;
a = b, b = c, c = t;
}
else if (vc != ~0u && vc <= va && vc <= vb)
{
// abc -> cab
unsigned int t = c;
c = b, b = a, a = t;
}
unsigned int newidx = reorder_offset;
// now remap[a] = ~0u or all three vertices are old
// recording remap[a] makes it possible to remap future references to the same index, conserving space
if (remap[a] == ~0u)
remap[a] = newidx;
// we need to clone the provoking vertex to get a unique index
// if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
reorder[reorder_offset++] = a;
// note: first vertex is final, the other two will be fixed up in next pass
destination[i + 0] = newidx;
destination[i + 1] = b;
destination[i + 2] = c;
// update vertex valences for corner heuristic
valence[a]--;
valence[b]--;
valence[c]--;
}
// remap or clone non-provoking vertices (iterating to skip provoking vertices)
int step = 1;
for (size_t i = 1; i < index_count; i += step, step ^= 3)
{
unsigned int index = destination[i];
if (remap[index] == ~0u)
{
// we haven't seen the vertex before as a provoking vertex
// to maintain the reference to the original vertex we need to clone it
unsigned int newidx = reorder_offset;
remap[index] = newidx;
reorder[reorder_offset++] = index;
}
destination[i] = remap[index];
}
assert(reorder_offset <= vertex_count + index_count / 3);
return reorder_offset;
}
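// Usage sketch (illustrative): destination receives indices where the first vertex
// of every triangle is a unique provoking vertex, and reorder maps each new vertex
// slot back to the original vertex it clones; the vertex buffer must be re-gathered
// through it. Worst-case output size is vertex_count + index_count / 3, per the
// assert above.
#if 0
#include <vector>
struct Vertex { float px, py, pz; }; // hypothetical layout
void makeProvoking(std::vector<unsigned int>& indices, std::vector<Vertex>& vertices)
{
	std::vector<unsigned int> newindices(indices.size());
	std::vector<unsigned int> reorder(vertices.size() + indices.size() / 3);
	size_t newcount = meshopt_generateProvokingIndexBuffer(newindices.data(), reorder.data(),
	    indices.data(), indices.size(), vertices.size());
	std::vector<Vertex> newvertices(newcount);
	for (size_t i = 0; i < newcount; ++i)
		newvertices[i] = vertices[reorder[i]];
	indices.swap(newindices);
	vertices.swap(newvertices);
}
#endif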

src/external/meshoptimizer/meshoptimizer.h vendored Normal file (1436 lines)

File diff suppressed because it is too large

src/external/meshoptimizer/overdrawoptimizer.cpp vendored Normal file
View File

@@ -0,0 +1,333 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <math.h>
#include <string.h>
// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float mesh_centroid[3] = {};
for (size_t i = 0; i < vertex_count; ++i)
{
const float* p = vertex_positions + vertex_stride_float * i;
mesh_centroid[0] += p[0];
mesh_centroid[1] += p[1];
mesh_centroid[2] += p[2];
}
mesh_centroid[0] /= float(vertex_count);
mesh_centroid[1] /= float(vertex_count);
mesh_centroid[2] /= float(vertex_count);
for (size_t cluster = 0; cluster < cluster_count; ++cluster)
{
size_t cluster_begin = clusters[cluster] * 3;
size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
assert(cluster_begin < cluster_end);
float cluster_area = 0;
float cluster_centroid[3] = {};
float cluster_normal[3] = {};
for (size_t i = cluster_begin; i < cluster_end; i += 3)
{
const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];
float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
float normalx = p10[1] * p20[2] - p10[2] * p20[1];
float normaly = p10[2] * p20[0] - p10[0] * p20[2];
float normalz = p10[0] * p20[1] - p10[1] * p20[0];
float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
cluster_normal[0] += normalx;
cluster_normal[1] += normaly;
cluster_normal[2] += normalz;
cluster_area += area;
}
float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;
cluster_centroid[0] *= inv_cluster_area;
cluster_centroid[1] *= inv_cluster_area;
cluster_centroid[2] *= inv_cluster_area;
float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;
cluster_normal[0] *= inv_cluster_normal_length;
cluster_normal[1] *= inv_cluster_normal_length;
cluster_normal[2] *= inv_cluster_normal_length;
float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};
sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
}
}
static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
// compute sort data bounds and renormalize, using fixed point snorm
float sort_data_max = 1e-3f;
for (size_t i = 0; i < cluster_count; ++i)
{
float dpa = fabsf(sort_data[i]);
sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
}
const int sort_bits = 11;
for (size_t i = 0; i < cluster_count; ++i)
{
// note that we flip distribution since high dot product should come first
float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);
sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
}
// fill histogram for counting sort
unsigned int histogram[1 << sort_bits];
memset(histogram, 0, sizeof(histogram));
for (size_t i = 0; i < cluster_count; ++i)
{
histogram[sort_keys[i]]++;
}
// compute offsets based on histogram data
size_t histogram_sum = 0;
for (size_t i = 0; i < 1 << sort_bits; ++i)
{
size_t count = histogram[i];
histogram[i] = unsigned(histogram_sum);
histogram_sum += count;
}
assert(histogram_sum == cluster_count);
// compute sort order based on offsets
for (size_t i = 0; i < cluster_count; ++i)
{
sort_order[histogram[sort_keys[i]]++] = unsigned(i);
}
}
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
unsigned int cache_misses = 0;
// if vertex is not in cache, put it in cache
if (timestamp - cache_timestamps[a] > cache_size)
{
cache_timestamps[a] = timestamp++;
cache_misses++;
}
if (timestamp - cache_timestamps[b] > cache_size)
{
cache_timestamps[b] = timestamp++;
cache_misses++;
}
if (timestamp - cache_timestamps[c] > cache_size)
{
cache_timestamps[c] = timestamp++;
cache_misses++;
}
return cache_misses;
}
static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
unsigned int timestamp = cache_size + 1;
size_t face_count = index_count / 3;
size_t result = 0;
for (size_t i = 0; i < face_count; ++i)
{
unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
// suggests an inefficiency in the vertex cache optimization algorithm
// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
if (i == 0 || m == 3)
{
destination[result++] = unsigned(i);
}
}
assert(result <= index_count / 3);
return result;
}
static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
unsigned int timestamp = 0;
size_t result = 0;
for (size_t it = 0; it < cluster_count; ++it)
{
size_t start = clusters[it];
size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
assert(start < end);
// reset cache
timestamp += cache_size + 1;
// measure cluster ACMR
unsigned int cluster_misses = 0;
for (size_t i = start; i < end; ++i)
{
unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
cluster_misses += m;
}
float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));
// first cluster always starts from the hard cluster boundary
destination[result++] = unsigned(start);
// reset cache
timestamp += cache_size + 1;
unsigned int running_misses = 0;
unsigned int running_faces = 0;
for (size_t i = start; i < end; ++i)
{
unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
running_misses += m;
running_faces += 1;
if (float(running_misses) / float(running_faces) <= cluster_threshold)
{
// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
// cluster is empty; however, the boundary removal ('pop_back') after the loop will clean it up
destination[result++] = unsigned(i + 1);
// reset cache
timestamp += cache_size + 1;
running_misses = 0;
running_faces = 0;
}
}
// each time we reach the target ACMR we flush the cluster
// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
// to the cluster boundary array which we need to remove anyway - this code will do that automatically
if (destination[result - 1] != start)
{
result--;
}
}
assert(result >= cluster_count);
assert(result <= index_count / 3);
return result;
}
} // namespace meshopt
void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
unsigned int cache_size = 16;
unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
// generate hard boundaries from full-triangle cache misses
unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
// generate soft boundaries
unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
const unsigned int* clusters = soft_clusters;
size_t cluster_count = soft_cluster_count;
// fill sort data
float* sort_data = allocator.allocate<float>(cluster_count);
calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
// sort clusters using sort data
unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
// fill output buffer
size_t offset = 0;
for (size_t it = 0; it < cluster_count; ++it)
{
unsigned int cluster = sort_order[it];
assert(cluster < cluster_count);
size_t cluster_begin = clusters[cluster] * 3;
size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
assert(cluster_begin < cluster_end);
memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
offset += cluster_end - cluster_begin;
}
assert(offset == index_count);
}
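// Usage sketch (illustrative): overdraw optimization is normally run after vertex
// cache optimization; a threshold slightly above 1 (1.05 here) lets the cluster
// splitting above give up roughly 5% of cache efficiency in exchange for a better
// front-to-back cluster order.
#if 0
void optimizeMesh(unsigned int* indices, size_t index_count, const float* positions, size_t vertex_count)
{
	meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
	meshopt_optimizeOverdraw(indices, indices, index_count, positions, vertex_count,
	    sizeof(float) * 3, 1.05f);
}
#endif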

src/external/meshoptimizer/partition.cpp vendored Normal file (499 lines)
View File

@@ -0,0 +1,499 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <math.h>
#include <string.h>
// This work is based on:
// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
namespace meshopt
{
struct ClusterAdjacency
{
unsigned int* offsets;
unsigned int* clusters;
unsigned int* shared;
};
static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count)
{
(void)vertex_count;
(void)total_index_count;
size_t cluster_start = 0;
size_t cluster_write = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
offsets[i] = unsigned(cluster_write);
// copy cluster indices, skipping duplicates
for (size_t j = 0; j < cluster_index_counts[i]; ++j)
{
unsigned int v = cluster_indices[cluster_start + j];
assert(v < vertex_count);
data[cluster_write] = v;
cluster_write += 1 - used[v];
used[v] = 1;
}
// reset used flags for the next cluster
for (size_t j = offsets[i]; j < cluster_write; ++j)
used[data[j]] = 0;
cluster_start += cluster_index_counts[i];
}
assert(cluster_start == total_index_count);
assert(cluster_write <= total_index_count);
offsets[cluster_count] = unsigned(cluster_write);
}
static void computeClusterBounds(float* cluster_bounds, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, const float* vertex_positions, size_t vertex_positions_stride)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
for (size_t i = 0; i < cluster_count; ++i)
{
float center[3] = {0, 0, 0};
// approximate center of the cluster by averaging all vertex positions
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
{
const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float;
center[0] += p[0];
center[1] += p[1];
center[2] += p[2];
}
// note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
if (size_t cluster_size = cluster_offsets[i + 1] - cluster_offsets[i])
{
center[0] /= float(cluster_size);
center[1] /= float(cluster_size);
center[2] /= float(cluster_size);
}
// compute radius of the bounding sphere for each cluster
float radiussq = 0;
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
{
const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float;
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
radiussq = radiussq < d2 ? d2 : radiussq;
}
cluster_bounds[i * 4 + 0] = center[0];
cluster_bounds[i * 4 + 1] = center[1];
cluster_bounds[i * 4 + 2] = center[2];
cluster_bounds[i * 4 + 3] = sqrtf(radiussq);
}
}
static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
{
unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1);
// compute number of clusters referenced by each vertex
memset(ref_offsets, 0, vertex_count * sizeof(unsigned int));
for (size_t i = 0; i < cluster_count; ++i)
{
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
ref_offsets[cluster_indices[j]]++;
}
// compute (worst-case) number of adjacent clusters for each cluster
size_t total_adjacency = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
size_t count = 0;
// worst case is every vertex has a disjoint cluster list
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
count += ref_offsets[cluster_indices[j]] - 1;
// ... but only every other cluster can be adjacent in the end
total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1;
}
// we can now allocate adjacency buffers
adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1);
adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency);
adjacency.shared = allocator.allocate<unsigned int>(total_adjacency);
// convert ref counts to offsets
size_t total_refs = 0;
for (size_t i = 0; i < vertex_count; ++i)
{
size_t count = ref_offsets[i];
ref_offsets[i] = unsigned(total_refs);
total_refs += count;
}
unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs);
// fill cluster refs for each vertex
for (size_t i = 0; i < cluster_count; ++i)
{
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i);
}
// after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start
memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int));
ref_offsets[0] = 0;
// fill cluster adjacency for each cluster...
adjacency.offsets[0] = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
unsigned int* adj = adjacency.clusters + adjacency.offsets[i];
unsigned int* shd = adjacency.shared + adjacency.offsets[i];
size_t count = 0;
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
{
unsigned int v = cluster_indices[j];
// merge the entire cluster list of each vertex into current list
for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k)
{
unsigned int c = ref_data[k];
assert(c < cluster_count);
if (c == unsigned(i))
continue;
// if the cluster is already in the list, increment the shared count
bool found = false;
for (size_t l = 0; l < count; ++l)
if (adj[l] == c)
{
found = true;
shd[l]++;
break;
}
// ... or append a new cluster
if (!found)
{
adj[count] = c;
shd[count] = 1;
count++;
}
}
}
// mark the end of the adjacency list; the next cluster will start there as well
adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count);
}
assert(adjacency.offsets[cluster_count] <= total_adjacency);
// ref_offsets can't be deallocated as it was allocated before adjacency
allocator.deallocate(ref_data);
}
struct ClusterGroup
{
int group;
int next;
unsigned int size; // 0 unless root
unsigned int vertices;
};
struct GroupOrder
{
unsigned int id;
int order;
};
static void heapPush(GroupOrder* heap, size_t size, GroupOrder item)
{
// insert a new element at the end (breaks heap invariant)
heap[size++] = item;
// bubble up the new element to its correct position
size_t i = size - 1;
while (i > 0 && heap[i].order < heap[(i - 1) / 2].order)
{
size_t p = (i - 1) / 2;
GroupOrder temp = heap[i];
heap[i] = heap[p];
heap[p] = temp;
i = p;
}
}
static GroupOrder heapPop(GroupOrder* heap, size_t size)
{
assert(size > 0);
GroupOrder top = heap[0];
// move the last element to the top (breaks heap invariant)
heap[0] = heap[--size];
// bubble down the new top element to its correct position
size_t i = 0;
while (i * 2 + 1 < size)
{
// find the smallest child
size_t j = i * 2 + 1;
j += (j + 1 < size && heap[j + 1].order < heap[j].order);
// if the parent is already smaller than both children, we're done
if (heap[j].order >= heap[i].order)
break;
// otherwise, swap the parent and child and continue
GroupOrder temp = heap[i];
heap[i] = heap[j];
heap[j] = temp;
i = j;
}
return top;
}
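// counts vertices shared between two groups by walking each group's cluster
// list and summing the precomputed per-pair shared counts from the adjacency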
static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency)
{
unsigned int total = 0;
for (int i1 = group1; i1 >= 0; i1 = groups[i1].next)
for (int i2 = group2; i2 >= 0; i2 = groups[i2].next)
{
for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj)
if (adjacency.clusters[adj] == unsigned(i2))
{
total += adjacency.shared[adj];
break;
}
}
return total;
}
// merges the source bounding sphere into the target sphere, in place
static void mergeBounds(float* target, const float* source)
{
float r1 = target[3], r2 = source[3];
float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2];
float d = sqrtf(dx * dx + dy * dy + dz * dz);
// target is fully contained in source: adopt the source bounds
if (d + r1 < r2)
{
memcpy(target, source, 4 * sizeof(float));
return;
}
// source is not fully contained in target: grow target to the minimal enclosing sphere;
// its diameter spans the far sides of both spheres, hence radius (d + r1 + r2) / 2,
// with the center shifted along the axis between the two centers
if (d + r2 > r1)
{
float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f;
target[0] += dx * k;
target[1] += dy * k;
target[2] += dz * k;
target[3] = (d + r2 + r1) / 2;
}
}
// ratio of the target radius to the radius of the sphere that would enclose both;
// 1 means merging costs no growth, lower values mean more growth
static float boundsScore(const float* target, const float* source)
{
float r1 = target[3], r2 = source[3];
float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2];
float d = sqrtf(dx * dx + dy * dy + dz * dz);
float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
return mr > 0 ? r1 / mr : 0.f;
}
static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, const float* cluster_bounds)
{
assert(groups[id].size > 0);
float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices)));
int best_group = -1;
float best_score = 0;
for (int ci = id; ci >= 0; ci = groups[ci].next)
{
for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj)
{
int other = groups[adjacency.clusters[adj]].group;
if (other < 0)
continue;
assert(groups[other].size > 0);
if (groups[id].size + groups[other].size > max_partition_size)
continue;
unsigned int shared = countShared(groups, id, other, adjacency);
float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices)));
// normalize shared count by the expected boundary of each group (+ keeps scoring symmetric)
float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
// incorporate spatial score to favor merging nearby groups
if (cluster_bounds)
score *= 1.f + 0.4f * boundsScore(&cluster_bounds[id * 4], &cluster_bounds[other * 4]);
if (score > best_score)
{
best_group = other;
best_score = score;
}
}
}
return best_group;
}
} // namespace meshopt
size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
{
using namespace meshopt;
assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_partition_size > 0);
size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8;
meshopt_Allocator allocator;
unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
memset(used, 0, vertex_count);
unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count);
unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1);
// make new cluster index list that filters out duplicate indices
filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
cluster_indices = cluster_newindices;
// compute bounding sphere for each cluster if positions are provided
float* cluster_bounds = NULL;
if (vertex_positions)
{
cluster_bounds = allocator.allocate<float>(cluster_count * 4);
computeClusterBounds(cluster_bounds, cluster_indices, cluster_offsets, cluster_count, vertex_positions, vertex_positions_stride);
}
// build cluster adjacency along with edge weights (shared vertex count)
ClusterAdjacency adjacency = {};
buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
size_t pending = 0;
// create a singleton group for each cluster and order them by priority
for (size_t i = 0; i < cluster_count; ++i)
{
groups[i].group = int(i);
groups[i].next = -1;
groups[i].size = 1;
groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
assert(groups[i].vertices > 0);
GroupOrder item = {};
item.id = unsigned(i);
item.order = groups[i].vertices;
heapPush(order, pending++, item);
}
// iteratively merge the smallest group with the best group
while (pending)
{
GroupOrder top = heapPop(order, pending--);
// this group was merged into another group earlier
if (groups[top.id].size == 0)
continue;
// disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted
for (int i = top.id; i >= 0; i = groups[i].next)
{
assert(groups[i].group == int(top.id));
groups[i].group = -1;
}
// the group is large enough, emit as is
if (groups[top.id].size >= target_partition_size)
continue;
int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, cluster_bounds);
// we can't grow the group any more, emit as is
if (best_group == -1)
continue;
// compute shared vertices to adjust the total vertices estimate after merging
unsigned int shared = countShared(groups, top.id, best_group, adjacency);
// combine groups by linking them together
assert(groups[best_group].size > 0);
for (int i = top.id; i >= 0; i = groups[i].next)
if (groups[i].next < 0)
{
groups[i].next = best_group;
break;
}
// update group sizes; note, the vertex update is an O(1) approximation which avoids recomputing the true size
groups[top.id].size += groups[best_group].size;
groups[top.id].vertices += groups[best_group].vertices;
groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
groups[best_group].size = 0;
groups[best_group].vertices = 0;
// merge bounding spheres if bounds are available
if (cluster_bounds)
{
mergeBounds(&cluster_bounds[top.id * 4], &cluster_bounds[best_group * 4]);
memset(&cluster_bounds[best_group * 4], 0, 4 * sizeof(float));
}
// re-associate all clusters back to the merged group
for (int i = top.id; i >= 0; i = groups[i].next)
groups[i].group = int(top.id);
top.order = groups[top.id].vertices;
heapPush(order, pending++, top);
}
size_t next_group = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
if (groups[i].size == 0)
continue;
for (int j = int(i); j >= 0; j = groups[j].next)
destination[j] = unsigned(next_group);
next_group++;
}
assert(next_group <= cluster_count);
return next_group;
}
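// Usage sketch (editorial illustration, not part of the upstream source): partition
// four hand-built single-triangle clusters into groups of roughly two clusters each.
// The geometry and the expected group count are assumptions made for this example.
#include <vector>
static void examplePartitionClusters()
{
	const float positions[6 * 3] = {
	    0, 0, 0, 1, 0, 0, 1, 1, 0,
	    0, 1, 0, 2, 0, 0, 2, 1, 0};
	const unsigned int cluster_indices[] = {
	    0, 1, 2, 0, 2, 3,  // clusters 0 and 1 (left quad)
	    1, 4, 5, 1, 5, 2}; // clusters 2 and 3 (right quad)
	const unsigned int cluster_index_counts[4] = {3, 3, 3, 3};
	std::vector<unsigned int> groups(4); // groups[i] receives the group id of cluster i
	size_t group_count = meshopt_partitionClusters(
	    groups.data(), cluster_indices, 12, cluster_index_counts, 4,
	    positions, 6, sizeof(float) * 3, /* target_partition_size= */ 2);
	(void)group_count; // expected to be 2: nearby, vertex-sharing clusters merge first
}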

View File

@@ -0,0 +1,76 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
union FloatBits
{
float f;
unsigned int ui;
};
unsigned short meshopt_quantizeHalf(float v)
{
FloatBits u = {v};
unsigned int ui = u.ui;
int s = (ui >> 16) & 0x8000;
int em = ui & 0x7fffffff;
// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
int h = (em - (112 << 23) + (1 << 12)) >> 13;
// underflow: flush to zero; 113 encodes exponent -14
h = (em < (113 << 23)) ? 0 : h;
// overflow: infinity; 143 encodes exponent 16
h = (em >= (143 << 23)) ? 0x7c00 : h;
// NaN; note that we convert all types of NaN to qNaN
h = (em > (255 << 23)) ? 0x7e00 : h;
return (unsigned short)(s | h);
}
float meshopt_quantizeFloat(float v, int N)
{
assert(N >= 0 && N <= 23);
FloatBits u = {v};
unsigned int ui = u.ui;
const int mask = (1 << (23 - N)) - 1;
const int round = (1 << (23 - N)) >> 1;
int e = ui & 0x7f800000;
unsigned int rui = (ui + round) & ~mask;
// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
ui = e == 0x7f800000 ? ui : rui;
// flush denormals to zero
ui = e == 0 ? 0 : ui;
u.ui = ui;
return u.f;
}
float meshopt_dequantizeHalf(unsigned short h)
{
unsigned int s = unsigned(h & 0x8000) << 16;
int em = h & 0x7fff;
// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
int r = (em + (112 << 10)) << 13;
// denormal: flush to zero
r = (em < (1 << 10)) ? 0 : r;
// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
r += (em >= (31 << 10)) ? (112 << 23) : 0;
FloatBits u;
u.ui = s | r;
return u.f;
}
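// Usage sketch (editorial illustration, not part of the upstream source): round-trip
// a value through half precision and coarsen a float mantissa; the values in the
// comments are approximate and only illustrate the precision loss.
#include <stdio.h>
static void exampleQuantize()
{
	unsigned short h = meshopt_quantizeHalf(0.1f);
	float back = meshopt_dequantizeHalf(h); // ~0.09998, within half-float precision
	float q = meshopt_quantizeFloat(0.1f, 10); // keep 10 mantissa bits; result is still a regular float
	printf("%f %f\n", back, q);
}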

View File

@@ -0,0 +1,289 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <float.h>
#include <string.h>
// This work is based on:
// Nicolas Capens. Advanced Rasterization. 2004
namespace meshopt
{
const int kViewport = 256;
struct OverdrawBuffer
{
float z[kViewport][kViewport][2];
unsigned int overdraw[kViewport][kViewport][2];
};
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
// (x2-x1 y2-y1)(dzdx) = (z2-z1)
// (x3-x1 y3-y1)(dzdy) (z3-z1)
// we'll solve it with Cramer's rule
float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
float invdet = (det == 0) ? 0 : 1 / det;
dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
return det;
}
// half-space fixed point triangle rasterizer
static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
{
// compute depth gradients
float DZx, DZy;
float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
int sign = det > 0;
// flip backfacing triangles to simplify rasterization logic
if (sign)
{
// flipping v2 & v3 preserves depth gradients since they're based on v1; only v1z is used below
float t;
t = v2x, v2x = v3x, v3x = t;
t = v2y, v2y = v3y, v3y = t;
// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
v1z = kViewport - v1z;
DZx = -DZx;
DZy = -DZy;
}
// coordinates, 28.4 fixed point
int X1 = int(16.0f * v1x + 0.5f);
int X2 = int(16.0f * v2x + 0.5f);
int X3 = int(16.0f * v3x + 0.5f);
int Y1 = int(16.0f * v1y + 0.5f);
int Y2 = int(16.0f * v2y + 0.5f);
int Y3 = int(16.0f * v3y + 0.5f);
// bounding rectangle, clipped against viewport
// since we rasterize pixels with covered centers, min >0.5 should round up
// as for max, due to top-left filling convention we will never rasterize right/bottom edges
// so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case)
int minx = X1 < X2 ? X1 : X2;
minx = minx < X3 ? minx : X3;
minx = (minx + 7) >> 4;
minx = minx < 0 ? 0 : minx;
int miny = Y1 < Y2 ? Y1 : Y2;
miny = miny < Y3 ? miny : Y3;
miny = (miny + 7) >> 4;
miny = miny < 0 ? 0 : miny;
int maxx = X1 > X2 ? X1 : X2;
maxx = maxx > X3 ? maxx : X3;
maxx = (maxx + 7) >> 4;
maxx = maxx > kViewport ? kViewport : maxx;
int maxy = Y1 > Y2 ? Y1 : Y2;
maxy = maxy > Y3 ? maxy : Y3;
maxy = (maxy + 7) >> 4;
maxy = maxy > kViewport ? kViewport : maxy;
// deltas, 28.4 fixed point
int DX12 = X1 - X2;
int DX23 = X2 - X3;
int DX31 = X3 - X1;
int DY12 = Y1 - Y2;
int DY23 = Y2 - Y3;
int DY31 = Y3 - Y1;
// fill convention correction
int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);
// half edge equations, 24.8 fixed point
// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
int FX = (minx << 4) + 8;
int FY = (miny << 4) + 8;
int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1;
int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f);
for (int y = miny; y < maxy; y++)
{
int CX1 = CY1;
int CX2 = CY2;
int CX3 = CY3;
float ZX = ZY;
for (int x = minx; x < maxx; x++)
{
// check if all CXn are non-negative
if ((CX1 | CX2 | CX3) >= 0)
{
if (ZX >= buffer->z[y][x][sign])
{
buffer->z[y][x][sign] = ZX;
buffer->overdraw[y][x][sign]++;
}
}
// signed left shift is UB for negative numbers so use unsigned-signed casts
CX1 -= int(unsigned(DY12) << 4);
CX2 -= int(unsigned(DY23) << 4);
CX3 -= int(unsigned(DY31) << 4);
ZX += DZx;
}
// signed left shift is UB for negative numbers so use unsigned-signed casts
CY1 += int(unsigned(DX12) << 4);
CY2 += int(unsigned(DX23) << 4);
CY3 += int(unsigned(DX31) << 4);
ZY += DZy;
}
}
static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
for (size_t i = 0; i < vertex_count; ++i)
{
const float* v = vertex_positions + i * vertex_stride_float;
for (int j = 0; j < 3; ++j)
{
float vj = v[j];
minv[j] = minv[j] > vj ? vj : minv[j];
maxv[j] = maxv[j] < vj ? vj : maxv[j];
}
}
float extent = 0.f;
extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
float scale = kViewport / extent;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
const float* v = vertex_positions + index * vertex_stride_float;
triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
}
return extent;
}
static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis)
{
for (size_t i = 0; i < index_count; i += 3)
{
const float* vn0 = &triangles[3 * (i + 0)];
const float* vn1 = &triangles[3 * (i + 1)];
const float* vn2 = &triangles[3 * (i + 2)];
switch (axis)
{
case 0:
rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
break;
case 1:
rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
break;
case 2:
rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
break;
}
}
}
} // namespace meshopt
meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
meshopt_OverdrawStatistics result = {};
float* triangles = allocator.allocate<float>(index_count * 3);
transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
for (int axis = 0; axis < 3; ++axis)
{
memset(buffer, 0, sizeof(OverdrawBuffer));
rasterizeTriangles(buffer, triangles, index_count, axis);
for (int y = 0; y < kViewport; ++y)
for (int x = 0; x < kViewport; ++x)
for (int s = 0; s < 2; ++s)
{
unsigned int overdraw = buffer->overdraw[y][x][s];
result.pixels_covered += overdraw > 0;
result.pixels_shaded += overdraw;
}
}
result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f;
return result;
}
meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
meshopt_CoverageStatistics result = {};
float* triangles = allocator.allocate<float>(index_count * 3);
float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
for (int axis = 0; axis < 3; ++axis)
{
memset(buffer, 0, sizeof(OverdrawBuffer));
rasterizeTriangles(buffer, triangles, index_count, axis);
unsigned int covered = 0;
for (int y = 0; y < kViewport; ++y)
for (int x = 0; x < kViewport; ++x)
covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0;
result.coverage[axis] = float(covered) / float(kViewport * kViewport);
}
result.extent = extent;
return result;
}
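// Usage sketch (editorial illustration, not part of the upstream source): measure
// overdraw before and after running meshopt_optimizeOverdraw to validate the
// optimization. Assumes a triangle list with tightly packed float3 positions.
static float exampleOverdraw(const unsigned int* indices, size_t index_count,
    const float* positions, size_t vertex_count)
{
	meshopt_OverdrawStatistics stats = meshopt_analyzeOverdraw(
	    indices, index_count, positions, vertex_count, sizeof(float) * 3);
	// overdraw = shaded pixels / covered pixels; 1.0 is ideal, 1.05 means ~5% extra shading
	return stats.overdraw;
}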

src/external/meshoptimizer/simplifier.cpp vendored Normal file (2880 lines)

File diff suppressed because it is too large

View File

@@ -0,0 +1,340 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <float.h>
#include <string.h>
// This work is based on:
// Fabian Giesen. Decoding Morton codes. 2009
namespace meshopt
{
// "Insert" two 0 bits after each of the 20 low bits of x
inline unsigned long long part1By2(unsigned long long x)
{
x &= 0x000fffffull; // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210
x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210
x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x << 8)) & 0x000f00f00f00f00full; // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull; // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x << 2)) & 0x0249249249249249ull; // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
return x;
}
static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
for (size_t i = 0; i < vertex_count; ++i)
{
const float* v = vertex_positions_data + i * vertex_stride_float;
for (int j = 0; j < 3; ++j)
{
float vj = v[j];
minv[j] = minv[j] > vj ? vj : minv[j];
maxv[j] = maxv[j] < vj ? vj : maxv[j];
}
}
float extent = 0.f;
extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
// rescale each axis to 16 bits to get 48-bit Morton codes
float scale = extent == 0 ? 0.f : 65535.f / extent;
// generate Morton order based on the position inside a unit cube
for (size_t i = 0; i < vertex_count; ++i)
{
const float* v = vertex_positions_data + i * vertex_stride_float;
int x = int((v[0] - minv[0]) * scale + 0.5f);
int y = int((v[1] - minv[1]) * scale + 0.5f);
int z = int((v[2] - minv[2]) * scale + 0.5f);
if (morton)
result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
else
result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
}
}
static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count)
{
unsigned int hist[1024];
memset(hist, 0, sizeof(hist));
// compute histogram (assume keys are 10-bit)
for (size_t i = 0; i < count; ++i)
hist[keys[i]]++;
unsigned int sum = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 1024; ++i)
{
unsigned int h = hist[i];
hist[i] = sum;
sum += h;
}
assert(sum == count);
// reorder values
for (size_t i = 0; i < count; ++i)
{
unsigned int id = keys[source[i]];
destination[hist[id]++] = source[i];
}
}
static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count)
{
memset(hist, 0, sizeof(hist));
// compute 2 8-bit histograms in parallel
for (size_t i = 0; i < count; ++i)
{
unsigned long long id = data[i];
hist[(id >> 0) & 255][0]++;
hist[(id >> 8) & 255][1]++;
}
unsigned int sum0 = 0, sum1 = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 256; ++i)
{
unsigned int h0 = hist[i][0], h1 = hist[i][1];
hist[i][0] = sum0;
hist[i][1] = sum1;
sum0 += h0;
sum1 += h1;
}
assert(sum0 == count && sum1 == count);
}
static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass)
{
int bitoff = pass * 8;
for (size_t i = 0; i < count; ++i)
{
unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255;
destination[hist[id][pass]++] = source[i];
}
}
static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
{
size_t l = 0, r = split;
for (size_t i = 0; i < count; ++i)
{
unsigned char side = sides[order[i]];
target[side ? r : l] = order[i];
// branchless cursor update: advance l when side == 0, r when side == 1
l += 1;
l -= side;
r += side;
}
assert(l == split && r == count);
}
static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size)
{
if (count <= cluster_size)
{
memcpy(destination, orderx, count * sizeof(unsigned int));
return;
}
unsigned int* axes[3] = {orderx, ordery, orderz};
int bestk = -1;
unsigned int bestdim = 0;
for (int k = 0; k < 3; ++k)
{
const unsigned int mask = (1 << 20) - 1;
unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask);
if (dim >= bestdim)
{
bestk = k;
bestdim = dim;
}
}
assert(bestk >= 0);
// split roughly in half, with the left split always being aligned to cluster size
size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
assert(split > 0 && split < count);
// mark sides of split for partitioning
unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
for (size_t i = 0; i < split; ++i)
sides[axes[bestk][i]] = 0;
for (size_t i = split; i < count; ++i)
sides[axes[bestk][i]] = 1;
// partition all axes into two sides, maintaining order
unsigned int* temp = static_cast<unsigned int*>(scratch);
for (int k = 0; k < 3; ++k)
{
if (k == bestk)
continue;
unsigned int* axis = axes[k];
memcpy(temp, axis, sizeof(unsigned int) * count);
partitionPoints(axis, temp, sides, split, count);
}
splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
}
} // namespace meshopt
void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);
unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys
unsigned short* keyk = (unsigned short*)(scratch + vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
destination[i] = unsigned(i);
unsigned int* order[] = {scratch, destination};
// 5-pass radix sort computes the resulting order into scratch
for (int k = 0; k < 5; ++k)
{
// copy 10-bit key segments into keyk to reduce cache pressure during radix pass
for (size_t i = 0; i < vertex_count; ++i)
keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
}
// since our remap table is mapping old=>new, we need to reverse it
for (size_t i = 0; i < vertex_count; ++i)
destination[scratch[i]] = unsigned(i);
}
void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
(void)vertex_count;
size_t face_count = index_count / 3;
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
meshopt_Allocator allocator;
float* centroids = allocator.allocate<float>(face_count * 3);
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
assert(a < vertex_count && b < vertex_count && c < vertex_count);
const float* va = vertex_positions + a * vertex_stride_float;
const float* vb = vertex_positions + b * vertex_stride_float;
const float* vc = vertex_positions + c * vertex_stride_float;
centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f;
centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f;
centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f;
}
unsigned int* remap = allocator.allocate<unsigned int>(face_count);
meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3);
// support in-order remap
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
unsigned int r = remap[i];
destination[r * 3 + 0] = a;
destination[r * 3 + 1] = b;
destination[r * 3 + 2] = c;
}
}
void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size)
{
using namespace meshopt;
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(cluster_size > 0);
meshopt_Allocator allocator;
unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false);
unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3);
unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count);
for (int k = 0; k < 3; ++k)
{
// copy 16-bit key segments into keyk to reduce cache pressure during radix pass
for (size_t i = 0; i < vertex_count; ++i)
keyk[i] = (unsigned short)(keys[i] >> (k * 20));
unsigned int hist[256][2];
computeHistogram(hist, keyk, vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
order[k * vertex_count + i] = unsigned(i);
radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0);
radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1);
}
splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
}
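// Usage sketch (editorial illustration, not part of the upstream source): reorder
// vertices along a Morton curve and apply the remap to both buffers. Assumes
// tightly packed float3 positions; meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
// are provided elsewhere in the library.
#include <vector>
static void exampleSpatialSort(std::vector<float>& positions, std::vector<unsigned int>& indices)
{
	size_t vertex_count = positions.size() / 3;
	std::vector<unsigned int> remap(vertex_count); // remap[old] = new
	meshopt_spatialSortRemap(remap.data(), positions.data(), vertex_count, sizeof(float) * 3);
	std::vector<float> reordered(positions.size());
	meshopt_remapVertexBuffer(reordered.data(), positions.data(), vertex_count,
	    sizeof(float) * 3, remap.data());
	positions.swap(reordered);
	meshopt_remapIndexBuffer(indices.data(), indices.data(), indices.size(), remap.data());
}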

View File

@@ -0,0 +1,296 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <limits.h>
#include <string.h>
// This work is based on:
// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
namespace meshopt
{
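// picks the buffered triangle whose lowest-valence vertex is smallest, so new
// strips start at boundary/corner vertices that are about to run out of triangles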
static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
{
unsigned int index = 0;
unsigned int iv = ~0u;
for (size_t i = 0; i < buffer_size; ++i)
{
unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);
if (v < iv)
{
index = unsigned(i);
iv = v;
}
}
return index;
}
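// finds a buffered triangle that continues edge (e0, e1); returns
// (triangle index << 2) | index of the third vertex, or -1 if none matches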
static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
{
for (size_t i = 0; i < buffer_size; ++i)
{
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
if (e0 == a && e1 == b)
return (int(i) << 2) | 2;
else if (e0 == b && e1 == c)
return (int(i) << 2) | 0;
else if (e0 == c && e1 == a)
return (int(i) << 2) | 1;
}
return -1;
}
} // namespace meshopt
size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
{
assert(destination != indices);
assert(index_count % 3 == 0);
using namespace meshopt;
meshopt_Allocator allocator;
const size_t buffer_capacity = 8;
unsigned int buffer[buffer_capacity][3] = {};
unsigned int buffer_size = 0;
size_t index_offset = 0;
unsigned int strip[2] = {};
unsigned int parity = 0;
size_t strip_size = 0;
// compute vertex valence; this is used to prioritize starting triangle for strips
// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
memset(valence, 0, vertex_count);
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
valence[index]++;
}
int next = -1;
while (buffer_size > 0 || index_offset < index_count)
{
assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
// fill triangle buffer
while (buffer_size < buffer_capacity && index_offset < index_count)
{
buffer[buffer_size][0] = indices[index_offset + 0];
buffer[buffer_size][1] = indices[index_offset + 1];
buffer[buffer_size][2] = indices[index_offset + 2];
buffer_size++;
index_offset += 3;
}
assert(buffer_size > 0);
if (next >= 0)
{
unsigned int i = next >> 2;
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
unsigned int v = buffer[i][next & 3];
// ordered removal from the buffer
memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
buffer_size--;
// update vertex valences for strip start heuristic
valence[a]--;
valence[b]--;
valence[c]--;
// find next triangle (note that edge order flips on every iteration)
// in some cases we need to perform a swap to pick a different outgoing triangle edge
// for [a b c], the default strip edge is [b c], but we might want to use [a c]
int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
if (cont < 0 && swap >= 0)
{
// [a b c] => [a b a c]
destination[strip_size++] = strip[0];
destination[strip_size++] = v;
// next strip has same winding
// ? a b => b a v
strip[1] = v;
next = swap;
}
else
{
// emit the next vertex in the strip
destination[strip_size++] = v;
// next strip has flipped winding
strip[0] = strip[1];
strip[1] = v;
parity ^= 1;
next = cont;
}
}
else
{
// if we didn't find anything, we need to find the next new triangle
// we use a heuristic to maximize the strip length
unsigned int i = findStripFirst(buffer, buffer_size, valence);
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
// ordered removal from the buffer
memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
buffer_size--;
// update vertex valences for strip start heuristic
valence[a]--;
valence[b]--;
valence[c]--;
// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
int ea = findStripNext(buffer, buffer_size, c, b);
int eb = findStripNext(buffer, buffer_size, a, c);
int ec = findStripNext(buffer, buffer_size, b, a);
// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
// reasons - slightly improves the stripification efficiency
int mine = INT_MAX;
mine = (ea >= 0 && mine > ea) ? ea : mine;
mine = (eb >= 0 && mine > eb) ? eb : mine;
mine = (ec >= 0 && mine > ec) ? ec : mine;
if (ea == mine)
{
// keep abc
next = ea;
}
else if (eb == mine)
{
// abc -> bca
unsigned int t = a;
a = b, b = c, c = t;
next = eb;
}
else if (ec == mine)
{
// abc -> cab
unsigned int t = c;
c = b, b = a, a = t;
next = ec;
}
if (restart_index)
{
if (strip_size)
destination[strip_size++] = restart_index;
destination[strip_size++] = a;
destination[strip_size++] = b;
destination[strip_size++] = c;
// new strip always starts with the same edge winding
strip[0] = b;
strip[1] = c;
parity = 1;
}
else
{
if (strip_size)
{
// connect last strip using degenerate triangles
destination[strip_size++] = strip[1];
destination[strip_size++] = a;
}
// note that we may need to flip the emitted triangle based on parity
// we always end up with outgoing edge "cb" in the end
unsigned int e0 = parity ? c : b;
unsigned int e1 = parity ? b : c;
destination[strip_size++] = a;
destination[strip_size++] = e0;
destination[strip_size++] = e1;
strip[0] = e0;
strip[1] = e1;
parity ^= 1;
}
}
}
return strip_size;
}
size_t meshopt_stripifyBound(size_t index_count)
{
assert(index_count % 3 == 0);
// worst case without restarts is 2 degenerate indices and 3 indices per triangle
// worst case with restarts is 1 restart index and 3 indices per triangle
return (index_count / 3) * 5;
}
size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
{
assert(destination != indices);
size_t offset = 0;
size_t start = 0;
for (size_t i = 0; i < index_count; ++i)
{
if (restart_index && indices[i] == restart_index)
{
start = i + 1;
}
else if (i - start >= 2)
{
unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i];
// flip winding for odd triangles
if ((i - start) & 1)
{
unsigned int t = a;
a = b, b = t;
}
// although we use restart indices, strip swaps still produce degenerate triangles, so skip them
if (a != b && a != c && b != c)
{
destination[offset + 0] = a;
destination[offset + 1] = b;
destination[offset + 2] = c;
offset += 3;
}
}
}
return offset;
}
size_t meshopt_unstripifyBound(size_t index_count)
{
assert(index_count == 0 || index_count >= 3);
return (index_count == 0) ? 0 : (index_count - 2) * 3;
}
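// Usage sketch (editorial illustration, not part of the upstream source): convert a
// triangle list to a restart-index strip and back; the round-trip drops degenerate
// triangles but preserves the rendered geometry. For best results the input should
// first be reordered with meshopt_optimizeVertexCacheStrip.
#include <vector>
static void exampleStripify(const std::vector<unsigned int>& tris, size_t vertex_count)
{
	const unsigned int restart = ~0u; // 0 would instead emit degenerate connectors
	std::vector<unsigned int> strip(meshopt_stripifyBound(tris.size()));
	strip.resize(meshopt_stripify(strip.data(), tris.data(), tris.size(), vertex_count, restart));
	std::vector<unsigned int> list(meshopt_unstripifyBound(strip.size()));
	list.resize(meshopt_unstripify(list.data(), strip.data(), strip.size(), restart));
}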

View File

@@ -0,0 +1,467 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// This work is based on:
// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{
const size_t kCacheSizeMax = 16;
const size_t kValenceMax = 8;
struct VertexScoreTable
{
float cache[1 + kCacheSizeMax];
float live[1 + kValenceMax];
};
// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
static const VertexScoreTable kVertexScoreTable = {
{0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
{0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};
// Tuned to minimize the encoded index buffer size
static const VertexScoreTable kVertexScoreTableStrip = {
{0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
{0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
struct TriangleAdjacency
{
unsigned int* counts;
unsigned int* offsets;
unsigned int* data;
};
static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
size_t face_count = index_count / 3;
// allocate arrays
adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
adjacency.data = allocator.allocate<unsigned int>(index_count);
// fill triangle counts
memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
for (size_t i = 0; i < index_count; ++i)
{
assert(indices[i] < vertex_count);
adjacency.counts[indices[i]]++;
}
// fill offset table
unsigned int offset = 0;
for (size_t i = 0; i < vertex_count; ++i)
{
adjacency.offsets[i] = offset;
offset += adjacency.counts[i];
}
assert(offset == index_count);
// fill triangle data
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
adjacency.data[adjacency.offsets[a]++] = unsigned(i);
adjacency.data[adjacency.offsets[b]++] = unsigned(i);
adjacency.data[adjacency.offsets[c]++] = unsigned(i);
}
// fix offsets that have been disturbed by the previous pass
for (size_t i = 0; i < vertex_count; ++i)
{
assert(adjacency.offsets[i] >= adjacency.counts[i]);
adjacency.offsets[i] -= adjacency.counts[i];
}
}
static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
{
// check dead-end stack
while (dead_end_top)
{
unsigned int vertex = dead_end[--dead_end_top];
if (live_triangles[vertex] > 0)
return vertex;
}
// input order
while (input_cursor < vertex_count)
{
if (live_triangles[input_cursor] > 0)
return input_cursor;
++input_cursor;
}
return ~0u;
}
static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
unsigned int best_candidate = ~0u;
int best_priority = -1;
for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
{
unsigned int vertex = *next_candidate;
// only consider vertices that still have live triangles
if (live_triangles[vertex] > 0)
{
int priority = 0;
// will it be in cache after fanning?
if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
{
priority = timestamp - cache_timestamps[vertex]; // position in cache
}
if (priority > best_priority)
{
best_candidate = vertex;
best_priority = priority;
}
}
}
return best_candidate;
}
static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
{
assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
}
static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
{
// input order
while (input_cursor < face_count)
{
if (!emitted_flags[input_cursor])
return input_cursor;
++input_cursor;
}
return ~0u;
}
} // namespace meshopt
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
using namespace meshopt;
assert(index_count % 3 == 0);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
unsigned int cache_size = 16;
assert(cache_size <= kCacheSizeMax);
size_t face_count = index_count / 3;
// build adjacency information
TriangleAdjacency adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
unsigned int* live_triangles = adjacency.counts;
// emitted flags
unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
memset(emitted_flags, 0, face_count);
// compute initial vertex scores
float* vertex_scores = allocator.allocate<float>(vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
// compute triangle scores
float* triangle_scores = allocator.allocate<float>(face_count);
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0];
unsigned int b = indices[i * 3 + 1];
unsigned int c = indices[i * 3 + 2];
triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
}
unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
unsigned int* cache = cache_holder;
unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
size_t cache_count = 0;
unsigned int current_triangle = 0;
unsigned int input_cursor = 1;
unsigned int output_triangle = 0;
while (current_triangle != ~0u)
{
assert(output_triangle < face_count);
unsigned int a = indices[current_triangle * 3 + 0];
unsigned int b = indices[current_triangle * 3 + 1];
unsigned int c = indices[current_triangle * 3 + 2];
// output indices
destination[output_triangle * 3 + 0] = a;
destination[output_triangle * 3 + 1] = b;
destination[output_triangle * 3 + 2] = c;
output_triangle++;
// update emitted flags
emitted_flags[current_triangle] = true;
triangle_scores[current_triangle] = 0;
// new triangle
size_t cache_write = 0;
cache_new[cache_write++] = a;
cache_new[cache_write++] = b;
cache_new[cache_write++] = c;
// old triangles
for (size_t i = 0; i < cache_count; ++i)
{
unsigned int index = cache[i];
cache_new[cache_write] = index;
cache_write += (index != a) & (index != b) & (index != c);
}
unsigned int* cache_temp = cache;
cache = cache_new, cache_new = cache_temp;
cache_count = cache_write > cache_size ? cache_size : cache_write;
// remove emitted triangle from adjacency data
// this makes sure that we spend less time traversing these lists on subsequent iterations
// live triangle counts are updated as a byproduct of these adjustments
for (size_t k = 0; k < 3; ++k)
{
unsigned int index = indices[current_triangle * 3 + k];
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
size_t neighbors_size = adjacency.counts[index];
for (size_t i = 0; i < neighbors_size; ++i)
{
unsigned int tri = neighbors[i];
if (tri == current_triangle)
{
neighbors[i] = neighbors[neighbors_size - 1];
adjacency.counts[index]--;
break;
}
}
}
unsigned int best_triangle = ~0u;
float best_score = 0;
// update cache positions, vertex scores and triangle scores, and find next best triangle
for (size_t i = 0; i < cache_write; ++i)
{
unsigned int index = cache[i];
// no need to update scores if we are never going to use this vertex
if (adjacency.counts[index] == 0)
continue;
int cache_position = i >= cache_size ? -1 : int(i);
// update vertex score
float score = vertexScore(table, cache_position, live_triangles[index]);
float score_diff = score - vertex_scores[index];
vertex_scores[index] = score;
// update scores of vertex triangles
const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int tri = *it;
assert(!emitted_flags[tri]);
float tri_score = triangle_scores[tri] + score_diff;
assert(tri_score > 0);
best_triangle = best_score < tri_score ? tri : best_triangle;
best_score = best_score < tri_score ? tri_score : best_score;
triangle_scores[tri] = tri_score;
}
}
// step through input triangles in order if we hit a dead-end
current_triangle = best_triangle;
if (current_triangle == ~0u)
{
current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
}
}
assert(input_cursor == face_count);
assert(output_triangle == face_count);
}
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
}
void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
}
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(cache_size >= 3);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
size_t face_count = index_count / 3;
// build adjacency information
TriangleAdjacency adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
// live triangle counts
unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
// cache time stamps
unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
// dead-end stack
unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
unsigned int dead_end_top = 0;
// emitted flags
unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
memset(emitted_flags, 0, face_count);
unsigned int current_vertex = 0;
unsigned int timestamp = cache_size + 1;
unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
unsigned int output_triangle = 0;
while (current_vertex != ~0u)
{
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
// emit all vertex neighbors
const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int triangle = *it;
if (!emitted_flags[triangle])
{
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
// output indices
destination[output_triangle * 3 + 0] = a;
destination[output_triangle * 3 + 1] = b;
destination[output_triangle * 3 + 2] = c;
output_triangle++;
// update dead-end stack
dead_end[dead_end_top + 0] = a;
dead_end[dead_end_top + 1] = b;
dead_end[dead_end_top + 2] = c;
dead_end_top += 3;
// update live triangle counts
live_triangles[a]--;
live_triangles[b]--;
live_triangles[c]--;
// update cache info
// if vertex is not in cache, put it in cache
if (timestamp - cache_timestamps[a] > cache_size)
cache_timestamps[a] = timestamp++;
if (timestamp - cache_timestamps[b] > cache_size)
cache_timestamps[b] = timestamp++;
if (timestamp - cache_timestamps[c] > cache_size)
cache_timestamps[c] = timestamp++;
// update emitted flags
emitted_flags[triangle] = true;
}
}
// next candidates are the ones we pushed to dead-end stack just now
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
// get next vertex
current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
if (current_vertex == ~0u)
{
current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
}
}
assert(output_triangle == face_count);
}
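// Usage sketch (editorial illustration, not part of the upstream source): reorder a
// triangle list for the post-transform vertex cache; in-place operation is supported.
// The FIFO variant models a fixed-size cache and is mostly useful for older hardware
// or for analysis.
#include <vector>
static void exampleVertexCache(std::vector<unsigned int>& indices, size_t vertex_count)
{
	meshopt_optimizeVertexCache(indices.data(), indices.data(), indices.size(), vertex_count);
	// or, for an explicit FIFO model with 16 entries:
	// meshopt_optimizeVertexCacheFifo(indices.data(), indices.data(), indices.size(), vertex_count, 16);
}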

src/external/meshoptimizer/vertexcodec.cpp vendored Normal file (1910 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,74 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
assert(index_count % 3 == 0);
memset(destination, -1, vertex_count * sizeof(unsigned int));
unsigned int next_vertex = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
if (destination[index] == ~0u)
{
destination[index] = next_vertex++;
}
}
assert(next_vertex <= vertex_count);
return next_vertex;
}
size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
{
assert(index_count % 3 == 0);
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
// support in-place optimization
if (destination == vertices)
{
unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
memcpy(vertices_copy, vertices, vertex_count * vertex_size);
vertices = vertices_copy;
}
// build vertex remap table
unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
memset(vertex_remap, -1, vertex_count * sizeof(unsigned int));
unsigned int next_vertex = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
unsigned int& remap = vertex_remap[index];
if (remap == ~0u) // vertex was not added to destination VB
{
// add vertex
memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
remap = next_vertex++;
}
// modify indices in place
indices[i] = remap;
}
assert(next_vertex <= vertex_count);
return next_vertex;
}
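// Usage sketch (editorial illustration, not part of the upstream source): reorder an
// interleaved vertex buffer to match index order after cache optimization. The vertex
// layout below is an assumption made for this example.
#include <vector>
struct ExampleVertex { float px, py, pz, nx, ny, nz, u, v; };
static void exampleVertexFetch(std::vector<ExampleVertex>& vertices, std::vector<unsigned int>& indices)
{
	std::vector<ExampleVertex> reordered(vertices.size());
	size_t unique = meshopt_optimizeVertexFetch(reordered.data(), indices.data(),
	    indices.size(), vertices.data(), vertices.size(), sizeof(ExampleVertex));
	reordered.resize(unique); // drop vertices that were never referenced
	vertices.swap(reordered);
}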

View File

@@ -49,6 +49,10 @@
#include "external/tiny-color-io.h"
#endif
#if defined(TINYUSDZ_WITH_MESHOPT)
#include "external/meshoptimizer/meshoptimizer.h"
#endif
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Weverything"
@@ -1309,6 +1313,80 @@ bool ArrayValueToVertexAttribute(
value.underlying_type_name()));
}
#if defined(TINYUSDZ_WITH_MESHOPT)
//
// Optimize RenderMesh indices using meshoptimizer
//
void OptimizeRenderMeshIndices(RenderMesh& mesh) {
// Only optimize triangulated meshes with valid indices
if (!mesh.is_triangulated() || mesh.triangulatedFaceVertexIndices.empty() || mesh.points.empty()) {
return;
}
const size_t index_count = mesh.triangulatedFaceVertexIndices.size();
const size_t vertex_count = mesh.points.size();
if (index_count == 0 || vertex_count == 0) {
return;
}
// Create optimized index buffer
std::vector<unsigned int> optimized_indices(index_count);
// Convert indices to unsigned int for meshoptimizer
std::vector<unsigned int> indices(index_count);
for (size_t i = 0; i < index_count; i++) {
indices[i] = static_cast<unsigned int>(mesh.triangulatedFaceVertexIndices[i]);
}
// Step 1: Optimize vertex cache
meshopt_optimizeVertexCache(optimized_indices.data(), indices.data(),
index_count, vertex_count);
// Step 2: Optimize overdraw (requires vertex positions)
if (!mesh.points.empty()) {
std::vector<unsigned int> overdraw_optimized(index_count);
meshopt_optimizeOverdraw(overdraw_optimized.data(), optimized_indices.data(),
index_count,
reinterpret_cast<const float*>(mesh.points.data()),
vertex_count,
sizeof(vec3), // stride
1.05f); // threshold (allow up to 5% vertex cache degradation)
optimized_indices = std::move(overdraw_optimized);
}
// Step 3: Optimize vertex fetch
std::vector<unsigned int> fetch_remap(vertex_count);
size_t unique_vertices = meshopt_optimizeVertexFetchRemap(fetch_remap.data(),
optimized_indices.data(),
index_count,
vertex_count);
// Only apply vertex fetch optimization if it reduces vertex count
if (unique_vertices < vertex_count && unique_vertices > 0) {
// Remap indices
meshopt_remapIndexBuffer(optimized_indices.data(), optimized_indices.data(),
index_count, fetch_remap.data());
// Remap vertex positions
std::vector<vec3> optimized_points(unique_vertices);
meshopt_remapVertexBuffer(optimized_points.data(), mesh.points.data(),
vertex_count, sizeof(vec3), fetch_remap.data());
mesh.points = std::move(optimized_points);
// TODO: Remap other per-vertex attributes (normals, texcoords, etc.) with the same
// fetch_remap; until that is implemented, this path is only safe when positions are
// the sole per-vertex buffer, since unremapped attributes would go out of sync with
// the rewritten indices.
}
// Convert back to uint32_t and update mesh
for (size_t i = 0; i < index_count; i++) {
mesh.triangulatedFaceVertexIndices[i] = static_cast<uint32_t>(optimized_indices[i]);
}
}
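// Editorial sketch for the TODO above (hypothetical: assumes a per-vertex
// `mesh.normals` buffer of vec3 parallel to `mesh.points`; the actual RenderMesh
// attribute storage may differ, so this is illustrative only):
//
//   std::vector<vec3> optimized_normals(unique_vertices);
//   meshopt_remapVertexBuffer(optimized_normals.data(), mesh.normals.data(),
//                             vertex_count, sizeof(vec3), fetch_remap.data());
//   mesh.normals = std::move(optimized_normals);
//
// Every per-vertex buffer must be remapped with the same fetch_remap, otherwise
// the rewritten indices would point at stale attribute data.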
#endif
} // namespace
bool ToVertexAttribute(const GeomPrimvar &primvar, const std::string &name,
@@ -4114,6 +4192,11 @@ bool RenderSceneConverter::ConvertMesh(
dst.abs_path = abs_prim_path.full_path_name();
dst.display_name = mesh.metas().displayName.value_or("");
#if defined(TINYUSDZ_WITH_MESHOPT)
// Optimize mesh indices for better rendering performance
OptimizeRenderMeshIndices(dst);
#endif
(*dstMesh) = std::move(dst);
return true;