Skip to content

Commit

Permalink
Meshlet Occlusion Culling (#78)
Browse files Browse the repository at this point in the history
- record visibility per meshlet
- required some improvements and restructuring throughout
- perform occlusion-culling in task-shader
  • Loading branch information
crocdialer authored Oct 25, 2024
1 parent 4e2965a commit 96fc668
Show file tree
Hide file tree
Showing 22 changed files with 663 additions and 231 deletions.
19 changes: 13 additions & 6 deletions include/vierkant/Rasterizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,12 @@ using double_millisecond_t = std::chrono::duration<double, std::milli>;
/**
* @brief Rasterizer can be used to run arbitrary rasterization/graphics pipelines.
*
* It will not render anything on its own, only record secondary command-buffers,
* meant to be executed within an existing renderpass.
* It will not submit anything on its own, only record drawing commands into command-buffers.
*
* Required resources like descriptor-sets and uniform-buffers will be created
* and kept alive, depending on the requested number of in-flight (pending) frames.
* Required resources like descriptor-sets and uniform-buffers will be created
* and kept alive, depending on the requested number of in-flight (pending) frames.
*
* Renderer is NOT thread-safe, with the exception of stage_drawables(...).
* Renderer is NOT thread-safe, with the exception of stage_drawables(...).
*/
class Rasterizer
{
Expand All @@ -53,6 +52,8 @@ class Rasterizer
BINDING_MESHLETS = 13,
BINDING_MESHLET_VERTICES = 14,
BINDING_MESHLET_TRIANGLES = 15,
BINDING_MESHLET_VISIBILITY = 16,
BINDING_DEPTH_PYRAMID = 17,
BINDING_MAX_RANGE
};

Expand All @@ -76,16 +77,18 @@ class Rasterizer
vierkant::Mesh::lod_t lods[8];
};

struct indexed_indirect_command_t
struct alignas(16) indexed_indirect_command_t
{
VkDrawIndexedIndirectCommand vk_draw = {};// size: 5

VkDrawMeshTasksIndirectCommandEXT vk_mesh_draw = {};// size: 3

uint32_t visible = false;
uint32_t late_visible = false;
uint32_t object_index = 0;
uint32_t base_meshlet = 0;
uint32_t num_meshlets = 0;
uint32_t meshlet_visibility_index = 0;
uint32_t count_buffer_offset = 0;
uint32_t first_draw_index = 0;
};
Expand All @@ -104,6 +107,9 @@ class Rasterizer
//! device array containing any array of material_t
vierkant::BufferPtr materials;

//! device array containing a visibility bitfield for all meshlets
vierkant::BufferPtr meshlet_visibilities;

//! host-visible array of indexed_indirect_command_t
vierkant::BufferPtr draws_in;

Expand Down Expand Up @@ -271,6 +277,7 @@ class Rasterizer
vierkant::BufferPtr mesh_draw_buffer;
vierkant::BufferPtr mesh_entry_buffer;
vierkant::BufferPtr material_buffer;
vierkant::BufferPtr meshlet_visibility_buffer;

// host visible keep-alive staging-buffer
vierkant::BufferPtr staging_buffer;
Expand Down
12 changes: 6 additions & 6 deletions include/vierkant/descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,15 @@ DescriptorSetLayoutPtr create_descriptor_set_layout(const vierkant::DevicePtr &d
const descriptor_map_t &descriptors);

/**
* @brief Create a shared VkDescriptorSet (DescriptorSetPtr) for a provided DescriptorLayout
* @brief Create a shared VkDescriptorSet (DescriptorSetPtr) for a provided set-layout.
*
* @param device handle for the vierkant::Device to create the DescriptorSet
* @param pool handle for a shared VkDescriptorPool to allocate the DescriptorSet from
* @param layout handle for a shared VkDescriptorSetLayout to use as blueprint
* @param device handle for the vierkant::Device to create the DescriptorSet
* @param pool handle for a shared VkDescriptorPool to allocate the DescriptorSet from
* @param set_layout handle for a VkDescriptorSetLayout
* @return the newly created DescriptorSetPtr
*/
DescriptorSetPtr create_descriptor_set(const vierkant::DevicePtr &device, const DescriptorPoolPtr &pool,
const DescriptorSetLayoutPtr &layout, bool variable_count);
VkDescriptorSetLayout set_layout, bool variable_count);

/**
* @brief Update an existing shared VkDescriptorSet with a provided array of vierkant::descriptor_t.
Expand Down Expand Up @@ -146,7 +146,7 @@ DescriptorSetLayoutPtr find_or_create_set_layout(const vierkant::DevicePtr &devi
* @return a retrieved or newly created, shared VkDescriptorSet.
*/
DescriptorSetPtr find_or_create_descriptor_set(const vierkant::DevicePtr &device,
const DescriptorSetLayoutPtr &set_layout,
VkDescriptorSetLayout set_layout,
const descriptor_map_t &descriptors,
const vierkant::DescriptorPoolPtr &pool, descriptor_set_map_t &last,
descriptor_set_map_t &current, bool variable_count,
Expand Down
17 changes: 15 additions & 2 deletions include/vierkant/gpu_culling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ struct gpu_cull_params_t
//! limit number of LoDs (0: no limit)
uint32_t max_num_lods = 0;

bool skip_meshlets = false;

VkQueue queue = VK_NULL_HANDLE;
vierkant::semaphore_submit_info_t semaphore_submit_info = {};

Expand Down Expand Up @@ -77,16 +79,27 @@ struct create_depth_pyramid_params_t
* @brief create_gpu_cull_context is a factory to create an opaque gpu_cull_context_ptr.
*
* @param device a provided vierkant::Device.
* @param size context framebuffer-size
* @param pipeline_cache an optional pipeline_cache.
* @return an opaque pointer, owning a gpu_cull_context.
*/
gpu_cull_context_ptr create_gpu_cull_context(const vierkant::DevicePtr &device,
const glm::vec2 &size,
const vierkant::PipelineCachePtr &pipeline_cache = nullptr);

/**
* @brief create_depth_pyramid can be used to create a 'hierarchical z-buffer (hzb)' or 'depth-pyramid'.
* @brief retrieve internally stored 'hierarchical z-buffer (hzb)' / depth-pyramid.
*
* @param context a provided gpu_cull_context_t
 * @return a vierkant::ImagePtr containing the internally stored depth-pyramid
*/
vierkant::ImagePtr get_depth_pyramid(const vierkant::gpu_cull_context_ptr &context);

/**
 * @brief create_depth_pyramid can be used to create a 'hierarchical z-buffer (hzb)' / depth-pyramid.
*
* @param context a provided vierkant::Device.
* @param context a provided gpu_cull_context_t
* @param params a provided struct with parameters
* @return a vierkant::ImagePtr containing the created depth-pyramid
*/
Expand Down
10 changes: 5 additions & 5 deletions include/vierkant/hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

#pragma once

#include <functional>
#include <cstring>
#include <cstdint>
#include <cstring>
#include <functional>

namespace vierkant
{
Expand Down Expand Up @@ -90,11 +90,11 @@ static inline uint32_t murmur3_32(const K &key, uint32_t seed)

if constexpr(num_hashes)
{
auto ptr = reinterpret_cast<const uint32_t *>(&key);
auto ptr = reinterpret_cast<const uint32_t *>(&key), end = ptr + num_hashes;

for(uint32_t i = num_hashes; i; i--)
for(; ptr < end; ++ptr)
{
h ^= murmur_32_scramble(ptr[i - 1]);
h ^= murmur_32_scramble(*ptr);
h = (h << 13) | (h >> 19);
h = h * 5 + 0xe6546b64;
}
Expand Down
196 changes: 194 additions & 2 deletions include/vierkant/linear_hashmap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,198 @@ class linear_hashmap
clear();
}

//! @return the number of key/value pairs currently stored
[[nodiscard]] inline size_t size() const { return m_num_elements; }

//! @return the number of slots in the internal storage-array
[[nodiscard]] inline size_t capacity() const { return m_capacity; }

//! @return true, if no key/value pairs are stored
[[nodiscard]] inline bool empty() const { return size() == 0; }

//! remove all key/value pairs, keeping the current capacity
inline void clear()
{
    m_num_elements = 0;

    // mark every slot as empty: default-constructed key, disengaged optional
    for(uint64_t i = 0; i < m_capacity; ++i)
    {
        auto &slot = m_storage[i];
        slot.key = key_t();
        slot.value = std::optional<value_t>();
    }
}

/**
 * @brief insert or overwrite a key/value pair.
 *
 * grows/rehashes the map beforehand, if the maximum load-factor would be exceeded.
 * NOTE: a default-constructed key_t marks empty slots — do not use it as a key.
 *
 * @param key    a provided key
 * @param value  a provided value to store
 * @return the probe-length that was required to place the pair
 */
inline uint32_t put(const key_t &key, const value_t &value)
{
    check_load_factor();
    return internal_put(key, value);
}

/**
 * @brief look up the value stored for a provided key.
 *
 * uses linear probing; a default-constructed key marks an empty slot and ends the probe.
 * a matching key with a disengaged value (removed entry / tombstone) is skipped.
 * NOTE(review): probing assumes at least one empty slot exists (load-factor < 1) — confirm
 * max_load_factor is never effectively 1.0 on a full table.
 *
 * @param key a provided key
 * @return an optional containing the stored value, or std::nullopt if not found
 */
[[nodiscard]] std::optional<value_t> get(const key_t &key) const
{
    if(!m_capacity) { return {}; }

    // use 64-bit index-arithmetic to match internal_put, so probe-sequences
    // agree even for capacities beyond 2^32
    for(uint64_t idx = m_hash_fn(key);; idx++)
    {
        idx &= m_capacity - 1;
        auto &item = m_storage[idx];
        if(item.key == key_t()) { return {}; }
        else if(key == item.key)
        {
            if(item.value) { return item.value; }
        }
    }
}

/**
 * @brief remove a key/value pair, if present.
 *
 * the slot keeps its key as a tombstone (value disengaged), so later probes
 * for other keys colliding on this slot still find their entries.
 *
 * @param key a provided key
 */
void remove(const key_t &key)
{
    if(!m_capacity) { return; }

    // use 64-bit index-arithmetic to match internal_put/get probe-sequences
    for(uint64_t idx = m_hash_fn(key);; idx++)
    {
        idx &= m_capacity - 1;
        auto &item = m_storage[idx];
        if(item.key == key_t()) { return; }
        else if(key == item.key && item.value)
        {
            item.value = {};
            m_num_elements--;
            return;
        }
    }
}

//! @return true, if a value is currently stored for the provided key
[[nodiscard]] inline bool contains(const key_t &key) const { return get(key).has_value(); }

/**
 * @brief serialize the complete storage-array into a provided destination.
 *
 * output-layout is a flat array of m_capacity {key, value}-structs.
 * empty slots are written value-initialized; removed entries (tombstones)
 * are written with their key and a default-constructed value.
 *
 * @param dst a destination-pointer; may be nullptr to only query the required size
 * @return the required size of the output-array in bytes
 */
size_t get_storage(void *dst) const
{
    struct output_item_t
    {
        key_t key = {};
        value_t value = {};
    };

    if(dst)
    {
        auto output_ptr = reinterpret_cast<output_item_t *>(dst);
        storage_item_t *item = m_storage.get(), *end = item + m_capacity;
        for(; item != end; ++item, ++output_ptr)
        {
            if(item->key != key_t())
            {
                output_ptr->key = item->key;
                // disengaged optional (removed entry) -> default-constructed value
                output_ptr->value = item->value ? *item->value : value_t();
            }
            else { *output_ptr = {}; }
        }
    }
    return sizeof(output_item_t) * m_capacity;
}

void reserve(size_t new_capacity)
{
auto new_linear_hashmap = linear_hashmap(new_capacity);
storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity;
for(; ptr != end; ++ptr)
{
if(ptr->key != key_t())
{
if(ptr->value) { new_linear_hashmap.put(ptr->key, *ptr->value); }
}
}
swap(*this, new_linear_hashmap);
}

//! @return the current fill-ratio; 0 for an unallocated map (avoids a 0/0 -> NaN division)
[[nodiscard]] float load_factor() const
{
    return m_capacity ? static_cast<float>(m_num_elements) / static_cast<float>(m_capacity) : 0.f;
}

//! @return the maximum fill-ratio before an insert triggers a rehash
[[nodiscard]] float max_load_factor() const { return m_max_load_factor; }

//! set the maximum fill-ratio (clamped to [0.01, 1]) and rehash immediately, if already exceeded
void max_load_factor(float load_factor)
{
    m_max_load_factor = std::clamp<float>(load_factor, 0.01f, 1.f);
    check_load_factor();
}

//! exchange the complete state of two hashmaps (member-wise swap)
friend void swap(linear_hashmap &lhs, linear_hashmap &rhs)
{
    std::swap(lhs.m_capacity, rhs.m_capacity);
    std::swap(lhs.m_num_elements, rhs.m_num_elements);
    std::swap(lhs.m_storage, rhs.m_storage);
    std::swap(lhs.m_hash_fn, rhs.m_hash_fn);
    std::swap(lhs.m_max_load_factor, rhs.m_max_load_factor);
    std::swap(lhs.m_grow_factor, rhs.m_grow_factor);
}

private:
//! internal slot: a default-constructed key marks an empty slot,
//! a set key with a disengaged optional marks a removed entry (tombstone)
struct storage_item_t
{
    key_t key;
    std::optional<value_t> value;
};

//! grow/rehash if the current element-count reaches the allowed maximum load-factor.
//! grows to at least 32 slots, otherwise by m_grow_factor
//! (presumably rounded up to a power of two by the constructor — probing masks with capacity - 1)
inline void check_load_factor()
{
    if(m_num_elements >= m_capacity * m_max_load_factor)
    {
        reserve(std::max<size_t>(32, static_cast<size_t>(m_grow_factor * m_capacity)));
    }
}

/**
 * @brief insert or overwrite a key/value pair using linear probing.
 *
 * slots occupied by a different live key are skipped; empty slots and
 * tombstones of other keys are claimed. re-inserting a key that was
 * previously removed (slot kept its key with a disengaged value) now
 * increments m_num_elements again, keeping size() consistent after
 * remove()/put() cycles.
 *
 * @param key    a provided key
 * @param value  a provided value to store
 * @return the probe-length that was required to place the pair
 */
inline uint32_t internal_put(const key_t key, const value_t &value)
{
    uint32_t probe_length = 0;

    for(uint64_t idx = m_hash_fn(key);; idx++, probe_length++)
    {
        idx &= m_capacity - 1;
        auto &item = m_storage[idx];

        // load previous key
        key_t probed_key = item.key;

        if(probed_key != key)
        {
            // hit another valid entry, keep probing
            if(probed_key != key_t() && item.value) { continue; }
            item.key = key;
            m_num_elements++;
        }
        else if(!item.value)
        {
            // re-inserting a previously removed key (tombstone): count it again
            m_num_elements++;
        }
        item.value = value;
        return probe_length;
    }
}

// number of slots; expected to be a power of two (probing masks indices with m_capacity - 1)
uint64_t m_capacity = 0;
// number of live key/value pairs (tombstones excluded)
uint64_t m_num_elements = 0;
// flat slot-array, allocated on reserve/rehash
std::unique_ptr<storage_item_t[]> m_storage;
// hash-function, defaults to murmur3_32 with seed 0
hash32_fn m_hash_fn = std::bind(murmur3_32<key_t>, std::placeholders::_1, 0);

// reasonably low load-factor to keep average probe-lengths low
float m_max_load_factor = 0.5f;
float m_grow_factor = 2.f;
};

template<typename K, typename V>
class linear_hashmap_mt
{
public:
using key_t = K;
using value_t = V;
using hash32_fn = std::function<uint32_t(const key_t &)>;
static_assert(std::is_default_constructible_v<key_t>, "key_t not default-constructible");
static_assert(std::equality_comparable<key_t>, "key_t not comparable");

linear_hashmap_mt() = default;
linear_hashmap_mt(const linear_hashmap_mt &) = delete;
linear_hashmap_mt(linear_hashmap_mt &other) : linear_hashmap_mt() { swap(*this, other); };
linear_hashmap_mt &operator=(linear_hashmap_mt other)
{
swap(*this, other);
return *this;
}

explicit linear_hashmap_mt(uint64_t min_capacity)
: m_capacity(crocore::next_pow_2(min_capacity)), m_storage(std::make_unique<storage_item_t[]>(m_capacity))
{
clear();
}

inline size_t size() const { return m_num_elements; }

inline size_t capacity() const { return m_capacity; }
Expand Down Expand Up @@ -133,7 +325,7 @@ class linear_hashmap

void reserve(size_t new_capacity)
{
auto new_linear_hashmap = linear_hashmap(new_capacity);
auto new_linear_hashmap = linear_hashmap_mt(new_capacity);
storage_item_t *ptr = m_storage.get(), *end = ptr + m_capacity;
for(; ptr != end; ++ptr)
{
Expand All @@ -155,7 +347,7 @@ class linear_hashmap
check_load_factor();
}

friend void swap(linear_hashmap &lhs, linear_hashmap &rhs)
friend void swap(linear_hashmap_mt &lhs, linear_hashmap_mt &rhs)
{
std::lock(lhs.m_mutex, rhs.m_mutex);
std::unique_lock lock_lhs(lhs.m_mutex, std::adopt_lock), lock_rhs(rhs.m_mutex, std::adopt_lock);
Expand Down
Loading

0 comments on commit 96fc668

Please sign in to comment.