anv/rt: Drop header update using blorp code path

Updating the header using the blorp code path involves setting up the
render surface state. The header (CPU) update code path involves a
compute_w_to_host_r barrier, which requires heavy flushing. Switching to
a completely shader-based header update avoids all that overhead.

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39971>
This commit is contained in:
Sagar Ghuge 2026-02-18 12:58:08 -08:00 committed by Marge Bot
parent 37f26e346a
commit 87f7f0f039
3 changed files with 31 additions and 81 deletions

View file

@ -44,6 +44,9 @@ struct header_args {
uint32_t instance_count;
uint32_t instance_leaves_offset;
uint64_t bvh_size;
uint8_t is_compacted;
};
#define ANV_COPY_MODE_COPY 0

View file

@ -17,10 +17,13 @@ layout(push_constant) uniform CONSTS
void
main(void)
{
uint32_t compacted_size =
args.bvh_offset + DEREF(args.src).dst_node_offset * ANV_RT_BLOCK_SIZE;
uint64_t compacted_size = args.bvh_size;
if (args.is_compacted == uint8_t(1)) {
compacted_size =
args.bvh_offset + DEREF(args.src).dst_node_offset * ANV_RT_BLOCK_SIZE;
}
uint32_t serialization_size = compacted_size +
uint64_t serialization_size = compacted_size +
SIZEOF(vk_accel_struct_serialization_header) + SIZEOF(uint64_t) *
args.instance_count;
@ -34,7 +37,8 @@ main(void)
/* 128 is local_size_x in copy.comp shader, 8 is the amount of data
* copied by each iteration of that shader's loop
*/
DEREF(args.dst).copy_dispatch_size[0] = DIV_ROUND_UP(compacted_size, 8 * 128);
DEREF(args.dst).copy_dispatch_size[0] =
uint32_t(DIV_ROUND_UP(compacted_size, 8 * 128));
DEREF(args.dst).copy_dispatch_size[1] = 1;
DEREF(args.dst).copy_dispatch_size[2] = 1;
#if GFX_VERx10 >= 300

View file

@ -429,12 +429,15 @@ anv_encode_as(VkCommandBuffer commandBuffer, const struct vk_acceleration_struct
static VkResult
anv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, const struct vk_acceleration_structure_build_state *state)
{
if (state->config.encode_key[1] == 1) {
anv_bvh_build_bind_pipeline(commandBuffer,
ANV_OBJECT_KEY_BVH_HEADER,
header_spv, sizeof(header_spv),
sizeof(struct header_args), 0);
}
/* Add a barrier to ensure the writes from encode.comp is ready to be
* read by header.comp
*/
vk_barrier_compute_w_to_compute_r(commandBuffer);
anv_bvh_build_bind_pipeline(commandBuffer,
ANV_OBJECT_KEY_BVH_HEADER,
header_spv, sizeof(header_spv),
sizeof(struct header_args), 0);
return VK_SUCCESS;
}
@ -458,78 +461,18 @@ anv_init_header(VkCommandBuffer commandBuffer, const struct vk_acceleration_stru
uint32_t instance_count = geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ?
state->leaf_node_count : 0;
if (state->config.encode_key[1] == 1) {
/* Add a barrier to ensure the writes from encode.comp is ready to be
* read by header.comp
*/
vk_barrier_compute_w_to_compute_r(commandBuffer);
struct header_args args = {
.src = intermediate_header_addr,
.dst = vk_acceleration_structure_get_va(dst),
.bvh_offset = bvh_layout.bvh_offset,
.instance_count = instance_count,
.instance_leaves_offset = bvh_layout.instance_leaves_offset,
.is_compacted = (state->config.encode_key[1] == 1),
.bvh_size = bvh_layout.size,
};
struct header_args args = {
.src = intermediate_header_addr,
.dst = vk_acceleration_structure_get_va(dst),
.bvh_offset = bvh_layout.bvh_offset,
.instance_count = instance_count,
.instance_leaves_offset = bvh_layout.instance_leaves_offset,
};
anv_bvh_build_set_args(commandBuffer, &args, sizeof(args));
vk_common_CmdDispatch(commandBuffer, 1, 1, 1);
} else {
vk_barrier_compute_w_to_host_r(commandBuffer);
/* L1/L2 caches flushes should have been dealt with by pipeline barriers.
* Unfortunately some platforms require L3 flush because CS (reading the
* dispatch size paramters) is not L3 coherent.
*/
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR,
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"copy dispatch size for dispatch");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
size_t base = offsetof(struct anv_accel_struct_header,
copy_dispatch_size);
struct anv_accel_struct_header header = {};
header.instance_count = instance_count;
header.self_ptr = header_addr;
header.compacted_size = bvh_layout.size;
/* 128 is local_size_x in copy.comp shader, 8 is the amount of data
* copied by each iteration of that shader's loop
*/
header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size,
8 * 128);
header.copy_dispatch_size[1] = 1;
header.copy_dispatch_size[2] = 1;
header.serialization_size =
header.compacted_size +
sizeof(struct vk_accel_struct_serialization_header) +
sizeof(uint64_t) * header.instance_count;
header.size = header.compacted_size;
header.instance_leaves_offset = bvh_layout.instance_leaves_offset;
#if GFX_VERx10 >= 300
header.enable_64b_rt = 1;
#else
header.enable_64b_rt = 0;
#endif
size_t header_size = sizeof(struct anv_accel_struct_header) - base;
assert(base % sizeof(uint32_t) == 0);
assert(header_size % sizeof(uint32_t) == 0);
uint32_t *header_ptr = (uint32_t *)((char *)&header + base);
struct anv_address addr = anv_address_from_u64(header_addr + base);
anv_cmd_buffer_update_addr(cmd_buffer, addr, header_size, header_ptr);
}
anv_bvh_build_set_args(commandBuffer, &args, sizeof(args));
vk_common_CmdDispatch(commandBuffer, 1, 1, 1);
if (INTEL_DEBUG_BVH_ANY) {
debug_record_as_to_bvh_dump(cmd_buffer, header_addr, bvh_layout.size,