From 1bdd640d83ed7a718eb737d7fa829f2067b60880 Mon Sep 17 00:00:00 2001 From: Olivia Lee Date: Wed, 12 Nov 2025 20:34:16 -0800 Subject: [PATCH] panvk/csf: implement VK_EXT_primitives_generated_query except primitive restart Primitive restart requires scanning the index buffer to determine how many primitives are present, and will be handled in a later commit. Signed-off-by: Olivia Lee Reviewed-by: Lars-Ivar Hesselberg Simonsen Reviewed-by: Christoph Pillmayer Part-of: --- src/panfrost/libpan/draw_helper.cl | 29 ++ src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 60 ++++ src/panfrost/vulkan/csf/panvk_vX_cmd_query.c | 326 +++++++++++------- src/panfrost/vulkan/panvk_cmd_draw.h | 5 +- .../{panvk_cmd_oq.h => panvk_cmd_query.h} | 13 +- src/panfrost/vulkan/panvk_vX_query_pool.c | 9 + 6 files changed, 319 insertions(+), 123 deletions(-) rename src/panfrost/vulkan/{panvk_cmd_oq.h => panvk_cmd_query.h} (64%) diff --git a/src/panfrost/libpan/draw_helper.cl b/src/panfrost/libpan/draw_helper.cl index 89ac55a48b4..b6e01628e0c 100644 --- a/src/panfrost/libpan/draw_helper.cl +++ b/src/panfrost/libpan/draw_helper.cl @@ -5,10 +5,39 @@ #include "compiler/libcl/libcl.h" #include "compiler/libcl/libcl_vk.h" +#include "compiler/shader_enums.h" #include "genxml/gen_macros.h" #include "lib/pan_encoder.h" +#include "poly/cl/restart.h" #include "draw_helper.h" +#if PAN_ARCH >= 10 +KERNEL(1) +panlib_update_prims_generated_query_indirect( + global uint32_t *prims_generated, global uint32_t *draw_count_buffer, + uint32_t max_draw_count, uint32_t cmd_stride, constant uint32_t *cmd, + uint32_t view_count, uint32_t compact_prim__11) +{ + enum mesa_prim prim = poly_uncompact_prim(compact_prim__11); + uint32_t draw_count = draw_count_buffer ? + min(*draw_count_buffer, max_draw_count) : max_draw_count; + + for (uint32_t draw_id = 0; draw_id < draw_count; draw_id++) { + /* cmd may be either VkDrawIndirectCommand or + * VkDrawIndexedIndirectCommand. 
In both cases the vertex/index count is + * the first field, and the instance count is the second */ + uint32_t vertex_count = cmd[0]; + uint32_t instance_count = cmd[1]; + + uint32_t prims_per_instance = + u_decomposed_prims_for_vertices(prim, vertex_count); + *prims_generated += prims_per_instance * instance_count * view_count; + + cmd = (constant uint32_t *) ((uintptr_t) cmd + cmd_stride); + } +} +#endif + #if (PAN_ARCH == 6 || PAN_ARCH == 7) struct panlib_draw_info { struct { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index c3227ee47d4..0b938af6b0f 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -22,6 +22,7 @@ #include "panvk_cmd_draw.h" #include "panvk_cmd_fb_preload.h" #include "panvk_cmd_meta.h" +#include "panvk_cmd_precomp.h" #include "panvk_cmd_ts.h" #include "panvk_device.h" #include "panvk_entrypoints.h" @@ -48,6 +49,7 @@ #include "vk_meta.h" #include "vk_pipeline_layout.h" #include "vk_render_pass.h" +#include "poly/geometry.h" static enum cs_reg_perm provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg) @@ -2358,6 +2360,60 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) return VK_SUCCESS; } +static void +update_prims_generated_query(struct panvk_cmd_buffer *cmdbuf, + struct panvk_draw_info *draw) +{ + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + struct vk_input_assembly_state *ia = &cmdbuf->vk.dynamic_graphics_state.ia; + struct panvk_prims_generated_query_state *state = + &cmdbuf->state.gfx.prims_generated_query; + + if (!state->ptr) + return; + + /* TODO: primitive restart */ + assert(!draw->index.size || !ia->primitive_restart_enable); + + enum mesa_prim prim = vk_topology_to_mesa(ia->primitive_topology); + uint32_t view_count = cmdbuf->state.gfx.render.view_mask ? 
+ util_bitcount(cmdbuf->state.gfx.render.view_mask) : 1; + + if (draw->indirect.buffer_dev_addr) { + struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf); + + struct panlib_update_prims_generated_query_indirect_args args = { + .prims_generated = state->ptr, + .draw_count_buffer = draw->indirect.count_buffer_dev_addr, + .max_draw_count = draw->indirect.draw_count, + .cmd_stride = draw->indirect.stride, + .cmd = draw->indirect.buffer_dev_addr, + .view_count = view_count, + }; + + /* We need to WAIT in order to avoid overlapping the (non-atomic) direct + * draw counter updates with indirect draws. TODO: we could avoid that + * by having separate direct/indirect counters and adding them on read */ + panlib_update_prims_generated_query_indirect_struct( + &precomp_ctx, panlib_1d(1), PANLIB_BARRIER_CSF_WAIT, args, + poly_compact_prim(prim)); + } else { + uint32_t prims_per_instance = + u_decomposed_prims_for_vertices(prim, draw->vertex.count); + uint32_t prims_generated = + prims_per_instance * draw->instance.count * view_count; + + struct cs_index addr = cs_scratch_reg64(b, 0); + struct cs_index value = cs_scratch_reg32(b, 2); + + cs_move64_to(b, addr, state->ptr); + cs_load32_to(b, value, addr, 0); + cs_add32(b, value, value, prims_generated); + cs_store32(b, value, addr, 0); + } +} + static void panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) { @@ -2382,6 +2438,8 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) if (result != VK_SUCCESS) return; + update_prims_generated_query(cmdbuf, draw); + cs_update_vt_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, IDVS, GLOBAL_ATTRIBUTE_OFFSET), 0); cs_move32_to(b, cs_sr_reg32(b, IDVS, INDEX_COUNT), draw->vertex.count); @@ -2562,6 +2620,8 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, if (result != VK_SUCCESS) return; + update_prims_generated_query(cmdbuf, draw); + struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; const 
struct vk_dynamic_graphics_state *dyns = diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c index 592aa3f768f..1f144fb42fe 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c @@ -26,6 +26,21 @@ #include "panvk_query_pool.h" #include "panvk_queue.h" +static enum panvk_subqueue_id +panvk_subqueue_for_query_type(VkQueryType type) +{ + /* Subqueue used to emit reset/copy commands for a query type. Timestamp + * queries are not handled here, because any subqueue may write them */ + switch (type) { + case VK_QUERY_TYPE_OCCLUSION: + return PANVK_SUBQUEUE_FRAGMENT; + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + return PANVK_SUBQUEUE_COMPUTE; + default: + UNREACHABLE("Unsupported query type"); + } +} + /* At the API level, a query consists of a status and a result. Both are * uninitialized initially. There are these query operations: * @@ -52,6 +67,7 @@ * 0 and does not need to wait. */ +/* Default path for query types that don't need special logic */ static void reset_queries_batch(struct cs_builder *b, struct cs_index addr, struct cs_index zero_regs, uint32_t query_count) @@ -92,11 +108,13 @@ reset_queries_batch(struct cs_builder *b, struct cs_index addr, } static void -panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, - struct panvk_query_pool *pool, - uint32_t first_query, uint32_t query_count) +panvk_cmd_reset_queries(struct panvk_cmd_buffer *cmd, + struct panvk_query_pool *pool, uint32_t first_query, + uint32_t query_count) { - struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT); + enum panvk_subqueue_id subqueue = + panvk_subqueue_for_query_type(pool->vk.query_type); + struct cs_builder *b = panvk_get_cs_builder(cmd, subqueue); /* Wait on deferred sync to ensure all prior query operations have * completed @@ -132,6 +150,118 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, cs_wait_slot(b, SB_ID(IMM_FLUSH)); } +static void +copy_result_batch(struct 
cs_builder *b, + VkQueryResultFlags flags, + struct cs_index dst_addr, + VkDeviceSize dst_stride, + struct cs_index res_addr, + struct cs_index avail_addr, + struct cs_index scratch_regs, + uint32_t query_count) +{ + uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1; + uint32_t regs_per_copy = + res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0); + + assert(query_count <= scratch_regs.size / regs_per_copy); + + for (uint32_t i = 0; i < query_count; i++) { + struct cs_index res = + cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), res_size); + struct cs_index avail = cs_reg32(b, res.reg + res_size); + + cs_load_to(b, res, res_addr, BITFIELD_MASK(res.size), + i * sizeof(uint64_t)); + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32)); + } + + for (uint32_t i = 0; i < query_count; i++) { + struct cs_index store_src = + cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy); + + cs_store(b, store_src, dst_addr, BITFIELD_MASK(regs_per_copy), + i * dst_stride); + } + + /* Flush the stores. */ + cs_flush_stores(b); +} + +static void +panvk_copy_query_results(struct panvk_cmd_buffer *cmd, + struct panvk_query_pool *pool, + uint32_t first_query, uint32_t query_count, + uint64_t dst_buffer_addr, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + enum panvk_subqueue_id subqueue = + panvk_subqueue_for_query_type(pool->vk.query_type); + struct cs_builder *b = panvk_get_cs_builder(cmd, subqueue); + + /* Wait for query syncobjs to be signalled. */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) + cs_wait_slot(b, SB_ID(DEFERRED_SYNC)); + + uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1; + uint32_t regs_per_copy = + res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 
1 : 0); + + struct cs_index dst_addr = cs_scratch_reg64(b, 16); + struct cs_index res_addr = cs_scratch_reg64(b, 14); + struct cs_index avail_addr = cs_scratch_reg64(b, 12); + struct cs_index counter = cs_scratch_reg32(b, 11); + struct cs_index scratch_regs = cs_scratch_reg_tuple(b, 0, 11); + uint32_t queries_per_batch = scratch_regs.size / regs_per_copy; + + if (stride > 0) { + /* Store offset is a 16-bit signed integer, so we might be limited by the + * stride: the last offset, (n - 1) * stride, must stay <= INT16_MAX. */ + queries_per_batch = MIN2((INT16_MAX / stride) + 1, queries_per_batch); + } + + /* Stop unrolling the loop when it takes more than 2 steps to copy the + * queries. */ + if (query_count > 2 * queries_per_batch) { + uint32_t copied_query_count = + query_count - (query_count % queries_per_batch); + + cs_move32_to(b, counter, copied_query_count); + cs_move64_to(b, dst_addr, dst_buffer_addr); + cs_move64_to(b, res_addr, panvk_query_report_dev_addr(pool, first_query)); + cs_move64_to(b, avail_addr, + panvk_query_available_dev_addr(pool, first_query)); + cs_while(b, MALI_CS_CONDITION_GREATER, counter) { + copy_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr, + scratch_regs, queries_per_batch); + + cs_add32(b, counter, counter, -queries_per_batch); + cs_add64(b, dst_addr, dst_addr, queries_per_batch * stride); + cs_add64(b, res_addr, res_addr, queries_per_batch * sizeof(uint64_t)); + cs_add64(b, avail_addr, avail_addr, + queries_per_batch * sizeof(struct panvk_cs_sync32)); + } + + dst_buffer_addr += stride * copied_query_count; + first_query += copied_query_count; + query_count -= copied_query_count; + } + + for (uint32_t i = 0; i < query_count; i += queries_per_batch) { + cs_move64_to(b, dst_addr, dst_buffer_addr + (i * stride)); + cs_move64_to(b, res_addr, + panvk_query_report_dev_addr(pool, i + first_query)); + cs_move64_to(b, avail_addr, + panvk_query_available_dev_addr(pool, i + first_query)); + copy_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr, + scratch_regs, 
MIN2(queries_per_batch, query_count - i)); + } +} + static void panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, uint32_t query, @@ -206,116 +336,6 @@ panvk_cmd_end_occlusion_query(struct panvk_cmd_buffer *cmd, cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); } -static void -copy_oq_result_batch(struct cs_builder *b, - VkQueryResultFlags flags, - struct cs_index dst_addr, - VkDeviceSize dst_stride, - struct cs_index res_addr, - struct cs_index avail_addr, - struct cs_index scratch_regs, - uint32_t query_count) -{ - uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1; - uint32_t regs_per_copy = - res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0); - - assert(query_count <= scratch_regs.size / regs_per_copy); - - for (uint32_t i = 0; i < query_count; i++) { - struct cs_index res = - cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), res_size); - struct cs_index avail = cs_reg32(b, res.reg + res_size); - - cs_load_to(b, res, res_addr, BITFIELD_MASK(res.size), - i * sizeof(uint64_t)); - - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32)); - } - - for (uint32_t i = 0; i < query_count; i++) { - struct cs_index store_src = - cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy); - - cs_store(b, store_src, dst_addr, BITFIELD_MASK(regs_per_copy), - i * dst_stride); - } - - /* Flush the stores. */ - cs_flush_stores(b); -} - -static void -panvk_copy_occlusion_query_results(struct panvk_cmd_buffer *cmd, - struct panvk_query_pool *pool, - uint32_t first_query, uint32_t query_count, - uint64_t dst_buffer_addr, - VkDeviceSize stride, - VkQueryResultFlags flags) -{ - struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT); - - /* Wait for occlusion query syncobjs to be signalled. 
*/ - if (flags & VK_QUERY_RESULT_WAIT_BIT) - cs_wait_slot(b, SB_ID(DEFERRED_SYNC)); - - uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1; - uint32_t regs_per_copy = - res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0); - - struct cs_index dst_addr = cs_scratch_reg64(b, 16); - struct cs_index res_addr = cs_scratch_reg64(b, 14); - struct cs_index avail_addr = cs_scratch_reg64(b, 12); - struct cs_index counter = cs_scratch_reg32(b, 11); - struct cs_index scratch_regs = cs_scratch_reg_tuple(b, 0, 11); - uint32_t queries_per_batch = scratch_regs.size / regs_per_copy; - - if (stride > 0) { - /* Store offset is a 16-bit signed integer, so we might be limited by the - * stride here. */ - queries_per_batch = MIN2(((1u << 15) / stride) + 1, queries_per_batch); - } - - /* Stop unrolling the loop when it takes more than 2 steps to copy the - * queries. */ - if (query_count > 2 * queries_per_batch) { - uint32_t copied_query_count = - query_count - (query_count % queries_per_batch); - - cs_move32_to(b, counter, copied_query_count); - cs_move64_to(b, dst_addr, dst_buffer_addr); - cs_move64_to(b, res_addr, panvk_query_report_dev_addr(pool, first_query)); - cs_move64_to(b, avail_addr, - panvk_query_available_dev_addr(pool, first_query)); - cs_while(b, MALI_CS_CONDITION_GREATER, counter) { - copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr, - scratch_regs, queries_per_batch); - - cs_add32(b, counter, counter, -queries_per_batch); - cs_add64(b, dst_addr, dst_addr, queries_per_batch * stride); - cs_add64(b, res_addr, res_addr, queries_per_batch * sizeof(uint64_t)); - cs_add64(b, avail_addr, avail_addr, - queries_per_batch * sizeof(uint64_t)); - } - - dst_buffer_addr += stride * copied_query_count; - first_query += copied_query_count; - query_count -= copied_query_count; - } - - for (uint32_t i = 0; i < query_count; i += queries_per_batch) { - cs_move64_to(b, dst_addr, dst_buffer_addr + (i * stride)); - cs_move64_to(b, res_addr, - 
panvk_query_report_dev_addr(pool, i + first_query)); - cs_move64_to(b, avail_addr, - panvk_query_available_dev_addr(pool, i + first_query)); - copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr, - scratch_regs, - MIN2(queries_per_batch, query_count - i)); - } -} - static void panvk_cmd_reset_timestamp_queries(struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, @@ -638,6 +658,64 @@ panvk_copy_timestamp_query_results(struct panvk_cmd_buffer *cmd, PANLIB_BARRIER_CSF_SYNC, push); } +static void +panvk_cmd_begin_prims_generated_query( + struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, uint32_t query, + VkQueryControlFlags flags) +{ + uint64_t report_addr = panvk_query_report_dev_addr(pool, query); + + cmd->state.gfx.prims_generated_query.ptr = report_addr; + cmd->state.gfx.prims_generated_query.syncobj = + panvk_query_available_dev_addr(pool, query); + + /* From the Vulkan spec: + * + * "When a primitives generated query begins, the count of primitives + * generated starts from zero." + * + */ + struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_COMPUTE); + + struct cs_index report_addr_gpu = cs_scratch_reg64(b, 0); + struct cs_index clear_value = cs_scratch_reg64(b, 2); + cs_move64_to(b, report_addr_gpu, report_addr); + cs_move64_to(b, clear_value, 0); + cs_store64(b, clear_value, report_addr_gpu, 0); + cs_flush_stores(b); +} + +static void +panvk_cmd_end_prims_generated_query( + struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, uint32_t query) +{ + cmd->state.gfx.prims_generated_query.ptr = 0; + cmd->state.gfx.prims_generated_query.syncobj = 0; + + struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_COMPUTE); + struct cs_index query_syncobj = cs_scratch_reg64(b, 0); + struct cs_index val = cs_scratch_reg32(b, 2); + + /* The query accumulates primitive counts in a report in cached memory. + * Wait for the accumulation and flush the caches. 
+ * + * No need to wait on iter_sb because all shader invocations to update the + * counter use PANLIB_BARRIER_CSF_WAIT already, and all direct draws update + * the counter synchronously. + */ + cs_move32_to(b, val, 0); + cs_flush_caches( + b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, + MALI_CS_OTHER_FLUSH_MODE_NONE, val, + cs_defer(SB_IMM_MASK, SB_ID(DEFERRED_FLUSH))); + + /* Signal the query syncobj after the flush is effective. */ + cs_move32_to(b, val, 1); + cs_move64_to(b, query_syncobj, panvk_query_available_dev_addr(pool, query)); + cs_sync32_set(b, true, MALI_CS_SYNC_SCOPE_CSG, val, query_syncobj, + cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); +} + VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdResetQueryPool)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, @@ -650,8 +728,9 @@ panvk_per_arch(CmdResetQueryPool)(VkCommandBuffer commandBuffer, return; switch (pool->vk.query_type) { - case VK_QUERY_TYPE_OCCLUSION: { - panvk_cmd_reset_occlusion_queries(cmd, pool, firstQuery, queryCount); + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + panvk_cmd_reset_queries(cmd, pool, firstQuery, queryCount); break; } case VK_QUERY_TYPE_TIMESTAMP: { @@ -680,6 +759,10 @@ panvk_per_arch(CmdBeginQueryIndexedEXT)(VkCommandBuffer commandBuffer, panvk_cmd_begin_occlusion_query(cmd, pool, query, flags); break; } + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + panvk_cmd_begin_prims_generated_query(cmd, pool, query, flags); + break; + } default: UNREACHABLE("Unsupported query type"); } @@ -701,6 +784,10 @@ panvk_per_arch(CmdEndQueryIndexedEXT)(VkCommandBuffer commandBuffer, panvk_cmd_end_occlusion_query(cmd, pool, query); break; } + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + panvk_cmd_end_prims_generated_query(cmd, pool, query); + break; + } default: UNREACHABLE("Unsupported query type"); } @@ -730,9 +817,10 @@ panvk_per_arch(CmdCopyQueryPoolResults)( uint64_t dst_buffer_addr = 
panvk_buffer_gpu_ptr(dst_buffer, dstOffset); switch (pool->vk.query_type) { - case VK_QUERY_TYPE_OCCLUSION: { - panvk_copy_occlusion_query_results(cmd, pool, firstQuery, queryCount, - dst_buffer_addr, stride, flags); + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + panvk_copy_query_results(cmd, pool, firstQuery, queryCount, + dst_buffer_addr, stride, flags); break; } #if PAN_ARCH >= 10 diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 33c299335f5..1d498420a4c 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -12,7 +12,7 @@ #include "panvk_blend.h" #include "panvk_cmd_desc_state.h" -#include "panvk_cmd_oq.h" +#include "panvk_cmd_query.h" #include "panvk_entrypoints.h" #include "panvk_image.h" #include "panvk_image_view.h" @@ -129,6 +129,9 @@ struct panvk_cmd_graphics_state { } dynamic; struct panvk_occlusion_query_state occlusion_query; +#if PAN_ARCH >= 10 + struct panvk_prims_generated_query_state prims_generated_query; +#endif struct panvk_graphics_sysvals sysvals; #if PAN_ARCH < 9 diff --git a/src/panfrost/vulkan/panvk_cmd_oq.h b/src/panfrost/vulkan/panvk_cmd_query.h similarity index 64% rename from src/panfrost/vulkan/panvk_cmd_oq.h rename to src/panfrost/vulkan/panvk_cmd_query.h index efe2eeddbcd..e3f190b4685 100644 --- a/src/panfrost/vulkan/panvk_cmd_oq.h +++ b/src/panfrost/vulkan/panvk_cmd_query.h @@ -3,8 +3,8 @@ * SPDX-License-Identifier: MIT */ -#ifndef PANVK_CMD_OQ_H -#define PANVK_CMD_OQ_H +#ifndef PANVK_CMD_QUERY_H +#define PANVK_CMD_QUERY_H #ifndef PAN_ARCH #error "PAN_ARCH must be defined" @@ -20,4 +20,11 @@ struct panvk_occlusion_query_state { enum mali_occlusion_mode mode; }; -#endif \ No newline at end of file +#if PAN_ARCH >= 10 +struct panvk_prims_generated_query_state { + uint64_t syncobj; + uint64_t ptr; +}; +#endif + +#endif diff --git a/src/panfrost/vulkan/panvk_vX_query_pool.c b/src/panfrost/vulkan/panvk_vX_query_pool.c 
index 21afc7383d0..52d5c1efa49 100644 --- a/src/panfrost/vulkan/panvk_vX_query_pool.c +++ b/src/panfrost/vulkan/panvk_vX_query_pool.c @@ -66,6 +66,10 @@ panvk_per_arch(CreateQueryPool)(VkDevice _device, reports_per_query = PANVK_SUBQUEUE_COUNT + 1; break; } + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + reports_per_query = 1; + break; + } #endif default: UNREACHABLE("Unsupported query type"); @@ -274,6 +278,11 @@ panvk_per_arch(GetQueryPoolResults)(VkDevice _device, VkQueryPool queryPool, pool->reports_per_query); break; } + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + if (write_results) + cpu_write_query_result(dst, 0, flags, src[0].value); + break; + } #endif default: UNREACHABLE("Unsupported query type");