From ff99e5289ba2c54b9830bf039b5e6aa2cd03b916 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Mon, 8 Dec 2025 16:51:57 +0100
Subject: [PATCH] panvk/csf: Prepare for more complex scoreboard transitions

Right now, all we do is get the next available scoreboard and set it as
the current iterator scoreboard, but the fragment queue is about to
require something more involved to fix a FINISH_FRAGMENT ordering
issue.

Provide a cs_iter_sb_update() block where everything between the
selection of a new scoreboard and the transition to this new scoreboard
is customizable. Implement cs_next_iter_sb() as a dummy wrapper around
this new construct.

Signed-off-by: Boris Brezillon
Reviewed-by: Lars-Ivar Hesselberg Simonsen
Reviewed-by: Christoph Pillmayer
Part-of:
---
 src/panfrost/.clang-format                    |   2 +
 src/panfrost/vulkan/csf/panvk_cmd_buffer.h    | 168 +++++++++++++++++-
 src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c |  59 ------
 .../vulkan/csf/panvk_vX_cmd_dispatch.c        |   5 +-
 src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c   |  10 +-
 .../vulkan/csf/panvk_vX_cmd_precomp.c         |   5 +-
 6 files changed, 175 insertions(+), 74 deletions(-)

diff --git a/src/panfrost/.clang-format b/src/panfrost/.clang-format
index c700275b7d3..c039e56cb62 100644
--- a/src/panfrost/.clang-format
+++ b/src/panfrost/.clang-format
@@ -43,6 +43,8 @@ ForEachMacros: [
   'cs_emit',
   'cs_exception_handler_def',
   'cs_if',
+  'cs_iter_sb_update',
+  'cs_iter_sb_update_case',
   'cs_match',
   'cs_single_link_list_for_each_from',
   'cs_update_compute_ctx',
diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index d5cf41a029d..d7261c3f8dc 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -479,9 +479,171 @@ void panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf);
       cs_case(__b, SB_ITER(__val))
 #endif
 
-void panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
-                                     enum panvk_subqueue_id subqueue,
-                                     struct cs_index scratch_regs);
+#if PAN_ARCH >= 11
+struct cs_iter_sb_update_ctx {
+   struct cs_builder *b;
+   uint16_t all_iters_mask;
+
+   struct {
+      struct cs_index next_sb;
+      struct cs_index sb_mask;
+   } regs;
+};
+
+static inline struct cs_iter_sb_update_ctx
+cs_iter_sb_update_start(struct panvk_cmd_buffer *cmdbuf,
+                        enum panvk_subqueue_id subqueue,
+                        struct cs_index scratch_regs)
+{
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
+   struct cs_index next_sb = cs_extract32(b, scratch_regs, 0);
+   struct cs_iter_sb_update_ctx ctx = {
+      .b = b,
+      .all_iters_mask = dev->csf.sb.all_iters_mask,
+      .regs = {
+         .next_sb = next_sb,
+         .sb_mask = cs_extract32(b, scratch_regs, 1),
+      },
+   };
+
+   cs_next_sb_entry(b, next_sb, MALI_CS_SCOREBOARD_TYPE_ENDPOINT,
+                    MALI_CS_NEXT_SB_ENTRY_FORMAT_INDEX);
+
+   return ctx;
+}
+
+static inline void
+cs_iter_sb_update_end(struct cs_iter_sb_update_ctx *ctx)
+{
+   struct cs_builder *b = ctx->b;
+   struct cs_index next_sb = ctx->regs.next_sb;
+   struct cs_index sb_mask = ctx->regs.sb_mask;
+   uint16_t all_iters_mask = ctx->all_iters_mask;
+
+   /* Setup indirect scoreboard wait mask now for indirect defer */
+   cs_move32_to(b, sb_mask, 0);
+   cs_bit_set32(b, sb_mask, sb_mask, next_sb);
+   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, sb_mask);
+
+   /* Prevent direct re-use of the current SB to avoid conflict between
+    * wait(current),signal(next) (can't wait on an SB we signal).
+    */
+   cs_move32_to(b, sb_mask, all_iters_mask);
+   cs_bit_clear32(b, sb_mask, sb_mask, next_sb);
+   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_STREAM, sb_mask);
+
+   ctx->b = NULL;
+}
+
+#define cs_iter_sb_update(__cmdbuf, __subq, __scratch_regs, __upd_ctx)        \
+   for (struct cs_iter_sb_update_ctx __upd_ctx =                              \
+           cs_iter_sb_update_start(__cmdbuf, __subq, __scratch_regs);         \
+        __upd_ctx.b; cs_iter_sb_update_end(&__upd_ctx))
+
+#else
+struct cs_iter_sb_update_ctx {
+   struct cs_builder *b;
+   uint8_t cur_sb;
+   uint8_t next_sb;
+
+   struct {
+      struct cs_index next_sb;
+      struct cs_index cmp_scratch;
+   } regs;
+};
+
+static inline struct cs_iter_sb_update_ctx
+cs_iter_sb_update_start(struct panvk_cmd_buffer *cmdbuf,
+                        enum panvk_subqueue_id subqueue,
+                        struct cs_index scratch_regs)
+{
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
+   struct cs_index next_sb = cs_extract32(b, scratch_regs, 0);
+   struct cs_index cmp_scratch = cs_extract32(b, scratch_regs, 1);
+   struct cs_iter_sb_update_ctx ctx = {
+      .b = b,
+      .regs = {
+         .next_sb = next_sb,
+         .cmp_scratch = cmp_scratch,
+      },
+   };
+
+   cs_load32_to(b, next_sb, cs_subqueue_ctx_reg(b),
+                offsetof(struct panvk_cs_subqueue_context, iter_sb));
+
+   /* Select next scoreboard entry and wrap around if we get past the limit */
+   cs_add32(b, next_sb, next_sb, 1);
+   cs_add32(b, cmp_scratch, next_sb, -SB_ITER(dev->csf.sb.iter_count));
+
+   cs_if(b, MALI_CS_CONDITION_GEQUAL, cmp_scratch) {
+      cs_move32_to(b, next_sb, SB_ITER(0));
+   }
+
+   cs_store32(b, next_sb, cs_subqueue_ctx_reg(b),
+              offsetof(struct panvk_cs_subqueue_context, iter_sb));
+   cs_flush_stores(b);
+
+   return ctx;
+}
+
+static inline void
+cs_iter_sb_update_end(struct cs_iter_sb_update_ctx *ctx)
+{
+   ctx->b = NULL;
+}
+
+static void
+cs_iter_sb_update_first_case(struct cs_iter_sb_update_ctx *ctx)
+{
+   ctx->cur_sb = PANVK_SB_ITER_COUNT - 1;
+   ctx->next_sb = 0;
+}
+
+static void
+cs_iter_sb_update_next_case(struct cs_iter_sb_update_ctx *ctx)
+{
+   ctx->cur_sb = (ctx->cur_sb + 1) % PANVK_SB_ITER_COUNT;
+   ctx->next_sb++;
+}
+
+static inline bool
+cs_iter_sb_update_case_preamble(struct cs_iter_sb_update_ctx *ctx)
+{
+   struct cs_builder *b = ctx->b;
+
+   cs_wait_slot(b, SB_ITER(ctx->next_sb));
+   cs_select_endpoint_sb(b, SB_ITER(ctx->next_sb));
+   return false;
+}
+
+#define cs_iter_sb_update_case(__upd_ctx)                                     \
+   cs_case(__upd_ctx.b, SB_ITER(__upd_ctx.next_sb))                           \
+      for (bool __done = cs_iter_sb_update_case_preamble(&__upd_ctx);         \
+           !__done; __done = true)
+
+#define cs_iter_sb_update(__cmdbuf, __subq, __scratch_regs, __upd_ctx)        \
+   for (struct cs_iter_sb_update_ctx __upd_ctx =                              \
+           cs_iter_sb_update_start(__cmdbuf, __subq, __scratch_regs);         \
+        __upd_ctx.b; cs_iter_sb_update_end(&__upd_ctx))                       \
+      cs_match((__upd_ctx).b, __upd_ctx.regs.next_sb,                         \
+               __upd_ctx.regs.cmp_scratch)                                    \
+         for (cs_iter_sb_update_first_case(&__upd_ctx);                       \
+              __upd_ctx.next_sb < PANVK_SB_ITER_COUNT;                        \
+              cs_iter_sb_update_next_case(&__upd_ctx))                        \
+            cs_iter_sb_update_case(__upd_ctx)
+
+#endif
+
+static inline void
+cs_next_iter_sb(struct panvk_cmd_buffer *cmdbuf,
+                enum panvk_subqueue_id subqueue, struct cs_index scratch_regs)
+{
+   cs_iter_sb_update(cmdbuf, subqueue, scratch_regs, _) {
+      /* We only want to move to the new scoreboard, so nothing to do here.
+       */
+   }
+}
 
 enum panvk_barrier_stage {
    PANVK_BARRIER_STAGE_FIRST,
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
index 6fee1ddbf9b..4047c9fa312 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -768,65 +768,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
    }
 }
 
-#if PAN_ARCH >= 11
-void
-panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
-                                enum panvk_subqueue_id subqueue,
-                                struct cs_index scratch_regs)
-{
-   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
-   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
-   struct cs_index iter_sb = cs_extract32(b, scratch_regs, 0);
-   struct cs_index sb_mask = cs_extract32(b, scratch_regs, 1);
-
-   /* Wait for scoreboard to be available and select the next scoreboard entry */
-   cs_next_sb_entry(b, iter_sb, MALI_CS_SCOREBOARD_TYPE_ENDPOINT,
-                    MALI_CS_NEXT_SB_ENTRY_FORMAT_INDEX);
-
-   /* Setup indirect scoreboard wait mask now for indirect defer */
-   cs_move32_to(b, sb_mask, 0);
-   cs_bit_set32(b, sb_mask, sb_mask, iter_sb);
-   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, sb_mask);
-
-   /* Prevent direct re-use of the current SB to avoid conflict between
-    * wait(current),signal(next) (can't wait on an SB we signal).
-    */
-   cs_move32_to(b, sb_mask, dev->csf.sb.all_iters_mask);
-   cs_bit_clear32(b, sb_mask, sb_mask, iter_sb);
-   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_STREAM, sb_mask);
-}
-#else
-void
-panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
-                                enum panvk_subqueue_id subqueue,
-                                struct cs_index scratch_regs)
-{
-   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
-   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
-   struct cs_index iter_sb = cs_extract32(b, scratch_regs, 0);
-   struct cs_index cmp_scratch = cs_extract32(b, scratch_regs, 1);
-
-   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
-                offsetof(struct panvk_cs_subqueue_context, iter_sb));
-
-   /* Select next scoreboard entry and wrap around if we get past the limit */
-   cs_add32(b, iter_sb, iter_sb, 1);
-   cs_add32(b, cmp_scratch, iter_sb, -SB_ITER(dev->csf.sb.iter_count));
-   cs_if(b, MALI_CS_CONDITION_GEQUAL, cmp_scratch) {
-      cs_move32_to(b, iter_sb, SB_ITER(0));
-   }
-
-   cs_match_iter_sb(b, x, iter_sb, cmp_scratch) {
-      cs_wait_slot(b, SB_ITER(x));
-      cs_select_endpoint_sb(b, SB_ITER(x));
-   }
-
-   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
-              offsetof(struct panvk_cs_subqueue_context, iter_sb));
-   cs_flush_stores(b);
-}
-#endif
-
 static struct cs_buffer
 alloc_cs_buffer(void *cookie)
 {
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index effe716d46e..c94c3674303 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -277,9 +277,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
       }
    }
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    if (indirect) {
       /* Use run_compute with a set task axis instead of run_compute_indirect as
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 71f97403343..82b8dee9cd0 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -1188,9 +1188,8 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
    /* Flush all stores to tiler_ctx_addr. */
    cs_flush_stores(b);
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
    return VK_SUCCESS;
@@ -3067,9 +3066,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
    bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    /* Now initialize the fragment bits. */
    cs_update_frag_ctx(b) {
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
index 7914407dd6a..3a5864ec203 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
@@ -111,9 +111,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
       cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), grid.count[2]);
    }
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    unsigned task_axis = MALI_TASK_AXIS_X;
    unsigned task_increment = 0;
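-- 
A minimal usage sketch of the new construct (illustration only, kept below
the scissors line so it stays out of git-am's way): commands emitted inside
a cs_iter_sb_update() block land between the selection of the next iterator
scoreboard and the transition to it, which is the hook the fragment queue
will need. emit_finish_fragment_fix() is a hypothetical placeholder;
cs_iter_sb_update(), panvk_get_cs_builder() and cs_scratch_reg_tuple() come
from the patch:

   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);

   cs_iter_sb_update(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
                     cs_scratch_reg_tuple(b, 0, 2), upd) {
      /* Hypothetical helper: whatever extra work has to happen after the
       * next scoreboard has been selected, but before the queue transitions
       * to it. upd.regs.next_sb is the register holding the new scoreboard.
       */
      emit_finish_fragment_fix(b, upd.regs.next_sb);
   }

cs_next_iter_sb() above is the trivial instance of the same construct, with
an empty block body.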