panvk/csf: Prepare for more complex scoreboard transitions

Right now all we do is get the next available scoreboard, and set
it as the current iterator scoreboard, but the fragment queue is
about to require something more involved to fix a FINISH_FRAGMENT
ordering issue.

Provide a cs_iter_sb_update() block where everything between the
selection of a new scoreboard and the transition to this new
scoreboard is customizable. Implement cs_next_iter_sb() as a dummy
wrapper around this new construct.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38826>
This commit is contained in:
Boris Brezillon 2025-12-08 16:51:57 +01:00
parent 14391cc5c5
commit ff99e5289b
6 changed files with 175 additions and 74 deletions

View file

@ -43,6 +43,8 @@ ForEachMacros: [
'cs_emit',
'cs_exception_handler_def',
'cs_if',
'cs_iter_sb_update',
'cs_iter_sb_update_case',
'cs_match',
'cs_single_link_list_for_each_from',
'cs_update_compute_ctx',

View file

@ -479,9 +479,171 @@ void panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf);
cs_case(__b, SB_ITER(__val))
#endif
void panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
enum panvk_subqueue_id subqueue,
struct cs_index scratch_regs);
#if PAN_ARCH >= 11
/* State carried across a cs_iter_sb_update() block (PAN_ARCH >= 11 path):
 * created by cs_iter_sb_update_start(), consumed by cs_iter_sb_update_end().
 */
struct cs_iter_sb_update_ctx {
   /* CS builder of the subqueue being updated; NULLed by
    * cs_iter_sb_update_end() to terminate the macro's for-loop. */
   struct cs_builder *b;
   /* Snapshot of dev->csf.sb.all_iters_mask, used to build the stream mask. */
   uint16_t all_iters_mask;
   struct {
      /* Scratch register receiving the newly selected SB entry index. */
      struct cs_index next_sb;
      /* Scratch register used to compose scoreboard masks. */
      struct cs_index sb_mask;
   } regs;
};
/* Open an iterator-scoreboard update sequence (PAN_ARCH >= 11 path).
 *
 * Selects the next endpoint scoreboard entry with NEXT_SB_ENTRY and places
 * its index in the first scratch register. Everything the caller emits
 * between this and cs_iter_sb_update_end() lands between the selection of
 * the new scoreboard and the transition to it. Use through the
 * cs_iter_sb_update() macro rather than calling this directly.
 *
 * scratch_regs: tuple of at least two scratch registers.
 */
static inline struct cs_iter_sb_update_ctx
cs_iter_sb_update_start(struct panvk_cmd_buffer *cmdbuf,
                        enum panvk_subqueue_id subqueue,
                        struct cs_index scratch_regs)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index next_sb = cs_extract32(b, scratch_regs, 0);
   struct cs_iter_sb_update_ctx ctx = {
      .b = b,
      .all_iters_mask = dev->csf.sb.all_iters_mask,
      .regs = {
         .next_sb = next_sb,
         .sb_mask = cs_extract32(b, scratch_regs, 1),
      },
   };

   /* Pick the next available endpoint scoreboard entry, returned as an
    * index in next_sb. */
   cs_next_sb_entry(b, next_sb, MALI_CS_SCOREBOARD_TYPE_ENDPOINT,
                    MALI_CS_NEXT_SB_ENTRY_FORMAT_INDEX);
   return ctx;
}
/* Close an iterator-scoreboard update sequence (PAN_ARCH >= 11 path):
 * program the wait/stream scoreboard masks for the entry selected in
 * cs_iter_sb_update_start(). Invoked automatically at the end of a
 * cs_iter_sb_update() block.
 */
static inline void
cs_iter_sb_update_end(struct cs_iter_sb_update_ctx *ctx)
{
   struct cs_builder *b = ctx->b;
   struct cs_index next_sb = ctx->regs.next_sb;
   struct cs_index sb_mask = ctx->regs.sb_mask;
   uint16_t all_iters_mask = ctx->all_iters_mask;

   /* Setup indirect scoreboard wait mask now for indirect defer */
   cs_move32_to(b, sb_mask, 0);
   cs_bit_set32(b, sb_mask, sb_mask, next_sb);
   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, sb_mask);

   /* Prevent direct re-use of the current SB to avoid conflict between
    * wait(current),signal(next) (can't wait on an SB we signal).
    */
   cs_move32_to(b, sb_mask, all_iters_mask);
   cs_bit_clear32(b, sb_mask, sb_mask, next_sb);
   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_STREAM, sb_mask);

   /* Flag the update as done so the cs_iter_sb_update() for-loop exits. */
   ctx->b = NULL;
}
/* Emit a transition to a new iterator scoreboard on __subq. Instructions
 * emitted inside the block body are placed between the selection of the
 * new scoreboard entry (start) and the mask programming that completes
 * the transition (end).
 */
#define cs_iter_sb_update(__cmdbuf, __subq, __scratch_regs, __upd_ctx) \
   for (struct cs_iter_sb_update_ctx __upd_ctx = \
           cs_iter_sb_update_start(__cmdbuf, __subq, __scratch_regs); \
        __upd_ctx.b; cs_iter_sb_update_end(&__upd_ctx))
#else
/* State carried across a cs_iter_sb_update() block (pre-v11 path, where
 * the scoreboard index has to be tracked in software and matched against
 * at CS execution time).
 */
struct cs_iter_sb_update_ctx {
   /* CS builder of the subqueue being updated; NULLed by
    * cs_iter_sb_update_end() to terminate the macro's for-loop. */
   struct cs_builder *b;
   /* Build-time iteration state used to emit one cs_match() case per
    * possible iterator scoreboard entry. */
   uint8_t cur_sb;
   uint8_t next_sb;
   struct {
      /* Scratch register holding the next SB index loaded from/stored to
       * the subqueue context. */
      struct cs_index next_sb;
      /* Scratch register used for the wrap-around test and cs_match(). */
      struct cs_index cmp_scratch;
   } regs;
};
/* Open an iterator-scoreboard update sequence (pre-v11 path).
 *
 * Loads the current iterator SB index from the subqueue context,
 * advances it (wrapping at dev->csf.sb.iter_count) and writes it back.
 * The cs_iter_sb_update() macro then emits one cs_match() case per
 * possible entry to perform the actual wait/selection.
 *
 * scratch_regs: tuple of at least two scratch registers.
 */
static inline struct cs_iter_sb_update_ctx
cs_iter_sb_update_start(struct panvk_cmd_buffer *cmdbuf,
                        enum panvk_subqueue_id subqueue,
                        struct cs_index scratch_regs)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index next_sb = cs_extract32(b, scratch_regs, 0);
   struct cs_index cmp_scratch = cs_extract32(b, scratch_regs, 1);
   struct cs_iter_sb_update_ctx ctx = {
      .b = b,
      .regs = {
         .next_sb = next_sb,
         .cmp_scratch = cmp_scratch,
      },
   };

   cs_load32_to(b, next_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));

   /* Select next scoreboard entry and wrap around if we get past the limit */
   cs_add32(b, next_sb, next_sb, 1);
   cs_add32(b, cmp_scratch, next_sb, -SB_ITER(dev->csf.sb.iter_count));
   cs_if(b, MALI_CS_CONDITION_GEQUAL, cmp_scratch) {
      cs_move32_to(b, next_sb, SB_ITER(0));
   }

   /* Persist the new index so the next update starts from it. */
   cs_store32(b, next_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_flush_stores(b);
   return ctx;
}
/* Close an iterator-scoreboard update sequence (pre-v11 path). The
 * wait/selection is emitted per cs_match() case, so all that's left is
 * clearing ctx->b to terminate the cs_iter_sb_update() for-loop.
 */
static inline void
cs_iter_sb_update_end(struct cs_iter_sb_update_ctx *ctx)
{
   ctx->b = NULL;
}
/* Initialize the build-time case-iteration state for the first cs_match()
 * case emitted by cs_iter_sb_update(): next_sb starts at 0, cur_sb at the
 * entry preceding it (wrapping).
 *
 * Marked inline: this is a header-defined helper, and a plain `static`
 * function in a header triggers -Wunused-function in every translation
 * unit that includes the header without using cs_iter_sb_update(); it is
 * also inconsistent with the surrounding `static inline` helpers.
 */
static inline void
cs_iter_sb_update_first_case(struct cs_iter_sb_update_ctx *ctx)
{
   ctx->cur_sb = PANVK_SB_ITER_COUNT - 1;
   ctx->next_sb = 0;
}
/* Advance the build-time case-iteration state to the next cs_match()
 * case: cur_sb follows next_sb with wrap-around, next_sb walks linearly
 * until it reaches PANVK_SB_ITER_COUNT (the macro's loop bound).
 *
 * Marked inline for the same reason as cs_iter_sb_update_first_case():
 * header-defined helpers should be `static inline` to avoid
 * -Wunused-function warnings and to match the surrounding helpers.
 */
static inline void
cs_iter_sb_update_next_case(struct cs_iter_sb_update_ctx *ctx)
{
   ctx->cur_sb = (ctx->cur_sb + 1) % PANVK_SB_ITER_COUNT;
   ctx->next_sb++;
}
/* Per-case preamble: wait on the newly selected scoreboard entry, then
 * make it the active endpoint scoreboard. Returns false so the
 * single-iteration for-loop in cs_iter_sb_update_case() enters its body
 * exactly once.
 */
static inline bool
cs_iter_sb_update_case_preamble(struct cs_iter_sb_update_ctx *ctx)
{
   struct cs_builder *b = ctx->b;

   cs_wait_slot(b, SB_ITER(ctx->next_sb));
   cs_select_endpoint_sb(b, SB_ITER(ctx->next_sb));
   return false;
}
/* One cs_match() case of the pre-v11 transition: emit the wait/selection
 * of SB_ITER(next_sb), then run the user-provided block. The one-shot
 * for-loop lets the macro accept a trailing statement/compound body.
 */
#define cs_iter_sb_update_case(__upd_ctx) \
   cs_case(__upd_ctx.b, SB_ITER(__upd_ctx.next_sb)) \
      for (bool __done = cs_iter_sb_update_case_preamble(&__upd_ctx); !__done; \
           __done = true)
/* Emit a transition to a new iterator scoreboard on __subq (pre-v11
 * path). The new SB index is only known at CS execution time, so the
 * block body is re-emitted once per possible entry inside a cs_match()
 * construct; in each case the body runs after the wait/selection of the
 * new scoreboard entry.
 */
#define cs_iter_sb_update(__cmdbuf, __subq, __scratch_regs, __upd_ctx) \
   for (struct cs_iter_sb_update_ctx __upd_ctx = \
           cs_iter_sb_update_start(__cmdbuf, __subq, __scratch_regs); \
        __upd_ctx.b; cs_iter_sb_update_end(&__upd_ctx)) \
      cs_match((__upd_ctx).b, __upd_ctx.regs.next_sb, \
               __upd_ctx.regs.cmp_scratch) \
         for (cs_iter_sb_update_first_case(&__upd_ctx); \
              __upd_ctx.next_sb < PANVK_SB_ITER_COUNT; \
              cs_iter_sb_update_next_case(&__upd_ctx)) \
            cs_iter_sb_update_case(__upd_ctx)
#endif
/* Move to the next iterator scoreboard without emitting anything extra
 * during the transition: a dummy wrapper around cs_iter_sb_update() with
 * an empty block body.
 */
static inline void
cs_next_iter_sb(struct panvk_cmd_buffer *cmdbuf,
                enum panvk_subqueue_id subqueue, struct cs_index scratch_regs)
{
   cs_iter_sb_update(cmdbuf, subqueue, scratch_regs, _) {
      /* We only want to move to the new scoreboard, so nothing to do here. */
   }
}
enum panvk_barrier_stage {
PANVK_BARRIER_STAGE_FIRST,

View file

@ -768,65 +768,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
}
}
#if PAN_ARCH >= 11
/* Switch the given subqueue to the next available iterator scoreboard
 * (PAN_ARCH >= 11): select a new endpoint SB entry, then program the
 * wait/stream masks around it.
 *
 * scratch_regs: tuple of at least two scratch registers.
 */
void
panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue,
                                struct cs_index scratch_regs)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_extract32(b, scratch_regs, 0);
   struct cs_index sb_mask = cs_extract32(b, scratch_regs, 1);

   /* Wait for scoreboard to be available and select the next scoreboard entry */
   cs_next_sb_entry(b, iter_sb, MALI_CS_SCOREBOARD_TYPE_ENDPOINT,
                    MALI_CS_NEXT_SB_ENTRY_FORMAT_INDEX);

   /* Setup indirect scoreboard wait mask now for indirect defer */
   cs_move32_to(b, sb_mask, 0);
   cs_bit_set32(b, sb_mask, sb_mask, iter_sb);
   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, sb_mask);

   /* Prevent direct re-use of the current SB to avoid conflict between
    * wait(current),signal(next) (can't wait on an SB we signal).
    */
   cs_move32_to(b, sb_mask, dev->csf.sb.all_iters_mask);
   cs_bit_clear32(b, sb_mask, sb_mask, iter_sb);
   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_STREAM, sb_mask);
}
#else
/* Switch the given subqueue to the next available iterator scoreboard
 * (pre-v11): advance the software-tracked SB index (with wrap-around),
 * emit a per-entry match that waits on and selects the new entry, and
 * persist the index in the subqueue context.
 *
 * scratch_regs: tuple of at least two scratch registers.
 */
void
panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue,
                                struct cs_index scratch_regs)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_extract32(b, scratch_regs, 0);
   struct cs_index cmp_scratch = cs_extract32(b, scratch_regs, 1);

   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));

   /* Select next scoreboard entry and wrap around if we get past the limit */
   cs_add32(b, iter_sb, iter_sb, 1);
   cs_add32(b, cmp_scratch, iter_sb, -SB_ITER(dev->csf.sb.iter_count));
   cs_if(b, MALI_CS_CONDITION_GEQUAL, cmp_scratch) {
      cs_move32_to(b, iter_sb, SB_ITER(0));
   }

   /* One case per possible entry: wait on it, then make it active. */
   cs_match_iter_sb(b, x, iter_sb, cmp_scratch) {
      cs_wait_slot(b, SB_ITER(x));
      cs_select_endpoint_sb(b, SB_ITER(x));
   }

   /* Persist the new index so the next call starts from it. */
   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_flush_stores(b);
}
#endif
static struct cs_buffer
alloc_cs_buffer(void *cookie)
{

View file

@ -277,9 +277,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
}
}
struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
next_iter_sb_scratch);
cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
cs_scratch_reg_tuple(b, 0, 2));
if (indirect) {
/* Use run_compute with a set task axis instead of run_compute_indirect as

View file

@ -1188,9 +1188,8 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
/* Flush all stores to tiler_ctx_addr. */
cs_flush_stores(b);
struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER,
next_iter_sb_scratch);
cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER,
cs_scratch_reg_tuple(b, 0, 2));
cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
return VK_SUCCESS;
@ -3067,9 +3066,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
next_iter_sb_scratch);
cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
cs_scratch_reg_tuple(b, 0, 2));
/* Now initialize the fragment bits. */
cs_update_frag_ctx(b) {

View file

@ -111,9 +111,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), grid.count[2]);
}
struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
next_iter_sb_scratch);
cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
cs_scratch_reg_tuple(b, 0, 2));
unsigned task_axis = MALI_TASK_AXIS_X;
unsigned task_increment = 0;