diff --git a/src/amd/common/ac_cmdbuf.c b/src/amd/common/ac_cmdbuf.c
index 5a2356535dc..6f07262f43f 100644
--- a/src/amd/common/ac_cmdbuf.c
+++ b/src/amd/common/ac_cmdbuf.c
@@ -904,3 +904,78 @@ ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
    ac_cmdbuf_emit(4); /* poll interval */
    ac_cmdbuf_end();
 }
+
+static bool
+is_ts_event(unsigned event_type)
+{
+   return event_type == V_028A90_CACHE_FLUSH_TS ||
+          event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
+          event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
+          event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
+          event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+}
+
+/* This will wait or insert into the pipeline a wait for a previous
+ * RELEASE_MEM PWS event.
+ *
+ * "event_type" must be the same as the RELEASE_MEM PWS event.
+ *
+ * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME,
+ * PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the
+ * pipeline instead of completely idling the hw at the frontend.
+ *
+ * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the
+ * pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
+ *
+ * "distance" determines how many RELEASE_MEM PWS events ago it should wait
+ * for, minus one (starting from 0). There are 3 event types: PS_DONE,
+ * CS_DONE, and TS events. The distance counter increments separately for each
+ * type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with
+ * *_TS means wait for the last TS event (even if it's a different TS event
+ * because all TS events share the same counter).
+ *
+ * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while
+ * PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really
+ * be called SYNC_ENABLE). PRE_DEPTH waits before depth/stencil tests.
+ *
+ * PRE_COLOR also exists but shouldn't be used because it can hang. It's
+ * recommended to use PRE_PIX_SHADER instead, which means all PS that have
+ * color exports with enabled color buffers, non-zero colormask, and non-zero
+ * sample mask must have IMAGE_OP=1 to enable the sync before PS.
+ *
+ * Waiting for a PWS fence that was generated by a previous IB is valid, but
+ * if there is an IB from another process in between and that IB also inserted
+ * a PWS fence, the hw will wait for the newer fence instead because the PWS
+ * counter was incremented.
+ */
+void
+ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
+                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
+                           uint32_t stage_sel, uint32_t count,
+                           uint32_t gcr_cntl)
+{
+   assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
+
+   const bool ts = is_ts_event(event_type);
+   const bool ps_done = event_type == V_028A90_PS_DONE;
+   const bool cs_done = event_type == V_028A90_CS_DONE;
+   const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? V_580_PS_SELECT : V_580_CS_SELECT;
+
+   assert((int)ts + (int)cs_done + (int)ps_done == 1);
+   assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
+   assert(stage_sel != V_580_PRE_COLOR);
+
+   ac_cmdbuf_begin(cs);
+   ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+   ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) |
+                  S_580_PWS_COUNTER_SEL(counter_sel) |
+                  S_580_PWS_ENA2(1) |
+                  S_580_PWS_COUNT(count));
+   ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */
+   ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */
+   ac_cmdbuf_emit(0); /* GCR_BASE_LO */
+   ac_cmdbuf_emit(0); /* GCR_BASE_HI */
+   ac_cmdbuf_emit(S_585_PWS_ENA(1));
+   ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
+   ac_cmdbuf_end();
+}
diff --git a/src/amd/common/ac_cmdbuf.h b/src/amd/common/ac_cmdbuf.h
index 583d89eff1b..31cf94174db 100644
--- a/src/amd/common/ac_cmdbuf.h
+++ b/src/amd/common/ac_cmdbuf.h
@@ -109,6 +109,12 @@ void
 ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
                     uint32_t mask, unsigned flags);
 
+void
+ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
+                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
+                           uint32_t stage_sel, uint32_t count,
+                           uint32_t gcr_cntl);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/vulkan/radv_cs.c b/src/amd/vulkan/radv_cs.c
index 48cd3d402d6..74afd70c36a 100644
--- a/src/amd/vulkan/radv_cs.c
+++ b/src/amd/vulkan/radv_cs.c
@@ -274,19 +274,11 @@ gfx10_cs_emit_cache_flush(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_lev
       radeon_emit(0); /* DATA_HI */
       radeon_emit(0); /* INT_CTXID */
 
-      /* Wait for the event and invalidate remaining caches if needed. */
-      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-      radeon_emit(S_580_PWS_STAGE_SEL(V_580_CP_PFP) | S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) | S_580_PWS_ENA2(1) |
-                  S_580_PWS_COUNT(0));
-      radeon_emit(0xffffffff); /* GCR_SIZE */
-      radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
-      radeon_emit(0); /* GCR_BASE_LO */
-      radeon_emit(0); /* GCR_BASE_HI */
-      radeon_emit(S_585_PWS_ENA(1));
-      radeon_emit(gcr_cntl); /* GCR_CNTL */
-
       radeon_end();
 
+      /* Wait for the event and invalidate remaining caches if needed. */
+      ac_emit_cp_acquire_mem_pws(cs->b, gfx_level, cs->hw_ip, cb_db_event, V_580_CP_PFP, 0, gcr_cntl);
+
       gcr_cntl = 0; /* all done */
    } else {
       /* CB/DB flush and invalidate (or possibly just a wait for a
diff --git a/src/gallium/drivers/radeonsi/si_cp_utils.c b/src/gallium/drivers/radeonsi/si_cp_utils.c
index 1a20f479808..dc5d920f186 100644
--- a/src/gallium/drivers/radeonsi/si_cp_utils.c
+++ b/src/gallium/drivers/radeonsi/si_cp_utils.c
@@ -61,64 +61,16 @@ void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
    radeon_end();
 }
 
-/* This will wait or insert into the pipeline a wait for a previous RELEASE_MEM PWS event.
- *
- * "event_type" must be the same as the RELEASE_MEM PWS event.
- *
- * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME, PRE_SHADER,
- * PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the pipeline instead of completely
- * idling the hw at the frontend.
- *
- * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the pipeline, any
- * cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
- *
- * "distance" determines how many RELEASE_MEM PWS events ago it should wait for, minus one
- * (starting from 0). There are 3 event types: PS_DONE, CS_DONE, and TS events. The distance
- * counter increments separately for each type, so 0 with PS_DONE means wait for the last PS_DONE
- * event, while 0 with *_TS means wait for the last TS event (even if it's a different TS event
- * because all TS events share the same counter).
- *
- * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while PRE_PIX_SHADER waits before
- * PS if it has IMAGE_OP=1 (IMAGE_OP should really be called SYNC_ENABLE) PRE_DEPTH waits before
- * depth/stencil tests.
- *
- * PRE_COLOR also exists but shouldn't be used because it can hang. It's recommended to use
- * PRE_PIX_SHADER instead, which means all PS that have color exports with enabled color buffers,
- * non-zero colormask, and non-zero sample mask must have IMAGE_OP=1 to enable the sync before PS.
- *
- * Waiting for a PWS fence that was generated by a previous IB is valid, but if there is an IB
- * from another process in between and that IB also inserted a PWS fence, the hw will wait for
- * the newer fence instead because the PWS counter was incremented.
- */
 void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned event_type,
                            unsigned stage_sel, unsigned gcr_cntl, unsigned distance,
                            unsigned sqtt_flush_flags)
 {
 
-   assert(sctx->gfx_level >= GFX11 && sctx->is_gfx_queue);
-   bool ts = is_ts_event(event_type);
-   bool cs_done = event_type == V_028A90_CS_DONE;
-   bool ps = event_type == V_028A90_PS_DONE;
-
-   assert((int)ts + (int)cs_done + (int)ps == 1);
-   assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
-   assert(stage_sel != V_580_PRE_COLOR);
-
    if (unlikely(sctx->sqtt_enabled))
       si_sqtt_describe_barrier_start(sctx, cs);
 
-   radeon_begin(cs);
-   radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-   radeon_emit(S_580_PWS_STAGE_SEL(stage_sel) |
-               S_580_PWS_COUNTER_SEL(ts ? V_580_TS_SELECT : ps ? V_580_PS_SELECT : V_580_CS_SELECT) |
-               S_580_PWS_ENA2(1) |
-               S_580_PWS_COUNT(distance));
-   radeon_emit(0xffffffff); /* GCR_SIZE */
-   radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
-   radeon_emit(0); /* GCR_BASE_LO */
-   radeon_emit(0); /* GCR_BASE_HI */
-   radeon_emit(S_585_PWS_ENA(1));
-   radeon_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
-   radeon_end();
+   ac_emit_cp_acquire_mem_pws(&cs->current, sctx->gfx_level,
+                              sctx->is_gfx_queue ? AMD_IP_GFX : AMD_IP_COMPUTE,
+                              event_type, stage_sel, distance, gcr_cntl);
 
    if (unlikely(sctx->sqtt_enabled))
       si_sqtt_describe_barrier_end(sctx, cs, sqtt_flush_flags);