amd,radv,radeonsi: add ac_emit_cp_acquire_mem_pws()
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37813>
This commit is contained in:
parent
6329e282b8
commit
c45035ceb4
4 changed files with 87 additions and 62 deletions
|
|
@ -904,3 +904,78 @@ ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
|
|||
ac_cmdbuf_emit(4); /* poll interval */
|
||||
ac_cmdbuf_end();
|
||||
}
|
||||
|
||||
static bool
|
||||
is_ts_event(unsigned event_type)
|
||||
{
|
||||
return event_type == V_028A90_CACHE_FLUSH_TS ||
|
||||
event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
|
||||
event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
|
||||
event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
|
||||
event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
|
||||
}
|
||||
|
||||
/* This will wait or insert into the pipeline a wait for a previous
|
||||
* RELEASE_MEM PWS event.
|
||||
*
|
||||
* "event_type" must be the same as the RELEASE_MEM PWS event.
|
||||
*
|
||||
* "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME,
|
||||
* PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the
|
||||
* pipeline instead of completely idling the hw at the frontend.
|
||||
*
|
||||
* "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the
|
||||
* pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
|
||||
*
|
||||
* "distance" determines how many RELEASE_MEM PWS events ago it should wait
|
||||
* for, minus one (starting from 0). There are 3 event types: PS_DONE,
|
||||
* CS_DONE, and TS events. The distance counter increments separately for each
|
||||
* type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with
|
||||
* *_TS means wait for the last TS event (even if it's a different TS event
|
||||
* because all TS events share the same counter).
|
||||
*
|
||||
* PRE_SHADER waits before the first shader that has IMAGE_OP=1, while
|
||||
* PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really
|
||||
* be called SYNC_ENABLE) PRE_DEPTH waits before depth/stencil tests.
|
||||
*
|
||||
* PRE_COLOR also exists but shouldn't be used because it can hang. It's
|
||||
* recommended to use PRE_PIX_SHADER instead, which means all PS that have
|
||||
* color exports with enabled color buffers, non-zero colormask, and non-zero
|
||||
* sample mask must have IMAGE_OP=1 to enable the sync before PS.
|
||||
*
|
||||
* Waiting for a PWS fence that was generated by a previous IB is valid, but
|
||||
* if there is an IB from another process in between and that IB also inserted
|
||||
* a PWS fence, the hw will wait for the newer fence instead because the PWS
|
||||
* counter was incremented.
|
||||
*/
|
||||
void
|
||||
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
|
||||
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
|
||||
uint32_t stage_sel, uint32_t count,
|
||||
uint32_t gcr_cntl)
|
||||
{
|
||||
assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
|
||||
|
||||
const bool ts = is_ts_event(event_type);
|
||||
const bool ps_done = event_type == V_028A90_PS_DONE;
|
||||
const bool cs_done = event_type == V_028A90_CS_DONE;
|
||||
const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? V_580_PS_SELECT : V_580_CS_SELECT;
|
||||
|
||||
assert((int)ts + (int)cs_done + (int)ps_done == 1);
|
||||
assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
|
||||
assert(stage_sel != V_580_PRE_COLOR);
|
||||
|
||||
ac_cmdbuf_begin(cs);
|
||||
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
||||
ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) |
|
||||
S_580_PWS_COUNTER_SEL(counter_sel) |
|
||||
S_580_PWS_ENA2(1) |
|
||||
S_580_PWS_COUNT(count));
|
||||
ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */
|
||||
ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */
|
||||
ac_cmdbuf_emit(0); /* GCR_BASE_LO */
|
||||
ac_cmdbuf_emit(0); /* GCR_BASE_HI */
|
||||
ac_cmdbuf_emit(S_585_PWS_ENA(1));
|
||||
ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
|
||||
ac_cmdbuf_end();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -109,6 +109,12 @@ void
|
|||
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
|
||||
uint32_t mask, unsigned flags);
|
||||
|
||||
void
|
||||
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
|
||||
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
|
||||
uint32_t stage_sel, uint32_t count,
|
||||
uint32_t gcr_cntl);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -274,19 +274,11 @@ gfx10_cs_emit_cache_flush(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_lev
|
|||
radeon_emit(0); /* DATA_HI */
|
||||
radeon_emit(0); /* INT_CTXID */
|
||||
|
||||
/* Wait for the event and invalidate remaining caches if needed. */
|
||||
radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
||||
radeon_emit(S_580_PWS_STAGE_SEL(V_580_CP_PFP) | S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) | S_580_PWS_ENA2(1) |
|
||||
S_580_PWS_COUNT(0));
|
||||
radeon_emit(0xffffffff); /* GCR_SIZE */
|
||||
radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
|
||||
radeon_emit(0); /* GCR_BASE_LO */
|
||||
radeon_emit(0); /* GCR_BASE_HI */
|
||||
radeon_emit(S_585_PWS_ENA(1));
|
||||
radeon_emit(gcr_cntl); /* GCR_CNTL */
|
||||
|
||||
radeon_end();
|
||||
|
||||
/* Wait for the event and invalidate remaining caches if needed. */
|
||||
ac_emit_cp_acquire_mem_pws(cs->b, gfx_level, cs->hw_ip, cb_db_event, V_580_CP_PFP, 0, gcr_cntl);
|
||||
|
||||
gcr_cntl = 0; /* all done */
|
||||
} else {
|
||||
/* CB/DB flush and invalidate (or possibly just a wait for a
|
||||
|
|
|
|||
|
|
@ -61,64 +61,16 @@ void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
|
|||
radeon_end();
|
||||
}
|
||||
|
||||
/* This will wait or insert into the pipeline a wait for a previous RELEASE_MEM PWS event.
|
||||
*
|
||||
* "event_type" must be the same as the RELEASE_MEM PWS event.
|
||||
*
|
||||
* "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME, PRE_SHADER,
|
||||
* PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the pipeline instead of completely
|
||||
* idling the hw at the frontend.
|
||||
*
|
||||
* "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the pipeline, any
|
||||
* cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
|
||||
*
|
||||
* "distance" determines how many RELEASE_MEM PWS events ago it should wait for, minus one
|
||||
* (starting from 0). There are 3 event types: PS_DONE, CS_DONE, and TS events. The distance
|
||||
* counter increments separately for each type, so 0 with PS_DONE means wait for the last PS_DONE
|
||||
* event, while 0 with *_TS means wait for the last TS event (even if it's a different TS event
|
||||
* because all TS events share the same counter).
|
||||
*
|
||||
* PRE_SHADER waits before the first shader that has IMAGE_OP=1, while PRE_PIX_SHADER waits before
|
||||
* PS if it has IMAGE_OP=1 (IMAGE_OP should really be called SYNC_ENABLE) PRE_DEPTH waits before
|
||||
* depth/stencil tests.
|
||||
*
|
||||
* PRE_COLOR also exists but shouldn't be used because it can hang. It's recommended to use
|
||||
* PRE_PIX_SHADER instead, which means all PS that have color exports with enabled color buffers,
|
||||
* non-zero colormask, and non-zero sample mask must have IMAGE_OP=1 to enable the sync before PS.
|
||||
*
|
||||
* Waiting for a PWS fence that was generated by a previous IB is valid, but if there is an IB
|
||||
* from another process in between and that IB also inserted a PWS fence, the hw will wait for
|
||||
* the newer fence instead because the PWS counter was incremented.
|
||||
*/
|
||||
void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
|
||||
unsigned event_type, unsigned stage_sel, unsigned gcr_cntl,
|
||||
unsigned distance, unsigned sqtt_flush_flags)
|
||||
{
|
||||
assert(sctx->gfx_level >= GFX11 && sctx->is_gfx_queue);
|
||||
bool ts = is_ts_event(event_type);
|
||||
bool cs_done = event_type == V_028A90_CS_DONE;
|
||||
bool ps = event_type == V_028A90_PS_DONE;
|
||||
|
||||
assert((int)ts + (int)cs_done + (int)ps == 1);
|
||||
assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
|
||||
assert(stage_sel != V_580_PRE_COLOR);
|
||||
|
||||
if (unlikely(sctx->sqtt_enabled))
|
||||
si_sqtt_describe_barrier_start(sctx, cs);
|
||||
|
||||
radeon_begin(cs);
|
||||
radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
||||
radeon_emit(S_580_PWS_STAGE_SEL(stage_sel) |
|
||||
S_580_PWS_COUNTER_SEL(ts ? V_580_TS_SELECT : ps ? V_580_PS_SELECT : V_580_CS_SELECT) |
|
||||
S_580_PWS_ENA2(1) |
|
||||
S_580_PWS_COUNT(distance));
|
||||
radeon_emit(0xffffffff); /* GCR_SIZE */
|
||||
radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
|
||||
radeon_emit(0); /* GCR_BASE_LO */
|
||||
radeon_emit(0); /* GCR_BASE_HI */
|
||||
radeon_emit(S_585_PWS_ENA(1));
|
||||
radeon_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
|
||||
radeon_end();
|
||||
ac_emit_cp_acquire_mem_pws(&cs->current, sctx->gfx_level,
|
||||
sctx->is_gfx_queue ? AMD_IP_GFX : AMD_IP_COMPUTE,
|
||||
event_type, stage_sel, distance, gcr_cntl);
|
||||
|
||||
if (unlikely(sctx->sqtt_enabled))
|
||||
si_sqtt_describe_barrier_end(sctx, cs, sqtt_flush_flags);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue