amd: change ac_shader_config::lds_size to bytes

We still keep it aligned to allocation granularity.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
This commit is contained in:
Daniel Schürmann 2025-09-25 13:58:36 +02:00 committed by Marge Bot
parent 190bab9b77
commit b651234414
17 changed files with 63 additions and 56 deletions

View file

@ -8,6 +8,7 @@
#include "nir/tgsi_to_nir.h"
#include "si_build_pm4.h"
#include "si_shader_internal.h"
#include "util/macros.h"
#include "util/u_async_debug.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
@ -133,7 +134,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind
S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2]
? 2
: sel->info.uses_thread_id[1] ? 1 : 0) |
S_00B84C_LDS_SIZE(shader->config.lds_size);
S_00B84C_LDS_SIZE(DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity));
/* COMPUTE_PGM_RSRC3 is only present on GFX10+ and GFX940+. */
shader->config.rsrc3 = S_00B8A0_SHARED_VGPR_CNT(shader->config.num_shared_vgprs / 8);

View file

@ -435,17 +435,17 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade
size_in_dw += shader->ngg.info.ngg_out_lds_size;
shader->config.lds_size =
DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
ALIGN(size_in_dw * 4, get_lds_granularity(sscreen, stage));
}
if (stage == MESA_SHADER_COMPUTE) {
shader->config.lds_size = DIV_ROUND_UP(shader->selector->info.base.shared_size,
sscreen->info.lds_encode_granularity);
shader->config.lds_size = ALIGN(shader->selector->info.base.shared_size,
sscreen->info.lds_alloc_granularity);
}
/* Check that the LDS size is within hw limits. */
assert(shader->config.lds_size * get_lds_granularity(sscreen, stage) <=
(sscreen->info.gfx_level == GFX6 ? 32 : 64) * 1024);
assert(shader->config.lds_size % sscreen->info.lds_alloc_granularity == 0);
assert(shader->config.lds_size <= (sscreen->info.gfx_level == GFX6 ? 32 : 64) * 1024);
}
static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
@ -623,13 +623,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
* Other stages don't know the size at compile time or don't
* allocate LDS per wave, but instead they do it per thread group.
*/
lds_per_wave = conf->lds_size * lds_increment +
lds_per_wave = conf->lds_size +
align(shader->info.num_ps_inputs * 48, lds_increment);
break;
case MESA_SHADER_COMPUTE: {
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
lds_per_wave = (conf->lds_size * lds_increment) /
DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
}
break;
default:;
@ -777,8 +776,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *sh
"********************\n\n\n",
conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
}
const char *si_get_shader_name(const struct si_shader *shader)
@ -2405,7 +2403,7 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_s
* It applies to workgroup sizes of more than one wavefront.
*/
if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
*lds_size = MAX2(*lds_size, 8);
*lds_size = MAX2(*lds_size, 8 * sscreen->info.lds_alloc_granularity);
}
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)

View file

@ -776,8 +776,6 @@ si_sqtt_add_code_object(struct si_context *sctx,
memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
uint64_t va = pipeline->bo->gpu_address + (is_compute ? 0 : gfx_sh_offsets[i]);
unsigned lds_increment = sctx->gfx_level >= GFX11 && i == MESA_SHADER_FRAGMENT ?
1024 : sctx->screen->info.lds_encode_granularity;
memset(record->shader_data[i].rt_shader_name, 0, sizeof(record->shader_data[i].rt_shader_name));
record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
@ -791,7 +789,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
record->shader_data[i].hw_stage = hw_stage;
record->shader_data[i].is_combined = false;
record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
record->shader_data[i].lds_size = shader->config.lds_size * lds_increment;
record->shader_data[i].lds_size = shader->config.lds_size;
record->shader_data[i].wavefront_size = shader->wave_size;
record->shader_stages_mask |= 1 << i;

View file

@ -1017,6 +1017,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
else
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
unsigned lds_alloc = DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity);
if (sscreen->info.gfx_level >= GFX10) {
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
} else {
@ -1032,7 +1034,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
S_00B22C_LDS_SIZE(shader->config.lds_size) |
S_00B22C_LDS_SIZE(lds_alloc) |
S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
if (sscreen->info.gfx_level >= GFX10) {
@ -1453,6 +1455,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
}
unsigned lds_alloc = DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity);
/* Primitives with adjancency can only occur without tessellation. */
assert(gs_input_verts_per_prim <= 3 || es_stage == MESA_SHADER_VERTEX);
@ -1501,7 +1505,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_00B22C_USER_SGPR(num_user_sgprs) |
S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
S_00B22C_LDS_SIZE(shader->config.lds_size) |
S_00B22C_LDS_SIZE(lds_alloc) |
S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
S_00B22C_SHARED_VGPR_CNT(shader->config.num_shared_vgprs / 8));
@ -2196,8 +2200,10 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
S_00B028_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
S_00B028_FLOAT_MODE(shader->config.float_mode));
unsigned lds_alloc = DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity);
ac_pm4_set_reg(&pm4->base, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
S_00B02C_EXTRA_LDS_SIZE(lds_alloc) |
S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
S_00B02C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
S_00B02C_SHARED_VGPR_CNT(shader->config.num_shared_vgprs / 8));
@ -4712,6 +4718,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
tcs->info.base.tess.tcs_vertices_out, ls_current->wave_size,
tess_uses_primid, num_tcs_input_cp, lds_input_vertex_size,
num_remapped_tess_level_outputs, &num_patches, &lds_size);
unsigned lds_alloc = DIV_ROUND_UP(lds_size, sctx->screen->info.lds_encode_granularity);
if (sctx->num_patches_per_workgroup != num_patches) {
sctx->num_patches_per_workgroup = num_patches;
@ -4751,14 +4758,15 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2;
if (sctx->gfx_level >= GFX10)
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_alloc);
else
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_alloc);
} else {
ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2;
si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
lds_alloc = DIV_ROUND_UP(lds_size, sctx->screen->info.lds_encode_granularity);
ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_alloc);
}
sctx->ls_hs_rsrc2 = ls_hs_rsrc2;