amd: change ac_shader_config::lds_size to bytes
We still keep it aligned to allocation granularity. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
This commit is contained in:
parent
190bab9b77
commit
b651234414
17 changed files with 63 additions and 56 deletions
|
|
@ -8,6 +8,7 @@
|
|||
#include "nir/tgsi_to_nir.h"
|
||||
#include "si_build_pm4.h"
|
||||
#include "si_shader_internal.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/u_async_debug.h"
|
||||
#include "util/u_memory.h"
|
||||
#include "util/u_upload_mgr.h"
|
||||
|
|
@ -133,7 +134,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind
|
|||
S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2]
|
||||
? 2
|
||||
: sel->info.uses_thread_id[1] ? 1 : 0) |
|
||||
S_00B84C_LDS_SIZE(shader->config.lds_size);
|
||||
S_00B84C_LDS_SIZE(DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity));
|
||||
|
||||
/* COMPUTE_PGM_RSRC3 is only present on GFX10+ and GFX940+. */
|
||||
shader->config.rsrc3 = S_00B8A0_SHARED_VGPR_CNT(shader->config.num_shared_vgprs / 8);
|
||||
|
|
|
|||
|
|
@ -435,17 +435,17 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade
|
|||
size_in_dw += shader->ngg.info.ngg_out_lds_size;
|
||||
|
||||
shader->config.lds_size =
|
||||
DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
|
||||
ALIGN(size_in_dw * 4, get_lds_granularity(sscreen, stage));
|
||||
}
|
||||
|
||||
if (stage == MESA_SHADER_COMPUTE) {
|
||||
shader->config.lds_size = DIV_ROUND_UP(shader->selector->info.base.shared_size,
|
||||
sscreen->info.lds_encode_granularity);
|
||||
shader->config.lds_size = ALIGN(shader->selector->info.base.shared_size,
|
||||
sscreen->info.lds_alloc_granularity);
|
||||
}
|
||||
|
||||
/* Check that the LDS size is within hw limits. */
|
||||
assert(shader->config.lds_size * get_lds_granularity(sscreen, stage) <=
|
||||
(sscreen->info.gfx_level == GFX6 ? 32 : 64) * 1024);
|
||||
assert(shader->config.lds_size % sscreen->info.lds_alloc_granularity == 0);
|
||||
assert(shader->config.lds_size <= (sscreen->info.gfx_level == GFX6 ? 32 : 64) * 1024);
|
||||
}
|
||||
|
||||
static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
|
||||
|
|
@ -623,13 +623,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
|
|||
* Other stages don't know the size at compile time or don't
|
||||
* allocate LDS per wave, but instead they do it per thread group.
|
||||
*/
|
||||
lds_per_wave = conf->lds_size * lds_increment +
|
||||
lds_per_wave = conf->lds_size +
|
||||
align(shader->info.num_ps_inputs * 48, lds_increment);
|
||||
break;
|
||||
case MESA_SHADER_COMPUTE: {
|
||||
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
|
||||
lds_per_wave = (conf->lds_size * lds_increment) /
|
||||
DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
|
||||
lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
|
||||
}
|
||||
break;
|
||||
default:;
|
||||
|
|
@ -777,8 +776,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *sh
|
|||
"********************\n\n\n",
|
||||
conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
|
||||
shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
|
||||
conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
|
||||
conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
|
||||
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
|
||||
}
|
||||
|
||||
const char *si_get_shader_name(const struct si_shader *shader)
|
||||
|
|
@ -2405,7 +2403,7 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_s
|
|||
* It applies to workgroup sizes of more than one wavefront.
|
||||
*/
|
||||
if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
|
||||
*lds_size = MAX2(*lds_size, 8);
|
||||
*lds_size = MAX2(*lds_size, 8 * sscreen->info.lds_alloc_granularity);
|
||||
}
|
||||
|
||||
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
|
||||
|
|
|
|||
|
|
@ -776,8 +776,6 @@ si_sqtt_add_code_object(struct si_context *sctx,
|
|||
memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
|
||||
|
||||
uint64_t va = pipeline->bo->gpu_address + (is_compute ? 0 : gfx_sh_offsets[i]);
|
||||
unsigned lds_increment = sctx->gfx_level >= GFX11 && i == MESA_SHADER_FRAGMENT ?
|
||||
1024 : sctx->screen->info.lds_encode_granularity;
|
||||
|
||||
memset(record->shader_data[i].rt_shader_name, 0, sizeof(record->shader_data[i].rt_shader_name));
|
||||
record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
|
||||
|
|
@ -791,7 +789,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
|
|||
record->shader_data[i].hw_stage = hw_stage;
|
||||
record->shader_data[i].is_combined = false;
|
||||
record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
|
||||
record->shader_data[i].lds_size = shader->config.lds_size * lds_increment;
|
||||
record->shader_data[i].lds_size = shader->config.lds_size;
|
||||
record->shader_data[i].wavefront_size = shader->wave_size;
|
||||
|
||||
record->shader_stages_mask |= 1 << i;
|
||||
|
|
|
|||
|
|
@ -1017,6 +1017,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
|
|||
else
|
||||
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
|
||||
|
||||
unsigned lds_alloc = DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity);
|
||||
|
||||
if (sscreen->info.gfx_level >= GFX10) {
|
||||
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
|
||||
} else {
|
||||
|
|
@ -1032,7 +1034,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
|
|||
uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
|
||||
S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
|
||||
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
|
||||
S_00B22C_LDS_SIZE(shader->config.lds_size) |
|
||||
S_00B22C_LDS_SIZE(lds_alloc) |
|
||||
S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
|
||||
|
||||
if (sscreen->info.gfx_level >= GFX10) {
|
||||
|
|
@ -1453,6 +1455,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
|
||||
}
|
||||
|
||||
unsigned lds_alloc = DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity);
|
||||
|
||||
/* Primitives with adjancency can only occur without tessellation. */
|
||||
assert(gs_input_verts_per_prim <= 3 || es_stage == MESA_SHADER_VERTEX);
|
||||
|
||||
|
|
@ -1501,7 +1505,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
S_00B22C_USER_SGPR(num_user_sgprs) |
|
||||
S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
|
||||
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
|
||||
S_00B22C_LDS_SIZE(shader->config.lds_size) |
|
||||
S_00B22C_LDS_SIZE(lds_alloc) |
|
||||
S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
|
||||
S_00B22C_SHARED_VGPR_CNT(shader->config.num_shared_vgprs / 8));
|
||||
|
||||
|
|
@ -2196,8 +2200,10 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
|
|||
S_00B028_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
|
||||
S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
|
||||
S_00B028_FLOAT_MODE(shader->config.float_mode));
|
||||
|
||||
unsigned lds_alloc = DIV_ROUND_UP(shader->config.lds_size, sscreen->info.lds_encode_granularity);
|
||||
ac_pm4_set_reg(&pm4->base, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
|
||||
S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
|
||||
S_00B02C_EXTRA_LDS_SIZE(lds_alloc) |
|
||||
S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
|
||||
S_00B02C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
|
||||
S_00B02C_SHARED_VGPR_CNT(shader->config.num_shared_vgprs / 8));
|
||||
|
|
@ -4712,6 +4718,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
tcs->info.base.tess.tcs_vertices_out, ls_current->wave_size,
|
||||
tess_uses_primid, num_tcs_input_cp, lds_input_vertex_size,
|
||||
num_remapped_tess_level_outputs, &num_patches, &lds_size);
|
||||
unsigned lds_alloc = DIV_ROUND_UP(lds_size, sctx->screen->info.lds_encode_granularity);
|
||||
|
||||
if (sctx->num_patches_per_workgroup != num_patches) {
|
||||
sctx->num_patches_per_workgroup = num_patches;
|
||||
|
|
@ -4751,14 +4758,15 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2;
|
||||
|
||||
if (sctx->gfx_level >= GFX10)
|
||||
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
|
||||
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_alloc);
|
||||
else
|
||||
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
|
||||
ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_alloc);
|
||||
} else {
|
||||
ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2;
|
||||
|
||||
si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
|
||||
ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
|
||||
lds_alloc = DIV_ROUND_UP(lds_size, sctx->screen->info.lds_encode_granularity);
|
||||
ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_alloc);
|
||||
}
|
||||
|
||||
sctx->ls_hs_rsrc2 = ls_hs_rsrc2;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue