From fd0248c37bfaa0dabbab11fc3060ebe52443eb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 30 Mar 2020 16:04:53 +0200 Subject: [PATCH 1/9] radv: Refactor calculate_tess_lds_size and get_tcs_num_patches. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously these functions needed the bit mask of the TCS outputs and patch outputs written, and concluded the number of outputs from that. Now, they take the number of outputs and patch outputs instead. This will allow the backend compiler to better optimize the LDS layout. Signed-off-by: Timur Kristóf Reviewed-by: Samuel Pitoiset Part-of: --- .../aco_instruction_selection_setup.cpp | 13 ++++++++---- src/amd/vulkan/radv_nir_to_llvm.c | 12 +++++++---- src/amd/vulkan/radv_shader.h | 21 +++++++------------ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 80280319673e..bf9e96e0b1cf 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -105,6 +105,8 @@ struct isel_context { unsigned tcs_tess_lvl_in_loc; uint64_t tcs_temp_only_inputs; uint32_t tcs_num_inputs; + uint32_t tcs_num_outputs; + uint32_t tcs_num_patch_outputs; uint32_t tcs_num_patches; bool tcs_in_out_eq = false; @@ -871,12 +873,15 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir) unreachable("Unsupported TCS shader stage"); } + ctx->tcs_num_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); + ctx->tcs_num_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); + ctx->tcs_num_patches = get_tcs_num_patches( ctx->args->options->key.tcs.input_vertices, nir->info.tess.tcs_vertices_out, ctx->tcs_num_inputs, - ctx->args->shader_info->tcs.outputs_written, - ctx->args->shader_info->tcs.patch_outputs_written, + ctx->tcs_num_outputs, + ctx->tcs_num_patch_outputs, ctx->args->options->tess_offchip_block_dw_size, ctx->args->options->chip_class, ctx->args->options->family); @@ -885,8 +890,8 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir) nir->info.tess.tcs_vertices_out, ctx->tcs_num_inputs, ctx->tcs_num_patches, - ctx->args->shader_info->tcs.outputs_written, - ctx->args->shader_info->tcs.patch_outputs_written); + ctx->tcs_num_outputs, + ctx->tcs_num_patch_outputs); ctx->args->shader_info->tcs.num_patches = ctx->tcs_num_patches; ctx->args->shader_info->tcs.lds_size = lds_size; diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index a40467a3194c..3f214f79b92b 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -4004,13 +4004,15 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.tcs_num_inputs = args->options->key.tcs.num_inputs; else ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written); + unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written); + unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written); ctx.tcs_num_patches = get_tcs_num_patches( ctx.args->options->key.tcs.input_vertices, ctx.shader->info.tess.tcs_vertices_out, ctx.tcs_num_inputs, - ctx.args->shader_info->tcs.outputs_written, - ctx.args->shader_info->tcs.patch_outputs_written, + tcs_num_outputs, + tcs_num_patch_outputs, ctx.args->options->tess_offchip_block_dw_size, ctx.args->options->chip_class, ctx.args->options->family); @@ -4114,6 +4116,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, } if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { + unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written); + unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written); args->shader_info->tcs.num_patches = ctx.tcs_num_patches; args->shader_info->tcs.lds_size = calculate_tess_lds_size( @@ -4121,8 +4125,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.shader->info.tess.tcs_vertices_out, ctx.tcs_num_inputs, ctx.tcs_num_patches, - ctx.args->shader_info->tcs.outputs_written, - ctx.args->shader_info->tcs.patch_outputs_written); + tcs_num_outputs, + tcs_num_patch_outputs); } } diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 165df3afe2e5..608900b5419f 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -527,19 +527,16 @@ calculate_tess_lds_size(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, unsigned tcs_num_patches, - unsigned tcs_outputs_written, - unsigned tcs_per_patch_outputs_written) + unsigned tcs_num_outputs, + unsigned tcs_num_patch_outputs) { - unsigned num_tcs_outputs = util_last_bit64(tcs_outputs_written); - unsigned num_tcs_patch_outputs = util_last_bit64(tcs_per_patch_outputs_written); - unsigned input_vertex_size = tcs_num_inputs * 16; - unsigned output_vertex_size = num_tcs_outputs * 16; + unsigned output_vertex_size = tcs_num_outputs * 16; unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size; unsigned pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; - unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + unsigned output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; unsigned output_patch0_offset = input_patch_size * tcs_num_patches; @@ -550,19 +547,17 @@ static inline unsigned get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, - unsigned tcs_outputs_written, - unsigned tcs_per_patch_outputs_written, + unsigned tcs_num_outputs, + unsigned tcs_num_patch_outputs, unsigned tess_offchip_block_dw_size, enum chip_class chip_class, enum radeon_family family) { uint32_t input_vertex_size = tcs_num_inputs * 16; uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; - uint32_t num_tcs_outputs = util_last_bit64(tcs_outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(tcs_per_patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t output_vertex_size = tcs_num_outputs * 16; uint32_t pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; - uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + uint32_t output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; /* Ensure that we only need one wave per SIMD so we don't need to check * resource usage. Also ensures that the number of tcs in and out -- GitLab From ab07c4ea70897d8d8c4d40bd336aee38926278bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 30 Mar 2020 16:11:14 +0200 Subject: [PATCH 2/9] aco: Use context variables instead of calculating TCS inputs/outputs. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VS needs the number of TCS inputs, and TES needs the number of TCS outputs. It is error-prone to repeat those calculations in both instruction selection and setup. Just set them in one place instead. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 15 ++++----------- .../compiler/aco_instruction_selection_setup.cpp | 3 +++ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index b13b2372f4b6..5a1629079174 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4153,11 +4153,9 @@ std::pair get_tcs_output_lds_offset(isel_context *ctx, nir_intri Builder bld(ctx->program, ctx->block); uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16; - uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t output_vertex_size = ctx->tcs_num_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16; std::pair offs = instr ? get_intrinsic_io_basic_offset(ctx, instr, 4u) @@ -4205,11 +4203,7 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx { Builder bld(ctx->program, ctx->block); - unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL - ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written) - : ctx->args->options->key.tes.tcs_num_outputs; - - unsigned output_vertex_size = num_tcs_outputs * 16; + unsigned output_vertex_size = ctx->tcs_num_outputs * 16; unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches; unsigned attr_stride = ctx->tcs_num_patches; @@ -4344,9 +4338,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS. * GFX9+: LS is merged into HS, but still uses the same LDS layout. */ - unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id); - lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u); + lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u); } else { unreachable("Invalid LS or ES stage"); } diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index bf9e96e0b1cf..f3464a7a2147 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -812,6 +812,8 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) */ /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info; outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */ + } else if (ctx->stage == vertex_ls) { + ctx->tcs_num_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); } if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) { @@ -918,6 +920,7 @@ void setup_tes_variables(isel_context *ctx, nir_shader *nir) { ctx->tcs_num_patches = ctx->args->options->key.tes.num_patches; + ctx->tcs_num_outputs = ctx->args->options->key.tes.tcs_num_outputs; nir_foreach_variable(variable, &nir->inputs) { variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; -- GitLab From fdbb2968533be9a1caca731cf11c2ed3b46e6043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 30 Mar 2020 16:54:56 +0200 Subject: [PATCH 3/9] aco: Remember VS/TCS output driver locations. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of relying on calling shader_io_get_unique_index repeatedly, remember the which output driver location corresponds to which varying slot. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 17 +++++++---------- .../aco_instruction_selection_setup.cpp | 11 +++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 5a1629079174..f92462471674 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4224,10 +4224,12 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect) { + assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + if (mask == 0) return false; - unsigned off = nir_intrinsic_base(instr) * 4u; + unsigned drv_loc = nir_intrinsic_base(instr); nir_src *off_src = nir_get_io_offset_src(instr); if (!nir_src_is_const(*off_src)) { @@ -4236,15 +4238,10 @@ bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr } *indirect = false; - off += nir_src_as_uint(*off_src) * 16u; - - while (mask) { - unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0); - if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u) - return true; - } - - return false; + uint64_t slot = per_vertex + ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4] + : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0); + return (((uint64_t) 1) << slot) & mask; } bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index f3464a7a2147..c09d1459846b 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -113,6 +113,8 @@ struct isel_context { /* I/O information */ shader_io_state inputs; shader_io_state outputs; + uint8_t output_drv_loc_to_var_slot[MESA_SHADER_COMPUTE][VARYING_SLOT_MAX]; + uint8_t output_tcs_patch_drv_loc_to_var_slot[VARYING_SLOT_MAX]; }; Temp get_arg(isel_context *ctx, struct ac_arg arg) @@ -798,6 +800,9 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) variable->data.driver_location = variable->data.location * 4; else unreachable("Unsupported VS stage"); + + assert(variable->data.location >= 0 && variable->data.location <= UINT8_MAX); + ctx->output_drv_loc_to_var_slot[MESA_SHADER_VERTEX][variable->data.driver_location / 4] = variable->data.location; } if (ctx->stage == vertex_vs || ctx->stage == ngg_vertex_gs) { @@ -910,6 +915,12 @@ setup_tcs_variables(isel_context *ctx, nir_shader *nir) nir_foreach_variable(variable, &nir->outputs) { variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; + assert(variable->data.location >= 0 && variable->data.location <= UINT8_MAX); + + if (variable->data.patch) + ctx->output_tcs_patch_drv_loc_to_var_slot[variable->data.driver_location / 4] = variable->data.location; + else + ctx->output_drv_loc_to_var_slot[MESA_SHADER_TESS_CTRL][variable->data.driver_location / 4] = variable->data.location; } ctx->tcs_tess_lvl_out_loc = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER) * 16u; -- GitLab From baa46878d4533f21d12bc93d5eed09436b3cc9fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 27 Apr 2020 19:51:40 +0200 Subject: [PATCH 4/9] aco: Calculate workgroup size of legacy GS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection_setup.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index c09d1459846b..bf4b34c6e3e9 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1293,7 +1293,11 @@ setup_isel_context(Program* program, program->workgroup_size = program->wave_size; } else if (program->stage & hw_gs) { /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */ - program->workgroup_size = UINT_MAX; /* TODO: set by VGT_GS_ONCHIP_CNTL, which is not plumbed to ACO */ + assert(program->chip_class >= GFX9); + uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); + uint32_t gs_instr_prims_in_subgrp = G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); + uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp); + program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1); } else if (program->stage == vertex_ls) { /* Unmerged LS operates in workgroups */ program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */ -- GitLab From 7056714f5039e8f4302075677d962b5dd925e107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 23 Apr 2020 14:02:47 +0200 Subject: [PATCH 5/9] aco: Set config->lds_size when TES or VS is running on HW ESGS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This doesn't fix anything, just reports the LDS size used by merged ESGS shaders, such as vertex_geometry_gs and tess_eval_geometry_gs. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection_setup.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index bf4b34c6e3e9..dee86585c8ed 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -835,6 +835,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir) nir_foreach_variable(variable, &nir->inputs) { variable->data.driver_location = util_bitcount64(ctx->input_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4; } + ctx->program->config->lds_size = ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ } else if (ctx->stage == geometry_gs) { //TODO: make this more compact nir_foreach_variable(variable, &nir->inputs) { -- GitLab From 7aa61c84fe47f139b96b29d39b3298f30b96c89c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 30 Mar 2020 15:58:07 +0200 Subject: [PATCH 6/9] nir: Add new linking helper to set linked driver locations. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces a new function nir_assign_linked_io_var_locations which is intended to help with assigning driver locations to shaders during linking, primarily aimed at the VS->TCS->TES->GS stages. It ensures that the linked shaders have the same driver locations, and it also packs these as close to each other as possible. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Reviewed-by: Samuel Pitoiset Part-of: --- src/compiler/nir/nir.h | 9 +++ src/compiler/nir/nir_linking_helpers.c | 99 ++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 59a034f1907a..9f6d2c5895b8 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3809,6 +3809,15 @@ void nir_assign_io_var_locations(struct exec_list *var_list, unsigned *size, gl_shader_stage stage); +typedef struct { + uint8_t num_linked_io_vars; + uint8_t num_linked_patch_io_vars; +} nir_linked_io_var_info; + +nir_linked_io_var_info +nir_assign_linked_io_var_locations(nir_shader *producer, + nir_shader *consumer); + typedef enum { /* If set, this causes all 64-bit IO operations to be lowered on-the-fly * to 32-bit operations. This is only valid for nir_var_shader_in/out diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c index 34a6f9a4d6b7..e626326c7a56 100644 --- a/src/compiler/nir/nir_linking_helpers.c +++ b/src/compiler/nir/nir_linking_helpers.c @@ -1201,3 +1201,102 @@ nir_assign_io_var_locations(struct exec_list *var_list, unsigned *size, *size = location; } +static uint64_t +get_linked_variable_location(unsigned location, bool patch) +{ + if (!patch) + return location; + + /* Reserve locations 0...3 for special patch variables + * like tess factors and bounding boxes, and the generic patch + * variables will come after them. + */ + if (location >= VARYING_SLOT_PATCH0) + return location - VARYING_SLOT_PATCH0 + 4; + else if (location >= VARYING_SLOT_TESS_LEVEL_OUTER && + location <= VARYING_SLOT_BOUNDING_BOX1) + return location - VARYING_SLOT_TESS_LEVEL_OUTER; + else + unreachable("Unsupported variable in get_linked_variable_location."); +} + +static uint64_t +get_linked_variable_io_mask(nir_variable *variable, gl_shader_stage stage) +{ + const struct glsl_type *type = variable->type; + + if (nir_is_per_vertex_io(variable, stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + unsigned slots = glsl_count_attribute_slots(type, false); + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + glsl_get_length(type); + slots = DIV_ROUND_UP(component_count, 4); + } + + uint64_t mask = u_bit_consecutive64(0, slots); + return mask; +} + +nir_linked_io_var_info +nir_assign_linked_io_var_locations(nir_shader *producer, nir_shader *consumer) +{ + assert(producer); + assert(consumer); + + uint64_t producer_output_mask = 0; + uint64_t producer_patch_output_mask = 0; + + nir_foreach_variable(variable, &producer->outputs) { + uint64_t mask = get_linked_variable_io_mask(variable, producer->info.stage); + uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch); + + if (variable->data.patch) + producer_patch_output_mask |= mask << loc; + else + producer_output_mask |= mask << loc; + } + + uint64_t consumer_input_mask = 0; + uint64_t consumer_patch_input_mask = 0; + + nir_foreach_variable(variable, &consumer->inputs) { + uint64_t mask = get_linked_variable_io_mask(variable, consumer->info.stage); + uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch); + + if (variable->data.patch) + consumer_patch_input_mask |= mask << loc; + else + consumer_input_mask |= mask << loc; + } + + uint64_t io_mask = producer_output_mask | consumer_input_mask; + uint64_t patch_io_mask = producer_patch_output_mask | consumer_patch_input_mask; + + nir_foreach_variable(variable, &producer->outputs) { + uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch); + + if (variable->data.patch) + variable->data.driver_location = util_bitcount64(patch_io_mask & u_bit_consecutive64(0, loc)) * 4; + else + variable->data.driver_location = util_bitcount64(io_mask & u_bit_consecutive64(0, loc)) * 4; + } + + nir_foreach_variable(variable, &consumer->inputs) { + uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch); + + if (variable->data.patch) + variable->data.driver_location = util_bitcount64(patch_io_mask & u_bit_consecutive64(0, loc)) * 4; + else + variable->data.driver_location = util_bitcount64(io_mask & u_bit_consecutive64(0, loc)) * 4; + } + + nir_linked_io_var_info result = { + .num_linked_io_vars = util_bitcount64(io_mask), + .num_linked_patch_io_vars = util_bitcount64(patch_io_mask), + }; + + return result; +} -- GitLab From efa4976709afbbbfd430235bb8b71e6abb66d8e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 27 Apr 2020 12:22:03 +0200 Subject: [PATCH 7/9] radv: Use new linking helper to set default driver locations. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Samuel Pitoiset Part-of: --- src/amd/vulkan/radv_pipeline.c | 50 ++++++++++++++++++++++++++++++++++ src/amd/vulkan/radv_shader.h | 8 ++++++ 2 files changed, 58 insertions(+) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 2e535a33e39a..ae584328ab4b 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2254,6 +2254,54 @@ radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders) } } +static void +radv_set_linked_driver_locations(struct radv_pipeline *pipeline, nir_shader **shaders, + struct radv_shader_info infos[MESA_SHADER_STAGES]) +{ + bool has_tess = shaders[MESA_SHADER_TESS_CTRL]; + bool has_gs = shaders[MESA_SHADER_GEOMETRY]; + + if (!has_tess && !has_gs) + return; + + unsigned vs_info_idx = MESA_SHADER_VERTEX; + unsigned tes_info_idx = MESA_SHADER_TESS_EVAL; + + if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { + /* These are merged into the next stage */ + vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY; + tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL; + } + + if (has_tess) { + nir_linked_io_var_info vs2tcs = + nir_assign_linked_io_var_locations(shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_TESS_CTRL]); + nir_linked_io_var_info tcs2tes = + nir_assign_linked_io_var_locations(shaders[MESA_SHADER_TESS_CTRL], shaders[MESA_SHADER_TESS_EVAL]); + + infos[vs_info_idx].vs.num_linked_outputs = vs2tcs.num_linked_io_vars; + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs = vs2tcs.num_linked_io_vars; + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs = tcs2tes.num_linked_io_vars; + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars; + infos[tes_info_idx].tes.num_linked_inputs = tcs2tes.num_linked_io_vars; + infos[tes_info_idx].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars; + + if (has_gs) { + nir_linked_io_var_info tes2gs = + nir_assign_linked_io_var_locations(shaders[MESA_SHADER_TESS_EVAL], shaders[MESA_SHADER_GEOMETRY]); + + infos[tes_info_idx].tes.num_linked_outputs = tes2gs.num_linked_io_vars; + infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = tes2gs.num_linked_io_vars; + } + } else if (has_gs) { + nir_linked_io_var_info vs2gs = + nir_assign_linked_io_var_locations(shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_GEOMETRY]); + + infos[vs_info_idx].vs.num_linked_outputs = vs2gs.num_linked_io_vars; + infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = vs2gs.num_linked_io_vars; + } +} + static uint32_t radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state, uint32_t attrib_binding) @@ -2864,6 +2912,8 @@ void radv_create_shaders(struct radv_pipeline *pipeline, if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) radv_link_shaders(pipeline, nir); + radv_set_linked_driver_locations(pipeline, nir, infos); + for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if (nir[i]) { /* do this again since information such as outputs_read can be out-of-date */ diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 608900b5419f..d7c8119cd243 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -259,6 +259,7 @@ struct radv_shader_info { bool as_es; bool as_ls; bool export_prim_id; + uint8_t num_linked_outputs; } vs; struct { uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; @@ -273,6 +274,7 @@ struct radv_shader_info { unsigned output_prim; unsigned invocations; unsigned es_type; /* GFX9: VS or TES */ + uint8_t num_linked_inputs; } gs; struct { uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; @@ -284,6 +286,9 @@ struct radv_shader_info { bool ccw; bool point_mode; bool export_prim_id; + uint8_t num_linked_inputs; + uint8_t num_linked_patch_inputs; + uint8_t num_linked_outputs; } tes; struct { bool force_persample; @@ -321,6 +326,9 @@ struct radv_shader_info { unsigned tcs_vertices_out; uint32_t num_patches; uint32_t lds_size; + uint8_t num_linked_inputs; + uint8_t num_linked_outputs; + uint8_t num_linked_patch_outputs; } tcs; struct radv_streamout_info so; -- GitLab From ee5f04c9c9c02e42739924f9f0b6efd3f9077039 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 30 Mar 2020 17:23:25 +0200 Subject: [PATCH 8/9] aco: Use new default driver locations. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The way the new locations are set up has much fewer gaps between each I/O slot, so this results in a massive reduction in the LDS usage of tessellation shaders. Totals (GFX10): VGPRS: 3976792 -> 3974864 (-0.05 %) Code Size: 260552784 -> 260532860 (-0.01 %) bytes LDS: 48723 -> 30179 (-38.06 %) blocks Max Waves: 1053407 -> 1053583 (0.02 %) Totals from affected shaders (1407 shaders on GFX10): SGPRS: 59144 -> 59216 (0.12 %) VGPRS: 63024 -> 61096 (-3.06 %) Code Size: 2695508 -> 2675584 (-0.74 %) bytes LDS: 47109 -> 28565 (-39.36 %) blocks Max Waves: 12999 -> 13175 (1.35 %) Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- .../aco_instruction_selection_setup.cpp | 124 +++--------------- 1 file changed, 17 insertions(+), 107 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index dee86585c8ed..04dbe8f7e035 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -91,10 +91,6 @@ struct isel_context { /* GS inputs */ Temp gs_wave_id; - /* gathered information */ - uint64_t input_masks[MESA_SHADER_COMPUTE]; - uint64_t output_masks[MESA_SHADER_COMPUTE]; - /* VS output information */ bool export_clip_dists; unsigned num_clip_distances; @@ -744,7 +740,7 @@ setup_vs_output_info(isel_context *ctx, nir_shader *nir, if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer) pos_written |= 1 << 1; - uint64_t mask = ctx->output_masks[nir->info.stage]; + uint64_t mask = nir->info.outputs_written; while (mask) { int idx = u_bit_scan64(&mask); if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || @@ -789,17 +785,8 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) } nir_foreach_variable(variable, &nir->outputs) { - if (ctx->stage == vertex_geometry_gs) - variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4; - else if (ctx->stage == vertex_es || - ctx->stage == vertex_ls || - ctx->stage == vertex_tess_control_hs) - // TODO: make this more compact - variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; - else if (ctx->stage == vertex_vs || ctx->stage == ngg_vertex_gs) + if (ctx->stage == vertex_vs || ctx->stage == ngg_vertex_gs) variable->data.driver_location = variable->data.location * 4; - else - unreachable("Unsupported VS stage"); assert(variable->data.location >= 0 && variable->data.location <= UINT8_MAX); ctx->output_drv_loc_to_var_slot[MESA_SHADER_VERTEX][variable->data.driver_location / 4] = variable->data.location; @@ -818,7 +805,7 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info; outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */ } else if (ctx->stage == vertex_ls) { - ctx->tcs_num_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); + ctx->tcs_num_inputs = ctx->program->info->vs.num_linked_outputs; } if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) { @@ -831,19 +818,8 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) void setup_gs_variables(isel_context *ctx, nir_shader *nir) { - if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) { - nir_foreach_variable(variable, &nir->inputs) { - variable->data.driver_location = util_bitcount64(ctx->input_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4; - } + if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) ctx->program->config->lds_size = ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ - } else if (ctx->stage == geometry_gs) { - //TODO: make this more compact - nir_foreach_variable(variable, &nir->inputs) { - variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot)variable->data.location) * 4; - } - } else { - unreachable("Unsupported GS stage."); - } nir_foreach_variable(variable, &nir->outputs) { variable->data.driver_location = variable->data.location * 4; @@ -867,22 +843,15 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir) ctx->stage == vertex_tess_control_hs && ctx->args->options->key.tcs.input_vertices == nir->info.tess.tcs_vertices_out; - if (ctx->stage == tess_control_hs) { - ctx->tcs_num_inputs = ctx->args->options->key.tcs.num_inputs; - } else if (ctx->stage == vertex_tess_control_hs) { - ctx->tcs_num_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); - - if (ctx->tcs_in_out_eq) { - ctx->tcs_temp_only_inputs = ~nir->info.tess.tcs_cross_invocation_inputs_read & - ~nir->info.inputs_read_indirectly & - nir->info.inputs_read; - } - } else { - unreachable("Unsupported TCS shader stage"); + if (ctx->tcs_in_out_eq) { + ctx->tcs_temp_only_inputs = ~nir->info.tess.tcs_cross_invocation_inputs_read & + ~nir->info.inputs_read_indirectly & + nir->info.inputs_read; } - ctx->tcs_num_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - ctx->tcs_num_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); + ctx->tcs_num_inputs = ctx->program->info->tcs.num_linked_inputs; + ctx->tcs_num_outputs = ctx->program->info->tcs.num_linked_outputs; + ctx->tcs_num_patch_outputs = ctx->program->info->tcs.num_linked_patch_outputs; ctx->tcs_num_patches = get_tcs_num_patches( ctx->args->options->key.tcs.input_vertices, @@ -910,43 +879,30 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir) void setup_tcs_variables(isel_context *ctx, nir_shader *nir) { - nir_foreach_variable(variable, &nir->inputs) { - variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; - } - nir_foreach_variable(variable, &nir->outputs) { - variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; assert(variable->data.location >= 0 && variable->data.location <= UINT8_MAX); + if (variable->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) + ctx->tcs_tess_lvl_out_loc = variable->data.driver_location * 4u; + else if (variable->data.location == VARYING_SLOT_TESS_LEVEL_INNER) + ctx->tcs_tess_lvl_in_loc = variable->data.driver_location * 4u; + if (variable->data.patch) ctx->output_tcs_patch_drv_loc_to_var_slot[variable->data.driver_location / 4] = variable->data.location; else ctx->output_drv_loc_to_var_slot[MESA_SHADER_TESS_CTRL][variable->data.driver_location / 4] = variable->data.location; } - - ctx->tcs_tess_lvl_out_loc = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER) * 16u; - ctx->tcs_tess_lvl_in_loc = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER) * 16u; } void setup_tes_variables(isel_context *ctx, nir_shader *nir) { ctx->tcs_num_patches = ctx->args->options->key.tes.num_patches; - ctx->tcs_num_outputs = ctx->args->options->key.tes.tcs_num_outputs; - - nir_foreach_variable(variable, &nir->inputs) { - variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; - } + ctx->tcs_num_outputs = ctx->program->info->tes.num_linked_inputs; nir_foreach_variable(variable, &nir->outputs) { if (ctx->stage == tess_eval_vs || ctx->stage == ngg_tess_eval_gs) variable->data.driver_location = variable->data.location * 4; - else if (ctx->stage == tess_eval_es) - variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4; - else if (ctx->stage == tess_eval_geometry_gs) - variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4; - else - unreachable("Unsupported TES shader stage"); } if (ctx->stage == tess_eval_vs || ctx->stage == ngg_tess_eval_gs) { @@ -994,50 +950,6 @@ setup_variables(isel_context *ctx, nir_shader *nir) } } -void -get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const *shaders) -{ - for (unsigned i = 0; i < shader_count; i++) { - nir_shader *nir = shaders[i]; - if (nir->info.stage == MESA_SHADER_COMPUTE) - continue; - - uint64_t output_mask = 0; - nir_foreach_variable(variable, &nir->outputs) { - const glsl_type *type = variable->type; - if (nir_is_per_vertex_io(variable, nir->info.stage)) - type = type->fields.array; - unsigned slots = type->count_attribute_slots(false); - if (variable->data.compact) { - unsigned component_count = variable->data.location_frac + type->length; - slots = (component_count + 3) / 4; - } - output_mask |= ((1ull << slots) - 1) << variable->data.location; - } - - uint64_t input_mask = 0; - nir_foreach_variable(variable, &nir->inputs) { - const glsl_type *type = variable->type; - if (nir_is_per_vertex_io(variable, nir->info.stage)) - type = type->fields.array; - unsigned slots = type->count_attribute_slots(false); - if (variable->data.compact) { - unsigned component_count = variable->data.location_frac + type->length; - slots = (component_count + 3) / 4; - } - input_mask |= ((1ull << slots) - 1) << variable->data.location; - } - - ctx->output_masks[nir->info.stage] |= output_mask; - if (i + 1 < shader_count) - ctx->input_masks[shaders[i + 1]->info.stage] |= output_mask; - - ctx->input_masks[nir->info.stage] |= input_mask; - if (i) - ctx->output_masks[shaders[i - 1]->info.stage] |= input_mask; - } -} - unsigned lower_bit_size_callback(const nir_alu_instr *alu, void *_) { @@ -1321,8 +1233,6 @@ setup_isel_context(Program* program, program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); - get_io_masks(&ctx, shader_count, shaders); - unsigned scratch_size = 0; if (program->stage == gs_copy_vs) { assert(shader_count == 1); -- GitLab From e4e1a0ac1321730bbdeb4aef89ff14281a0b56eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 23 Apr 2020 15:13:31 +0200 Subject: [PATCH 9/9] radv: Use smaller esgs_itemsize for ACO. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Samuel Pitoiset Part-of: --- .../aco_instruction_selection_setup.cpp | 8 ----- src/amd/vulkan/radv_pipeline.c | 13 +++++---- src/amd/vulkan/radv_private.h | 3 +- src/amd/vulkan/radv_shader_info.c | 29 +++++++++++++------ 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 04dbe8f7e035..28645017b770 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -796,14 +796,6 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; setup_vs_output_info(ctx, nir, outinfo->export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, outinfo); - } else if (ctx->stage == vertex_geometry_gs || ctx->stage == vertex_es) { - /* TODO: radv_nir_shader_info_pass() already sets this but it's larger - * than it needs to be in order to set it better, we have to improve - * radv_nir_shader_info_pass() because gfx9_get_gs_info() uses - * esgs_itemsize and has to be done before compilation - */ - /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info; - outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */ } else if (ctx->stage == vertex_ls) { ctx->tcs_num_inputs = ctx->program->info->vs.num_linked_outputs; } diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index ae584328ab4b..33b93e7d0c9d 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2617,7 +2617,8 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, radv_nir_shader_info_pass(nir[MESA_SHADER_FRAGMENT], pipeline->layout, &keys[MESA_SHADER_FRAGMENT], - &infos[MESA_SHADER_FRAGMENT]); + &infos[MESA_SHADER_FRAGMENT], + pipeline->device->physical_device->use_aco); /* TODO: These are no longer used as keys we should refactor this */ keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id = @@ -2668,7 +2669,8 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, for (int i = 0; i < 2; i++) { radv_nir_shader_info_pass(combined_nir[i], pipeline->layout, &key, - &infos[MESA_SHADER_TESS_CTRL]); + &infos[MESA_SHADER_TESS_CTRL], + pipeline->device->physical_device->use_aco); } keys[MESA_SHADER_TESS_EVAL].tes.num_patches = @@ -2691,7 +2693,8 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, radv_nir_shader_info_pass(combined_nir[i], pipeline->layout, &keys[pre_stage], - &infos[MESA_SHADER_GEOMETRY]); + &infos[MESA_SHADER_GEOMETRY], + pipeline->device->physical_device->use_aco); } filled_stages |= (1 << pre_stage); @@ -2716,7 +2719,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, radv_nir_shader_info_init(&infos[i]); radv_nir_shader_info_pass(nir[i], pipeline->layout, - &keys[i], &infos[i]); + &keys[i], &infos[i], pipeline->device->physical_device->use_aco); } for (int i = 0; i < MESA_SHADER_STAGES; i++) { @@ -2975,7 +2978,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline, radv_nir_shader_info_pass(nir[MESA_SHADER_GEOMETRY], pipeline->layout, &key, - &info); + &info, pipeline->device->physical_device->use_aco); info.wave_size = 64; /* Wave32 not supported. */ info.ballot_bit_size = 64; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index ec4b45235c63..a54f0147fbcd 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -2366,7 +2366,8 @@ struct radv_shader_variant_key; void radv_nir_shader_info_pass(const struct nir_shader *nir, const struct radv_pipeline_layout *layout, const struct radv_shader_variant_key *key, - struct radv_shader_info *info); + struct radv_shader_info *info, + bool use_aco); void radv_nir_shader_info_init(struct radv_shader_info *info); diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index dfccba5600c3..c651f2162807 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -656,7 +656,8 @@ void radv_nir_shader_info_pass(const struct nir_shader *nir, const struct radv_pipeline_layout *layout, const struct radv_shader_variant_key *key, - struct radv_shader_info *info) + struct radv_shader_info *info, + bool use_aco) { struct nir_function *func = (struct nir_function *)exec_list_get_head_const(&nir->functions); @@ -808,17 +809,27 @@ radv_nir_shader_info_pass(const struct nir_shader *nir, key->vs_common_out.as_es) { struct radv_es_output_info *es_info = nir->info.stage == MESA_SHADER_VERTEX ? &info->vs.es_info : &info->tes.es_info; - uint32_t max_output_written = 0; - uint64_t output_mask = nir->info.outputs_written; - while (output_mask) { - const int i = u_bit_scan64(&output_mask); - unsigned param_index = shader_io_get_unique_index(i); + if (use_aco) { + /* The outputs don't contain gaps, se we can use the number of outputs */ + uint32_t num_outputs_written = nir->info.stage == MESA_SHADER_VERTEX + ? info->vs.num_linked_outputs + : info->tes.num_linked_outputs; + es_info->esgs_itemsize = num_outputs_written * 16; + } else { + /* The outputs may contain gaps, use the highest output index + 1 */ + uint32_t max_output_written = 0; + uint64_t output_mask = nir->info.outputs_written; + + while (output_mask) { + const int i = u_bit_scan64(&output_mask); + unsigned param_index = shader_io_get_unique_index(i); + + max_output_written = MAX2(param_index, max_output_written); + } - max_output_written = MAX2(param_index, max_output_written); + es_info->esgs_itemsize = (max_output_written + 1) * 16; } - - es_info->esgs_itemsize = (max_output_written + 1) * 16; } info->float_controls_mode = nir->info.float_controls_execution_mode; -- GitLab