From 7de303489770840613d047762add8c55cf80de4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Sat, 15 Jan 2022 13:56:13 +0100 Subject: [PATCH 01/15] ac/nir: Add I/O lowering for task and mesh shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Task shaders store their output payload to VRAM where mesh shaders read from. There are two ring buffers: 1. Draw ring: this is where mesh dispatch sizes and the ready bit are stored. 2. Payload ring: this is where the optional payload is stored (up to 16K per task workgroup). Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/common/ac_nir.h | 13 + .../common/ac_nir_lower_taskmesh_io_to_mem.c | 429 ++++++++++++++++++ src/amd/common/meson.build | 1 + src/compiler/nir/nir_divergence_analysis.c | 5 + src/compiler/nir/nir_intrinsics.py | 9 + 5 files changed, 457 insertions(+) create mode 100644 src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 08f0cc0eed39..91679d9501f2 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -127,6 +127,19 @@ void ac_nir_lower_ngg_ms(nir_shader *shader, unsigned wave_size); +void +ac_nir_apply_first_task_to_task_shader(nir_shader *shader); + +void +ac_nir_lower_task_outputs_to_mem(nir_shader *shader, + unsigned task_payload_entry_bytes, + unsigned task_num_entries); + +void +ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader, + unsigned task_payload_entry_bytes, + unsigned task_num_entries); + nir_ssa_def * ac_nir_cull_triangle(nir_builder *b, nir_ssa_def *initially_accepted, diff --git a/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c b/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c new file mode 100644 index 000000000000..a3a6eefee90e --- /dev/null +++ b/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c @@ -0,0 +1,429 @@ +/* + * Copyright © 2022 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "ac_nir.h" +#include "nir_builder.h" +#include "amdgfxregs.h" +#include "u_math.h" + +/* + * These NIR passes are used to lower NIR cross-stage I/O intrinsics + * between task and mesh shader stages into the memory accesses + * that actually happen on the HW. 
+ * + */ + +typedef struct { + unsigned payload_entry_bytes; + unsigned draw_entry_bytes; + unsigned num_entries; +} lower_tsms_io_state; + +typedef struct { + nir_ssa_def *hw_workgroup_id; + nir_ssa_def *api_workgroup_id; +} add_first_task_to_workgroup_id_state; + +static bool filter_workgroup_id(const nir_instr *instr, + UNUSED const void *state) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_workgroup_id; +} + +static nir_ssa_def * +replace_workgroup_id_use_first_task(nir_builder *b, + nir_instr *instr, + void *state) +{ + add_first_task_to_workgroup_id_state *s = (add_first_task_to_workgroup_id_state *) state; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + assert(s->hw_workgroup_id); + + if (s->hw_workgroup_id == &intrin->dest.ssa) + return NULL; + + return s->api_workgroup_id; +} + +void +ac_nir_apply_first_task_to_task_shader(nir_shader *shader) +{ + /* The draw packets on RDNA2 GPUs don't support adding an offset to the task shader + * workgroups, so we have to emulate the firstTask feature for NV_mesh_shader. + * + * 1. Pass the address of the IB (indirect buffer) from the NV_mesh_shader draw call + * to the shader in an SGPR argument (2 SGPRs for address, 1 SGPR for stride). + * 2. Create a descriptor for the IB in the shader. + * 3. Load the firstTask value from the IB + * 4. Add the firstTask value the workgroup ID and use the result instead of the + * workgroup ID generated by the HW. + * + * NOTE: This pass must run _before_ lowering the task shader outputs to memory + * accesses. The lowering uses the workgroup ID and that must be unchanged + * because it has to be the real HW workgroup ID. + */ + + /* If the shader doesn't use workgroup ID, nothing to do here. */ + if (!BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_WORKGROUP_ID)) + return; + + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + assert(impl); + + nir_builder builder; + nir_builder *b = &builder; /* This is to avoid the & */ + nir_builder_init(b, impl); + b->cursor = nir_before_cf_list(&impl->body); + + /* This is the stride passed to vkCmdDrawMeshTasksIndirectNV */ + nir_ssa_def *ib_stride = nir_load_task_ib_stride(b); + nir_ssa_def *zero = nir_imm_int(b, 0); + nir_ssa_def *first_task = NULL; + + /* If the stride is zero, we assume that firstTask is also 0. */ + nir_if *if_stride = nir_push_if(b, nir_ine(b, ib_stride, zero)); + { + /* Address of the IB (indirect buffer) used by the current draw call. */ + nir_ssa_def *ib_addr = nir_load_task_ib_addr(b); + + /* Compose a 64-bit address from the IB address. */ + nir_ssa_def *addr = nir_pack_64_2x32_split(b, nir_channel(b, ib_addr, 0), nir_channel(b, ib_addr, 1)); + /* The IB needs to be addressed by draw ID * stride. */ + addr = nir_iadd(b, addr, nir_u2u64(b, nir_imul(b, nir_load_draw_id(b), ib_stride))); + /* Byte offset of the firstTask field in VkDrawMeshTasksIndirectCommandNV. */ + addr = nir_iadd_imm(b, addr, 4); + + first_task = nir_build_load_global(b, 1, 32, addr, .access = ACCESS_NON_WRITEABLE | ACCESS_COHERENT); + } + nir_pop_if(b, if_stride); + first_task = nir_if_phi(b, first_task, zero); + + /* NV_mesh_shader workgroups are 1 dimensional so we only care about X here. 
*/ + nir_ssa_def *hw_workgroup_id = nir_load_workgroup_id(b, 32); + nir_ssa_def *api_workgroup_id_x = nir_iadd(b, nir_channel(b, hw_workgroup_id, 0), first_task); + nir_ssa_def *api_workgroup_id = nir_vec3(b, api_workgroup_id_x, zero, zero); + + add_first_task_to_workgroup_id_state state = { + .hw_workgroup_id = hw_workgroup_id, + .api_workgroup_id = api_workgroup_id, + }; + nir_shader_lower_instructions(shader, + filter_workgroup_id, + replace_workgroup_id_use_first_task, + &state); + + nir_validate_shader(shader, "after including firstTask in the task shader workgroup ID"); +} + +static nir_ssa_def * +task_workgroup_index(nir_builder *b, + lower_tsms_io_state *s) +{ + nir_ssa_def *id = nir_load_workgroup_id(b, 32); + + /* NV_mesh_shader: workgroups are always 1D, so index is the same as ID.x */ + return nir_channel(b, id, 0); +} + +static nir_ssa_def * +task_ring_entry_index(nir_builder *b, + lower_tsms_io_state *s) +{ + /* Task shader ring_entry shader argument: + * + * - It's a copy of write_ptr[31:0] from the task control buffer. + * - The same value (which is the initial value at dispatch) + * seems to be copied to all workgroups in the same dispatch, + * therefore a workgroup index needs to be added. + * - write_ptr must be initialized to num_entries so ring_entry needs + * AND with num_entries - 1 to get the correct meaning. + * Note that num_entries must be a power of two. + */ + nir_ssa_def *ring_entry = nir_load_task_ring_entry_amd(b); + nir_ssa_def *idx = nir_iadd_nuw(b, ring_entry, task_workgroup_index(b, s)); + return nir_iand_imm(b, idx, s->num_entries - 1); +} + +static nir_ssa_def * +task_draw_ready_bit(nir_builder *b, + lower_tsms_io_state *s) +{ + /* Value of the ready bit is 1 for odd and 0 for even passes through the draw ring. + * + * The ring_entry is a copy of the write_ptr. We use that to determine whether + * the current pass through the draw ring is odd or even, so we can write the + * correct value to the draw ready bit. + * + * This tells the firmware that it can now start launching mesh shader workgroups. + * The encoding of the last dword of the draw ring entry is: + * - bit 0: Draw ready bit. + * Its meaning flips on every pass through the entry. + * - bit 1: Packet end bit. + * The firmware uses this to mark the entry after the last one + * used by the current task dispatch. + * - bits [2:31] unused. + * + * Task shaders MUST write the draw ready bit to the draw ring + * before they finish. The firmware waits for the shader to write + * this bit before it reads the mesh dispatch size to launch the + * mesh shader workgroups. + * + * If the task shader doesn't write this bit, the HW hangs. + */ + + nir_ssa_def *ring_entry = nir_load_task_ring_entry_amd(b); + nir_ssa_def *workgroup_index = task_workgroup_index(b, s); + + nir_ssa_def *idx = nir_iadd_nuw(b, ring_entry, workgroup_index); + return nir_ubfe(b, idx, nir_imm_int(b, util_bitcount(s->num_entries - 1)), nir_imm_int(b, 1)); +} + +static nir_ssa_def * +mesh_ring_entry_index(nir_builder *b, + lower_tsms_io_state *s) +{ + /* Mesh shader ring_entry shader argument: + * + * - It's a copy of the read_ptr[31:0] from the task control buffer. + * - All workgroups in the same task->mesh dispatch get the same value, + * which is fine because they need to read the same entry. + * - read_ptr must be initialized to num_entries so ring_entry needs + * AND with num_entries - 1 to get the correct meaning. + * Note that num_entries must be a power of two. 
+ */ + return nir_iand_imm(b, nir_load_task_ring_entry_amd(b), s->num_entries - 1); +} + +static void +task_write_draw_ring(nir_builder *b, + nir_ssa_def *store_val, + unsigned const_off, + lower_tsms_io_state *s) +{ + nir_ssa_def *ptr = task_ring_entry_index(b, s); + nir_ssa_def *ring = nir_load_ring_task_draw_amd(b); + nir_ssa_def *scalar_off = nir_imul_imm(b, ptr, s->draw_entry_bytes); + nir_ssa_def *vector_off = nir_imm_int(b, 0); + + nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off, + .base = const_off, .memory_modes = nir_var_shader_out); +} + +static bool +filter_task_output_or_payload(const nir_instr *instr, + UNUSED const void *state) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_store_output || + intrin->intrinsic == nir_intrinsic_store_task_payload || + intrin->intrinsic == nir_intrinsic_load_task_payload; +} + +static nir_ssa_def * +lower_task_output_store(nir_builder *b, + nir_intrinsic_instr *intrin, + lower_tsms_io_state *s) +{ + /* NV_mesh_shader: + * Task shaders should only have 1 output: TASK_COUNT + * which is the number of launched mesh shader workgroups in 1D. + * + * Task count is one dimensional, but the HW needs X, Y, Z. + * Use the shader's value for X, and write Y=1, Z=1. + */ + + nir_ssa_def *store_val = nir_vec3(b, intrin->src[0].ssa, + nir_imm_int(b, 1), + nir_imm_int(b, 1)); + + task_write_draw_ring(b, store_val, 0, s); + return NIR_LOWER_INSTR_PROGRESS_REPLACE; +} + +static nir_ssa_def * +lower_task_payload_store(nir_builder *b, + nir_intrinsic_instr *intrin, + lower_tsms_io_state *s) +{ + unsigned write_mask = nir_intrinsic_write_mask(intrin); + unsigned base = nir_intrinsic_base(intrin); + + nir_ssa_def *store_val = intrin->src[0].ssa; + nir_ssa_def *addr = intrin->src[1].ssa; + nir_ssa_def *ring = nir_load_ring_task_payload_amd(b); + nir_ssa_def *ptr = task_ring_entry_index(b, s); + nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes); + + nir_store_buffer_amd(b, store_val, ring, addr, ring_off, .base = base, + .write_mask = write_mask, + .memory_modes = nir_var_mem_task_payload); + + return NIR_LOWER_INSTR_PROGRESS_REPLACE; +} + +static nir_ssa_def * +lower_taskmesh_payload_load(nir_builder *b, + nir_intrinsic_instr *intrin, + lower_tsms_io_state *s) +{ + unsigned base = nir_intrinsic_base(intrin); + unsigned num_components = intrin->dest.ssa.num_components; + unsigned bit_size = intrin->dest.ssa.bit_size; + + nir_ssa_def *ptr = + b->shader->info.stage == MESA_SHADER_TASK ? 
+ task_ring_entry_index(b, s) : + mesh_ring_entry_index(b, s); + + nir_ssa_def *addr = intrin->src[0].ssa; + nir_ssa_def *ring = nir_load_ring_task_payload_amd(b); + nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes); + + return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, .base = base, + .memory_modes = nir_var_mem_task_payload); +} + +static nir_ssa_def * +lower_task_intrinsics(nir_builder *b, + nir_instr *instr, + void *state) +{ + assert(instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + lower_tsms_io_state *s = (lower_tsms_io_state *)state; + + if (intrin->intrinsic == nir_intrinsic_store_output) + return lower_task_output_store(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_store_task_payload) + return lower_task_payload_store(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_load_task_payload) + return lower_taskmesh_payload_load(b, intrin, s); + else + unreachable("unsupported task shader intrinsic"); +} + +static void +emit_task_finale(nir_builder *b, lower_tsms_io_state *s) +{ + /* We assume there is always a single end block in the shader. */ + b->cursor = nir_after_block(nir_impl_last_block(b->impl)); + + /* Wait for all task_payload, output, SSBO and global stores to finish. */ + nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP, + .memory_scope = NIR_SCOPE_WORKGROUP, + .memory_semantics = NIR_MEMORY_ACQ_REL, + .memory_modes = nir_var_mem_task_payload | nir_var_shader_out | + nir_var_mem_ssbo | nir_var_mem_global); + + nir_ssa_def *invocation_index = nir_load_local_invocation_index(b); + nir_if *if_invocation_index_zero = nir_push_if(b, nir_ieq_imm(b, invocation_index, 0)); + { + /* Write ready bit. */ + nir_ssa_def *ready_bit = task_draw_ready_bit(b, s); + task_write_draw_ring(b, ready_bit, 12, s); + } + nir_pop_if(b, if_invocation_index_zero); +} + +void +ac_nir_lower_task_outputs_to_mem(nir_shader *shader, + unsigned task_payload_entry_bytes, + unsigned task_num_entries) +{ + assert(util_is_power_of_two_nonzero(task_num_entries)); + + lower_tsms_io_state state = { + .draw_entry_bytes = 16, + .payload_entry_bytes = task_payload_entry_bytes, + .num_entries = task_num_entries, + }; + + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_builder builder; + nir_builder *b = &builder; /* This is to avoid the & */ + nir_builder_init(b, impl); + + nir_shader_lower_instructions(shader, + filter_task_output_or_payload, + lower_task_intrinsics, + &state); + + emit_task_finale(b, &state); + nir_metadata_preserve(impl, nir_metadata_none); + + nir_validate_shader(shader, "after lowering task shader outputs to memory stores"); +} + +static bool +filter_mesh_input_load(const nir_instr *instr, + UNUSED const void *state) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_task_payload; +} + +static nir_ssa_def * +lower_mesh_intrinsics(nir_builder *b, + nir_instr *instr, + void *state) +{ + assert(instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + lower_tsms_io_state *s = (lower_tsms_io_state *)state; + + if (intrin->intrinsic == nir_intrinsic_load_task_payload) + return lower_taskmesh_payload_load(b, intrin, s); + else + unreachable("unsupported mesh shader intrinsic"); +} + +void +ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader, + unsigned 
task_payload_entry_bytes, + unsigned task_num_entries) +{ + assert(util_is_power_of_two_nonzero(task_num_entries)); + + lower_tsms_io_state state = { + .draw_entry_bytes = 16, + .payload_entry_bytes = task_payload_entry_bytes, + .num_entries = task_num_entries, + }; + + nir_shader_lower_instructions(shader, + filter_mesh_input_load, + lower_mesh_intrinsics, + &state); +} diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 78c7b0a88db8..0b511b534a07 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -96,6 +96,7 @@ amd_common_files = files( 'ac_nir_cull.c', 'ac_nir_lower_esgs_io_to_mem.c', 'ac_nir_lower_global_access.c', + 'ac_nir_lower_taskmesh_io_to_mem.c', 'ac_nir_lower_tess_io_to_mem.c', 'ac_nir_lower_ngg.c', 'amd_family.c', diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 8f46d4eeff6a..7fba34fd2292 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -145,6 +145,11 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) case nir_intrinsic_load_ring_tess_offchip_offset_amd: case nir_intrinsic_load_ring_esgs_amd: case nir_intrinsic_load_ring_es2gs_offset_amd: + case nir_intrinsic_load_ring_task_draw_amd: + case nir_intrinsic_load_ring_task_payload_amd: + case nir_intrinsic_load_task_ring_entry_amd: + case nir_intrinsic_load_task_ib_addr: + case nir_intrinsic_load_task_ib_stride: case nir_intrinsic_load_sample_positions_pan: case nir_intrinsic_load_workgroup_num_input_vertices_amd: case nir_intrinsic_load_workgroup_num_input_primitives_amd: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 7bf257f2f032..e0c3938d4895 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1273,6 +1273,15 @@ system_value("ring_tess_factors_offset_amd", 1) # Descriptor where ES outputs are stored for GS to read on GFX6-8 system_value("ring_esgs_amd", 4) system_value("ring_es2gs_offset_amd", 1) +# Address of the task shader draw ring (used for VARYING_SLOT_TASK_COUNT) +system_value("ring_task_draw_amd", 4) +# Address of the task shader payload ring (used for all other outputs) +system_value("ring_task_payload_amd", 4) +# Pointer into the draw and payload rings +system_value("task_ring_entry_amd", 1) +# Pointer into the draw and payload rings +system_value("task_ib_addr", 2) +system_value("task_ib_stride", 1) # Number of patches processed by each TCS workgroup system_value("tcs_num_patches_amd", 1) -- GitLab From 285d20d3c505826391782a42795399cc9250500d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Sat, 15 Jan 2022 14:09:12 +0100 Subject: [PATCH 02/15] ac: Add task ring entry shader argument. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is going to be used by both task and mesh shaders for accessing the draw and payload ring buffers. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/common/ac_shader_args.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index 2a71360099cc..e74caeda68da 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -141,6 +141,9 @@ struct ac_shader_args { struct ac_arg workgroup_ids[3]; struct ac_arg tg_size; + /* Mesh and task shaders */ + struct ac_arg task_ring_entry; /* Pointer into the draw and payload rings. 
*/ + /* Vulkan only */ struct ac_arg push_constants; struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; -- GitLab From bb71d1092b4bfd026d36855a801e9ed60c263dff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:39:53 +0100 Subject: [PATCH 03/15] radv: Add radv_pipeline_has_task helper. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_private.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 05240d07a2cf..c1431e557048 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -2018,6 +2018,12 @@ radv_pipeline_has_mesh(const struct radv_pipeline *pipeline) return !!pipeline->shaders[MESA_SHADER_MESH]; } +static inline bool +radv_pipeline_has_task(const struct radv_pipeline *pipeline) +{ + return !!pipeline->shaders[MESA_SHADER_TASK]; +} + bool radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline); bool radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline); -- GitLab From 988600e522779e44f3e44141c5e4cd3fda104b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:36:42 +0100 Subject: [PATCH 04/15] radv: Set user data register for task shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_pipeline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 3cb547ff6c54..a752658496ac 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -4747,6 +4747,7 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, gl_shader_sta return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B230_SPI_SHADER_USER_DATA_GS_0; case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: return R_00B900_COMPUTE_USER_DATA_0; case MESA_SHADER_TESS_CTRL: return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 -- GitLab From c34aa784961aac34732e9f190b20769b2af98c35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:37:16 +0100 Subject: [PATCH 05/15] radv: Set wave size for task shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_pipeline.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index a752658496ac..bd23ec721e1c 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3167,6 +3167,8 @@ radv_get_wave_size(struct radv_device *device, gl_shader_stage stage, return info->cs.subgroup_size; } else if (stage == MESA_SHADER_FRAGMENT) return device->physical_device->ps_wave_size; + else if (stage == MESA_SHADER_TASK) + return device->physical_device->cs_wave_size; else return device->physical_device->ge_wave_size; } -- GitLab From 101a7321c4002eec9019f2e25d40d931f74e90b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:37:34 +0100 Subject: [PATCH 06/15] radv: Fill task shader info. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_pipeline.c | 10 ++++++++++ src/amd/vulkan/radv_shader.c | 2 +- src/amd/vulkan/radv_shader.h | 1 + src/amd/vulkan/radv_shader_info.c | 16 ++++++++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index bd23ec721e1c..7df2f58717dc 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3494,6 +3494,16 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs; stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask; } + + if (stages[MESA_SHADER_TASK].nir) { + /* Task/mesh I/O uses the task ring buffers. */ + stages[MESA_SHADER_TASK].info.cs.uses_task_rings = true; + stages[MESA_SHADER_MESH].info.cs.uses_task_rings = true; + + stages[MESA_SHADER_TASK].info.workgroup_size = + ac_compute_cs_workgroup_size( + stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX); + } } static void diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 460fa0e0b7f6..1a3b5d1ef8bb 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2327,7 +2327,7 @@ radv_get_max_waves(const struct radv_device *device, struct radv_shader *shader, lds_per_wave = conf->lds_size * info->lds_encode_granularity + shader->info.ps.num_interp * 48; lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity); - } else if (stage == MESA_SHADER_COMPUTE) { + } else if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_TASK) { unsigned max_workgroup_size = shader->info.workgroup_size; lds_per_wave = align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index c158de552ff5..f843776f1020 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -345,6 +345,7 @@ struct radv_shader_info { bool uses_sbt; bool uses_ray_launch_size; + bool uses_task_rings; } cs; struct { uint64_t tes_inputs_read; diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 0b9ca2ace142..adc30456f3a6 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -590,9 +590,25 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS); switch (nir->info.stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: for (int i = 0; i < 3; ++i) info->cs.block_size[i] = nir->info.workgroup_size[i]; info->cs.uses_ray_launch_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_RAY_LAUNCH_SIZE); + + /* Task shaders always need these for the I/O lowering even if + * the API shader doesn't actually use them. + */ + if (nir->info.stage == MESA_SHADER_TASK) { + /* Needed to address the IB to read firstTask. */ + info->vs.needs_draw_id |= + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_WORKGROUP_ID); + + /* Needed to address the task draw/payload rings. */ + info->cs.uses_block_id[0] = true; + + /* Needed for storing draw ready only on the 1st thread. 
*/ + info->cs.uses_local_invocation_idx = true; + } break; case MESA_SHADER_FRAGMENT: info->ps.can_discard = nir->info.fs.uses_discard; -- GitLab From b3ea6c610363c26cfc461b92c7a002b94a2761fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Sun, 23 Jan 2022 18:35:12 +0100 Subject: [PATCH 07/15] radv: Add task shader arguments. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mostly the same as for compute shaders, but with a few extras: task_ring_offsets: Same as what ring_offsets is to graphics shaders. Contains an address that points to a buffer that contains the ring buffer descriptors. task_ring_entry: Index that can be used to address the draw and payload rings. draw_id: Same meaning as in graphics shaders. task_ib_addr/task_ib_stride: Indirect buffer address and stride from the draw calls. These are used to emulate the firstTask feature of NV_mesh_shader. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_constants.h | 4 ++- src/amd/vulkan/radv_device.c | 6 ++++- src/amd/vulkan/radv_shader.h | 6 ++++- src/amd/vulkan/radv_shader_args.c | 42 +++++++++++++++++++++++++++++-- src/amd/vulkan/radv_shader_args.h | 7 ++++++ 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h index 5787f0423161..4b6d3b9667b4 100644 --- a/src/amd/vulkan/radv_constants.h +++ b/src/amd/vulkan/radv_constants.h @@ -74,7 +74,9 @@ #define RING_GSVS_GS 4 #define RING_HS_TESS_FACTOR 5 #define RING_HS_TESS_OFFCHIP 6 -#define RING_PS_SAMPLE_POSITIONS 7 +#define RING_TS_DRAW 7 +#define RING_TS_PAYLOAD 8 +#define RING_PS_SAMPLE_POSITIONS 9 /* max number of descriptor sets */ #define MAX_SETS 32 diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 9822deb815d7..63de77db804d 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -3707,6 +3707,10 @@ radv_fill_shader_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_ desc += 8; + /* Reserved for task shader rings. 
*/ + + desc += 8; + if (add_sample_positions) { /* add sample positions after all rings */ memcpy(desc, queue->device->sample_locations_1x, 8); @@ -4004,7 +4008,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave add_sample_positions) { uint32_t size = 0; if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || add_sample_positions) { - size = 112; /* 2 dword + 2 padding + 4 dword * 6 */ + size = 144; /* 2 dword + 2 padding + 4 dword * 8 */ if (add_sample_positions) size += 128; /* 64+32+16+8 = 120 bytes */ } else if (scratch_bo) { diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index f843776f1020..373e3410e785 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -146,7 +146,8 @@ enum radv_ud_index { AC_UD_NGG_CULLING_SETTINGS = 7, AC_UD_NGG_VIEWPORT = 8, AC_UD_FORCE_VRS_RATES = 9, - AC_UD_SHADER_START = 10, + AC_UD_TASK_RING_ENTRY = 10, + AC_UD_SHADER_START = 11, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_PROLOG_INPUTS, @@ -155,6 +156,9 @@ enum radv_ud_index { AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, AC_UD_CS_SBT_DESCRIPTORS, AC_UD_CS_RAY_LAUNCH_SIZE, + AC_UD_CS_TASK_RING_OFFSETS, + AC_UD_CS_TASK_DRAW_ID, + AC_UD_CS_TASK_IB, AC_UD_CS_MAX_UD, AC_UD_GS_MAX_UD, AC_UD_TCS_MAX_UD, diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index c4c95bf4d745..084c779934b9 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -50,7 +50,8 @@ set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, uint8_ static void set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) { - bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; + bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS && + idx != AC_UD_CS_TASK_RING_OFFSETS; set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); } @@ -157,18 +158,26 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i /* 2 user sgprs will always be allocated for scratch/rings */ user_sgpr_count += 2; + if (stage == MESA_SHADER_TASK) + user_sgpr_count += 2; /* task descriptors */ + /* prolog inputs */ if (info->vs.has_prolog) user_sgpr_count += 2; switch (stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: if (info->cs.uses_sbt) user_sgpr_count += 1; if (info->cs.uses_grid_size) user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2; if (info->cs.uses_ray_launch_size) user_sgpr_count += 3; + if (info->vs.needs_draw_id) + user_sgpr_count += 1; + if (info->cs.uses_task_rings) + user_sgpr_count += 4; /* ring_entry, 2x ib_addr, ib_stride */ break; case MESA_SHADER_FRAGMENT: break; @@ -212,7 +221,8 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i if (info->so.num_outputs) user_sgpr_count++; - uint32_t available_sgprs = chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; + uint32_t available_sgprs = + chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_TASK ? 
32 : 16; uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; uint32_t num_desc_set = util_bitcount(info->desc_set_used_mask); @@ -527,6 +537,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ if (args->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets); } + if (stage == MESA_SHADER_TASK) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->task_ring_offsets); + } /* To ensure prologs match the main VS, VS specific input SGPRs have to be placed before other * sgprs. @@ -534,6 +547,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ switch (stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: declare_global_input_sgprs(info, &user_sgpr_info, args); if (info->cs.uses_sbt) { @@ -551,6 +565,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.ray_launch_size); } + if (info->vs.needs_draw_id) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); + } + + if (info->cs.uses_task_rings) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry); + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->task_ib_addr); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->task_ib_stride); + } + for (int i = 0; i < 3; i++) { if (info->cs.uses_block_id[i]) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]); @@ -750,6 +774,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ uint8_t user_sgpr_idx = 0; set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx); + if (stage == MESA_SHADER_TASK) { + set_loc_shader_ptr(args, AC_UD_CS_TASK_RING_OFFSETS, &user_sgpr_idx); + } /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */ @@ -765,6 +792,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ switch (stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: if (args->ac.sbt_descriptors.used) { set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx); } @@ -775,6 +803,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ if (args->ac.ray_launch_size.used) { set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3); } + if (args->ac.draw_id.used) { + set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1); + } + if (args->ac.task_ring_entry.used) { + set_loc_shader(args, AC_UD_TASK_RING_ENTRY, &user_sgpr_idx, 1); + } + if (args->task_ib_addr.used) { + assert(args->task_ib_stride.used); + set_loc_shader(args, AC_UD_CS_TASK_IB, &user_sgpr_idx, 3); + } break; case MESA_SHADER_VERTEX: if (args->ac.view_index.used) diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index ed202a09faed..b510c31d0ef1 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -36,7 +36,10 @@ struct radv_shader_args { struct ac_shader_args ac; struct ac_arg descriptor_sets[MAX_SETS]; + /* User data 0/1. GFX: descriptor list, Compute: scratch BO */ struct ac_arg ring_offsets; + /* User data 2/3. same as the descriptor list above but for task shaders. 
*/ + struct ac_arg task_ring_offsets; /* Streamout */ struct ac_arg streamout_buffers; @@ -47,6 +50,10 @@ struct radv_shader_args { struct ac_arg ngg_viewport_scale[2]; struct ac_arg ngg_viewport_translate[2]; + /* Task shaders */ + struct ac_arg task_ib_addr; + struct ac_arg task_ib_stride; + struct ac_arg prolog_inputs; struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS]; -- GitLab From a8c1f10294db954ce60964dadb9f219b2dee95a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 24 Jan 2022 11:33:55 +0100 Subject: [PATCH 08/15] radv: Add task ring entry argument for mesh shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_shader_args.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index 084c779934b9..0326ec455097 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -97,6 +97,8 @@ count_ms_user_sgprs(const struct radv_shader_info *info) if (info->vs.needs_draw_id) count++; + if (info->cs.uses_task_rings) + count++; return count; } @@ -380,6 +382,9 @@ declare_ms_input_sgprs(const struct radv_shader_info *info, struct radv_shader_a if (info->vs.needs_draw_id) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); } + if (info->cs.uses_task_rings) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry); + } } static void @@ -507,6 +512,9 @@ set_ms_input_locs(struct radv_shader_args *args, uint8_t *user_sgpr_idx) unsigned vs_num = args->ac.base_vertex.used + 3 * args->ac.num_work_groups.used + args->ac.draw_id.used; set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num); + + if (args->ac.task_ring_entry.used) + set_loc_shader(args, AC_UD_TASK_RING_ENTRY, user_sgpr_idx, 1); } void -- GitLab From a8bdcf3c92b12bb551e11bebaf23fa7802f01075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Wed, 11 May 2022 12:38:02 +0200 Subject: [PATCH 09/15] radv: Implement task shader intrinsics in the ABI. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_nir_lower_abi.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/amd/vulkan/radv_nir_lower_abi.c b/src/amd/vulkan/radv_nir_lower_abi.c index 60c5fe5fc5b2..fd32c353514b 100644 --- a/src/amd/vulkan/radv_nir_lower_abi.c +++ b/src/amd/vulkan/radv_nir_lower_abi.c @@ -161,6 +161,21 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state) case nir_intrinsic_load_viewport_y_offset: return ac_nir_load_arg(b, &s->args->ac, s->args->ngg_viewport_translate[1]); + case nir_intrinsic_load_ring_task_draw_amd: + return load_ring(b, RING_TS_DRAW, s); + + case nir_intrinsic_load_ring_task_payload_amd: + return load_ring(b, RING_TS_PAYLOAD, s); + + case nir_intrinsic_load_task_ring_entry_amd: + return ac_nir_load_arg(b, &s->args->ac, s->args->ac.task_ring_entry); + + case nir_intrinsic_load_task_ib_addr: + return ac_nir_load_arg(b, &s->args->ac, s->args->task_ib_addr); + + case nir_intrinsic_load_task_ib_stride: + return ac_nir_load_arg(b, &s->args->ac, s->args->task_ib_stride); + default: unreachable("invalid NIR RADV ABI intrinsic."); } @@ -197,7 +212,12 @@ filter_abi_instr(const nir_instr *instr, intrin->intrinsic == nir_intrinsic_load_viewport_x_scale || intrin->intrinsic == nir_intrinsic_load_viewport_x_offset || intrin->intrinsic == nir_intrinsic_load_viewport_y_scale || - intrin->intrinsic == nir_intrinsic_load_viewport_y_offset; + intrin->intrinsic == nir_intrinsic_load_viewport_y_offset || + intrin->intrinsic == nir_intrinsic_load_ring_task_draw_amd || + intrin->intrinsic == nir_intrinsic_load_ring_task_payload_amd || + intrin->intrinsic == nir_intrinsic_load_task_ring_entry_amd || + intrin->intrinsic == nir_intrinsic_load_task_ib_addr || + intrin->intrinsic == nir_intrinsic_load_task_ib_stride; } void -- GitLab From 6e8f3677c7ea3d96596e86990c0fa87b99248a04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:37:53 +0100 Subject: [PATCH 10/15] radv: Enable nir_opt_offsets for task shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_pipeline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 7df2f58717dc..4a73eccc4a3c 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -4504,7 +4504,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout ac_nir_lower_global_access(stages[i].nir); radv_nir_lower_abi(stages[i].nir, device->physical_device->rad_info.chip_class, &stages[i].info, &stages[i].args, pipeline_key); - radv_optimize_nir_algebraic(stages[i].nir, io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE); + radv_optimize_nir_algebraic( + stages[i].nir, io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE || i == MESA_SHADER_TASK); if (stages[i].nir->info.bit_sizes_int & (8 | 16)) { if (device->physical_device->rad_info.chip_class >= GFX8) { -- GitLab From c17c523ec0ff09f3aa6989bad3473bec6cb52807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 8 Feb 2022 11:27:24 +0100 Subject: [PATCH 11/15] radv: Use I/O lowering for task and mesh shaders. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We set the number of task shader ring entries in radv_device based on the generous assumption that each CU can run task/mesh shaders with maximum occupancy. Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_constants.h | 5 +++++ src/amd/vulkan/radv_device.c | 18 ++++++++++++++++++ src/amd/vulkan/radv_private.h | 3 +++ src/amd/vulkan/radv_shader.c | 9 +++++++++ 4 files changed, 35 insertions(+) diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h index 4b6d3b9667b4..c40330f3bc33 100644 --- a/src/amd/vulkan/radv_constants.h +++ b/src/amd/vulkan/radv_constants.h @@ -91,6 +91,11 @@ */ #define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull +/* Size of each payload entry in the task payload ring. + * Spec requires minimum 16K bytes. + */ +#define RADV_TASK_PAYLOAD_ENTRY_BYTES 16384 + /* Number of invocations in each subgroup. */ #define RADV_SUBGROUP_SIZE 64 diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 63de77db804d..493748ce7df2 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -3331,6 +3331,24 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr ac_get_hs_info(&device->physical_device->rad_info, &device->hs); + /* Number of task shader ring entries. Needs to be a power of two. + * Use a low number on smaller chips so we don't waste space, + * but keep it high on bigger chips so it doesn't inhibit parallelism. + */ + switch (device->physical_device->rad_info.family) { + case CHIP_VANGOGH: + case CHIP_BEIGE_GOBY: + case CHIP_YELLOW_CARP: + device->task_num_entries = 256; + break; + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: + case CHIP_DIMGREY_CAVEFISH: + default: + device->task_num_entries = 1024; + break; + } + if (device->instance->debug_flags & RADV_DEBUG_HANG) { /* Enable GPU hangs detection and dump logs if a GPU hang is * detected. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index c1431e557048..aa94c87c3073 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -777,6 +777,9 @@ struct radv_device { uint32_t scratch_waves; uint32_t dispatch_initiator; + /* Number of entries in the task shader ring buffers. */ + uint32_t task_num_entries; + uint32_t gs_table_depth; struct ac_hs_info hs; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 1a3b5d1ef8bb..79394421dfe1 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1042,6 +1042,15 @@ radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *sta ac_nir_lower_gs_inputs_to_mem(nir, device->physical_device->rad_info.chip_class, info->gs.num_linked_inputs); return true; + } else if (nir->info.stage == MESA_SHADER_TASK) { + ac_nir_apply_first_task_to_task_shader(nir); + ac_nir_lower_task_outputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES, + device->task_num_entries); + return true; + } else if (nir->info.stage == MESA_SHADER_MESH) { + ac_nir_lower_mesh_inputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES, + device->task_num_entries); + return true; } return false; -- GitLab From 73c260594f04b8ee16eae51a0c4784d3f961aca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:38:49 +0100 Subject: [PATCH 12/15] radv: Postprocess task shader configuration. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_shader.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 79394421dfe1..c1fb12a1a1f5 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1643,6 +1643,7 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B22C_EXCP_EN(excp_en); break; case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(wgp_mode); config_out->rsrc2 |= S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) | -- GitLab From 123d0b8a75021a4f7356fb2bd5b03969c19ab31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Tue, 18 Jan 2022 16:39:10 +0100 Subject: [PATCH 13/15] radv: Allow linking task shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_pipeline.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 4a73eccc4a3c..9021ea297f79 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2726,6 +2726,9 @@ radv_link_shaders(struct radv_pipeline *pipeline, if (stages[MESA_SHADER_MESH].nir) { ordered_shaders[shader_count++] = stages[MESA_SHADER_MESH].nir; } + if (stages[MESA_SHADER_TASK].nir) { + ordered_shaders[shader_count++] = stages[MESA_SHADER_TASK].nir; + } if (stages[MESA_SHADER_COMPUTE].nir) { ordered_shaders[shader_count++] = stages[MESA_SHADER_COMPUTE].nir; } -- GitLab From 8dbde926593ad2b5d356cb57a117c5545b9f3ae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 27 Jan 2022 10:07:55 +0100 Subject: [PATCH 14/15] radv: Lower shared and task_payload variables in task/mesh shaders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_shader.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index c1fb12a1a1f5..ba8d1d28c674 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -873,11 +873,18 @@ radv_shader_compile_to_nir(struct radv_device *device, const struct radv_pipelin /* Lower deref operations for compute shared memory. 
*/ if (nir->info.stage == MESA_SHADER_COMPUTE || + nir->info.stage == MESA_SHADER_TASK || nir->info.stage == MESA_SHADER_MESH) { + nir_variable_mode var_modes = nir_var_mem_shared; + + if (nir->info.stage == MESA_SHADER_TASK || + nir->info.stage == MESA_SHADER_MESH) + var_modes |= nir_var_mem_task_payload; + if (!nir->info.shared_memory_explicit_layout) { - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_var_info); + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, var_modes, shared_var_info); } - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); + NIR_PASS_V(nir, nir_lower_explicit_io, var_modes, nir_address_format_32bit_offset); if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { const unsigned chunk_size = 16; /* max single store size */ -- GitLab From 0d0165db8ee4268fa4f539765c237b50da1a4883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 27 Jan 2022 10:08:12 +0100 Subject: [PATCH 15/15] radv: Print task shader stage name before disasm. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_shader.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index ba8d1d28c674..a403ef37d7fb 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2322,6 +2322,8 @@ radv_get_shader_name(const struct radv_shader_info *info, gl_shader_stage stage) return "Compute Shader"; case MESA_SHADER_MESH: return "Mesh Shader as NGG"; + case MESA_SHADER_TASK: + return "Task Shader as CS"; default: return "Unknown shader"; }; -- GitLab
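
For readers following the series, the sketch below models the task/mesh ring addressing that patch 01 lowers to in ac_nir_lower_taskmesh_io_to_mem.c, using plain integer arithmetic instead of NIR builders. The 16-byte draw ring entries, the 16 KiB payload entries (RADV_TASK_PAYLOAD_ENTRY_BYTES) and the power-of-two entry counts come from the patches; the helper names, the standalone program structure and the example ring_entry value are illustrative assumptions, not RADV code.

/* Illustrative model only -- not part of the patches above. It mirrors the
 * entry-index and ready-bit arithmetic from ac_nir_lower_taskmesh_io_to_mem.c
 * so the addressing scheme is easier to follow in isolation.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DRAW_ENTRY_BYTES    16u     /* X, Y, Z dispatch size + ready/end dword */
#define PAYLOAD_ENTRY_BYTES 16384u  /* RADV_TASK_PAYLOAD_ENTRY_BYTES */

/* Task shader side: ring_entry is a copy of write_ptr and is identical for
 * every workgroup of the dispatch, so the workgroup index must be added
 * before wrapping with the power-of-two entry count.
 */
static uint32_t
task_entry_index(uint32_t ring_entry, uint32_t workgroup_index, uint32_t num_entries)
{
   assert((num_entries & (num_entries - 1)) == 0); /* must be a power of two */
   return (ring_entry + workgroup_index) & (num_entries - 1);
}

/* The draw ready bit flips on every pass through the ring: it is the first
 * bit above the wrap-around mask of the unwrapped index, matching
 * ubfe(idx, bitcount(num_entries - 1), 1) in the pass.
 */
static uint32_t
draw_ready_bit(uint32_t ring_entry, uint32_t workgroup_index, uint32_t num_entries)
{
   return ((ring_entry + workgroup_index) / num_entries) & 1u;
}

int
main(void)
{
   const uint32_t num_entries = 256; /* e.g. the small-chip setting from patch 11 */
   const uint32_t ring_entry = 254;  /* hypothetical write_ptr copied into the SGPR arg */

   for (uint32_t wg = 0; wg < 4; wg++) {
      uint32_t idx = task_entry_index(ring_entry, wg, num_entries);
      printf("workgroup %u: draw entry at byte %u (ready bit %u), payload entry at byte %u\n",
             wg, idx * DRAW_ENTRY_BYTES,
             draw_ready_bit(ring_entry, wg, num_entries),
             idx * PAYLOAD_ENTRY_BYTES);
   }
   return 0;
}

With ring_entry = 254 the third and fourth workgroups wrap back to entries 0 and 1 and the ready bit flips to 1, which is exactly the behaviour the comments in task_draw_ready_bit() describe: the firmware distinguishes odd and even passes through the draw ring by that bit. The mesh shader side differs only in that it uses read_ptr directly, without adding a workgroup index.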