diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 08f0cc0eed39953cc0f4e553fec1f29d25f88c5d..91679d9501f2090813defc97f6c6e8610b7297ee 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -127,6 +127,19 @@ void ac_nir_lower_ngg_ms(nir_shader *shader, unsigned wave_size); +void +ac_nir_apply_first_task_to_task_shader(nir_shader *shader); + +void +ac_nir_lower_task_outputs_to_mem(nir_shader *shader, + unsigned task_payload_entry_bytes, + unsigned task_num_entries); + +void +ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader, + unsigned task_payload_entry_bytes, + unsigned task_num_entries); + nir_ssa_def * ac_nir_cull_triangle(nir_builder *b, nir_ssa_def *initially_accepted, diff --git a/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c b/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c new file mode 100644 index 0000000000000000000000000000000000000000..a3a6eefee90edaf9d64f682481e5e9f10522528b --- /dev/null +++ b/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c @@ -0,0 +1,429 @@ +/* + * Copyright © 2022 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "ac_nir.h" +#include "nir_builder.h" +#include "amdgfxregs.h" +#include "u_math.h" + +/* + * These NIR passes are used to lower NIR cross-stage I/O intrinsics + * between task and mesh shader stages into the memory accesses + * that actually happen on the HW. 
+ * + */ + +typedef struct { + unsigned payload_entry_bytes; + unsigned draw_entry_bytes; + unsigned num_entries; +} lower_tsms_io_state; + +typedef struct { + nir_ssa_def *hw_workgroup_id; + nir_ssa_def *api_workgroup_id; +} add_first_task_to_workgroup_id_state; + +static bool filter_workgroup_id(const nir_instr *instr, + UNUSED const void *state) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_workgroup_id; +} + +static nir_ssa_def * +replace_workgroup_id_use_first_task(nir_builder *b, + nir_instr *instr, + void *state) +{ + add_first_task_to_workgroup_id_state *s = (add_first_task_to_workgroup_id_state *) state; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + assert(s->hw_workgroup_id); + + if (s->hw_workgroup_id == &intrin->dest.ssa) + return NULL; + + return s->api_workgroup_id; +} + +void +ac_nir_apply_first_task_to_task_shader(nir_shader *shader) +{ + /* The draw packets on RDNA2 GPUs don't support adding an offset to the task shader + * workgroups, so we have to emulate the firstTask feature for NV_mesh_shader. + * + * 1. Pass the address of the IB (indirect buffer) from the NV_mesh_shader draw call + * to the shader in an SGPR argument (2 SGPRs for address, 1 SGPR for stride). + * 2. Create a descriptor for the IB in the shader. + * 3. Load the firstTask value from the IB. + * 4. Add the firstTask value to the workgroup ID and use the result instead of the + * workgroup ID generated by the HW. + * + * NOTE: This pass must run _before_ lowering the task shader outputs to memory + * accesses. The lowering uses the workgroup ID and that must be unchanged + * because it has to be the real HW workgroup ID. + */ + + /* If the shader doesn't use workgroup ID, nothing to do here. */ + if (!BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_WORKGROUP_ID)) + return; + + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + assert(impl); + + nir_builder builder; + nir_builder *b = &builder; /* This is to avoid the & */ + nir_builder_init(b, impl); + b->cursor = nir_before_cf_list(&impl->body); + + /* This is the stride passed to vkCmdDrawMeshTasksIndirectNV */ + nir_ssa_def *ib_stride = nir_load_task_ib_stride(b); + nir_ssa_def *zero = nir_imm_int(b, 0); + nir_ssa_def *first_task = NULL; + + /* If the stride is zero, we assume that firstTask is also 0. */ + nir_if *if_stride = nir_push_if(b, nir_ine(b, ib_stride, zero)); + { + /* Address of the IB (indirect buffer) used by the current draw call. */ + nir_ssa_def *ib_addr = nir_load_task_ib_addr(b); + + /* Compose a 64-bit address from the IB address. */ + nir_ssa_def *addr = nir_pack_64_2x32_split(b, nir_channel(b, ib_addr, 0), nir_channel(b, ib_addr, 1)); + /* The IB needs to be addressed by draw ID * stride. */ + addr = nir_iadd(b, addr, nir_u2u64(b, nir_imul(b, nir_load_draw_id(b), ib_stride))); + /* Byte offset of the firstTask field in VkDrawMeshTasksIndirectCommandNV. */ + addr = nir_iadd_imm(b, addr, 4); + + first_task = nir_build_load_global(b, 1, 32, addr, .access = ACCESS_NON_WRITEABLE | ACCESS_COHERENT); + } + nir_pop_if(b, if_stride); + first_task = nir_if_phi(b, first_task, zero); + + /* NV_mesh_shader workgroups are 1 dimensional so we only care about X here.
*/ + nir_ssa_def *hw_workgroup_id = nir_load_workgroup_id(b, 32); + nir_ssa_def *api_workgroup_id_x = nir_iadd(b, nir_channel(b, hw_workgroup_id, 0), first_task); + nir_ssa_def *api_workgroup_id = nir_vec3(b, api_workgroup_id_x, zero, zero); + + add_first_task_to_workgroup_id_state state = { + .hw_workgroup_id = hw_workgroup_id, + .api_workgroup_id = api_workgroup_id, + }; + nir_shader_lower_instructions(shader, + filter_workgroup_id, + replace_workgroup_id_use_first_task, + &state); + + nir_validate_shader(shader, "after including firstTask in the task shader workgroup ID"); +} + +static nir_ssa_def * +task_workgroup_index(nir_builder *b, + lower_tsms_io_state *s) +{ + nir_ssa_def *id = nir_load_workgroup_id(b, 32); + + /* NV_mesh_shader: workgroups are always 1D, so index is the same as ID.x */ + return nir_channel(b, id, 0); +} + +static nir_ssa_def * +task_ring_entry_index(nir_builder *b, + lower_tsms_io_state *s) +{ + /* Task shader ring_entry shader argument: + * + * - It's a copy of write_ptr[31:0] from the task control buffer. + * - The same value (which is the initial value at dispatch) + * seems to be copied to all workgroups in the same dispatch, + * therefore a workgroup index needs to be added. + * - write_ptr must be initialized to num_entries so ring_entry needs + * AND with num_entries - 1 to get the correct meaning. + * Note that num_entries must be a power of two. + */ + nir_ssa_def *ring_entry = nir_load_task_ring_entry_amd(b); + nir_ssa_def *idx = nir_iadd_nuw(b, ring_entry, task_workgroup_index(b, s)); + return nir_iand_imm(b, idx, s->num_entries - 1); +} + +static nir_ssa_def * +task_draw_ready_bit(nir_builder *b, + lower_tsms_io_state *s) +{ + /* Value of the ready bit is 1 for odd and 0 for even passes through the draw ring. + * + * The ring_entry is a copy of the write_ptr. We use that to determine whether + * the current pass through the draw ring is odd or even, so we can write the + * correct value to the draw ready bit. + * + * This tells the firmware that it can now start launching mesh shader workgroups. + * The encoding of the last dword of the draw ring entry is: + * - bit 0: Draw ready bit. + * Its meaning flips on every pass through the entry. + * - bit 1: Packet end bit. + * The firmware uses this to mark the entry after the last one + * used by the current task dispatch. + * - bits [2:31] unused. + * + * Task shaders MUST write the draw ready bit to the draw ring + * before they finish. The firmware waits for the shader to write + * this bit before it reads the mesh dispatch size to launch the + * mesh shader workgroups. + * + * If the task shader doesn't write this bit, the HW hangs. + */ + + nir_ssa_def *ring_entry = nir_load_task_ring_entry_amd(b); + nir_ssa_def *workgroup_index = task_workgroup_index(b, s); + + nir_ssa_def *idx = nir_iadd_nuw(b, ring_entry, workgroup_index); + return nir_ubfe(b, idx, nir_imm_int(b, util_bitcount(s->num_entries - 1)), nir_imm_int(b, 1)); +} + +static nir_ssa_def * +mesh_ring_entry_index(nir_builder *b, + lower_tsms_io_state *s) +{ + /* Mesh shader ring_entry shader argument: + * + * - It's a copy of the read_ptr[31:0] from the task control buffer. + * - All workgroups in the same task->mesh dispatch get the same value, + * which is fine because they need to read the same entry. + * - read_ptr must be initialized to num_entries so ring_entry needs + * AND with num_entries - 1 to get the correct meaning. + * Note that num_entries must be a power of two. 
+ */ + return nir_iand_imm(b, nir_load_task_ring_entry_amd(b), s->num_entries - 1); +} + +static void +task_write_draw_ring(nir_builder *b, + nir_ssa_def *store_val, + unsigned const_off, + lower_tsms_io_state *s) +{ + nir_ssa_def *ptr = task_ring_entry_index(b, s); + nir_ssa_def *ring = nir_load_ring_task_draw_amd(b); + nir_ssa_def *scalar_off = nir_imul_imm(b, ptr, s->draw_entry_bytes); + nir_ssa_def *vector_off = nir_imm_int(b, 0); + + nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off, + .base = const_off, .memory_modes = nir_var_shader_out); +} + +static bool +filter_task_output_or_payload(const nir_instr *instr, + UNUSED const void *state) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_store_output || + intrin->intrinsic == nir_intrinsic_store_task_payload || + intrin->intrinsic == nir_intrinsic_load_task_payload; +} + +static nir_ssa_def * +lower_task_output_store(nir_builder *b, + nir_intrinsic_instr *intrin, + lower_tsms_io_state *s) +{ + /* NV_mesh_shader: + * Task shaders should only have 1 output: TASK_COUNT + * which is the number of launched mesh shader workgroups in 1D. + * + * Task count is one dimensional, but the HW needs X, Y, Z. + * Use the shader's value for X, and write Y=1, Z=1. + */ + + nir_ssa_def *store_val = nir_vec3(b, intrin->src[0].ssa, + nir_imm_int(b, 1), + nir_imm_int(b, 1)); + + task_write_draw_ring(b, store_val, 0, s); + return NIR_LOWER_INSTR_PROGRESS_REPLACE; +} + +static nir_ssa_def * +lower_task_payload_store(nir_builder *b, + nir_intrinsic_instr *intrin, + lower_tsms_io_state *s) +{ + unsigned write_mask = nir_intrinsic_write_mask(intrin); + unsigned base = nir_intrinsic_base(intrin); + + nir_ssa_def *store_val = intrin->src[0].ssa; + nir_ssa_def *addr = intrin->src[1].ssa; + nir_ssa_def *ring = nir_load_ring_task_payload_amd(b); + nir_ssa_def *ptr = task_ring_entry_index(b, s); + nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes); + + nir_store_buffer_amd(b, store_val, ring, addr, ring_off, .base = base, + .write_mask = write_mask, + .memory_modes = nir_var_mem_task_payload); + + return NIR_LOWER_INSTR_PROGRESS_REPLACE; +} + +static nir_ssa_def * +lower_taskmesh_payload_load(nir_builder *b, + nir_intrinsic_instr *intrin, + lower_tsms_io_state *s) +{ + unsigned base = nir_intrinsic_base(intrin); + unsigned num_components = intrin->dest.ssa.num_components; + unsigned bit_size = intrin->dest.ssa.bit_size; + + nir_ssa_def *ptr = + b->shader->info.stage == MESA_SHADER_TASK ? 
+ task_ring_entry_index(b, s) : + mesh_ring_entry_index(b, s); + + nir_ssa_def *addr = intrin->src[0].ssa; + nir_ssa_def *ring = nir_load_ring_task_payload_amd(b); + nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes); + + return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, .base = base, + .memory_modes = nir_var_mem_task_payload); +} + +static nir_ssa_def * +lower_task_intrinsics(nir_builder *b, + nir_instr *instr, + void *state) +{ + assert(instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + lower_tsms_io_state *s = (lower_tsms_io_state *)state; + + if (intrin->intrinsic == nir_intrinsic_store_output) + return lower_task_output_store(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_store_task_payload) + return lower_task_payload_store(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_load_task_payload) + return lower_taskmesh_payload_load(b, intrin, s); + else + unreachable("unsupported task shader intrinsic"); +} + +static void +emit_task_finale(nir_builder *b, lower_tsms_io_state *s) +{ + /* We assume there is always a single end block in the shader. */ + b->cursor = nir_after_block(nir_impl_last_block(b->impl)); + + /* Wait for all task_payload, output, SSBO and global stores to finish. */ + nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP, + .memory_scope = NIR_SCOPE_WORKGROUP, + .memory_semantics = NIR_MEMORY_ACQ_REL, + .memory_modes = nir_var_mem_task_payload | nir_var_shader_out | + nir_var_mem_ssbo | nir_var_mem_global); + + nir_ssa_def *invocation_index = nir_load_local_invocation_index(b); + nir_if *if_invocation_index_zero = nir_push_if(b, nir_ieq_imm(b, invocation_index, 0)); + { + /* Write ready bit. */ + nir_ssa_def *ready_bit = task_draw_ready_bit(b, s); + task_write_draw_ring(b, ready_bit, 12, s); + } + nir_pop_if(b, if_invocation_index_zero); +} + +void +ac_nir_lower_task_outputs_to_mem(nir_shader *shader, + unsigned task_payload_entry_bytes, + unsigned task_num_entries) +{ + assert(util_is_power_of_two_nonzero(task_num_entries)); + + lower_tsms_io_state state = { + .draw_entry_bytes = 16, + .payload_entry_bytes = task_payload_entry_bytes, + .num_entries = task_num_entries, + }; + + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_builder builder; + nir_builder *b = &builder; /* This is to avoid the & */ + nir_builder_init(b, impl); + + nir_shader_lower_instructions(shader, + filter_task_output_or_payload, + lower_task_intrinsics, + &state); + + emit_task_finale(b, &state); + nir_metadata_preserve(impl, nir_metadata_none); + + nir_validate_shader(shader, "after lowering task shader outputs to memory stores"); +} + +static bool +filter_mesh_input_load(const nir_instr *instr, + UNUSED const void *state) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_task_payload; +} + +static nir_ssa_def * +lower_mesh_intrinsics(nir_builder *b, + nir_instr *instr, + void *state) +{ + assert(instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + lower_tsms_io_state *s = (lower_tsms_io_state *)state; + + if (intrin->intrinsic == nir_intrinsic_load_task_payload) + return lower_taskmesh_payload_load(b, intrin, s); + else + unreachable("unsupported mesh shader intrinsic"); +} + +void +ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader, + unsigned 
task_payload_entry_bytes, + unsigned task_num_entries) +{ + assert(util_is_power_of_two_nonzero(task_num_entries)); + + lower_tsms_io_state state = { + .draw_entry_bytes = 16, + .payload_entry_bytes = task_payload_entry_bytes, + .num_entries = task_num_entries, + }; + + nir_shader_lower_instructions(shader, + filter_mesh_input_load, + lower_mesh_intrinsics, + &state); +} diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index 2a71360099cc4281865c86aac74e903fb6253308..e74caeda68da23c953f35570b1932995a09ca890 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -141,6 +141,9 @@ struct ac_shader_args { struct ac_arg workgroup_ids[3]; struct ac_arg tg_size; + /* Mesh and task shaders */ + struct ac_arg task_ring_entry; /* Pointer into the draw and payload rings. */ + /* Vulkan only */ struct ac_arg push_constants; struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 78c7b0a88db8193f440107bde4ad8d663c6825b2..0b511b534a078b1d0343ae324ef35d587a30b0ca 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -96,6 +96,7 @@ amd_common_files = files( 'ac_nir_cull.c', 'ac_nir_lower_esgs_io_to_mem.c', 'ac_nir_lower_global_access.c', + 'ac_nir_lower_taskmesh_io_to_mem.c', 'ac_nir_lower_tess_io_to_mem.c', 'ac_nir_lower_ngg.c', 'amd_family.c', diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h index 5787f0423161e88aeca1776f3f8b4a855b6874f3..c40330f3bc33346ae7d0456f3eefe4264f485f57 100644 --- a/src/amd/vulkan/radv_constants.h +++ b/src/amd/vulkan/radv_constants.h @@ -74,7 +74,9 @@ #define RING_GSVS_GS 4 #define RING_HS_TESS_FACTOR 5 #define RING_HS_TESS_OFFCHIP 6 -#define RING_PS_SAMPLE_POSITIONS 7 +#define RING_TS_DRAW 7 +#define RING_TS_PAYLOAD 8 +#define RING_PS_SAMPLE_POSITIONS 9 /* max number of descriptor sets */ #define MAX_SETS 32 @@ -89,6 +91,11 @@ */ #define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull +/* Size of each payload entry in the task payload ring. + * Spec requires minimum 16K bytes. + */ +#define RADV_TASK_PAYLOAD_ENTRY_BYTES 16384 + /* Number of invocations in each subgroup. */ #define RADV_SUBGROUP_SIZE 64 diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 9822deb815d78ffb4ccdea1d6ad71b76eadf07e4..493748ce7df22f169aebb694f937890ffc689b15 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -3331,6 +3331,24 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr ac_get_hs_info(&device->physical_device->rad_info, &device->hs); + /* Number of task shader ring entries. Needs to be a power of two. + * Use a low number on smaller chips so we don't waste space, + * but keep it high on bigger chips so it doesn't inhibit parallelism. + */ + switch (device->physical_device->rad_info.family) { + case CHIP_VANGOGH: + case CHIP_BEIGE_GOBY: + case CHIP_YELLOW_CARP: + device->task_num_entries = 256; + break; + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: + case CHIP_DIMGREY_CAVEFISH: + default: + device->task_num_entries = 1024; + break; + } + if (device->instance->debug_flags & RADV_DEBUG_HANG) { /* Enable GPU hangs detection and dump logs if a GPU hang is * detected. @@ -3707,6 +3725,10 @@ radv_fill_shader_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_ desc += 8; + /* Reserved for task shader rings. 
*/ + + desc += 8; + if (add_sample_positions) { /* add sample positions after all rings */ memcpy(desc, queue->device->sample_locations_1x, 8); @@ -4004,7 +4026,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave add_sample_positions) { uint32_t size = 0; if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || add_sample_positions) { - size = 112; /* 2 dword + 2 padding + 4 dword * 6 */ + size = 144; /* 2 dword + 2 padding + 4 dword * 8 */ if (add_sample_positions) size += 128; /* 64+32+16+8 = 120 bytes */ } else if (scratch_bo) { diff --git a/src/amd/vulkan/radv_nir_lower_abi.c b/src/amd/vulkan/radv_nir_lower_abi.c index 60c5fe5fc5b2ea834c958a49ddf690caeedf85e3..fd32c353514bd485c02afde5cc4a50d92f59282b 100644 --- a/src/amd/vulkan/radv_nir_lower_abi.c +++ b/src/amd/vulkan/radv_nir_lower_abi.c @@ -161,6 +161,21 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state) case nir_intrinsic_load_viewport_y_offset: return ac_nir_load_arg(b, &s->args->ac, s->args->ngg_viewport_translate[1]); + case nir_intrinsic_load_ring_task_draw_amd: + return load_ring(b, RING_TS_DRAW, s); + + case nir_intrinsic_load_ring_task_payload_amd: + return load_ring(b, RING_TS_PAYLOAD, s); + + case nir_intrinsic_load_task_ring_entry_amd: + return ac_nir_load_arg(b, &s->args->ac, s->args->ac.task_ring_entry); + + case nir_intrinsic_load_task_ib_addr: + return ac_nir_load_arg(b, &s->args->ac, s->args->task_ib_addr); + + case nir_intrinsic_load_task_ib_stride: + return ac_nir_load_arg(b, &s->args->ac, s->args->task_ib_stride); + default: unreachable("invalid NIR RADV ABI intrinsic."); } @@ -197,7 +212,12 @@ filter_abi_instr(const nir_instr *instr, intrin->intrinsic == nir_intrinsic_load_viewport_x_scale || intrin->intrinsic == nir_intrinsic_load_viewport_x_offset || intrin->intrinsic == nir_intrinsic_load_viewport_y_scale || - intrin->intrinsic == nir_intrinsic_load_viewport_y_offset; + intrin->intrinsic == nir_intrinsic_load_viewport_y_offset || + intrin->intrinsic == nir_intrinsic_load_ring_task_draw_amd || + intrin->intrinsic == nir_intrinsic_load_ring_task_payload_amd || + intrin->intrinsic == nir_intrinsic_load_task_ring_entry_amd || + intrin->intrinsic == nir_intrinsic_load_task_ib_addr || + intrin->intrinsic == nir_intrinsic_load_task_ib_stride; } void diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 3cb547ff6c5494cf4e93c84f0ff216ea63e7ecab..9021ea297f797bc0ff20388292e4a38fc752b7bf 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2726,6 +2726,9 @@ radv_link_shaders(struct radv_pipeline *pipeline, if (stages[MESA_SHADER_MESH].nir) { ordered_shaders[shader_count++] = stages[MESA_SHADER_MESH].nir; } + if (stages[MESA_SHADER_TASK].nir) { + ordered_shaders[shader_count++] = stages[MESA_SHADER_TASK].nir; + } if (stages[MESA_SHADER_COMPUTE].nir) { ordered_shaders[shader_count++] = stages[MESA_SHADER_COMPUTE].nir; } @@ -3167,6 +3170,8 @@ radv_get_wave_size(struct radv_device *device, gl_shader_stage stage, return info->cs.subgroup_size; } else if (stage == MESA_SHADER_FRAGMENT) return device->physical_device->ps_wave_size; + else if (stage == MESA_SHADER_TASK) + return device->physical_device->cs_wave_size; else return device->physical_device->ge_wave_size; } @@ -3492,6 +3497,16 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs; stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask; } + + 
if (stages[MESA_SHADER_TASK].nir) { + /* Task/mesh I/O uses the task ring buffers. */ + stages[MESA_SHADER_TASK].info.cs.uses_task_rings = true; + stages[MESA_SHADER_MESH].info.cs.uses_task_rings = true; + + stages[MESA_SHADER_TASK].info.workgroup_size = + ac_compute_cs_workgroup_size( + stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX); + } } static void @@ -4492,7 +4507,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout ac_nir_lower_global_access(stages[i].nir); radv_nir_lower_abi(stages[i].nir, device->physical_device->rad_info.chip_class, &stages[i].info, &stages[i].args, pipeline_key); - radv_optimize_nir_algebraic(stages[i].nir, io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE); + radv_optimize_nir_algebraic( + stages[i].nir, io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE || i == MESA_SHADER_TASK); if (stages[i].nir->info.bit_sizes_int & (8 | 16)) { if (device->physical_device->rad_info.chip_class >= GFX8) { @@ -4747,6 +4763,7 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, gl_shader_sta return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B230_SPI_SHADER_USER_DATA_GS_0; case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: return R_00B900_COMPUTE_USER_DATA_0; case MESA_SHADER_TESS_CTRL: return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 05240d07a2cf7a5237a8876f7497f163418d1af4..aa94c87c3073db34f51694c0927777c9961b2361 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -777,6 +777,9 @@ struct radv_device { uint32_t scratch_waves; uint32_t dispatch_initiator; + /* Number of entries in the task shader ring buffers. */ + uint32_t task_num_entries; + uint32_t gs_table_depth; struct ac_hs_info hs; @@ -2018,6 +2021,12 @@ radv_pipeline_has_mesh(const struct radv_pipeline *pipeline) return !!pipeline->shaders[MESA_SHADER_MESH]; } +static inline bool +radv_pipeline_has_task(const struct radv_pipeline *pipeline) +{ + return !!pipeline->shaders[MESA_SHADER_TASK]; +} + bool radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline); bool radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline); diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 460fa0e0b7f6ff51984ed8c0035247766825d151..a403ef37d7fba77f8ea85b852489764d7f237811 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -873,11 +873,18 @@ radv_shader_compile_to_nir(struct radv_device *device, const struct radv_pipelin /* Lower deref operations for compute shared memory. 
*/ if (nir->info.stage == MESA_SHADER_COMPUTE || + nir->info.stage == MESA_SHADER_TASK || nir->info.stage == MESA_SHADER_MESH) { + nir_variable_mode var_modes = nir_var_mem_shared; + + if (nir->info.stage == MESA_SHADER_TASK || + nir->info.stage == MESA_SHADER_MESH) + var_modes |= nir_var_mem_task_payload; + if (!nir->info.shared_memory_explicit_layout) { - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_var_info); + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, var_modes, shared_var_info); } - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); + NIR_PASS_V(nir, nir_lower_explicit_io, var_modes, nir_address_format_32bit_offset); if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { const unsigned chunk_size = 16; /* max single store size */ @@ -1042,6 +1049,15 @@ radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *sta ac_nir_lower_gs_inputs_to_mem(nir, device->physical_device->rad_info.chip_class, info->gs.num_linked_inputs); return true; + } else if (nir->info.stage == MESA_SHADER_TASK) { + ac_nir_apply_first_task_to_task_shader(nir); + ac_nir_lower_task_outputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES, + device->task_num_entries); + return true; + } else if (nir->info.stage == MESA_SHADER_MESH) { + ac_nir_lower_mesh_inputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES, + device->task_num_entries); + return true; } return false; @@ -1634,6 +1650,7 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B22C_EXCP_EN(excp_en); break; case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(wgp_mode); config_out->rsrc2 |= S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) | @@ -2305,6 +2322,8 @@ radv_get_shader_name(const struct radv_shader_info *info, gl_shader_stage stage) return "Compute Shader"; case MESA_SHADER_MESH: return "Mesh Shader as NGG"; + case MESA_SHADER_TASK: + return "Task Shader as CS"; default: return "Unknown shader"; }; @@ -2327,7 +2346,7 @@ radv_get_max_waves(const struct radv_device *device, struct radv_shader *shader, lds_per_wave = conf->lds_size * info->lds_encode_granularity + shader->info.ps.num_interp * 48; lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity); - } else if (stage == MESA_SHADER_COMPUTE) { + } else if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_TASK) { unsigned max_workgroup_size = shader->info.workgroup_size; lds_per_wave = align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index c158de552ff579586a3f1e25c58d345a46306cf7..373e3410e78565f85d1141b849139531139da2c8 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -146,7 +146,8 @@ enum radv_ud_index { AC_UD_NGG_CULLING_SETTINGS = 7, AC_UD_NGG_VIEWPORT = 8, AC_UD_FORCE_VRS_RATES = 9, - AC_UD_SHADER_START = 10, + AC_UD_TASK_RING_ENTRY = 10, + AC_UD_SHADER_START = 11, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_PROLOG_INPUTS, @@ -155,6 +156,9 @@ enum radv_ud_index { AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, AC_UD_CS_SBT_DESCRIPTORS, AC_UD_CS_RAY_LAUNCH_SIZE, + AC_UD_CS_TASK_RING_OFFSETS, + AC_UD_CS_TASK_DRAW_ID, + AC_UD_CS_TASK_IB, AC_UD_CS_MAX_UD, AC_UD_GS_MAX_UD, AC_UD_TCS_MAX_UD, @@ -345,6 +349,7 @@ struct 
radv_shader_info { bool uses_sbt; bool uses_ray_launch_size; + bool uses_task_rings; } cs; struct { uint64_t tes_inputs_read; diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index c4c95bf4d745a6dff892f10ac65216505711f0c6..0326ec455097d6feb97ea7646453663d3d900000 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -50,7 +50,8 @@ set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, uint8_ static void set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) { - bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; + bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS && + idx != AC_UD_CS_TASK_RING_OFFSETS; set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); } @@ -96,6 +97,8 @@ count_ms_user_sgprs(const struct radv_shader_info *info) if (info->vs.needs_draw_id) count++; + if (info->cs.uses_task_rings) + count++; return count; } @@ -157,18 +160,26 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i /* 2 user sgprs will always be allocated for scratch/rings */ user_sgpr_count += 2; + if (stage == MESA_SHADER_TASK) + user_sgpr_count += 2; /* task descriptors */ + /* prolog inputs */ if (info->vs.has_prolog) user_sgpr_count += 2; switch (stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: if (info->cs.uses_sbt) user_sgpr_count += 1; if (info->cs.uses_grid_size) user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2; if (info->cs.uses_ray_launch_size) user_sgpr_count += 3; + if (info->vs.needs_draw_id) + user_sgpr_count += 1; + if (info->cs.uses_task_rings) + user_sgpr_count += 4; /* ring_entry, 2x ib_addr, ib_stride */ break; case MESA_SHADER_FRAGMENT: break; @@ -212,7 +223,8 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i if (info->so.num_outputs) user_sgpr_count++; - uint32_t available_sgprs = chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; + uint32_t available_sgprs = + chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_TASK ? 32 : 16; uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; uint32_t num_desc_set = util_bitcount(info->desc_set_used_mask); @@ -370,6 +382,9 @@ declare_ms_input_sgprs(const struct radv_shader_info *info, struct radv_shader_a if (info->vs.needs_draw_id) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); } + if (info->cs.uses_task_rings) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry); + } } static void @@ -497,6 +512,9 @@ set_ms_input_locs(struct radv_shader_args *args, uint8_t *user_sgpr_idx) unsigned vs_num = args->ac.base_vertex.used + 3 * args->ac.num_work_groups.used + args->ac.draw_id.used; set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num); + + if (args->ac.task_ring_entry.used) + set_loc_shader(args, AC_UD_TASK_RING_ENTRY, user_sgpr_idx, 1); } void @@ -527,6 +545,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ if (args->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets); } + if (stage == MESA_SHADER_TASK) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->task_ring_offsets); + } /* To ensure prologs match the main VS, VS specific input SGPRs have to be placed before other * sgprs. 
@@ -534,6 +555,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ switch (stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: declare_global_input_sgprs(info, &user_sgpr_info, args); if (info->cs.uses_sbt) { @@ -551,6 +573,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.ray_launch_size); } + if (info->vs.needs_draw_id) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); + } + + if (info->cs.uses_task_rings) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry); + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->task_ib_addr); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->task_ib_stride); + } + for (int i = 0; i < 3; i++) { if (info->cs.uses_block_id[i]) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]); @@ -750,6 +782,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ uint8_t user_sgpr_idx = 0; set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx); + if (stage == MESA_SHADER_TASK) { + set_loc_shader_ptr(args, AC_UD_CS_TASK_RING_OFFSETS, &user_sgpr_idx); + } /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */ @@ -765,6 +800,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ switch (stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: if (args->ac.sbt_descriptors.used) { set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx); } @@ -775,6 +811,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_ if (args->ac.ray_launch_size.used) { set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3); } + if (args->ac.draw_id.used) { + set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1); + } + if (args->ac.task_ring_entry.used) { + set_loc_shader(args, AC_UD_TASK_RING_ENTRY, &user_sgpr_idx, 1); + } + if (args->task_ib_addr.used) { + assert(args->task_ib_stride.used); + set_loc_shader(args, AC_UD_CS_TASK_IB, &user_sgpr_idx, 3); + } break; case MESA_SHADER_VERTEX: if (args->ac.view_index.used) diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index ed202a09faedad24794724d58d4161911f0af2f4..b510c31d0ef1523d177d94e1d0db4fb91956e190 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -36,7 +36,10 @@ struct radv_shader_args { struct ac_shader_args ac; struct ac_arg descriptor_sets[MAX_SETS]; + /* User data 0/1. GFX: descriptor list, Compute: scratch BO */ struct ac_arg ring_offsets; + /* User data 2/3. same as the descriptor list above but for task shaders. 
*/ + struct ac_arg task_ring_offsets; /* Streamout */ struct ac_arg streamout_buffers; @@ -47,6 +50,10 @@ struct radv_shader_args { struct ac_arg ngg_viewport_scale[2]; struct ac_arg ngg_viewport_translate[2]; + /* Task shaders */ + struct ac_arg task_ib_addr; + struct ac_arg task_ib_stride; + struct ac_arg prolog_inputs; struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS]; diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 0b9ca2ace1429be2e2266e9c46b8e6059fadfdb3..adc30456f3a697508be877de36a625984e0bb379 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -590,9 +590,25 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS); switch (nir->info.stage) { case MESA_SHADER_COMPUTE: + case MESA_SHADER_TASK: for (int i = 0; i < 3; ++i) info->cs.block_size[i] = nir->info.workgroup_size[i]; info->cs.uses_ray_launch_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_RAY_LAUNCH_SIZE); + + /* Task shaders always need these for the I/O lowering even if + * the API shader doesn't actually use them. + */ + if (nir->info.stage == MESA_SHADER_TASK) { + /* Needed to address the IB to read firstTask. */ + info->vs.needs_draw_id |= + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_WORKGROUP_ID); + + /* Needed to address the task draw/payload rings. */ + info->cs.uses_block_id[0] = true; + + /* Needed for storing draw ready only on the 1st thread. */ + info->cs.uses_local_invocation_idx = true; + } break; case MESA_SHADER_FRAGMENT: info->ps.can_discard = nir->info.fs.uses_discard; diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 8f46d4eeff6a665c38b8148c39059ef833153f08..7fba34fd2292b34109ac0850d85526f0c456485d 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -145,6 +145,11 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) case nir_intrinsic_load_ring_tess_offchip_offset_amd: case nir_intrinsic_load_ring_esgs_amd: case nir_intrinsic_load_ring_es2gs_offset_amd: + case nir_intrinsic_load_ring_task_draw_amd: + case nir_intrinsic_load_ring_task_payload_amd: + case nir_intrinsic_load_task_ring_entry_amd: + case nir_intrinsic_load_task_ib_addr: + case nir_intrinsic_load_task_ib_stride: case nir_intrinsic_load_sample_positions_pan: case nir_intrinsic_load_workgroup_num_input_vertices_amd: case nir_intrinsic_load_workgroup_num_input_primitives_amd: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 7bf257f2f0326bde4a934a1ab6a765a48684c4d6..e0c3938d4895a9b7532c85cd715e03c8827aa364 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1273,6 +1273,15 @@ system_value("ring_tess_factors_offset_amd", 1) # Descriptor where ES outputs are stored for GS to read on GFX6-8 system_value("ring_esgs_amd", 4) system_value("ring_es2gs_offset_amd", 1) +# Address of the task shader draw ring (used for VARYING_SLOT_TASK_COUNT) +system_value("ring_task_draw_amd", 4) +# Address of the task shader payload ring (used for all other outputs) +system_value("ring_task_payload_amd", 4) +# Pointer into the draw and payload rings +system_value("task_ring_entry_amd", 1) +# Address and stride of the NV_mesh_shader indirect buffer, used to emulate firstTask +system_value("task_ib_addr", 2) +system_value("task_ib_stride", 1) # Number of patches processed by each TCS workgroup
system_value("tcs_num_patches_amd", 1)