From 3aa71a61ddebce731205caac079372501f2f8ccd Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 08:34:33 +0200 Subject: [PATCH 01/10] amd/registers: add missing TBA registers on GFX6-GFX8 Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/registers/amdgfxregs.json | 60 +++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/amd/registers/amdgfxregs.json b/src/amd/registers/amdgfxregs.json index 906fb4448008..bb8dffb2df8e 100644 --- a/src/amd/registers/amdgfxregs.json +++ b/src/amd/registers/amdgfxregs.json @@ -7913,6 +7913,66 @@ "name": "SPI_SHADER_PGM_HI_GS", "type_ref": "SPI_SHADER_TBA_HI_PS" }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45060, "to": "mm"}, + "name": "SPI_SHADER_TBA_HI_PS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45316, "to": "mm"}, + "name": "SPI_SHADER_TBA_HI_VS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45572, "to": "mm"}, + "name": "SPI_SHADER_TBA_HI_GS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45828, "to": "mm"}, + "name": "SPI_SHADER_TBA_HI_ES" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 46084, "to": "mm"}, + "name": "SPI_SHADER_TBA_HI_HS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 46340, "to": "mm"}, + "name": "SPI_SHADER_TBA_HI_LS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45056, "to": "mm"}, + "name": "SPI_SHADER_TBA_LO_PS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45312, "to": "mm"}, + "name": "SPI_SHADER_TBA_LO_VS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45568, "to": "mm"}, + "name": "SPI_SHADER_TBA_LO_GS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 45824, "to": "mm"}, + "name": "SPI_SHADER_TBA_LO_ES" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 46080, "to": "mm"}, + "name": "SPI_SHADER_TBA_LO_HS" + }, + { + "chips": ["gfx7", "fiji", "gfx6", "stoney", "gfx8"], + "map": {"at": 46336, "to": "mm"}, + "name": "SPI_SHADER_TBA_LO_LS" + }, { "chips": ["gfx7", "fiji", "gfx9", "gfx6", "stoney", "gfx8"], "map": {"at": 46116, "to": "mm"}, -- GitLab From 7e493e510b7722ea54138906e7bb3b05b58637e7 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 14:54:51 +0200 Subject: [PATCH 02/10] amd/registers: add some SQ_WAVE_* register definitions Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/registers/amdgfxregs.json | 123 ++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/src/amd/registers/amdgfxregs.json b/src/amd/registers/amdgfxregs.json index bb8dffb2df8e..fb7385452869 100644 --- a/src/amd/registers/amdgfxregs.json +++ b/src/amd/registers/amdgfxregs.json @@ -1487,6 +1487,19 @@ {"name": "RE_Z", "value": 2}, {"name": "EARLY_Z_THEN_RE_Z", "value": 3} ] + }, + "ExcpEn": { + "entries": [ + {"name": "INVALID", "value": 1}, + {"name": "INPUT_DENORMAL", "value": 2}, + {"name": "DIVIDE_BY_ZERO", "value": 4}, + {"name": "OVERFLOW", "value": 8}, + {"name": "UNDERFLOW", "value": 16}, + {"name": "INEXACT", "value": 32}, + {"name": "INT_DIVIDE_BY_ZERO", "value": 64}, + {"name": "ADDRESS_WATCH", "value": 128}, + {"name": "MEMORY_VIOLATION", "value": 256} + ] } }, "register_mappings": [ @@ -11457,6 +11470,42 @@ "map": {"at": 198988, "to": "mm"}, "name": "WD_POS_BUF_BASE_HI", "type_ref": "VGT_TF_MEMORY_BASE_HI" + }, + { + "chips": ["gfx8"], + "map": {"at": 2, "to": "hw"}, + "name": "SQ_HW_REG_STATUS", + "type_ref": "SQ_WAVE_STATUS_vi" + }, + { + "chips": ["gfx8"], + "map": {"at": 3, "to": "hw"}, + "name": "SQ_HW_REG_TRAP_STS", + "type_ref": "SQ_WAVE_TRAP_STS" + }, + { + "chips": ["gfx8"], + "map": {"at": 4, "to": "hw"}, + "name": "SQ_HW_REG_HW_ID", + "type_ref": "SQ_WAVE_HW_ID_cik_vi" + }, + { + "chips": ["gfx8"], + "map": {"at": 5, "to": "hw"}, + "name": "SQ_HW_REG_GPR_ALLOC", + "type_ref": "SQ_WAVE_GPR_ALLOC" + }, + { + "chips": ["gfx8"], + "map": {"at": 6, "to": "hw"}, + "name": "SQ_HW_REG_LDS_ALLOC", + "type_ref": "SQ_WAVE_LDS_ALLOC" + }, + { + "chips": ["gfx8"], + "map": {"at": 7, "to": "hw"}, + "name": "SQ_HW_REG_IB_STS", + "type_ref": "SQ_WAVE_IB_STS_cik_vi" } ], "register_types": { @@ -16048,6 +16097,80 @@ {"bits": [0, 7], "name": "PERF_SEL"}, {"bits": [28, 31], "name": "PERF_MODE"} ] + }, + "SQ_WAVE_GPR_ALLOC": { + "fields": [ + {"bits": [0, 5], "name": "VGPR_BASE"}, + {"bits": [8, 13], "name": "VGPR_SIZE"}, + {"bits": [16, 21], "name": "SGPR_BASE"}, + {"bits": [24, 27], "name": "SGPR_SIZE"} + ] + }, + "SQ_WAVE_LDS_ALLOC": { + "fields": [ + {"bits": [0, 7], "name": "LDS_BASE"}, + {"bits": [12, 20], "name": "LDS_SIZE"} + ] + }, + "SQ_WAVE_TRAP_STS": { + "fields": [ + {"bits": [0, 8], "enum_ref": "ExcpEn", "name": "EXCP"}, + {"bits": [10, 10], "name": "SAVE_CTX_vi"}, + {"bits": [16, 21], "name": "EXCP_CYCLE"}, + {"bits": [29, 31], "name": "DP_RATE"} + ] + }, + "SQ_WAVE_STATUS_vi": { + "fields": [ + {"bits": [0, 0], "name": "SCC"}, + {"bits": [1, 2], "name": "SPI_PRIO"}, + {"bits": [3, 4], "name": "USER_PRIO"}, + {"bits": [5, 5], "name": "PRIV"}, + {"bits": [6, 6], "name": "TRAP_EN"}, + {"bits": [7, 7], "name": "TTRACE_EN"}, + {"bits": [8, 8], "name": "EXPORT_RDY"}, + {"bits": [9, 9], "name": "EXECZ"}, + {"bits": [10, 10], "name": "VCCZ"}, + {"bits": [11, 11], "name": "IN_TG"}, + {"bits": [12, 12], "name": "IN_BARRIER"}, + {"bits": [13, 13], "name": "HALT"}, + {"bits": [14, 14], "name": "TRAP"}, + {"bits": [15, 15], "name": "TTRACE_CU_EN"}, + {"bits": [16, 16], "name": "VALID"}, + {"bits": [17, 17], "name": "ECC_ERR"}, + {"bits": [18, 18], "name": "SKIP_EXPORT"}, + {"bits": [19, 19], "name": "PERF_EN"}, + {"bits": [20, 20], "name": "COND_DBG_USER"}, + {"bits": [21, 21], "name": "COND_DBG_SYS"}, + {"bits": [22, 22], "name": "ALLOW_REPLAY"}, + {"bits": [23, 23], "name": "INST_ATC"}, + {"bits": [27, 27], "name": "MUST_EXPORT"} + ] + }, + "SQ_WAVE_IB_STS_cik_vi": { + "fields": [ + {"bits": [0, 3], "name": "VM_CNT"}, + {"bits": [4, 6], "name": "EXP_CNT"}, + {"bits": [8, 11], "name": "LGKM_CNT"}, + {"bits": [12, 14], "name": "VALU_CNT"}, + {"bits": [15, 15], "name": "FIRST_REPLAY_vi"}, + {"bits": [16, 19], "name": "RCNT_vi"} + ] + }, + "SQ_WAVE_HW_ID_cik_vi": { + "fields": [ + {"bits": [0, 3], "name": "WAVE_ID"}, + {"bits": [4, 5], "name": "SIMD_ID"}, + {"bits": [6, 7], "name": "PIPE_ID"}, + {"bits": [8, 11], "name": "CU_ID"}, + {"bits": [12, 12], "name": "SH_ID"}, + {"bits": [13, 14], "name": "SE_ID"}, + {"bits": [16, 19], "name": "TG_ID"}, + {"bits": [20, 23], "name": "VM_ID"}, + {"bits": [24, 26], "name": "QUEUE_ID"}, + {"bits": [27, 29], "name": "STATE_ID"}, + {"bits": [30, 31], "name": "ME_ID"} + ] } } } -- GitLab From baa9268eb68500e66c3d151f7c97da354552fa91 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 09:26:48 +0200 Subject: [PATCH 03/10] aco: add TBA/TMA/TTMP0-11 physical registers definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TBA/TMA scalar registers are only available on GFX6-GFX8. On GFX9+, TBA/TMA addr are stored in hardware registers and the number of TTMP scalar registers is thus increased by 4. Just keep in mind that tba_lo is actually ttmp0. Best would be to support ttmp registers in RA but that's more complicated. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_ir.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 0e5b69a7863b..00a2e2596a37 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -380,6 +380,20 @@ struct PhysReg { static constexpr PhysReg m0{124}; static constexpr PhysReg vcc{106}; static constexpr PhysReg vcc_hi{107}; +static constexpr PhysReg tba{108}; /* GFX6-GFX8 */ +static constexpr PhysReg tma{110}; /* GFX6-GFX8 */ +static constexpr PhysReg ttmp0{112}; +static constexpr PhysReg ttmp1{113}; +static constexpr PhysReg ttmp2{114}; +static constexpr PhysReg ttmp3{115}; +static constexpr PhysReg ttmp4{116}; +static constexpr PhysReg ttmp5{117}; +static constexpr PhysReg ttmp6{118}; +static constexpr PhysReg ttmp7{119}; +static constexpr PhysReg ttmp8{120}; +static constexpr PhysReg ttmp9{121}; +static constexpr PhysReg ttmp10{122}; +static constexpr PhysReg ttmp11{123}; static constexpr PhysReg sgpr_null{125}; /* GFX10+ */ static constexpr PhysReg exec{126}; static constexpr PhysReg exec_lo{126}; -- GitLab From a6146aa5980f972a11ee054a49bba9dc79b8bbd4 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 19 Aug 2020 09:41:42 +0200 Subject: [PATCH 04/10] aco: validate that SMEM operands can use fixed registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To fix a validation error when loading the scalar tma buffer descriptor because it's not a temp but a fixed reg (tma_lo/tma_hi). Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_validate.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 0d74458519c4..fce0e1a1189e 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -378,7 +378,8 @@ bool validate_ir(Program* program) } case Format::SMEM: { if (instr->operands.size() >= 1) - check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "SMEM operands must be sgpr", instr.get()); + check((instr->operands[0].isFixed() && !instr->operands[0].isConstant()) || + (instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr), "SMEM operands must be sgpr", instr.get()); if (instr->operands.size() >= 2) check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr), "SMEM offset must be constant or sgpr", instr.get()); -- GitLab From 9c46e6fca323390f3cb74d6e865d2883a4fbd453 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 18:39:20 +0200 Subject: [PATCH 05/10] aco: add a helper for building a trap handler shader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's way easier to write a trap handler shader using ACO IR instead of writing disassembly by hand + clrxasm + copy&paste. This trap handler is quite simple for now, it just loads a buffer descriptor from the TMA BO, it saves ttmp0-1 which contain various info about the faulty instruction, and it stores some hw registers about the wave/trap status. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 60 +++++++++++++++++++ src/amd/compiler/aco_ir.h | 3 + 2 files changed, 63 insertions(+) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 42b899e4c700..6f1f8b4e07e7 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11104,4 +11104,64 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, cleanup_cfg(program); } + +void select_trap_handler_shader(Program *program, struct nir_shader *shader, + ac_shader_config* config, + struct radv_shader_args *args) +{ + assert(args->options->chip_class == GFX8); + + init_program(program, compute_cs, args->shader_info, + args->options->chip_class, args->options->family, config); + + isel_context ctx = {}; + ctx.program = program; + ctx.args = args; + ctx.options = args->options; + ctx.stage = program->stage; + + ctx.block = ctx.program->create_and_insert_block(); + ctx.block->loop_nest_depth = 0; + ctx.block->kind = block_kind_top_level; + + program->workgroup_size = 1; /* XXX */ + + add_startpgm(&ctx); + append_logical_start(ctx.block); + + Builder bld(ctx.program, ctx.block); + + /* Load the buffer descriptor from TMA. */ + bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), + Operand(PhysReg{tma}, s2), Operand(0u)); + + /* Store TTMP0-TTMP1. */ + bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), + Operand(0u), Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true); + + uint32_t hw_regs_idx[] = { + 2, /* HW_REG_STATUS */ + 3, /* HW_REG_TRAP_STS */ + 4, /* HW_REG_HW_ID */ + 7, /* HW_REG_IB_STS */ + }; + + /* Store some hardware registers. */ + for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) { + /* "((size - 1) << 11) | register" */ + bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1), + ((20 - 1) << 11) | hw_regs_idx[i]); + + bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4), + Operand(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true); + } + + program->config->float_mode = program->blocks[0].fp_mode.val; + + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform; + bld.sopp(aco_opcode::s_endpgm); + + cleanup_cfg(program); +} } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 00a2e2596a37..18bc9bdd844f 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1665,6 +1665,9 @@ void select_program(Program *program, void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, ac_shader_config* config, struct radv_shader_args *args); +void select_trap_handler_shader(Program *program, struct nir_shader *shader, + ac_shader_config* config, + struct radv_shader_args *args); void lower_wqm(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); -- GitLab From a0814a873d50f65484b17927379fbb47cf90372e Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 21 Aug 2020 11:33:22 +0200 Subject: [PATCH 06/10] aco: skip unnecessary compiler pass for the trap handler program MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shader is written by hands with assigned registers, so most of the pass are unnecessary. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_interface.cpp | 71 ++++++++++++++++-------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 96a968d376a2..5d402495ae54 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -79,23 +79,26 @@ void aco_compile_shader(unsigned shader_count, aco_print_program(program.get(), stderr); } - /* Phi lowering */ - aco::lower_phis(program.get()); - aco::dominator_tree(program.get()); - validate(program.get()); - - /* Optimization */ - aco::value_numbering(program.get()); - aco::optimize(program.get()); - - /* cleanup and exec mask handling */ - aco::setup_reduce_temp(program.get()); - aco::insert_exec_mask(program.get()); - validate(program.get()); - - /* spilling and scheduling */ - aco::live live_vars = aco::live_var_analysis(program.get(), args->options); - aco::spill(program.get(), live_vars, args->options); + aco::live live_vars; + if (!args->is_trap_handler_shader) { + /* Phi lowering */ + aco::lower_phis(program.get()); + aco::dominator_tree(program.get()); + validate(program.get()); + + /* Optimization */ + aco::value_numbering(program.get()); + aco::optimize(program.get()); + + /* cleanup and exec mask handling */ + aco::setup_reduce_temp(program.get()); + aco::insert_exec_mask(program.get()); + validate(program.get()); + + /* spilling and scheduling */ + live_vars = aco::live_var_analysis(program.get(), args->options); + aco::spill(program.get(), live_vars, args->options); + } std::string llvm_ir; if (args->options->record_ir) { @@ -114,26 +117,30 @@ void aco_compile_shader(unsigned shader_count, if (program->collect_statistics) aco::collect_presched_stats(program.get()); - aco::schedule_program(program.get(), live_vars); - validate(program.get()); - /* Register Allocation */ - aco::register_allocation(program.get(), live_vars.live_out); - if (args->options->dump_shader) { - std::cerr << "After RA:\n"; - aco_print_program(program.get(), stderr); - } + if (!args->is_trap_handler_shader) { + aco::schedule_program(program.get(), live_vars); + validate(program.get()); - if (aco::validate_ra(program.get(), args->options)) { - std::cerr << "Program after RA validation failure:\n"; - aco_print_program(program.get(), stderr); - abort(); - } + /* Register Allocation */ + aco::register_allocation(program.get(), live_vars.live_out); + if (args->options->dump_shader) { + std::cerr << "After RA:\n"; + aco_print_program(program.get(), stderr); + } + + if (aco::validate_ra(program.get(), args->options)) { + std::cerr << "Program after RA validation failure:\n"; + aco_print_program(program.get(), stderr); + abort(); + } + + validate(program.get()); - validate(program.get()); + aco::ssa_elimination(program.get()); + } /* Lower to HW Instructions */ - aco::ssa_elimination(program.get()); aco::lower_to_hw_instr(program.get()); /* Insert Waitcnt */ -- GitLab From 8fd2f5c16d902708136764f7121aad471559fb23 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 18:44:07 +0200 Subject: [PATCH 07/10] radv: add a small interface for creating the trap handler shader Similar to the GS copy shader except that NIR is unused because the shader is written directly using ACO IR. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/compiler/aco_interface.cpp | 2 ++ src/amd/vulkan/radv_shader.c | 36 +++++++++++++++++++++++++++--- src/amd/vulkan/radv_shader.h | 3 +++ src/amd/vulkan/radv_shader_args.h | 1 + 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 5d402495ae54..8f45f503a824 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -72,6 +72,8 @@ void aco_compile_shader(unsigned shader_count, /* Instruction Selection */ if (args->is_gs_copy_shader) aco::select_gs_copy_shader(program.get(), shaders[0], &config, args); + else if (args->is_trap_handler_shader) + aco::select_trap_handler_shader(program.get(), shaders[0], &config, args); else aco::select_program(program.get(), shader_count, shaders, &config, args); if (args->options->dump_preoptir) { diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 1d227efe4afe..8d0cd9d4feb7 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1202,6 +1202,7 @@ shader_variant_compile(struct radv_device *device, struct radv_shader_info *info, struct radv_nir_compiler_options *options, bool gs_copy_shader, + bool trap_handler_shader, bool keep_shader_info, bool keep_statistic_info, struct radv_shader_binary **binary_out) @@ -1234,6 +1235,8 @@ shader_variant_compile(struct radv_device *device, args.options = options; args.shader_info = info; args.is_gs_copy_shader = gs_copy_shader; + args.is_trap_handler_shader = trap_handler_shader; + radv_declare_shader_args(&args, gs_copy_shader ? MESA_SHADER_VERTEX : shaders[shader_count - 1]->info.stage, @@ -1271,7 +1274,7 @@ shader_variant_compile(struct radv_device *device, if (keep_shader_info) { variant->nir_string = radv_dump_nir_shaders(shaders, shader_count); - if (!gs_copy_shader && !module->nir) { + if (!gs_copy_shader && !trap_handler_shader && !module->nir) { variant->spirv = malloc(module->size); if (!variant->spirv) { free(variant); @@ -1314,7 +1317,8 @@ radv_shader_variant_compile(struct radv_device *device, options.robust_buffer_access = device->robust_buffer_access; return shader_variant_compile(device, module, shaders, shader_count, stage, info, - &options, false, keep_shader_info, keep_statistic_info, binary_out); + &options, false, false, + keep_shader_info, keep_statistic_info, binary_out); } struct radv_shader_variant * @@ -1332,7 +1336,33 @@ radv_create_gs_copy_shader(struct radv_device *device, options.key.has_multiview_view_index = multiview; return shader_variant_compile(device, NULL, &shader, 1, stage, - info, &options, true, keep_shader_info, keep_statistic_info, binary_out); + info, &options, true, false, + keep_shader_info, keep_statistic_info, binary_out); +} + +struct radv_shader_variant * +radv_create_trap_handler_shader(struct radv_device *device) +{ + struct radv_nir_compiler_options options = {0}; + struct radv_shader_variant *shader = NULL; + struct radv_shader_binary *binary = NULL; + struct radv_shader_info info = {0}; + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_trap_handler"); + + options.explicit_scratch_args = true; + info.wave_size = 64; + + shader = shader_variant_compile(device, NULL, &b.shader, 1, + MESA_SHADER_COMPUTE, &info, &options, + false, true, true, false, &binary); + + ralloc_free(b.shader); + free(binary); + + return shader; } void diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index cb76e635dc5f..1638614091ce 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -488,6 +488,9 @@ radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir, bool multiview, bool keep_shader_info, bool keep_statistic_info); +struct radv_shader_variant * +radv_create_trap_handler_shader(struct radv_device *device); + void radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant); diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index 451077a9ede0..f01c63ffa1ac 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -69,6 +69,7 @@ struct radv_shader_args { struct ac_arg ngg_gs_state; bool is_gs_copy_shader; + bool is_trap_handler_shader; }; static inline struct radv_shader_args * -- GitLab From af3230e39e9a4fe848e8c859095db8dab6869ccf Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 18:51:46 +0200 Subject: [PATCH 08/10] radv: add initial trap handler support with RADV_TRAP_HANDLER=1 A trap handler is used to handle shader exceptions like memory violations, divide by zero etc. The trap handler shader code will help to identify the faulty shader/instruction and to report more information for better debugging. This has only been tested on GFX8, though it should work on GFX6-GFX7. It seems we need a different implemenation for GFX9+. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/vulkan/radv_debug.c | 56 +++++++++++++++++++++++++++++++ src/amd/vulkan/radv_debug.h | 3 ++ src/amd/vulkan/radv_device.c | 62 +++++++++++++++++++++++++++++++++++ src/amd/vulkan/radv_private.h | 5 +++ 4 files changed, 126 insertions(+) diff --git a/src/amd/vulkan/radv_debug.c b/src/amd/vulkan/radv_debug.c index 47d70b96499b..6ce76b77eaaa 100644 --- a/src/amd/vulkan/radv_debug.c +++ b/src/amd/vulkan/radv_debug.c @@ -36,6 +36,7 @@ #include "radv_shader.h" #define TRACE_BO_SIZE 4096 +#define TMA_BO_SIZE 4096 #define COLOR_RESET "\033[0m" #define COLOR_RED "\033[31m" @@ -678,3 +679,58 @@ fail: close(fd); unlink(path); } + +bool +radv_trap_handler_init(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + + /* Create the trap handler shader and upload it like other shaders. */ + device->trap_handler_shader = radv_create_trap_handler_shader(device); + if (!device->trap_handler_shader) { + fprintf(stderr, "radv: failed to create the trap handler shader.\n"); + return false; + } + + device->tma_bo = ws->buffer_create(ws, TMA_BO_SIZE, 8, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_CPU_ACCESS | + RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_ZERO_VRAM, + RADV_BO_PRIORITY_SCRATCH); + if (!device->tma_bo) + return false; + + device->tma_ptr = ws->buffer_map(device->tma_bo); + if (!device->tma_ptr) + return false; + + /* Upload a buffer descriptor to store various info from the trap. */ + uint64_t tma_va = radv_buffer_get_va(device->tma_bo) + 16; + uint32_t desc[4]; + + desc[0] = tma_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(tma_va >> 32); + desc[2] = TMA_BO_SIZE; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + memcpy(device->tma_ptr, desc, sizeof(desc)); + + return true; +} + +void +radv_trap_handler_finish(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + + if (unlikely(device->trap_handler_shader)) + radv_shader_variant_destroy(device, device->trap_handler_shader); + + if (unlikely(device->tma_bo)) + ws->buffer_destroy(device->tma_bo); +} diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 2e7c4694a155..787597f5da97 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -82,4 +82,7 @@ radv_print_spirv(const char *data, uint32_t size, FILE *fp); void radv_dump_enabled_options(struct radv_device *device, FILE *f); +bool radv_trap_handler_init(struct radv_device *device); +void radv_trap_handler_finish(struct radv_device *device); + #endif diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 5b93083913b9..68c0ccc2b37e 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -2817,6 +2817,19 @@ VkResult radv_CreateDevice( goto fail; } + if (getenv("RADV_TRAP_HANDLER")) { + /* TODO: Add support for more hardware. */ + assert(device->physical_device->rad_info.chip_class == GFX8); + + /* To get the disassembly of the faulty shaders, we have to + * keep some shader info around. + */ + keep_shader_info = true; + + if (!radv_trap_handler_init(device)) + goto fail; + } + device->keep_shader_info = keep_shader_info; result = radv_device_init_meta(device); if (result != VK_SUCCESS) @@ -2893,6 +2906,8 @@ fail: radv_thread_trace_finish(device); + radv_trap_handler_finish(device); + if (device->trace_bo) device->ws->buffer_destroy(device->trace_bo); @@ -2942,6 +2957,8 @@ void radv_DestroyDevice( VkPipelineCache pc = radv_pipeline_cache_to_handle(device->mem_cache); radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL); + radv_trap_handler_finish(device); + radv_destroy_shader_slabs(device); pthread_cond_destroy(&device->timeline_cond); @@ -3420,6 +3437,50 @@ radv_emit_global_shader_pointers(struct radv_queue *queue, } } +static void +radv_emit_trap_handler(struct radv_queue *queue, + struct radeon_cmdbuf *cs, + struct radeon_winsys_bo *tma_bo) +{ + struct radv_device *device = queue->device; + struct radeon_winsys_bo *tba_bo; + uint64_t tba_va, tma_va; + + if (!device->trap_handler_shader || !tma_bo) + return; + + tba_bo = device->trap_handler_shader->bo; + + tba_va = radv_buffer_get_va(tba_bo) + device->trap_handler_shader->bo_offset; + tma_va = radv_buffer_get_va(tma_bo); + + radv_cs_add_buffer(queue->device->ws, cs, tba_bo); + radv_cs_add_buffer(queue->device->ws, cs, tma_bo); + + if (queue->queue_family_index == RADV_QUEUE_GENERAL) { + uint32_t regs[] = {R_00B000_SPI_SHADER_TBA_LO_PS, + R_00B100_SPI_SHADER_TBA_LO_VS, + R_00B200_SPI_SHADER_TBA_LO_GS, + R_00B300_SPI_SHADER_TBA_LO_ES, + R_00B400_SPI_SHADER_TBA_LO_HS, + R_00B500_SPI_SHADER_TBA_LO_LS}; + + for (int i = 0; i < ARRAY_SIZE(regs); ++i) { + radeon_set_sh_reg_seq(cs, regs[i], 4); + radeon_emit(cs, tba_va >> 8); + radeon_emit(cs, tba_va >> 40); + radeon_emit(cs, tma_va >> 8); + radeon_emit(cs, tma_va >> 40); + } + } else { + radeon_set_sh_reg_seq(cs, R_00B838_COMPUTE_TBA_LO, 4); + radeon_emit(cs, tba_va >> 8); + radeon_emit(cs, tba_va >> 40); + radeon_emit(cs, tma_va >> 8); + radeon_emit(cs, tma_va >> 40); + } +} + static void radv_init_graphics_state(struct radeon_cmdbuf *cs, struct radv_queue *queue) { @@ -3724,6 +3785,7 @@ radv_get_preamble_cs(struct radv_queue *queue, compute_scratch_waves, compute_scratch_bo); radv_emit_graphics_scratch(queue, cs, scratch_size_per_wave, scratch_waves, scratch_bo); + radv_emit_trap_handler(queue, cs, queue->device->tma_bo); if (gds_bo) radv_cs_add_buffer(queue->device->ws, cs, gds_bo); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 1f5c4403b4e7..e8383c910b2e 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -847,6 +847,11 @@ struct radv_device { uint32_t thread_trace_buffer_size; int thread_trace_start_frame; + /* Trap handler. */ + struct radv_shader_variant *trap_handler_shader; + struct radeon_winsys_bo *tma_bo; /* Trap Memory Address */ + uint32_t *tma_ptr; + /* Overallocation. */ bool overallocation_disallowed; uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS]; -- GitLab From 8e97a61cfbb50e427159877a2b8110ec0f5ce8ef Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 08:49:11 +0200 Subject: [PATCH 09/10] radv: enable the trap handler and configure the shader exceptions When TRAP_PRESENT is not enabled, all traps and exceptions are ignored. Only EXCP_EN.mem_viol is currently supported because the other exceptions have to be tested/validated first. EXCP_EN.mem_viol is used to detect any sort of invalid memory access like VM fault. When a memory violation is reported, the hw jumps to the trap handler. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/vulkan/radv_shader.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 8d0cd9d4feb7..5ca57e045e42 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -805,12 +805,13 @@ radv_get_shader_binary_size(size_t code_size) return code_size + DEBUGGER_NUM_MARKERS * 4; } -static void radv_postprocess_config(const struct radv_physical_device *pdevice, +static void radv_postprocess_config(const struct radv_device *device, const struct ac_shader_config *config_in, const struct radv_shader_info *info, gl_shader_stage stage, struct ac_shader_config *config_out) { + const struct radv_physical_device *pdevice = device->physical_device; bool scratch_enabled = config_in->scratch_bytes_per_wave > 0; unsigned vgpr_comp_cnt = 0; unsigned num_input_vgprs = info->num_input_vgprs; @@ -836,6 +837,15 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice, config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) | S_00B12C_SCRATCH_EN(scratch_enabled); + if (device->trap_handler_shader) { + /* Enable the trap handler if requested and configure the + * shader exceptions like memory violation, etc. + * TODO: Enable (and validate) more exceptions. + */ + config_out->rsrc2 |= S_00B12C_TRAP_PRESENT(1) | + S_00B12C_EXCP_EN(1 << 8); /* mem_viol */ + } + if (!pdevice->use_ngg_streamout) { config_out->rsrc2 |= S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) | @@ -1108,7 +1118,7 @@ radv_shader_variant_create(struct radv_device *device, } variant->info = binary->info; - radv_postprocess_config(device->physical_device, &config, &binary->info, + radv_postprocess_config(device, &config, &binary->info, binary->stage, &variant->config); void *dest_ptr = radv_alloc_shader_memory(device, variant); -- GitLab From 48a910d1f9344ed46822b9a969c200e606613cf3 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 18 Aug 2020 18:52:35 +0200 Subject: [PATCH 10/10] radv: use the trap handler to detect faulty shaders/instructions It should reliably report the faulty shader but the faulty instruction is inacurate, especially for memory violations because it's reported when the addr is processed. It will be improved by emitting more wait-states. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/vulkan/radv_debug.c | 139 +++++++++++++++++++++++++++++++++++ src/amd/vulkan/radv_debug.h | 1 + src/amd/vulkan/radv_device.c | 4 + 3 files changed, 144 insertions(+) diff --git a/src/amd/vulkan/radv_debug.c b/src/amd/vulkan/radv_debug.c index 6ce76b77eaaa..ce68c2ddb02f 100644 --- a/src/amd/vulkan/radv_debug.c +++ b/src/amd/vulkan/radv_debug.c @@ -734,3 +734,142 @@ radv_trap_handler_finish(struct radv_device *device) if (unlikely(device->tma_bo)) ws->buffer_destroy(device->tma_bo); } + +static struct radv_shader_variant * +radv_get_faulty_shader(struct radv_device *device, uint64_t faulty_pc) +{ + struct radv_shader_variant *shader = NULL; + + mtx_lock(&device->shader_slab_mutex); + list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs) { + list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list) { + uint64_t offset = align_u64(s->bo_offset + s->code_size, 256); + uint64_t va = radv_buffer_get_va(s->bo); + + if (faulty_pc >= va + s->bo_offset && faulty_pc < va + offset) { + mtx_unlock(&device->shader_slab_mutex); + return s; + } + } + } + mtx_unlock(&device->shader_slab_mutex); + + return shader; +} + +static void +radv_dump_faulty_shader(struct radv_device *device, uint64_t faulty_pc) +{ + struct radv_shader_variant *shader; + uint64_t start_addr, end_addr; + uint32_t instr_offset; + + shader = radv_get_faulty_shader(device, faulty_pc); + if (!shader) + return; + + start_addr = radv_buffer_get_va(shader->bo) + shader->bo_offset; + end_addr = start_addr + shader->code_size; + instr_offset = faulty_pc - start_addr; + + fprintf(stderr, "Faulty shader found " + "VA=[0x%"PRIx64"-0x%"PRIx64"], instr_offset=%d\n", + start_addr, end_addr, instr_offset); + + /* Get the list of instructions. + * Buffer size / 4 is the upper bound of the instruction count. + */ + unsigned num_inst = 0; + struct radv_shader_inst *instructions = + calloc(shader->code_size / 4, sizeof(struct radv_shader_inst)); + + /* Split the disassembly string into instructions. */ + si_add_split_disasm(shader->disasm_string, start_addr, &num_inst, instructions); + + /* Print instructions with annotations. */ + for (unsigned i = 0; i < num_inst; i++) { + struct radv_shader_inst *inst = &instructions[i]; + + if (start_addr + inst->offset == faulty_pc) { + fprintf(stderr, "\n!!! Faulty instruction below !!!\n"); + fprintf(stderr, "%s\n", inst->text); + fprintf(stderr, "\n"); + } else { + fprintf(stderr, "%s\n", inst->text); + } + } + + free(instructions); +} + +struct radv_sq_hw_reg { + uint32_t status; + uint32_t trap_sts; + uint32_t hw_id; + uint32_t ib_sts; +}; + +static void +radv_dump_sq_hw_regs(struct radv_device *device) +{ + struct radv_sq_hw_reg *regs = (struct radv_sq_hw_reg *)&device->tma_ptr[6]; + + fprintf(stderr, "\nHardware registers:\n"); + ac_dump_reg(stderr, device->physical_device->rad_info.chip_class, + R_000002_SQ_HW_REG_STATUS, regs->status, ~0); + ac_dump_reg(stderr, device->physical_device->rad_info.chip_class, + R_000003_SQ_HW_REG_TRAP_STS, regs->trap_sts, ~0); + ac_dump_reg(stderr, device->physical_device->rad_info.chip_class, + R_000004_SQ_HW_REG_HW_ID, regs->hw_id, ~0); + ac_dump_reg(stderr, device->physical_device->rad_info.chip_class, + R_000007_SQ_HW_REG_IB_STS, regs->ib_sts, ~0); + fprintf(stderr, "\n\n"); +} + +void +radv_check_trap_handler(struct radv_queue *queue) +{ + enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index); + struct radv_device *device = queue->device; + struct radeon_winsys *ws = device->ws; + + /* Wait for the context to be idle in a finite time. */ + ws->ctx_wait_idle(queue->hw_ctx, ring, queue->queue_idx); + + /* Try to detect if the trap handler has been reached by the hw by + * looking at ttmp0 which should be non-zero if a shader exception + * happened. + */ + if (!device->tma_ptr[4]) + return; + +#if 0 + fprintf(stderr, "tma_ptr:\n"); + for (unsigned i = 0; i < 10; i++) + fprintf(stderr, "tma_ptr[%d]=0x%x\n", i, device->tma_ptr[i]); +#endif + + radv_dump_sq_hw_regs(device); + + uint32_t ttmp0 = device->tma_ptr[4]; + uint32_t ttmp1 = device->tma_ptr[5]; + + /* According to the ISA docs, 3.10 Trap and Exception Registers: + * + * "{ttmp1, ttmp0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}" + * + * "When the trap handler is entered, the PC of the faulting + * instruction is: (PC - PC_rewind * 4)." + * */ + uint8_t trap_id = (ttmp1 >> 16) & 0xff; + uint8_t ht = (ttmp1 >> 24) & 0x1; + uint8_t pc_rewind = (ttmp1 >> 25) & 0xf; + uint64_t pc = (ttmp0 | ((ttmp1 & 0x0000ffffull) << 32)) - (pc_rewind * 4); + + fprintf(stderr, "PC=0x%"PRIx64", trapID=%d, HT=%d, PC_rewind=%d\n", + pc, trap_id, ht, pc_rewind); + + radv_dump_faulty_shader(device, pc); + + abort(); +} diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 787597f5da97..103716493fb0 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -84,5 +84,6 @@ radv_dump_enabled_options(struct radv_device *device, FILE *f); bool radv_trap_handler_init(struct radv_device *device); void radv_trap_handler_finish(struct radv_device *device); +void radv_check_trap_handler(struct radv_queue *queue); #endif diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 68c0ccc2b37e..ec7ddb078386 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -4551,6 +4551,10 @@ radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission, if (queue->device->trace_bo) { radv_check_gpu_hangs(queue, cs_array[j]); } + + if (queue->device->tma_bo) { + radv_check_trap_handler(queue); + } } free(cs_array); -- GitLab