From 34d481fd1f689805e0f41a8907bd00f96270fbfc Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 19 May 2020 13:26:21 +0100 Subject: [PATCH 01/21] aco: use num_opcodes instead of last_opcode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 332d7a1987b9..a07ea8092fc9 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1323,19 +1323,19 @@ ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode * aco_opcode get_ordered(aco_opcode op) { aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::last_opcode; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::num_opcodes; } aco_opcode get_unordered(aco_opcode op) { aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::last_opcode; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::num_opcodes; } aco_opcode get_inverse(aco_opcode op) { aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::last_opcode; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::num_opcodes; } bool is_cmp(aco_opcode op) @@ -1650,7 +1650,7 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) return false; aco_opcode new_opcode = get_inverse(cmp->opcode); - if (new_opcode == aco_opcode::last_opcode) + if (new_opcode == aco_opcode::num_opcodes) return false; if (cmp->operands[0].isTemp()) -- GitLab From b6d9e45f473edf4a3cfa86963b1849365f2297b1 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 19 May 2020 11:53:44 +0100 Subject: [PATCH 02/21] aco: improve code for f2{i,u}{8,16} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use sub-dword definitions so that the RA can use SDWA No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 38bf449c83b2..0b59a7e2e918 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2327,33 +2327,31 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_f2i8: case nir_op_f2i16: { Temp src = get_alu_src(ctx, instr->src[0]); + Temp tmp = dst.type() == RegType::vgpr ? 
dst : bld.tmp(v1); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src); + src = bld.vop1(aco_opcode::v_cvt_i16_f16, Definition(tmp), src); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src); + src = bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(tmp), src); else - src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src); + src = bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(tmp), src); - if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); - else + if (dst.type() != RegType::vgpr) bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); break; } case nir_op_f2u8: case nir_op_f2u16: { Temp src = get_alu_src(ctx, instr->src[0]); + Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v1); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_u16_f16, Definition(tmp), src); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(tmp), src); else - src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(tmp), src); - if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + if (dst.type() != RegType::vgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); break; } case nir_op_f2i32: { -- GitLab From a8f800a836200f24607065fe172e51045baf9112 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 11 Jun 2020 14:35:13 +0100 Subject: [PATCH 03/21] aco: use p_as_uniform in emit_vop1_instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 56 +++++-------------- 1 file changed, 15 insertions(+), 41 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 0b59a7e2e918..3c72f0994111 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -640,7 +640,11 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); - bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); + else + bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); } void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) @@ -2326,32 +2330,22 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_f2i8: case nir_op_f2i16: { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = dst.type() == RegType::vgpr ? 
dst : bld.tmp(v1); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_i16_f16, Definition(tmp), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(tmp), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); else - src = bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(tmp), src); - - if (dst.type() != RegType::vgpr) - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); break; } case nir_op_f2u8: case nir_op_f2u16: { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v1); if (instr->src[0].src.ssa->bit_size == 16) - bld.vop1(aco_opcode::v_cvt_u16_f16, Definition(tmp), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); else if (instr->src[0].src.ssa->bit_size == 32) - bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(tmp), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); else - bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(tmp), src); - - if (dst.type() != RegType::vgpr) - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); break; } case nir_op_f2i32: { @@ -2365,19 +2359,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); } } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); } else if (instr->src[0].src.ssa->bit_size == 64) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -2396,19 +2380,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); } } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); } else if (instr->src[0].src.ssa->bit_size == 64) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); -- GitLab From 1b6a319c15f3c63acb0384c47a94fb40f2aeb17d Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 15 May 2020 13:58:20 +0100 Subject: [PATCH 04/21] aco: add and set precise flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. 
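As a minimal sketch of how the flag is meant to flow (based on the hunks below, with dst/src0/src1 as placeholder temporaries): instruction selection copies NIR's exact bit into the builder, and every definition created through that builder then reports isPrecise().

    Builder bld(ctx->program, ctx->block);
    bld.is_precise = instr->exact;   /* NIR marks the ALU op as exact/precise */
    /* definitions created through bld now carry the precise bit */
    bld.vop2(aco_opcode::v_mul_f32, Definition(dst), src0, src1);

Later patches in this series check isPrecise() before fusing a separate mul+add into mad/fma.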
Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_builder_h.py | 8 ++++++++ src/amd/compiler/aco_instruction_selection.cpp | 7 +++++-- src/amd/compiler/aco_ir.h | 13 ++++++++++++- src/amd/compiler/aco_opt_value_numbering.cpp | 2 ++ src/amd/compiler/aco_print_ir.cpp | 2 ++ 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index eb655471c902..edd5f3fda645 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -166,11 +166,18 @@ public: std::vector> *instructions; std::vector>::iterator it; + bool is_precise = false; Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {} Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {} Builder(Program *pgm, std::vector> *instrs) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(instrs) {} + Builder precise() const { + Builder res = *this; + res.is_precise = true; + return res; + }; + void moveEnd(Block *block) { instructions = &block->instructions; } @@ -524,6 +531,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), ${num_operands}, ${num_definitions}); % for i in range(num_definitions): instr->definitions[${i}] = def${i}; + instr->definitions[${i}].setPrecise(is_precise); % endfor % for i in range(num_operands): instr->operands[${i}] = op${i}.op; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 3c72f0994111..0e9f5f0f609c 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -590,6 +590,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bool commutative, bool swap_srcs=false, bool flush_denorms = false) { Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 
0 : 1]); if (src1.type() == RegType::sgpr) { @@ -628,6 +630,7 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode src2 = as_vgpr(ctx, src2); Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; if (flush_denorms && ctx->program->chip_class < GFX9) { assert(dst.size() == 1); Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2); @@ -640,6 +643,7 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; if (dst.type() == RegType::sgpr) bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); @@ -1041,6 +1045,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) abort(); } Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); switch(instr->op) { case nir_op_vec2: @@ -2703,7 +2708,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_unpack_half_2x16_split_x: { if (dst.regClass() == v1) { - Builder bld(ctx->program, ctx->block); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -2714,7 +2718,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_unpack_half_2x16_split_y: { if (dst.regClass() == v1) { - Builder bld(ctx->program, ctx->block); /* TODO: use SDWA here */ bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0])))); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 6ce1e8d00441..bd221ad6b617 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -652,7 +652,7 @@ private: class Definition final { public: - constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {} + constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {} Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {} explicit Definition(Temp tmp) noexcept @@ -739,6 +739,16 @@ public: return isKill_; } + constexpr void setPrecise(bool precise) noexcept + { + isPrecise_ = precise; + } + + constexpr bool isPrecise() const noexcept + { + return isPrecise_; + } + private: Temp temp = Temp(0, s1); PhysReg reg_; @@ -747,6 +757,7 @@ private: uint8_t isFixed_:1; uint8_t hasHint_:1; uint8_t isKill_:1; + uint8_t isPrecise_:1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint8_t control_ = 0; diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 487d15881281..93668442d329 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -401,6 +401,8 @@ void process_block(vn_ctx& ctx, Block& block) assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); assert(instr->definitions[i].isTemp()); ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp(); + if (instr->definitions[i].isPrecise()) + orig_instr->definitions[i].setPrecise(true); } } else { ctx.expr_values.erase(res.first); diff --git a/src/amd/compiler/aco_print_ir.cpp 
b/src/amd/compiler/aco_print_ir.cpp index 545dc9f553cb..0fb0ceb186d5 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -174,6 +174,8 @@ static void print_operand(const Operand *operand, FILE *output) static void print_definition(const Definition *definition, FILE *output) { print_reg_class(definition->regClass(), output); + if (definition->isPrecise()) + fprintf(output, "(precise)"); fprintf(output, "%%%d", definition->tempId()); if (definition->isFixed()) -- GitLab From 6cb42cdd8fddb990ee47124c18a510f4da6e99ac Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 15 May 2020 14:24:12 +0100 Subject: [PATCH 05/21] aco: create mads when signed zeros should be preserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This check was added because I thought v_mad_f32 didn't preserve the signess of zero, but I can't reproduce that and this isn't mentioned anywhere in LLVM. No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index a07ea8092fc9..37dcb89b182f 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -2414,7 +2414,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr else if ((instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_subrev_f32) && - block.fp_mode.denorm32 == 0 && !block.fp_mode.preserve_signed_zero_inf_nan32) { + block.fp_mode.denorm32 == 0) { //TODO: we could use fma instead when denormals are enabled if the NIR isn't marked as precise uint32_t uses_src0 = UINT32_MAX; -- GitLab From 1b10764e50998a556e000323c77d4a1632a80dfc Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 15 May 2020 14:03:15 +0100 Subject: [PATCH 06/21] aco: try to use fma instead of mad when denormals are enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v_mad_f32 doesn't support denormals but v_fma_f32 does. No fossil-db changes. 
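In rough outline (a sketch of the selection logic added to combine_instruction below), the optimizer only switches to fma when denormals force it to, and only when fma is cheap and the result is not marked precise:

    bool need_fma = block.fp_mode.denorm32 != 0;      /* v_mad_f32 flushes denormals */
    if (need_fma && instr->definitions[0].isPrecise())
       return;                                        /* keep the exact mul+add */
    if (need_fma && !ctx.program->has_fast_fma32)
       return;                                        /* full-rate v_fma_f32 only on some chips */
    aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;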
Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../aco_instruction_selection_setup.cpp | 4 ++ src/amd/compiler/aco_ir.h | 1 + src/amd/compiler/aco_optimizer.cpp | 51 ++++++++++++------- src/amd/compiler/aco_register_allocation.cpp | 18 +++++-- 4 files changed, 54 insertions(+), 20 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index eb07e7b6a830..6bd36835ce2c 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1256,6 +1256,10 @@ setup_isel_context(Program* program, setup_xnack(program); program->sram_ecc_enabled = args->options->family == CHIP_ARCTURUS; + /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */ + program->has_fast_fma32 = program->chip_class >= GFX9; + if (args->options->family == CHIP_TAHITI || args->options->family == CHIP_CARRIZO || args->options->family == CHIP_HAWAII) + program->has_fast_fma32 = true; return ctx; } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index bd221ad6b617..68d0b9bf4cee 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1451,6 +1451,7 @@ public: bool xnack_enabled = false; bool sram_ecc_enabled = false; + bool has_fast_fma32 = false; bool needs_vcc = false; bool needs_flat_scr = false; diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 37dcb89b182f..67d18231319f 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -2410,37 +2410,44 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); return; } + /* combine mul+add -> mad */ - else if ((instr->opcode == aco_opcode::v_add_f32 || - instr->opcode == aco_opcode::v_sub_f32 || - instr->opcode == aco_opcode::v_subrev_f32) && - block.fp_mode.denorm32 == 0) { - //TODO: we could use fma instead when denormals are enabled if the NIR isn't marked as precise + bool mad32 = instr->opcode == aco_opcode::v_add_f32 || + instr->opcode == aco_opcode::v_sub_f32 || + instr->opcode == aco_opcode::v_subrev_f32; + if (mad32) { + bool need_fma = block.fp_mode.denorm32 != 0; + if (need_fma && instr->definitions[0].isPrecise()) + return; + if (need_fma && !ctx.program->has_fast_fma32) + return; uint32_t uses_src0 = UINT32_MAX; uint32_t uses_src1 = UINT32_MAX; Instruction* mul_instr = nullptr; unsigned add_op_idx; /* check if any of the operands is a multiplication */ - if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_mul()) + ssa_info *op0_info = instr->operands[0].isTemp() ? &ctx.info[instr->operands[0].tempId()] : NULL; + ssa_info *op1_info = instr->operands[1].isTemp() ? 
&ctx.info[instr->operands[1].tempId()] : NULL; + if (op0_info && op0_info->is_mul() && (!need_fma || !op0_info->instr->definitions[0].isPrecise())) uses_src0 = ctx.uses[instr->operands[0].tempId()]; - if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_mul()) + if (op1_info && op1_info->is_mul() && (!need_fma || !op1_info->instr->definitions[0].isPrecise())) uses_src1 = ctx.uses[instr->operands[1].tempId()]; /* find the 'best' mul instruction to combine with the add */ if (uses_src0 < uses_src1) { - mul_instr = ctx.info[instr->operands[0].tempId()].instr; + mul_instr = op0_info->instr; add_op_idx = 1; } else if (uses_src1 < uses_src0) { - mul_instr = ctx.info[instr->operands[1].tempId()].instr; + mul_instr = op1_info->instr; add_op_idx = 0; } else if (uses_src0 != UINT32_MAX) { /* tiebreaker: quite random what to pick */ - if (ctx.info[instr->operands[0].tempId()].instr->operands[0].isLiteral()) { - mul_instr = ctx.info[instr->operands[1].tempId()].instr; + if (op0_info->instr->operands[0].isLiteral()) { + mul_instr = op1_info->instr; add_op_idx = 0; } else { - mul_instr = ctx.info[instr->operands[0].tempId()].instr; + mul_instr = op0_info->instr; add_op_idx = 1; } } @@ -2498,7 +2505,9 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr else if (instr->opcode == aco_opcode::v_subrev_f32) neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; - aco_ptr mad{create_instruction(aco_opcode::v_mad_f32, Format::VOP3A, 3, 1)}; + aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; + + aco_ptr mad{create_instruction(mad_op, Format::VOP3A, 3, 1)}; for (unsigned i = 0; i < 3; i++) { mad->operands[i] = op[i]; @@ -2706,7 +2715,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } mad_info* mad_info = NULL; - if (instr->opcode == aco_opcode::v_mad_f32 && ctx.info[instr->definitions[0].tempId()].is_mad()) { + if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) { mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; /* re-check mad instructions */ if (ctx.uses[mad_info->mul_temp_id]) { @@ -2720,6 +2729,10 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } /* check literals */ else if (!instr->usesModifiers()) { + /* FMA can only take literals on GFX10+ */ + if (instr->opcode == aco_opcode::v_fma_f32 && ctx.program->chip_class < GFX10) + return; + bool sgpr_used = false; uint32_t literal_idx = 0; uint32_t literal_uses = UINT32_MAX; @@ -2881,17 +2894,21 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) return; /* apply literals on MAD */ - if (instr->opcode == aco_opcode::v_mad_f32 && ctx.info[instr->definitions[0].tempId()].is_mad()) { + if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) { mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; if (info->check_literal && (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) { aco_ptr new_mad; + + aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; + if (instr->opcode == aco_opcode::v_fma_f32) + new_op = info->literal_idx == 2 ? 
aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32; + + new_mad.reset(create_instruction(new_op, Format::VOP2, 3, 1)); if (info->literal_idx == 2) { /* add literal -> madak */ - new_mad.reset(create_instruction(aco_opcode::v_madak_f32, Format::VOP2, 3, 1)); new_mad->operands[0] = instr->operands[0]; new_mad->operands[1] = instr->operands[1]; } else { /* mul literal -> madmk */ - new_mad.reset(create_instruction(aco_opcode::v_madmk_f32, Format::VOP2, 3, 1)); new_mad->operands[0] = instr->operands[1 - info->literal_idx]; new_mad->operands[1] = instr->operands[2]; } diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 8e662a282c8e..a824e8b546cd 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1734,7 +1734,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bl Operand op = Operand(); if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) op = instr->operands[i]; - else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers()) + else if ((instr->opcode == aco_opcode::v_mad_f32 || + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && !instr->usesModifiers()) op = instr->operands[2]; if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { @@ -2009,7 +2010,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bl } /* try to optimize v_mad_f32 -> v_mac_f32 */ - if (instr->opcode == aco_opcode::v_mad_f32 && + if ((instr->opcode == aco_opcode::v_mad_f32 || + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && instr->operands[2].getTemp().type() == RegType::vgpr && @@ -2022,13 +2024,23 @@ void register_allocation(Program *program, std::vector& live_out_per_bl instr->operands[2].physReg() == ctx.assignments[it->second].reg || register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) { instr->format = Format::VOP2; - instr->opcode = aco_opcode::v_mac_f32; + switch (instr->opcode) { + case aco_opcode::v_mad_f32: + instr->opcode = aco_opcode::v_mac_f32; + break; + case aco_opcode::v_fma_f32: + instr->opcode = aco_opcode::v_fmac_f32; + break; + default: + break; + } } } /* handle definitions which must have the same register as an operand */ if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 || + instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { instr->definitions[0].setFixed(instr->operands[2].physReg()); -- GitLab From 7f511efa16adb5e820c4535473de9bfb59f5e470 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 14 May 2020 21:09:36 +0100 Subject: [PATCH 07/21] aco: create 16-bit mad/fma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Navi, fp16 enabled): Totals from 1 (0.00% of 127638) affected shaders: CodeSize: 4868 -> 4552 (-6.49%) Instrs: 956 -> 863 (-9.73%) Cycles: 3824 -> 3452 (-9.73%) VMEM: 504 -> 490 (-2.78%) SMEM: 109 -> 107 (-1.83%) VClause: 19 -> 20 (+5.26%) Copies: 54 -> 58 (+7.41%) PreVGPRs: 43 -> 41 (-4.65%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 28 +++++++++++++++----- src/amd/compiler/aco_register_allocation.cpp | 24 ++++++++++++++--- 2 files changed, 43 insertions(+), 9 deletions(-) diff 
--git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 67d18231319f..f56ef5f5170c 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1103,6 +1103,10 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } break; } + case aco_opcode::v_mul_f16: { + ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); + break; + } case aco_opcode::v_and_b32: /* abs */ if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x7FFFFFFF) && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr) @@ -2415,11 +2419,15 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_subrev_f32; - if (mad32) { - bool need_fma = block.fp_mode.denorm32 != 0; + bool mad16 = instr->opcode == aco_opcode::v_add_f16 || + instr->opcode == aco_opcode::v_sub_f16 || + instr->opcode == aco_opcode::v_subrev_f16; + if (mad16 || mad32) { + bool need_fma = mad32 ? block.fp_mode.denorm32 != 0 : + (block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); if (need_fma && instr->definitions[0].isPrecise()) return; - if (need_fma && !ctx.program->has_fast_fma32) + if (need_fma && mad32 && !ctx.program->has_fast_fma32) return; uint32_t uses_src0 = UINT32_MAX; @@ -2500,12 +2508,15 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr /* neg of the multiplication result */ neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx]; } - if (instr->opcode == aco_opcode::v_sub_f32) + if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16) neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; - else if (instr->opcode == aco_opcode::v_subrev_f32) + else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16) neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; + if (mad16) + mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) : + (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16); aco_ptr mad{create_instruction(mad_op, Format::VOP3A, 3, 1)}; for (unsigned i = 0; i < 3; i++) @@ -2730,7 +2741,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) /* check literals */ else if (!instr->usesModifiers()) { /* FMA can only take literals on GFX10+ */ - if (instr->opcode == aco_opcode::v_fma_f32 && ctx.program->chip_class < GFX10) + if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) && + ctx.program->chip_class < GFX10) return; bool sgpr_used = false; @@ -2903,6 +2915,10 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; if (instr->opcode == aco_opcode::v_fma_f32) new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32; + else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16) + new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16; + else if (instr->opcode == aco_opcode::v_fma_f16) + new_op = info->literal_idx == 2 ? 
aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16; new_mad.reset(create_instruction(new_op, Format::VOP2, 3, 1)); if (info->literal_idx == 2) { /* add literal -> madak */ diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index a824e8b546cd..505f5cb613df 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1735,7 +1735,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bl if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) op = instr->operands[i]; else if ((instr->opcode == aco_opcode::v_mad_f32 || - (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && !instr->usesModifiers()) + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || + instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16 || + (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers()) op = instr->operands[2]; if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { @@ -2011,13 +2014,19 @@ void register_allocation(Program *program, std::vector& live_out_per_bl /* try to optimize v_mad_f32 -> v_mac_f32 */ if ((instr->opcode == aco_opcode::v_mad_f32 || - (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || + instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16 || + (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && - !instr->usesModifiers()) { + !instr->usesModifiers() && + instr->operands[0].physReg().byte() == 0 && + instr->operands[1].physReg().byte() == 0 && + instr->operands[2].physReg().byte() == 0) { unsigned def_id = instr->definitions[0].tempId(); auto it = ctx.affinities.find(def_id); if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned || @@ -2031,6 +2040,13 @@ void register_allocation(Program *program, std::vector& live_out_per_bl case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; + case aco_opcode::v_mad_f16: + case aco_opcode::v_mad_legacy_f16: + instr->opcode = aco_opcode::v_mac_f16; + break; + case aco_opcode::v_fma_f16: + instr->opcode = aco_opcode::v_fmac_f16; + break; default: break; } @@ -2041,6 +2057,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bl if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || + instr->opcode == aco_opcode::v_mac_f16 || + instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { instr->definitions[0].setFixed(instr->operands[2].physReg()); -- GitLab From f5a5674178f61089ff6d099dfad11b4852a50ad6 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 19 May 2020 13:24:45 +0100 Subject: [PATCH 08/21] aco: update comment about preserving fp16/fp64 denormals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 0e9f5f0f609c..06b8cab7b726 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -10505,7 +10505,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); - /* default to preserving fp16 and fp64 denorms, since it's free */ + /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and + * the precision seems needed for Wolfenstein: Youngblood to render correctly */ if (program->next_fp_mode.must_flush_denorms16_64) program->next_fp_mode.denorm16_64 = 0; else -- GitLab From 1210e0bd6205c5f5365a29c91425dea3e49d89a7 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 15 May 2020 15:12:33 +0100 Subject: [PATCH 09/21] aco: create 16-bit input and output modifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Navi, fp16 enabled): Totals from 1 (0.00% of 127638) affected shaders: CodeSize: 4552 -> 4540 (-0.26%) Instrs: 863 -> 861 (-0.23%) Cycles: 3452 -> 3444 (-0.23%) VMEM: 490 -> 489 (-0.20%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 74 +++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index f56ef5f5170c..211f347c0ab4 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -62,6 +62,9 @@ struct mad_info { enum Label { label_vec = 1 << 0, label_constant = 1 << 1, + /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and + * 32-bit operations but this shouldn't cause any issues because we don't + * look through any conversions */ label_abs = 1 << 2, label_neg = 1 << 3, label_mul = 1 << 4, @@ -672,6 +675,18 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp return false; } +unsigned get_operand_size(aco_ptr& instr, unsigned index) +{ + if (instr->format == Format::PSEUDO) + return instr->operands[index].bytes() * 8u; + else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32) + return index == 2 ? 64 : 32; + else if (instr->isVALU() || instr->isSALU()) + return instr_info.operand_size[(int)instr->opcode]; + else + return 0; +} + Operand get_constant_op(opt_ctx &ctx, uint32_t val, bool is64bit = false) { // TODO: this functions shouldn't be needed if we store Operand instead of value. 
@@ -753,7 +768,12 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } - if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + + /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */ + unsigned can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4; + can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode]; + + if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) { if (!instr->isDPP()) to_VOP3(ctx, instr); instr->operands[i] = Operand(info.temp); @@ -766,7 +786,11 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32; instr->operands[i].setTemp(info.temp); continue; - } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) { + instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16; + instr->operands[i].setTemp(info.temp); + continue; + } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) { if (!instr->isDPP()) to_VOP3(ctx, instr); instr->operands[i].setTemp(info.temp); @@ -1079,21 +1103,24 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } break; } + case aco_opcode::v_mul_f16: case aco_opcode::v_mul_f32: { /* omod */ /* TODO: try to move the negate/abs modifier to the consumer instead */ if (instr->usesModifiers()) break; + bool fp16 = instr->opcode == aco_opcode::v_mul_f16; + for (unsigned i = 0; i < 2; i++) { if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { - if (instr->operands[!i].constantValue() == 0x40000000) { /* 2.0 */ + if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */ ctx.info[instr->operands[i].tempId()].set_omod2(instr->definitions[0].getTemp()); - } else if (instr->operands[!i].constantValue() == 0x40800000) { /* 4.0 */ + } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */ ctx.info[instr->operands[i].tempId()].set_omod4(instr->definitions[0].getTemp()); - } else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */ + } else if (instr->operands[!i].constantValue() == (fp16 ? 0xb800 : 0x3f000000)) { /* 0.5 */ ctx.info[instr->operands[i].tempId()].set_omod5(instr->definitions[0].getTemp()); - } else if (instr->operands[!i].constantValue() == 0x3f800000 && - !block.fp_mode.must_flush_denorms32) { /* 1.0 */ + } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000) && + !(fp16 ? 
block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32)) { /* 1.0 */ ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp()); } else { continue; @@ -1103,19 +1130,20 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } break; } - case aco_opcode::v_mul_f16: { - ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); - break; - } - case aco_opcode::v_and_b32: /* abs */ - if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x7FFFFFFF) && - instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr) + case aco_opcode::v_and_b32: { /* abs */ + if (!instr->usesModifiers() && instr->operands[1].isTemp() && + instr->operands[1].getTemp().type() == RegType::vgpr && + ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x7FFFFFFFu)) || + (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x7FFFu)))) ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp()); else ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); break; + } case aco_opcode::v_xor_b32: { /* neg */ - if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x80000000u) && instr->operands[1].isTemp()) { + if (!instr->usesModifiers() && instr->operands[1].isTemp() && + ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x80000000u)) || + (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x8000u)))) { if (ctx.info[instr->operands[1].tempId()].is_neg()) { ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); } else if (instr->operands[1].getTemp().type() == RegType::vgpr) { @@ -1132,6 +1160,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } break; } + case aco_opcode::v_med3_f16: case aco_opcode::v_med3_f32: { /* clamp */ VOP3A_instruction* vop3 = static_cast(instr.get()); if (vop3->abs[0] || vop3->abs[1] || vop3->abs[2] || @@ -1141,11 +1170,12 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) unsigned idx = 0; bool found_zero = false, found_one = false; + bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; for (unsigned i = 0; i < 3; i++) { if (instr->operands[i].constantEquals(0)) found_zero = true; - else if (instr->operands[i].constantEquals(0x3f800000)) /* 1.0 */ + else if (instr->operands[i].constantEquals(is_fp16 ? 
0x3c00 : 0x3f800000)) /* 1.0 */ found_one = true; else idx = i; @@ -2251,7 +2281,7 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) { /* check if we could apply omod on predecessor */ - if (instr->opcode == aco_opcode::v_mul_f32) { + if (instr->opcode == aco_opcode::v_mul_f32 || instr->opcode == aco_opcode::v_mul_f16) { bool op0 = instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_omod_success(); bool op1 = instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_omod_success(); if (op0 || op1) { @@ -2287,14 +2317,15 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) } /* check if we could apply clamp on predecessor */ - if (instr->opcode == aco_opcode::v_med3_f32) { + if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) { + bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; unsigned idx = 0; bool found_zero = false, found_one = false; for (unsigned i = 0; i < 3; i++) { if (instr->operands[i].constantEquals(0)) found_zero = true; - else if (instr->operands[i].constantEquals(0x3f800000)) /* 1.0 */ + else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */ found_one = true; else idx = i; @@ -2319,11 +2350,10 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) } /* omod has no effect if denormals are enabled */ - bool can_use_omod = block.fp_mode.denorm32 == 0; - /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 && can_use_VOP3(ctx, instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { + bool can_use_omod = (instr->definitions[0].bytes() == 4 ? block.fp_mode.denorm32 : block.fp_mode.denorm16_64) == 0; ssa_info& def_info = ctx.info[instr->definitions[0].tempId()]; if (can_use_omod && def_info.is_omod2() && ctx.uses[def_info.temp.id()]) { to_VOP3(ctx, instr); @@ -2395,7 +2425,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr Definition def = instr->definitions[0]; /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */ bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(); - instr.reset(create_instruction(aco_opcode::v_mul_f32, asVOP3(Format::VOP2), 2, 1)); + instr.reset(create_instruction(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1)); instr->operands[0] = mul_instr->operands[0]; instr->operands[1] = mul_instr->operands[1]; instr->definitions[0] = def; -- GitLab From 9b69ed0bb9503befd73e7bfa4867dc431d01e2ee Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 18 May 2020 15:26:58 +0100 Subject: [PATCH 10/21] aco: improve sub-dword check for sgpr/constant propagation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit p_create_vector can have sub-dword operands with a v1 definition. No fossil-db changes. 
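As a sketch of the refined check (matching the hunk below), the sub-dword bail-out now only applies to pseudo instructions and also looks at the operands, not just the definitions:

    if (instr->format == Format::PSEUDO) {
       bool is_subdword =
          std::any_of(instr->definitions.begin(), instr->definitions.end(),
                      [] (const Definition& def) { return def.regClass().is_subdword(); }) ||
          std::any_of(instr->operands.begin(), instr->operands.end(),
                      [] (const Operand& op) { return op.hasRegClass() && op.regClass().is_subdword(); });
       if (is_subdword)
          continue;   /* skip SGPR/constant propagation for sub-dword pseudo instructions */
    }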
Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 211f347c0ab4..37564b7e993f 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -727,11 +727,16 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) /* SALU / PSEUDO: propagate inline constants */ if (instr->isSALU() || instr->format == Format::PSEUDO) { - const bool is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(), - [] (const Definition& def) { return def.regClass().is_subdword();}); + bool is_subdword = false; // TODO: optimize SGPR and constant propagation for subdword pseudo instructions on gfx9+ - if (is_subdword) - continue; + if (instr->format == Format::PSEUDO) { + is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(), + [] (const Definition& def) { return def.regClass().is_subdword();}); + is_subdword = is_subdword || std::any_of(instr->operands.begin(), instr->operands.end(), + [] (const Operand& op) { return op.hasRegClass() && op.regClass().is_subdword();}); + if (is_subdword) + continue; + } if (info.is_temp() && info.temp.type() == RegType::sgpr) { instr->operands[i].setTemp(info.temp); -- GitLab From dd233455679fe0f0ae441cc5ef3dd366132951e7 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 15 Jun 2020 14:21:03 +0100 Subject: [PATCH 11/21] aco: fix half_pi constant for 16-bit fsin/fcos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This worked because the optimizer didn't consider that the 16-bit instruction would interpret the inline constant differently. This will change in the next commit. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 06b8cab7b726..f256ec6eb3a9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2114,12 +2114,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fcos: { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); aco_ptr norm; - Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); if (dst.regClass() == v2b) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); aco_opcode opcode = instr->op == nir_op_fsin ? 
aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; bld.vop1(opcode, Definition(dst), tmp); } else if (dst.regClass() == v1) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ -- GitLab From 4784111abc113ae64ce1d597407e0c32d6a88160 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 15 Jun 2020 14:30:34 +0100 Subject: [PATCH 12/21] aco: use 32-bit inline constants for 16-bit integer instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://reviews.llvm.org/D81841 Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_opcodes.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 1396b8c3af4e..a0ecc9c57884 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -194,12 +194,21 @@ class Opcode(object): parts = name.replace('_e64', '').rsplit('_', 2) op_dtype = parts[-1] def_dtype = parts[-2] if len(parts) > 1 else parts[-1] - dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]} - self.operand_size = dtype_sizes.get(op_dtype, 0) - self.definition_size = dtype_sizes.get(def_dtype, self.operand_size) + + def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]} + op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()} + # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841 + op_dtype_sizes['b16'] = 32 + op_dtype_sizes['i16'] = 32 + op_dtype_sizes['u16'] = 32 + + self.operand_size = op_dtype_sizes.get(op_dtype, 0) + self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size) # exceptions - if self.operand_size == 24: + if self.operand_size == 16 and op_dtype != 'f16': + self.operand_size = 16 + elif self.operand_size == 24: self.operand_size = 32 elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']: self.operand_size = 32 @@ -208,9 +217,6 @@ class Opcode(object): self.operand_size = 0 elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']: self.operand_size = 0 - elif name.replace('_e64', '') in ['v_lshrrev_b16', 'v_ashrrev_i16', 'v_lshlrev_b16']: - # v_lshlrev_b16 tested on GFX10 with 1/2 PI inline constant - self.operand_size = 32 elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16', 'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1', 'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']: -- GitLab From 3d6f67950d91de1dd50b096de144e504a89ea21d Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 15 May 2020 16:28:03 +0100 Subject: [PATCH 13/21] aco: improve 8/16-bit constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Navi, fp16 enabled): Totals from 1 (0.00% of 127638) affected shaders: CodeSize: 4540 -> 4388 (-3.35%) Instrs: 861 -> 830 (-3.60%) Cycles: 3444 -> 3320 (-3.60%) VMEM: 489 -> 465 (-4.91%) SMEM: 107 -> 110 (+2.80%) SClause: 31 -> 30 (-3.23%) Copies: 58 -> 54 (-6.90%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_builder_h.py | 32 +++ .../compiler/aco_instruction_selection.cpp | 2 +- src/amd/compiler/aco_ir.h | 60 +++++- src/amd/compiler/aco_lower_to_hw_instr.cpp | 43 +++- src/amd/compiler/aco_optimizer.cpp 
| 194 ++++++++++-------- src/amd/compiler/aco_print_ir.cpp | 9 +- 6 files changed, 244 insertions(+), 96 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index edd5f3fda645..0296653efdaa 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -78,6 +78,8 @@ ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) aco_ptr create_s_mov(Definition dst, Operand src); +extern uint8_t int8_mul_table[512]; + enum sendmsg { sendmsg_none = 0, _sendmsg_gs = 2, @@ -388,6 +390,36 @@ public: return vop1(aco_opcode::v_mov_b32, dst, op); } else if (op.bytes() > 2) { return pseudo(aco_opcode::p_create_vector, dst, op); + } else if (op.bytes() == 1 && op.isConstant()) { + uint8_t val = op.constantValue(); + Operand op32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u)); + aco_ptr sdwa; + if (op32.isLiteral()) { + sdwa.reset(create_instruction(aco_opcode::v_mul_u32_u24, asSDWA(Format::VOP2), 2, 1)); + uint32_t a = (uint32_t)int8_mul_table[val * 2]; + uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1]; + sdwa->operands[0] = Operand(a | (a & 0x80u ? 0xffffff00u : 0x0u)); + sdwa->operands[1] = Operand(b | (b & 0x80u ? 0xffffff00u : 0x0u)); + } else { + sdwa.reset(create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)); + sdwa->operands[0] = op32; + } + sdwa->definitions[0] = dst; + sdwa->sel[0] = sdwa_udword; + sdwa->sel[1] = sdwa_udword; + sdwa->dst_sel = sdwa_ubyte; + sdwa->dst_preserve = true; + return insert(std::move(sdwa)); + } else if (op.bytes() == 2 && op.isConstant() && !op.isLiteral()) { + aco_ptr sdwa{create_instruction(aco_opcode::v_add_f16, asSDWA(Format::VOP2), 2, 1)}; + sdwa->operands[0] = op; + sdwa->operands[1] = Operand(0u); + sdwa->definitions[0] = dst; + sdwa->sel[0] = sdwa_uword; + sdwa->sel[1] = sdwa_udword; + sdwa->dst_sel = dst.bytes() == 1 ? 
sdwa_ubyte : sdwa_uword; + sdwa->dst_preserve = true; + return insert(std::move(sdwa)); } else if (dst.regClass().is_subdword()) { if (program->chip_class >= GFX8) { aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index f256ec6eb3a9..c0cc445ffa38 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -1925,7 +1925,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsat: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 68d0b9bf4cee..3db6b4b6d438 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -337,7 +337,7 @@ class Operand final public: constexpr Operand() : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), - isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false), + isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false) {} explicit Operand(Temp r) noexcept @@ -350,11 +350,51 @@ public: setFixed(PhysReg{128}); } }; + explicit Operand(uint8_t v) noexcept + { + /* 8-bit constants are only used for copies and copies from any 8-bit + * constant can be implemented with a SDWA v_mul_u32_u24. So consider all + * to be inline constants. */ + data_.i = v; + isConstant_ = true; + constSize = 0; + setFixed(PhysReg{0u}); + }; + explicit Operand(uint16_t v) noexcept + { + data_.i = v; + isConstant_ = true; + constSize = 1; + if (v <= 64) + setFixed(PhysReg{128u + v}); + else if (v >= 0xFFF0) /* [-16 .. -1] */ + setFixed(PhysReg{192u + (0xFFFF - v)}); + else if (v == 0x3800) /* 0.5 */ + setFixed(PhysReg{240}); + else if (v == 0xB800) /* -0.5 */ + setFixed(PhysReg{241}); + else if (v == 0x3C00) /* 1.0 */ + setFixed(PhysReg{242}); + else if (v == 0xBC00) /* -1.0 */ + setFixed(PhysReg{243}); + else if (v == 0x4000) /* 2.0 */ + setFixed(PhysReg{244}); + else if (v == 0xC000) /* -2.0 */ + setFixed(PhysReg{245}); + else if (v == 0x4400) /* 4.0 */ + setFixed(PhysReg{246}); + else if (v == 0xC400) /* -4.0 */ + setFixed(PhysReg{247}); + else if (v == 0x3118) /* 1/2 PI */ + setFixed(PhysReg{248}); + else /* Literal Constant */ + setFixed(PhysReg{255}); + }; explicit Operand(uint32_t v, bool is64bit = false) noexcept { data_.i = v; isConstant_ = true; - is64BitConst_ = is64bit; + constSize = is64bit ? 3 : 2; if (v <= 64) setFixed(PhysReg{128 + v}); else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */ @@ -383,7 +423,7 @@ public: explicit Operand(uint64_t v) noexcept { isConstant_ = true; - is64BitConst_ = true; + constSize = 3; if (v <= 64) { data_.i = (uint32_t) v; setFixed(PhysReg{128 + (uint32_t) v}); @@ -465,7 +505,7 @@ public: constexpr unsigned bytes() const noexcept { if (isConstant()) - return is64BitConst_ ? 
8 : 4; //TODO: sub-dword constants + return 1 << constSize; else return data_.temp.bytes(); } @@ -473,7 +513,7 @@ public: constexpr unsigned size() const noexcept { if (isConstant()) - return is64BitConst_ ? 2 : 1; + return constSize > 2 ? 2 : 1; else return data_.temp.size(); } @@ -521,7 +561,7 @@ public: constexpr uint64_t constantValue64(bool signext=false) const noexcept { - if (is64BitConst_) { + if (constSize == 3) { if (reg_ <= 192) return reg_ - 128; else if (reg_ <= 208) @@ -545,6 +585,10 @@ public: case 247: return 0xC010000000000000; } + } else if (constSize == 1) { + return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i; + } else if (constSize == 0) { + return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i; } return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; } @@ -635,11 +679,11 @@ private: uint8_t isKill_:1; uint8_t isUndef_:1; uint8_t isFirstKill_:1; - uint8_t is64BitConst_:1; + uint8_t constSize:2; uint8_t isLateKill_:1; }; /* can't initialize bit-fields in c++11, so work around using a union */ - uint8_t control_ = 0; + uint16_t control_ = 0; }; }; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index bb63aea95d44..5e93dc603e6b 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -41,6 +41,37 @@ struct lower_context { std::vector> instructions; }; +/* used by handle_operands() indirectly through Builder::copy */ +uint8_t int8_mul_table[512] = { + 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 1, 11, + 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, 20, 1, 21, + 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, 1, 30, 1, 31, + 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, 1, 40, 1, 41, + 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51, + 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, 1, 60, 1, 61, + 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, 2, 35, 11, 53, + 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, 2, 40, 3, 27, + 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, 2, 45, 7, 13, + 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, 2, 50, 7, 51, + 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, 2, 55, 3, 37, + 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, 2, 60, 9, 241, + 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, 5, 26, 9, 43, + 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, 4, 35, 3, 47, + 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, 3, 50, 7, 241, + 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, 4, 40, 7, 23, + 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, 5, 34, 3, 57, + 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, 3, 60, 5, 241, + 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, 5, 38, 5, 243, + 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35, 4, 242, 5, 245, + 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, 5, 42, 3, 241, + 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, 3, 244, + 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247, + 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, + 9, 55, 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, + 1, 248, 1, 249, 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255 +}; + + aco_opcode get_reduce_opcode(chip_class chip, 
ReduceOp op) { /* Because some 16-bit instructions are already VOP3 on GFX10, we use the * 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use @@ -999,11 +1030,15 @@ void split_copy(unsigned offset, Definition *def, Operand *op, const copy_operat RegClass(src.def.regClass().type(), bytes).as_subdword(); *def = Definition(src.def.tempId(), def_reg, def_cls); if (src.op.isConstant()) { - assert(offset == 0 || (offset == 4 && src.op.bytes() == 8)); - if (src.op.bytes() == 8 && bytes == 4) + assert(bytes >= 1 && bytes <= 8); + if (bytes == 8) + *op = Operand(src.op.constantValue64() >> (offset * 8u)); + else if (bytes == 4) *op = Operand(uint32_t(src.op.constantValue64() >> (offset * 8u))); - else - *op = src.op; + else if (bytes == 2) + *op = Operand(uint16_t(src.op.constantValue64() >> (offset * 8u))); + else if (bytes == 1) + *op = Operand(uint8_t(src.op.constantValue64() >> (offset * 8u))); } else { RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) : RegClass(src.op.regClass().type(), bytes).as_subdword(); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 37564b7e993f..58d22910150a 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -61,7 +61,7 @@ struct mad_info { enum Label { label_vec = 1 << 0, - label_constant = 1 << 1, + label_constant_32bit = 1 << 1, /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and * 32-bit operations but this shouldn't cause any issues because we don't * look through any conversions */ @@ -91,13 +91,14 @@ enum Label { label_vcc_hint = 1 << 25, label_scc_needed = 1 << 26, label_b2i = 1 << 27, + label_constant_16bit = 1 << 29, }; static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp; static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert | label_b2i; -static constexpr uint32_t val_labels = label_constant | label_constant_64bit | label_literal | label_mad; +static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal | label_mad; struct ssa_info { uint32_t val; @@ -122,7 +123,10 @@ struct ssa_info { label &= ~instr_labels; /* instr and temp alias */ } - if (new_label & val_labels) + uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; + if (new_label & const_labels) + label &= ~val_labels | const_labels; + else if (new_label & val_labels) label &= ~val_labels; label |= new_label; @@ -139,26 +143,85 @@ struct ssa_info { return label & label_vec; } - void set_constant(uint32_t constant) + void set_constant(chip_class chip, uint64_t constant) { - add_label(label_constant); + Operand op16((uint16_t)constant); + Operand op32((uint32_t)constant); + add_label(label_literal); val = constant; - } - bool is_constant() + if (chip >= GFX8 && !op16.isLiteral()) + add_label(label_constant_16bit); + + if (!op32.isLiteral() || ((uint32_t)constant == 0x3e22f983 && chip >= GFX8)) + add_label(label_constant_32bit); + + if (constant <= 64) { + add_label(label_constant_64bit); + } else if (constant >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. 
-1] */ + add_label(label_constant_64bit); + } else if (constant == 0x3FE0000000000000) { /* 0.5 */ + add_label(label_constant_64bit); + } else if (constant == 0xBFE0000000000000) { /* -0.5 */ + add_label(label_constant_64bit); + } else if (constant == 0x3FF0000000000000) { /* 1.0 */ + add_label(label_constant_64bit); + } else if (constant == 0xBFF0000000000000) { /* -1.0 */ + add_label(label_constant_64bit); + } else if (constant == 0x4000000000000000) { /* 2.0 */ + add_label(label_constant_64bit); + } else if (constant == 0xC000000000000000) { /* -2.0 */ + add_label(label_constant_64bit); + } else if (constant == 0x4010000000000000) { /* 4.0 */ + add_label(label_constant_64bit); + } else if (constant == 0xC010000000000000) { /* -4.0 */ + add_label(label_constant_64bit); + } + + if (label & label_constant_64bit) { + val = Operand(constant).constantValue(); + if (val != constant) + label &= ~(label_literal | label_constant_16bit | label_constant_32bit); + } + } + + bool is_constant(unsigned bits) { - return label & label_constant; + switch (bits) { + case 8: + return label & label_literal; + case 16: + return label & label_constant_16bit; + case 32: + return label & label_constant_32bit; + case 64: + return label & label_constant_64bit; + } + return false; } - void set_constant_64bit(uint32_t constant) + bool is_literal(unsigned bits) { - add_label(label_constant_64bit); - val = constant; + bool is_lit = label & label_literal; + switch (bits) { + case 8: + return false; + case 16: + return is_lit && ~(label & label_constant_16bit); + case 32: + return is_lit && ~(label & label_constant_32bit); + case 64: + return false; + } + return false; } - bool is_constant_64bit() + bool is_constant_or_literal(unsigned bits) { - return label & label_constant_64bit; + if (bits == 64) + return label & label_constant_64bit; + else + return label & label_literal; } void set_abs(Temp abs_temp) @@ -211,17 +274,6 @@ struct ssa_info { return label & label_temp; } - void set_literal(uint32_t lit) - { - add_label(label_literal); - val = lit; - } - - bool is_literal() - { - return label & label_literal; - } - void set_mad(Instruction* mad, uint32_t mad_info_idx) { add_label(label_mad); @@ -321,11 +373,6 @@ struct ssa_info { return label & label_vcc; } - bool is_constant_or_literal() - { - return is_constant() || is_literal(); - } - void set_b2f(Temp val) { add_label(label_b2f); @@ -655,7 +702,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp if (add_instr->operands[i].isConstant()) { *offset = add_instr->operands[i].constantValue(); } else if (add_instr->operands[i].isTemp() && - ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal()) { + ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) { *offset = ctx.info[add_instr->operands[i].tempId()].val; } else { continue; @@ -687,11 +734,15 @@ unsigned get_operand_size(aco_ptr& instr, unsigned index) return 0; } -Operand get_constant_op(opt_ctx &ctx, uint32_t val, bool is64bit = false) +Operand get_constant_op(opt_ctx &ctx, ssa_info info, uint32_t bits) { + if (bits == 8) + return Operand((uint8_t)info.val); + if (bits == 16) + return Operand((uint16_t)info.val); // TODO: this functions shouldn't be needed if we store Operand instead of value. 
- Operand op(val, is64bit); - if (val == 0x3e22f983 && ctx.program->chip_class >= GFX8) + Operand op(info.val, bits == 64); + if (info.is_literal(32) && info.val == 0x3e22f983 && ctx.program->chip_class >= GFX8) op.setFixed(PhysReg{248}); /* 1/2 PI can be an inline constant on GFX8+ */ return op; } @@ -706,7 +757,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) { ASSERTED bool all_const = false; for (Operand& op : instr->operands) - all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal()); + all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); perfwarn(all_const, "All instruction operands are constant", instr.get()); } @@ -728,13 +779,13 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) /* SALU / PSEUDO: propagate inline constants */ if (instr->isSALU() || instr->format == Format::PSEUDO) { bool is_subdword = false; - // TODO: optimize SGPR and constant propagation for subdword pseudo instructions on gfx9+ + // TODO: optimize SGPR propagation for subdword pseudo instructions on gfx9+ if (instr->format == Format::PSEUDO) { is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(), [] (const Definition& def) { return def.regClass().is_subdword();}); is_subdword = is_subdword || std::any_of(instr->operands.begin(), instr->operands.end(), [] (const Operand& op) { return op.hasRegClass() && op.regClass().is_subdword();}); - if (is_subdword) + if (is_subdword && ctx.program->chip_class < GFX9) continue; } @@ -760,9 +811,10 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; } } - if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) && + unsigned bits = get_operand_size(instr, i); + if ((info.is_constant(bits) || (!is_subdword && info.is_literal(bits) && instr->format == Format::PSEUDO)) && !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) { - instr->operands[i] = get_constant_op(ctx, info.val, info.is_constant_64bit()); + instr->operands[i] = get_constant_op(ctx, info, bits); continue; } } @@ -805,8 +857,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) static_cast(instr.get())->neg[i] = true; continue; } - if ((info.is_constant() || info.is_constant_64bit()) && alu_can_accept_constant(instr->opcode, i)) { - Operand op = get_constant_op(ctx, info.val, info.is_constant_64bit()); + unsigned bits = get_operand_size(instr, i); + if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i)) { + Operand op = get_constant_op(ctx, info, bits); perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { instr->operands[i] = op; @@ -831,13 +884,13 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; - if (mubuf->offen && i == 1 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) { assert(!mubuf->idxen); instr->operands[1] = Operand(v1); mubuf->offset += info.val; mubuf->offen = false; continue; - } else if (i == 2 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + } else if (i == 2 
&& info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) { instr->operands[2] = Operand((uint32_t) 0); mubuf->offset += info.val; continue; @@ -891,7 +944,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) SMEM_instruction *smem = static_cast(instr.get()); Temp base; uint32_t offset; - if (i == 1 && info.is_constant_or_literal() && + if (i == 1 && info.is_constant_or_literal(32) && ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) || (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) || (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { @@ -900,7 +953,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4); if (soe && - (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal() || + (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) || ctx.info[smem->operands.back().tempId()].val != 0)) { continue; } @@ -996,12 +1049,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) Operand vec_op = vec->operands[vec_index]; if (vec_op.isConstant()) { - if (vec_op.isLiteral()) - ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue()); - else if (vec_op.size() == 1) - ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue()); - else if (vec_op.size() == 2) - ctx.info[instr->definitions[i].tempId()].set_constant_64bit(vec_op.constantValue()); + ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, vec_op.constantValue64()); } else if (vec_op.isUndefined()) { ctx.info[instr->definitions[i].tempId()].set_undefined(); } else { @@ -1035,12 +1083,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[0] = op; if (op.isConstant()) { - if (op.isLiteral()) - ctx.info[instr->definitions[0].tempId()].set_literal(op.constantValue()); - else if (op.size() == 1) - ctx.info[instr->definitions[0].tempId()].set_constant(op.constantValue()); - else if (op.size() == 2) - ctx.info[instr->definitions[0].tempId()].set_constant_64bit(op.constantValue()); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, op.constantValue64()); } else if (op.isUndefined()) { ctx.info[instr->definitions[0].tempId()].set_undefined(); } else { @@ -1060,12 +1103,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } else if (instr->usesModifiers()) { // TODO } else if (instr->operands[0].isConstant()) { - if (instr->operands[0].isLiteral()) - ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue()); - else if (instr->operands[0].size() == 1) - ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue()); - else if (instr->operands[0].size() == 2) - ctx.info[instr->definitions[0].tempId()].set_constant_64bit(instr->operands[0].constantValue()); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, instr->operands[0].constantValue64()); } else if (instr->operands[0].isTemp()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); } else { @@ -1074,25 +1112,19 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; case aco_opcode::p_is_helper: if (!ctx.program->needs_wqm) - 
ctx.info[instr->definitions[0].tempId()].set_constant(0u); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u); break; case aco_opcode::s_movk_i32: { uint32_t v = static_cast(instr.get())->imm; v = v & 0x8000 ? (v | 0xffff0000) : v; - if (v <= 64 || v >= 0xfffffff0) - ctx.info[instr->definitions[0].tempId()].set_constant(v); - else - ctx.info[instr->definitions[0].tempId()].set_literal(v); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, v); break; } case aco_opcode::v_bfrev_b32: case aco_opcode::s_brev_b32: { if (instr->operands[0].isConstant()) { uint32_t v = util_bitreverse(instr->operands[0].constantValue()); - if (v <= 64 || v >= 0xfffffff0) - ctx.info[instr->definitions[0].tempId()].set_constant(v); - else - ctx.info[instr->definitions[0].tempId()].set_literal(v); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, v); } break; } @@ -1101,10 +1133,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) unsigned size = instr->operands[0].constantValue() & 0x1f; unsigned start = instr->operands[1].constantValue() & 0x1f; uint32_t v = ((1u << size) - 1u) << start; - if (v <= 64 || v >= 0xfffffff0) - ctx.info[instr->definitions[0].tempId()].set_constant(v); - else - ctx.info[instr->definitions[0].tempId()].set_literal(v); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, v); } break; } @@ -1629,7 +1658,7 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in } else if (cmp->operands[constant_operand].isTemp()) { Temp tmp = cmp->operands[constant_operand].getTemp(); unsigned id = original_temp_id(ctx, tmp); - if (!ctx.info[id].is_constant() && !ctx.info[id].is_literal()) + if (!ctx.info[id].is_constant_or_literal(32)) return false; constant = ctx.info[id].val; } else { @@ -2115,7 +2144,7 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, uint32_t val; if (operands[i].isConstant()) { val = operands[i].constantValue(); - } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal()) { + } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal(32)) { val = ctx.info[operands[i].tempId()].val; } else { continue; @@ -2791,9 +2820,10 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } if (!instr->operands[i].isTemp()) continue; + unsigned bits = get_operand_size(instr, i); /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 or operands other than the 1st */ if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) { - if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal()) { + if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) { literal_uses = ctx.uses[instr->operands[i].tempId()]; literal_idx = i; } else { @@ -2802,7 +2832,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) sgpr_used = true; /* don't break because we still need to check constants */ } else if (!sgpr_used && - ctx.info[instr->operands[i].tempId()].is_literal() && + ctx.info[instr->operands[i].tempId()].is_literal(bits) && ctx.uses[instr->operands[i].tempId()] < literal_uses) { literal_uses = ctx.uses[instr->operands[i].tempId()]; literal_idx = i; @@ -2881,6 +2911,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) /* choose a literal to apply */ for (unsigned i = 0; i < num_operands; i++) { Operand op = instr->operands[i]; + unsigned bits = get_operand_size(instr, i); if 
(instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr && op.tempId() != sgpr_ids[0]) @@ -2889,7 +2920,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (op.isLiteral()) { current_literal = op; continue; - } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal()) { + } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) { continue; } @@ -2974,7 +3005,8 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) if (instr->isSALU() || instr->isVALU()) { for (unsigned i = 0; i < instr->operands.size(); i++) { Operand op = instr->operands[i]; - if (op.isTemp() && ctx.info[op.tempId()].is_literal() && ctx.uses[op.tempId()] == 0) { + unsigned bits = get_operand_size(instr, i); + if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) { Operand literal(ctx.info[op.tempId()].val); if (instr->isVALU() && i > 0) to_VOP3(ctx, instr); diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 0fb0ceb186d5..3daa60b71c14 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -153,8 +153,13 @@ static void print_constant(uint8_t reg, FILE *output) static void print_operand(const Operand *operand, FILE *output) { - if (operand->isLiteral()) { - fprintf(output, "0x%x", operand->constantValue()); + if (operand->isLiteral() || (operand->isConstant() && operand->bytes() == 1)) { + if (operand->bytes() == 1) + fprintf(output, "0x%.2x", operand->constantValue()); + else if (operand->bytes() == 2) + fprintf(output, "0x%.4x", operand->constantValue()); + else + fprintf(output, "0x%x", operand->constantValue()); } else if (operand->isConstant()) { print_constant(operand->physReg().reg(), output); } else if (operand->isUndefined()) { -- GitLab From 22d712273989701c91c50f98e27162aa2a1fb12f Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 18 May 2020 19:42:40 +0100 Subject: [PATCH 14/21] aco: copy-propagate constants through p_extract_vector/p_split_vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Navi, fp16 enabled): Totals from 1 (0.00% of 127638) affected shaders: CodeSize: 4388 -> 4392 (+0.09%) VMEM: 465 -> 458 (-1.51%) Copies: 54 -> 55 (+1.85%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 58d22910150a..82b9cb771853 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1034,8 +1034,20 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; } case aco_opcode::p_split_vector: { - if (!ctx.info[instr->operands[0].tempId()].is_vec()) + ssa_info& info = ctx.info[instr->operands[0].tempId()]; + + if (info.is_constant_or_literal(32)) { + uint32_t val = info.val; + for (Definition def : instr->definitions) { + uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u); + ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask); + val >>= def.bytes() * 8u; + } + break; + } else if (!info.is_vec()) { break; + } + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; unsigned split_offset = 0; unsigned vec_offset = 0; @@ -1060,13 +1072,20 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; } case aco_opcode::p_extract_vector: { /* mov */ - if 
(!ctx.info[instr->operands[0].tempId()].is_vec()) + ssa_info& info = ctx.info[instr->operands[0].tempId()]; + const unsigned index = instr->operands[1].constantValue(); + const unsigned dst_offset = index * instr->definitions[0].bytes(); + + if (info.is_constant_or_literal(32)) { + uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, (info.val >> (dst_offset * 8u)) & mask); + break; + } else if (!info.is_vec()) { break; + } /* check if we index directly into a vector element */ - Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; - const unsigned index = instr->operands[1].constantValue(); - const unsigned dst_offset = index * instr->definitions[0].bytes(); + Instruction* vec = info.instr; unsigned offset = 0; for (const Operand& op : vec->operands) { -- GitLab From d16a7190a309ba87dc52760999dd3a6c033143ef Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 15 May 2020 20:26:39 +0100 Subject: [PATCH 15/21] aco: optimize 16-bit and 64-bit float comparisons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 137 +++++++++++++++++++++-------- 1 file changed, 101 insertions(+), 36 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 82b9cb771853..0934a6f32726 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1344,20 +1344,26 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) case aco_opcode::v_max_i16: ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get()); break; - case aco_opcode::v_cmp_lt_f32: - case aco_opcode::v_cmp_eq_f32: - case aco_opcode::v_cmp_le_f32: - case aco_opcode::v_cmp_gt_f32: - case aco_opcode::v_cmp_lg_f32: - case aco_opcode::v_cmp_ge_f32: + #define CMP(cmp) \ + case aco_opcode::v_cmp_##cmp##_f16:\ + case aco_opcode::v_cmp_##cmp##_f32:\ + case aco_opcode::v_cmp_##cmp##_f64:\ + case aco_opcode::v_cmp_n##cmp##_f16:\ + case aco_opcode::v_cmp_n##cmp##_f32:\ + case aco_opcode::v_cmp_n##cmp##_f64: + CMP(lt) + CMP(eq) + CMP(le) + CMP(gt) + CMP(lg) + CMP(ge) + case aco_opcode::v_cmp_o_f16: + case aco_opcode::v_cmp_u_f16: case aco_opcode::v_cmp_o_f32: case aco_opcode::v_cmp_u_f32: - case aco_opcode::v_cmp_nge_f32: - case aco_opcode::v_cmp_nlg_f32: - case aco_opcode::v_cmp_ngt_f32: - case aco_opcode::v_cmp_nle_f32: - case aco_opcode::v_cmp_neq_f32: - case aco_opcode::v_cmp_nlt_f32: + case aco_opcode::v_cmp_o_f64: + case aco_opcode::v_cmp_u_f64: + #undef CMP ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get()); break; case aco_opcode::s_cselect_b64: @@ -1384,17 +1390,32 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } } -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode *unordered, aco_opcode *inverse) +struct CmpInfo { + aco_opcode ordered; + aco_opcode unordered; + aco_opcode inverse; + aco_opcode f32; + unsigned size; +}; + +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info) { - *ordered = *unordered = op; + info->ordered = aco_opcode::num_opcodes; + info->unordered = aco_opcode::num_opcodes; switch (op) { - #define CMP(ord, unord) \ - case aco_opcode::v_cmp_##ord##_f32:\ - case aco_opcode::v_cmp_n##unord##_f32:\ - *ordered = aco_opcode::v_cmp_##ord##_f32;\ - *unordered = aco_opcode::v_cmp_n##unord##_f32;\ - 
*inverse = op == aco_opcode::v_cmp_n##unord##_f32 ? aco_opcode::v_cmp_##unord##_f32 : aco_opcode::v_cmp_n##ord##_f32;\ + #define CMP2(ord, unord, sz) \ + case aco_opcode::v_cmp_##ord##_f##sz:\ + case aco_opcode::v_cmp_n##unord##_f##sz:\ + info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\ + info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\ + info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\ + info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\ + info->size = sz;\ return true; + #define CMP(ord, unord) \ + CMP2(ord, unord, 16)\ + CMP2(ord, unord, 32)\ + CMP2(ord, unord, 64) CMP(lt, /*n*/ge) CMP(eq, /*n*/lg) CMP(le, /*n*/gt) @@ -1402,6 +1423,22 @@ ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode * CMP(lg, /*n*/eq) CMP(ge, /*n*/lt) #undef CMP + #undef CMP2 + #define ORD_TEST(sz) \ + case aco_opcode::v_cmp_u_f##sz:\ + info->f32 = aco_opcode::v_cmp_u_f32;\ + info->inverse = aco_opcode::v_cmp_o_f##sz;\ + info->size = sz;\ + return true;\ + case aco_opcode::v_cmp_o_f##sz:\ + info->f32 = aco_opcode::v_cmp_o_f32;\ + info->inverse = aco_opcode::v_cmp_u_f##sz;\ + info->size = sz;\ + return true; + ORD_TEST(16) + ORD_TEST(32) + ORD_TEST(64) + #undef ORD_TEST default: return false; } @@ -1409,26 +1446,38 @@ ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode * aco_opcode get_ordered(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::num_opcodes; + CmpInfo info; + return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; } aco_opcode get_unordered(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::num_opcodes; + CmpInfo info; + return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; } aco_opcode get_inverse(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::num_opcodes; + CmpInfo info; + return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; +} + +aco_opcode get_f32_cmp(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; +} + +unsigned get_cmp_bitsize(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.size : 0; } bool is_cmp(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse); + CmpInfo info; + return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; } unsigned original_temp_id(opt_ctx &ctx, Temp tmp) @@ -1484,14 +1533,18 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) Instruction *op_instr[2]; Temp op[2]; + unsigned bitsize = 0; for (unsigned i = 0; i < 2; i++) { op_instr[i] = follow_operand(ctx, instr->operands[i], true); if (!op_instr[i]) return false; aco_opcode expected_cmp = is_or ? 
aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; + unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode); - if (op_instr[i]->opcode != expected_cmp) + if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp) + return false; + if (bitsize && op_bitsize != bitsize) return false; if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp()) return false; @@ -1511,6 +1564,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) return false; op[i] = op1; + bitsize = op_bitsize; } if (op[1].type() == RegType::sgpr) @@ -1524,7 +1578,18 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) decrease_uses(ctx, op_instr[0]); decrease_uses(ctx, op_instr[1]); - aco_opcode new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + aco_opcode new_op = aco_opcode::num_opcodes; + switch (bitsize) { + case 16: + new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; + break; + case 32: + new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + break; + case 64: + new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; + break; + } Instruction *new_instr; if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) { VOP3A_instruction *vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); @@ -1566,12 +1631,12 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) if (!nan_test || !cmp) return false; - if (cmp->opcode == expected_nan_test) + if (get_f32_cmp(cmp->opcode) == expected_nan_test) std::swap(nan_test, cmp); - else if (nan_test->opcode != expected_nan_test) + else if (get_f32_cmp(nan_test->opcode) != expected_nan_test) return false; - if (!is_cmp(cmp->opcode)) + if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode)) return false; if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) @@ -1637,12 +1702,12 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in return false; aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; - if (cmp->opcode == expected_nan_test) + if (get_f32_cmp(cmp->opcode) == expected_nan_test) std::swap(nan_test, cmp); - else if (nan_test->opcode != expected_nan_test) + else if (get_f32_cmp(nan_test->opcode) != expected_nan_test) return false; - if (!is_cmp(cmp->opcode)) + if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode)) return false; if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) -- GitLab From 575b431c80170c2f3234e996947b363c4db5dcb9 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 19 May 2020 11:45:12 +0100 Subject: [PATCH 16/21] aco: validate sub-dword pseudo instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. 
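As an illustrative sketch (hypothetical IR, not taken from an existing
shader), the new checks are meant to flag sub-dword pseudo instructions
like the following, given an existing Builder bld:

   Temp hi = bld.tmp(v2b);
   /* constant operand on a sub-dword pseudo instruction:
    * invalid before GFX9, accepted on GFX9+ */
   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
              Operand((uint16_t)0x3c00u), hi);
   /* 16-bit literal (0x1234 has no inline-constant encoding):
    * invalid on all generations */
   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
              Operand((uint16_t)0x1234u), hi);

The index operand of p_extract_vector is exempt from both checks.
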
Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_validate.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index d4ba88e014a3..53c16bc5ac7b 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -272,6 +272,24 @@ void validate(Program* program, FILE * output) switch (instr->format) { case Format::PSEUDO: { + bool is_subdword = false; + bool has_const_sgpr = false; + bool has_literal = false; + for (Definition def : instr->definitions) + is_subdword |= def.regClass().is_subdword(); + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->opcode == aco_opcode::p_extract_vector && i == 1) + continue; + Operand op = instr->operands[i]; + is_subdword |= op.hasRegClass() && op.regClass().is_subdword(); + has_const_sgpr |= op.isConstant() || (op.hasRegClass() && op.regClass().type() == RegType::sgpr); + has_literal |= op.isLiteral(); + } + + check(!is_subdword || !has_const_sgpr || program->chip_class >= GFX9, + "Sub-dword pseudo instructions can only take constants or SGPRs on GFX9+", instr.get()); + check(!is_subdword || !has_literal, "Sub-dword pseudo instructions cannot take literals", instr.get()); + if (instr->opcode == aco_opcode::p_create_vector) { unsigned size = 0; for (const Operand& op : instr->operands) { -- GitLab From 3c1b55962e0da924bc96a16689c9421888891959 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 9 Apr 2020 16:41:00 +0200 Subject: [PATCH 17/21] aco: allow to swap operands for some 16-bit float instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 0934a6f32726..5e68e3f3b432 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -509,26 +509,41 @@ bool can_swap_operands(aco_ptr& instr) return false; switch (instr->opcode) { + case aco_opcode::v_add_f16: case aco_opcode::v_add_f32: + case aco_opcode::v_mul_f16: case aco_opcode::v_mul_f32: case aco_opcode::v_or_b32: case aco_opcode::v_and_b32: case aco_opcode::v_xor_b32: + case aco_opcode::v_max_f16: case aco_opcode::v_max_f32: + case aco_opcode::v_min_f16: case aco_opcode::v_min_f32: case aco_opcode::v_max_i32: case aco_opcode::v_min_i32: case aco_opcode::v_max_u32: case aco_opcode::v_min_u32: + case aco_opcode::v_cmp_eq_f16: case aco_opcode::v_cmp_eq_f32: + case aco_opcode::v_cmp_lg_f16: case aco_opcode::v_cmp_lg_f32: return true; + case aco_opcode::v_sub_f16: + instr->opcode = aco_opcode::v_subrev_f16; + return true; case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; + case aco_opcode::v_cmp_lt_f16: + instr->opcode = aco_opcode::v_cmp_gt_f16; + return true; case aco_opcode::v_cmp_lt_f32: instr->opcode = aco_opcode::v_cmp_gt_f32; return true; + case aco_opcode::v_cmp_ge_f16: + instr->opcode = aco_opcode::v_cmp_le_f16; + return true; case aco_opcode::v_cmp_ge_f32: instr->opcode = aco_opcode::v_cmp_le_f32; return true; -- GitLab From 82de70d06e1b678aa2426ae647327ca2f418dc0e Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 19 May 2020 13:41:43 +0100 Subject: [PATCH 18/21] aco: add more opcodes to can_swap_operands MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Navi, fp16 enabled): Totals from 310 (0.24% of 127638) affected shaders: CodeSize: 1290508 -> 1289716 (-0.06%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 89 ++++++++++++++++++------------ 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5e68e3f3b432..1e2d4841891b 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -502,6 +502,18 @@ struct opt_ctx { std::vector uses; }; +struct CmpInfo { + aco_opcode ordered; + aco_opcode unordered; + aco_opcode ordered_swapped; + aco_opcode unordered_swapped; + aco_opcode inverse; + aco_opcode f32; + unsigned size; +}; + +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info); + bool can_swap_operands(aco_ptr& instr) { if (instr->operands[0].isConstant() || @@ -524,10 +536,14 @@ bool can_swap_operands(aco_ptr& instr) case aco_opcode::v_min_i32: case aco_opcode::v_max_u32: case aco_opcode::v_min_u32: - case aco_opcode::v_cmp_eq_f16: - case aco_opcode::v_cmp_eq_f32: - case aco_opcode::v_cmp_lg_f16: - case aco_opcode::v_cmp_lg_f32: + case aco_opcode::v_max_i16: + case aco_opcode::v_min_i16: + case aco_opcode::v_max_u16: + case aco_opcode::v_min_u16: + case aco_opcode::v_max_i16_e64: + case aco_opcode::v_min_i16_e64: + case aco_opcode::v_max_u16_e64: + case aco_opcode::v_min_u16_e64: return true; case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; @@ -535,24 +551,29 @@ bool can_swap_operands(aco_ptr& instr) case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; - case aco_opcode::v_cmp_lt_f16: - instr->opcode = aco_opcode::v_cmp_gt_f16; - return true; - case aco_opcode::v_cmp_lt_f32: - instr->opcode = aco_opcode::v_cmp_gt_f32; - return true; - case aco_opcode::v_cmp_ge_f16: - instr->opcode = aco_opcode::v_cmp_le_f16; + case aco_opcode::v_sub_co_u32: + instr->opcode = aco_opcode::v_subrev_co_u32; return true; - case aco_opcode::v_cmp_ge_f32: - instr->opcode = aco_opcode::v_cmp_le_f32; + case aco_opcode::v_sub_u16: + instr->opcode = aco_opcode::v_subrev_u16; return true; - case aco_opcode::v_cmp_lt_i32: - instr->opcode = aco_opcode::v_cmp_gt_i32; + case aco_opcode::v_sub_u32: + instr->opcode = aco_opcode::v_subrev_u32; return true; - default: + default: { + CmpInfo info; + get_cmp_info(instr->opcode, &info); + if (info.ordered == instr->opcode) { + instr->opcode = info.ordered_swapped; + return true; + } + if (info.unordered == instr->opcode) { + instr->opcode = info.unordered_swapped; + return true; + } return false; } + } } bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) @@ -1405,38 +1426,34 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } } -struct CmpInfo { - aco_opcode ordered; - aco_opcode unordered; - aco_opcode inverse; - aco_opcode f32; - unsigned size; -}; - ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info) { info->ordered = aco_opcode::num_opcodes; info->unordered = aco_opcode::num_opcodes; + info->ordered_swapped = aco_opcode::num_opcodes; + info->unordered_swapped = aco_opcode::num_opcodes; switch (op) { - #define CMP2(ord, unord, sz) \ + #define CMP2(ord, unord, ord_swap, unord_swap, sz) \ case aco_opcode::v_cmp_##ord##_f##sz:\ case aco_opcode::v_cmp_n##unord##_f##sz:\ info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\ info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\ + info->ordered_swapped = 
aco_opcode::v_cmp_##ord_swap##_f##sz;\ + info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\ info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\ info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\ info->size = sz;\ return true; - #define CMP(ord, unord) \ - CMP2(ord, unord, 16)\ - CMP2(ord, unord, 32)\ - CMP2(ord, unord, 64) - CMP(lt, /*n*/ge) - CMP(eq, /*n*/lg) - CMP(le, /*n*/gt) - CMP(gt, /*n*/le) - CMP(lg, /*n*/eq) - CMP(ge, /*n*/lt) + #define CMP(ord, unord, ord_swap, unord_swap) \ + CMP2(ord, unord, ord_swap, unord_swap, 16)\ + CMP2(ord, unord, ord_swap, unord_swap, 32)\ + CMP2(ord, unord, ord_swap, unord_swap, 64) + CMP(lt, /*n*/ge, gt, /*n*/le) + CMP(eq, /*n*/lg, eq, /*n*/lg) + CMP(le, /*n*/gt, ge, /*n*/lt) + CMP(gt, /*n*/le, lt, /*n*/le) + CMP(lg, /*n*/eq, lg, /*n*/eq) + CMP(ge, /*n*/lt, le, /*n*/gt) #undef CMP #undef CMP2 #define ORD_TEST(sz) \ -- GitLab From e9578e303386c22940b62a88ea85cea794124537 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 11 Jun 2020 14:05:48 +0100 Subject: [PATCH 19/21] aco: allow GFX9 partial writes with instructions which use opsel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some instructions such as v_mad_f16 can do partial writes on GFX9. No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_register_allocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 505f5cb613df..4841b5eb6f00 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -429,7 +429,7 @@ std::pair get_subdword_definition_info(Program *program, con if (can_use_SDWA(chip, instr)) { return std::make_pair(rc.bytes(), rc.bytes()); } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) { - return std::make_pair(2u, chip >= GFX10 ? 2u : 4u); + return std::make_pair(2u, bytes_written); } switch (instr->opcode) { -- GitLab From 82c265a51467ec8c112146bc7a2875609d5be0cf Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 11 Jun 2020 14:22:13 +0100 Subject: [PATCH 20/21] aco: improve check for moving temporaries out of fixed definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. 
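A hypothetical example of what the stricter check is meant to catch
(register numbers are illustrative):

   /* the definition is fixed to v[0-1] (8 bytes); v0 is free but v1
    * still holds a live temporary %t
    *
    *   register_file[definition.physReg().reg()] != 0
    *     old check: presumably only the entry for the first dword (v0)
    *     is inspected, so %t is not seen as blocking
    *
    *   register_file.test(definition.physReg(), definition.bytes())
    *     new check: every byte covered by the definition is tested, so
    *     %t is detected and moved out of the way with a parallelcopy
    *     before the definition is written
    */
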
Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_register_allocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 4841b5eb6f00..985aae4cafda 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -2085,7 +2085,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bl adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); /* check if the target register is blocked */ - if (register_file[definition.physReg().reg()] != 0) { + if (register_file.test(definition.physReg(), definition.bytes())) { /* create parallelcopy pair to move blocking vars */ std::set> vars = collect_vars(ctx, register_file, definition.physReg(), definition.size()); -- GitLab From a02e7f679975bba76ee2a5c64b5b43432619b5a5 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 11 Jun 2020 14:06:32 +0100 Subject: [PATCH 21/21] aco: fix encoding of certain s_setreg_imm32_b32 instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the mode is too small, the operand will be an inline constant and the literal dword won't be written. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 5e93dc603e6b..441e1b6b8e5e 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1658,7 +1658,9 @@ void lower_to_hw_instr(Program* program) assert(block->kind & block_kind_top_level); uint32_t mode = block->fp_mode.val; /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */ - bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(mode), (7 << 11) | 1); + Instruction *instr = bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(mode), (7 << 11) | 1).instr; + /* has to be a literal */ + instr->operands[0].setFixed(PhysReg{255}); } for (size_t j = 0; j < block->instructions.size(); j++) { -- GitLab
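
Why the forced literal encoding in PATCH 21 matters, sketched with the
Operand rules introduced earlier in this series (values are illustrative):

   /* Operand(uint32_t) maps small values onto inline-constant
    * encodings, e.g. a mode value of 0: */
   Operand op(0u);                              /* fixed to PhysReg{128} */
   /* an inline constant does not emit the trailing 32-bit literal
    * dword, but s_setreg_imm32_b32 always needs that dword, so the
    * operand is forced back to the literal encoding: */
   instr->operands[0].setFixed(PhysReg{255});   /* 255 = literal */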