From 3826d1ca38c4766b87a226e4ad4388b1cbe9947c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 7 Mar 2022 10:19:56 +0100 Subject: [PATCH 1/4] aco: use split_vector to extract subdword components --- src/amd/compiler/aco_instruction_selection.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 29fe8f368766..ea1359e8d789 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -340,7 +340,14 @@ void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) { Builder bld(ctx->program, ctx->block); - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx)); + if (dst.regClass() == v2b && src.regClass() == v1) { + /* D16 subdword split */ + Definition def0 = idx == 0 ? Definition(dst) : bld.def(v2b); + Definition def1 = idx == 0 ? bld.def(v2b) : Definition(dst); + bld.pseudo(aco_opcode::p_split_vector, def0, def1, src); + } else { + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx)); + } } Temp -- GitLab From a79f2cccf1f53eb5b58b54c3ddd749ddc50b85fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 7 Mar 2022 14:27:38 +0100 Subject: [PATCH 2/4] aco: don't allow dmask holes for d16 texture loads This makes it easier to optimize dword accesses for packed math instructions. 
--- src/amd/compiler/aco_instruction_selection.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index ea1359e8d789..1bfaebcd37ce 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -9522,7 +9522,9 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) /* Build tex instruction */ unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf; - if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + bool d16 = instr->dest.ssa.bit_size == 16; + /* D16 dmask can have holes, but it is easier to optimize this way */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF || d16) dmask = u_bit_consecutive(0, util_last_bit(dmask)); if (instr->is_sparse) dmask = MAX2(dmask, 1) | 0x10; @@ -9530,7 +9532,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) ctx->options->gfx_level >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF ? ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array) : 0; - bool d16 = instr->dest.ssa.bit_size == 16; + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp tmp_dst = dst; @@ -9545,6 +9547,10 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? 
v2 : v4)); } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { tmp_dst = bld.tmp(v1); + } else if (d16 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) { + RegClass rc = RegClass(RegType::vgpr, (util_bitcount(dmask) + 1) / 2); + if (rc != dst.regClass()) + tmp_dst = bld.tmp(rc); } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) { unsigned bytes = util_bitcount(dmask) * instr->dest.ssa.bit_size / 8; -- GitLab From b9b1c3b14f92c9e85bacd394f83562e9601a934c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 7 Mar 2022 12:39:53 +0100 Subject: [PATCH 3/4] aco: split 16bit vec4 into dword components, only This improves subdword addressing and opsel optimization opportunities. Totals from 397 (0.29% of 134913) affected shaders: (GFX10.3) SpillVGPRs: 1812 -> 1790 (-1.21%); split: -1.60%, +0.39% CodeSize: 1795688 -> 1801644 (+0.33%); split: -0.17%, +0.50% Scratch: 180224 -> 176128 (-2.27%) Instrs: 317736 -> 318528 (+0.25%); split: -0.16%, +0.41% Latency: 5684532 -> 5670065 (-0.25%); split: -0.79%, +0.54% InvThroughput: 2579901 -> 2573088 (-0.26%); split: -0.87%, +0.61% VClause: 7590 -> 7638 (+0.63%); split: -0.14%, +0.78% Copies: 52328 -> 53017 (+1.32%); split: -0.75%, +2.07% Branches: 9498 -> 9492 (-0.06%); split: -0.08%, +0.02% PreVGPRs: 9462 -> 8717 (-7.87%) --- src/amd/compiler/aco_instruction_selection.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 1bfaebcd37ce..83f8e161a744 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -361,6 +361,14 @@ emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) assert(src.bytes() > (idx * dst_rc.bytes())); Builder bld(ctx->program, ctx->block); + if (dst_rc == v2b && src.regClass() == v2) { + /* we only split these into v1 */ + src = 
emit_extract_vector(ctx, src, idx / 2, v1); + Temp dst = bld.tmp(dst_rc); + emit_extract_vector(ctx, src, idx & 1, dst); + return dst; + } + auto it = ctx->allocated_vec.find(src.id()); if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) { if (it->second[idx].regClass() == dst_rc) { @@ -394,7 +402,7 @@ emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) return; RegClass rc; if (num_components > vec_src.size()) { - if (vec_src.type() == RegType::sgpr) { + if (vec_src.type() == RegType::sgpr || vec_src.regClass() == v2) { /* should still help get_alu_src() */ emit_split_vector(ctx, vec_src, vec_src.size()); return; -- GitLab From 8efce760b2151cb6ca229c9cc09f6d77dbc79300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 7 Mar 2022 15:54:39 +0100 Subject: [PATCH 4/4] aco/optimizer: combine extract into subdword SDWA instructions This fixes an issue where an SDWA instruction had already selected only part of a variable. 
Totals from 194 (0.14% of 134913) affected shaders: (GFX10.3) SpillVGPRs: 944 -> 946 (+0.21%); split: -1.17%, +1.38% CodeSize: 997680 -> 991232 (-0.65%); split: -0.78%, +0.13% Instrs: 169978 -> 169141 (-0.49%); split: -0.59%, +0.09% Latency: 2932016 -> 2908588 (-0.80%) InvThroughput: 1335218 -> 1323196 (-0.90%) VClause: 3881 -> 3872 (-0.23%); split: -0.36%, +0.13% Copies: 31058 -> 30211 (-2.73%); split: -3.08%, +0.36% PreVGPRs: 4724 -> 4587 (-2.90%) --- src/amd/compiler/aco_optimizer.cpp | 53 +++++++++++++++++++----------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 96d5eddf77e9..1669dac376cc 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1058,6 +1058,29 @@ parse_insert(Instruction* instr) } } +SubdwordSel +match_subdword_selection(SubdwordSel inner, SubdwordSel outer) +{ + /* the offset of outer must be within extracted range of inner */ + if (outer.offset() >= inner.size()) + return SubdwordSel(); + + /* don't remove the sign-extension when increasing the size further */ + bool sign_extend = false; + if (outer.size() == 4) + sign_extend = inner.sign_extend(); + else if (outer.size() <= inner.size()) + sign_extend = outer.sign_extend(); + else if (outer.sign_extend()) + sign_extend = inner.sign_extend(); + else if (inner.sign_extend()) + return SubdwordSel(); + + unsigned size = std::min(inner.size(), outer.size()); + unsigned offset = inner.offset() + outer.offset(); + return SubdwordSel(size, offset, sign_extend); +} + bool can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& info) { @@ -1075,8 +1098,10 @@ can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_i return true; } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) { - if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword) - return false; + 
if (instr->isSDWA()) { + if (!match_subdword_selection(sel, instr->sdwa().sel[idx])) + return false; + } return true; } else if (instr->isVOP3() && sel.size() == 2 && can_use_opsel(ctx.program->gfx_level, instr->opcode, idx) && @@ -1084,15 +1109,8 @@ can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_i return true; } else if (instr->opcode == aco_opcode::p_extract) { SubdwordSel instrSel = parse_extract(instr.get()); - - /* the outer offset must be within extracted range */ - if (instrSel.offset() >= sel.size()) - return false; - - /* don't remove the sign-extension when increasing the size further */ - if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend()) + if (!match_subdword_selection(sel, instrSel)) return false; - return true; } @@ -1132,21 +1150,16 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) { to_SDWA(ctx, instr); + sel = match_subdword_selection(sel, instr->sdwa().sel[idx]); static_cast(instr.get())->sel[idx] = sel; } else if (instr->isVOP3()) { if (sel.offset()) instr->vop3().opsel |= 1 << idx; } else if (instr->opcode == aco_opcode::p_extract) { - SubdwordSel instrSel = parse_extract(instr.get()); - - unsigned size = std::min(sel.size(), instrSel.size()); - unsigned offset = sel.offset() + instrSel.offset(); - unsigned sign_extend = - instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size()); - - instr->operands[1] = Operand::c32(offset / size); - instr->operands[2] = Operand::c32(size * 8u); - instr->operands[3] = Operand::c32(sign_extend); + sel = match_subdword_selection(sel, parse_extract(instr.get())); + instr->operands[1] = Operand::c32(sel.offset() / sel.size()); + instr->operands[2] = Operand::c32(sel.size() * 8u); + instr->operands[3] = Operand::c32(sel.sign_extend()); return; } -- GitLab