diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 29fe8f3687662f8bde6033daf21528a37e3a9d83..83f8e161a7442a0c6187e6a8bc77b848ef2be1c3 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -340,7 +340,14 @@ void
 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 {
    Builder bld(ctx->program, ctx->block);
-   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
+   if (dst.regClass() == v2b && src.regClass() == v1) {
+      /* D16 subdword split: split src into two v2b halves, idx selects which one is dst */
+      Definition def0 = idx == 0 ? Definition(dst) : bld.def(v2b);
+      Definition def1 = idx == 0 ? bld.def(v2b) : Definition(dst);
+      bld.pseudo(aco_opcode::p_split_vector, def0, def1, src);
+   } else {
+      bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
+   }
 }
 
 Temp
@@ -354,6 +361,14 @@ emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
    assert(src.bytes() > (idx * dst_rc.bytes()));
    Builder bld(ctx->program, ctx->block);
 
+   if (dst_rc == v2b && src.regClass() == v2) {
+      /* we only split these into v1, so extract the containing v1 first */
+      src = emit_extract_vector(ctx, src, idx / 2, v1);
+      Temp dst = bld.tmp(dst_rc);
+      emit_extract_vector(ctx, src, idx & 1, dst);
+      return dst;
+   }
+
    auto it = ctx->allocated_vec.find(src.id());
    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
       if (it->second[idx].regClass() == dst_rc) {
@@ -387,7 +402,7 @@ emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
       return;
    RegClass rc;
    if (num_components > vec_src.size()) {
-      if (vec_src.type() == RegType::sgpr) {
+      if (vec_src.type() == RegType::sgpr || vec_src.regClass() == v2) {
          /* should still help get_alu_src() */
          emit_split_vector(ctx, vec_src, vec_src.size());
          return;
@@ -9515,7 +9530,9 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
 
    /* Build tex instruction */
    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
-   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+   bool d16 = instr->dest.ssa.bit_size == 16;
+   /* D16 dmask can have holes, but it is easier to optimize this way */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF || d16)
       dmask = u_bit_consecutive(0, util_last_bit(dmask));
    if (instr->is_sparse)
       dmask = MAX2(dmask, 1) | 0x10;
@@ -9523,7 +9540,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
       ctx->options->gfx_level >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
          ? ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array)
          : 0;
-   bool d16 = instr->dest.ssa.bit_size == 16;
+
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
    Temp tmp_dst = dst;
 
@@ -9538,6 +9555,10 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
       tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
       tmp_dst = bld.tmp(v1);
+   } else if (d16 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
+      RegClass rc = RegClass(RegType::vgpr, (util_bitcount(dmask) + 1) / 2);
+      if (rc != dst.regClass())
+         tmp_dst = bld.tmp(rc);
    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
               dst.type() == RegType::sgpr) {
       unsigned bytes = util_bitcount(dmask) * instr->dest.ssa.bit_size / 8;
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 96d5eddf77e93488b8b75791fa557b2a2d8b6334..1669dac376cce0035e2f4424142f44ddb3b89c4f 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1058,6 +1058,38 @@ parse_insert(Instruction* instr)
    }
 }
 
+/* Combine two nested sub-dword selections into a single one.
+ *
+ * "inner" is the selection whose extracted range "outer" reads from. The result
+ * uses the smaller of the two sizes and the sum of the two offsets.
+ *
+ * Returns a default-constructed SubdwordSel, which the callers test as false, if
+ * "outer" starts outside of the range extracted by "inner", or if "outer" is wider
+ * than "inner" (but not a full dword) and zero-extends a value "inner" sign-extends.
+ */
+SubdwordSel
+match_subdword_selection(SubdwordSel inner, SubdwordSel outer)
+{
+   /* the offset of outer must be within extracted range of inner */
+   if (outer.offset() >= inner.size())
+      return SubdwordSel();
+
+   /* don't remove the sign-extension when increasing the size further */
+   bool sign_extend = false;
+   if (outer.size() == 4)
+      sign_extend = inner.sign_extend();
+   else if (outer.size() <= inner.size())
+      sign_extend = outer.sign_extend();
+   else if (outer.sign_extend())
+      sign_extend = inner.sign_extend();
+   else if (inner.sign_extend())
+      return SubdwordSel();
+
+   unsigned size = std::min(inner.size(), outer.size());
+   unsigned offset = inner.offset() + outer.offset();
+   return SubdwordSel(size, offset, sign_extend);
+}
+
 bool
 can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
 {
@@ -1075,8 +1107,10 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
       return true;
    } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
               (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
-      if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
-         return false;
+      if (instr->isSDWA()) {
+         if (!match_subdword_selection(sel, instr->sdwa().sel[idx]))
+            return false;
+      }
       return true;
    } else if (instr->isVOP3() && sel.size() == 2 &&
               can_use_opsel(ctx.program->gfx_level, instr->opcode, idx) &&
@@ -1084,15 +1118,8 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
       return true;
    } else if (instr->opcode == aco_opcode::p_extract) {
       SubdwordSel instrSel = parse_extract(instr.get());
-
-      /* the outer offset must be within extracted range */
-      if (instrSel.offset() >= sel.size())
-         return false;
-
-      /* don't remove the sign-extension when increasing the size further */
-      if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
+      if (!match_subdword_selection(sel, instrSel))
          return false;
-
       return true;
    }
 
@@ -1132,21 +1159,16 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
    } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
               (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
       to_SDWA(ctx, instr);
+      sel = match_subdword_selection(sel, instr->sdwa().sel[idx]);
       static_cast<SDWA_instruction*>(instr.get())->sel[idx] = sel;
    } else if (instr->isVOP3()) {
       if (sel.offset())
          instr->vop3().opsel |= 1 << idx;
    } else if (instr->opcode == aco_opcode::p_extract) {
-      SubdwordSel instrSel = parse_extract(instr.get());
-
-      unsigned size = std::min(sel.size(), instrSel.size());
-      unsigned offset = sel.offset() + instrSel.offset();
-      unsigned sign_extend =
-         instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
-
-      instr->operands[1] = Operand::c32(offset / size);
-      instr->operands[2] = Operand::c32(size * 8u);
-      instr->operands[3] = Operand::c32(sign_extend);
+      sel = match_subdword_selection(sel, parse_extract(instr.get()));
+      instr->operands[1] = Operand::c32(sel.offset() / sel.size());
+      instr->operands[2] = Operand::c32(sel.size() * 8u);
+      instr->operands[3] = Operand::c32(sel.sign_extend());
       return;
    }
 