Commit 401b7ba3 authored by Rhys Perry's avatar Rhys Perry
Browse files

aco: improve do_pack_2x16() with zero constants



We can skip the v_or_b32 or use an instruction smaller than
v_alignbyte_b32.

Signed-off-by: Rhys Perry's avatarRhys Perry <pendingchaos02@gmail.com>
parent 7f330d16
Pipeline #749981 waiting for manual action with stages
......@@ -1451,8 +1451,8 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
/* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
(!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() ||
ctx->program->gfx_level >= GFX10)) {
(!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
ctx->program->gfx_level >= GFX10)))) {
if (hi.isConstant())
bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
Operand::c32(2u));
......@@ -1474,8 +1474,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
else
bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
Operand(def.physReg(), v1));
if (lo.constantValue())
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
Operand(def.physReg(), v1));
return;
}
if (hi.isConstant()) {
......@@ -1484,8 +1485,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
else
bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
Operand(def.physReg(), v1));
if (hi.constantValue())
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
Operand(def.physReg(), v1));
return;
}
......
......@@ -841,26 +841,54 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
finish_to_hw_instr_test();
END_TEST
BEGIN_TEST(to_hw_instr.pack2x16_alignbyte_constant)
BEGIN_TEST(to_hw_instr.pack2x16_constant)
PhysReg v0_lo{256};
PhysReg v0_hi{256};
PhysReg v1_lo{257};
PhysReg v1_hi{257};
v0_hi.reg_b += 2;
v1_hi.reg_b += 2;
if (!setup_cs(NULL, GFX10))
return;
for (amd_gfx_level lvl : {GFX10, GFX11}) {
if (!setup_cs(NULL, lvl))
continue;
/* prevent usage of v_pack_b32_f16 */
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
/* prevent usage of v_pack_b32_f16 */
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
//>> p_unit_test 0
//! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_hi, v2b), Operand::c16(0x3800));
//>> p_unit_test 0
//! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_hi, v2b), Operand::c16(0x3800));
//! s_endpgm
//! p_unit_test 1
//! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_hi, v2b), Operand::zero(2));
finish_to_hw_instr_test();
//! p_unit_test 2
//~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
//~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand(v1_lo, v2b), Operand::zero(2));
//! p_unit_test 3
//! v2b: %_:v[0][16:32] = v_and_b32 0xffff0000, %_:v[1][16:32]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand::zero(2), Operand(v1_hi, v2b));
//! p_unit_test 4
//! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
Operand::zero(2), Operand(v1_lo, v2b));
//! s_endpgm
finish_to_hw_instr_test();
}
END_TEST
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment