Commit 26eb42e5 authored by Rhys Perry
Browse files

aco: combine conversions into fp16 by promoting to fp32



Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
parent f9b90c54
......@@ -122,16 +122,16 @@ enum Label {
label_insert = 1ull << 34,
label_dpp16 = 1ull << 35,
label_dpp8 = 1ull << 36,
label_f2f32 = 1ull << 37,
label_f2f16 = 1ull << 38,
label_f2f_in = 1ull << 37,
label_f2f_out = 1ull << 38,
};
static constexpr uint64_t instr_usedef_labels =
label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise |
label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 |
label_dpp8 | label_f2f32;
label_dpp8 | label_f2f_in;
static constexpr uint64_t instr_mod_labels =
label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16;
label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f_out;
static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels;
static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
......@@ -327,13 +327,13 @@ struct ssa_info {
bool is_clamp() { return label & label_clamp; }
void set_f2f16(Instruction* conv)
void set_f2f_out(Instruction* conv)
{
add_label(label_f2f16);
add_label(label_f2f_out);
instr = conv;
}
bool is_f2f16() { return label & label_f2f16; }
bool is_f2f_out() { return label & label_f2f_out; }
void set_undefined() { add_label(label_undefined); }
......@@ -451,13 +451,13 @@ struct ssa_info {
bool is_canonicalized() { return label & label_canonicalized; }
void set_f2f32(Instruction* cvt)
void set_f2f_in(Instruction* cvt)
{
add_label(label_f2f32);
add_label(label_f2f_in);
instr = cvt;
}
bool is_f2f32() { return label & label_f2f32; }
bool is_f2f_in() { return label & label_f2f_in; }
void set_extract(Instruction* extract)
{
......@@ -1096,11 +1096,11 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
return;
}
/* Output modifier, label_vopc and label_f2f32 seem to be the only one worth keeping at the
/* Output modifier, label_vopc and label_f2f_in seem to be the only one worth keeping at the
* moment
*/
for (Definition& def : instr->definitions)
ctx.info[def.tempId()].label &= (label_vopc | label_f2f32 | instr_mod_labels);
ctx.info[def.tempId()].label &= (label_vopc | label_f2f_in | instr_mod_labels);
}
void
......@@ -1899,13 +1899,18 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
break;
}
case aco_opcode::v_cvt_f16_f32: {
if (instr->operands[0].isTemp())
ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get());
if (instr->operands[0].isTemp()) {
ctx.info[instr->definitions[0].tempId()].set_f2f_in(instr.get());
ctx.info[instr->operands[0].tempId()].set_f2f_out(instr.get());
}
break;
}
case aco_opcode::v_cvt_f32_f16: {
if (instr->operands[0].isTemp())
ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get());
if (instr->operands[0].isTemp()) {
ctx.info[instr->definitions[0].tempId()].set_f2f_in(instr.get());
if (!(ctx.info[instr->operands[0].tempId()].label & label_extract))
ctx.info[instr->operands[0].tempId()].set_f2f_out(instr.get());
}
break;
}
default: break;
......@@ -3074,7 +3079,8 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
ctx.info[instr->definitions[0].tempId()].label &=
label_clamp | label_insert | label_f2f_out;
ctx.uses[def_info.instr->definitions[0].tempId()]--;
return true;
......@@ -3459,10 +3465,15 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
return false;
switch (instr->opcode) {
case aco_opcode::v_add_f16:
case aco_opcode::v_add_f32:
case aco_opcode::v_sub_f16:
case aco_opcode::v_sub_f32:
case aco_opcode::v_subrev_f16:
case aco_opcode::v_subrev_f32:
case aco_opcode::v_mul_f16:
case aco_opcode::v_mul_f32:
case aco_opcode::v_fma_f16:
case aco_opcode::v_fma_f32: break;
case aco_opcode::v_fma_mix_f32:
case aco_opcode::v_fma_mixlo_f16: return true;
......@@ -3476,19 +3487,34 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->isVOP3())
return !instr->vop3().omod && !(instr->vop3().opsel & 0x8);
if (instr->isSDWA()) {
SDWA_instruction* sdwa = &instr->sdwa();
if (sdwa->dst_sel.size() != instr->definitions[0].bytes() || sdwa->dst_sel.offset() ||
sdwa->omod)
return false;
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (sdwa->sel[i].size() != instr->definitions[0].bytes())
return false;
}
return true;
}
return instr->format == Format::VOP2;
}
void
to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool* is_add_ptr)
{
bool is_add = instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
bool is_32bit = instr->definitions[0].bytes() == 4;
bool is_mul = instr->opcode == aco_opcode::v_mul_f16 || instr->opcode == aco_opcode::v_mul_f32;
bool is_add = !is_mul && instr->operands.size() == 2;
aco_opcode opcode = is_32bit ? aco_opcode::v_fma_mix_f32 : aco_opcode::v_fma_mixlo_f16;
aco_ptr<VOP3P_instruction> vop3p{
create_instruction<VOP3P_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};
create_instruction<VOP3P_instruction>(opcode, Format::VOP3P, 3, 1)};
vop3p->opsel_lo = instr->isVOP3() ? (instr->vop3().opsel & 0x7) << is_add : 0x0;
vop3p->opsel_hi = 0x0;
vop3p->opsel_hi = is_32bit ? 0x0 : 0x7;
for (unsigned i = 0; i < instr->operands.size(); i++) {
vop3p->operands[is_add + i] = instr->operands[i];
vop3p->neg_lo[is_add + i] = instr->isVOP3() && instr->vop3().neg[i];
......@@ -3497,32 +3523,36 @@ to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
vop3p->neg_hi[is_add + i] |= instr->isSDWA() && instr->sdwa().abs[i];
vop3p->opsel_lo |= (instr->isSDWA() && instr->sdwa().sel[i].offset()) << (is_add + i);
}
if (instr->opcode == aco_opcode::v_mul_f32) {
if (is_mul) {
vop3p->opsel_hi &= 0x3;
vop3p->operands[2] = Operand::zero();
vop3p->neg_lo[2] = true;
} else if (is_add) {
vop3p->opsel_hi &= 0x6;
vop3p->operands[0] = Operand::c32(0x3f800000);
if (instr->opcode == aco_opcode::v_sub_f32)
if (instr->opcode == aco_opcode::v_sub_f16 || instr->opcode == aco_opcode::v_sub_f32)
vop3p->neg_lo[2] ^= true;
else if (instr->opcode == aco_opcode::v_subrev_f32)
else if (instr->opcode == aco_opcode::v_subrev_f16 ||
instr->opcode == aco_opcode::v_subrev_f32)
vop3p->neg_lo[1] ^= true;
}
vop3p->definitions[0] = instr->definitions[0];
vop3p->clamp = instr->isVOP3() && instr->vop3().clamp;
instr = std::move(vop3p);
ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
ctx.info[instr->definitions[0].tempId()].label &= label_f2f_out | label_clamp | label_mul;
if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
ctx.info[instr->definitions[0].tempId()].instr = instr.get();
if (is_add_ptr)
*is_add_ptr = is_add;
}
bool
combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
if (!def_info.is_f2f16())
if (!def_info.is_f2f_out())
return false;
Instruction* conv = def_info.instr;
......@@ -3532,19 +3562,34 @@ combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (!ctx.uses[conv->definitions[0].tempId()])
return false;
if (conv->usesModifiers())
if (conv->definitions[0].bytes() == 4 &&
(instr->definitions[0].isPrecise() || conv->definitions[0].isPrecise()))
return false;
if ((conv->opcode == aco_opcode::v_cvt_f32_f16 ? 2 : 4) != instr->definitions[0].bytes() ||
conv->usesModifiers())
return false;
if (!instr->isVOP3P())
to_mad_mix(ctx, instr);
to_mad_mix(ctx, instr, NULL);
instr->opcode = aco_opcode::v_fma_mixlo_f16;
if (conv->opcode == aco_opcode::v_cvt_f16_f32)
instr->opcode = aco_opcode::v_fma_mixlo_f16;
else
instr->opcode = aco_opcode::v_fma_mix_f32;
instr->definitions[0].swapTemp(conv->definitions[0]);
if (conv->definitions[0].isPrecise())
instr->definitions[0].setPrecise(true);
ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
ctx.uses[conv->definitions[0].tempId()]--;
if ((instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
instr->opcode == aco_opcode::v_fma_mix_f32) &&
instr->operands[2].constantEquals(0) && instr->vop3p().neg_lo[2] &&
!ctx.info[instr->definitions[0].tempId()].is_clamp()) {
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
}
return true;
}
......@@ -3558,12 +3603,13 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (!instr->operands[i].isTemp())
continue;
Temp tmp = instr->operands[i].getTemp();
if (!ctx.info[tmp.id()].is_f2f32())
if (!ctx.info[tmp.id()].is_f2f_in())
continue;
Instruction* conv = ctx.info[tmp.id()].instr;
if (conv->isSDWA() && (conv->sdwa().dst_sel.size() != 4 || conv->sdwa().sel[0].size() != 2 ||
conv->sdwa().clamp || conv->sdwa().omod)) {
if (conv->isSDWA() &&
(conv->opcode != aco_opcode::v_cvt_f32_f16 || conv->sdwa().dst_sel.size() != 4 ||
conv->sdwa().sel[0].size() != 2 || conv->sdwa().clamp || conv->sdwa().omod)) {
continue;
} else if (conv->isVOP3() && (conv->vop3().clamp || conv->vop3().omod)) {
continue;
......@@ -3571,7 +3617,10 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
continue;
}
if (get_operand_size(instr, i) != 32)
if (conv->definitions[0].bytes() * 8 != get_operand_size(instr, i))
continue;
if (conv->definitions[0].bytes() == 2 &&
(conv->definitions[0].isPrecise() || instr->definitions[0].isPrecise()))
continue;
/* Conversion to VOP3P will add inline constant operands, but that shouldn't affect
......@@ -3584,9 +3633,8 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
continue;
if (!instr->isVOP3P()) {
bool is_add =
instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
to_mad_mix(ctx, instr);
bool is_add;
to_mad_mix(ctx, instr, &is_add);
i += is_add;
}
......@@ -3769,8 +3817,8 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* v_fma_mix_f32/etc can't do omod */
if (info.instr->isVOP3P() && instr->isVOP3() && instr->vop3().omod)
continue;
/* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */
if (is_add_mix && info.instr->definitions[0].bytes() == 2)
if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8)
continue;
if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8)
......@@ -3779,12 +3827,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32;
bool mad_mix = is_add_mix || info.instr->isVOP3P();
bool is_mad_mix_fused =
ctx.program->dev.fused_mad_mix || info.instr->definitions[0].bytes() == 2;
bool has_fma = mad16 || mad64 || (legacy && ctx.program->chip_class >= GFX10_3) ||
(mad32 && !legacy && !mad_mix && ctx.program->dev.has_fast_fma32) ||
(mad_mix && ctx.program->dev.fused_mad_mix);
bool has_mad = mad_mix ? !ctx.program->dev.fused_mad_mix
(mad32 && !legacy && ctx.program->dev.has_fast_fma32) ||
(mad_mix && is_mad_mix_fused);
bool has_mad = mad_mix ? !is_mad_mix_fused
: ((mad32 && ctx.program->chip_class < GFX10_3) ||
(mad16 && ctx.program->chip_class <= GFX9));
(mad16 && ctx.program->chip_class <= GFX9) || mad64);
bool can_use_fma = has_fma && !info.instr->definitions[0].isPrecise() &&
!instr->definitions[0].isPrecise();
bool can_use_mad =
......@@ -3892,6 +3942,11 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (add_instr->isVOP3P() || mul_instr->isVOP3P()) {
assert(!omod);
if (mul_instr->definitions[0].bytes() == 2 && !mul_instr->isVOP3P())
opsel_hi |= 0x3;
if (add_instr->definitions[0].bytes() == 2 && !add_instr->isVOP3P())
opsel_hi |= 0x4;
aco_opcode mad_op = add_instr->definitions[0].bytes() == 2 ? aco_opcode::v_fma_mixlo_f16
: aco_opcode::v_fma_mix_f32;
aco_ptr<VOP3P_instruction> mad{
......
......@@ -344,14 +344,14 @@ Temp fsat(Temp src, Builder b)
Temp ext_ushort(Temp src, unsigned idx, Builder b)
{
return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx),
Operand::c32(16u), Operand::c32(false));
return b.pseudo(aco_opcode::p_extract, b.def(v1), src, Operand::c32(idx), Operand::c32(16u),
Operand::c32(false));
}
Temp ext_ubyte(Temp src, unsigned idx, Builder b)
{
return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx),
Operand::c32(8u), Operand::c32(false));
return b.pseudo(aco_opcode::p_extract, b.def(v1), src, Operand::c32(idx), Operand::c32(8u),
Operand::c32(false));
}
VkDevice get_vk_device(enum chip_class chip_class)
......
......@@ -1187,6 +1187,22 @@ BEGIN_TEST(optimize.mad_mix.input_conv.basic)
//! p_unit_test 4, %res4
writeout(4, fma(a, a, f2f32(a16)));
//! v2b: %res5 = v_fma_mixlo_f16 lo(%a16), %a, -0
//! p_unit_test 5, %res5
writeout(5, fmul(a16, f2f16(a)));
//! v2b: %res6 = v_fma_mixlo_f16 1.0, lo(%a16), %a
//! p_unit_test 6, %res6
writeout(6, fadd(a16, f2f16(a)));
//! v2b: %res7 = v_fma_mixlo_f16 1.0, %a, lo(%a16)
//! p_unit_test 7, %res7
writeout(7, fadd(f2f16(a), a16));
//! v2b: %res8 = v_fma_mixlo_f16 lo(%a16), lo(%a16), %a
//! p_unit_test 8, %res8
writeout(8, fma(a16, a16, f2f16(a)));
finish_opt_test();
}
END_TEST
......@@ -1337,8 +1353,8 @@ END_TEST
BEGIN_TEST(optimize.mad_mix.output_conv.basic)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i))
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16, v2b: %c16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b v2b", (chip_class)i))
continue;
Temp a = inputs[0];
......@@ -1346,6 +1362,7 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic)
Temp c = inputs[2];
Temp a16 = inputs[3];
Temp b16 = inputs[4];
Temp c16 = inputs[5];
//! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0
//! p_unit_test 0, %res0
......@@ -1371,6 +1388,30 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic)
//! p_unit_test 5, %res5
writeout(5, f2f16(fma(a, f2f32(b16), c)));
//! v1: %res6 = v_fma_mix_f32 lo(%a16), lo(%b16), -0
//! p_unit_test 6, %res6
writeout(6, f2f32(fmul(a16, b16)));
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%a16), lo(%b16)
//! p_unit_test 7, %res7
writeout(7, f2f32(fadd(a16, b16)));
//! v1: %res8 = v_fma_mix_f32 lo(%a16), lo(%b16), lo(%c16)
//! p_unit_test 8, %res8
writeout(8, f2f32(fma(a16, b16, c16)));
//! v1: %res9 = v_fma_mix_f32 %a, lo(%b16), -0
//! p_unit_test 9, %res9
writeout(9, f2f32(fmul(f2f16(a), b16)));
//! v1: %res10 = v_fma_mix_f32 1.0, lo(%a16), %b
//! p_unit_test 10, %res10
writeout(10, f2f32(fadd(a16, f2f16(b))));
//! v1: %res11 = v_fma_mix_f32 lo(%a16), %b, lo(%c16)
//! p_unit_test 11, %res11
writeout(11, f2f32(fma(a16, f2f16(b), c16)));
finish_opt_test();
}
END_TEST
......@@ -1430,10 +1471,10 @@ BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
writeout(3, f2f32(fneg(fadd(a16, b16))));
/* sdwa */
//! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b
//! v2b: %res4 = p_extract %res4_add, 0, 8, 0
//! v1: %res4_add = v_fma_mix_f32 1.0, lo(%a16), lo(%b16)
//! v1: %res4 = p_extract %res4_add, 0, 16, 0
//! p_unit_test 4, %res4
writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0));
writeout(4, ext_ushort(f2f32(fadd(a16, b16)), 0));
//! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
//! v2b: %res5 = v_cvt_f16_f32 %res5_mul
......@@ -1446,15 +1487,16 @@ END_TEST
BEGIN_TEST(optimize.mad_mix.fma.basic)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i))
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16, v2b: %c16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b v2b", (chip_class)i))
continue;
Temp a = inputs[0];
Temp b = inputs[1];
Temp c = inputs[2];
Temp a16 = inputs[3];
Temp c16 = inputs[4];
Temp b16 = inputs[4];
Temp c16 = inputs[5];
//! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
//! p_unit_test 0, %res0
......@@ -1487,10 +1529,32 @@ BEGIN_TEST(optimize.mad_mix.fma.basic)
//! p_unit_test 6, %res6
writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16)));
/* output conversions */
//! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c
/* fp16 */
//! v2b: %res7 = v_fma_mixlo_f16 %a, lo(%b16), lo(%c16)
//! p_unit_test 7, %res7
writeout(7, f2f16(fadd(fmul(a, b), c)));
writeout(7, fadd(fmul(f2f16(a), b16), c16));
//! v2b: %res8 = v_fma_mixlo_f16 lo(%a16), lo(%b16), %c
//! p_unit_test 8, %res8
writeout(8, fadd(fmul(a16, b16), f2f16(c)));
/* conversions in the middle */
//! v2b: %res9 = v_fma_mixlo_f16 %a, %b, lo(%c16)
//! p_unit_test 9, %res9
writeout(9, fadd(f2f16(fmul(a, b)), c16));
//! v1: %res10 = v_fma_mix_f32 lo(%a16), lo(%b16), %c
//! p_unit_test 10, %res10
writeout(10, fadd(f2f32(fmul(a16, b16)), c));
/* output conversions */
//! v2b: %res11 = v_fma_mixlo_f16 %a, %b, %c
//! p_unit_test 11, %res11
writeout(11, f2f16(fadd(fmul(a, b), c)));
//! v1: %res12 = v_fma_mix_f32 lo(%a16), lo(%b16), lo(%c16)
//! p_unit_test 12, %res12
writeout(12, f2f32(fadd(fmul(a16, b16), c16)));
finish_opt_test();
}
......@@ -1498,8 +1562,8 @@ END_TEST
BEGIN_TEST(optimize.mad_mix.fma.precision)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i))
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16, v2b: %c16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b v2b", (chip_class)i))
continue;
Temp a = inputs[0];
......@@ -1507,6 +1571,7 @@ BEGIN_TEST(optimize.mad_mix.fma.precision)
Temp c = inputs[2];
Temp a16 = inputs[3];
Temp b16 = inputs[4];
Temp c16 = inputs[5];
/* the optimization is precise for 32-bit on GFX9 */
//~gfx9! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
......@@ -1521,36 +1586,52 @@ BEGIN_TEST(optimize.mad_mix.fma.precision)
//! p_unit_test 1, %res1
writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise()));
/* never promote 16-bit arithmetic to 32-bit */
//! v2b: %res2_tmp = v_cvt_f16_f32 %a
//! v2b: %res2 = v_add_f16 %res2_tmp, %b16
/* never combine precise 16-bit arithmetic */
//! v2b: (precise)%res2_tmp = v_mul_f16 %a16, %b16
//! v2b: %res2 = v_fma_mixlo_f16 1.0, lo(%res2_tmp), %c
//! p_unit_test 2, %res2
writeout(2, fadd(f2f16(a), b16));
writeout(2, fadd(fmul(a16, b16, bld.precise()), f2f16(c)));
//! v2b: %res3_tmp = v_cvt_f16_f32 %a
//! v2b: %res3 = v_mul_f16 %res3_tmp, %b16
//! v2b: %res3_tmp = v_fma_mixlo_f16 %a, lo(%b16), -0
//! v2b: (precise)%res3 = v_add_f16 %res3_tmp, %c16
//! p_unit_test 3, %res3
writeout(3, fmul(f2f16(a), b16));
writeout(3, fadd(fmul(f2f16(a), b16), c16, bld.precise()));
//! v2b: %res4_tmp = v_mul_f16 %a16, %b16
//! v1: %res4 = v_cvt_f32_f16 %res4_tmp
/* conversions in the middle: combining skips them, making it always unsafe */
//! v2b: (precise)%res4_tmp = v_fma_mixlo_f16 %1, %2, -0
//! v2b: %res4 = v_add_f16 %res4_tmp, %c16
//! p_unit_test 4, %res4
writeout(4, f2f32(fmul(a16, b16)));
writeout(4, fadd(f2f16(fmul(a, b, bld.precise())), c16));
//! v2b: %res5_tmp = v_add_f16 %a16, %b16
//! v1: %res5 = v_cvt_f32_f16 %res5_tmp
//! v2b: (precise)%res5_tmp = v_mul_f16 %4, %5
//! v1: %res5 = v_fma_mix_f32 1.0, lo(%res5_tmp), %c
//! p_unit_test 5, %res5
writeout(5, f2f32(fadd(a16, b16)));
writeout(5, fadd(f2f32(fmul(a16, b16, bld.precise())), c));
//! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -0
//! v2b: %res6 = v_add_f16 %res6_tmp, %a16
//! v2b: (precise)%res6_tmp = v_fma_mixlo_f16 %1, %2, -0
//! v2b: %res6 = v_add_f16 %res6_tmp, %c16
//! p_unit_test 6, %res6
writeout(6, fadd(f2f16(fmul(a, b)), a16));
writeout(6, fadd(f2f16(fmul(a, b), bld.precise()), c16));
//! v2b: %res7_tmp = v_mul_f16 %a16, %b16
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
//! v2b: %res7_tmp = v_mul_f16 %4, %5
//! v1: (precise)%res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
//! p_unit_test 7, %res7
writeout(7, fadd(f2f32(fmul(a16, b16)), c));
writeout(7, fadd(f2f32(fmul(a16, b16), bld.precise()), c));
//! v2b: (precise)%res8_tmp = v_fma_mixlo_f16 %1, %2, -0
//! v2b: (precise)%res8 = v_add_f16 %res8_tmp, %c16
//! p_unit_test 8, %res8
writeout(8, fadd(f2f16(fmul(a, b), bld.precise()), c16, bld.precise()));
//! v2b: %res9_tmp = v_mul_f16 %4, %5
//! v1: (precise)%res9 = v_fma_mix_f32 1.0, lo(%res9_tmp), %c
//! p_unit_test 9, %res9
writeout(9, fadd(f2f32(fmul(a16, b16), bld.precise()), c, bld.precise()));
//! v2b: %res10_tmp = v_fma_f16 %a16, %b16, %c16
//! v1: (precise)%res11 = v_cvt_f32_f16 %res10_tmp
//! p_unit_test 10, %res11
writeout(10, f2f32(fadd(fmul(a16, b16), c16), bld.precise()));
finish_opt_test();
}
......@@ -1569,13 +1650,25 @@ BEGIN_TEST(optimize.mad_mix.clamp)
//! p_unit_test 0, %res0
writeout(0, fsat(fmul(f2f32(a16), a)));
//! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! v2b: %res1 = v_fma_mixlo_f16 %a, lo(%a16), -0 clamp
//! p_unit_test 1, %res1
writeout(1, f2f16(fsat(fmul(a, a))));
writeout(1, fsat(fmul(f2f16(a), a16)));
//! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! p_unit_test 2, %res2
writeout(2, fsat(f2f16(fmul(a, a))));
writeout(2, f2f16(fsat(fmul(a, a))));
//! v2b: %res3 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! p_unit_test 3, %res3
writeout(3, fsat(f2f16(fmul(a, a))));
//! v1: %res4 = v_fma_mix_f32 lo(%a16), lo(%a16), -0 clamp
//! p_unit_test 4, %res4