Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Rhys Perry
mesa
Commits
26eb42e5
Commit
26eb42e5
authored
Mar 17, 2022
by
Rhys Perry
Browse files
aco: combine conversions into fp16 by promoting to fp32
Signed-off-by:
Rhys Perry
<
pendingchaos02@gmail.com
>
parent
f9b90c54
Changes
3
Hide whitespace changes
Inline
Side-by-side
src/amd/compiler/aco_optimizer.cpp
View file @
26eb42e5
...
...
@@ -122,16 +122,16 @@ enum Label {
label_insert
=
1ull
<<
34
,
label_dpp16
=
1ull
<<
35
,
label_dpp8
=
1ull
<<
36
,
label_f2f
32
=
1ull
<<
37
,
label_f2f
16
=
1ull
<<
38
,
label_f2f
_in
=
1ull
<<
37
,
label_f2f
_out
=
1ull
<<
38
,
};
static
constexpr
uint64_t
instr_usedef_labels
=
label_vec
|
label_mul
|
label_mad
|
label_add_sub
|
label_vop3p
|
label_bitwise
|
label_uniform_bitwise
|
label_minmax
|
label_vopc
|
label_usedef
|
label_extract
|
label_dpp16
|
label_dpp8
|
label_f2f
32
;
label_dpp8
|
label_f2f
_in
;
static
constexpr
uint64_t
instr_mod_labels
=
label_omod2
|
label_omod4
|
label_omod5
|
label_clamp
|
label_insert
|
label_f2f
16
;
label_omod2
|
label_omod4
|
label_omod5
|
label_clamp
|
label_insert
|
label_f2f
_out
;
static
constexpr
uint64_t
instr_labels
=
instr_usedef_labels
|
instr_mod_labels
;
static
constexpr
uint64_t
temp_labels
=
label_abs
|
label_neg
|
label_temp
|
label_vcc
|
label_b2f
|
...
...
@@ -327,13 +327,13 @@ struct ssa_info {
bool
is_clamp
()
{
return
label
&
label_clamp
;
}
void
set_f2f
16
(
Instruction
*
conv
)
void
set_f2f
_out
(
Instruction
*
conv
)
{
add_label
(
label_f2f
16
);
add_label
(
label_f2f
_out
);
instr
=
conv
;
}
bool
is_f2f
16
()
{
return
label
&
label_f2f
16
;
}
bool
is_f2f
_out
()
{
return
label
&
label_f2f
_out
;
}
void
set_undefined
()
{
add_label
(
label_undefined
);
}
...
...
@@ -451,13 +451,13 @@ struct ssa_info {
bool
is_canonicalized
()
{
return
label
&
label_canonicalized
;
}
void
set_f2f
32
(
Instruction
*
cvt
)
void
set_f2f
_in
(
Instruction
*
cvt
)
{
add_label
(
label_f2f
32
);
add_label
(
label_f2f
_in
);
instr
=
cvt
;
}
bool
is_f2f
32
()
{
return
label
&
label_f2f
32
;
}
bool
is_f2f
_in
()
{
return
label
&
label_f2f
_in
;
}
void
set_extract
(
Instruction
*
extract
)
{
...
...
@@ -1096,11 +1096,11 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
return
;
}
/* Output modifier, label_vopc and label_f2f
32
seem to be the only one worth keeping at the
/* Output modifier, label_vopc and label_f2f
_in
seem to be the only one worth keeping at the
* moment
*/
for
(
Definition
&
def
:
instr
->
definitions
)
ctx
.
info
[
def
.
tempId
()].
label
&=
(
label_vopc
|
label_f2f
32
|
instr_mod_labels
);
ctx
.
info
[
def
.
tempId
()].
label
&=
(
label_vopc
|
label_f2f
_in
|
instr_mod_labels
);
}
void
...
...
@@ -1899,13 +1899,18 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
break
;
}
case
aco_opcode
::
v_cvt_f16_f32
:
{
if
(
instr
->
operands
[
0
].
isTemp
())
ctx
.
info
[
instr
->
operands
[
0
].
tempId
()].
set_f2f16
(
instr
.
get
());
if
(
instr
->
operands
[
0
].
isTemp
())
{
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
set_f2f_in
(
instr
.
get
());
ctx
.
info
[
instr
->
operands
[
0
].
tempId
()].
set_f2f_out
(
instr
.
get
());
}
break
;
}
case
aco_opcode
::
v_cvt_f32_f16
:
{
if
(
instr
->
operands
[
0
].
isTemp
())
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
set_f2f32
(
instr
.
get
());
if
(
instr
->
operands
[
0
].
isTemp
())
{
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
set_f2f_in
(
instr
.
get
());
if
(
!
(
ctx
.
info
[
instr
->
operands
[
0
].
tempId
()].
label
&
label_extract
))
ctx
.
info
[
instr
->
operands
[
0
].
tempId
()].
set_f2f_out
(
instr
.
get
());
}
break
;
}
default:
break
;
...
...
@@ -3074,7 +3079,8 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
instr
->
definitions
[
0
].
swapTemp
(
def_info
.
instr
->
definitions
[
0
]);
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
label
&=
label_clamp
|
label_insert
|
label_f2f16
;
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
label
&=
label_clamp
|
label_insert
|
label_f2f_out
;
ctx
.
uses
[
def_info
.
instr
->
definitions
[
0
].
tempId
()]
--
;
return
true
;
...
...
@@ -3459,10 +3465,15 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
return
false
;
switch
(
instr
->
opcode
)
{
case
aco_opcode
::
v_add_f16
:
case
aco_opcode
::
v_add_f32
:
case
aco_opcode
::
v_sub_f16
:
case
aco_opcode
::
v_sub_f32
:
case
aco_opcode
::
v_subrev_f16
:
case
aco_opcode
::
v_subrev_f32
:
case
aco_opcode
::
v_mul_f16
:
case
aco_opcode
::
v_mul_f32
:
case
aco_opcode
::
v_fma_f16
:
case
aco_opcode
::
v_fma_f32
:
break
;
case
aco_opcode
::
v_fma_mix_f32
:
case
aco_opcode
::
v_fma_mixlo_f16
:
return
true
;
...
...
@@ -3476,19 +3487,34 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if
(
instr
->
isVOP3
())
return
!
instr
->
vop3
().
omod
&&
!
(
instr
->
vop3
().
opsel
&
0x8
);
if
(
instr
->
isSDWA
())
{
SDWA_instruction
*
sdwa
=
&
instr
->
sdwa
();
if
(
sdwa
->
dst_sel
.
size
()
!=
instr
->
definitions
[
0
].
bytes
()
||
sdwa
->
dst_sel
.
offset
()
||
sdwa
->
omod
)
return
false
;
for
(
unsigned
i
=
0
;
i
<
instr
->
operands
.
size
();
i
++
)
{
if
(
sdwa
->
sel
[
i
].
size
()
!=
instr
->
definitions
[
0
].
bytes
())
return
false
;
}
return
true
;
}
return
instr
->
format
==
Format
::
VOP2
;
}
void
to_mad_mix
(
opt_ctx
&
ctx
,
aco_ptr
<
Instruction
>&
instr
)
to_mad_mix
(
opt_ctx
&
ctx
,
aco_ptr
<
Instruction
>&
instr
,
bool
*
is_add_ptr
)
{
bool
is_add
=
instr
->
opcode
!=
aco_opcode
::
v_mul_f32
&&
instr
->
opcode
!=
aco_opcode
::
v_fma_f32
;
bool
is_32bit
=
instr
->
definitions
[
0
].
bytes
()
==
4
;
bool
is_mul
=
instr
->
opcode
==
aco_opcode
::
v_mul_f16
||
instr
->
opcode
==
aco_opcode
::
v_mul_f32
;
bool
is_add
=
!
is_mul
&&
instr
->
operands
.
size
()
==
2
;
aco_opcode
opcode
=
is_32bit
?
aco_opcode
::
v_fma_mix_f32
:
aco_opcode
::
v_fma_mixlo_f16
;
aco_ptr
<
VOP3P_instruction
>
vop3p
{
create_instruction
<
VOP3P_instruction
>
(
aco_
opcode
::
v_fma_mix_f32
,
Format
::
VOP3P
,
3
,
1
)};
create_instruction
<
VOP3P_instruction
>
(
opcode
,
Format
::
VOP3P
,
3
,
1
)};
vop3p
->
opsel_lo
=
instr
->
isVOP3
()
?
(
instr
->
vop3
().
opsel
&
0x7
)
<<
is_add
:
0x0
;
vop3p
->
opsel_hi
=
0x0
;
vop3p
->
opsel_hi
=
is_32bit
?
0x0
:
0x7
;
for
(
unsigned
i
=
0
;
i
<
instr
->
operands
.
size
();
i
++
)
{
vop3p
->
operands
[
is_add
+
i
]
=
instr
->
operands
[
i
];
vop3p
->
neg_lo
[
is_add
+
i
]
=
instr
->
isVOP3
()
&&
instr
->
vop3
().
neg
[
i
];
...
...
@@ -3497,32 +3523,36 @@ to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
vop3p
->
neg_hi
[
is_add
+
i
]
|=
instr
->
isSDWA
()
&&
instr
->
sdwa
().
abs
[
i
];
vop3p
->
opsel_lo
|=
(
instr
->
isSDWA
()
&&
instr
->
sdwa
().
sel
[
i
].
offset
())
<<
(
is_add
+
i
);
}
if
(
i
nstr
->
opcode
==
aco_opcode
::
v_mul_f32
)
{
if
(
i
s_mul
)
{
vop3p
->
opsel_hi
&=
0x3
;
vop3p
->
operands
[
2
]
=
Operand
::
zero
();
vop3p
->
neg_lo
[
2
]
=
true
;
}
else
if
(
is_add
)
{
vop3p
->
opsel_hi
&=
0x6
;
vop3p
->
operands
[
0
]
=
Operand
::
c32
(
0x3f800000
);
if
(
instr
->
opcode
==
aco_opcode
::
v_sub_f32
)
if
(
instr
->
opcode
==
aco_opcode
::
v_sub_f16
||
instr
->
opcode
==
aco_opcode
::
v_sub_f32
)
vop3p
->
neg_lo
[
2
]
^=
true
;
else
if
(
instr
->
opcode
==
aco_opcode
::
v_subrev_f32
)
else
if
(
instr
->
opcode
==
aco_opcode
::
v_subrev_f16
||
instr
->
opcode
==
aco_opcode
::
v_subrev_f32
)
vop3p
->
neg_lo
[
1
]
^=
true
;
}
vop3p
->
definitions
[
0
]
=
instr
->
definitions
[
0
];
vop3p
->
clamp
=
instr
->
isVOP3
()
&&
instr
->
vop3
().
clamp
;
instr
=
std
::
move
(
vop3p
);
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
label
&=
label_f2f
16
|
label_clamp
|
label_mul
;
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
label
&=
label_f2f
_out
|
label_clamp
|
label_mul
;
if
(
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
label
&
label_mul
)
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
instr
=
instr
.
get
();
if
(
is_add_ptr
)
*
is_add_ptr
=
is_add
;
}
bool
combine_output_conversion
(
opt_ctx
&
ctx
,
aco_ptr
<
Instruction
>&
instr
)
{
ssa_info
&
def_info
=
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()];
if
(
!
def_info
.
is_f2f
16
())
if
(
!
def_info
.
is_f2f
_out
())
return
false
;
Instruction
*
conv
=
def_info
.
instr
;
...
...
@@ -3532,19 +3562,34 @@ combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if
(
!
ctx
.
uses
[
conv
->
definitions
[
0
].
tempId
()])
return
false
;
if
(
conv
->
usesModifiers
())
if
(
conv
->
definitions
[
0
].
bytes
()
==
4
&&
(
instr
->
definitions
[
0
].
isPrecise
()
||
conv
->
definitions
[
0
].
isPrecise
()))
return
false
;
if
((
conv
->
opcode
==
aco_opcode
::
v_cvt_f32_f16
?
2
:
4
)
!=
instr
->
definitions
[
0
].
bytes
()
||
conv
->
usesModifiers
())
return
false
;
if
(
!
instr
->
isVOP3P
())
to_mad_mix
(
ctx
,
instr
);
to_mad_mix
(
ctx
,
instr
,
NULL
);
instr
->
opcode
=
aco_opcode
::
v_fma_mixlo_f16
;
if
(
conv
->
opcode
==
aco_opcode
::
v_cvt_f16_f32
)
instr
->
opcode
=
aco_opcode
::
v_fma_mixlo_f16
;
else
instr
->
opcode
=
aco_opcode
::
v_fma_mix_f32
;
instr
->
definitions
[
0
].
swapTemp
(
conv
->
definitions
[
0
]);
if
(
conv
->
definitions
[
0
].
isPrecise
())
instr
->
definitions
[
0
].
setPrecise
(
true
);
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
label
&=
label_clamp
;
ctx
.
uses
[
conv
->
definitions
[
0
].
tempId
()]
--
;
if
((
instr
->
opcode
==
aco_opcode
::
v_fma_mixlo_f16
||
instr
->
opcode
==
aco_opcode
::
v_fma_mix_f32
)
&&
instr
->
operands
[
2
].
constantEquals
(
0
)
&&
instr
->
vop3p
().
neg_lo
[
2
]
&&
!
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
is_clamp
())
{
ctx
.
info
[
instr
->
definitions
[
0
].
tempId
()].
set_mul
(
instr
.
get
());
}
return
true
;
}
...
...
@@ -3558,12 +3603,13 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if
(
!
instr
->
operands
[
i
].
isTemp
())
continue
;
Temp
tmp
=
instr
->
operands
[
i
].
getTemp
();
if
(
!
ctx
.
info
[
tmp
.
id
()].
is_f2f
32
())
if
(
!
ctx
.
info
[
tmp
.
id
()].
is_f2f
_in
())
continue
;
Instruction
*
conv
=
ctx
.
info
[
tmp
.
id
()].
instr
;
if
(
conv
->
isSDWA
()
&&
(
conv
->
sdwa
().
dst_sel
.
size
()
!=
4
||
conv
->
sdwa
().
sel
[
0
].
size
()
!=
2
||
conv
->
sdwa
().
clamp
||
conv
->
sdwa
().
omod
))
{
if
(
conv
->
isSDWA
()
&&
(
conv
->
opcode
!=
aco_opcode
::
v_cvt_f32_f16
||
conv
->
sdwa
().
dst_sel
.
size
()
!=
4
||
conv
->
sdwa
().
sel
[
0
].
size
()
!=
2
||
conv
->
sdwa
().
clamp
||
conv
->
sdwa
().
omod
))
{
continue
;
}
else
if
(
conv
->
isVOP3
()
&&
(
conv
->
vop3
().
clamp
||
conv
->
vop3
().
omod
))
{
continue
;
...
...
@@ -3571,7 +3617,10 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
continue
;
}
if
(
get_operand_size
(
instr
,
i
)
!=
32
)
if
(
conv
->
definitions
[
0
].
bytes
()
*
8
!=
get_operand_size
(
instr
,
i
))
continue
;
if
(
conv
->
definitions
[
0
].
bytes
()
==
2
&&
(
conv
->
definitions
[
0
].
isPrecise
()
||
instr
->
definitions
[
0
].
isPrecise
()))
continue
;
/* Conversion to VOP3P will add inline constant operands, but that shouldn't affect
...
...
@@ -3584,9 +3633,8 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
continue
;
if
(
!
instr
->
isVOP3P
())
{
bool
is_add
=
instr
->
opcode
!=
aco_opcode
::
v_mul_f32
&&
instr
->
opcode
!=
aco_opcode
::
v_fma_f32
;
to_mad_mix
(
ctx
,
instr
);
bool
is_add
;
to_mad_mix
(
ctx
,
instr
,
&
is_add
);
i
+=
is_add
;
}
...
...
@@ -3769,8 +3817,8 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* v_fma_mix_f32/etc can't do omod */
if
(
info
.
instr
->
isVOP3P
()
&&
instr
->
isVOP3
()
&&
instr
->
vop3
().
omod
)
continue
;
/* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */
if
(
is_add_mix
&&
info
.
instr
->
definitions
[
0
].
bytes
()
==
2
)
if
(
get_operand_size
(
instr
,
i
)
!=
info
.
instr
->
definitions
[
0
].
bytes
()
*
8
)
continue
;
if
(
get_operand_size
(
instr
,
i
)
!=
info
.
instr
->
definitions
[
0
].
bytes
()
*
8
)
...
...
@@ -3779,12 +3827,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
bool
legacy
=
info
.
instr
->
opcode
==
aco_opcode
::
v_mul_legacy_f32
;
bool
mad_mix
=
is_add_mix
||
info
.
instr
->
isVOP3P
();
bool
is_mad_mix_fused
=
ctx
.
program
->
dev
.
fused_mad_mix
||
info
.
instr
->
definitions
[
0
].
bytes
()
==
2
;
bool
has_fma
=
mad16
||
mad64
||
(
legacy
&&
ctx
.
program
->
chip_class
>=
GFX10_3
)
||
(
mad32
&&
!
legacy
&&
!
mad_mix
&&
ctx
.
program
->
dev
.
has_fast_fma32
)
||
(
mad_mix
&&
ctx
.
program
->
dev
.
fused
_mad_mix
);
bool
has_mad
=
mad_mix
?
!
ctx
.
program
->
dev
.
fused
_mad_mix
(
mad32
&&
!
legacy
&&
ctx
.
program
->
dev
.
has_fast_fma32
)
||
(
mad_mix
&&
is
_mad_mix
_fused
);
bool
has_mad
=
mad_mix
?
!
is
_mad_mix
_fused
:
((
mad32
&&
ctx
.
program
->
chip_class
<
GFX10_3
)
||
(
mad16
&&
ctx
.
program
->
chip_class
<=
GFX9
));
(
mad16
&&
ctx
.
program
->
chip_class
<=
GFX9
)
||
mad64
);
bool
can_use_fma
=
has_fma
&&
!
info
.
instr
->
definitions
[
0
].
isPrecise
()
&&
!
instr
->
definitions
[
0
].
isPrecise
();
bool
can_use_mad
=
...
...
@@ -3892,6 +3942,11 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if
(
add_instr
->
isVOP3P
()
||
mul_instr
->
isVOP3P
())
{
assert
(
!
omod
);
if
(
mul_instr
->
definitions
[
0
].
bytes
()
==
2
&&
!
mul_instr
->
isVOP3P
())
opsel_hi
|=
0x3
;
if
(
add_instr
->
definitions
[
0
].
bytes
()
==
2
&&
!
add_instr
->
isVOP3P
())
opsel_hi
|=
0x4
;
aco_opcode
mad_op
=
add_instr
->
definitions
[
0
].
bytes
()
==
2
?
aco_opcode
::
v_fma_mixlo_f16
:
aco_opcode
::
v_fma_mix_f32
;
aco_ptr
<
VOP3P_instruction
>
mad
{
...
...
src/amd/compiler/tests/helpers.cpp
View file @
26eb42e5
...
...
@@ -344,14 +344,14 @@ Temp fsat(Temp src, Builder b)
Temp
ext_ushort
(
Temp
src
,
unsigned
idx
,
Builder
b
)
{
return
b
.
pseudo
(
aco_opcode
::
p_extract
,
b
.
def
(
src
.
regClass
()
),
src
,
Operand
::
c32
(
idx
),
Operand
::
c32
(
16u
),
Operand
::
c32
(
false
));
return
b
.
pseudo
(
aco_opcode
::
p_extract
,
b
.
def
(
v1
),
src
,
Operand
::
c32
(
idx
),
Operand
::
c32
(
16u
),
Operand
::
c32
(
false
));
}
Temp
ext_ubyte
(
Temp
src
,
unsigned
idx
,
Builder
b
)
{
return
b
.
pseudo
(
aco_opcode
::
p_extract
,
b
.
def
(
src
.
regClass
()
),
src
,
Operand
::
c32
(
idx
),
Operand
::
c32
(
8u
),
Operand
::
c32
(
false
));
return
b
.
pseudo
(
aco_opcode
::
p_extract
,
b
.
def
(
v1
),
src
,
Operand
::
c32
(
idx
),
Operand
::
c32
(
8u
),
Operand
::
c32
(
false
));
}
VkDevice
get_vk_device
(
enum
chip_class
chip_class
)
...
...
src/amd/compiler/tests/test_optimizer.cpp
View file @
26eb42e5
...
...
@@ -1187,6 +1187,22 @@ BEGIN_TEST(optimize.mad_mix.input_conv.basic)
//! p_unit_test 4, %res4
writeout
(
4
,
fma
(
a
,
a
,
f2f32
(
a16
)));
//! v2b: %res5 = v_fma_mixlo_f16 lo(%a16), %a, -0
//! p_unit_test 5, %res5
writeout
(
5
,
fmul
(
a16
,
f2f16
(
a
)));
//! v2b: %res6 = v_fma_mixlo_f16 1.0, lo(%a16), %a
//! p_unit_test 6, %res6
writeout
(
6
,
fadd
(
a16
,
f2f16
(
a
)));
//! v2b: %res7 = v_fma_mixlo_f16 1.0, %a, lo(%a16)
//! p_unit_test 7, %res7
writeout
(
7
,
fadd
(
f2f16
(
a
),
a16
));
//! v2b: %res8 = v_fma_mixlo_f16 lo(%a16), lo(%a16), %a
//! p_unit_test 8, %res8
writeout
(
8
,
fma
(
a16
,
a16
,
f2f16
(
a
)));
finish_opt_test
();
}
END_TEST
...
...
@@ -1337,8 +1353,8 @@ END_TEST
BEGIN_TEST
(
optimize
.
mad_mix
.
output_conv
.
basic
)
for
(
unsigned
i
=
GFX9
;
i
<=
GFX10
;
i
++
)
{
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
if
(
!
setup_cs
(
"v1 v1 v1 v2b v2b"
,
(
chip_class
)
i
))
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16
, v2b: %c16
= p_startpgm
if
(
!
setup_cs
(
"v1 v1 v1 v2b
v2b
v2b"
,
(
chip_class
)
i
))
continue
;
Temp
a
=
inputs
[
0
];
...
...
@@ -1346,6 +1362,7 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic)
Temp
c
=
inputs
[
2
];
Temp
a16
=
inputs
[
3
];
Temp
b16
=
inputs
[
4
];
Temp
c16
=
inputs
[
5
];
//! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0
//! p_unit_test 0, %res0
...
...
@@ -1371,6 +1388,30 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic)
//! p_unit_test 5, %res5
writeout
(
5
,
f2f16
(
fma
(
a
,
f2f32
(
b16
),
c
)));
//! v1: %res6 = v_fma_mix_f32 lo(%a16), lo(%b16), -0
//! p_unit_test 6, %res6
writeout
(
6
,
f2f32
(
fmul
(
a16
,
b16
)));
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%a16), lo(%b16)
//! p_unit_test 7, %res7
writeout
(
7
,
f2f32
(
fadd
(
a16
,
b16
)));
//! v1: %res8 = v_fma_mix_f32 lo(%a16), lo(%b16), lo(%c16)
//! p_unit_test 8, %res8
writeout
(
8
,
f2f32
(
fma
(
a16
,
b16
,
c16
)));
//! v1: %res9 = v_fma_mix_f32 %a, lo(%b16), -0
//! p_unit_test 9, %res9
writeout
(
9
,
f2f32
(
fmul
(
f2f16
(
a
),
b16
)));
//! v1: %res10 = v_fma_mix_f32 1.0, lo(%a16), %b
//! p_unit_test 10, %res10
writeout
(
10
,
f2f32
(
fadd
(
a16
,
f2f16
(
b
))));
//! v1: %res11 = v_fma_mix_f32 lo(%a16), %b, lo(%c16)
//! p_unit_test 11, %res11
writeout
(
11
,
f2f32
(
fma
(
a16
,
f2f16
(
b
),
c16
)));
finish_opt_test
();
}
END_TEST
...
...
@@ -1430,10 +1471,10 @@ BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
writeout
(
3
,
f2f32
(
fneg
(
fadd
(
a16
,
b16
))));
/* sdwa */
//! v
2b
: %res4_add = v_fma_mix
lo_f16 1.0, %a, %b
//! v
2b
: %res4 = p_extract %res4_add, 0,
8
, 0
//! v
1
: %res4_add = v_fma_mix
_f32 1.0, lo(%a16), lo(%b16)
//! v
1
: %res4 = p_extract %res4_add, 0,
16
, 0
//! p_unit_test 4, %res4
writeout
(
4
,
ext_u
byte
(
f2f
16
(
fadd
(
a
,
b
)),
0
));
writeout
(
4
,
ext_u
short
(
f2f
32
(
fadd
(
a
16
,
b
16
)),
0
));
//! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
//! v2b: %res5 = v_cvt_f16_f32 %res5_mul
...
...
@@ -1446,15 +1487,16 @@ END_TEST
BEGIN_TEST
(
optimize
.
mad_mix
.
fma
.
basic
)
for
(
unsigned
i
=
GFX9
;
i
<=
GFX10
;
i
++
)
{
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
if
(
!
setup_cs
(
"v1 v1 v1 v2b v2b"
,
(
chip_class
)
i
))
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b:
%b16, v2b:
%c16 = p_startpgm
if
(
!
setup_cs
(
"v1 v1 v1 v2b
v2b
v2b"
,
(
chip_class
)
i
))
continue
;
Temp
a
=
inputs
[
0
];
Temp
b
=
inputs
[
1
];
Temp
c
=
inputs
[
2
];
Temp
a16
=
inputs
[
3
];
Temp
c16
=
inputs
[
4
];
Temp
b16
=
inputs
[
4
];
Temp
c16
=
inputs
[
5
];
//! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
//! p_unit_test 0, %res0
...
...
@@ -1487,10 +1529,32 @@ BEGIN_TEST(optimize.mad_mix.fma.basic)
//! p_unit_test 6, %res6
writeout
(
6
,
fadd
(
fneg
(
fabs
(
fmul
(
fneg
(
a
),
fneg
(
b
)))),
f2f32
(
c16
)));
/*
output conversions
*/
//! v2b: %res7 = v_fma_mixlo_f16 %a,
%b, %c
/*
fp16
*/
//! v2b: %res7 = v_fma_mixlo_f16 %a,
lo(%b16), lo(%c16)
//! p_unit_test 7, %res7
writeout
(
7
,
f2f16
(
fadd
(
fmul
(
a
,
b
),
c
)));
writeout
(
7
,
fadd
(
fmul
(
f2f16
(
a
),
b16
),
c16
));
//! v2b: %res8 = v_fma_mixlo_f16 lo(%a16), lo(%b16), %c
//! p_unit_test 8, %res8
writeout
(
8
,
fadd
(
fmul
(
a16
,
b16
),
f2f16
(
c
)));
/* conversions in the middle */
//! v2b: %res9 = v_fma_mixlo_f16 %a, %b, lo(%c16)
//! p_unit_test 9, %res9
writeout
(
9
,
fadd
(
f2f16
(
fmul
(
a
,
b
)),
c16
));
//! v1: %res10 = v_fma_mix_f32 lo(%a16), lo(%b16), %c
//! p_unit_test 10, %res10
writeout
(
10
,
fadd
(
f2f32
(
fmul
(
a16
,
b16
)),
c
));
/* output conversions */
//! v2b: %res11 = v_fma_mixlo_f16 %a, %b, %c
//! p_unit_test 11, %res11
writeout
(
11
,
f2f16
(
fadd
(
fmul
(
a
,
b
),
c
)));
//! v1: %res12 = v_fma_mix_f32 lo(%a16), lo(%b16), lo(%c16)
//! p_unit_test 12, %res12
writeout
(
12
,
f2f32
(
fadd
(
fmul
(
a16
,
b16
),
c16
)));
finish_opt_test
();
}
...
...
@@ -1498,8 +1562,8 @@ END_TEST
BEGIN_TEST
(
optimize
.
mad_mix
.
fma
.
precision
)
for
(
unsigned
i
=
GFX9
;
i
<=
GFX10
;
i
++
)
{
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
if
(
!
setup_cs
(
"v1 v1 v1 v2b v2b"
,
(
chip_class
)
i
))
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16
, v2b: %c16
= p_startpgm
if
(
!
setup_cs
(
"v1 v1 v1 v2b
v2b
v2b"
,
(
chip_class
)
i
))
continue
;
Temp
a
=
inputs
[
0
];
...
...
@@ -1507,6 +1571,7 @@ BEGIN_TEST(optimize.mad_mix.fma.precision)
Temp
c
=
inputs
[
2
];
Temp
a16
=
inputs
[
3
];
Temp
b16
=
inputs
[
4
];
Temp
c16
=
inputs
[
5
];
/* the optimization is precise for 32-bit on GFX9 */
//~gfx9! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
...
...
@@ -1521,36 +1586,52 @@ BEGIN_TEST(optimize.mad_mix.fma.precision)
//! p_unit_test 1, %res1
writeout
(
1
,
fadd
(
fmul
(
f2f32
(
a16
),
b
),
c
,
bld
.
precise
()));
/* never
promot
e 16-bit arithmetic
to 32-bit
*/
//! v2b: %res2_tmp = v_
cvt
_f16
_f32 %a
//! v2b: %res2 = v_
add_f16
%res2_tmp, %
b16
/* never
combine precis
e 16-bit arithmetic */
//! v2b:
(precise)
%res2_tmp = v_
mul
_f16
%a16, %b16
//! v2b: %res2 = v_
fma_mixlo_f16 1.0, lo(
%res2_tmp
)
, %
c
//! p_unit_test 2, %res2
writeout
(
2
,
fadd
(
f
2f16
(
a
),
b16
));
writeout
(
2
,
fadd
(
f
mul
(
a16
,
b16
,
bld
.
precise
()),
f2f16
(
c
)
));
//! v2b: %res3_tmp = v_
cvt_f16_f32 %a
//! v2b: %res3 = v_
mul
_f16 %res3_tmp, %
b
16
//! v2b: %res3_tmp = v_
fma_mixlo_f16 %a, lo(%b16), -0
//! v2b:
(precise)
%res3 = v_
add
_f16 %res3_tmp, %
c
16
//! p_unit_test 3, %res3
writeout
(
3
,
fmul
(
f2f16
(
a
),
b16
));
writeout
(
3
,
fadd
(
fmul
(
f2f16
(
a
),
b16
)
,
c16
,
bld
.
precise
())
);
//! v2b: %res4_tmp = v_mul_f16 %a16, %b16
//! v1: %res4 = v_cvt_f32_f16 %res4_tmp
/* conversions in the middle: combining skips them, making it always unsafe */
//! v2b: (precise)%res4_tmp = v_fma_mixlo_f16 %1, %2, -0
//! v2b: %res4 = v_add_f16 %res4_tmp, %c16
//! p_unit_test 4, %res4
writeout
(
4
,
f
2f32
(
fmul
(
a
16
,
b16
))
)
;
writeout
(
4
,
f
add
(
f2f16
(
fmul
(
a
,
b
,
bld
.
precise
())),
c
16
));
//! v2b: %res5_tmp = v_
add
_f16 %
a16, %b16
//! v1: %res5 = v_
cvt_f32_f16
%res5_tmp
//! v2b:
(precise)
%res5_tmp = v_
mul
_f16 %
4, %5
//! v1: %res5 = v_
fma_mix_f32 1.0, lo(
%res5_tmp
), %c
//! p_unit_test 5, %res5
writeout
(
5
,
f2f32
(
f
add
(
a16
,
b16
)
));
writeout
(
5
,
fadd
(
f2f32
(
f
mul
(
a16
,
b16
,
bld
.
precise
())),
c
));
//! v2b: %res6_tmp = v_fma_mixlo_f16 %
a
, %
b
, -0
//! v2b: %res6 = v_add_f16 %res6_tmp, %
a
16
//! v2b:
(precise)
%res6_tmp = v_fma_mixlo_f16 %
1
, %
2
, -0
//! v2b: %res6 = v_add_f16 %res6_tmp, %
c
16
//! p_unit_test 6, %res6
writeout
(
6
,
fadd
(
f2f16
(
fmul
(
a
,
b
)),
a
16
));
writeout
(
6
,
fadd
(
f2f16
(
fmul
(
a
,
b
)
,
bld
.
precise
()
),
c
16
));
//! v2b: %res7_tmp = v_mul_f16 %
a16, %b16
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
//! v2b: %res7_tmp = v_mul_f16 %
4, %5
//! v1:
(precise)
%res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
//! p_unit_test 7, %res7
writeout
(
7
,
fadd
(
f2f32
(
fmul
(
a16
,
b16
)),
c
));
writeout
(
7
,
fadd
(
f2f32
(
fmul
(
a16
,
b16
),
bld
.
precise
()),
c
));
//! v2b: (precise)%res8_tmp = v_fma_mixlo_f16 %1, %2, -0
//! v2b: (precise)%res8 = v_add_f16 %res8_tmp, %c16
//! p_unit_test 8, %res8
writeout
(
8
,
fadd
(
f2f16
(
fmul
(
a
,
b
),
bld
.
precise
()),
c16
,
bld
.
precise
()));
//! v2b: %res9_tmp = v_mul_f16 %4, %5
//! v1: (precise)%res9 = v_fma_mix_f32 1.0, lo(%res9_tmp), %c
//! p_unit_test 9, %res9
writeout
(
9
,
fadd
(
f2f32
(
fmul
(
a16
,
b16
),
bld
.
precise
()),
c
,
bld
.
precise
()));
//! v2b: %res10_tmp = v_fma_f16 %a16, %b16, %c16
//! v1: (precise)%res11 = v_cvt_f32_f16 %res10_tmp
//! p_unit_test 10, %res11
writeout
(
10
,
f2f32
(
fadd
(
fmul
(
a16
,
b16
),
c16
),
bld
.
precise
()));
finish_opt_test
();
}
...
...
@@ -1569,13 +1650,25 @@ BEGIN_TEST(optimize.mad_mix.clamp)
//! p_unit_test 0, %res0
writeout
(
0
,
fsat
(
fmul
(
f2f32
(
a16
),
a
)));
//! v2b: %res1 = v_fma_mixlo_f16 %a,
%a
, -0 clamp
//! v2b: %res1 = v_fma_mixlo_f16 %a,
lo(%a16)
, -0 clamp
//! p_unit_test 1, %res1
writeout
(
1
,
f2f16
(
fsat
(
fmul
(
a
,
a
)
)));
writeout
(
1
,
fsat
(
fmul
(
f2f16
(
a
)
,
a
16
)));
//! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! p_unit_test 2, %res2
writeout
(
2
,
fsat
(
f2f16
(
fmul
(
a
,
a
))));
writeout
(
2
,
f2f16
(
fsat
(
fmul
(
a
,
a
))));
//! v2b: %res3 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! p_unit_test 3, %res3
writeout
(
3
,
fsat
(
f2f16
(
fmul
(
a
,
a
))));
//! v1: %res4 = v_fma_mix_f32 lo(%a16), lo(%a16), -0 clamp
//! p_unit_test 4, %res4