Commit af474e26 authored by Ian Romanick
Browse files

nir/algebraic: Add lowering for idp4a and udp4a

parent 7d2a3e6b
Pipeline #338857 waiting for manual action with stages
......@@ -191,9 +191,37 @@ optimizations = [
(('iadd', ('udp4a', a, b, 0), c), ('udp4a', a, b, c)),
(('iadd', ('idp4a(is_used_once)', a, b, '#c'), '#d'), ('idp4a', a, b, ('iadd', c, d))),
(('iadd', ('udp4a(is_used_once)', a, b, '#c'), '#d'), ('udp4a', a, b, ('iadd', c, d))),
]
# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions: the sum, over the four byte lanes, of a[i] * b[i].
#
# Byte lane i of a must be paired with byte lane i of b, and both operands
# must be extracted with the same signedness: extract_i8 (sign-extending) for
# the signed form and extract_u8 (zero-extending) for the unsigned form.  The
# unsigned bound of 4*(255*255) used by the udp4a_sat rules only holds when
# both operands are zero-extended.
idp4_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                             ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                    ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                             ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udp4_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                             ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                    ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                             ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
# Rules for the [iu]dp4a instruction family.  Every rule that expands an
# instruction into its explicit dot product carries the
# '!options->has_dp4a' condition string.
optimizations += [
   # Non-saturating forms: explicit dot product followed by a plain add of
   # the accumulator.
   (('idp4a', a, b, c),
    ('iadd', idp4_a_b, c),
    '!options->has_dp4a'),
   (('udp4a', a, b, c),
    ('iadd', udp4_a_b, c),
    '!options->has_dp4a'),

   # Unsigned saturating form.  The dot product of four unsigned byte pairs
   # is at most 4*(255*255) = 0x3f804, so the intermediate sum itself never
   # overflows.  Since 0x100000000 - 0x3f804 = 0xfffc07fc, a constant
   # accumulator c that is less than 0xfffc07fc can never push the final
   # result past 32 bits, and the saturation can be dropped.
   (('udp4a_sat', a, b, '#c(is_ult_0xfffc07fc)'),
    ('udp4a', a, b, c)),
   (('udp4a_sat', a, b, c),
    ('uadd_sat', udp4_a_b, c),
    '!options->has_dp4a'),

   # Signed saturating form.  The dot product of four signed byte pairs lies
   # between 4*(-128*127) = -0xfe00 and 4*(-128*-128) = 0x10000, so the
   # intermediate sum can neither overflow nor underflow; only the final
   # add of the accumulator needs to saturate.
   (('idp4a_sat', a, b, c),
    ('iadd_sat', idp4_a_b, c),
    '!options->has_dp4a'),
]
# Float sizes
for s in [16, 32, 64]:
optimizations.extend([
......
......@@ -202,6 +202,27 @@ is_not_const_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
return true;
}
/**
 * Is every selected component of a constant source unsigned-less-than
 * 0xfffc07fc?
 *
 * Returns false when source \p src of \p instr is not a constant, or when any
 * of the first \p num_components components picked through \p swizzle is
 * greater than or equal to 0xfffc07fc.  Returns true otherwise.
 */
static inline bool
is_ult_0xfffc07fc(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
                  unsigned src, unsigned num_components,
                  const uint8_t *swizzle)
{
   const unsigned limit = 0xfffc07fcU;

   /* A non-constant source can never be proven to be below the limit. */
   if (!nir_src_is_const(instr->src[src].src))
      return false;

   /* Reject as soon as one component reaches the limit. */
   unsigned comp = 0;
   while (comp < num_components) {
      const unsigned value =
         nir_src_comp_as_uint(instr->src[src].src, swizzle[comp]);

      if (!(value < limit))
         return false;

      comp++;
   }

   return true;
}
static inline bool
is_not_const(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
unsigned src, UNUSED unsigned num_components,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment