RADV: Missing optimizations for typical uaddCarry patterns
In vkd3d-proton, when computing BDA addresses, we use 64-bit + 32-bit adds. To avoid requiring the Int64 capability just for this, we rely on uaddCarry patterns instead, but it seems like codegen is doing some inefficient things.
E.g. given this shader (carry.foz):
#version 450
#extension GL_ARB_gpu_shader_int64 : require
layout(local_size_x = 4) in;

layout(binding = 0) buffer SSBO
{
    uvec2 a[4];
    uvec2 b[4];
    uint c[4];
};

uvec2 add_64_32(uvec2 b, uint c)
{
    uint carry;
    // Add into the low word with carry-out, then fold the carry into the high word.
    uint lo = uaddCarry(b.x, c, carry);
    uint hi = b.y + carry;
    return uvec2(lo, hi);
}

void main()
{
    a[gl_LocalInvocationIndex] = add_64_32(b[gl_LocalInvocationIndex], c[gl_LocalInvocationIndex]);
}
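For context, here is a rough nir_builder sketch of what this shader's add boils down to in NIR (the function name and bx/by/c are placeholders, not actual Mesa code). NIR has no opcode that yields both the sum and the carry of an add, so they end up as two independent ALU ops, and the backend emits an add for each:

#include "nir_builder.h"

/* Sketch only: bx/by hold the two halves of b, c is the 32-bit addend. */
static nir_ssa_def *
emit_add_64_32(nir_builder *b, nir_ssa_def *bx, nir_ssa_def *by,
               nir_ssa_def *c)
{
    nir_ssa_def *lo    = nir_iadd(b, bx, c);       /* -> v_add_nc_u32 (the redundant add) */
    nir_ssa_def *carry = nir_uadd_carry(b, bx, c); /* -> v_add_co_u32 (its sum is unused) */
    nir_ssa_def *hi    = nir_iadd(b, by, carry);   /* -> v_add_co_ci_u32 */
    return nir_vec2(b, lo, hi);
}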
ACO:
BB0:
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; f4080101 fa000000
v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; d7650000 000100c1
v_mbcnt_hi_u32_b32_e64 v0, -1, v0 ; d7660000 000200c1
v_and_or_b32 v0, 0xfc0, s0, v0 ; d7710000 040000ff 00000fc0
v_lshlrev_b32_e32 v1, 3, v0 ; 34020083
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
s_clause 0x1 ; bfa10001
buffer_load_dwordx2 v[2:3], v1, s[4:7], 0 offen offset:32 ; e0341020 80010201
buffer_load_dword v0, v0, s[4:7], 0 offen offset:64 ; e0301040 80010000
s_waitcnt vmcnt(0) ; bf8c3f70
v_add_nc_u32_e32 v4, v2, v0 ; 4a080102 <-- Redundant; same sum as the v_add_co_u32 below
v_add_co_u32_e64 v0, s[0:1], v2, v0 ; d70f0000 00020102
v_add_co_ci_u32_e64 v5, vcc, 0, v3, s[0:1] ; d5286a05 00020680
buffer_store_dwordx2 v[4:5], v1, s[4:7], 0 offen ; e0741000 80010401
s_endpgm ; bf810000
LLVM:
main:
BB699_0:
s_mov_b32 s0, s3 ; BE800303
s_movk_i32 s3, 0x8000 ; B0038000
v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; D7650000 000100C1
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; F4080101 FA000000
v_mbcnt_hi_u32_b32_e64 v0, -1, v0 ; D7660000 000200C1
v_and_or_b32 v0, s0, 0xfc0, v0 ; D7710000 0401FE00 00000FC0
v_lshlrev_b32_e32 v4, 3, v0 ; 34080083
v_lshlrev_b32_e32 v2, 2, v0 ; 34040082
s_waitcnt lgkmcnt(0) ; BF8CC07F
s_clause 0x1 ; BFA10001
buffer_load_dwordx2 v[0:1], v4, s[4:7], 0 offen offset:32 ; E0341020 80010004
buffer_load_dword v3, v2, s[4:7], 0 offen offset:64 ; E0301040 80010302
s_waitcnt vmcnt(0) ; BF8C3F70
v_add_nc_u32_e32 v2, v0, v3 ; 4A040700 <-- Redundant; same sum as the v_add_co_u32 below
v_add_co_u32_e64 v0, vcc, v0, v3 ; D70F6A00 00020700
v_add_co_ci_u32_e32 v3, vcc, 0, v1, vcc ; 50060280
buffer_store_dwordx2 v[2:3], v4, s[4:7], 0 offen ; E0741000 80010204
s_endpgm ; BF810000
In both cases the plain add recomputes the sum that the carry-producing add already yields, so one VALU instruction could be saved. Neither LLVM nor amdgpu-pro actually seems to pick up on this optimization, so this might be something to solve at the NIR level.
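One possible shape for such a fix, sketched below: when a pass spots the iadd/uadd_carry pair over the same operands feeding a high-word add, it could rebuild the whole thing as a 64-bit add, since the backend's int64 lowering already produces the optimal two-instruction sequence (see the int64.foz dump below). The pass and helper name are hypothetical, not existing Mesa code; this also assumes the compiler is free to use 64-bit NIR ops internally regardless of the app-facing Int64 capability.

#include "nir_builder.h"

/* Hypothetical replacement builder: a matching pass would have found
 * lo = iadd(bx, c), carry = uadd_carry(bx, c), hi = iadd(by, carry),
 * and would rewrite lo/hi from the unpacked halves of the result. */
static nir_ssa_def *
rebuild_as_u64_add(nir_builder *b, nir_ssa_def *bx, nir_ssa_def *by,
                   nir_ssa_def *c)
{
    nir_ssa_def *wide = nir_pack_64_2x32_split(b, bx, by); /* bx | (by << 32) */
    nir_ssa_def *sum  = nir_iadd(b, wide, nir_u2u64(b, c));
    /* lo = nir_unpack_64_2x32_split_x(b, sum);
       hi = nir_unpack_64_2x32_split_y(b, sum); */
    return sum;
}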
For reference, an int64 path (int64.foz) looks like:
#version 450
#extension GL_ARB_gpu_shader_int64 : require
layout(local_size_x = 4) in;

layout(binding = 0) buffer SSBO
{
    uint64_t a[4];
    uint64_t b[4];
    uint c[4];
};

uint64_t add_64_32(uint64_t b, uint c)
{
    return b + uint64_t(c);
}

void main()
{
    a[gl_LocalInvocationIndex] = add_64_32(b[gl_LocalInvocationIndex], c[gl_LocalInvocationIndex]);
}
ACO:
BB0:
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; f4080101 fa000000
v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; d7650000 000100c1
v_mbcnt_hi_u32_b32_e64 v0, -1, v0 ; d7660000 000200c1
v_and_or_b32 v0, 0xfc0, s0, v0 ; d7710000 040000ff 00000fc0
v_lshlrev_b32_e32 v1, 3, v0 ; 34020083
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
s_clause 0x1 ; bfa10001
buffer_load_dwordx2 v[2:3], v1, s[4:7], 0 offen offset:32 ; e0341020 80010201
buffer_load_dword v0, v0, s[4:7], 0 offen offset:64 ; e0301040 80010000
s_waitcnt vmcnt(0) ; bf8c3f70
v_add_co_u32_e64 v4, s[0:1], v2, v0 ; d70f0004 00020102
v_add_co_ci_u32_e64 v5, vcc, v3, 0, s[0:1] ; d5286a05 00010103
buffer_store_dwordx2 v[4:5], v1, s[4:7], 0 offen ; e0741000 80010401
s_endpgm ; bf810000