RADV: Missing optimizations for typical uaddCarry patterns
In vkd3d-proton, when computing BDA addresses, we use 64-bit + 32-bit adds. To avoid requiring the Int64 capability just for this, we rely on uaddCarry patterns instead, but it seems like codegen is doing some inefficient things.
E.g. given this shader (carry.foz):
#version 450
#extension GL_ARB_gpu_shader_int64 : require
layout(local_size_x = 4) in;

layout(binding = 0) buffer SSBO
{
    uvec2 a[4];
    uvec2 b[4];
    uint c[4];
};

uvec2 add_64_32(uvec2 b, uint c)
{
    uint carry;
    // Add into the low word with carry-out, then fold the carry into the high word.
    uint lo = uaddCarry(b.x, c, carry);
    uint hi = b.y + carry;
    return uvec2(lo, hi);
}

void main()
{
    a[gl_LocalInvocationIndex] = add_64_32(b[gl_LocalInvocationIndex], c[gl_LocalInvocationIndex]);
}
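For context, here is a rough nir_builder sketch of what this shader's add boils down to in NIR (the function name and bx/by/c are placeholders, not actual Mesa code). NIR has no opcode that yields both the sum and the carry of an add, so they end up as two independent ALU ops, and the backend emits an add for each:

#include "nir_builder.h"

/* Sketch only: bx/by hold the two halves of b, c is the 32-bit addend. */
static nir_ssa_def *
emit_add_64_32(nir_builder *b, nir_ssa_def *bx, nir_ssa_def *by,
               nir_ssa_def *c)
{
    nir_ssa_def *lo    = nir_iadd(b, bx, c);       /* -> v_add_nc_u32 (the redundant add) */
    nir_ssa_def *carry = nir_uadd_carry(b, bx, c); /* -> v_add_co_u32 (its sum is unused) */
    nir_ssa_def *hi    = nir_iadd(b, by, carry);   /* -> v_add_co_ci_u32 */
    return nir_vec2(b, lo, hi);
}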
ACO:
BB0:
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; f4080101 fa000000
v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; d7650000 000100c1
v_mbcnt_hi_u32_b32_e64 v0, -1, v0 ; d7660000 000200c1
v_and_or_b32 v0, 0xfc0, s0, v0 ; d7710000 040000ff 00000fc0
v_lshlrev_b32_e32 v1, 3, v0 ; 34020083
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
s_clause 0x1 ; bfa10001
buffer_load_dwordx2 v[2:3], v1, s[4:7], 0 offen offset:32 ; e0341020 80010201
buffer_load_dword v0, v0, s[4:7], 0 offen offset:64 ; e0301040 80010000
s_waitcnt vmcnt(0) ; bf8c3f70
v_add_nc_u32_e32 v4, v2, v0 ; 4a080102 <-- Redundant; same sum as the v_add_co_u32 below
v_add_co_u32_e64 v0, s[0:1], v2, v0 ; d70f0000 00020102
v_add_co_ci_u32_e64 v5, vcc, 0, v3, s[0:1] ; d5286a05 00020680
buffer_store_dwordx2 v[4:5], v1, s[4:7], 0 offen ; e0741000 80010401
s_endpgm ; bf810000
LLVM:
main:
BB699_0:
s_mov_b32 s0, s3 ; BE800303
s_movk_i32 s3, 0x8000 ; B0038000
v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; D7650000 000100C1
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; F4080101 FA000000
v_mbcnt_hi_u32_b32_e64 v0, -1, v0 ; D7660000 000200C1
v_and_or_b32 v0, s0, 0xfc0, v0 ; D7710000 0401FE00 00000FC0
v_lshlrev_b32_e32 v4, 3, v0 ; 34080083
v_lshlrev_b32_e32 v2, 2, v0 ; 34040082
s_waitcnt lgkmcnt(0) ; BF8CC07F
s_clause 0x1 ; BFA10001
buffer_load_dwordx2 v[0:1], v4, s[4:7], 0 offen offset:32 ; E0341020 80010004
buffer_load_dword v3, v2, s[4:7], 0 offen offset:64 ; E0301040 80010302
s_waitcnt vmcnt(0) ; BF8C3F70
v_add_nc_u32_e32 v2, v0, v3 ; 4A040700 <-- Redundant; same sum as the v_add_co_u32 below
v_add_co_u32_e64 v0, vcc, v0, v3 ; D70F6A00 00020700
v_add_co_ci_u32_e32 v3, vcc, 0, v1, vcc ; 50060280
buffer_store_dwordx2 v[2:3], v4, s[4:7], 0 offen ; E0741000 80010204
s_endpgm ; BF810000
In both cases the plain add recomputes the sum that the carry-producing add already yields, so one VALU instruction could be saved. Neither LLVM nor amdgpu-pro actually seems to pick up on this optimization, so this might be something to solve at the NIR level.
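One possible shape for such a fix, sketched below: when a pass spots the iadd/uadd_carry pair over the same operands feeding a high-word add, it could rebuild the whole thing as a 64-bit add, since the backend's int64 lowering already produces the optimal two-instruction sequence (see the int64.foz dump below). The pass and helper name are hypothetical, not existing Mesa code; this also assumes the compiler is free to use 64-bit NIR ops internally regardless of the app-facing Int64 capability.

#include "nir_builder.h"

/* Hypothetical replacement builder: a matching pass would have found
 * lo = iadd(bx, c), carry = uadd_carry(bx, c), hi = iadd(by, carry),
 * and would rewrite lo/hi from the unpacked halves of the result. */
static nir_ssa_def *
rebuild_as_u64_add(nir_builder *b, nir_ssa_def *bx, nir_ssa_def *by,
                   nir_ssa_def *c)
{
    nir_ssa_def *wide = nir_pack_64_2x32_split(b, bx, by); /* bx | (by << 32) */
    nir_ssa_def *sum  = nir_iadd(b, wide, nir_u2u64(b, c));
    /* lo = nir_unpack_64_2x32_split_x(b, sum);
       hi = nir_unpack_64_2x32_split_y(b, sum); */
    return sum;
}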
For reference, an int64 path (int64.foz) looks like:
#version 450
#extension GL_ARB_gpu_shader_int64 : require
layout(local_size_x = 4) in;

layout(binding = 0) buffer SSBO
{
    uint64_t a[4];
    uint64_t b[4];
    uint c[4];
};

uint64_t add_64_32(uint64_t b, uint c)
{
    return b + uint64_t(c);
}

void main()
{
    a[gl_LocalInvocationIndex] = add_64_32(b[gl_LocalInvocationIndex], c[gl_LocalInvocationIndex]);
}
ACO:
BB0:
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; f4080101 fa000000
v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; d7650000 000100c1
v_mbcnt_hi_u32_b32_e64 v0, -1, v0 ; d7660000 000200c1
v_and_or_b32 v0, 0xfc0, s0, v0 ; d7710000 040000ff 00000fc0
v_lshlrev_b32_e32 v1, 3, v0 ; 34020083
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
s_clause 0x1 ; bfa10001
buffer_load_dwordx2 v[2:3], v1, s[4:7], 0 offen offset:32 ; e0341020 80010201
buffer_load_dword v0, v0, s[4:7], 0 offen offset:64 ; e0301040 80010000
s_waitcnt vmcnt(0) ; bf8c3f70
v_add_co_u32_e64 v4, s[0:1], v2, v0 ; d70f0004 00020102
v_add_co_ci_u32_e64 v5, vcc, v3, 0, s[0:1] ; d5286a05 00010103
buffer_store_dwordx2 v[4:5], v1, s[4:7], 0 offen ; e0741000 80010401
s_endpgm ; bf810000