ACO doesn't form a VMEM clause for image stores in one case on GFX11

ACO doesn't form a VMEM clause for the two image stores in this MSAA resolve shader (marked "store1" and "store 2" in the disassembly below), which reduces performance by 40-50%. Each lane resolves 2 pixels that share a cache line.

Build this to reproduce it easily: https://gitlab.freedesktop.org/mareko/mesa/-/tree/hello-clause?ref_type=heads

Run: HELLO=1 AMD_DEBUG=useaco,cs,asm glxgears

If you don't have GFX11, add: AMD_FORCE_FAMILY=gfx1100

Result:

Compute Shader:
Shader main disassembly:
 BB0:
	s_mov_b32 s0, s3                                            ; be800003
	s_movk_i32 s1, 0x8000                                       ; b0018000
	s_load_b256 s[24:31], s[0:1], 0x3c0                         ; f40c0600 f80003c0
	v_bfe_u32 v1, v0, 0, 10                                     ; d6100001 02290100
	v_bfe_u32 v0, v0, 10, 10                                    ; d6100000 02291500
	s_and_b32 s7, s4, 0x3ff                                     ; 8b07ff04 000003ff
	s_bfe_u32 s4, s4, 0xa000a                                   ; 9304ff04 000a000a
	s_pack_ll_b32_b16 s0, s16, s17                              ; 99001110
	s_pack_ll_b32_b16 s7, s7, s4                                ; 99070407
	v_mov_b16_e32 v1.h, v0.l                                    ; 7f023900
	s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; bf8701b1
	v_pk_mad_u16 v0, s0, s7, v1                                 ; cc094000 1c040e00
	v_mov_b16_e32 v4.l, 0                                       ; 7e083880
	v_mov_b16_e32 v5.l, 1                                       ; 7e0a3881
	v_lshlrev_b16 v1, 1, v0                                     ; d7380001 00020081
	v_add_nc_u16 v3, s6, v0 op_sel:[1,1,1]                      ; d7035803 00020006
	v_add_nc_u16 v0, s6, v0 op_sel:[0,1,1]                      ; d7035000 00020006
	s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; bf8700a3
	v_add_nc_u16 v2, s5, v1 op_sel:[1,0,0]                      ; d7030802 00020205
	v_add_nc_u16 v1, s5, v1                                     ; d7030001 00020205
	v_add_nc_u16 v0, v1, 1                                      ; d7030000 00010301
	s_delay_alu instid0(VALU_DEP_1)                             ; bf870001
	v_mov_b16_e32 v1.h, v0.h                                    ; 7f023980
	s_clause 0x3                                                ; bf850003
	image_load v[6:7], [v1, v4], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020601 00000004
	image_load v[8:9], [v1, v5], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020801 00000005
	image_load v[10:11], [v0, v4], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020a00 00000004
	image_load v[0:1], [v0, v5], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020000 00000005
	v_add_nc_u16 v3, v2, 1                                      ; d7030003 00010302
	s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; bf8700c1
	v_mov_b16_e32 v2.h, v3.h                                    ; 7f043983
	s_waitcnt vmcnt(2)                                          ; bf890bf7
	v_pk_add_f16 v9, v7, v9                                     ; cc0f0009 18021307
	v_pk_add_f16 v8, v6, v8                                     ; cc0f0008 18021106
	v_pk_mul_f16 v8, v8, 0.5 op_sel_hi:[1,0]                    ; cc100008 0801e108
	v_pk_mul_f16 v9, v9, 0.5 op_sel_hi:[1,0]                    ; cc100009 0801e109
	s_delay_alu instid0(VALU_DEP_2)                             ; bf870002
	v_mov_b16_e32 v4.l, v8.l                                    ; 7e083908
	v_mov_b16_e32 v4.h, v8.h                                    ; 7f083988
	v_mov_b16_e32 v5.l, v9.l                                    ; 7e0a3909
	v_mov_b16_e32 v5.h, v9.h                                    ; 7f0a3989
	s_waitcnt lgkmcnt(0)                                        ; bf89fc07
	image_store v[4:5], v2, s[24:31] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16 ; f01b0f84 00060402 <----------------------- store 1
	s_waitcnt vmcnt(0)                                          ; bf8903f7
	v_pk_add_f16 v0, v10, v0                                    ; cc0f0000 1802010a
	v_pk_add_f16 v1, v11, v1                                    ; cc0f0001 1802030b
	s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; bf870122
	v_pk_mul_f16 v0, v0, 0.5 op_sel_hi:[1,0]                    ; cc100000 0801e100
	v_pk_mul_f16 v1, v1, 0.5 op_sel_hi:[1,0]                    ; cc100001 0801e101
	v_mov_b16_e32 v4.l, v0.l                                    ; 7e083900
	v_mov_b16_e32 v4.h, v0.h                                    ; 7f083980
	v_mov_b16_e32 v5.l, v1.l                                    ; 7e0a3901
	v_mov_b16_e32 v5.h, v1.h                                    ; 7f0a3981
	image_store v[4:5], v3, s[24:31] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16 ; f01b0f84 00060403 <----------------------- store 2
	s_nop 0                                                     ; bf800000
	s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)                        ; bfb60003
	s_endpgm                                                    ; bfb00000

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information