ACO doesn't form a VMEM clause for image stores in one case on GFX11
ACO fails to group the two image stores (marked with arrows in the disassembly below) into a single VMEM clause. This reduces performance by 40-50% in this MSAA resolve shader, in which each lane resolves 2 pixels that share a cache line.
To reproduce, build this Mesa branch: https://gitlab.freedesktop.org/mareko/mesa/-/tree/hello-clause?ref_type=heads
Run: HELLO=1 AMD_DEBUG=useaco,cs,asm glxgears
If you don't have a GFX11 GPU, add this to the environment of the command above: AMD_FORCE_FAMILY=gfx1100
Result:
Compute Shader:
Shader main disassembly:
BB0:
s_mov_b32 s0, s3 ; be800003
s_movk_i32 s1, 0x8000 ; b0018000
s_load_b256 s[24:31], s[0:1], 0x3c0 ; f40c0600 f80003c0
v_bfe_u32 v1, v0, 0, 10 ; d6100001 02290100
v_bfe_u32 v0, v0, 10, 10 ; d6100000 02291500
s_and_b32 s7, s4, 0x3ff ; 8b07ff04 000003ff
s_bfe_u32 s4, s4, 0xa000a ; 9304ff04 000a000a
s_pack_ll_b32_b16 s0, s16, s17 ; 99001110
s_pack_ll_b32_b16 s7, s7, s4 ; 99070407
v_mov_b16_e32 v1.h, v0.l ; 7f023900
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; bf8701b1
v_pk_mad_u16 v0, s0, s7, v1 ; cc094000 1c040e00
v_mov_b16_e32 v4.l, 0 ; 7e083880
v_mov_b16_e32 v5.l, 1 ; 7e0a3881
v_lshlrev_b16 v1, 1, v0 ; d7380001 00020081
v_add_nc_u16 v3, s6, v0 op_sel:[1,1,1] ; d7035803 00020006
v_add_nc_u16 v0, s6, v0 op_sel:[0,1,1] ; d7035000 00020006
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; bf8700a3
v_add_nc_u16 v2, s5, v1 op_sel:[1,0,0] ; d7030802 00020205
v_add_nc_u16 v1, s5, v1 ; d7030001 00020205
v_add_nc_u16 v0, v1, 1 ; d7030000 00010301
s_delay_alu instid0(VALU_DEP_1) ; bf870001
v_mov_b16_e32 v1.h, v0.h ; 7f023980
s_clause 0x3 ; bf850003
image_load v[6:7], [v1, v4], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020601 00000004
image_load v[8:9], [v1, v5], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020801 00000005
image_load v[10:11], [v0, v4], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020a00 00000004
image_load v[0:1], [v0, v5], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 d16 ; f0030f99 00020000 00000005
v_add_nc_u16 v3, v2, 1 ; d7030003 00010302
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; bf8700c1
v_mov_b16_e32 v2.h, v3.h ; 7f043983
s_waitcnt vmcnt(2) ; bf890bf7
v_pk_add_f16 v9, v7, v9 ; cc0f0009 18021307
v_pk_add_f16 v8, v6, v8 ; cc0f0008 18021106
v_pk_mul_f16 v8, v8, 0.5 op_sel_hi:[1,0] ; cc100008 0801e108
v_pk_mul_f16 v9, v9, 0.5 op_sel_hi:[1,0] ; cc100009 0801e109
s_delay_alu instid0(VALU_DEP_2) ; bf870002
v_mov_b16_e32 v4.l, v8.l ; 7e083908
v_mov_b16_e32 v4.h, v8.h ; 7f083988
v_mov_b16_e32 v5.l, v9.l ; 7e0a3909
v_mov_b16_e32 v5.h, v9.h ; 7f0a3989
s_waitcnt lgkmcnt(0) ; bf89fc07
image_store v[4:5], v2, s[24:31] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16 ; f01b0f84 00060402 <----------------------- store 1
s_waitcnt vmcnt(0) ; bf8903f7
v_pk_add_f16 v0, v10, v0 ; cc0f0000 1802010a
v_pk_add_f16 v1, v11, v1 ; cc0f0001 1802030b
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; bf870122
v_pk_mul_f16 v0, v0, 0.5 op_sel_hi:[1,0] ; cc100000 0801e100
v_pk_mul_f16 v1, v1, 0.5 op_sel_hi:[1,0] ; cc100001 0801e101
v_mov_b16_e32 v4.l, v0.l ; 7e083900
v_mov_b16_e32 v4.h, v0.h ; 7f083980
v_mov_b16_e32 v5.l, v1.l ; 7e0a3901
v_mov_b16_e32 v5.h, v1.h ; 7f0a3981
image_store v[4:5], v3, s[24:31] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16 ; f01b0f84 00060403 <----------------------- store 2
s_nop 0 ; bf800000
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; bfb60003
s_endpgm ; bfb00000