ACO doesn't hide lds_param_load latencies
Reproduce with:
AMD_DEBUG=useaco,ps,asm glxgears -samples 8
If you don't have a GFX11 GPU, also set AMD_FORCE_FAMILY=gfx1100 in the command above.
.
You'll get the PS prolog below, where every lds_param_load is immediately
followed by the instructions that use its result, so nothing is scheduled in between to hide the load latency.
Pixel Shader:
Shader prolog disassembly:
BB0:
s_mov_b64 s[6:7], exec ; be86017e
s_wqm_b64 exec, exec ; befe1d7e
s_mov_b32 m0, s5 ; befd0005
lds_param_load v18, attr0.x wait_vdst:15 ; ce0f0012
v_interp_p10_f32 v19, v18, v2, v18 wait_exp:0 ; cd000013 044a0512
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; bf8700b1
v_interp_p2_f32 v18, v18, v3, v19 wait_exp:7 ; cd010712 044e0712
lds_param_load v19, attr0.y wait_vdst:0 ; ce000113
v_interp_p10_f32 v20, v19, v2, v19 wait_exp:0 ; cd000014 044e0513
v_interp_p2_f32 v19, v19, v3, v20 wait_exp:7 ; cd010713 04520713
lds_param_load v20, attr0.z wait_vdst:0 ; ce000214
v_interp_p10_f32 v21, v20, v2, v20 wait_exp:0 ; cd000015 04520514
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; bf8700c1
v_interp_p2_f32 v20, v20, v3, v21 wait_exp:7 ; cd010714 04560714
lds_param_load v21, attr0.w wait_vdst:0 ; ce000315
v_interp_p10_f32 v16, v21, v2, v21 wait_exp:0 ; cd000010 04560515
s_mov_b32 s5, m0 ; be85007d
v_interp_p2_f32 v16, v21, v3, v16 wait_exp:7 ; cd010710 04420715
s_mov_b64 exec, s[6:7] ; befe0106
v_mov_b32_e32 v17, v19 ; 7e220313
s_delay_alu instid0(VALU_DEP_2) ; bf870002
v_mov_b32_e32 v19, v16 ; 7e260310
v_mov_b32_e32 v16, v18 ; 7e200312
v_mov_b32_e32 v18, v20 ; 7e240314
s_waitcnt_depctr 0xfff ; bf880fff
Shader main disassembly:
BB0:
v_mov_b32_e32 v0, v16 ; 7e000310
v_mov_b32_e32 v1, v17 ; 7e020311
v_mov_b32_e32 v2, v18 ; 7e040312
v_mov_b32_e32 v3, v19 ; 7e060313
s_waitcnt_depctr 0xfff ; bf880fff
Shader epilog disassembly:
BB0:
v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1 ; 5e000300
v_cvt_pk_rtz_f16_f32_e32 v1, v2, v3 ; 5e020702
exp mrt0 v0, v1, off, off done ; f8000803 80800100
s_nop 0 ; bf800000
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; bfb60003
s_endpgm ; bfb00000