aco: optimize conditional divergent breaks at the end of loops
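Previously, a divergent if-break block at the end of a loop jumped conditionally to the loop exit, and the end of the loop then always jumped back to the top. Now we instead jump conditionally back to the top at the end of the loop and fall through to the exit otherwise, which removes one branch per loop.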
Foz-DB Navi21:
Totals from 1338 (1.69% of 79206) affected shaders:
Instrs: 4741950 -> 4704338 (-0.79%)
CodeSize: 26112976 -> 25974556 (-0.53%); split: -0.54%, +0.01%
Latency: 79448030 -> 79509165 (+0.08%); split: -0.03%, +0.11%
InvThroughput: 22443107 -> 22453539 (+0.05%); split: -0.03%, +0.07%
Copies: 447244 -> 427015 (-4.52%)
Branches: 176627 -> 156393 (-11.46%)
SALU: 619935 -> 599706 (-3.26%)
Example loop before this MR:
v_mov_b32_e32 v31, v0 ; 7e3e0300
s_inst_prefetch 0x2 ; bfa00002
BB1:
v_readfirstlane_b32 s4, v2 ; 7e080502
s_mov_b64 s[10:11], exec ; be8a047e
v_cmpx_eq_i32_e32 s4, v2 ; 7d240404
s_cbranch_execz BB9 ; bf880012
BB2:
s_mov_b32 s12, s2 ; be8c0302
s_movk_i32 s13, 0x8000 ; b00d8000
s_lshl_b32 s5, s6, 4 ; 8f058406
s_mov_b32 s16, s3 ; be900303
s_movk_i32 s17, 0x8000 ; b0118000
s_lshl_b32 s4, s4, 6 ; 8f048604
s_clause 0x1 ; bfa10001
s_load_dwordx4 s[12:15], s[12:13], s5 ; f4080306 0a000000
s_load_dwordx8 s[16:23], s[16:17], s4 offset:0x20 ; f40c0408 08000020
s_waitcnt lgkmcnt(0) ; bf8cc07f
v_mov_b32_e32 v0, s9 ; 7e000209
image_sample_b v[0:3], [v0, v30, v31], s[16:23], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0940f0a 00640000 00001f1e
s_andn2_b64 s[10:11], s[10:11], exec ; 8a8a7e0a
s_cbranch_scc0 BB10 ; bf840002
BB9:
s_mov_b64 exec, s[10:11] ; befe040a
s_branch BB1 ; bf82ffe8
BB10:
s_inst_prefetch 0x3 ; bfa00003
Example loop after this MR:
s_mov_b64 s[10:11], exec ; be8a047e
v_mov_b32_e32 v31, v0 ; 7e3e0300
s_inst_prefetch 0x2 ; bfa00002
BB1:
v_readfirstlane_b32 s4, v2 ; 7e080502
v_cmpx_eq_i32_e32 s4, v2 ; 7d240404
s_cbranch_execz BB9 ; bf880010
BB2:
s_mov_b32 s12, s2 ; be8c0302
s_movk_i32 s13, 0x8000 ; b00d8000
s_lshl_b32 s5, s6, 4 ; 8f058406
s_mov_b32 s16, s3 ; be900303
s_movk_i32 s17, 0x8000 ; b0118000
s_lshl_b32 s4, s4, 6 ; 8f048604
s_clause 0x1 ; bfa10001
s_load_dwordx4 s[12:15], s[12:13], s5 ; f4080306 0a000000
s_load_dwordx8 s[16:23], s[16:17], s4 offset:0x20 ; f40c0408 08000020
s_waitcnt lgkmcnt(0) ; bf8cc07f
v_mov_b32_e32 v0, s9 ; 7e000209
image_sample_b v[0:3], [v0, v30, v31], s[16:23], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0940f0a 00640000 00001f1e
BB9:
s_andn2_wrexec_b64 s[10:11], s[10:11] ; be8a3a0a
s_cbranch_scc1 BB1 ; bf85ffeb
BB10:
s_inst_prefetch 0x3 ; bfa00003
Edited by Georg Lehmann