Skip to content

aco: optimize conditional divergent breaks at the end of loops

Georg Lehmann requested to merge DadSchoorse/mesa:aco-wrexec into main

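Previously, an if-break block jumped to the loop exit, and the end of the loop then always jumped back to the top. With this change, the break condition is instead evaluated at the end of the loop, which jumps back to the top conditionally and otherwise falls through to the exit.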

Foz-DB Navi21:
Totals from 1338 (1.69% of 79206) affected shaders:
Instrs: 4741950 -> 4704338 (-0.79%)
CodeSize: 26112976 -> 25974556 (-0.53%); split: -0.54%, +0.01%
Latency: 79448030 -> 79509165 (+0.08%); split: -0.03%, +0.11%
InvThroughput: 22443107 -> 22453539 (+0.05%); split: -0.03%, +0.07%
Copies: 447244 -> 427015 (-4.52%)
Branches: 176627 -> 156393 (-11.46%)
SALU: 619935 -> 599706 (-3.26%)
Example loop before this MR:
	; Loop with a divergent break: the loop tail uses two branches, a
	; conditional exit (to BB10) followed by an unconditional back-edge (to BB1).
	v_mov_b32_e32 v31, v0                                       ; 7e3e0300
	s_inst_prefetch 0x2                                         ; bfa00002
BB1:
	; Loop header: s4 = v2 from the first active lane.
	v_readfirstlane_b32 s4, v2                                  ; 7e080502
	; exec is saved into s[10:11] on every iteration.
	s_mov_b64 s[10:11], exec                                    ; be8a047e
	; Restrict exec to the lanes whose v2 equals s4.
	v_cmpx_eq_i32_e32 s4, v2                                    ; 7d240404
	; Skip the body if no lane is left active.
	s_cbranch_execz BB9                                         ; bf880012
BB2:
	s_mov_b32 s12, s2                                           ; be8c0302
	s_movk_i32 s13, 0x8000                                      ; b00d8000
	s_lshl_b32 s5, s6, 4                                        ; 8f058406
	s_mov_b32 s16, s3                                           ; be900303
	s_movk_i32 s17, 0x8000                                      ; b0118000
	s_lshl_b32 s4, s4, 6                                        ; 8f048604
	s_clause 0x1                                                ; bfa10001
	s_load_dwordx4 s[12:15], s[12:13], s5                       ; f4080306 0a000000
	s_load_dwordx8 s[16:23], s[16:17], s4 offset:0x20           ; f40c0408 08000020
	s_waitcnt lgkmcnt(0)                                        ; bf8cc07f
	v_mov_b32_e32 v0, s9                                        ; 7e000209
	image_sample_b v[0:3], [v0, v30, v31], s[16:23], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0940f0a 00640000 00001f1e
	; Break: drop the lanes that just ran from the saved mask; SCC = (result != 0).
	s_andn2_b64 s[10:11], s[10:11], exec                        ; 8a8a7e0a
	; No lanes remain -> leave the loop.
	s_cbranch_scc0 BB10                                         ; bf840002
BB9:
	; Re-activate the remaining lanes, then always jump back to the loop header.
	s_mov_b64 exec, s[10:11]                                    ; befe040a
	s_branch BB1                                                ; bf82ffe8
BB10:
	s_inst_prefetch 0x3                                         ; bfa00003
Example loop after this MR:
	; Loop with a divergent break: the loop tail is a single exec update plus
	; one conditional back-edge; the exit is the fall-through into BB10.
	; exec is saved into s[10:11] once, before the loop header BB1.
	s_mov_b64 s[10:11], exec                                    ; be8a047e
	v_mov_b32_e32 v31, v0                                       ; 7e3e0300
	s_inst_prefetch 0x2                                         ; bfa00002
BB1:
	; Loop header: s4 = v2 from the first active lane.
	v_readfirstlane_b32 s4, v2                                  ; 7e080502
	; Restrict exec to the lanes whose v2 equals s4.
	v_cmpx_eq_i32_e32 s4, v2                                    ; 7d240404
	; Skip the body if no lane is left active.
	s_cbranch_execz BB9                                         ; bf880010
BB2:
	s_mov_b32 s12, s2                                           ; be8c0302
	s_movk_i32 s13, 0x8000                                      ; b00d8000
	s_lshl_b32 s5, s6, 4                                        ; 8f058406
	s_mov_b32 s16, s3                                           ; be900303
	s_movk_i32 s17, 0x8000                                      ; b0118000
	s_lshl_b32 s4, s4, 6                                        ; 8f048604
	s_clause 0x1                                                ; bfa10001
	s_load_dwordx4 s[12:15], s[12:13], s5                       ; f4080306 0a000000
	s_load_dwordx8 s[16:23], s[16:17], s4 offset:0x20           ; f40c0408 08000020
	s_waitcnt lgkmcnt(0)                                        ; bf8cc07f
	v_mov_b32_e32 v0, s9                                        ; 7e000209
	image_sample_b v[0:3], [v0, v30, v31], s[16:23], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0940f0a 00640000 00001f1e
BB9:
	; exec = s[10:11] & ~exec, also written back to s[10:11]; SCC = (exec != 0).
	; This both removes the finished lanes and re-activates the remaining ones.
	s_andn2_wrexec_b64 s[10:11], s[10:11]                       ; be8a3a0a
	; Lanes remain -> jump back to the loop header; otherwise fall through to the exit.
	s_cbranch_scc1 BB1                                          ; bf85ffeb
BB10:
	s_inst_prefetch 0x3                                         ; bfa00003
Edited by Georg Lehmann

Merge request reports

Loading