"intel/compiler: Signed integer range analysis for imul_32x16 generation" regresses several OpenCL tests
@idr here is the bug.
The NIR and the IR dumps might not actually be from the same run, but it's probably more relevant to compare the before/after anyway.
The main difference is that imul is now used instead of umul_32x16 in a few places when calculating the thread ID.
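For reference, the relevant NIR change is (excerpted from the dumps below):

before: vec1 32 con ssa_17 = umul_32x16 ssa_8.x, ssa_9.x
after:  vec1 32 con ssa_17 = imul ssa_9.x, ssa_8.x

and likewise for ssa_31. As a minimal sketch of why that matters, assuming the usual NIR semantics for these opcodes (umul_32x16 multiplies by only the low 16 bits of the second source, while imul is a full 32x32 multiply keeping the low 32 bits of the result):

#include <stdint.h>

/* umul_32x16: 32-bit x low-16-bit unsigned multiply, low 32 bits kept */
static inline uint32_t umul_32x16(uint32_t a, uint32_t b)
{
    return a * (b & 0xffff);
}

/* imul: full 32x32 multiply, low 32 bits kept (signedness does not
 * affect the low 32 bits of the product) */
static inline uint32_t imul_lo32(uint32_t a, uint32_t b)
{
    return a * b;
}

The two only agree when the second operand actually fits in 16 bits, which is presumably what the range analysis is meant to prove before picking the 32x16 form. In the generated code this shows up as the single mul with a UW source being replaced by a mul/mul/add sequence for the full 32-bit multiply (compare the "mul(8) g9..." instructions in the two assembly dumps).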
Before
NIR (final form) for compute shader:
shader: MESA_SHADER_COMPUTE
source_sha1: {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
workgroup-size: 0, 0, 0 (variable)
shared-size: 0
inputs: 0
outputs: 0
uniforms: 0
ubos: 1
shared: 0
ray queries: 0
decl_var uniform INTERP_MODE_NONE uint64_t @0 (0, 0, 0)
decl_var uniform INTERP_MODE_NONE uint64_t @1 (1, 8, 0)
decl_var uniform INTERP_MODE_NONE uint64_t @2 (2, 16, 0)
decl_var uniform INTERP_MODE_NONE uint64_t @3 (3, 24, 0)
decl_var uniform INTERP_MODE_NONE u64vec4x0a32B base_global_invocation_id (4, 32, 0)
decl_var ubo INTERP_MODE_NONE uint8_t[64] kernel_input (0, 0, 0)
decl_function __wrapped_select_char_char (0 params)
impl __wrapped_select_char_char {
block block_0:
/* preds: */
vec1 32 con ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 con ssa_1 = load_const (0x00000001 = 0.000000)
vec4 32 con ssa_2 = intrinsic load_ubo (ssa_1, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=64)
vec1 32 con ssa_3 = load_const (0x00000010 = 0.000000)
vec4 32 con ssa_4 = intrinsic load_ubo (ssa_1, ssa_3) (access=0, align_mul=1073741824, align_offset=16, range_base=0, range=64)
vec1 32 con ssa_5 = load_const (0x00000020 = 0.000000)
vec3 64 con ssa_6 = intrinsic load_ubo (ssa_1, ssa_5) (access=0, align_mul=1073741824, align_offset=32, range_base=0, range=64)
vec1 32 con ssa_7 = load_const (0x00000002 = 0.000000)
vec3 32 con ssa_8 = intrinsic load_ubo (ssa_7, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=-1)
vec3 32 con ssa_9 = intrinsic load_workgroup_id () ()
vec1 32 con ssa_10 = intrinsic load_subgroup_id () ()
vec1 32 con ssa_11 = load_const (0x00000005 = 0.000000)
vec1 32 con ssa_12 = ishl ssa_10, ssa_11
vec1 32 div ssa_13 = intrinsic load_subgroup_invocation () ()
vec1 32 div ssa_14 = iadd ssa_13, ssa_12
vec1 32 div ssa_15 = umod ssa_14, ssa_8.x
vec1 32 con ssa_16 = umul_high ssa_9.x, ssa_8.x
vec1 32 con ssa_17 = umul_32x16 ssa_8.x, ssa_9.x
vec1 32 div ssa_18 = iadd ssa_17, ssa_15
vec1 32 div ssa_19 = ult32 ssa_18, ssa_17
vec1 32 div ssa_20 = b2i32 ssa_19
vec1 32 div ssa_21 = iadd ssa_20, ssa_16
vec1 32 con ssa_22 = unpack_64_2x32_split_x ssa_6.x
vec1 32 con ssa_23 = unpack_64_2x32_split_y ssa_6.x
vec1 32 div ssa_24 = iadd ssa_18, ssa_22
vec1 32 div ssa_25 = ult32 ssa_24, ssa_18
vec1 32 div ssa_26 = b2i32 ssa_25
vec1 32 div ssa_27 = iadd ssa_21, ssa_23
vec1 32 div ssa_28 = iadd ssa_26, ssa_27
vec3 32 con ssa_29 = intrinsic load_num_workgroups () ()
vec1 32 con ssa_30 = umul_high ssa_8.x, ssa_29.x
vec1 32 con ssa_31 = umul_32x16 ssa_8.x, ssa_29.x
vec1 32 div ssa_32 = ieq32 ssa_28, ssa_30
vec1 32 div ssa_33 = ult32 ssa_24, ssa_31
vec1 32 div ssa_34 = iand ssa_32, ssa_33
vec1 32 div ssa_35 = ult32 ssa_28, ssa_30
vec1 32 div ssa_36 = ior ssa_35, ssa_34
/* succs: block_1 block_2 */
if ssa_36 {
block block_1:
/* preds: block_0 */
vec1 32 div ssa_37 = iadd ssa_2.z, ssa_24
vec1 32 div ssa_38 = ult32 ssa_37, ssa_2.z
vec1 32 div ssa_39 = b2i32 ssa_38
vec1 32 div ssa_40 = iadd ssa_2.w, ssa_28
vec1 32 div ssa_41 = iadd ssa_39, ssa_40
vec1 64 div ssa_42 = pack_64_2x32_split ssa_37, ssa_41
vec1 8 div ssa_43 = intrinsic load_global (ssa_42) (access=0, align_mul=1, align_offset=0)
vec1 32 div ssa_44 = iadd ssa_4.x, ssa_24
vec1 32 div ssa_45 = ult32 ssa_44, ssa_4.x
vec1 32 div ssa_46 = b2i32 ssa_45
vec1 32 div ssa_47 = iadd ssa_4.y, ssa_28
vec1 32 div ssa_48 = iadd ssa_46, ssa_47
vec1 64 div ssa_49 = pack_64_2x32_split ssa_44, ssa_48
vec1 8 div ssa_50 = intrinsic load_global (ssa_49) (access=0, align_mul=1, align_offset=0)
vec1 32 div ssa_51 = iadd ssa_4.z, ssa_24
vec1 32 div ssa_52 = ult32 ssa_51, ssa_4.z
vec1 32 div ssa_53 = b2i32 ssa_52
vec1 32 div ssa_54 = iadd ssa_4.w, ssa_28
vec1 32 div ssa_55 = iadd ssa_53, ssa_54
vec1 64 div ssa_56 = pack_64_2x32_split ssa_51, ssa_55
vec1 8 div ssa_57 = intrinsic load_global (ssa_56) (access=0, align_mul=1, align_offset=0)
vec1 16 div ssa_58 = i2i16 ssa_57
vec1 16 con ssa_59 = load_const (0x0000 = 0.000000)
vec1 16 div ssa_60 = u2u16 ssa_43
vec1 16 div ssa_61 = u2u16 ssa_50
vec1 32 div ssa_62 = ieq32 ssa_58, ssa_59
vec1 16 div ssa_63 = b32csel ssa_62, ssa_60, ssa_61
vec1 8 div ssa_64 = u2u8 ssa_63
vec1 32 div ssa_65 = iadd ssa_2.x, ssa_24
vec1 32 div ssa_66 = ult32 ssa_65, ssa_2.x
vec1 32 div ssa_67 = b2i32 ssa_66
vec1 32 div ssa_68 = iadd ssa_2.y, ssa_28
vec1 32 div ssa_69 = iadd ssa_67, ssa_68
vec1 64 div ssa_70 = pack_64_2x32_split ssa_65, ssa_69
intrinsic store_global (ssa_64, ssa_70) (wrmask=x /*1*/, access=0, align_mul=1, align_offset=0)
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
/* succs: block_4 */
block block_4:
}
Native code for unnamed compute shader (null) (sha1 d82f39de2f14103f3f04cd6b4107108d0abbbfdf)
SIMD8 shader: 84 instructions. 0 loops. 714 cycles. 0:0 spills:fills, 8 sends, scheduled with mode top-down. Promoted 0 constants. Compacted 1344 to 1104 bytes (18%)
START B0 (370 cycles)
and(1) cr0<1>UD cr0<0,1,0>UD 0xfffffb7fUD { align1 1N @1 };
or(1) cr0<1>UD cr0<0,1,0>UD 0x00000400UD { align1 1N @1 };
sync nop(8) null<0,1,0>UB { align1 1Q @1 };
mov(8) g2<1>UW 0x76543210V { align1 WE_all 1Q };
mov(8) g50<1>UD g0.1<0,1,0>UD { align1 1Q };
mov(8) g65<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
mov(8) g66<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
shl(8) g4<1>D g1<0,1,0>D 0x00000003UD { align1 1Q };
mov(8) g54<1>UD 0x00000000UD { align1 1Q };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N @6 };
mov(8) g5<1>D g2<8,8,1>UW { align1 1Q };
mov(1) g65.2<1>UD 0x00000000UD { align1 WE_all 1N @5 };
mov(1) g66.2<1>UD 0x00000000UD { align1 WE_all 1N @5 };
send(8) g17UD g54UD nullUD 0x02306800 0x00000000
dp data 1 MsgDesc: (untyped surface read, Surface = 0, SIMD8, Mask = 0x8) mlen 1 ex_mlen 0 rlen 3 { align1 1Q @4 $0 };
add(8) g6<1>D g5<8,8,1>D g4<8,8,1>D { align1 1Q @3 compacted };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $0.src };
send(16) g53UD g65UD nullUD 0x02280301 0x00000000
const MsgDesc: (bti 1, msg_ctrl 3, msg_type 0, write_commit 0) mlen 1 ex_mlen 0 rlen 2 { align1 WE_all 1H @3 $1 };
send(16) g3UD g66UD nullUD 0x02280302 0x00000000
const MsgDesc: (bti 2, msg_ctrl 3, msg_type 0, write_commit 0) mlen 1 ex_mlen 0 rlen 2 { align1 WE_all 1H @1 $2 };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $1.dst };
mov(8) g51.1<2>F g54.1<0,1,0>F { align1 1Q compacted };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $2.dst };
math intmod(8) g7<1>UD g6<8,8,1>UD g3<0,1,0>UD { align1 1Q @2 $3 };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $3.src };
mul(8) acc0<1>UD g50<8,8,1>UD g3<0,1,0>UW { align1 1Q @7 };
mul(8) g9<1>UD g3<0,1,0>UD g50<16,8,2>UW { align1 1Q };
mov(8) g51<2>F g54<0,1,0>F { align1 1Q @3 compacted };
mach(8) g8<1>UD g50<8,8,1>UD g3<0,1,0>UD { align1 1Q compacted AccWrEnable };
add(8) g10<1>D g9<8,8,1>D g7<8,8,1>D { align1 1Q @3 $3.dst compacted };
cmp.l.f0.0(8) g11<1>UD g10<8,8,1>UD g9<8,8,1>UD { align1 1Q @1 compacted };
add(8) g13<1>D g10<8,8,1>D g51<8,4,2>D { align1 1Q @4 };
add(8) g12<1>D -g11<8,8,1>D g8<8,8,1>D { align1 1Q @2 compacted };
cmp.l.f0.0(8) g14<1>UD g13<8,8,1>UD g10<8,8,1>UD { align1 1Q @2 compacted };
add(8) g15<1>D g12<8,8,1>D g51.1<8,4,2>D { align1 1Q @2 };
add(8) g16<1>D -g14<8,8,1>D g15<8,8,1>D { align1 1Q @1 compacted };
mul(8) acc0<1>UD g3<0,1,0>UD g17<16,8,2>UW { align1 1Q $0.dst };
mul(8) g20<1>UD g3<0,1,0>UD g17<16,8,2>UW { align1 1Q };
mach(8) g18<1>UD g3<0,1,0>UD g17<8,8,1>UD { align1 1Q $0.dst compacted AccWrEnable };
cmp.z.f0.0(8) g21<1>D g16<8,8,1>D g18<8,8,1>D { align1 1Q @1 compacted };
cmp.l.f0.0(8) g22<1>UD g13<8,8,1>UD g20<8,8,1>UD { align1 1Q @3 compacted };
cmp.l.f0.0(8) g24<1>UD g16<8,8,1>UD g18<8,8,1>UD { align1 1Q compacted };
and(8) g23<1>UD g21<8,8,1>UD g22<8,8,1>UD { align1 1Q @2 compacted };
or.nz.f0.0(8) null<1>UD g24<8,8,1>UD g23<8,8,1>UD { align1 1Q @1 };
(+f0.0) if(8) JIP: LABEL0 UIP: LABEL0 { align1 1Q };
END B0 ->B1 ->B2
START B1 <-B0 (322 cycles)
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $1.dst };
add(8) g25<1>D g53.2<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g27<1>D g53.3<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
add(8) g30<1>D g53.4<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g32<1>D g53.5<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
add(8) g35<1>D g53.6<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g37<1>D g53.7<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
add(8) g46<1>D g53<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g48<1>D g53.1<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
cmp.l.f0.0(8) g26<1>UD g25<8,8,1>UD g53.2<0,1,0>UD { align1 1Q @7 compacted };
mov(8) g55<2>UD g25<4,4,1>UD { align1 1Q };
mov(8) g57<2>UD g30<4,4,1>UD { align1 1Q @7 };
mov(8) g59<2>UD g35<4,4,1>UD { align1 1Q @7 };
cmp.l.f0.0(8) g31<1>UD g30<8,8,1>UD g53.4<0,1,0>UD { align1 1Q compacted };
cmp.l.f0.0(8) g36<1>UD g35<8,8,1>UD g53.6<0,1,0>UD { align1 1Q compacted };
mov(8) g61<2>UD g46<4,4,1>UD { align1 1Q @7 };
add(8) g28<1>D -g26<8,8,1>D g27<8,8,1>D { align1 1Q @7 compacted };
add(8) g33<1>D -g31<8,8,1>D g32<8,8,1>D { align1 1Q @4 compacted };
add(8) g38<1>D -g36<8,8,1>D g37<8,8,1>D { align1 1Q @4 compacted };
mov(8) g55.1<2>UD g28<4,4,1>UD { align1 1Q @3 };
mov(8) g57.1<2>UD g33<4,4,1>UD { align1 1Q @3 };
mov(8) g59.1<2>UD g38<4,4,1>UD { align1 1Q @3 };
send(8) g29UD g55UD nullUD 0x041400fd 0x00000000
dp data 1 MsgDesc: (DC A64 scattered read, Surface = 253, 0x0) mlen 2 ex_mlen 0 rlen 1 { align1 1Q @3 $4 };
send(8) g34UD g57UD nullUD 0x041400fd 0x00000000
dp data 1 MsgDesc: (DC A64 scattered read, Surface = 253, 0x0) mlen 2 ex_mlen 0 rlen 1 { align1 1Q @2 $5 };
send(8) g39UD g59UD nullUD 0x041400fd 0x00000000
dp data 1 MsgDesc: (DC A64 scattered read, Surface = 253, 0x0) mlen 2 ex_mlen 0 rlen 1 { align1 1Q @1 $6 };
mov(8) g41<1>UW g29<32,8,4>UB { align1 1Q $4.dst };
mov(8) g42<1>UW g34<32,8,4>UB { align1 1Q $5.dst };
mov(8) g40<1>W g39<32,8,4>B { align1 1Q $6.dst };
cmp.z.f0.0(8) g43<1>W g40<8,8,1>W 0W { align1 1Q @1 };
mov.nz.f0.0(8) null<1>D g43<8,8,1>W { align1 1Q @1 };
(+f0.0) sel(8) g44<1>UW g41<8,8,1>UW g42<8,8,1>UW { align1 1Q @4 };
mov(8) g64<2>UB g44<8,8,1>UW { align1 1Q @1 };
cmp.l.f0.0(8) g47<1>UD g46<8,8,1>UD g53<0,1,0>UD { align1 1Q compacted };
mov(8) g63<1>UD g64<16,8,2>UB { align1 1Q @2 };
add(8) g49<1>D -g47<8,8,1>D g48<8,8,1>D { align1 1Q @2 compacted };
mov(8) g61.1<2>UD g49<4,4,1>UD { align1 1Q @1 };
send(8) nullUD g61UD g63UD 0x040680fd 0x00000040
dp data 1 MsgDesc: (DC A64 scattered write, Surface = 253, 0x0) mlen 2 ex_mlen 1 rlen 0 { align1 1Q @1 $7 };
END B1 ->B2
START B2 <-B1 <-B0 (22 cycles)
LABEL0:
endif(8) JIP: LABEL1 { align1 1Q };
LABEL1:
mov(8) g126<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
mov(16) acc0<1>F 0x0F /* 0F */ { align1 WE_all 1H @1 };
send(8) nullUD g126UD nullUD 0x02000000 0x00000000
thread_spawner MsgDesc: mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q @1 EOT };
END B2
After
NIR (final form) for compute shader:
shader: MESA_SHADER_COMPUTE
source_sha1: {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
workgroup-size: 0, 0, 0 (variable)
shared-size: 0
inputs: 0
outputs: 0
uniforms: 0
ubos: 1
shared: 0
ray queries: 0
decl_var uniform INTERP_MODE_NONE uint64_t @0 (0, 0, 0)
decl_var uniform INTERP_MODE_NONE uint64_t @1 (1, 8, 0)
decl_var uniform INTERP_MODE_NONE uint64_t @2 (2, 16, 0)
decl_var uniform INTERP_MODE_NONE uint64_t @3 (3, 24, 0)
decl_var uniform INTERP_MODE_NONE u64vec4x0a32B base_global_invocation_id (4, 32, 0)
decl_var ubo INTERP_MODE_NONE uint8_t[64] kernel_input (0, 0, 0)
decl_function __wrapped_select_char_char (0 params)
impl __wrapped_select_char_char {
block block_0:
/* preds: */
vec1 32 con ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 con ssa_1 = load_const (0x00000001 = 0.000000)
vec4 32 con ssa_2 = intrinsic load_ubo (ssa_1, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=64)
vec1 32 con ssa_3 = load_const (0x00000010 = 0.000000)
vec4 32 con ssa_4 = intrinsic load_ubo (ssa_1, ssa_3) (access=0, align_mul=1073741824, align_offset=16, range_base=0, range=64)
vec1 32 con ssa_5 = load_const (0x00000020 = 0.000000)
vec3 64 con ssa_6 = intrinsic load_ubo (ssa_1, ssa_5) (access=0, align_mul=1073741824, align_offset=32, range_base=0, range=64)
vec1 32 con ssa_7 = load_const (0x00000002 = 0.000000)
vec3 32 con ssa_8 = intrinsic load_ubo (ssa_7, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=-1)
vec3 32 con ssa_9 = intrinsic load_workgroup_id () ()
vec1 32 con ssa_10 = intrinsic load_subgroup_id () ()
vec1 32 con ssa_11 = load_const (0x00000005 = 0.000000)
vec1 32 con ssa_12 = ishl ssa_10, ssa_11
vec1 32 div ssa_13 = intrinsic load_subgroup_invocation () ()
vec1 32 div ssa_14 = iadd ssa_13, ssa_12
vec1 32 div ssa_15 = umod ssa_14, ssa_8.x
vec1 32 con ssa_16 = umul_high ssa_9.x, ssa_8.x
vec1 32 con ssa_17 = imul ssa_9.x, ssa_8.x
vec1 32 div ssa_18 = iadd ssa_17, ssa_15
vec1 32 div ssa_19 = ult32 ssa_18, ssa_17
vec1 32 div ssa_20 = b2i32 ssa_19
vec1 32 div ssa_21 = iadd ssa_20, ssa_16
vec1 32 con ssa_22 = unpack_64_2x32_split_x ssa_6.x
vec1 32 con ssa_23 = unpack_64_2x32_split_y ssa_6.x
vec1 32 div ssa_24 = iadd ssa_18, ssa_22
vec1 32 div ssa_25 = ult32 ssa_24, ssa_18
vec1 32 div ssa_26 = b2i32 ssa_25
vec1 32 div ssa_27 = iadd ssa_21, ssa_23
vec1 32 div ssa_28 = iadd ssa_26, ssa_27
vec3 32 con ssa_29 = intrinsic load_num_workgroups () ()
vec1 32 con ssa_30 = umul_high ssa_8.x, ssa_29.x
vec1 32 con ssa_31 = imul ssa_8.x, ssa_29.x
vec1 32 div ssa_32 = ieq32 ssa_28, ssa_30
vec1 32 div ssa_33 = ult32 ssa_24, ssa_31
vec1 32 div ssa_34 = iand ssa_32, ssa_33
vec1 32 div ssa_35 = ult32 ssa_28, ssa_30
vec1 32 div ssa_36 = ior ssa_35, ssa_34
/* succs: block_1 block_2 */
if ssa_36 {
block block_1:
/* preds: block_0 */
vec1 32 div ssa_37 = iadd ssa_2.z, ssa_24
vec1 32 div ssa_38 = ult32 ssa_37, ssa_2.z
vec1 32 div ssa_39 = b2i32 ssa_38
vec1 32 div ssa_40 = iadd ssa_2.w, ssa_28
vec1 32 div ssa_41 = iadd ssa_39, ssa_40
vec1 64 div ssa_42 = pack_64_2x32_split ssa_37, ssa_41
vec1 8 div ssa_43 = intrinsic load_global (ssa_42) (access=0, align_mul=1, align_offset=0)
vec1 32 div ssa_44 = iadd ssa_4.x, ssa_24
vec1 32 div ssa_45 = ult32 ssa_44, ssa_4.x
vec1 32 div ssa_46 = b2i32 ssa_45
vec1 32 div ssa_47 = iadd ssa_4.y, ssa_28
vec1 32 div ssa_48 = iadd ssa_46, ssa_47
vec1 64 div ssa_49 = pack_64_2x32_split ssa_44, ssa_48
vec1 8 div ssa_50 = intrinsic load_global (ssa_49) (access=0, align_mul=1, align_offset=0)
vec1 32 div ssa_51 = iadd ssa_4.z, ssa_24
vec1 32 div ssa_52 = ult32 ssa_51, ssa_4.z
vec1 32 div ssa_53 = b2i32 ssa_52
vec1 32 div ssa_54 = iadd ssa_4.w, ssa_28
vec1 32 div ssa_55 = iadd ssa_53, ssa_54
vec1 64 div ssa_56 = pack_64_2x32_split ssa_51, ssa_55
vec1 8 div ssa_57 = intrinsic load_global (ssa_56) (access=0, align_mul=1, align_offset=0)
vec1 16 div ssa_58 = i2i16 ssa_57
vec1 16 con ssa_59 = load_const (0x0000 = 0.000000)
vec1 16 div ssa_60 = u2u16 ssa_43
vec1 16 div ssa_61 = u2u16 ssa_50
vec1 32 div ssa_62 = ieq32 ssa_58, ssa_59
vec1 16 div ssa_63 = b32csel ssa_62, ssa_60, ssa_61
vec1 8 div ssa_64 = u2u8 ssa_63
vec1 32 div ssa_65 = iadd ssa_2.x, ssa_24
vec1 32 div ssa_66 = ult32 ssa_65, ssa_2.x
vec1 32 div ssa_67 = b2i32 ssa_66
vec1 32 div ssa_68 = iadd ssa_2.y, ssa_28
vec1 32 div ssa_69 = iadd ssa_67, ssa_68
vec1 64 div ssa_70 = pack_64_2x32_split ssa_65, ssa_69
intrinsic store_global (ssa_64, ssa_70) (wrmask=x /*1*/, access=0, align_mul=1, align_offset=0)
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
/* succs: block_4 */
block block_4:
}
Native code for unnamed compute shader (null) (sha1 5e11f88932e5ade05d3767c9772a6585fae3a8fb)
SIMD8 shader: 88 instructions. 0 loops. 708 cycles. 0:0 spills:fills, 8 sends, scheduled with mode top-down. Promoted 0 constants. Compacted 1408 to 1168 bytes (17%)
START B0 (364 cycles)
and(1) cr0<1>UD cr0<0,1,0>UD 0xfffffb7fUD { align1 1N @1 };
or(1) cr0<1>UD cr0<0,1,0>UD 0x00000400UD { align1 1N @1 };
sync nop(8) null<0,1,0>UB { align1 1Q @1 };
mov(8) g2<1>UW 0x76543210V { align1 WE_all 1Q };
mov(8) g50<1>UD g0.1<0,1,0>UD { align1 1Q };
mov(8) g67<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
mov(8) g68<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
shl(8) g4<1>D g1<0,1,0>D 0x00000003UD { align1 1Q };
mov(8) g54<1>UD 0x00000000UD { align1 1Q };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N @6 };
mov(8) g5<1>D g2<8,8,1>UW { align1 1Q };
mov(1) g67.2<1>UD 0x00000000UD { align1 WE_all 1N @5 };
mov(1) g68.2<1>UD 0x00000000UD { align1 WE_all 1N @5 };
send(8) g17UD g54UD nullUD 0x02306800 0x00000000
dp data 1 MsgDesc: (untyped surface read, Surface = 0, SIMD8, Mask = 0x8) mlen 1 ex_mlen 0 rlen 3 { align1 1Q @4 $0 };
add(8) g6<1>D g5<8,8,1>D g4<8,8,1>D { align1 1Q @3 compacted };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $0.src };
send(16) g53UD g67UD nullUD 0x02280301 0x00000000
const MsgDesc: (bti 1, msg_ctrl 3, msg_type 0, write_commit 0) mlen 1 ex_mlen 0 rlen 2 { align1 WE_all 1H @3 $1 };
send(16) g3UD g68UD nullUD 0x02280302 0x00000000
const MsgDesc: (bti 2, msg_ctrl 3, msg_type 0, write_commit 0) mlen 1 ex_mlen 0 rlen 2 { align1 WE_all 1H @1 $2 };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $1.dst };
mov(8) g51.1<2>F g54.1<0,1,0>F { align1 1Q compacted };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $2.dst };
math intmod(8) g7<1>UD g6<8,8,1>UD g3<0,1,0>UD { align1 1Q @2 $3 };
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $3.src };
mul(8) acc0<1>UD g50<8,8,1>UD g3<0,1,0>UW { align1 1Q @7 };
mul(8) g9<1>D g50<8,8,1>D g3<0,1,0>UW { align1 1Q };
mul(8) g64<1>D g50<8,8,1>D g3.1<0,1,0>UW { align1 1Q };
mov(8) g51<2>F g54<0,1,0>F { align1 1Q @4 compacted };
mach(8) g8<1>UD g50<8,8,1>UD g3<0,1,0>UD { align1 1Q compacted AccWrEnable };
add(8) g9.1<2>UW g9.1<16,8,2>UW g64<16,8,2>UW { align1 1Q @3 };
add(8) g10<1>D g9<8,8,1>D g7<8,8,1>D { align1 1Q @1 $3.dst compacted };
cmp.l.f0.0(8) g11<1>UD g10<8,8,1>UD g9<8,8,1>UD { align1 1Q @1 compacted };
add(8) g13<1>D g10<8,8,1>D g51<8,4,2>D { align1 1Q @5 };
add(8) g12<1>D -g11<8,8,1>D g8<8,8,1>D { align1 1Q @2 compacted };
cmp.l.f0.0(8) g14<1>UD g13<8,8,1>UD g10<8,8,1>UD { align1 1Q @2 compacted };
add(8) g15<1>D g12<8,8,1>D g51.1<8,4,2>D { align1 1Q @2 };
mul(8) acc0<1>UD g3<0,1,0>UD g17<16,8,2>UW { align1 1Q $0.dst };
mul(8) g20<1>D g3<0,1,0>D g17<16,8,2>UW { align1 1Q };
mul(8) g65<1>D g3<0,1,0>D g17.1<16,8,2>UW { align1 1Q };
add(8) g16<1>D -g14<8,8,1>D g15<8,8,1>D { align1 1Q @4 compacted };
mach(8) g18<1>UD g3<0,1,0>UD g17<8,8,1>UD { align1 1Q $0.dst compacted AccWrEnable };
add(8) g20.1<2>UW g20.1<16,8,2>UW g65<16,8,2>UW { align1 1Q @3 };
cmp.z.f0.0(8) g21<1>D g16<8,8,1>D g18<8,8,1>D { align1 1Q @2 compacted };
cmp.l.f0.0(8) g22<1>UD g13<8,8,1>UD g20<8,8,1>UD { align1 1Q @2 compacted };
cmp.l.f0.0(8) g24<1>UD g16<8,8,1>UD g18<8,8,1>UD { align1 1Q compacted };
and(8) g23<1>UD g21<8,8,1>UD g22<8,8,1>UD { align1 1Q @2 compacted };
or.nz.f0.0(8) null<1>UD g24<8,8,1>UD g23<8,8,1>UD { align1 1Q @1 };
(+f0.0) if(8) JIP: LABEL0 UIP: LABEL0 { align1 1Q };
END B0 ->B1 ->B2
START B1 <-B0 (322 cycles)
sync nop(1) null<0,1,0>UB { align1 WE_all 1N $1.dst };
add(8) g25<1>D g53.2<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g27<1>D g53.3<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
add(8) g30<1>D g53.4<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g32<1>D g53.5<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
add(8) g35<1>D g53.6<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g37<1>D g53.7<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
add(8) g46<1>D g53<0,1,0>D g13<8,8,1>D { align1 1Q compacted };
add(8) g48<1>D g53.1<0,1,0>D g16<8,8,1>D { align1 1Q compacted };
cmp.l.f0.0(8) g26<1>UD g25<8,8,1>UD g53.2<0,1,0>UD { align1 1Q @7 compacted };
mov(8) g55<2>UD g25<4,4,1>UD { align1 1Q };
mov(8) g57<2>UD g30<4,4,1>UD { align1 1Q @7 };
mov(8) g59<2>UD g35<4,4,1>UD { align1 1Q @7 };
cmp.l.f0.0(8) g31<1>UD g30<8,8,1>UD g53.4<0,1,0>UD { align1 1Q compacted };
cmp.l.f0.0(8) g36<1>UD g35<8,8,1>UD g53.6<0,1,0>UD { align1 1Q compacted };
mov(8) g61<2>UD g46<4,4,1>UD { align1 1Q @7 };
add(8) g28<1>D -g26<8,8,1>D g27<8,8,1>D { align1 1Q @7 compacted };
add(8) g33<1>D -g31<8,8,1>D g32<8,8,1>D { align1 1Q @4 compacted };
add(8) g38<1>D -g36<8,8,1>D g37<8,8,1>D { align1 1Q @4 compacted };
mov(8) g55.1<2>UD g28<4,4,1>UD { align1 1Q @3 };
mov(8) g57.1<2>UD g33<4,4,1>UD { align1 1Q @3 };
mov(8) g59.1<2>UD g38<4,4,1>UD { align1 1Q @3 };
send(8) g29UD g55UD nullUD 0x041400fd 0x00000000
dp data 1 MsgDesc: (DC A64 scattered read, Surface = 253, 0x0) mlen 2 ex_mlen 0 rlen 1 { align1 1Q @3 $4 };
send(8) g34UD g57UD nullUD 0x041400fd 0x00000000
dp data 1 MsgDesc: (DC A64 scattered read, Surface = 253, 0x0) mlen 2 ex_mlen 0 rlen 1 { align1 1Q @2 $5 };
send(8) g39UD g59UD nullUD 0x041400fd 0x00000000
dp data 1 MsgDesc: (DC A64 scattered read, Surface = 253, 0x0) mlen 2 ex_mlen 0 rlen 1 { align1 1Q @1 $6 };
mov(8) g41<1>UW g29<32,8,4>UB { align1 1Q $4.dst };
mov(8) g42<1>UW g34<32,8,4>UB { align1 1Q $5.dst };
mov(8) g40<1>W g39<32,8,4>B { align1 1Q $6.dst };
cmp.z.f0.0(8) g43<1>W g40<8,8,1>W 0W { align1 1Q @1 };
mov.nz.f0.0(8) null<1>D g43<8,8,1>W { align1 1Q @1 };
(+f0.0) sel(8) g44<1>UW g41<8,8,1>UW g42<8,8,1>UW { align1 1Q @4 };
mov(8) g66<2>UB g44<8,8,1>UW { align1 1Q @1 };
cmp.l.f0.0(8) g47<1>UD g46<8,8,1>UD g53<0,1,0>UD { align1 1Q compacted };
mov(8) g63<1>UD g66<16,8,2>UB { align1 1Q @2 };
add(8) g49<1>D -g47<8,8,1>D g48<8,8,1>D { align1 1Q @2 compacted };
mov(8) g61.1<2>UD g49<4,4,1>UD { align1 1Q @1 };
send(8) nullUD g61UD g63UD 0x040680fd 0x00000040
dp data 1 MsgDesc: (DC A64 scattered write, Surface = 253, 0x0) mlen 2 ex_mlen 1 rlen 0 { align1 1Q @1 $7 };
END B1 ->B2
START B2 <-B1 <-B0 (22 cycles)
LABEL0:
endif(8) JIP: LABEL1 { align1 1Q };
LABEL1:
mov(8) g126<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
mov(16) acc0<1>F 0x0F /* 0F */ { align1 WE_all 1H @1 };
send(8) nullUD g126UD nullUD 0x02000000 0x00000000
thread_spawner MsgDesc: mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q @1 EOT };
END B2