RADV/LLVM: Bug in sign-extension case for 16-bit integers
Observed on mesa-git with LLVM 9.0.1 on a 5700 XT. I cannot test on ACO since it does not support small integers yet.
I have a compute shader (shown below) which computes the wrong result on RADV but works on other implementations.
#version 450
// One-dimensional workgroup of 64 invocations.
layout(local_size_x = 64) in;
// Explicit 8-/16-bit arithmetic types (i16vec2, u8vec2, the `s` literal suffix) and
// 8-/16-bit storage in buffer blocks; the declarations and code below rely on them.
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_16bit_storage : require
// Input: read-only, runtime-sized array of signed 16-bit pairs (set 0, binding 0).
layout(set = 0, binding = 0) readonly buffer SSBO
{
i16vec2 inputs[];
};
// Output: write-only, runtime-sized array of unsigned 8-bit pairs (set 0, binding 1).
layout(set = 0, binding = 1) writeonly buffer SSBOOutput
{
u8vec2 outputs[];
};
// Per component: subtract the 0x80 bias, sign-extend the low 9 bits of the 16-bit
// result (shift left by 7, then right by 7), add the bias back, and clamp to [0, 0xff].
i16vec2 clamp_9bit_notrunc(i16vec2 color)
{
color -= 0x80s; // <-- This causes the problem
color <<= 7s; // drops the top 7 bits, moving bit 8 into the sign position
color >>= 7s; // right shift of a signed value; replicates that bit back down
color += 0x80s; // <-- This causes the problem
return clamp(color, i16vec2(0), i16vec2(0xff));
}
// Each invocation converts one element: load the i16vec2 at its global x index,
// run it through clamp_9bit_notrunc, narrow to u8vec2, and store at the same index.
void main()
{
i16vec2 inp = inputs[gl_GlobalInvocationID.x];
u8vec2 outp = u8vec2(clamp_9bit_notrunc(inp));
outputs[gl_GlobalInvocationID.x] = outp;
}
Here's a FOZ archive which can build a pipeline: repro.foz
The NIR generated:
local-size: 64, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 0
shared: 0
decl_var ssbo INTERP_MODE_NONE SSBO (~0, 0, 0)
decl_var ssbo INTERP_MODE_NONE SSBOOutput @0 (~0, 0, 1)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec1 16 ssa_0 = load_const (0x00ff /* 0.000015 */)
vec1 16 ssa_1 = load_const (0x0000 /* 0.000000 */)
vec1 16 ssa_2 = load_const (0x0080 /* 0.000008 */)
vec3 32 ssa_3 = intrinsic load_work_group_id () ()
vec3 32 ssa_4 = intrinsic load_local_invocation_id () ()
vec1 32 ssa_5 = load_const (0x00000006 /* 0.000000 */)
vec1 32 ssa_6 = ishl ssa_3.x, ssa_5
vec1 32 ssa_7 = iadd ssa_6, ssa_4.x
vec1 32 ssa_8 = load_const (0x00000000 /* 0.000000 */)
vec1 32 ssa_9 = intrinsic vulkan_resource_index (ssa_8) (0, 0, 7) /* desc-set=0 */ /* binding=0 */ /* desc_type=SSBO */
vec1 32 ssa_10 = load_const (0x00000002 /* 0.000000 */)
vec1 32 ssa_11 = ishl ssa_7, ssa_10
vec2 16 ssa_12 = intrinsic load_ssbo (ssa_9, ssa_11) (16, 2, 0) /* access=16 */ /* align_mul=2 */ /* align_offset=0 */
vec1 32 ssa_13 = load_const (0x00000007 /* 0.000000 */)
vec1 16 ssa_14 = ishl ssa_12.x, ssa_13
vec1 16 ssa_15 = load_const (0xc000 /* -2.000000 */)
vec1 16 ssa_16 = iadd ssa_14, ssa_15
vec1 16 ssa_17 = ishl ssa_12.y, ssa_13
vec1 16 ssa_18 = iadd ssa_17, ssa_15
vec1 16 ssa_19 = ishr ssa_16, ssa_13
vec1 16 ssa_20 = ishr ssa_18, ssa_13
vec1 16 ssa_21 = iadd ssa_19, ssa_2
vec1 16 ssa_22 = iadd ssa_20, ssa_2
vec1 16 ssa_23 = imax ssa_21, ssa_1
vec1 16 ssa_24 = imax ssa_22, ssa_1
vec1 16 ssa_25 = imin ssa_23, ssa_0
vec1 16 ssa_26 = imin ssa_24, ssa_0
vec1 8 ssa_27 = i2i8 ssa_25
vec1 8 ssa_28 = i2i8 ssa_26
vec2 8 ssa_29 = vec2 ssa_27, ssa_28
vec1 32 ssa_30 = intrinsic vulkan_resource_index (ssa_8) (0, 1, 7) /* desc-set=0 */ /* binding=1 */ /* desc_type=SSBO */
vec1 32 ssa_31 = load_const (0x00000001 /* 0.000000 */)
vec1 32 ssa_32 = ishl ssa_7, ssa_31
intrinsic store_ssbo (ssa_29, ssa_30, ssa_32) (3, 8, 1, 0) /* wrmask=xy */ /* access=8 */ /* align_mul=1 */ /* align_offset=0 */
/* succs: block_1 */
block block_1:
}
The NIR seems fine.
The LLVM IR output looks a little fishy, though.
; ModuleID = 'mesa-shader'
source_filename = "mesa-shader"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
target triple = "amdgcn-mesa-mesa3d"
@compute_lds = external addrspace(3) global [0 x i8], align 65536
define amdgpu_cs void @main(i8 addrspace(6)* inreg noalias dereferenceable(18446744073709551615), i32 inreg, <3 x i32>) #0 {
main_body:
%3 = shl i32 %1, 6
%4 = extractelement <3 x i32> %2, i32 0
%5 = add i32 %3, %4
%6 = bitcast i8 addrspace(6)* %0 to <4 x i32> addrspace(6)*, !amdgpu.uniform !0
%7 = shl i32 %5, 2
%8 = load <4 x i32>, <4 x i32> addrspace(6)* %6, align 16, !invariant.load !0
%9 = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %8, i32 %7, i32 0, i32 0) #2
%10 = or i32 %7, 2
%11 = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %8, i32 %10, i32 0, i32 0) #2
%12 = shl i16 %9, 7
%13 = add i16 %12, -16384
%14 = shl i16 %11, 7
%15 = add i16 %14, -16384
%16 = ashr exact i16 %13, 7
%17 = ashr exact i16 %15, 7
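; ... ?!?! The clamp below uses the range [-128, 127] followed by an xor with -128, rather than adding 0x80 and clamping to [0, 0xff] as in the shader. This might be correct, but it is surprising.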
%18 = icmp sgt i16 %16, -128
%19 = select i1 %18, i16 %16, i16 -128
%20 = icmp sgt i16 %17, -128
%21 = select i1 %20, i16 %17, i16 -128
%22 = icmp slt i16 %19, 127
%23 = select i1 %22, i16 %19, i16 127
%24 = icmp slt i16 %21, 127
%25 = select i1 %24, i16 %21, i16 127
%26 = trunc i16 %23 to i8
%27 = xor i8 %26, -128
%28 = trunc i16 %25 to i8
%29 = xor i8 %28, -128
%30 = insertelement <2 x i8> undef, i8 %27, i32 0
%31 = insertelement <2 x i8> %30, i8 %29, i32 1
%32 = getelementptr i8, i8 addrspace(6)* %0, i32 16
%33 = bitcast i8 addrspace(6)* %32 to <4 x i32> addrspace(6)*, !amdgpu.uniform !0
%34 = shl i32 %5, 1
%35 = load <4 x i32>, <4 x i32> addrspace(6)* %33, align 16, !invariant.load !0
%36 = bitcast <2 x i8> %31 to i16
call void @llvm.amdgcn.raw.buffer.store.i16(i16 %36, <4 x i32> %35, i32 %34, i32 0, i32 1) #4
ret void
}
ISA:
BB835_0:
s_mov_b32 s0, s3 ; BE800303
s_movk_i32 s3, 0x8000 ; B0038000
v_mov_b32_e32 v1, 0xffff ; 7E0202FF 0000FFFF
s_movk_i32 s8, 0x7f ; B008007F
s_movk_i32 s9, 0xff80 ; B009FF80
v_lshl_add_u32 v0, s0, 6, v0 ; D7460000 04010C00
s_load_dwordx4 s[4:7], s[2:3], 0x0 ; F4080101 FA000000
s_add_i32 s2, s2, 16 ; 81029002
s_movk_i32 s10, 0x80 ; B00A0080
v_lshlrev_b32_e32 v2, 2, v0 ; 34040082
v_lshlrev_b32_e32 v0, 1, v0 ; 34000081
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; F4080001 FA000000
s_waitcnt lgkmcnt(0) ; BF8CC07F
buffer_load_ushort v3, v2, s[4:7], 0 offen ; E0281000 80010302
buffer_load_ushort v2, v2, s[4:7], 0 offen offset:2 ; E0281002 80010202
s_waitcnt vmcnt(1) ; BF8C3F71
v_lshlrev_b16_e64 v3, 7, v3 ; D7140003 00020687
s_waitcnt vmcnt(0) ; BF8C3F70
v_lshlrev_b16_e64 v2, 7, v2 ; D7140002 00020487
v_and_b32_e32 v3, v3, v1 ; 36060303
v_and_b32_e32 v2, v2, v1 ; 36040302
v_add_nc_u16_e64 v3, v3, 0xffffc000 ; D7030003 0001EB03
v_add_nc_u16_e64 v2, v2, 0xffffc000 ; D7030002 0001EB02
v_and_b32_e32 v3, v3, v1 ; 36060303
v_and_b32_e32 v2, v2, v1 ; 36040302
v_ashrrev_i16_e64 v3, 7, v3 ; D7080003 00020687
v_ashrrev_i16_e64 v2, 7, v2 ; D7080002 00020487
v_and_b32_e32 v3, v3, v1 ; 36060303
v_and_b32_e32 v1, v2, v1 ; 36020302
v_med3_i16 v2, v3, s9, s8 ; D7580002 00201303
v_med3_i16 v1, v1, s9, s8 ; D7580001 00201301
v_xor_b32_e32 v2, s10, v2 ; 3A04040A
v_xor_b32_sdwa v1, v1, s10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; 3A0214F9 86060101
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:WORD_0 ; 380202F9 04000602
buffer_store_short v1, v0, s[0:3], 0 offen glc ; E0685000 80000100
s_endpgm