nv30: Big endian fragment and vertex shaders clobber instructions with -fstore-merging on GCC
System information
System:
Host: g5box Kernel: 6.1.1-gentoo arch: ppc64 bits: 64 compiler: gcc
v: 11.2.1 Desktop: Fluxbox v: 1.3.7-gentoo-r4 dm: startx Distro: Gentoo
Base System release 2.9
CPU:
Info: quad core model: PPC970MP altivec supported bits: 64 type: MCP
arch: ppc64 rev: 1.1 (pvr 0044 0101) cache: L1: 384 KiB L2: 4 MiB
Speed (MHz): avg: 2500 min/max: 1250/2500 cores: 1: 2500 2: 2500 3: 2500
4: 2500 bogomips: N/A
Features: N/A
Graphics:
Device-1: NVIDIA G70 [GeForce 7800 GT] driver: nouveau v: kernel
arch: Curie ports: active: DVI-I-2 empty: DVI-I-1 bus-ID: 0000:0a:00.0
chip-ID: 10de:0092
Display: server: X.Org v: 1.21.1.3 driver: X: loaded: nouveau
gpu: nouveau display-ID: :0.0 screens: 1
Screen-1: 0 s-res: 1920x1080 s-dpi: 96
Monitor-1: DVI-I-2 model: LG (GoldStar) IPS FULLHD res: 1920x1080
dpi: 102 diag: 551mm (21.7")
OpenGL: renderer: NV47 v: 2.1 Mesa 23.1.0-devel (git-ad8f8be4e8)
direct render: Yes
Describe the issue
With very basic fragment and vertex shaders (mythtv's UI), I'm seeing reads of uninitialized memory. This is very likely a GCC bug, and I have a minimal reproducer that should probably be filed with GCC, but it may be possible to work around it by disabling the optimization in the fragment and vertex shader instruction-emitting functions. The instructions that trigger it for me are DP2 and RSQ, but I suspect it happens any time the constant "0" is used for the saturate argument.
Instead of a UI rendering, I'm simply greeted with a black screen half of the time, due to uninitialized memory being used.
Regression
It really started showing up when NIR was used as an intermediary between GLSL and TGSI, but it's possible that some version of this bug has existed for a long time.
Here's a minimal reproducer. Compiling it with -O2 -fstore-merging and with -O2 -fno-store-merging produces binaries that print different results for sat: with store merging, the sat bit is often set to 1 instead of 0:
#include <stdio.h>
#include <stdint.h>
/*
 * Constants used by the reproducer (presumably lifted from the nv30 driver
 * headers -- TODO confirm). Only NVFX_FP_MASK_X, NVFX_FP_MASK_Y,
 * NVFX_FP_OP_OPCODE_MUL, NVFX_COND_TR and NVFXSR_NONE are referenced below;
 * the remaining ones are unused in this test case.
 */
#define NVFX_FP_MASK_X 1
#define NVFX_FP_MASK_Y 2
#define NVFX_FP_MASK_Z 4
#define NVFX_FP_MASK_W 8
#define NVFX_FP_MASK_ALL 0xf
#define NV40_FP_OP_OUT_NONE (1U << 30)
#define NVFX_FP_OP_OPCODE_MUL 0x02
/* 7 is the largest value that fits the 3-bit cc_test bit-field it is stored in. */
#define NVFX_COND_TR 7
#define NVFXSR_NONE 0
/*
 * arith(s, o, d, m, s0, s1, s2): build a struct nvfx_insn by calling
 * nvfx_insn().
 *   s          - forwarded as the `sat` argument
 *   o          - opcode suffix, pasted onto NVFX_FP_OP_OPCODE_ (e.g. MUL)
 *   d          - destination register (struct nvfx_reg)
 *   m          - write mask
 *   s0, s1, s2 - the three source operands (struct nvfx_src)
 * The `unit` argument is always passed as -1.
 */
#define arith(s,o,d,m,s0,s1,s2) \
nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
(d), (m), (s0), (s1), (s2))
/*
 * Register reference: a type tag plus an index.
 * Note: the tag name nvfx_reg is also used for the constructor function
 * below; struct tags and functions live in separate C namespaces.
 */
struct nvfx_reg {
int8_t type;
int32_t index;
};
/*
 * Source operand: a register plus modifier bit-fields and a 4-entry swizzle.
 * The bit-fields below total 6 bits; how they are packed and ordered within
 * their storage unit is implementation-defined.
 */
struct nvfx_src {
struct nvfx_reg reg;
uint8_t indirect : 1;
uint8_t indirect_reg : 1;
uint8_t indirect_swz : 2;
uint8_t negate : 1;
uint8_t abs : 1;
uint8_t swz[4];
};
/*
 * One instruction: opcode, scale, unit, write mask, condition-code swizzle,
 * a group of flag bit-fields, a destination register and three sources.
 *
 * The flag bit-fields (sat .. cc_test_reg) total 7 bits. `sat` is the field
 * whose value is observed to be wrong in this report. Bit-field packing and
 * bit order are implementation-defined.
 */
struct nvfx_insn
{
uint8_t op;
char scale;
int8_t unit;
uint8_t mask;
uint8_t cc_swz[4];
/* 1 + 1 + 1 + 3 + 1 = 7 bits of flags follow. */
uint8_t sat : 1;
uint8_t cc_update : 1;
uint8_t cc_update_reg : 1;
uint8_t cc_test : 3;
uint8_t cc_test_reg : 1;
struct nvfx_reg dst;
struct nvfx_src src[3];
};
/*
 * Construct a struct nvfx_insn by value.
 *
 * sat        - stored into the 1-bit `sat` field
 * op         - opcode, stored into the uint8_t `op` member
 * unit       - stored into the int8_t `unit` member
 * dst        - destination register
 * mask       - write mask, stored into the uint8_t `mask` member
 * s0, s1, s2 - source operands, copied into src[0..2]
 *
 * Every member of the struct is named in the designated initializer, so the
 * returned object has no member left unset: scale and the cc_update,
 * cc_update_reg and cc_test_reg flags are 0, cc_test is NVFX_COND_TR, and
 * cc_swz is { 0, 1, 2, 3 }.
 */
static inline struct nvfx_insn
nvfx_insn(uint8_t sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
{
struct nvfx_insn insn = {
.op = op,
.scale = 0,
.unit = unit,
/* This is the initializer whose effect is lost in the miscompiled build. */
.sat = sat,
.mask = mask,
.cc_update = 0,
.cc_update_reg = 0,
.cc_test = NVFX_COND_TR,
.cc_test_reg = 0,
.cc_swz = { 0, 1, 2, 3 },
.dst = dst,
.src = {s0, s1, s2}
};
return insn;
}
/*
 * Construct a struct nvfx_reg by value from a type tag and an index.
 * `type` is an int narrowed to the int8_t member; `index` is stored as int32_t.
 */
static inline struct nvfx_reg
nvfx_reg(int type, int index)
{
struct nvfx_reg temp = {
.type = type,
.index = index,
};
return temp;
}
/*
 * Construct a struct nvfx_src by value that refers to `reg` with no
 * modifiers and the swizzle { 0, 1, 2, 3 }.
 *
 * indirect_reg and indirect_swz are not named in the initializer; C
 * initialization rules set members omitted from an initializer list to zero.
 */
static inline struct nvfx_src
nvfx_src(struct nvfx_reg reg)
{
struct nvfx_src temp = {
.reg = reg,
.abs = 0,
.negate = 0,
.swz = { 0, 1, 2, 3 },
.indirect = 0,
};
return temp;
}
/*
 * Build and return a single MUL instruction with sat = 0, write mask X|Y,
 * destination register {type 0, index 1}, sources src[0] and src[1], and an
 * empty third source.
 *
 * Because 0 is passed as the first argument of arith(), the returned
 * instruction's `sat` field is expected to be 0.
 *
 * NOTE(review): insn, mask, sat, unit, ai, ci, ii and i are declared but never
 * used in this function.
 * NOTE(review): src[0] and src[1] have no initializer and only reg.type and
 * reg.index are assigned, so their remaining members (bit-fields and swz) are
 * indeterminate when copied into the instruction. That is separate from the
 * `sat` field, which is explicitly initialised by nvfx_insn().
 */
struct nvfx_insn emit_test(void)
{
const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
struct nvfx_insn insn;
struct nvfx_src src[2];
struct nvfx_reg tmp = {0, 1};
int mask, sat, unit = 0;
int ai = -1, ci = -1, ii = -1;
int i;
src[0].reg.type = 0;
src[0].reg.index = 2;
src[1].reg.type = 4;
src[1].reg.index = 8;
return arith(0, MUL, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none);
}
/*
 * Build the test instruction and print its `sat` field.
 * emit_test() passes 0 for sat, so the expected output is "sat? = 0".
 */
int main(void)
{
struct nvfx_insn ins = emit_test();
printf("sat? = %d\n", ins.sat);
}
Here's the output of -fno-store-merging:
adam@g5box ~ $ ./test.out
sat? = 0
Here it is with -fstore-merging (which is enabled by default at -O2):
adam@g5box ~ $ ./test.out
sat? = 1
And when running under valgrind, you can see that it immediately flags the use of uninitialized stack memory:
==13566== Memcheck, a memory error detector
==13566== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al.
==13566== Using Valgrind-3.20.0 and LibVEX; rerun with -h for copyright info
==13566== Command: ./test.out
==13566==
==13566== Use of uninitialised value of size 8
==13566== at 0x415B8C8: _itoa_word (in /lib64/libc.so.6)
==13566== by 0x4167BE3: __vfprintf_internal (in /lib64/libc.so.6)
==13566== by 0x100005F7: main (in /home/adam/test.out)
==13566==
==13566== Conditional jump or move depends on uninitialised value(s)
==13566== at 0x415B8D0: _itoa_word (in /lib64/libc.so.6)
==13566== by 0x4167BE3: __vfprintf_internal (in /lib64/libc.so.6)
==13566== by 0x100005F7: main (in /home/adam/test.out)
==13566==
==13566== Conditional jump or move depends on uninitialised value(s)
==13566== at 0x41683F4: __vfprintf_internal (in /lib64/libc.so.6)
==13566== by 0x4252C13: __printf_chk@@GLIBC_2.4 (in /lib64/libc.so.6)
==13566== by 0x100005F7: main (in /home/adam/test.out)
==13566==
==13566== Conditional jump or move depends on uninitialised value(s)
==13566== at 0x4168FA8: __vfprintf_internal (in /lib64/libc.so.6)
==13566== by 0x4252C13: __printf_chk@@GLIBC_2.4 (in /lib64/libc.so.6)
==13566== by 0x100005F7: main (in /home/adam/test.out)
==13566==
sat? = 0
==13566==
==13566== HEAP SUMMARY:
==13566== in use at exit: 0 bytes in 0 blocks
==13566== total heap usage: 1 allocs, 1 frees, 1,024 bytes allocated
==13566==
==13566== All heap blocks were freed -- no leaks are possible
==13566==
==13566== Use --track-origins=yes to see where uninitialised values come from
==13566== For lists of detected and suppressed errors, rerun with: -s
==13566== ERROR SUMMARY: 4 errors from 4 contexts (suppressed: 0 from 0)
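Here's the PPC64 (BE) assembly of that routine with and without that optimization. Note that the store-merging version never stores to 124(r1), whereas the version without it updates that byte with an lbz/andi./ori/stb sequence. Here's with -fstore-merging: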
mflr r0
std r31,-8(r1)
addis r10,r2,-2
lwz r10,-30200(r10)
li r4,0
li r9,0
li r11,1
li r6,2
li r7,4
sldi r10,r10,24
li r8,8
li r5,68
std r0,16(r1)
stdu r1,-208(r1)
lis r0,512
ori r0,r0,65283
sldi r0,r0,32
oris r0,r0,1
ori r0,r0,515
ld r31,-28688(r13)
std r31,184(r1)
li r31,0
std r4,168(r1)
addi r4,r1,116
std r10,176(r1)
stb r9,128(r1)
stb r9,136(r1)
mr r31,r3
stb r9,168(r1)
stw r9,172(r1)
std r0,116(r1)
stw r11,132(r1)
stw r6,140(r1)
stb r7,152(r1)
stw r8,156(r1)
bl 10000540 <0000003b.plt_call.memcpy@@GLIBC_2.3>
ld r2,40(r1)
ld r10,184(r1)
ld r9,-28688(r13)
xor. r10,r10,r9
li r9,0
bne 10000860 <.emit_test+0xc0>
addi r1,r1,208
mr r3,r31
ld r0,16(r1)
ld r31,-8(r1)
mtlr r0
blr
bl 100005a0 <0000003b.plt_call.__stack_chk_fail@@GLIBC_2.4>
ld r2,40(r1)
.long 0x0
.long 0x1
lwz r0,0(r1)
nop
.long 0x0
.long 0x1f778
Here's without:
mflr r0
std r29,-24(r1)
std r30,-16(r1)
lis r8,1
std r31,-8(r1)
addis r7,r2,-2
lwz r7,-30152(r7)
ori r8,r8,515
li r5,0
li r9,0
li r12,3
li r11,2
li r6,4
sldi r7,r7,24
li r29,2
std r0,16(r1)
stdu r1,-224(r1)
li r0,1
li r30,-1
mr r31,r3
lbz r10,124(r1)
ld r4,-28688(r13)
std r4,184(r1)
li r4,0
stw r8,120(r1)
li r8,8
std r5,168(r1)
addi r4,r1,116
stb r9,117(r1)
li r5,68
stb r9,128(r1)
stb r9,136(r1)
andi. r10,r10,29
stb r9,168(r1)
stw r9,172(r1)
std r7,176(r1)
stb r29,116(r1)
ori r10,r10,28
stb r30,118(r1)
stb r12,119(r1)
stw r0,132(r1)
stw r11,140(r1)
stb r10,124(r1)
stb r6,152(r1)
stw r8,156(r1)
bl 10000540 <0000003b.plt_call.memcpy@@GLIBC_2.3>
ld r2,40(r1)
ld r10,184(r1)
ld r9,-28688(r13)
xor. r10,r10,r9
li r9,0
bne 10000890 <.emit_test+0xf0>
addi r1,r1,224
mr r3,r31
ld r0,16(r1)
ld r29,-24(r1)
ld r30,-16(r1)
ld r31,-8(r1)
mtlr r0
blr
bl 100005a0 <0000003b.plt_call.__stack_chk_fail@@GLIBC_2.4>
ld r2,40(r1)
.long 0x0
.long 0x1
lwz r0,0(r3)
nop
.long 0x0
.long 0x1f748