nv30: Big endian fragment and vertex shaders clobber instructions with -fstore-merging on GCC

System information

System:
  Host: g5box Kernel: 6.1.1-gentoo arch: ppc64 bits: 64 compiler: gcc
    v: 11.2.1 Desktop: Fluxbox v: 1.3.7-gentoo-r4 dm: startx Distro: Gentoo
    Base System release 2.9
CPU:
  Info: quad core model: PPC970MP altivec supported bits: 64 type: MCP
    arch: ppc64 rev: 1.1 (pvr 0044 0101) cache: L1: 384 KiB L2: 4 MiB
  Speed (MHz): avg: 2500 min/max: 1250/2500 cores: 1: 2500 2: 2500 3: 2500
    4: 2500 bogomips: N/A
  Features: N/A
Graphics:
  Device-1: NVIDIA G70 [GeForce 7800 GT] driver: nouveau v: kernel
    arch: Curie ports: active: DVI-I-2 empty: DVI-I-1 bus-ID: 0000:0a:00.0
    chip-ID: 10de:0092
  Display: server: X.Org v: 1.21.1.3 driver: X: loaded: nouveau
    gpu: nouveau display-ID: :0.0 screens: 1
  Screen-1: 0 s-res: 1920x1080 s-dpi: 96
  Monitor-1: DVI-I-2 model: LG (GoldStar) IPS FULLHD res: 1920x1080
    dpi: 102 diag: 551mm (21.7")
  OpenGL: renderer: NV47 v: 2.1 Mesa 23.1.0-devel (git-ad8f8be4e8)
    direct render: Yes

Describe the issue

With very basic fragment and vertex shaders (mythtv's UI), I'm seeing accesses to uninitialized memory. This is very likely a GCC bug, and I have a minimal reproducer that should probably be filed with GCC, but it can possibly be worked around by disabling the optimization in the functions that emit fragment and vertex shader instructions. The particular instructions triggering it are DP2 and RSQ, but I suspect it's an issue any time the constant "0" is used for the saturate argument.

Instead of a UI rendering, I'm simply greeted with a black screen half of the time, due to uninitialized memory being used.

Regression

It really started showing up when NIR was used as an intermediary between GLSL and TGSI, but it's possible that some version of this bug has existed for a long time.

Here's a minimal reproducer. Compiling it with -O2 -fstore-merging and with -O2 -fno-store-merging prints different results for sat: with store merging, the sat bit often reads back as 1 instead of 0:

#include <stdio.h>
#include <stdint.h>

/*
 * Constants used by the reproducer. Only NVFX_FP_MASK_X, NVFX_FP_MASK_Y,
 * NVFX_FP_OP_OPCODE_MUL (via the arith() macro), NVFX_COND_TR and
 * NVFXSR_NONE are referenced by the code below; the remaining ones are
 * defined but unused here.
 */
#define NVFX_FP_MASK_X 1
#define NVFX_FP_MASK_Y 2
#define NVFX_FP_MASK_Z 4
#define NVFX_FP_MASK_W 8
#define NVFX_FP_MASK_ALL 0xf
#define NV40_FP_OP_OUT_NONE         (1U << 30)
#define NVFX_FP_OP_OPCODE_MUL 0x02
#define NVFX_COND_TR  7
#define NVFXSR_NONE	0

/*
 * Shorthand for nvfx_insn(): pastes `o` onto NVFX_FP_OP_OPCODE_ to form the
 * opcode and always passes -1 as the unit. `s` is forwarded as the sat
 * argument, `d` as the destination, `m` as the mask and s0..s2 as sources.
 */
#define arith(s,o,d,m,s0,s1,s2) \
       nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
                       (d), (m), (s0), (s1), (s2))

/* Register reference: an 8-bit type tag plus a 32-bit index. */
struct nvfx_reg {
	int8_t type;
	int32_t index;
};

/*
 * Source operand: a register, six flag bits declared as uint8_t
 * bit-fields, and a 4-entry swizzle array.
 */
struct nvfx_src {
	struct nvfx_reg reg;

	uint8_t indirect : 1;
	uint8_t indirect_reg : 1;
	uint8_t indirect_swz : 2;
	uint8_t negate : 1;
	uint8_t abs : 1;
	uint8_t swz[4];
};

/*
 * One instruction as assembled by nvfx_insn().
 *
 * The seven flag bits (sat through cc_test_reg) are uint8_t bit-fields
 * declared directly after cc_swz; `sat` is the member that main() prints
 * and that the report shows reading back with the wrong value.
 */
struct nvfx_insn
{
	uint8_t op;
	char scale;
	int8_t unit;
	uint8_t mask;
	uint8_t cc_swz[4];

	uint8_t sat : 1;
	uint8_t cc_update : 1;
	uint8_t cc_update_reg : 1;
	uint8_t cc_test : 3;
	uint8_t cc_test_reg : 1;

	struct nvfx_reg dst;
	struct nvfx_src src[3];
};

/*
 * Build a struct nvfx_insn by value from its arguments.
 *
 * sat, op, unit, mask, dst and the three sources are taken from the
 * parameters (op, unit and mask are narrowed to their 8-bit members);
 * scale, cc_update, cc_update_reg and cc_test_reg are set to 0, cc_test to
 * NVFX_COND_TR and cc_swz to { 0, 1, 2, 3 }.
 */
static inline struct nvfx_insn
nvfx_insn(uint8_t sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
{
	struct nvfx_insn insn = {
		.op = op,
		.scale = 0,
		.unit = unit,
		.sat = sat,
		.mask = mask,
		.cc_update = 0,
		.cc_update_reg = 0,
		.cc_test = NVFX_COND_TR,
		.cc_test_reg = 0,
		.cc_swz = { 0, 1, 2, 3 },
		.dst = dst,
		.src = {s0, s1, s2}
	};
	return insn;
}

/*
 * Build a struct nvfx_reg from a type and an index. `type` arrives as int
 * and is stored in the int8_t member.
 */
static inline struct nvfx_reg
nvfx_reg(int type, int index)
{
	struct nvfx_reg temp = {
		.type = type,
		.index = index,
	};
	return temp;
}

/*
 * Wrap a register in a source operand with abs, negate and indirect set to
 * 0 and the swizzle set to { 0, 1, 2, 3 }.
 *
 * indirect_reg and indirect_swz are not named in the initializer, so they
 * are zero-initialized under the C rules for partially initialized
 * aggregates.
 */
static inline struct nvfx_src
nvfx_src(struct nvfx_reg reg)
{
	struct nvfx_src temp = {
		.reg = reg,
		.abs = 0,
		.negate = 0,
		.swz = { 0, 1, 2, 3 },
		.indirect = 0,
	};
	return temp;
}

/*
 * Reproducer body: builds a MUL instruction through arith() with the sat
 * argument given as the constant 0, a mask of X|Y, and returns it by value.
 *
 * NOTE(review): only reg.type and reg.index of src[0] and src[1] are
 * assigned; their remaining members are copied into the instruction
 * without having been written in this function. The locals insn, mask,
 * sat, unit, ai, ci, ii and i are declared but never used. Both are left
 * as-is because this code exists to reproduce a compiler issue and its
 * exact shape may matter.
 */
struct nvfx_insn emit_test(void)
{
   const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
   struct nvfx_insn insn;
   struct nvfx_src src[2];
   struct nvfx_reg tmp = {0, 1};
   int mask, sat, unit = 0;
   int ai = -1, ci = -1, ii = -1;
   int i;

   src[0].reg.type = 0;
   src[0].reg.index = 2;
   src[1].reg.type = 4;
   src[1].reg.index = 8;

   /* sat is the literal 0 here, so ins.sat is expected to print as 0. */
   return arith(0, MUL, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none);
}

/* Calls emit_test() and prints the sat bit of the returned instruction. */
int main(void)
{
    struct nvfx_insn ins = emit_test();
    printf("sat? = %d\n", ins.sat);
}

Here's the output of -fno-store-merging:

adam@g5box ~ $ ./test.out 
sat? = 0

Here it is with -fstore-merging (which is enabled by default at -O2):

adam@g5box ~ $ ./test.out 
sat? = 1

And, running it under valgrind, you can see that it immediately flags the use of uninitialized stack memory:

==13566== Memcheck, a memory error detector
==13566== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al.
==13566== Using Valgrind-3.20.0 and LibVEX; rerun with -h for copyright info
==13566== Command: ./test.out
==13566== 
==13566== Use of uninitialised value of size 8
==13566==    at 0x415B8C8: _itoa_word (in /lib64/libc.so.6)
==13566==    by 0x4167BE3: __vfprintf_internal (in /lib64/libc.so.6)
==13566==    by 0x100005F7: main (in /home/adam/test.out)
==13566== 
==13566== Conditional jump or move depends on uninitialised value(s)
==13566==    at 0x415B8D0: _itoa_word (in /lib64/libc.so.6)
==13566==    by 0x4167BE3: __vfprintf_internal (in /lib64/libc.so.6)
==13566==    by 0x100005F7: main (in /home/adam/test.out)
==13566== 
==13566== Conditional jump or move depends on uninitialised value(s)
==13566==    at 0x41683F4: __vfprintf_internal (in /lib64/libc.so.6)
==13566==    by 0x4252C13: __printf_chk@@GLIBC_2.4 (in /lib64/libc.so.6)
==13566==    by 0x100005F7: main (in /home/adam/test.out)
==13566== 
==13566== Conditional jump or move depends on uninitialised value(s)
==13566==    at 0x4168FA8: __vfprintf_internal (in /lib64/libc.so.6)
==13566==    by 0x4252C13: __printf_chk@@GLIBC_2.4 (in /lib64/libc.so.6)
==13566==    by 0x100005F7: main (in /home/adam/test.out)
==13566== 
sat? = 0
==13566== 
==13566== HEAP SUMMARY:
==13566==     in use at exit: 0 bytes in 0 blocks
==13566==   total heap usage: 1 allocs, 1 frees, 1,024 bytes allocated
==13566== 
==13566== All heap blocks were freed -- no leaks are possible
==13566== 
==13566== Use --track-origins=yes to see where uninitialised values come from
==13566== For lists of detected and suppressed errors, rerun with: -s
==13566== ERROR SUMMARY: 4 errors from 4 contexts (suppressed: 0 from 0)

Here's the PPC64 (BE) assembly of that routine with and without that optimization. First, with store merging enabled:

mflr    r0
std     r31,-8(r1)
addis   r10,r2,-2
lwz     r10,-30200(r10)
li      r4,0
li      r9,0
li      r11,1
li      r6,2
li      r7,4
sldi    r10,r10,24
li      r8,8
li      r5,68
std     r0,16(r1)
stdu    r1,-208(r1)
lis     r0,512
ori     r0,r0,65283
sldi    r0,r0,32
oris    r0,r0,1
ori     r0,r0,515
ld      r31,-28688(r13)
std     r31,184(r1)
li      r31,0
std     r4,168(r1)
addi    r4,r1,116
std     r10,176(r1)
stb     r9,128(r1)
stb     r9,136(r1)
mr      r31,r3
stb     r9,168(r1)
stw     r9,172(r1)
std     r0,116(r1)
stw     r11,132(r1)
stw     r6,140(r1)
stb     r7,152(r1)
stw     r8,156(r1)
bl      10000540 <0000003b.plt_call.memcpy@@GLIBC_2.3>
ld      r2,40(r1)
ld      r10,184(r1)
ld      r9,-28688(r13)
xor.    r10,r10,r9
li      r9,0
bne     10000860 <.emit_test+0xc0>
addi    r1,r1,208
mr      r3,r31
ld      r0,16(r1)
ld      r31,-8(r1)
mtlr    r0
blr
bl      100005a0 <0000003b.plt_call.__stack_chk_fail@@GLIBC_2.4>
ld      r2,40(r1)
.long 0x0
.long 0x1
lwz     r0,0(r1)
nop
.long 0x0
.long 0x1f778

Here's without:

mflr    r0
std     r29,-24(r1)
std     r30,-16(r1)
lis     r8,1
std     r31,-8(r1)
addis   r7,r2,-2
lwz     r7,-30152(r7)
ori     r8,r8,515
li      r5,0
li      r9,0
li      r12,3
li      r11,2
li      r6,4
sldi    r7,r7,24
li      r29,2
std     r0,16(r1)
stdu    r1,-224(r1)
li      r0,1
li      r30,-1
mr      r31,r3
lbz     r10,124(r1)
ld      r4,-28688(r13)
std     r4,184(r1)
li      r4,0
stw     r8,120(r1)
li      r8,8
std     r5,168(r1)
addi    r4,r1,116
stb     r9,117(r1)
li      r5,68
stb     r9,128(r1)
stb     r9,136(r1)
andi.   r10,r10,29
stb     r9,168(r1)
stw     r9,172(r1)
std     r7,176(r1)
stb     r29,116(r1)
ori     r10,r10,28
stb     r30,118(r1)
stb     r12,119(r1)
stw     r0,132(r1)
stw     r11,140(r1)
stb     r10,124(r1)
stb     r6,152(r1)
stw     r8,156(r1)
bl      10000540 <0000003b.plt_call.memcpy@@GLIBC_2.3>
ld      r2,40(r1)
ld      r10,184(r1)
ld      r9,-28688(r13)
xor.    r10,r10,r9
li      r9,0
bne     10000890 <.emit_test+0xf0>
addi    r1,r1,224
mr      r3,r31
ld      r0,16(r1)
ld      r29,-24(r1)
ld      r30,-16(r1)
ld      r31,-8(r1)
mtlr    r0
blr
bl      100005a0 <0000003b.plt_call.__stack_chk_fail@@GLIBC_2.4>
ld      r2,40(r1)
.long 0x0
.long 0x1
lwz     r0,0(r3)
nop
.long 0x0
.long 0x1f748

Edited Jan 22, 2023 by Adam Stylinski

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information

Admin message

Admin message

nv30: Big endian fragment and vertex shaders clobber instructions with -fstore-merging on GCC

System information

Describe the issue

Regression