Skip to content

WIP: ir3: add shader instrumentation to dump registers content

Danylo Piliaiev requested to merge Danil/mesa:ir3/dbg/register-dumper into main

The goal of this is to provide a way to inspect what's in the registers of a shader. While we don't have a way to debug shaders interactively, we can instrument a shader to dump the contents of its registers.

It is not in a final state - I'm looking for feedback on the decisions made so far, and suggestions on how to proceed further.

Note, this MR contains !8717 (merged) in order to use global load/store/atomics.

(Works on a6xx + Turnip at the moment)

Here is an example of the current output when instrumenting every instruction of a simple shader with a loop:

IR3_SHADER_DEBUG=fs IR3_SHADER_INSTRUMENT=72715a1ef3eb336b914c18f608682c325d860c2c IR3_SHADER_INSTRUMENT_INSTR_REGEX= vkrunner loop.shader_test
    // len = 4

    color = vec4(0);
    for (int i = 0; i < len; i++)
    {
        color += vec4(arr[i] / 50.f);
    }
IR3 (click me)
Native code for unnamed FRAG shader (null) with sha1 72715a1ef3eb336b914c18f608682c325d860c2c:
SIMD0
@out(r0.w)      out0 (wrmask=0xf)
@const(c133.x)  0x3ca3d70a, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
mov.u32u32 r1.x, 0
(rpt2)nop
mov.u32u32 r0.w, r1.x
(rpt2)nop
mov.u32u32 r0.z, r0.w
(rpt2)nop
mov.u32u32 r0.y, r0.z
(rpt2)nop
mov.u32u32 r0.x, r0.y
(jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
(rpt2)nop
br p0.x, #l25
(jp)(nop3) shl.b r1.y, r1.x, 2
cov.u32s16 hr2.w, r1.y
(rpt2)nop
mova a0.x, hr2.w
(rpt5)nop
(ul)cov.s32f32 r1.y, c<a0.x + 4>
(rpt2)nop
mad.f32 r0.w, c133.x, r1.y, r0.w
mad.f32 r0.z, c133.x, r1.y, r0.z
mad.f32 r0.y, c133.x, r1.y, r0.y
mad.f32 r0.x, c133.x, r1.y, r0.x
add.u r1.x, r1.x, 1
jump #l9
l25:
(jp)mov.u32u32 r1.x, r0.z
mov.u32u32 r1.y, r0.y
mov.u32u32 r1.z, r0.x
end
nop
nop
nop
nop
; FRAG: outputs: r0.w (FRAG_RESULT_DATA0)
; FRAG: inputs: r63.x (SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL slot=50 cm=3,il=0,b=0)
; FRAG prog 2/1: 54 instr, 33 nops, 21 non-nops, 9 mov, 2 cov, 66 dwords
; FRAG prog 2/1: 0 last-baryf, 0 half, 2 full, 136 constlen
; FRAG prog 2/1: 36 cat0, 11 cat1, 9 cat2, 4 cat3, 0 cat4, 0 cat5, 0 cat6, 0 cat7, 
; FRAG prog 2/1: 0 sstall, 0 (ss), 0 (sy), 0 max_sun, 1 loops
; data0: r0.w
Instrumented IR3 (click me)
Native code for INTRUMENTED FRAG shader (null) with sha1 72715a1ef3eb336b914c18f608682c325d860c2c:
@out(r0.w)      out0 (wrmask=0xf)
@const(c133.x)  0x3ca3d70a, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
nop
nop
mov.u32u32 r6.y, 0
mov.u32u32 r6.x, 0x6a88004
(rpt3)nop
atomic.g.inc.untyped.1d.u32.1.g r7.x, r6.x, r6.x
(ss)nop
mov.u32u32 r6.x, 0x6a88000
mov.u32u32 r6.w, 0
(ss)nop
mov.u32u32 r6.z, 12
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 0
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
mov.u32u32 r1.x, 0
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 1
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
mov.u32u32 r0.w, r1.x
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.w, 1
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.w, 1
mov.u32u32 r0.z, r0.w
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.z, 1
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.z, 1
mov.u32u32 r0.y, r0.z
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.y, 1
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.y, 1
mov.u32u32 r0.x, r0.y
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.x, 1
(ss)nop
mov.u32u32 r6.z, 12
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 5
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
(jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 6
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.x, 1
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.x, 1
br p0.x, #l242
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 7
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
(jp)(nop3) shl.b r1.y, r1.x, 2
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 8
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
cov.u32s16 hr2.w, r1.y
(ss)mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u16 g[r6.z+r7.y], r2.w, 1
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 12
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 9
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u16 g[r6.z+r7.y], r2.w, 1
mova a0.x, hr2.w
(rpt5)nop
(ss)nop
mov.u32u32 r6.z, 12
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 10
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ul)cov.s32f32 r1.y, c<a0.x + 4>
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(rpt2)nop
(ss)nop
mov.u32u32 r6.z, 20
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 11
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(ss)mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.w, 1
mad.f32 r0.w, c133.x, r1.y, r0.w
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.w, 1
(ss)nop
mov.u32u32 r6.z, 20
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 12
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(ss)mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.z, 1
mad.f32 r0.z, c133.x, r1.y, r0.z
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.z, 1
(ss)nop
mov.u32u32 r6.z, 20
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 13
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(ss)mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.y, 1
mad.f32 r0.y, c133.x, r1.y, r0.y
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.y, 1
(ss)nop
mov.u32u32 r6.z, 20
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 14
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(ss)mov.u32u32 r7.y, 4
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.x, 1
mad.f32 r0.x, c133.x, r1.y, r0.x
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.x, 1
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 15
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
add.u r1.x, r1.x, 1
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
jump #l77
l242:
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 16
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.z, 1
(jp)mov.u32u32 r1.x, r0.z
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.x, 1
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 17
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.y, 1
mov.u32u32 r1.y, r0.y
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.y, 1
(ss)nop
mov.u32u32 r6.z, 16
(rpt3)nop
atomic.g.add.untyped.1d.u32.1.g r6.z, r6.x, r6.z
mov.u32u32 r7.y, 18
(rpt3)nop
(sy)stg.u32 g[r6.z], r7.x, 2
(ss)mov.u32u32 r7.y, 3
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r0.x, 1
mov.u32u32 r1.z, r0.x
(ss)mov.u32u32 r7.y, 2
(rpt3)nop
(sy)stg.u32 g[r6.z+r7.y], r1.z, 1
end
nop
nop
nop
nop
; FRAG: outputs: r0.w (FRAG_RESULT_DATA0)
; FRAG: inputs: r63.x (SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL slot=50 cm=3,il=0,b=0)
; FRAG prog 2/1: 541 instr, 363 nops, 178 non-nops, 89 mov, 2 cov, 578 dwords
; FRAG prog 2/1: 0 last-baryf, 0 half, 8 full, 136 constlen
; FRAG prog 2/1: 366 cat0, 91 cat1, 9 cat2, 4 cat3, 0 cat4, 0 cat5, 77 cat6, 0 cat7, 
; FRAG prog 2/1: 0 sstall, 58 (ss), 57 (sy), 0 max_sun, 1 loops
; data0: r0.w
Output (click me)
Data written 13625002
Invocations 62500
[0/0]: mov.u32u32 r1.x, 0
        dst(r1.x)=00000000 /* 0.000000 */  
[0/1]: mov.u32u32 r0.w, r1.x
        dst(r0.w)=00000000 /* 0.000000 */  src(r1.x)=00000000 /* 0.000000 */  
[0/2]: mov.u32u32 r0.z, r0.w
        dst(r0.z)=00000000 /* 0.000000 */  src(r0.w)=00000000 /* 0.000000 */  
[0/3]: mov.u32u32 r0.y, r0.z
        dst(r0.y)=00000000 /* 0.000000 */  src(r0.z)=00000000 /* 0.000000 */  
[0/4]: mov.u32u32 r0.x, r0.y
        dst(r0.x)=00000000 /* 0.000000 */  src(r0.y)=00000000 /* 0.000000 */  
[0/5]: (jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
        src(r1.x)=00000000 /* 0.000000 */  
[0/6]: br p0.x, #14
        src(r0.x)=00000000 /* 0.000000 */  src(r0.x)=00000000 /* 0.000000 */  
[0/7]: (jp)(nop3) shl.b r1.y, r1.x, 2
        dst(r1.y)=00000000 /* 0.000000 */  src(r1.x)=00000000 /* 0.000000 */  
[0/8]: cov.u32s16 hr2.w, r1.y
        dst(hr2.w)=00000000 /* 0.000000 */  src(r1.y)=00000000 /* 0.000000 */  
[0/9]: mova a0.x, hr2.w
        src(hr2.w)=00000000 /* 0.000000 */  
[0/10]: (ul)cov.s32f32 r1.y, c<a0.x + 4>
        dst(r1.y)=0x40a00000 /* 5.000000 */  
[0/11]: mad.f32 r0.w, c133.x, r1.y, r0.w
        dst(r0.w)=0x3dcccccc /* 0.100000 */  src(r1.y)=0x40a00000 /* 5.000000 */  src(r0.w)=0x3dcccccc /* 0.100000 */  
[0/12]: mad.f32 r0.z, c133.x, r1.y, r0.z
        dst(r0.z)=0x3dcccccc /* 0.100000 */  src(r1.y)=0x40a00000 /* 5.000000 */  src(r0.z)=0x3dcccccc /* 0.100000 */  
[0/13]: mad.f32 r0.y, c133.x, r1.y, r0.y
        dst(r0.y)=0x3dcccccc /* 0.100000 */  src(r1.y)=0x40a00000 /* 5.000000 */  src(r0.y)=0x3dcccccc /* 0.100000 */  
[0/14]: mad.f32 r0.x, c133.x, r1.y, r0.x
        dst(r0.x)=0x3dcccccc /* 0.100000 */  src(r1.y)=0x40a00000 /* 5.000000 */  src(r0.x)=0x3dcccccc /* 0.100000 */  
[0/15]: add.u r1.x, r1.x, 1
        dst(r1.x)=0x3dcccccc /* 0.100000 */  src(r1.x)=0x000001 /* 0.000000 */  
[0/5]: (jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
        src(r1.x)=0x000001 /* 0.000000 */  
[0/6]: br p0.x, #14
        src(r0.x)=0x3dcccccc /* 0.100000 */  src(r0.x)=0x3dcccccc /* 0.100000 */  
[0/7]: (jp)(nop3) shl.b r1.y, r1.x, 2
        dst(r1.y)=0x000004 /* 0.000000 */  src(r1.x)=0x000001 /* 0.000000 */  
[0/8]: cov.u32s16 hr2.w, r1.y
        dst(hr2.w)=0x000004 /* 0.000000 */  src(r1.y)=0x040004 /* 0.000000 */  
[0/9]: mova a0.x, hr2.w
        src(hr2.w)=0x000004 /* 0.000000 */  
[0/10]: (ul)cov.s32f32 r1.y, c<a0.x + 4>
        dst(r1.y)=0x41200000 /* 10.000000 */  
[0/11]: mad.f32 r0.w, c133.x, r1.y, r0.w
        dst(r0.w)=0x3e999999 /* 0.300000 */  src(r1.y)=0x41200000 /* 10.000000 */  src(r0.w)=0x3e999999 /* 0.300000 */  
[0/12]: mad.f32 r0.z, c133.x, r1.y, r0.z
        dst(r0.z)=0x3e999999 /* 0.300000 */  src(r1.y)=0x41200000 /* 10.000000 */  src(r0.z)=0x3e999999 /* 0.300000 */  
[0/13]: mad.f32 r0.y, c133.x, r1.y, r0.y
        dst(r0.y)=0x3e999999 /* 0.300000 */  src(r1.y)=0x41200000 /* 10.000000 */  src(r0.y)=0x3e999999 /* 0.300000 */  
[0/14]: mad.f32 r0.x, c133.x, r1.y, r0.x
        dst(r0.x)=0x3e999999 /* 0.300000 */  src(r1.y)=0x41200000 /* 10.000000 */  src(r0.x)=0x3e999999 /* 0.300000 */  
[0/15]: add.u r1.x, r1.x, 1
        dst(r1.x)=0x3e999999 /* 0.300000 */  src(r1.x)=0x000002 /* 0.000000 */  
[0/5]: (jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
        src(r1.x)=0x000002 /* 0.000000 */  
[0/6]: br p0.x, #14
        src(r0.x)=0x3e999999 /* 0.300000 */  src(r0.x)=0x3e999999 /* 0.300000 */  
[0/7]: (jp)(nop3) shl.b r1.y, r1.x, 2
        dst(r1.y)=0x000008 /* 0.000000 */  src(r1.x)=0x000002 /* 0.000000 */  
[0/8]: cov.u32s16 hr2.w, r1.y
        dst(hr2.w)=0x000008 /* 0.000000 */  src(r1.y)=0x080008 /* 0.000000 */  
[0/9]: mova a0.x, hr2.w
        src(hr2.w)=0x000008 /* 0.000000 */  
[0/10]: (ul)cov.s32f32 r1.y, c<a0.x + 4>
        dst(r1.y)=0x41700000 /* 15.000000 */  
[0/11]: mad.f32 r0.w, c133.x, r1.y, r0.w
        dst(r0.w)=0x3f199999 /* 0.600000 */  src(r1.y)=0x41700000 /* 15.000000 */  src(r0.w)=0x3f199999 /* 0.600000 */  
[0/12]: mad.f32 r0.z, c133.x, r1.y, r0.z
        dst(r0.z)=0x3f199999 /* 0.600000 */  src(r1.y)=0x41700000 /* 15.000000 */  src(r0.z)=0x3f199999 /* 0.600000 */  
[0/13]: mad.f32 r0.y, c133.x, r1.y, r0.y
        dst(r0.y)=0x3f199999 /* 0.600000 */  src(r1.y)=0x41700000 /* 15.000000 */  src(r0.y)=0x3f199999 /* 0.600000 */  
[0/14]: mad.f32 r0.x, c133.x, r1.y, r0.x
        dst(r0.x)=0x3f199999 /* 0.600000 */  src(r1.y)=0x41700000 /* 15.000000 */  src(r0.x)=0x3f199999 /* 0.600000 */  
[0/15]: add.u r1.x, r1.x, 1
        dst(r1.x)=0x3f199999 /* 0.600000 */  src(r1.x)=0x000003 /* 0.000000 */  
[0/5]: (jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
        src(r1.x)=0x000003 /* 0.000000 */  
[0/6]: br p0.x, #14
        src(r0.x)=0x3f199999 /* 0.600000 */  src(r0.x)=0x3f199999 /* 0.600000 */  
[0/7]: (jp)(nop3) shl.b r1.y, r1.x, 2
        dst(r1.y)=0x00000c /* 0.000000 */  src(r1.x)=0x000003 /* 0.000000 */  
[0/8]: cov.u32s16 hr2.w, r1.y
        dst(hr2.w)=0x00000c /* 0.000000 */  src(r1.y)=0x0c000c /* 0.000000 */  
[0/9]: mova a0.x, hr2.w
        src(hr2.w)=0x00000c /* 0.000000 */  
[0/10]: (ul)cov.s32f32 r1.y, c<a0.x + 4>
        dst(r1.y)=0x41a00000 /* 20.000000 */  
[0/11]: mad.f32 r0.w, c133.x, r1.y, r0.w
        dst(r0.w)=0x3f7fffff /* 1.000000 */  src(r1.y)=0x41a00000 /* 20.000000 */  src(r0.w)=0x3f7fffff /* 1.000000 */  
[0/12]: mad.f32 r0.z, c133.x, r1.y, r0.z
        dst(r0.z)=0x3f7fffff /* 1.000000 */  src(r1.y)=0x41a00000 /* 20.000000 */  src(r0.z)=0x3f7fffff /* 1.000000 */  
[0/13]: mad.f32 r0.y, c133.x, r1.y, r0.y
        dst(r0.y)=0x3f7fffff /* 1.000000 */  src(r1.y)=0x41a00000 /* 20.000000 */  src(r0.y)=0x3f7fffff /* 1.000000 */  
[0/14]: mad.f32 r0.x, c133.x, r1.y, r0.x
        dst(r0.x)=0x3f7fffff /* 1.000000 */  src(r1.y)=0x41a00000 /* 20.000000 */  src(r0.x)=0x3f7fffff /* 1.000000 */  
[0/15]: add.u r1.x, r1.x, 1
        dst(r1.x)=0x3f7fffff /* 1.000000 */  src(r1.x)=0x000004 /* 0.000000 */  
[0/5]: (jp)(nop3) cmps.s.ge p0.x, r1.x, c0.x
        src(r1.x)=0x000004 /* 0.000000 */  
[0/6]: br p0.x, #14
        src(r0.x)=0x3f7fffff /* 1.000000 */  src(r0.x)=0x3f7fffff /* 1.000000 */  
[0/16]: (jp)mov.u32u32 r1.x, r0.z
        dst(r1.x)=0x3f7fffff /* 1.000000 */  src(r0.z)=0x3f7fffff /* 1.000000 */  
[0/17]: mov.u32u32 r1.y, r0.y
        dst(r1.y)=0x3f7fffff /* 1.000000 */  src(r0.y)=0x3f7fffff /* 1.000000 */  
[0/18]: mov.u32u32 r1.z, r0.x
        dst(r1.z)=0x3f7fffff /* 1.000000 */  src(r0.x)=0x3f7fffff /* 1.000000 */

Current decisions:

  • Shader is instrumented after all compilation is done in order to be able to work with overridden shaders (via IR3_SHADER_OVERRIDE_PATH) and not to change the register allocation (which may be undesirable?).
    • So it is done after RA, meaning we need some free regs (since there is no spilling in RA at the moment - it's not that big of an issue I think);
    • Running RA after instrumentation may be useful once spilling is supported. However, another option would be to reduce the upper limit of registers in RA for the shader we want to instrument, in order to always have some free regs;
    • Without an RA pass, the jump offsets are manually retargeted;
  • The shader to instrument is selected by its hash (the very same one that is used for override) via IR3_SHADER_INSTRUMENT; the instructions whose registers should be dumped can be filtered via IR3_SHADER_INSTRUMENT_INSTR_REGEX, e.g. IR3_SHADER_INSTRUMENT_INSTR_REGEX="\(sy\)stg\.u32";
  • The space for registers is allocated with per-instruction granularity, meaning that the dumped instructions are interleaved in the global buffer.
    • Pros: we could dump an arbitrary number of instructions from one shader invocation;
    • Cons: it is slower because it requires doing an atomic.add for every instruction and waiting for its result. An alternative is to pre-allocate memory for each invocation and calculate the offset without global atomics.

In the global buffer, the dump of each instruction has the following structure:

invocation_id: u32
instruction_id: u32
dst_values: u32[num_of_dest_regs]
src_values: u32[num_of_src_regs]

Unavoidable limitations:

  • Requires a6xx+ GPU due to usage of global load/store/atomics;
  • The instrumentation will interfere with cases where the bug is due to improper synchronization between instructions or shader invocations;
  • Dumping all registers for all invocations at once may be too much for medium/large shaders with many invocations, both due to memory and time constraints.

Current limitations/issues:

  • Registers are written one by one instead of writing up to four at once. Writing one by one is slower but requires fewer registers;
  • The registers are dumped for all invocations, which could be infeasible if we want to dump registers for each instruction in a huge shader which has too many invocations. I think the solution would be to have a shader binary with non-instrumented and instrumented code together, then decide which one to run based on some condition;
  • Some instructions write several registers or consume several registers from one source, most of them aren't handled properly at the moment.
  • Currently only the result of the first invocation is printed
  • There is no check for clashing with input registers?
  • Constant registers aren't printed

My current plan:

  • Keep writing registers one by one (doesn't affect the end result);
  • Do not pre-allocate space in global memory (doesn't affect the end result);
  • Handle the instructions which write more than one register and read several registers from one src;
  • Get feedback on the output format and improve it based on that;
  • Add a way to control which invocation(s) are printed;

In any case, I'd like to get feedback before proceeding further.

Merge request reports