r300: further nir_to_tgsi improvements needed before we can drop our deadcode and dataflow analysis
Besides the LRP issue and some needed improvements for rc_copy_output (branch in https://gitlab.freedesktop.org/ondracka/mesa/-/commits/r300_optimize_vertex_outputs ), there are some other cases where we still need our own dataflow analysis in vertex shaders because we don't get optimal code from nir_to_tgsi.
From a simple Trine shader:
7: MOV TEMP[1].x, |IN[1].wyzw|
8: MAD TEMP[1].x, TEMP[1].xxxx, IMM[0].yyyy, IMM[0].zzzz
The full NIR_DEBUG=tgsi output:
NIR before translation to TGSI:
shader: MESA_SHADER_VERTEX
source_sha1: {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
name: ARB1
inputs: 2
outputs: 2
uniforms: 12
ubos: 1
shared: 0
ray queries: 0
decl_var uniform INTERP_MODE_NONE vec4[12] (0, 0, 0)
decl_var shader_in INTERP_MODE_NONE vec4 in_0 (VERT_ATTRIB_POS.xyzw, 0, 0)
decl_var shader_in INTERP_MODE_NONE vec4 in_6 (VERT_ATTRIB_TEX0.xyzw, 1, 0)
decl_var shader_out INTERP_MODE_SMOOTH vec4 out_0 (VARYING_SLOT_POS.xyzw, 0, 0)
decl_var shader_out INTERP_MODE_SMOOTH vec4 out_4 (VARYING_SLOT_VAR0.xyzw, 1, 0)
decl_var ubo INTERP_MODE_NONE vec4[12] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
decl_reg vec3 32 r5
decl_reg vec3 32 r6
decl_reg vec4 32 r7
decl_reg vec4 32 r8
block block_0:
/* preds: */
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec4 32 ssa_2 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=8, component=0)
vec4 32 ssa_1 = intrinsic load_input (ssa_0) (base=0, component=0, dest_type=float32 /*160*/, io location=0 slots=1 /*128*/) /* in_0 */
vec3 32 ssa_3 = fmul ssa_1.xyz, ssa_2.xyz
r5.y = fneg ssa_3.z
vec4 32 ssa_6 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=4, component=0)
r5.xz = mov ssa_3.xy
r6.z = fdot3_replicated r5, ssa_6.xyz
vec4 32 ssa_8 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=3, component=0)
r6.y = fdot3_replicated r5, ssa_8.xyz
vec4 32 ssa_10 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=2, component=0)
r6.x = fdot3_replicated r5, ssa_10.xyz
vec1 32 ssa_13 = load_const (0x447a0000 = 1000.000000)
r7.xyz = fmul r6, ssa_13.xxx
vec4 32 ssa_15 = intrinsic load_input (ssa_0) (base=1, component=0, dest_type=float32 /*160*/, io location=6 slots=1 /*134*/) /* in_6 */
vec1 32 ssa_16 = fabs ssa_15.w
vec1 32 ssa_18 = load_const (0xc6fffe00 = -32767.000000)
vec1 32 ssa_17 = load_const (0x40000000 = 2.000000)
vec1 32 ssa_19 = ffma ssa_16, ssa_17, ssa_18
vec1 32 ssa_22 = load_const (0xbf000000 = -0.500000)
vec4 32 ssa_21 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=9, component=0)
vec1 32 ssa_23 = ffma ssa_19, ssa_21.y, ssa_22
vec1 32 ssa_25 = ffma ssa_1.w, ssa_21.x, ssa_22
vec4 32 ssa_28 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=7, component=0)
vec1 32 ssa_20 = load_const (0x3f800000 = 1.000000)
r7.w = mov ssa_20.x
r8.zw = fdot4_replicated r7, ssa_28
vec4 32 ssa_30 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=6, component=0)
r8.y = fdot4_replicated r7, ssa_30
vec4 32 ssa_32 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=5, component=0)
r8.x = fdot4_replicated r7, ssa_32
vec1 32 ssa_34 = load_const (0x3f000000 = 0.500000)
vec4 32 ssa_26 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=10, component=0)
vec2 32 ssa_35 = fadd ssa_26.xy, ssa_34.xx
vec4 32 ssa_24 = intrinsic load_ubo_vec4 (ssa_0, ssa_0) (access=0, base=11, component=0)
vec2 32 ssa_36 = ffma ssa_25.xx, ssa_24.xy, ssa_35
vec2 32 ssa_37 = ffma ssa_23.xx, ssa_24.zw, ssa_36
intrinsic store_output (r8, ssa_0) (base=0, wrmask=xyzw /*15*/, component=0, src_type=float32 /*160*/, io location=0 slots=1 /*128*/, xfb() /*0*/, xfb2() /*0*/) /* out_0 */
intrinsic store_output (ssa_37, ssa_0) (base=1, wrmask=xy /*3*/, component=0, src_type=float32 /*160*/, io location=32 slots=1 /*160*/, xfb() /*0*/, xfb2() /*0*/) /* out_4 */
/* succs: block_1 */
block block_1:
}
TGSI after translation from NIR:
VERT
PROPERTY NEXT_SHADER VERT
DCL IN[0]
DCL IN[1]
DCL OUT[0], POSITION
DCL OUT[1].xy, GENERIC[0]
DCL CONST[0][0..11]
DCL TEMP[0..2]
IMM[0] FLT32 { 1000.0000, 2.0000, -32767.0000, -0.5000}
IMM[1] FLT32 { 1.0000, 0.5000, 0.0000, 0.0000}
0: MUL TEMP[0].xyz, IN[0], CONST[0][8]
1: MOV TEMP[1].y, -TEMP[0].zzzz
2: MOV TEMP[1].xz, TEMP[0].xyyx
3: DP3 TEMP[0].z, TEMP[1].xyzx, CONST[0][4].xyzx
4: DP3 TEMP[0].y, TEMP[1].xyzx, CONST[0][3].xyzx
5: DP3 TEMP[0].x, TEMP[1].xyzx, CONST[0][2].xyzx
6: MUL TEMP[0].xyz, TEMP[0].xyzx, IMM[0].xxxx
7: MOV TEMP[1].x, |IN[1].wyzw|
8: MAD TEMP[1].x, TEMP[1].xxxx, IMM[0].yyyy, IMM[0].zzzz
9: MAD TEMP[1].x, TEMP[1].xxxx, CONST[0][9].yxxx, IMM[0].wwww
10: MAD TEMP[2].x, IN[0].wxxx, CONST[0][9].xxxx, IMM[0].wwww
11: MOV TEMP[0].w, IMM[1].xxxx
12: DP4 OUT[0].zw, TEMP[0], CONST[0][7]
13: DP4 OUT[0].y, TEMP[0], CONST[0][6]
14: DP4 OUT[0].x, TEMP[0], CONST[0][5]
15: ADD TEMP[0].xy, CONST[0][10].xyxx, IMM[1].yyyy
16: MAD TEMP[0].xy, TEMP[2].xxxx, CONST[0][11].xyxx, TEMP[0].xyxx
17: MAD OUT[1].xy, TEMP[1].xxxx, CONST[0][11].zwxx, TEMP[0].xyxx
18: END
Edited by Pavel Ondračka