Skip to content

ir3: add newly found shlg.b16 instruction

Danylo Piliaiev requested to merge Danil/mesa:ir3/feature/shlg into main

Example of the blob's output:

  (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12

It computes: dst = (src2 << src1) | src3, i.e. for the example above: hr8.x = (hr8.x << 8) | 12

src1 and src3 could also be half-registers or relatively addressed half-regs/consts, but this has not been observed in the blob's output.

The blob uses it in conjunction with the "samgq" instruction.

The shader for which the blob generates this instruction: http://shader-playground.timjones.io/3170cdcfe8938ceb8536f95203d78bba

Decompiled GLSL (click me)
#version 450

#ifndef SPIRV_CROSS_CONSTANT_ID_5
#define SPIRV_CROSS_CONSTANT_ID_5 true
#endif
const bool cb4_bound = SPIRV_CROSS_CONSTANT_ID_5;
#ifndef SPIRV_CROSS_CONSTANT_ID_8
#define SPIRV_CROSS_CONSTANT_ID_8 true
#endif
const bool t1_bound = SPIRV_CROSS_CONSTANT_ID_8;
#ifndef SPIRV_CROSS_CONSTANT_ID_9
#define SPIRV_CROSS_CONSTANT_ID_9 true
#endif
const bool t15_bound = SPIRV_CROSS_CONSTANT_ID_9;
#ifndef SPIRV_CROSS_CONSTANT_ID_1216
#define SPIRV_CROSS_CONSTANT_ID_1216 12816u
#endif
const uint omap0 = SPIRV_CROSS_CONSTANT_ID_1216;
#ifndef SPIRV_CROSS_CONSTANT_ID_1217
#define SPIRV_CROSS_CONSTANT_ID_1217 12816u
#endif
const uint omap1 = SPIRV_CROSS_CONSTANT_ID_1217;
#ifndef SPIRV_CROSS_CONSTANT_ID_1218
#define SPIRV_CROSS_CONSTANT_ID_1218 12816u
#endif
const uint omap2 = SPIRV_CROSS_CONSTANT_ID_1218;

layout(binding = 5, std140) uniform cb4_t
{
    vec4 m[212];
} cb4;

uniform sampler2D SPIRV_Cross_Combinedt1s2;
uniform sampler2D SPIRV_Cross_Combinedt15s6;

layout(location = 2) sample in vec3 v2;
layout(location = 3) sample in vec4 v3;
layout(location = 4) sample in vec4 v4;
layout(location = 6) sample in vec4 v6;
layout(location = 0, index = 0) out vec4 o0;
layout(location = 1, index = 0) out vec4 o1;
layout(location = 2, index = 0) out vec4 o2;
layout(location = 3, index = 0) out uint o3;
vec4 shader_in[8];
vec4 r0;
vec4 r1;
vec4 r2;
vec4 r3;

void ps_main()
{
    o0 = uintBitsToFloat(uvec4(0u, 0u, 0u, 1015580809u));
    r0 = dFdxCoarse(shader_in[6]);
    r0 *= vec4(cb4.m[211].y);
    r1 = dFdyCoarse(shader_in[6]);
    r1 *= vec4(cb4.m[211].y);
    vec2 _98 = mix(vec2(0.0), textureGrad(SPIRV_Cross_Combinedt1s2, shader_in[6].zwzz.xy, r0.zw, r1.zw).xy, bvec2(t1_bound, t1_bound));
    r0 = vec4(r0.x, r0.y, _98.x, _98.y);
    vec2 _114 = mix(vec2(0.0), textureGrad(SPIRV_Cross_Combinedt1s2, shader_in[6].xyxx.xy, r0.xy, r1.xy).xy, bvec2(t1_bound, t1_bound));
    r0 = vec4(_114.x, _114.y, r0.z, r0.w);
    vec2 _126 = fma(r0.xy, uintBitsToFloat(uvec2(1073741824u)), uintBitsToFloat(uvec2(3212836864u)));
    r0 = vec4(_126.x, _126.y, r0.z, r0.w);
    vec2 _134 = fma(r0.zw, uintBitsToFloat(uvec2(1073741824u)), r0.xy);
    r0 = vec4(r0.x, r0.y, _134.x, _134.y);
    vec4 _143 = r0;
    _143.x = dot(r0.xy, r0.xy);
    r0 = _143;
    vec4 _151 = r0;
    _151.x = (-r0.x) + uintBitsToFloat(1065353216u);
    r0 = _151;
    float _154 = uintBitsToFloat(0u);
    vec4 _157 = r0;
    _157.x = isnan(_154) ? r0.x : (isnan(r0.x) ? _154 : max(r0.x, _154));
    r0 = _157;
    vec4 _162 = r1;
    _162.z = sqrt(r0.x);
    r1 = _162;
    vec2 _166 = r0.zw + uintBitsToFloat(uvec2(3212836864u));
    r1 = vec4(_166.x, _166.y, r1.z, r1.w);
    vec4 _175 = r0;
    _175.x = dot(r1.xyz, r1.xyz);
    r0 = _175;
    float _179 = uintBitsToFloat(925353388u);
    vec4 _182 = r0;
    _182.x = isnan(_179) ? r0.x : (isnan(r0.x) ? _179 : max(r0.x, _179));
    r0 = _182;
    vec4 _187 = r0;
    _187.x = inversesqrt(r0.x);
    r0 = _187;
    vec3 _196 = fma(r1.xyz, r0.xxx, uintBitsToFloat(uvec3(2147483648u, 2147483648u, 3212836864u)));
    r0 = vec4(_196.x, _196.y, _196.z, r0.w);
    vec3 _207 = fma(shader_in[4].xxx, r0.xyz, uintBitsToFloat(uvec3(0u, 0u, 1065353216u)));
    r0 = vec4(_207.x, _207.y, _207.z, r0.w);
    vec4 _219 = r0;
    _219.w = dot(shader_in[2].xyz, shader_in[2].xyz);
    r0 = _219;
    float _222 = uintBitsToFloat(925353388u);
    vec4 _225 = r0;
    _225.w = isnan(_222) ? r0.w : (isnan(r0.w) ? _222 : max(r0.w, _222));
    r0 = _225;
    vec4 _230 = r0;
    _230.w = inversesqrt(r0.w);
    r0 = _230;
    vec3 _236 = r0.www * shader_in[2].xyz;
    r1 = vec4(_236.x, _236.y, _236.z, r1.w);
    vec4 _248 = r0;
    _248.w = dot(shader_in[3].xyz, shader_in[3].xyz);
    r0 = _248;
    float _251 = uintBitsToFloat(925353388u);
    vec4 _254 = r0;
    _254.w = isnan(_251) ? r0.w : (isnan(r0.w) ? _251 : max(r0.w, _251));
    r0 = _254;
    vec4 _259 = r0;
    _259.w = inversesqrt(r0.w);
    r0 = _259;
    vec3 _265 = r0.www * shader_in[3].xyz;
    r2 = vec4(_265.x, _265.y, _265.z, r2.w);
    vec3 _273 = r1.yzx * r2.zxy;
    r3 = vec4(_273.x, _273.y, _273.z, r3.w);
    vec3 _284 = fma(r2.yzx, r1.zxy, -r3.xyz);
    r3 = vec4(_284.x, _284.y, _284.z, r3.w);
    vec3 _292 = r3.xyz * shader_in[3].www;
    r3 = vec4(_292.x, _292.y, _292.z, r3.w);
    vec3 _299 = r0.yyy * r3.xyz;
    r3 = vec4(_299.x, _299.y, _299.z, r3.w);
    vec3 _308 = fma(r0.xxx, r2.xyz, r3.xyz);
    r0 = vec4(_308.x, _308.y, r0.z, _308.z);
    vec3 _317 = fma(r0.zzz, r1.xyz, r0.xyw);
    r0 = vec4(_317.x, _317.y, _317.z, r0.w);
    vec4 _326 = r0;
    _326.w = dot(r0.xyz, r0.xyz);
    r0 = _326;
    float _329 = uintBitsToFloat(925353388u);
    vec4 _332 = r0;
    _332.w = isnan(_329) ? r0.w : (isnan(r0.w) ? _329 : max(r0.w, _329));
    r0 = _332;
    vec4 _337 = r0;
    _337.w = inversesqrt(r0.w);
    r0 = _337;
    vec3 _342 = r0.www * r0.xyz;
    r0 = vec4(_342.x, _342.y, _342.z, r0.w);
    vec3 _347 = abs(r0.xyz);
    r1 = vec4(_347.x, _347.y, _347.z, r1.w);
    vec4 _356 = r0;
    _356.w = isnan(r1.y) ? r1.z : (isnan(r1.z) ? r1.y : max(r1.z, r1.y));
    r0 = _356;
    vec4 _363 = r0;
    _363.w = isnan(r1.x) ? r0.w : (isnan(r0.w) ? r1.x : max(r0.w, r1.x));
    r0 = _363;
    vec2 _373 = uintBitsToFloat(mix(uvec2(0u), uvec2(4294967295u), equal(r1.xy, r0.ww)));
    r2 = vec4(_373.x, _373.y, r2.z, r2.w);
    float _378 = uintBitsToFloat(925353388u);
    vec4 _381 = r0;
    _381.w = isnan(_378) ? r0.w : (isnan(r0.w) ? _378 : max(r0.w, _378));
    r0 = _381;
    vec4 _387 = r0;
    _387.w = 1.0 / r0.w;
    r0 = _387;
    vec4 _398 = r1;
    _398.w = (floatBitsToUint(r2.y) != 0u) ? r1.z : r1.y;
    r1 = _398;
    vec2 _407 = mix(r1.xw, r1.yz, notEqual(floatBitsToUint(r2.xx), uvec2(0u)));
    r1 = vec4(_407.x, _407.y, r1.z, r1.w);
    vec4 _416 = r2;
    _416.x = isnan(r1.x) ? r1.y : (isnan(r1.y) ? r1.x : max(r1.y, r1.x));
    r2 = _416;
    vec4 _423 = r1;
    _423.x = isnan(r1.x) ? r1.y : (isnan(r1.y) ? r1.x : min(r1.y, r1.x));
    r1 = _423;
    float _426 = uintBitsToFloat(925353388u);
    vec4 _429 = r1;
    _429.y = isnan(_426) ? r2.x : (isnan(r2.x) ? _426 : max(r2.x, _426));
    r1 = _429;
    vec4 _434 = r1;
    _434.y = 1.0 / r1.y;
    r1 = _434;
    vec4 _441 = r2;
    _441.y = r1.y * r1.x;
    r2 = _441;
    vec2 _444 = dFdxCoarse(r2.xy);
    r1 = vec4(_444.x, _444.y, r1.z, r1.w);
    vec2 _449 = dFdyCoarse(r2.xy);
    r1 = vec4(r1.x, r1.y, _449.x, _449.y);
    r1 *= vec4(cb4.m[211].y);
    vec4 _472 = r1;
    _472.x = t15_bound ? textureGrad(SPIRV_Cross_Combinedt15s6, r2.xyxx.xy, r1.xy, r1.zw).w : 0.0;
    r1 = _472;
    vec4 _479 = r1;
    _479.x = r1.x * uintBitsToFloat(1056964608u);
    r1 = _479;
    vec4 _486 = r0;
    _486.w = r0.w * r1.x;
    r0 = _486;
    vec3 _493 = fma(r0.xyz, r0.www, uintBitsToFloat(uvec3(1056964608u)));
    o1 = vec4(_493.x, _493.y, _493.z, o1.w);
    vec4 _498 = o1;
    _498.w = uintBitsToFloat(1065353216u);
    o1 = _498;
    vec4 _506 = r0;
    _506.x = shader_in[4].w * uintBitsToFloat(1082130432u);
    r0 = _506;
    vec4 _512 = r0;
    _512.x = intBitsToFloat(int(r0.x));
    r0 = _512;
    vec4 _520 = r0;
    _520.x = uintBitsToFloat(floatBitsToUint(r0.x) & 3u);
    r0 = _520;
    vec4 _526 = r0;
    _526.x = float(floatBitsToUint(r0.x));
    r0 = _526;
    vec4 _535 = r0;
    _535.x = fma(r0.x, uintBitsToFloat(1090519040u), uintBitsToFloat(1126449152u));
    r0 = _535;
    vec4 _542 = o2;
    _542.y = r0.x * uintBitsToFloat(998277249u);
    o2 = _542;
    vec3 _546 = uintBitsToFloat(uvec3(1025758986u, 0u, 981500033u));
    o2 = vec4(_546.x, o2.y, _546.y, _546.z);
    o3 = floatBitsToUint(uintBitsToFloat(3u));
}

void main()
{
    shader_in[2] = vec4(v2.x, v2.y, v2.z, shader_in[2].w);
    shader_in[3] = v3;
    shader_in[4] = v4;
    shader_in[6] = v6;
    ps_main();
    o0 = vec4(o0[bitfieldExtract(omap0, int(0u), int(4u))], o0[bitfieldExtract(omap0, int(4u), int(4u))], o0[bitfieldExtract(omap0, int(8u), int(4u))], o0[bitfieldExtract(omap0, int(12u), int(4u))]);
    o1 = vec4(o1[bitfieldExtract(omap1, int(0u), int(4u))], o1[bitfieldExtract(omap1, int(4u), int(4u))], o1[bitfieldExtract(omap1, int(8u), int(4u))], o1[bitfieldExtract(omap1, int(12u), int(4u))]);
    o2 = vec4(o2[bitfieldExtract(omap2, int(0u), int(4u))], o2[bitfieldExtract(omap2, int(4u), int(4u))], o2[bitfieldExtract(omap2, int(8u), int(4u))], o2[bitfieldExtract(omap2, int(12u), int(4u))]);
}
Blob's asm (click me)

  0[03820000_00000006] shps #6;
  1[02820000_00000005] getone #5;
  2[204880f5_00000000] mova1 a1.y, 0;
  3[00000500_00000000] (rpt5)nop ;
  4[c0360a03_d0c78100] ldc.4.k.mode4.base0 c[a1.x], 0, 5;
  5[14021000_00000000] (sy)(ss)shpe ;
  6[4f300012_00002004] (jp)bary.f r4.z, 4, r0.x;
  7[47300013_00002007] bary.f r4.w, 7, r0.x;
  8[47300a14_0000200c] (rpt2)bary.f r5.x, (r)12, r0.x;
  9[47300002_00002008] bary.f r0.z, 8, r0.x;
 10[47300004_00002009] bary.f r1.x, 9, r0.x;
 11[47300003_0000200a] bary.f r0.w, 10, r0.x;
 12[47300005_0000200b] bary.f r1.y, 11, r0.x;
 13[47308b17_00002000] (rpt3)bary.f (ei)r5.w, (r)0, r0.x;
 14[40100000_28000002] add.f r0.x, r0.z, (0.0);
 15[40100002_28000003] add.f r0.z, r0.w, (0.0);
 16[40100003_28000005] add.f r0.w, r1.y, (0.0);
 17[40100001_28000004] add.f r0.y, r1.x, (0.0);
 18[20444106_00000000] (rpt1)mov.f32f32 r1.z, (0.000000);
 19[2044410a_00000000] (rpt1)mov.f32f32 r2.z, (0.000000);
 20[20044904_00000002] (rpt1)mov.f32f32 r1.x, (r)r0.z;
 21[a380130c_00000005] dsx (f32)(xyOO)r3.x, r0.z;
 22[a3c0130e_00000005] dsy (f32)(xyOO)r3.z, r0.z;
 23[20044008_00000000] mov.f32f32 r2.x, r0.x;
 24[a3801302_00000001] dsx (f32)(xyOO)r0.z, r0.x;
 25[a3c01310_00000001] dsy (f32)(xyOO)r4.x, r0.x;
 26[20044009_00000001] mov.f32f32 r2.y, r0.y;
 27[50700900_100d0002] (sy)(rpt1)mul.f r0.x, (r)r0.z, c3.y;
 28[40700902_100d0010] (rpt1)mul.f r0.z, (r)r4.x, c3.y;
 29[40700b0c_100d000c] (rpt3)mul.f r3.x, (r)r3.x, c3.y;
 30[20510020_0000000e] mov.s16s16 hr8.x, 14;
 31[00000200_00000000] (rpt2)nop ;
 32[65900820_100cb008] (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12;
 33[200880f5_00000020] mova1 a1.y, hr8.x;
 34[00000500_00000000] (rpt5)nop ;
 35[a1881304_d0c20009] samgq.s2en.mode6.base0.s34 (f32)(xyOO)r1.x, r1.x, 134;
 36[20510020_00000002] mov.s16s16 hr8.x, 2;
 37[00000200_00000000] (rpt2)nop ;
 38[65900820_1000b008] (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 0;
 39[200890f5_00000020] (ss)mova1 a1.y, hr8.x;
 40[00000500_00000000] (rpt5)nop ;
 41[a1881300_d0c20011] samgq.s2en.mode6.base0.s34 (f32)(xyOO)r0.x, r2.x, 134;
 42[73808001_10111010] (sy)mad.f32 r0.y, c4.x, r0.y, c4.y;
 43[63800000_10111010] mad.f32 r0.x, c4.x, r0.x, c4.y;
 44[63828802_00011010] (nop1) mad.f32 r0.z, (r)c4.x, r1.y, r0.y;
 45[40700001_00010001] mul.f r0.y, r0.y, r0.y;
 46[63820003_00001010] mad.f32 r0.w, c4.x, r1.x, r0.x;
 47[63800000_00010000] mad.f32 r0.x, r0.x, r0.x, r0.y;
 48[40100001_28000015] add.f r0.y, r5.y, (0.0);
 49[40100004_28000017] add.f r1.x, r5.w, (0.0);
 50[40100005_28000014] add.f r1.y, r5.x, (0.0);
 51[40100006_28000018] add.f r1.z, r6.x, (0.0);
 52[40700007_00010001] mul.f r1.w, r0.y, r0.y;
 53[40100008_28000016] add.f r2.x, r5.z, (0.0);
 54[63828007_00070005] mad.f32 r1.w, r1.y, r1.y, r1.w;
 55[40100000_28024000] add.f r0.x, (neg)r0.x, (1.0);
 56[40100009_28000019] add.f r2.y, r6.y, (0.0);
 57[63840007_00070008] mad.f32 r1.w, r2.x, r2.x, r1.w;
 58[4070000a_00060006] mul.f r2.z, r1.z, r1.z;
 59[4050000b_28000000] max.f r2.w, r0.x, (0.0);
 60[40b50000_00000000] cmps.f.ne r0.x, r0.x, r0.x;
 61[6382000a_000a0004] mad.f32 r2.z, r1.x, r1.x, r2.z;
 62[4050000c_10130007] max.f r3.x, r1.w, c4.w;
 63[40b50007_00070007] cmps.f.ne r1.w, r1.w, r1.w;
 64[64800000_000b1012] sel.b32 r0.x, c4.z, r0.x, r2.w;
 65[6384800a_000a0009] mad.f32 r2.z, r2.y, r2.y, r2.z;
 66[40100002_68020002] add.f r0.z, r0.z, (neg)(1.0);
 67[64838007_000c1013] sel.b32 r1.w, c4.w, r1.w, r3.x;
 68[40100003_68020003] add.f r0.w, r0.w, (neg)(1.0);
 69[4050000b_1013000a] max.f r2.w, r2.z, c4.w;
 70[4070000c_00020002] mul.f r3.x, r0.z, r0.z;
 71[40b5000a_000a000a] cmps.f.ne r2.z, r2.z, r2.z;
 72[6381800c_000c0003] mad.f32 r3.x, r0.w, r0.w, r3.x;
 73[80d00000_00008000] sqrt r0.x, (abs)r0.x;
 74[00000000_00000000] nop ;
 75[6380100c_000c0000] (ss)mad.f32 r3.x, r0.x, r0.x, r3.x;
 76[6485000a_000b9013] (nop2) sel.b32 r2.z, c4.w, (r)r2.z, r2.w;
 77[4050000b_1013000c] max.f r2.w, r3.x, c4.w;
 78[40bd080c_000c000c] (nop3) cmps.f.ne r3.x, r3.x, r3.x;
 79[6486000b_000b1013] sel.b32 r2.w, c4.w, r3.x, r2.w;
 80[80300007_00008007] rsq r1.w, (abs)r1.w;
 81[8030000a_0000800a] rsq r2.z, (abs)r2.z;
 82[00000500_00000000] (rpt5)nop ;
 83[8030000b_0000800b] rsq r2.w, (abs)r2.w;
 84[40701005_00070005] (ss)mul.f r1.y, r1.y, r1.w;
 85[40700001_00070001] mul.f r0.y, r0.y, r1.w;
 86[40700007_00080007] mul.f r1.w, r1.w, r2.x;
 87[40700004_000a0004] mul.f r1.x, r1.x, r2.z;
 88[40700006_000a0006] mul.f r1.z, r1.z, r2.z;
 89[40700008_000a0009] mul.f r2.x, r2.y, r2.z;
 90[40700003_000b0003] mul.f r0.w, r0.w, r2.w;
 91[40700002_000b0002] mul.f r0.z, r0.z, r2.w;
 92[63858000_10110000] mad.f32 r0.x, r0.x, r2.w, c4.y;
 93[40100009_2800001a] add.f r2.y, r6.z, (0.0);
 94[6384000a_10154001] mad.f32 r2.z, (neg)r0.y, r2.x, c5.y;
 95[6383800b_50150004] mad.f32 r2.w, r1.x, (neg)r1.w, c5.y;
 96[6383800a_000a0006] mad.f32 r2.z, r1.z, r1.w, r2.z;
 97[6384000b_000b0005] mad.f32 r2.w, r1.y, r2.x, r2.w;
 98[6383000c_10154005] mad.f32 r3.x, (neg)r1.y, r1.z, c5.y;
 99[4010000d_28000012] add.f r3.y, r4.z, (0.0);
100[6382000c_000c0001] mad.f32 r3.x, r0.y, r1.x, r3.x;
101[4078010a_000a0009] (rpt1)mul.f r2.z, r2.y, (r)r2.z;
102[63868002_10120002] mad.f32 r0.z, r0.z, r3.y, c4.z;
103[40700009_000c0009] mul.f r2.y, r2.y, r3.x;
104[63868003_10120003] mad.f32 r0.w, r0.w, r3.y, c4.z;
105[63868000_10140000] mad.f32 r0.x, r0.x, r3.y, c5.x;
106[4078010a_000a0002] (rpt1)mul.f r2.z, r0.z, (r)r2.z;
107[40700002_00090002] mul.f r0.z, r0.z, r2.y;
108[63830006_000b0003] mad.f32 r1.z, r0.w, r1.z, r2.w;
109[63820004_000a0003] mad.f32 r1.x, r0.w, r1.x, r2.z;
110[63808001_00060000] mad.f32 r0.y, r0.x, r0.y, r1.z;
111[63840002_00020003] mad.f32 r0.z, r0.w, r2.x, r0.z;
112[63828003_00040000] mad.f32 r0.w, r0.x, r1.y, r1.x;
113[63838000_00020000] mad.f32 r0.x, r0.x, r1.w, r0.z;
114[40700802_00010001] (nop1) mul.f r0.z, r0.y, r0.y;
115[63818802_00020003] (nop1) mad.f32 r0.z, (r)r0.w, r0.w, r0.z;
116[63800802_00028000] (nop3) mad.f32 r0.z, (r)r0.x, (r)r0.x, r0.z;
117[40500004_10130002] max.f r1.x, r0.z, c4.w;
118[40bd0802_00020002] (nop3) cmps.f.ne r0.z, r0.z, r0.z;
119[64810002_00041013] sel.b32 r0.z, c4.w, r0.z, r1.x;
120[00000500_00000000] (rpt5)nop ;
121[80300002_00008002] rsq r0.z, (abs)r0.z;
122[4070100a_00020001] (ss)mul.f r2.z, r0.y, r0.z;
123[4078000b_00020000] (nop2) mul.f r2.w, r0.x, r0.z;
124[40d00000_0000800a] absneg.f r0.x, (abs)r2.z;
125[40500001_800b800a] max.f r0.y, (abs)r2.z, (abs)r2.w;
126[40bd0804_800b800b] (nop3) cmps.f.ne r1.x, (abs)r2.w, (abs)r2.w;
127[64820001_00010000] sel.b32 r0.y, r0.x, r1.x, r0.y;
128[40d00004_0000800b] absneg.f r1.x, (abs)r2.w;
129[40b50005_800a800a] cmps.f.ne r1.y, (abs)r2.z, (abs)r2.z;
130[4078000c_00030002] (nop2) mul.f r3.x, r0.z, r0.w;
131[64828001_00010004] sel.b32 r0.y, r1.x, r1.y, r0.y;
132[40d80002_0000800c] (nop2) absneg.f r0.z, (abs)r3.x;
133[40500003_800c0001] max.f r0.w, r0.y, (abs)r3.x;
134[40bd0805_00010001] (nop3) cmps.f.ne r1.y, r0.y, r0.y;
135[64828003_00030002] sel.b32 r0.w, r0.z, r1.y, r0.w;
136[40bd0805_800c800c] (nop3) cmps.f.ne r1.y, (abs)r3.x, (abs)r3.x;
137[6482880d_00038001] (nop3) sel.b32 r3.y, (r)r0.y, (r)r1.y, r0.w;
138[40b40001_000d800a] cmps.f.eq r0.y, (abs)r2.z, r3.y;
139[40bc0003_000d800c] (nop2) cmps.f.eq r0.w, (abs)r3.x, r3.y;
140[64808001_00000004] sel.b32 r0.y, r1.x, r0.y, r0.x;
141[64818002_00028000] (nop2) sel.b32 r0.z, r0.x, (r)r0.w, r0.z;
142[64818801_00018004] (nop3) sel.b32 r0.y, (r)r1.x, (r)r0.w, r0.y;
143[40500000_00020001] max.f r0.x, r0.y, r0.z;
144[40bd0803_00010001] (nop3) cmps.f.ne r0.w, r0.y, r0.y;
145[64818000_00000002] sel.b32 r0.x, r0.z, r0.w, r0.x;
146[40bd0804_00020002] (nop3) cmps.f.ne r1.x, r0.z, r0.z;
147[64820800_00008001] (nop3) sel.b32 r0.x, (r)r0.y, (r)r1.x, r0.x;
148[40500005_10130000] max.f r1.y, r0.x, c4.w;
149[40bd0806_00000000] (nop3) cmps.f.ne r1.z, r0.x, r0.x;
150[64830005_00051013] sel.b32 r1.y, c4.w, r1.z, r1.y;
151[40300006_00020001] min.f r1.z, r0.y, r0.z;
152[00000400_00000000] (rpt4)nop ;
153[80100005_00000005] rcp r1.y, r1.y;
154[64818802_00068002] (nop3) sel.b32 r0.z, (r)r0.z, (r)r0.w, r1.z;
155[64820801_00028001] (nop3) sel.b32 r0.y, (r)r0.y, (r)r1.x, r0.z;
156[40701001_00050001] (ss)mul.f r0.y, r0.y, r1.y;
157[00000500_00000000] (rpt5)nop ;
158[a3801302_00000001] dsx (f32)(xyOO)r0.z, r0.x;
159[a3c01304_00000001] dsy (f32)(xyOO)r1.x, r0.x;
160[20444108_00000000] (rpt1)mov.f32f32 r2.x, (0.000000);
161[20044906_00000000] (rpt1)mov.f32f32 r1.z, (r)r0.x;
162[50700b00_100d0002] (sy)(rpt3)mul.f r0.x, (r)r0.z, c3.y;
163[20510008_00000002] mov.s16s16 hr2.x, 2;
164[00000200_00000000] (rpt2)nop ;
165[65840808_1000b008] (nop3) shlg.b16 hr2.x, (r)8, (r)hr2.x, 0;
166[200880f5_00000008] mova1 a1.y, hr2.x;
167[00000500_00000000] (rpt5)nop ;
168[a1881800_d2e2000d] samgq.s2en.mode6.base0.s34 (f32)(OOOw)r0.x, r1.z, 151;
169[40501000_1013000d] (ss)max.f r0.x, r3.y, c4.w;
170[40bd0801_000d000d] (nop3) cmps.f.ne r0.y, r3.y, r3.y;
171[64808000_00001013] sel.b32 r0.x, c4.w, r0.y, r0.x;
172[00000500_00000000] (rpt5)nop ;
173[80100000_00000000] rcp r0.x, r0.x;
174[50780801_28010003] (sy)(nop3) mul.f r0.y, r0.w, (0.5);
175[40781800_00010000] (ss)(nop3) mul.f r0.x, r0.x, r0.y;
176[63860004_10160000] mad.f32 r1.x, r0.x, r3.x, c5.z;
177[63850105_10168000] (rpt1)mad.f32 r1.y, r0.x, (r)r2.z, c5.z;
178[40180800_28000013] (nop3) add.f r0.x, r4.w, (0.0);
179[40780800_280b0000] (nop3) mul.f r0.x, r0.x, (4.0);
180[20054000_00000000] cov.f32s32 r0.x, r0.x;
181[00000200_00000000] (rpt2)nop ;
182[43980800_20030000] (nop3) and.b r0.x, r0.x, 3;
183[208c4000_00000000] cov.u32f32 (even)r0.x, r0.x;
184[00000200_00000000] (rpt2)nop ;
185[63800800_10189017] (nop3) mad.f32 r0.x, (r)c5.w, (r)r0.x, c6.x;
186[40700009_10190000] mul.f r2.y, r0.x, c6.y;
187[20244007_00000014] mov.f32f32 r1.w, c5.x;
188[2024400a_00000012] mov.f32f32 r2.z, c4.z;
189[2024400b_0000001a] mov.f32f32 r2.w, c6.z;
190[20444200_00000000] (rpt2)mov.f32f32 r0.x, (0.000000);
191[20444003_3c888889] mov.f32f32 r0.w, (0.016667);
192[20444008_3d23d70a] mov.f32f32 r2.x, (0.040000);
193[2055400c_00000003] mov.s32s32 r3.x, 3;
194[03000000_00000000] end ;

I don't know whether it works on previous GPU generations, so should I limit it to a6xx?

Edited by Danylo Piliaiev

Merge request reports