intel/compiler: Register coalesce doesn't move conditional modifiers
Using `INTEL_DEBUG=optimizer`, I was able to produce the following output. The left is before `register_coalesce`, and the right is after. Note that the `mov.nz` is removed, but its `.nz` conditional modifier, which should go to the earlier `add`, is lost.
The pattern of `mov.nz` followed by a predicated `mov` is what we used to generate for `fsign` of an operand with an `abs` source modifier (or both `neg` and `abs`). I think in that case the source to the `mov.nz` would still have a source modifier, and that would prevent coalescing and, with it, this bug.
My guess is that this is an old problem that is only just now revealed.
load_payload(8) vgrf4:F, g2:F, g4:F, g3:F, g5:F NoMas load_payload(8) vgrf4:F, g2:F, g4:F, g3:F, g5:F NoMas
linterp(16) vgrf12+0.0:F, vgrf4:F, attr4<0>:F linterp(16) vgrf12+0.0:F, vgrf4:F, attr4<0>:F
linterp(16) vgrf12+2.0:F, vgrf4:F, attr5<0>:F linterp(16) vgrf12+2.0:F, vgrf4:F, attr5<0>:F
linterp(16) vgrf13:F, vgrf4:F, attr7<0>:F linterp(16) vgrf13:F, vgrf4:F, attr7<0>:F
tex_logical(16) vgrf77:F, vgrf12:F(null):UD(null):UD( tex_logical(16) vgrf77:F, vgrf12:F(null):UD(null):UD(
add(16) vgrf17:F, -vgrf77:F, vgrf13:F | add(16) vgrf28:F, -vgrf77:F, vgrf13:F
add.sat(16) vgrf19:F, vgrf77:F, -vgrf13:F | add.sat(16) vgrf20:F, vgrf77:F, -vgrf13:F
mov(16) vgrf20:F, vgrf19:F <
rcp(16) vgrf25:F, u65532+7.0<0>:F rcp(16) vgrf25:F, u65532+7.0<0>:F
mul(16) vgrf26:F, |vgrf17|:F, vgrf25:F | mul(16) vgrf26:F, |vgrf28|:F, vgrf25:F
mov.nz.f0.0(16) vgrf28:F, vgrf17:F <
(+f0.0) mov(16) vgrf28:F, 1.58456e+29f (+f0.0) mov(16) vgrf28:F, 1.58456e+29f
csel.nz(16) vgrf30:F, vgrf26:F, vgrf28:F, u65532+7.0< csel.nz(16) vgrf30:F, vgrf26:F, vgrf28:F, u65532+7.0<
add.sat(16) vgrf32:F, -vgrf30:F, 1f | add.sat(16) vgrf33:F, -vgrf30:F, 1f
mov(16) vgrf33:F, vgrf32:F <
pow(16) vgrf37:F, vgrf33:F, u65532+8.0<0>:F pow(16) vgrf37:F, vgrf33:F, u65532+8.0<0>:F
add(16) vgrf39:F, -vgrf37:F, 1f add(16) vgrf39:F, -vgrf37:F, 1f
tex_logical(16) vgrf40:F, vgrf12:F(null):UD(null):UD( tex_logical(16) vgrf40:F, vgrf12:F(null):UD(null):UD(
mul(16) vgrf42:F, vgrf40+6.0:F, 255f mul(16) vgrf42:F, vgrf40+6.0:F, 255f
rnde(16) vgrf43:F, vgrf42:F rnde(16) vgrf43:F, vgrf42:F
add.sat(16) vgrf46:F, -vgrf43:F, 100f | add.sat(16) vgrf47:F, -vgrf43:F, 100f
mov(16) vgrf47:F, vgrf46:F <
sel.ge(16) vgrf48:F, vgrf47:F, vgrf20:F sel.ge(16) vgrf48:F, vgrf47:F, vgrf20:F
linterp(16) vgrf49+0.0:F, vgrf4:F, attr0<0>:F linterp(16) vgrf49+0.0:F, vgrf4:F, attr0<0>:F
linterp(16) vgrf49+2.0:F, vgrf4:F, attr1<0>:F linterp(16) vgrf49+2.0:F, vgrf4:F, attr1<0>:F
tex_logical(16) vgrf50:F, vgrf49:F(null):UD(null):UD( tex_logical(16) vgrf50:F, vgrf49:F(null):UD(null):UD(
linterp(16) vgrf100:F, vgrf4:F, attr8<0>:F linterp(16) vgrf100:F, vgrf4:F, attr8<0>:F
linterp(16) vgrf101:F, vgrf4:F, attr9<0>:F linterp(16) vgrf101:F, vgrf4:F, attr9<0>:F
linterp(16) vgrf102:F, vgrf4:F, attr10<0>:F linterp(16) vgrf102:F, vgrf4:F, attr10<0>:F
linterp(16) vgrf52:F, vgrf4:F, attr11<0>:F linterp(16) vgrf52:F, vgrf4:F, attr11<0>:F
mul(16) vgrf53:F, vgrf50+0.0:F, vgrf100:F mul(16) vgrf53:F, vgrf50+0.0:F, vgrf100:F
mul(16) vgrf54:F, vgrf50+2.0:F, vgrf101:F mul(16) vgrf54:F, vgrf50+2.0:F, vgrf101:F
mul(16) vgrf55:F, vgrf50+4.0:F, vgrf102:F mul(16) vgrf55:F, vgrf50+4.0:F, vgrf102:F
mul(16) vgrf56:F, vgrf50+6.0:F, vgrf52:F mul(16) vgrf56:F, vgrf50+6.0:F, vgrf52:F
mul(16) vgrf57:F, vgrf48:F, vgrf56:F mul(16) vgrf57:F, vgrf48:F, vgrf56:F
mul(16) vgrf58:F, vgrf39:F, vgrf57:F mul(16) vgrf58:F, vgrf39:F, vgrf57:F
mul(16) vgrf59:F, vgrf58:F, vgrf53:F | mul(16) vgrf70+0.0:F, vgrf58:F, vgrf53:F
mul(16) vgrf60:F, vgrf58:F, vgrf54:F | mul(16) vgrf70+2.0:F, vgrf58:F, vgrf54:F
mul(16) vgrf61:F, vgrf58:F, vgrf55:F | mul(16) vgrf70+4.0:F, vgrf58:F, vgrf55:F
mul(16) vgrf65:F, vgrf60:F, u65533+1.0<0>:F | mul(16) vgrf65:F, vgrf70+2.0:F, u65533+1.0<0>:F
mad(16) vgrf66:F, vgrf65:F, u65533<0>:F, vgrf59:F | mad(16) vgrf66:F, vgrf65:F, u65533<0>:F, vgrf70+0.0:F
mad(16) vgrf67:F, vgrf66:F, u65533+2.0<0>:F, vgrf61:F | mad(16) vgrf67:F, vgrf66:F, u65533+2.0<0>:F, vgrf70+4
mul(16) vgrf68:F, vgrf67:F, u65532+9.0<0>:F | mul(16) vgrf70+6.0:F, vgrf67:F, u65532+9.0<0>:F
mov(16) vgrf70+0.0:D, vgrf59:D <
mov(16) vgrf70+2.0:D, vgrf60:D <
mov(16) vgrf70+4.0:D, vgrf61:D <
mov(16) vgrf70+6.0:D, vgrf68:D <
mov(16) vgrf71+0.0:D, 0d mov(16) vgrf71+0.0:D, 0d
mov(16) vgrf71+2.0:D, 0d mov(16) vgrf71+2.0:D, 0d
mov(16) vgrf71+4.0:D, 0d mov(16) vgrf71+4.0:D, 0d
mov(16) vgrf71+6.0:D, 0d mov(16) vgrf71+6.0:D, 0d
mov(16) vgrf72+0.0:D, 0d mov(16) vgrf72+0.0:D, 0d
mov(16) vgrf72+2.0:D, 0d mov(16) vgrf72+2.0:D, 0d
mov(16) vgrf72+4.0:D, 0d mov(16) vgrf72+4.0:D, 0d
mov(16) vgrf72+6.0:D, 0d mov(16) vgrf72+6.0:D, 0d
mov(16) vgrf73+0.0:D, 0d mov(16) vgrf73+0.0:D, 0d
mov(16) vgrf73+2.0:D, 0d mov(16) vgrf73+2.0:D, 0d
mov(16) vgrf73+4.0:D, 0d mov(16) vgrf73+4.0:D, 0d
mov(16) vgrf73+6.0:D, 0d mov(16) vgrf73+6.0:D, 0d
fb_write_logical(16) (null):UD, vgrf70:F(null):UD(nul fb_write_logical(16) (null):UD, vgrf70:F(null):UD(nul
fb_write_logical(16) (null):UD, vgrf71:F(null):UD(nul fb_write_logical(16) (null):UD, vgrf71:F(null):UD(nul
fb_write_logical(16) (null):UD, vgrf72:F(null):UD(nul fb_write_logical(16) (null):UD, vgrf72:F(null):UD(nul
fb_write_logical(16) (EOT) (null):UD, vgrf73:F(null): fb_write_logical(16) (EOT) (null):UD, vgrf73:F(null):