From aeae7b812cc05328268047c8d4cb7cfdea3bcf38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Thu, 14 Jul 2022 18:35:56 +0200 Subject: [PATCH 1/8] r300: allow constant swizzles with inline constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will prevent a regression in the number of inlined constants in a later commit. Constructs like 4.000000 (0x48).w110 work just fine. There is a small behavioral change. We would previously allow positive and negative same-value constants to be produced, e.g., 4.000000 (0x48).w-w__ and this would be later split into some extra movs in the dataflow swizzle pass. We now explicitly check that the final swizzle is valid while inlining. So there is a minor decrease in inlined constants and in the total instructions. total lits in shared programs: 4328 -> 4194 (-3.10%) lits in affected programs: 554 -> 420 (-24.19%) total instructions in shared programs: 155488 -> 155361 (-0.08%) instructions in affected programs: 5707 -> 5580 (-2.23%) Additionally, a fix for pair translation is needed since the constant inlining can now produce swizzles like this: 4.000000 (0x48).w-0-0-_ so we have to teach pair translation to also ignore the sign for zero swizzle. 
Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../r300/compiler/radeon_inline_literals.c | 36 +++++++------------ .../r300/compiler/radeon_pair_translate.c | 18 +++++++++- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_inline_literals.c b/src/gallium/drivers/r300/compiler/radeon_inline_literals.c index d1f2c4b03ef7..9077728be82c 100644 --- a/src/gallium/drivers/r300/compiler/radeon_inline_literals.c +++ b/src/gallium/drivers/r300/compiler/radeon_inline_literals.c @@ -28,6 +28,7 @@ #include "radeon_dataflow.h" #include "radeon_program.h" #include "radeon_program_constants.h" +#include "radeon_swizzle.h" #include "util/u_bitcast.h" #include @@ -104,32 +105,22 @@ void rc_inline_literals(struct radeon_compiler *c, void *user) /* We aren't using rc_for_all_reads_src here, because presub * sources need to be handled differently. */ for (src_idx = 0; src_idx < info->NumSrcRegs; src_idx++) { - unsigned new_swizzle; unsigned use_literal = 0; - unsigned negate_mask = 0; unsigned swz, chan; - struct rc_src_register * src_reg = - &inst->U.I.SrcReg[src_idx]; - swz = RC_SWIZZLE_UNUSED; - if (src_reg->File != RC_FILE_CONSTANT) { + struct rc_src_register src_reg = inst->U.I.SrcReg[src_idx]; + if (src_reg.File != RC_FILE_CONSTANT) { continue; } constant = - &c->Program.Constants.Constants[src_reg->Index]; + &c->Program.Constants.Constants[src_reg.Index]; if (constant->Type != RC_CONSTANT_IMMEDIATE) { continue; } - new_swizzle = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0); for (chan = 0; chan < 4; chan++) { unsigned char r300_float_tmp; - swz = GET_SWZ(src_reg->Swizzle, chan); - if (swz == RC_SWIZZLE_UNUSED) { - continue; - } - /* Don't try to inline constant swizzle */ + swz = GET_SWZ(src_reg.Swizzle, chan); if (swz >= RC_SWIZZLE_ZERO) { - use_literal = 0; - break; + continue; } float_value = constant->u.Immediate[swz]; ret = ieee_754_to_r300_float(float_value, @@ -140,7 +131,7 @@ void 
rc_inline_literals(struct radeon_compiler *c, void *user) break; } - if (ret == -1 && src_reg->Abs) { + if (ret == -1 && src_reg.Abs) { use_literal = 0; break; } @@ -152,19 +143,18 @@ void rc_inline_literals(struct radeon_compiler *c, void *user) /* Use RC_SWIZZLE_W for the inline constant, so * it will become one of the alpha sources. */ - SET_SWZ(new_swizzle, chan, RC_SWIZZLE_W); + SET_SWZ(src_reg.Swizzle, chan, RC_SWIZZLE_W); if (ret == -1) { - negate_mask |= (1 << chan); + src_reg.Negate ^= (1 << chan); } } - if (!use_literal) { + src_reg.File = RC_FILE_INLINE; + src_reg.Index = r300_float; + if (!use_literal || !c->SwizzleCaps->IsNative(inst->U.I.Opcode, src_reg)) { continue; } - src_reg->File = RC_FILE_INLINE; - src_reg->Index = r300_float; - src_reg->Swizzle = new_swizzle; - src_reg->Negate = src_reg->Negate ^ negate_mask; + inst->U.I.SrcReg[src_idx] = src_reg; } } } diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_translate.c b/src/gallium/drivers/r300/compiler/radeon_pair_translate.c index b75b658b8623..9960c4a2289b 100644 --- a/src/gallium/drivers/r300/compiler/radeon_pair_translate.c +++ b/src/gallium/drivers/r300/compiler/radeon_pair_translate.c @@ -230,7 +230,23 @@ static void set_pair_instruction(struct r300_fragment_program_compiler *c, else if (swz == RC_SWIZZLE_W) srcalpha = 1; - if (swz < RC_SWIZZLE_UNUSED) + /* We check for ZERO here as well because otherwise the zero + * sign (which doesn't matter and we already ignore it previously + * when checking for valid swizzle) could mess up the final negate sign. 
+ * Example problematic pattern where this would be produced is: + * CONST[1] FLT32 { 0.0000, 0.0000, -4.0000, 0.0000} + * ADD temp[0].xyz, const[0].xyz_, -const[1].z00_; + * + * after inline literals would become: + * ADD temp[0].xyz, const[0].xyz_, 4.000000 (0x48).w-0-0-_; + * + * and after pair translate: + * src0.xyz = const[0], src0.w = 4.000000 (0x48) + * MAD temp[0].xyz, src0.xyz, src0.111, src0.w00 + * + * Without the zero check there would be -src0.w00. + */ + if (swz < RC_SWIZZLE_UNUSED && swz != RC_SWIZZLE_ZERO) srcmask |= 1 << j; } source = rc_pair_alloc_source(pair, srcrgb, srcalpha, -- GitLab From 6286e48e5dda633cc45aa06f48118f99855c1e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Tue, 12 Jul 2022 10:17:13 +0200 Subject: [PATCH 2/8] r300: fix negate mask computation when merging movs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main problem here is we can have a negate bit set for an unused channel, so we can't just OR together the negates when channel merging. Right now the bug is hidden because of the order in which we run the passes, but that will change in a later commit. Add some helpers for merging of the negates, they will also be used more in later commits. As a bonus construct the new source separately and only rewrite the original instructions after checking that the final swizzle is valid. 
Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../drivers/r300/compiler/radeon_optimize.c | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index 8d120984b894..fef370d88078 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -900,12 +900,28 @@ static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) { return new_swz; } +/* Sets negate to 0 for unused channels. */ +static unsigned int clean_negate(struct rc_src_register src) +{ + unsigned int new_negate = 0; + for (unsigned int chan = 0; chan < 4; chan++) { + unsigned int swz = GET_SWZ(src.Swizzle, chan); + if (swz != RC_SWIZZLE_UNUSED) + new_negate |= src.Negate & (1 << chan); + } + return new_negate; +} + +static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2) +{ + return clean_negate(src1) | clean_negate(src2); +} + static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) { unsigned int orig_dst_reg = inst->U.I.DstReg.Index; unsigned int orig_dst_file = inst->U.I.DstReg.File; unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask; - unsigned int orig_src_reg = inst->U.I.SrcReg[0].Index; unsigned int orig_src_file = inst->U.I.SrcReg[0].File; struct rc_instruction * cur = inst; @@ -944,17 +960,19 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) orig_src_file == RC_FILE_NONE) { cur->U.I.DstReg.WriteMask |= orig_dst_wmask; + struct rc_src_register src; if (cur->U.I.SrcReg[0].File == RC_FILE_NONE) { - cur->U.I.SrcReg[0].File = orig_src_file; - cur->U.I.SrcReg[0].Index = orig_src_reg; - cur->U.I.SrcReg[0].Abs = inst->U.I.SrcReg[0].Abs; - cur->U.I.SrcReg[0].RelAddr = inst->U.I.SrcReg[0].RelAddr; + src = inst->U.I.SrcReg[0]; + } else { + src = cur->U.I.SrcReg[0]; } - 
cur->U.I.SrcReg[0].Swizzle = - merge_swizzles(cur->U.I.SrcReg[0].Swizzle, - inst->U.I.SrcReg[0].Swizzle); - - cur->U.I.SrcReg[0].Negate |= inst->U.I.SrcReg[0].Negate; + src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, + inst->U.I.SrcReg[0].Swizzle); + src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]); + if (!c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) + return 0; + cur->U.I.DstReg.WriteMask |= orig_dst_wmask; + cur->U.I.SrcReg[0] = src; /* finally delete the original mov */ rc_remove_instruction(inst); -- GitLab From 2755faf9386b4428973f105dfada6625ab85f392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Wed, 22 Jun 2022 18:30:06 +0200 Subject: [PATCH 3/8] r300: check for identical saturate mode when merging MOVs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- src/gallium/drivers/r300/compiler/radeon_optimize.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index fef370d88078..31bd8526a5a9 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -953,6 +953,7 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) if (cur->U.I.Opcode == RC_OPCODE_MOV && cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg && + cur->U.I.SaturateMode == inst->U.I.SaturateMode && (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) { /* We can merge the movs if one of them is from inline constant */ -- GitLab From 05785d482ea86493a700ad778264f563787ceb20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Fri, 17 Jun 2022 21:09:32 +0200 Subject: [PATCH 4/8] r300: run dataflow optimizations in separate loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Constant folding first, then copy propagate simple movs, after that we run the merge movs pass and finally peephole. The important part is to do the copy propagate for the whole program before running merge movs. We no longer check the return from merge_movs so convert it to void. Shader-db changes with RV530: total instructions in shared programs: 155361 -> 154787 (-0.37%) instructions in affected programs: 67920 -> 67346 (-0.85%) total temps in shared programs: 20836 -> 20773 (-0.30%) temps in affected programs: 711 -> 648 (-8.86%) total presub in shared programs: 8226 -> 8202 (-0.29%) presub in affected programs: 223 -> 199 (-10.76%) total temps in shared programs: 20836 -> 20773 (-0.30%) temps in affected programs: 711 -> 648 (-8.86%) Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../drivers/r300/compiler/radeon_optimize.c | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index 31bd8526a5a9..4c9c66cbb3cc 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -917,7 +917,7 @@ static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_reg return clean_negate(src1) | clean_negate(src2); } -static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) +static void merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) { unsigned int orig_dst_reg = inst->U.I.DstReg.Index; unsigned int orig_dst_file = inst->U.I.DstReg.File; @@ -933,13 +933,13 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) * control flow. 
*/ if (opcode->IsFlowControl) - return 0; + return; /* Stop when the original destination is overwritten */ if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File && (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0) - return 0; + return; /* Stop the search when the original instruction destination * is used as a source for anything. @@ -947,7 +947,7 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) for (unsigned i = 0; i < opcode->NumSrcRegs; i++) { if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg) - return 0; + return; } if (cur->U.I.Opcode == RC_OPCODE_MOV && @@ -971,18 +971,17 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) inst->U.I.SrcReg[0].Swizzle); src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]); if (!c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) - return 0; + return; cur->U.I.DstReg.WriteMask |= orig_dst_wmask; cur->U.I.SrcReg[0] = src; /* finally delete the original mov */ rc_remove_instruction(inst); - return 1; + return; } } } - return 0; } void rc_optimize(struct radeon_compiler * c, void *user) @@ -991,22 +990,40 @@ void rc_optimize(struct radeon_compiler * c, void *user) while(inst != &c->Program.Instructions) { struct rc_instruction * cur = inst; inst = inst->Next; - constant_folding(c, cur); + } - if(peephole(c, cur)) - continue; - + /* Copy propagate simple movs away. */ + inst = c->Program.Instructions.Next; + while(inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; if (cur->U.I.Opcode == RC_OPCODE_MOV) { - if (c->is_r500) { - if (merge_movs(c, cur)) - continue; - } copy_propagate(c, cur); - /* cur may no longer be part of the program */ } } + /* Merge MOVs to same source in different channels using the constant + * swizzles. 
+ */ + if (c->is_r500) { + inst = c->Program.Instructions.Next; + while(inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; + if (cur->U.I.Opcode == RC_OPCODE_MOV) + merge_movs(c, cur); + } + } + + /* Presubtract operations. */ + inst = c->Program.Instructions.Next; + while(inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; + peephole(c, cur); + } + if (!c->has_omod) { return; } -- GitLab From 268f317f228b2e7ffc850c1881d8f3f75f41aed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Tue, 12 Jul 2022 15:15:20 +0200 Subject: [PATCH 5/8] r300: generalize the merge_movs pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To allow a simple extension with more merging combinations in the later commits. Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../drivers/r300/compiler/radeon_optimize.c | 110 +++++++++++++----- 1 file changed, 82 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index 4c9c66cbb3cc..4d2efd7b86d9 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -886,7 +886,8 @@ static int peephole(struct radeon_compiler * c, struct rc_instruction * inst) return 0; } -static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) { +static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) +{ unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0); for (unsigned int chan = 0; chan < 4; chan++) { unsigned int swz = GET_SWZ(swz1, chan); @@ -917,12 +918,79 @@ static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_reg return clean_negate(src1) | clean_negate(src2); } -static void merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) +static unsigned int 
fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz) +{ + for (unsigned int chan = 0; chan < 4; chan++) { + unsigned int swz = GET_SWZ(orig_swz, chan); + if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) { + SET_SWZ(orig_swz, chan, const_swz); + } + } + return orig_swz; +} + +/** + * Merges two MOVs writing different channels of the same destination register + * with the use of the constant swizzles. + */ +static bool merge_movs( + struct radeon_compiler * c, + struct rc_instruction * inst, + struct rc_instruction * cur) +{ + /* We can merge two MOVs into MOV if one of them is from inline constant, + * i.e., constant swizzles and RC_FILE_NONE). + * + * For example + * MOV temp[0].x none.1___ + * MOV temp[0].y input[0]._x__ + * + * becomes + * MOV temp[0].xy input[0].1x__ + */ + unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask; + if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || + inst->U.I.SrcReg[0].File == RC_FILE_NONE) { + struct rc_src_register src; + if (cur->U.I.SrcReg[0].File == RC_FILE_NONE) + src = inst->U.I.SrcReg[0]; + else + src = cur->U.I.SrcReg[0]; + src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, + inst->U.I.SrcReg[0].Swizzle); + src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]); + if (!c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) + return false; + cur->U.I.DstReg.WriteMask |= orig_dst_wmask; + cur->U.I.SrcReg[0] = src; + rc_remove_instruction(inst); + return true; + } + return false; +} + +static bool inst_combination( + struct rc_instruction * inst1, + struct rc_instruction * inst2, + rc_opcode opcode1, + rc_opcode opcode2) +{ + return ((inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2) || + (inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2)); +} + +/** + * Searches for instructions writing different channels of the same register that could + * be merged together with the use of constant swizzles. 
+ * + * The potential candidates are combinations of MOVs, ADDs, MULs and MADs. + */ +static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst) { unsigned int orig_dst_reg = inst->U.I.DstReg.Index; unsigned int orig_dst_file = inst->U.I.DstReg.File; unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask; - unsigned int orig_src_file = inst->U.I.SrcReg[0].File; + const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode); struct rc_instruction * cur = inst; while (cur!= &c->Program.Instructions) { @@ -950,35 +1018,21 @@ static void merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) return; } - if (cur->U.I.Opcode == RC_OPCODE_MOV && - cur->U.I.DstReg.File == orig_dst_file && + /* Stop the search when some of the original sources are touched. */ + for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) { + if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File && + inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index) + return; + } + + if (cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg && cur->U.I.SaturateMode == inst->U.I.SaturateMode && (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) { - /* We can merge the movs if one of them is from inline constant */ - if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || - orig_src_file == RC_FILE_NONE) { - cur->U.I.DstReg.WriteMask |= orig_dst_wmask; - - struct rc_src_register src; - if (cur->U.I.SrcReg[0].File == RC_FILE_NONE) { - src = inst->U.I.SrcReg[0]; - } else { - src = cur->U.I.SrcReg[0]; - } - src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, - inst->U.I.SrcReg[0].Swizzle); - src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]); - if (!c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) + if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) { + if (merge_movs(c, inst, cur)) return; - cur->U.I.DstReg.WriteMask |= orig_dst_wmask; - cur->U.I.SrcReg[0] = src; - - /* finally delete the original mov */ - 
rc_remove_instruction(inst); - - return; } } } @@ -1012,7 +1066,7 @@ void rc_optimize(struct radeon_compiler * c, void *user) struct rc_instruction * cur = inst; inst = inst->Next; if (cur->U.I.Opcode == RC_OPCODE_MOV) - merge_movs(c, cur); + merge_channels(c, cur); } } -- GitLab From 13607d8c4829edda3f3b01a9bdda0ece6e1c4821 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Tue, 12 Jul 2022 16:03:35 +0200 Subject: [PATCH 6/8] r300: don't merge w channel in fragment shaders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip the merge if one of the instructions writes just w channel and we are compiling a fragment shader. We can pair-schedule it together later anyway and it will also give the scheduler a bit more flexibility. Shader-db stats with RV530: total instructions in shared programs: 169522 -> 169509 (<.01%) instructions in affected programs: 14170 -> 14157 (-0.09%) total temps in shared programs: 21712 -> 21722 (0.05%) temps in affected programs: 324 -> 334 (3.09%) Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- src/gallium/drivers/r300/compiler/radeon_optimize.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index 4d2efd7b86d9..14d8eb6bb4e4 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -1030,6 +1030,14 @@ static void merge_channels(struct radeon_compiler * c, struct rc_instruction * i cur->U.I.SaturateMode == inst->U.I.SaturateMode && (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) { + /* Skip the merge if one of the instructions writes just w channel + * and we are compiling a fragment shader. We can pair-schedule it together + * later anyway and it will also give the scheduler a bit more flexibility. 
+ */ + if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W || + inst->U.I.DstReg.WriteMask == RC_MASK_W)) + continue; + if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) { if (merge_movs(c, inst, cur)) return; -- GitLab From 275beae42d4f74790b084892269041f4fcafee48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Tue, 12 Jul 2022 16:04:22 +0200 Subject: [PATCH 7/8] r300: merge MOVs into ADD using the 0 swizzle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shader-db stats with RV530: total instructions in shared programs: 169509 -> 166013 (-2.06%) instructions in affected programs: 99126 -> 95630 (-3.53%) total presub in shared programs: 10975 -> 10758 (-1.98%) presub in affected programs: 744 -> 527 (-29.17%) total temps in shared programs: 21722 -> 21649 (-0.34%) temps in affected programs: 1350 -> 1277 (-5.41%) Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../drivers/r300/compiler/radeon_optimize.c | 43 ++++++++++++++++--- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index 14d8eb6bb4e4..b13c8634f942 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -959,14 +959,43 @@ static bool merge_movs( src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle); src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]); - if (!c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) - return false; - cur->U.I.DstReg.WriteMask |= orig_dst_wmask; - cur->U.I.SrcReg[0] = src; - rc_remove_instruction(inst); - return true; + if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) { + cur->U.I.DstReg.WriteMask |= orig_dst_wmask; + cur->U.I.SrcReg[0] = src; + rc_remove_instruction(inst); + return true; + } } - return false; + + /* Otherwise, we can convert the 
MOVs into ADD. + * + * For example + * MOV temp[0].x const[0].x + * MOV temp[0].y input[0].y + * + * becomes + * ADD temp[0].xy const[0].x0 input[0].0y + */ + unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask; + struct rc_src_register src0 = inst->U.I.SrcReg[0]; + struct rc_src_register src1 = cur->U.I.SrcReg[0]; + + src0.Swizzle = fill_swizzle(src0.Swizzle, + wmask, RC_SWIZZLE_ZERO); + src1.Swizzle = fill_swizzle(src1.Swizzle, + wmask, RC_SWIZZLE_ZERO); + if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) || + !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1)) + return false; + + cur->U.I.DstReg.WriteMask = wmask; + cur->U.I.Opcode = RC_OPCODE_ADD; + cur->U.I.SrcReg[0] = src0; + cur->U.I.SrcReg[1] = src1; + + /* finally delete the original mov */ + rc_remove_instruction(inst); + return true; } static bool inst_combination( -- GitLab From 9c01fff4453cc067b3e1dc0448f36510cb1950ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Mon, 11 Jul 2022 13:11:53 +0200 Subject: [PATCH 8/8] r300: merge MOVs with MULs or ADDs in merge channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shader-db stats with RV530: total instructions in shared programs: 166499 -> 164362 (-1.28%) instructions in affected programs: 80056 -> 77919 (-2.67%) total temps in shared programs: 21658 -> 21565 (-0.43%) temps in affected programs: 1780 -> 1687 (-5.22%) Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../drivers/r300/compiler/radeon_optimize.c | 117 +++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index b13c8634f942..4d8f5cbf0317 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -998,6 +998,113 @@ static bool merge_movs( return true; } +static int have_shared_source(struct rc_instruction 
* inst1, struct rc_instruction * inst2) +{ + int shared_src = -1; + const struct rc_opcode_info * opcode1 = rc_get_opcode_info(inst1->U.I.Opcode); + const struct rc_opcode_info * opcode2 = rc_get_opcode_info(inst2->U.I.Opcode); + for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) { + for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) { + if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File && + inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index && + inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr) + shared_src = i; + } + } + return shared_src; +} + +/** + * This function will try to merge MOV and ADD/MUL instructions with the same + * destination, making use of the constant swizzles. + * + * For example: + * MOV temp[0].x const[0].x + * MUL temp[0].yz const[1].yz const[2].yz + * + * becomes + * MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00 + */ +static int merge_mov_add_mul( + struct radeon_compiler * c, + struct rc_instruction * inst1, + struct rc_instruction * inst2) +{ + struct rc_instruction * inst, * mov; + if (inst1->U.I.Opcode == RC_OPCODE_MOV) { + mov = inst1; + inst = inst2; + } else { + mov = inst2; + inst = inst1; + } + + const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL; + int shared_index = have_shared_source(inst, mov); + unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask; + + /* If there is a shared source, just merge the swizzles and be done with it. */ + if (shared_index != -1) { + struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index]; + struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index]; + + shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src); + shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, + mov->U.I.SrcReg[0].Swizzle); + other_src.Negate = clean_negate(other_src); + unsigned int swz = is_mul ? 
RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO; + other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz); + + if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) || + !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src)) + return 0; + + inst2->U.I.Opcode = inst->U.I.Opcode; + inst2->U.I.SrcReg[0] = shared_src; + inst2->U.I.SrcReg[1] = other_src; + + /* TODO: we can do a bit better in the special case when one of the sources is none. + * Convert to MAD otherwise. + */ + } else { + struct rc_src_register src0, src1, src2; + if (is_mul) { + src2 = mov->U.I.SrcReg[0]; + src0 = inst->U.I.SrcReg[0]; + src1 = inst->U.I.SrcReg[1]; + } else { + src0 = mov->U.I.SrcReg[0]; + src1 = inst->U.I.SrcReg[0]; + src2 = inst->U.I.SrcReg[1]; + } + /* The following login expects that the unused channels have empty negate bits. */ + src0.Negate = clean_negate(src0); + src1.Negate = clean_negate(src1); + src2.Negate = clean_negate(src2); + + src0.Swizzle = fill_swizzle(src0.Swizzle, + wmask, RC_SWIZZLE_ONE); + src1.Swizzle = fill_swizzle(src1.Swizzle, + wmask, is_mul ? 
RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE); + src2.Swizzle = fill_swizzle(src2.Swizzle, + wmask, RC_SWIZZLE_ZERO); + if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) || + !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) || + !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2)) + return 0; + + inst2->U.I.Opcode = RC_OPCODE_MAD; + inst2->U.I.SrcReg[0] = src0; + inst2->U.I.SrcReg[1] = src1; + inst2->U.I.SrcReg[2] = src2; + } + inst2->U.I.DstReg.WriteMask = wmask; + /* finally delete the original instruction */ + rc_remove_instruction(inst1); + + return 1; +} + static bool inst_combination( struct rc_instruction * inst1, struct rc_instruction * inst2, @@ -1071,6 +1178,12 @@ static void merge_channels(struct radeon_compiler * c, struct rc_instruction * i if (merge_movs(c, inst, cur)) return; } + + if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) || + inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) { + if (merge_mov_add_mul(c, inst, cur)) + return; + } } } } @@ -1102,7 +1215,9 @@ void rc_optimize(struct radeon_compiler * c, void *user) while(inst != &c->Program.Instructions) { struct rc_instruction * cur = inst; inst = inst->Next; - if (cur->U.I.Opcode == RC_OPCODE_MOV) + if (cur->U.I.Opcode == RC_OPCODE_MOV || + cur->U.I.Opcode == RC_OPCODE_ADD || + cur->U.I.Opcode == RC_OPCODE_MUL) merge_channels(c, cur); } } -- GitLab