diff --git a/src/gallium/drivers/r300/compiler/radeon_inline_literals.c b/src/gallium/drivers/r300/compiler/radeon_inline_literals.c index d1f2c4b03ef7b49279189d36de99336fff6e0e2c..9077728be82c7a430a927facbd0f564f081d42a7 100644 --- a/src/gallium/drivers/r300/compiler/radeon_inline_literals.c +++ b/src/gallium/drivers/r300/compiler/radeon_inline_literals.c @@ -28,6 +28,7 @@ #include "radeon_dataflow.h" #include "radeon_program.h" #include "radeon_program_constants.h" +#include "radeon_swizzle.h" #include "util/u_bitcast.h" #include @@ -104,32 +105,22 @@ void rc_inline_literals(struct radeon_compiler *c, void *user) /* We aren't using rc_for_all_reads_src here, because presub * sources need to be handled differently. */ for (src_idx = 0; src_idx < info->NumSrcRegs; src_idx++) { - unsigned new_swizzle; unsigned use_literal = 0; - unsigned negate_mask = 0; unsigned swz, chan; - struct rc_src_register * src_reg = - &inst->U.I.SrcReg[src_idx]; - swz = RC_SWIZZLE_UNUSED; - if (src_reg->File != RC_FILE_CONSTANT) { + struct rc_src_register src_reg = inst->U.I.SrcReg[src_idx]; + if (src_reg.File != RC_FILE_CONSTANT) { continue; } constant = - &c->Program.Constants.Constants[src_reg->Index]; + &c->Program.Constants.Constants[src_reg.Index]; if (constant->Type != RC_CONSTANT_IMMEDIATE) { continue; } - new_swizzle = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0); for (chan = 0; chan < 4; chan++) { unsigned char r300_float_tmp; - swz = GET_SWZ(src_reg->Swizzle, chan); - if (swz == RC_SWIZZLE_UNUSED) { - continue; - } - /* Don't try to inline constant swizzle */ + swz = GET_SWZ(src_reg.Swizzle, chan); if (swz >= RC_SWIZZLE_ZERO) { - use_literal = 0; - break; + continue; } float_value = constant->u.Immediate[swz]; ret = ieee_754_to_r300_float(float_value, @@ -140,7 +131,7 @@ void rc_inline_literals(struct radeon_compiler *c, void *user) break; } - if (ret == -1 && src_reg->Abs) { + if (ret == -1 && src_reg.Abs) { use_literal = 0; break; } @@ -152,19 +143,18 @@ void 
rc_inline_literals(struct radeon_compiler *c, void *user) /* Use RC_SWIZZLE_W for the inline constant, so * it will become one of the alpha sources. */ - SET_SWZ(new_swizzle, chan, RC_SWIZZLE_W); + SET_SWZ(src_reg.Swizzle, chan, RC_SWIZZLE_W); if (ret == -1) { - negate_mask |= (1 << chan); + src_reg.Negate ^= (1 << chan); } } - if (!use_literal) { + src_reg.File = RC_FILE_INLINE; + src_reg.Index = r300_float; + if (!use_literal || !c->SwizzleCaps->IsNative(inst->U.I.Opcode, src_reg)) { continue; } - src_reg->File = RC_FILE_INLINE; - src_reg->Index = r300_float; - src_reg->Swizzle = new_swizzle; - src_reg->Negate = src_reg->Negate ^ negate_mask; + inst->U.I.SrcReg[src_idx] = src_reg; } } } diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c index 8d120984b89423e8a5e405fbded23649191ab20e..4d8f5cbf031721fe6faed50e10d68d7d9ac73060 100644 --- a/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -886,7 +886,8 @@ static int peephole(struct radeon_compiler * c, struct rc_instruction * inst) return 0; } -static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) { +static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) +{ unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0); for (unsigned int chan = 0; chan < 4; chan++) { unsigned int swz = GET_SWZ(swz1, chan); @@ -900,13 +901,232 @@ static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) { return new_swz; } -static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) +/* Sets negate to 0 for unused channels. 
*/ +static unsigned int clean_negate(struct rc_src_register src) +{ + unsigned int new_negate = 0; + for (unsigned int chan = 0; chan < 4; chan++) { + unsigned int swz = GET_SWZ(src.Swizzle, chan); + if (swz != RC_SWIZZLE_UNUSED) + new_negate |= src.Negate & (1 << chan); + } + return new_negate; +} + +static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2) +{ + return clean_negate(src1) | clean_negate(src2); +} + +static unsigned int fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz) +{ + for (unsigned int chan = 0; chan < 4; chan++) { + unsigned int swz = GET_SWZ(orig_swz, chan); + if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) { + SET_SWZ(orig_swz, chan, const_swz); + } + } + return orig_swz; +} + +/** + * Merges two MOVs writing different channels of the same destination register + * with the use of the constant swizzles. + */ +static bool merge_movs( + struct radeon_compiler * c, + struct rc_instruction * inst, + struct rc_instruction * cur) +{ + /* We can merge two MOVs into one MOV if one of them is from an inline constant + * (i.e., constant swizzles and RC_FILE_NONE). + * + * For example + * MOV temp[0].x none.1___ + * MOV temp[0].y input[0]._x__ + * + * becomes + * MOV temp[0].xy input[0].1x__ + */ + unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask; + if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || + inst->U.I.SrcReg[0].File == RC_FILE_NONE) { + struct rc_src_register src; + if (cur->U.I.SrcReg[0].File == RC_FILE_NONE) + src = inst->U.I.SrcReg[0]; + else + src = cur->U.I.SrcReg[0]; + src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, + inst->U.I.SrcReg[0].Swizzle); + src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]); + if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) { + cur->U.I.DstReg.WriteMask |= orig_dst_wmask; + cur->U.I.SrcReg[0] = src; + rc_remove_instruction(inst); + return true; + } + } + + /* Otherwise, we can convert the MOVs into ADD. 
+ * + * For example + * MOV temp[0].x const[0].x + * MOV temp[0].y input[0].y + * + * becomes + * ADD temp[0].xy const[0].x0 input[0].0y + */ + unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask; + struct rc_src_register src0 = inst->U.I.SrcReg[0]; + struct rc_src_register src1 = cur->U.I.SrcReg[0]; + + src0.Swizzle = fill_swizzle(src0.Swizzle, + wmask, RC_SWIZZLE_ZERO); + src1.Swizzle = fill_swizzle(src1.Swizzle, + wmask, RC_SWIZZLE_ZERO); + if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) || + !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1)) + return false; + + cur->U.I.DstReg.WriteMask = wmask; + cur->U.I.Opcode = RC_OPCODE_ADD; + cur->U.I.SrcReg[0] = src0; + cur->U.I.SrcReg[1] = src1; + + /* finally delete the original mov */ + rc_remove_instruction(inst); + return true; +} + +static int have_shared_source(struct rc_instruction * inst1, struct rc_instruction * inst2) +{ + int shared_src = -1; + const struct rc_opcode_info * opcode1 = rc_get_opcode_info(inst1->U.I.Opcode); + const struct rc_opcode_info * opcode2 = rc_get_opcode_info(inst2->U.I.Opcode); + for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) { + for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) { + if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File && + inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index && + inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr) + shared_src = i; + } + } + return shared_src; +} + +/** + * This function will try to merge MOV and ADD/MUL instructions with the same + * destination, making use of the constant swizzles. 
+ * + * For example: + * MOV temp[0].x const[0].x + * MUL temp[0].yz const[1].yz const[2].yz + * + * becomes + * MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00 + */ +static int merge_mov_add_mul( + struct radeon_compiler * c, + struct rc_instruction * inst1, + struct rc_instruction * inst2) +{ + struct rc_instruction * inst, * mov; + if (inst1->U.I.Opcode == RC_OPCODE_MOV) { + mov = inst1; + inst = inst2; + } else { + mov = inst2; + inst = inst1; + } + + const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL; + int shared_index = have_shared_source(inst, mov); + unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask; + + /* If there is a shared source, just merge the swizzles and be done with it. */ + if (shared_index != -1) { + struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index]; + struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index]; + + shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src); + shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, + mov->U.I.SrcReg[0].Swizzle); + other_src.Negate = clean_negate(other_src); + unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO; + other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz); + + if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) || + !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src)) + return 0; + + inst2->U.I.Opcode = inst->U.I.Opcode; + inst2->U.I.SrcReg[0] = shared_src; + inst2->U.I.SrcReg[1] = other_src; + + /* TODO: we can do a bit better in the special case when one of the sources is none. + * Convert to MAD otherwise. + */ + } else { + struct rc_src_register src0, src1, src2; + if (is_mul) { + src2 = mov->U.I.SrcReg[0]; + src0 = inst->U.I.SrcReg[0]; + src1 = inst->U.I.SrcReg[1]; + } else { + src0 = mov->U.I.SrcReg[0]; + src1 = inst->U.I.SrcReg[0]; + src2 = inst->U.I.SrcReg[1]; + } + /* The following logic expects that the unused channels have empty negate bits. 
*/ + src0.Negate = clean_negate(src0); + src1.Negate = clean_negate(src1); + src2.Negate = clean_negate(src2); + + src0.Swizzle = fill_swizzle(src0.Swizzle, + wmask, RC_SWIZZLE_ONE); + src1.Swizzle = fill_swizzle(src1.Swizzle, + wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE); + src2.Swizzle = fill_swizzle(src2.Swizzle, + wmask, RC_SWIZZLE_ZERO); + if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) || + !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) || + !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2)) + return 0; + + inst2->U.I.Opcode = RC_OPCODE_MAD; + inst2->U.I.SrcReg[0] = src0; + inst2->U.I.SrcReg[1] = src1; + inst2->U.I.SrcReg[2] = src2; + } + inst2->U.I.DstReg.WriteMask = wmask; + /* finally delete the original instruction */ + rc_remove_instruction(inst1); + + return 1; +} + +static bool inst_combination( + struct rc_instruction * inst1, + struct rc_instruction * inst2, + rc_opcode opcode1, + rc_opcode opcode2) +{ + return ((inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2) || + (inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2)); +} + +/** + * Searches for instructions writing different channels of the same register that could + * be merged together with the use of constant swizzles. + * + * The potential candidates are combinations of MOVs, ADDs, MULs and MADs. + */ +static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst) { unsigned int orig_dst_reg = inst->U.I.DstReg.Index; unsigned int orig_dst_file = inst->U.I.DstReg.File; unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask; - unsigned int orig_src_reg = inst->U.I.SrcReg[0].Index; - unsigned int orig_src_file = inst->U.I.SrcReg[0].File; + const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode); struct rc_instruction * cur = inst; while (cur!= &c->Program.Instructions) { @@ -917,13 +1137,13 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) * control flow. 
*/ if (opcode->IsFlowControl) - return 0; + return; /* Stop when the original destination is overwritten */ if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File && (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0) - return 0; + return; /* Stop the search when the original instruction destination * is used as a source for anything. @@ -931,39 +1151,41 @@ static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst) for (unsigned i = 0; i < opcode->NumSrcRegs; i++) { if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg) - return 0; + return; + } + + /* Stop the search when some of the original sources are touched. */ + for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) { + if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File && + inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index) + return; } - if (cur->U.I.Opcode == RC_OPCODE_MOV && - cur->U.I.DstReg.File == orig_dst_file && + if (cur->U.I.DstReg.File == orig_dst_file && cur->U.I.DstReg.Index == orig_dst_reg && + cur->U.I.SaturateMode == inst->U.I.SaturateMode && (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) { - /* We can merge the movs if one of them is from inline constant */ - if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || - orig_src_file == RC_FILE_NONE) { - cur->U.I.DstReg.WriteMask |= orig_dst_wmask; - - if (cur->U.I.SrcReg[0].File == RC_FILE_NONE) { - cur->U.I.SrcReg[0].File = orig_src_file; - cur->U.I.SrcReg[0].Index = orig_src_reg; - cur->U.I.SrcReg[0].Abs = inst->U.I.SrcReg[0].Abs; - cur->U.I.SrcReg[0].RelAddr = inst->U.I.SrcReg[0].RelAddr; - } - cur->U.I.SrcReg[0].Swizzle = - merge_swizzles(cur->U.I.SrcReg[0].Swizzle, - inst->U.I.SrcReg[0].Swizzle); - - cur->U.I.SrcReg[0].Negate |= inst->U.I.SrcReg[0].Negate; + /* Skip the merge if one of the instructions writes just w channel + * and we are compiling a fragment shader. 
We can pair-schedule it together + * later anyway and it will also give the scheduler a bit more flexibility. + */ + if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W || + inst->U.I.DstReg.WriteMask == RC_MASK_W)) + continue; - /* finally delete the original mov */ - rc_remove_instruction(inst); + if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) { + if (merge_movs(c, inst, cur)) + return; + } - return 1; + if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) || + inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) { + if (merge_mov_add_mul(c, inst, cur)) + return; } } } - return 0; } void rc_optimize(struct radeon_compiler * c, void *user) @@ -972,22 +1194,42 @@ void rc_optimize(struct radeon_compiler * c, void *user) while(inst != &c->Program.Instructions) { struct rc_instruction * cur = inst; inst = inst->Next; - constant_folding(c, cur); + } - if(peephole(c, cur)) - continue; - + /* Copy propagate simple movs away. */ + inst = c->Program.Instructions.Next; + while(inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; if (cur->U.I.Opcode == RC_OPCODE_MOV) { - if (c->is_r500) { - if (merge_movs(c, cur)) - continue; - } copy_propagate(c, cur); - /* cur may no longer be part of the program */ } } + /* Merge MOVs to same source in different channels using the constant + * swizzles. + */ + if (c->is_r500) { + inst = c->Program.Instructions.Next; + while(inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; + if (cur->U.I.Opcode == RC_OPCODE_MOV || + cur->U.I.Opcode == RC_OPCODE_ADD || + cur->U.I.Opcode == RC_OPCODE_MUL) + merge_channels(c, cur); + } + } + + /* Presubtract operations. 
*/ + inst = c->Program.Instructions.Next; + while(inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; + peephole(c, cur); + } + if (!c->has_omod) { return; } diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_translate.c b/src/gallium/drivers/r300/compiler/radeon_pair_translate.c index b75b658b86239cf4e983b30b66bcd4a3adbd6429..9960c4a2289b67685f5b55c0eec9bb0964c65ac5 100644 --- a/src/gallium/drivers/r300/compiler/radeon_pair_translate.c +++ b/src/gallium/drivers/r300/compiler/radeon_pair_translate.c @@ -230,7 +230,23 @@ static void set_pair_instruction(struct r300_fragment_program_compiler *c, else if (swz == RC_SWIZZLE_W) srcalpha = 1; - if (swz < RC_SWIZZLE_UNUSED) + /* We check for ZERO here as well because otherwise the zero + * sign (which doesn't matter and we already ignore it previously + * when checking for valid swizzle) could mess up the final negate sign. + * Example problematic pattern where this would be produced is: + * CONST[1] FLT32 { 0.0000, 0.0000, -4.0000, 0.0000} + * ADD temp[0].xyz, const[0].xyz_, -const[1].z00_; + * + * after inline literals would become: + * ADD temp[0].xyz, const[0].xyz_, 4.000000 (0x48).w-0-0-_; + * + * and after pair translate: + * src0.xyz = const[0], src0.w = 4.000000 (0x48) + * MAD temp[0].xyz, src0.xyz, src0.111, src0.w00 + * + * Without the zero check there would be -src0.w00. + */ + if (swz < RC_SWIZZLE_UNUSED && swz != RC_SWIZZLE_ZERO) srcmask |= 1 << j; } source = rc_pair_alloc_source(pair, srcrgb, srcalpha,