/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "util/u_math.h"
#include "util/u_memory.h"

/* This pass promotes uniform reads from load/store ops to uniform registers
 * if it is beneficial to do so. Normally, this saves both instructions and
 * total register pressure, but it does take a toll on the number of work
 * registers that are available, so this is a balance.
 *
 * We use a heuristic, implemented by mir_work_heuristic, to determine the
 * ideal number of work registers.
 */
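
/* A worked example of the arithmetic used below -- a sketch derived from
 * this file's own constants, not separate documentation: the register file
 * holds 24 vec4 slots shared between work registers (counting up from r0)
 * and promoted uniforms (counting down from r23; see uniform_reg = 23 -
 * address in midgard_promote_uniforms). If mir_work_heuristic picks 16 work
 * registers, 24 - 16 = 8 uniform slots remain (r16..r23); if it picks 8,
 * 24 - 8 = 16 slots remain (r8..r23), trading work registers for UBOs and
 * thread count. */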

static bool
mir_is_promoteable_ubo(midgard_instruction *ins)
{
        /* TODO: promote unaligned access via swizzle? */
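        /* A hedged reading of the checks below: the instruction must be a
         * UBO read whose constant offset is 16-byte aligned, with arg_1 == 0
         * and arg_2 == 0x1E (which this editor takes to mean a direct,
         * non-indirect read of the default UBO), landing within the first
         * 16 vec4 slots so it fits the promotable window. */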

        return (ins->type == TAG_LOAD_STORE_4) &&
                (OP_IS_UBO_READ(ins->op)) &&
                !(ins->constants.u32[0] & 0xF) &&
                !(ins->load_store.arg_1) &&
                (ins->load_store.arg_2 == 0x1E) &&
                ((ins->constants.u32[0] / 16) < 16);
}

static unsigned
mir_promoteable_uniform_count(compiler_context *ctx)
{
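        /* Despite the name, this computes the highest promoteable vec4 slot
         * index seen, which is what mir_work_heuristic compares against. */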
        unsigned count = 0;

        mir_foreach_instr_global(ctx, ins) {
                if (mir_is_promoteable_ubo(ins))
                        count = MAX2(count, ins->constants.u32[0] / 16);
        }

        return count;
}

static unsigned
mir_count_live(uint16_t *live, unsigned temp_count)
{
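        /* live[] holds one 16-bit bytemask per temp, so the number of set
         * bits is the number of simultaneously live bytes; the caller
         * divides by 16 to convert to vec4 registers. */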
        unsigned count = 0;

        for (unsigned i = 0; i < temp_count; ++i)
                count += util_bitcount(live[i]);

        return count;
}

static unsigned
mir_estimate_pressure(compiler_context *ctx)
{
        mir_invalidate_liveness(ctx);
        mir_compute_liveness(ctx);

        unsigned max_live = 0;

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));

                mir_foreach_instr_in_block_rev(block, ins) {
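                        /* Count what is live at this point, then step the
                         * liveness state backwards across the instruction */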
                        unsigned count = mir_count_live(live, ctx->temp_count);
                        max_live = MAX2(max_live, count);
                        mir_liveness_ins_update(live, ins, ctx->temp_count);
                }

                free(live);
        }

        /* 16 bytes per vec4 work register */
        return DIV_ROUND_UP(max_live, 16);
}

static unsigned
mir_work_heuristic(compiler_context *ctx)
{
        unsigned uniform_count = mir_promoteable_uniform_count(ctx);

        /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
         * allow as many work registers as needed */

        if (uniform_count <= 8)
                return 16;

        /* Otherwise, estimate the register pressure */

        unsigned pressure = mir_estimate_pressure(ctx);

        /* Prioritize not spilling above all else. The relation between the
         * pressure estimate and the actual register pressure is a little
         * murkier than we might like (due to scheduling, pipeline registers,
         * failure to pack vector registers, load/store registers, texture
         * registers...), which is why this is a heuristic parameter */

        if (pressure > 6)
                return 16;

        /* If there's no chance of spilling, prioritize UBOs and thread count */

        return 8;
}

/* Bitset of indices that will be used as a special register -- inputs to a
 * non-ALU op. We precompute this set so that testing is efficient; otherwise
 * we end up with O(mn) behaviour for n instructions and m uniform reads */

static BITSET_WORD *
mir_special_indices(compiler_context *ctx)
{
        mir_compute_temp_count(ctx);
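        /* Size the bitset by the freshly computed temp count */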
        BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));

        mir_foreach_instr_global(ctx, ins) {
                /* Look for special instructions */
                bool is_ldst = ins->type == TAG_LOAD_STORE_4;
                bool is_tex = ins->type == TAG_TEXTURE_4;
                bool is_writeout = ins->compact_branch && ins->writeout;

                if (!(is_ldst || is_tex || is_writeout))
                        continue;

                /* Anything read by a special instruction is itself special */
                mir_foreach_src(ins, i) {
                        unsigned idx = ins->src[i];

                        if (idx < ctx->temp_count)
                                BITSET_SET(bset, idx);
                }
        }

        return bset;
}

void
midgard_promote_uniforms(compiler_context *ctx)
{
        unsigned work_count = mir_work_heuristic(ctx);
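        /* Slots not claimed as work registers can hold promoted uniforms */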
        unsigned promoted_count = 24 - work_count;

        /* First, figure out special indices a priori so we don't recompute a lot */
        BITSET_WORD *special = mir_special_indices(ctx);

        mir_foreach_instr_global_safe(ctx, ins) {
                if (!mir_is_promoteable_ubo(ins)) continue;

                /* Byte offset into the UBO; uniform slots are vec4s, 16 bytes each */
                unsigned off = ins->constants.u32[0];
                unsigned address = off / 16;

                /* Uniforms count down from r23, so slot N lives in register 23 - N */
                unsigned uniform_reg = 23 - address;

                /* Check that it's in the promotable range */
                if (address >= promoted_count) continue;

                /* It is, great! Let's promote */

                /* Record how deep into the uniform file we reach, and grab
                 * the fixed register backing this slot */
                ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1);
                unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);

                /* We do need the move for safety with a non-SSA dest, if
                 * we feed a special class, or if we're the second blend
                 * source (ctx->blend_src1) */

                bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1;

                if (ins->dest < ctx->temp_count)
                        needs_move |= BITSET_TEST(special, ins->dest);

                if (needs_move) {
                        unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
                        midgard_instruction mov = v_mov(promoted, ins->dest);
                        mov.dest_type = nir_type_uint | type_size;
                        mov.src_types[1] = mov.dest_type;

                        /* Preserve exactly the bytes the load wrote: round
                         * its bytemask up to the mov's type size */
                        uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
                        mir_set_bytemask(&mov, rounded);
                        mir_insert_instruction_before(ctx, ins, mov);
                } else {
                        /* SSA destination: rewrite readers to use the
                         * uniform register directly, no copy needed */
                        mir_rewrite_index_src(ctx, ins->dest, promoted);
                }

                /* Either way, the load itself is now dead */
                mir_remove_instruction(ins);
        }

        free(special);
}