vc4_program.c
/*
 * Copyright (c) 2014 Scott Mansell
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>
#include "util/format/u_format.h"
#include "util/crc32.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir_types.h"
#include "nir/tgsi_to_nir.h"
#include "vc4_context.h"
#include "vc4_qpu.h"
#include "vc4_qir.h"

static struct qreg
ntq_get_src(struct vc4_compile *c, nir_src src, int i);

static void
ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);

static int
type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

static void
resize_qreg_array(struct vc4_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

static void
ntq_emit_thrsw(struct vc4_compile *c)
{
        if (!c->fs_threaded)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
                                    c->undef, c->undef));
        c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
}

static struct qreg
indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
{
        struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);

        /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
        uint32_t range = nir_intrinsic_range(intr);
        indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
        indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
                                        qir_uniform_ui(c, range - 4));

        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
                     indirect_offset,
                     qir_uniform(c, QUNIFORM_UBO0_ADDR,
                                 nir_intrinsic_base(intr)));

        c->num_texture_samples++;

        ntq_emit_thrsw(c);

        return qir_TEX_RESULT(c);
}

static struct qreg
vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
{
        int buffer_index = nir_src_as_uint(intr->src[0]);
        assert(buffer_index == 1);
        assert(c->stage == QSTAGE_FRAG);

        struct qreg offset = ntq_get_src(c, intr->src[1], 0);

        /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
        offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));
        offset = qir_MIN_NOIMM(c, offset,
                               qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));

        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
                     offset,
                     qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));

        c->num_texture_samples++;

        ntq_emit_thrsw(c);

        return qir_TEX_RESULT(c);
}
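
/* Note on the two loads above: general memory reads on VC4 go through the
 * texture unit.  The recipe is always the same: clamp the byte offset so
 * the kernel's command validator can prove the access stays inside the
 * buffer, add the buffer's base address into the direct TMU address
 * register (QFILE_TEX_S_DIRECT), thread-switch to hide the latency, and
 * collect the 32-bit result with qir_TEX_RESULT().
 */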

nir_ssa_def *
vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
{
        switch (swiz) {
        default:
        case PIPE_SWIZZLE_NONE:
                fprintf(stderr, "warning: unknown swizzle\n");
                /* FALLTHROUGH */
        case PIPE_SWIZZLE_0:
                return nir_imm_float(b, 0.0);
        case PIPE_SWIZZLE_1:
                return nir_imm_float(b, 1.0);
        case PIPE_SWIZZLE_X:
        case PIPE_SWIZZLE_Y:
        case PIPE_SWIZZLE_Z:
        case PIPE_SWIZZLE_W:
                return srcs[swiz];
        }
}

static struct qreg *
ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
{
        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                          def->num_components);
        _mesa_hash_table_insert(c->def_ht, def, qregs);
        return qregs;
}

/**
 * This function is responsible for getting QIR results into the associated
 * storage for a NIR instruction.
 *
 * If it's a NIR SSA def, then we just set the associated hash table entry to
 * the new result.
 *
 * If it's a NIR reg, then we need to update the existing qreg assigned to the
 * NIR destination with the incoming value.  To do that without introducing
 * new MOVs, we require that the incoming qreg either be a uniform, or be
 * SSA-defined by the previous QIR instruction in the block and rewritable by
 * this function.  That lets us sneak ahead and insert the SF flag beforehand
 * (knowing that the previous instruction doesn't depend on flags) and rewrite
 * its destination to be the NIR reg's destination.
 */
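/*
 * A rough sketch of the reg case (pseudo-QIR, not actual dump syntax): if
 * the previous instruction was "t5 = fadd t1, t2" and the NIR reg lives in
 * t9, we retarget it to "t9 = fadd t1, t2".  Inside control flow we first
 * emit SF on the exec mask and make the rewritten write conditional, so
 * only active channels update t9.
 */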
static void
ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
               struct qreg result)
{
        struct qinst *last_inst = NULL;
        if (!list_is_empty(&c->cur_block->instructions))
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

        assert(result.file == QFILE_UNIF ||
               (result.file == QFILE_TEMP &&
                last_inst && last_inst == c->defs[result.index]));

        if (dest->is_ssa) {
                assert(chan < dest->ssa.num_components);

                struct qreg *qregs;
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, &dest->ssa);

                if (entry)
                        qregs = entry->data;
                else
                        qregs = ntq_init_ssa_def(c, &dest->ssa);

                qregs[chan] = result;
        } else {
                nir_register *reg = dest->reg.reg;
                assert(dest->reg.base_offset == 0);
                assert(reg->num_array_elems == 0);
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                struct qreg *qregs = entry->data;

                /* Insert a MOV if the source wasn't an SSA def in the
                 * previous instruction.
                 */
                if (result.file == QFILE_UNIF) {
                        result = qir_MOV(c, result);
                        last_inst = c->defs[result.index];
                }

                /* We know they're both temps, so just rewrite index. */
                c->defs[last_inst->dst.index] = NULL;
                last_inst->dst.index = qregs[chan].index;

                /* If we're in control flow, then make this update of the reg
                 * conditional on the execution mask.
                 */
                if (c->execute.file != QFILE_NULL) {
                        last_inst->dst.index = qregs[chan].index;

                        /* Set the flags to the current exec mask.  To insert
                         * the SF, we temporarily remove our SSA instruction.
                         */
                        list_del(&last_inst->link);
                        qir_SF(c, c->execute);
                        list_addtail(&last_inst->link,
                                     &c->cur_block->instructions);

                        last_inst->cond = QPU_COND_ZS;
                        last_inst->cond_is_exec_mask = true;
                }
        }
}

static struct qreg *
ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
{
        if (dest->is_ssa) {
                struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa);
                for (int i = 0; i < dest->ssa.num_components; i++)
                        qregs[i] = c->undef;
                return qregs;
        } else {
                nir_register *reg = dest->reg.reg;
                assert(dest->reg.base_offset == 0);
                assert(reg->num_array_elems == 0);
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                return entry->data;
        }
}

static struct qreg
ntq_get_src(struct vc4_compile *c, nir_src src, int i)
{
        struct hash_entry *entry;
        if (src.is_ssa) {
                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                assert(i < src.ssa->num_components);
        } else {
                nir_register *reg = src.reg.reg;
                entry = _mesa_hash_table_search(c->def_ht, reg);
                assert(reg->num_array_elems == 0);
                assert(src.reg.base_offset == 0);
                assert(i < reg->num_components);
        }

        struct qreg *qregs = entry->data;
        return qregs[i];
}

static struct qreg
ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
                unsigned src)
{
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        unsigned chan = ffs(instr->dest.write_mask) - 1;
        struct qreg r = ntq_get_src(c, instr->src[src].src,
                                    instr->src[src].swizzle[chan]);

        assert(!instr->src[src].abs);
        assert(!instr->src[src].negate);

        return r;
}

static inline struct qreg
qir_SAT(struct vc4_compile *c, struct qreg val)
{
        return qir_FMAX(c,
                        qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
                        qir_uniform_f(c, 0.0));
}

static struct qreg
ntq_rcp(struct vc4_compile *c, struct qreg x)
{
        struct qreg r = qir_RCP(c, x);

        /* Apply a Newton-Raphson step to improve the accuracy. */
        r = qir_FMUL(c, r, qir_FSUB(c,
                                    qir_uniform_f(c, 2.0),
                                    qir_FMUL(c, x, r)));

        return r;
}
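
/* The step above is standard Newton-Raphson for f(r) = 1/r - x, whose
 * update is r' = r * (2 - x * r); each step roughly doubles the number of
 * correct bits in the hardware's initial estimate.
 */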

static struct qreg
ntq_rsq(struct vc4_compile *c, struct qreg x)
{
        struct qreg r = qir_RSQ(c, x);

        /* Apply a Newton-Raphson step to improve the accuracy. */
        r = qir_FMUL(c, r, qir_FSUB(c,
                                    qir_uniform_f(c, 1.5),
                                    qir_FMUL(c,
                                             qir_uniform_f(c, 0.5),
                                             qir_FMUL(c, x,
                                                      qir_FMUL(c, r, r)))));

        return r;
}
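
/* Likewise, r' = r * (1.5 - 0.5 * x * r * r) is the Newton-Raphson update
 * for f(r) = 1/r^2 - x, i.e. the reciprocal square root.
 */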

static struct qreg
ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
{
        struct qreg src0_hi = qir_SHR(c, src0,
                                      qir_uniform_ui(c, 24));
        struct qreg src1_hi = qir_SHR(c, src1,
                                      qir_uniform_ui(c, 24));

        struct qreg hilo = qir_MUL24(c, src0_hi, src1);
        struct qreg lohi = qir_MUL24(c, src0, src1_hi);
        struct qreg lolo = qir_MUL24(c, src0, src1);

        return qir_ADD(c, lolo, qir_SHL(c,
                                        qir_ADD(c, hilo, lohi),
                                        qir_uniform_ui(c, 24)));
}
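
/* Why this works: MUL24 multiplies the low 24 bits of its operands.  With
 * a = (a_hi << 24) + a_lo and b = (b_hi << 24) + b_lo:
 *
 *     a * b = a_lo * b_lo + ((a_hi * b_lo + a_lo * b_hi) << 24)  (mod 2^32)
 *
 * The a_hi * b_hi term shifts past bit 31 and drops out, and since each
 * _hi value fits in 8 bits, MUL24(src0_hi, src1) computes exactly
 * a_hi * b_lo (and likewise for the other cross term).
 */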

static struct qreg
ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
{
        struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
                                                 qir_uniform_ui(c, 8)));
        return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
}
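
/* The TMU returns depth as a 24-bit value in the top bits of the 32-bit
 * word, so shift off the low byte and rescale [0, 0xffffff] to [0.0, 1.0].
 */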

/**
 * Emits a lowered TXF_MS from an MSAA texture.
 *
 * The addressing math has been lowered in NIR, and now we just need to read
 * it like a UBO.
 */
static void
ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
{
        uint32_t tile_width = 32;
        uint32_t tile_height = 32;
        uint32_t tile_size = (tile_height * tile_width *
                              VC4_MAX_SAMPLES * sizeof(uint32_t));

        unsigned unit = instr->texture_index;
        uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
        uint32_t w_tiles = w / tile_width;
        uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
        uint32_t h_tiles = h / tile_height;
        uint32_t size = w_tiles * h_tiles * tile_size;

        struct qreg addr;
        assert(instr->num_srcs == 1);
        assert(instr->src[0].src_type == nir_tex_src_coord);
        addr = ntq_get_src(c, instr->src[0].src, 0);

        /* Perform the clamping required by kernel validation. */
        addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
        addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));

        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
                     addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));

        ntq_emit_thrsw(c);

        struct qreg tex = qir_TEX_RESULT(c);
        c->num_texture_samples++;

        enum pipe_format format = c->key->tex[unit].format;
        if (util_format_is_depth_or_stencil(format)) {
                struct qreg scaled = ntq_scale_depth_texture(c, tex);
                for (int i = 0; i < 4; i++)
                        ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));
        } else {
                for (int i = 0; i < 4; i++)
                        ntq_store_dest(c, &instr->dest, i,
                                       qir_UNPACK_8_F(c, tex, i));
        }
}
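
/* For scale, with VC4_MAX_SAMPLES == 4 a single 32x32 tile of RGBA8
 * samples is 32 * 32 * 4 * 4 = 16 KB, so e.g. a 64x33 MSAA color buffer
 * rounds up to 2x2 tiles and the clamp above limits the fetch to a 64 KB
 * window.
 */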

static void
ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
{
        struct qreg s, t, r, lod, compare;
        bool is_txb = false, is_txl = false;
        unsigned unit = instr->texture_index;

        if (instr->op == nir_texop_txf) {
                ntq_emit_txf(c, instr);
                return;
        }

        for (unsigned i = 0; i < instr->num_srcs; i++) {
                switch (instr->src[i].src_type) {
                case nir_tex_src_coord:
                        s = ntq_get_src(c, instr->src[i].src, 0);
                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
                                t = qir_uniform_f(c, 0.5);
                        else
                                t = ntq_get_src(c, instr->src[i].src, 1);
                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
                                r = ntq_get_src(c, instr->src[i].src, 2);
                        break;
                case nir_tex_src_bias:
                        lod = ntq_get_src(c, instr->src[i].src, 0);
                        is_txb = true;
                        break;
                case nir_tex_src_lod:
                        lod = ntq_get_src(c, instr->src[i].src, 0);
                        is_txl = true;
                        break;
                case nir_tex_src_comparator:
                        compare = ntq_get_src(c, instr->src[i].src, 0);
                        break;
                default:
                        unreachable("unknown texture source");
                }
        }

        if (c->stage != QSTAGE_FRAG && !is_txl) {
                /* From the GLSL 1.20 spec:
                 *
                 *     "If it is mip-mapped and running on the vertex shader,
                 *      then the base texture is used."
                 */
                is_txl = true;
                lod = qir_uniform_ui(c, 0);
        }

        if (c->key->tex[unit].force_first_level) {
                lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
                is_txl = true;
                is_txb = false;
        }

        struct qreg texture_u[] = {
                qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
                qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
                qir_uniform(c, QUNIFORM_CONSTANT, 0),
                qir_uniform(c, QUNIFORM_CONSTANT, 0),
        };
        uint32_t next_texture_u = 0;

        /* There is no native support for GL texture rectangle coordinates, so
         * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
         * 1]).
         */
        if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
                s = qir_FMUL(c, s,
                             qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, unit));
                t = qir_FMUL(c, t,
                             qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, unit));
        }

        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
                texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
                                           unit | (is_txl << 16));
        }

        struct qinst *tmu;
        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
                tmu->src[qir_get_tex_uniform_src(tmu)] =
                        texture_u[next_texture_u++];
        } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
                   c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
                   c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
                   c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
                tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
                                   qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
                                               unit));
                tmu->src[qir_get_tex_uniform_src(tmu)] =
                        texture_u[next_texture_u++];
        }

        if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
                s = qir_SAT(c, s);
        }

        if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
                t = qir_SAT(c, t);
        }

        tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
        tmu->src[qir_get_tex_uniform_src(tmu)] =
                texture_u[next_texture_u++];

        if (is_txl || is_txb) {
                tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
                tmu->src[qir_get_tex_uniform_src(tmu)] =
                        texture_u[next_texture_u++];
        }

        tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
        tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];

        c->num_texture_samples++;

        ntq_emit_thrsw(c);

        struct qreg tex = qir_TEX_RESULT(c);

        enum pipe_format format = c->key->tex[unit].format;

        struct qreg *dest = ntq_get_dest(c, &instr->dest);
        if (util_format_is_depth_or_stencil(format)) {
                struct qreg normalized = ntq_scale_depth_texture(c, tex);
                struct qreg depth_output;

                struct qreg u0 = qir_uniform_f(c, 0.0f);
                struct qreg u1 = qir_uniform_f(c, 1.0f);
                if (c->key->tex[unit].compare_mode) {
                        /* From the GL_ARB_shadow spec:
                         *
                         *     "Let Dt (D subscript t) be the depth texture
                         *      value, in the range [0, 1].  Let R be the
                         *      interpolated texture coordinate clamped to the
                         *      range [0, 1]."
                         */
                        compare = qir_SAT(c, compare);

                        switch (c->key->tex[unit].compare_func) {
                        case PIPE_FUNC_NEVER:
                                depth_output = qir_uniform_f(c, 0.0f);
                                break;
                        case PIPE_FUNC_ALWAYS:
                                depth_output = u1;
                                break;
                        case PIPE_FUNC_EQUAL:
                                qir_SF(c, qir_FSUB(c, compare, normalized));
                                depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
                                break;
                        case PIPE_FUNC_NOTEQUAL:
                                qir_SF(c, qir_FSUB(c, compare, normalized));
                                depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
                                break;
                        case PIPE_FUNC_GREATER:
                                qir_SF(c, qir_FSUB(c, compare, normalized));
                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                break;
                        case PIPE_FUNC_GEQUAL:
                                qir_SF(c, qir_FSUB(c, normalized, compare));
                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                break;
                        case PIPE_FUNC_LESS:
                                qir_SF(c, qir_FSUB(c, compare, normalized));
                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                break;
                        case PIPE_FUNC_LEQUAL:
                                qir_SF(c, qir_FSUB(c, normalized, compare));
                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                break;
                        }
                } else {
                        depth_output = normalized;
                }

                for (int i = 0; i < 4; i++)
                        dest[i] = depth_output;
        } else {
                for (int i = 0; i < 4; i++)
                        dest[i] = qir_UNPACK_8_F(c, tex, i);
        }
}
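
/* Note the TMU register write order above: R (cube face or border color),
 * T, B (bias/LOD), then S last -- on VC4 it's the write of the S
 * coordinate that actually triggers the texture fetch.
 */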

/**
 * Computes x - floor(x), which is tricky because our FTOI truncates (rounds
 * to zero).
 */
static struct qreg
ntq_ffract(struct vc4_compile *c, struct qreg src)
{
        struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
        struct qreg diff = qir_FSUB(c, src, trunc);
        qir_SF(c, diff);

        qir_FADD_dest(c, diff,
                      diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;

        return qir_MOV(c, diff);
}
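
/* Example: for src = -0.25, trunc = 0.0 and diff = -0.25; the sign flag is
 * set, so 1.0 is added and we return 0.75 == -0.25 - floor(-0.25).
 */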

/**
 * Computes floor(x), which is tricky because our FTOI truncates (rounds to
 * zero).
 */
static struct qreg
ntq_ffloor(struct vc4_compile *c, struct qreg src)
{
        struct qreg result = qir_ITOF(c, qir_FTOI(c, src));

        /* This will be < 0 if we truncated and the truncation was of a value
         * that was < 0 in the first place.
         */
        qir_SF(c, qir_FSUB(c, src, result));

        struct qinst *sub = qir_FSUB_dest(c, result,
                                          result, qir_uniform_f(c, 1.0));
        sub->cond = QPU_COND_NS;

        return qir_MOV(c, result);
}
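
/* Example: for src = -1.5, FTOI rounds to -1 and result = -1.0;
 * src - result = -0.5 is negative, so 1.0 is subtracted and we return -2.0.
 */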

/**
 * Computes ceil(x), which is tricky because our FTOI truncates (rounds to
 * zero).
 */
static struct qreg
ntq_fceil(struct vc4_compile *c, struct qreg src)
{
        struct qreg result = qir_ITOF(c, qir_FTOI(c, src));

        /* This will be < 0 if we truncated and the truncation was of a value
         * that was > 0 in the first place.
         */
        qir_SF(c, qir_FSUB(c, result, src));

        qir_FADD_dest(c, result,
                      result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;

        return qir_MOV(c, result);
}
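
/* Example: for src = 1.25, FTOI rounds to 1 and result = 1.0;
 * result - src = -0.25 is negative, so 1.0 is added and we return 2.0.
 */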

static struct qreg
ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
{
        /* Since we're using a Taylor approximation, we want to have a small
         * number of coefficients and take advantage of sin/cos repeating
         * every 2pi.  We keep our x as close to 0 as we can, since the series
         * will be less accurate as |x| increases.  (Also, be careful of
         * shifting the input x value to be tricky with sin/cos relations,
         * because getting accurate values for x==0 is very important for SDL
         * rendering)
         */
        struct qreg scaled_x =
                qir_FMUL(c, x,
                         qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
        /* Note: FTOI truncates toward 0. */
        struct qreg x_frac = qir_FSUB(c, scaled_x,
                                      qir_ITOF(c, qir_FTOI(c, scaled_x)));
        /* Map [0.5, 1] to [-0.5, 0] */
        qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
        qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
        /* Map [-1, -0.5] to [0, 0.5] */
        qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
        qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;

        return x_frac;
}
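
/* The result is the input angle expressed in turns and reduced to
 * [-0.5, 0.5].  Example: x = 3*pi scales to 1.5 turns, whose fractional
 * part is 0.5; the first fixup maps that to -0.5, and indeed
 * sin(2*pi * -0.5) == sin(3*pi) == 0.
 */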

static struct qreg
ntq_fsin(struct vc4_compile *c, struct qreg src)
{
        float coeff[] = {
                2.0 * M_PI,
                -pow(2.0 * M_PI, 3) / (3 * 2 * 1),
                pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
                -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
                pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
        };

        struct qreg x = ntq_shrink_sincos_input_range(c, src);
        struct qreg x2 = qir_FMUL(c, x, x);
        struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
        for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
                x = qir_FMUL(c, x, x2);
                sum = qir_FADD(c,
                               sum,
                               qir_FMUL(c,
                                        x,
                                        qir_uniform_f(c, coeff[i])));
        }
        return sum;
}
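
/* The coefficients above are the Taylor expansion of sin(2*pi*t) about
 * t = 0, sin(2*pi*t) = (2*pi)*t - (2*pi)^3*t^3/3! + (2*pi)^5*t^5/5! - ...,
 * evaluated with |t| <= 0.5 thanks to the range reduction.
 */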

static struct qreg
ntq_fcos(struct vc4_compile *c, struct qreg src)
{
        float coeff[] = {
                1.0f,
                -pow(2.0 * M_PI, 2) / (2 * 1),
                pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
                -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
                pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
                -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
        };

        struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
        struct qreg sum = qir_uniform_f(c, coeff[0]);
        struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
        struct qreg x = x2; /* Current x^2, x^4, or x^6 */
        for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
                if (i != 1)
                        x = qir_FMUL(c, x, x2);

                sum = qir_FADD(c, qir_FMUL(c,
                                           x,
                                           qir_uniform_f(c, coeff[i])),
                               sum);
        }
        return sum;
}
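
/* Same idea for cosine: cos(2*pi*t) = 1 - (2*pi)^2*t^2/2! +
 * (2*pi)^4*t^4/4! - ..., using only even powers of t.
 */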

static struct qreg
ntq_fsign(struct vc4_compile *c, struct qreg src)
{
        struct qreg t = qir_get_temp(c);

        qir_SF(c, src);
        qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
        qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
        qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
        return qir_MOV(c, t);
}
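
/* A single SF feeds all three conditional moves: ZC (non-zero) promotes
 * the unconditional 0.0 to 1.0, then NS (negative) overrides that with
 * -1.0, so exact zeroes fall through with 0.0.
 */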

static void
emit_vertex_input(struct vc4_compile *c, int attr)
{
        enum pipe_format format = c->vs_key->attr_formats[attr];
        uint32_t attr_size = util_format_get_blocksize(format);

        c->vattr_sizes[attr] = align(attr_size, 4);
        for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                c->inputs[attr * 4 + i] =
                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
                c->num_inputs++;
        }
}

static void
emit_fragcoord_input(struct vc4_compile *c, int attr)
{
        c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
        c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
        c->inputs[attr * 4 + 2] =
                qir_FMUL(c,
                         qir_ITOF(c, qir_FRAG_Z(c)),
                         qir_uniform_f(c, 1.0 / 0xffffff));
        c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
}
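
/* gl_FragCoord.z is reconstructed from the hardware's 24-bit integer Z
 * (hence the 1/0xffffff scale), and gl_FragCoord.w is the reciprocal of
 * the incoming W computed through the SFU.
 */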

static struct qreg
emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
                      uint8_t swizzle)
{
        uint32_t i = c->num_input_slots++;
        struct qreg vary = {
                QFILE_VARY,
                i
        };

        if (c->num_input_slots >= c->input_slots_array_size) {
                c->input_slots_array_size =
                        MAX2(4, c->input_slots_array_size * 2);

                c->input_slots = reralloc(c, c->input_slots,
                                          struct vc4_varying_slot,
                                          c->input_slots_array_size);
        }

        c->input_slots[i].slot = slot;
        c->input_slots[i].swizzle = swizzle;

        return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
}

static void
emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
{
        for (int i = 0; i < 4; i++) {
                c->inputs[attr * 4 + i] =
                        emit_fragment_varying(c, slot, i);
                c->num_inputs++;
        }
}

static void
add_output(struct vc4_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
{
        uint32_t old_array_size = c->outputs_array_size;
        resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
                          decl_offset + 1);

        if (old_array_size != c->outputs_array_size) {
                c->output_slots = reralloc(c,
                                           c->output_slots,
                                           struct vc4_varying_slot,
                                           c->outputs_array_size);
        }

        c->output_slots[decl_offset].slot = slot;
        c->output_slots[decl_offset].swizzle = swizzle;
}

static bool
ntq_src_is_only_ssa_def_user(nir_src *src)
{
        if (!src->is_ssa)
                return false;

        if (!list_is_empty(&src->ssa->if_uses))
                return false;

        return (src->ssa->uses.next == &src->use_link &&
                src->ssa->uses.next->next == &src->ssa->uses);
}

/**
 * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
 * bit set.
 *
 * However, as an optimization, it tries to find the instructions generating
 * the sources to be packed and just emit the pack flag there, if possible.
 */
static void
ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
{
        struct qreg result = qir_get_temp(c);
        struct nir_alu_instr *vec4 = NULL;

        /* If packing from a vec4 op (as expected), identify it so that we can
         * peek back at what generated its sources.
         */
        if (instr->src[0].src.is_ssa &&
            instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
            nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
            nir_op_vec4) {
                vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
        }

        /* If the pack is replicating the same channel 4 times, use the 8888
         * pack flag.  This is common for blending using the alpha
         * channel.
         */
        if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
            instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
            instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
                struct qreg rep = ntq_get_src(c,
                                              instr->src[0].src,
                                              instr->src[0].swizzle[0]);
                ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
                return;
        }

        for (int i = 0; i < 4; i++) {
                int swiz = instr->src[0].swizzle[i];
                struct qreg src;
                if (vec4) {
                        src = ntq_get_src(c, vec4->src[swiz].src,
                                          vec4->src[swiz].swizzle[0]);
                } else {
                        src = ntq_get_src(c, instr->src[0].src, swiz);
                }

                if (vec4 &&
                    ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
                    src.file == QFILE_TEMP &&
                    c->defs[src.index] &&
                    qir_is_mul(c->defs[src.index]) &&
                    !c->defs[src.index]->dst.pack) {
                        struct qinst *rewrite = c->defs[src.index];
                        c->defs[src.index] = NULL;
                        rewrite->dst = result;
                        rewrite->dst.pack = QPU_PACK_MUL_8A + i;
                        continue;
                }

                qir_PACK_8_F(c, result, src, i);
        }

        ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
}
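
/* The dst.pack rewrite above leans on the MUL unit's pack modes:
 * QPU_PACK_MUL_8A through 8D are consecutive, so QPU_PACK_MUL_8A + i
 * converts the float result to unorm8 and deposits it in byte i of the
 * destination, letting the multiply that produced the channel also do
 * the pack for free.
 */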

/** Handles sign-extended bitfield extracts for 16 bits. */
static struct qreg
ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
              struct qreg bits)
{
        assert(bits.file == QFILE_UNIF &&
               c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
               c->uniform_data[bits.index] == 16);

        assert(offset.file == QFILE_UNIF &&
               c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
        int offset_bit = c->uniform_data[offset.index];
        assert(offset_bit % 16 == 0);

        return qir_UNPACK_16_I(c, base, offset_bit / 16);
}

/** Handles unsigned bitfield extracts for 8 bits. */
static struct qreg
ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
              struct qreg bits)
{
        assert(bits.file == QFILE_UNIF &&
               c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
               c->uniform_data[bits.index] == 8);

        assert(offset.file == QFILE_UNIF &&
               c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
        int offset_bit = c->uniform_data[offset.index];
        assert(offset_bit % 8 == 0);

        return qir_UNPACK_8_I(c, base, offset_bit / 8);
}
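
/* These handle only the constant offset/width combinations the QPU can
 * unpack directly: e.g. ubfe(x, 16, 8) becomes UNPACK_8_I(x, 2), "take
 * byte 2 of x", and ibfe(x, 16, 16) becomes UNPACK_16_I(x, 1).  The
 * asserts capture what the rest of the compiler is expected to feed us.
 */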

/**
 * If compare_instr is a valid comparison instruction, emits the
 * compare_instr's comparison and returns the sel_instr's return value based
 * on the compare_instr's result.
 */
static bool
ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
                    nir_alu_instr *compare_instr,
                    nir_alu_instr *sel_instr)
{
        enum qpu_cond cond;

        switch (compare_instr->op) {
        case nir_op_feq32:
        case nir_op_ieq32:
        case nir_op_seq:
                cond = QPU_COND_ZS;
                break;
        case nir_op_fne32:
        case nir_op_ine32:
        case nir_op_sne:
                cond = QPU_COND_ZC;
                break;
        case nir_op_fge32:
        case nir_op_ige32:
        case nir_op_uge32:
        case nir_op_sge:
                cond = QPU_COND_NC;
                break;
        case nir_op_flt32:
        case nir_op_ilt32:
        case nir_op_slt:
                cond = QPU_COND_NS;
                break;
        default:
                return false;
        }

        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
        struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);

        unsigned unsized_type =
                nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
        if (unsized_type == nir_type_float)
                qir_SF(c, qir_FSUB(c, src0, src1));
        else
                qir_SF(c, qir_SUB(c, src0, src1));

        switch (sel_instr->op) {
        case nir_op_seq:
        case nir_op_sne:
        case nir_op_sge:
        case nir_op_slt:
                *dest = qir_SEL(c, cond,
                                qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
                break;

        case nir_op_b32csel:
                *dest = qir_SEL(c, cond,
                                ntq_get_alu_src(c, sel_instr, 1),
                                ntq_get_alu_src(c, sel_instr, 2));
                break;

        default:
                *dest = qir_SEL(c, cond,
                                qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
                break;
        }

        /* Make the temporary for nir_store_dest(). */
        *dest = qir_MOV(c, *dest);

        return true;
}
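
/* All of these comparisons lower the same way: subtract the operands (FSUB
 * for floats, SUB for integers) with SF set to update the flags, then SEL
 * between the two possible results under the condition code chosen above.
 */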

/**
 * Attempts to fold a comparison generating a boolean result into the
 * condition code for selecting between two values, instead of comparing the
 * boolean result against 0 to generate the condition code.
 */
static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
                                  struct qreg *src)
{
        if (!instr->src[0].src.is_ssa)
                goto out;