/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"

static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
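   /* Unreachable in practice; this return just appeases compilers that
    * can't tell the switch above is exhaustive.
    */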
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

static struct brw_reg
brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
                    fs_reg *reg, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
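         /* For instance, a float (4-byte) region with stride 1 gives
          * reg_width = 32 / (1 * 4) = 8 channels per GRF.
          */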

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
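         /* The region programmed below is <width*stride; width, stride>;
          * e.g. width 8 with stride 1 gives the canonical <8;8,1>, and
          * width 4 with stride 2 gives <8;4,2>.
          */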
         const unsigned width = MIN2(reg_width, phys_width);
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
             *  "Each DF (Double Float) operand uses an element size of 4 rather
             *   than 8 and all regioning parameters are twice what the values
             *   would be based on the true element size: ExecSize, Width,
             *   HorzStride, and VertStride. Each DF operand uses a pair of
             *   channels and all masking and swizzling should be adjusted
             *   appropriately."
             *
             * From the IvyBridge PRM (Special Requirements for Handling Double
             * Precision Data Types, page 71):
             *  "In Align1 mode, all regioning parameters like stride, execution
             *   size, and width must use the syntax of a pair of packed
             *   floats. The offsets for these data types must be 64-bit
             *   aligned. The execution size and regioning parameters are in terms
             *   of floats."
             *
             * Summarized: when handling DF-typed arguments, ExecSize,
             * VertStride, and Width must be doubled.
             *
             * It applies to BayTrail too.
             */
            if (type_sz(reg->type) == 8) {
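               /* The width and vstride fields are encoded logarithmically,
                * so bumping each field by one doubles the programmed value.
                */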
               brw_reg.width++;
               if (brw_reg.vstride > 0)
                  brw_reg.vstride++;
               assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
            }

            /* When converting from DF->F, we set the destination stride to 2
             * because each d2f conversion implicitly writes 2 floats, the
             * first of which is the converted value. IVB/BYT actually writes two
             * F components per SIMD channel, and every other component is
             * filled with garbage.
             */
            if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
                type_sz(inst->dst.type) < 8) {
               assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
               brw_reg.hstride--;
            }
         }
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
    * region, but on IVB and BYT DF regions must be programmed in terms of
    * floats. A <0,2,1> region accomplishes this.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       type_sz(reg->type) == 8 &&
       brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
       brw_reg.width == BRW_WIDTH_1 &&
       brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
      brw_reg.width = BRW_WIDTH_2;
      brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)
   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);

   /* In the FS code generator, we are very careful to ensure that we always
    * set the right execution size so we don't need the EU code to "help" us
    * by trying to infer it.  Sometimes, it infers the wrong thing.
    */
   p->automatic_exec_sizes = false;
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
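   /* Point this final HALT one scaled instruction ahead, i.e. at the
    * instruction immediately following it, so every channel ends up
    * halted to the same UIP.
    */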
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->group)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   /* We assume render targets start at 0, because headerless FB write
    * messages set "Render Target Index" to 0.  Using a different binding
    * table index would make it impossible to use headerless messages.
    */
   const uint32_t surf_index = inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);

   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is 2 registers: g0 and g1 are its contents. g0 is handled
    * by the implied move; here we set up g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
	 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
	 brw_MOV(p,
		 retype(payload, BRW_REGISTER_TYPE_UD),
		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
		   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
		   brw_imm_ud(0x1 << 11));
         }

	 if (inst->target > 0) {
	    /* Set the render target index for choosing BLEND_STATE. */
	    brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
		    brw_imm_ud(inst->target));
	 }

         /* Set the bit indicating that the shader computes stencil for the
          * render target.
          */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

	 implied_header = brw_null_reg();
      } else {
	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_pop_insn_state(p);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                               struct brw_reg payload)
{
   assert(inst->size_written % REG_SIZE == 0);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   /* We assume that render targets start at binding table index 0. */
   const unsigned surf_index = inst->target;

   gen9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);
   assert(reg.type == dst.type);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size <= 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
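      /* Reading only the low 16 bits of each DWord offset is sufficient
       * here: GRF byte offsets fit in 16 bits (128 GRFs * 32 bytes = 4096).
       */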
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless.  We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset.  Instead, for the sake of consistency, we'll just do the
       * add ourselves.  This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work.  It's just not worth it.
       */
      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));

      if (type_sz(reg.type) > 4 &&
          ((devinfo->gen == 7 && !devinfo->is_haswell) ||
           devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
         /* IVB has an issue (which we found empirically) where it reads two
          * address register components per channel for indirectly addressed
          * 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be used."
          *
          * To work around both of these, we do two integer MOVs instead of one
          * 64-bit MOV.  Because no double value should ever cross a register
          * boundary, it's safe to use the immediate offset in the indirect
          * here to handle adding 4 bytes to the offset and avoid the extra
          * ADD to the register file.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

         if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
             !inst->get_next()->is_tail_sentinel() &&
             ((fs_inst *)inst->get_next())->mlen > 0) {
            /* From the Sandybridge PRM:
             *
             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
             *    instruction that “indexed/indirect” source AND is followed
             *    by a send, the instruction requires a “Switch”. This is to
             *    avoid race condition where send may dispatch before MRF is
             *    updated."
             */
            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
         }
      }
   }
}

void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

    /* WaClearTDRRegBeforeEOTForNonPS.
     *
     *   WA: Clear tdr register before send EOT in all non-PS shader kernels
     *
     *   mov(8) tdr0:ud 0x0:ud {NoMask}
     */
   if (inst->eot && p->devinfo->gen == 10) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, brw_tdr_reg(), brw_imm_uw(0));
      brw_pop_insn_state(p);
   }

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
			     struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];

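   /* Pre-Gen7, PLN reads a two-GRF aligned register pair, so we only use
    * it when the barycentric source starts on an even register number.
    */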
   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
	 if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered in NIR */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
	 unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 /* Note that G45 and older determine shadow compare and dispatch width
	  * from message length for most messages.
	  */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
	 assert(inst->mlen == 7 || inst->mlen == 10);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
	 break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      case SHADER_OPCODE_TXS:
	 assert(inst->mlen == 3);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      default:
	 unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         } else if (stage != MESA_SHADER_VERTEX &&
                    stage != MESA_SHADER_FRAGMENT) {
            /* The vertex and fragment stages have g0.2 set to 0, so
             * header0.2 is 0 when g0 is copied. Other stages may not, so we
             * must set it to 0 to avoid setting undesirable bits in the
             * message.
             */
            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->size_written / REG_SIZE,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
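         /* surface == sampler here, so multiplying the shared index by
          * 0x101 computes (index << 8) | index, filling both the sampler
          * and surface fields of the descriptor in one instruction.
          */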
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
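      /* The surface (bits 7:0) and sampler (bits 11:8) indices occupy the
       * low 12 bits of the descriptor, hence the 0xfff mask.
       */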
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->size_written / REG_SIZE,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* The visitor knows more than we do about the surface limit
       * required, so it has already done the marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                     DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

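   /* src0 starts one float into the region (subnr 1) while src1 starts at
    * the base, so a <2;2,0> region gives each pixel pair its own
    * horizontal difference, while a <4;4,0> region replicates the
    * top-left difference across the whole subspan.
    */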
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
				 BRW_REGISTER_TYPE_F,
				 vstride,
				 width,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
				 BRW_REGISTER_TYPE_F,
				 vstride,
				 width,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(devinfo->gen >= 6);