/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"

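/**
 * Translate the register file of an FS IR register to the hardware register
 * file used in the EU instruction encoding.
 */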
static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

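/**
 * Convert an FS IR register into a hardware brw_reg, deriving the region
 * parameters (vertical stride, width, horizontal stride) from the IR
 * register's stride and the instruction's execution size.
 */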
static struct brw_reg
brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
                    fs_reg *reg, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         const unsigned width = MIN2(reg_width, phys_width);
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
             *  "Each DF (Double Float) operand uses an element size of 4 rather
             *   than 8 and all regioning parameters are twice what the values
             *   would be based on the true element size: ExecSize, Width,
             *   HorzStride, and VertStride. Each DF operand uses a pair of
             *   channels and all masking and swizzling should be adjusted
             *   appropriately."
             *
             * From the IvyBridge PRM (Special Requirements for Handling Double
             * Precision Data Types, page 71):
             *  "In Align1 mode, all regioning parameters like stride, execution
             *   size, and width must use the syntax of a pair of packed
             *   floats. The offsets for these data types must be 64-bit
             *   aligned. The execution size and regioning parameters are in terms
             *   of floats."
             *
             * Summarized: when handling DF-typed arguments, ExecSize,
             * VertStride, and Width must be doubled.
             *
             * It applies to BayTrail too.
             */
            if (type_sz(reg->type) == 8) {
               brw_reg.width++;
               if (brw_reg.vstride > 0)
                  brw_reg.vstride++;
               assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
            }

            /* When converting from DF->F, we set the destination stride to 2
          * because each d2f conversion implicitly writes 2 floats, the first
          * one being the converted value. IVB/BYT actually writes two
             * F components per SIMD channel, and every other component is
             * filled with garbage.
             */
            if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
                type_sz(inst->dst.type) < 8) {
               assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
               brw_reg.hstride--;
            }
         }
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
    * region, but on IVB and BYT DF regions must be programmed in terms of
    * floats. A <0,2,1> region accomplishes this.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       type_sz(reg->type) == 8 &&
       brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
       brw_reg.width == BRW_WIDTH_1 &&
       brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
      brw_reg.width = BRW_WIDTH_2;
      brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);

   /* In the FS code generator, we are very careful to ensure that we always
    * set the right execution size so we don't need the EU code to "help" us
    * by trying to infer it.  Sometimes, it infers the wrong thing.
    */
   p->automatic_exec_sizes = false;
}

fs_generator::~fs_generator()
{
}

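/**
 * Bookkeeping for the instruction-pointer offsets of the discard HALTs that
 * patch_discard_jumps_to_fb_writes() needs to fix up after code generation.
 */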
class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

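/**
 * Patch the UIP of each recorded discard HALT (see generate_discard_jump())
 * to point at the end of the program, where a final HALT is emitted just
 * before the FB write.  Returns false if there was nothing to patch.
 */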
bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

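/**
 * Emit the actual render target write SEND, picking the message control for
 * replicated-data, dual-source-blend, or single-source SIMD8/SIMD16 writes.
 */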
void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->group)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   /* We assume render targets start at 0, because headerless FB write
    * messages set "Render Target Index" to 0.  Using a different binding
    * table index would make it impossible to use headerless messages.
    */
   assert(prog_data->binding_table.render_target_start == 0);

   const uint32_t surf_index = inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

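/**
 * Set up the FB write message header (when one is present) and fire the
 * write, including, on pre-gen6, an optional runtime check for whether
 * antialiasing data needs to be sent.
 */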
void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is 2 regs; g0 and g1 are the contents.  g0 is set up by
    * the implied move, so here we set up g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
	 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
	 brw_MOV(p,
		 retype(payload, BRW_REGISTER_TYPE_UD),
		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
		   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
		   brw_imm_ud(0x1 << 11));
         }

	 if (inst->target > 0) {
	    /* Set the render target index for choosing BLEND_STATE. */
	    brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
		    brw_imm_ud(inst->target));
	 }

         /* Set computed stencil to render target */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

	 implied_header = brw_null_reg();
      } else {
	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_pop_insn_state(p);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

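/** Emit a gen9+ render target read message. */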
void
fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                               struct brw_reg payload)
{
   assert(inst->size_written % REG_SIZE == 0);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   const unsigned surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   gen9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

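/**
 * Generate a MOV whose source is addressed indirectly: a base register plus
 * a byte offset that is either an immediate or read from a GRF at runtime
 * through the address register using VxH addressing.
 */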
void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);
   assert(reg.type == dst.type);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size <= 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless.  We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset.  Instead, for the sake of consistency, we'll just do the
       * add ourselves.  This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work.  It's just not worth it.
       */
      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));

      if (type_sz(reg.type) > 4 &&
          ((devinfo->gen == 7 && !devinfo->is_haswell) ||
           devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
         /* IVB has an issue (which we found empirically) where it reads two
          * address register components per channel for indirectly addressed
          * 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be used."
          *
          * To work around both of these, we do two integer MOVs instead of one
          * 64-bit MOV.  Because no double value should ever cross a register
          * boundary, it's safe to use the immediate offset in the indirect
          * here to handle adding 4 bytes to the offset and avoid the extra
          * ADD to the register file.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

         if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
             !inst->get_next()->is_tail_sentinel() &&
             ((fs_inst *)inst->get_next())->mlen > 0) {
            /* From the Sandybridge PRM:
             *
             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
             *    instruction that “indexed/indirect” source AND is followed
             *    by a send, the instruction requires a “Switch”. This is to
             *    avoid race condition where send may dispatch before MRF is
             *    updated."
             */
            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
         }
      }
   }
}

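/**
 * Emit a SIMD8 URB read message, optionally applying the per-slot offset
 * from the message header.
 */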
void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

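/**
 * Emit a SIMD8 URB write message, setting the per-slot offset and channel
 * mask bits as required by the specific opcode.
 */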
void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
			     struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];

   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

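/** Emit a resinfo sampler message to get the size of a buffer surface. */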
void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

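/**
 * Emit the sampler message for a texturing instruction, choosing the
 * hardware message type from the opcode and hardware generation, and
 * handling both immediate and dynamically-indexed surface/sampler indices.
 */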
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
	 if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered in NIR */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
	 unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 /* Note that G45 and older determine shadow compare and dispatch width
	  * from message length for most messages.
	  */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
	 assert(inst->mlen == 7 || inst->mlen == 10);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
	 break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      case SHADER_OPCODE_TXS:
	 assert(inst->mlen == 3);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      default:
	 unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         } else if (stage != MESA_SHADER_VERTEX &&
                    stage != MESA_SHADER_FRAGMENT) {
            /* The vertex and fragment stages have g0.2 set to 0, so
             * header0.2 is 0 when g0 is copied. Other stages may not, so we
             * must set it to 0 to avoid setting undesirable bits in the
             * message.
             */
            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->size_written / REG_SIZE,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->size_written / REG_SIZE,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* The visitor knows more than we do about the surface limit required,
       * so it has already done the marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                     DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
				 BRW_REGISTER_TYPE_F,
				 vstride,
				 width,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
				 BRW_REGISTER_TYPE_F,
				 vstride,
				 width,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}

/* FBOs place the origin at the upper left instead of the lower left, which
 * requires negating the derivative; any such negation arrives here encoded
 * in the source register's negate flag rather than as a separate
 * negate_value parameter.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1155
                                    src.negate, src.abs,
1156 1157 1158 1159 1160
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
   gen6_HALT(p);
}