/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"

static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
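   /* Unreachable; this return exists only to quiet compiler warnings. */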
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

static struct brw_reg
brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
                    fs_reg *reg, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
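         /* For example, with REG_SIZE == 32 bytes: a packed F-typed region
          * (stride 1, 4-byte elements) gives 32 / (1 * 4) = 8 channels per
          * GRF, while a packed DF-typed region gives only 4.
          */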

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         const unsigned width = MIN2(reg_width, phys_width);
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
             *  "Each DF (Double Float) operand uses an element size of 4 rather
             *   than 8 and all regioning parameters are twice what the values
             *   would be based on the true element size: ExecSize, Width,
             *   HorzStride, and VertStride. Each DF operand uses a pair of
             *   channels and all masking and swizzling should be adjusted
             *   appropriately."
             *
             * From the IvyBridge PRM (Special Requirements for Handling Double
             * Precision Data Types, page 71):
             *  "In Align1 mode, all regioning parameters like stride, execution
             *   size, and width must use the syntax of a pair of packed
             *   floats. The offsets for these data types must be 64-bit
             *   aligned. The execution size and regioning parameters are in terms
             *   of floats."
             *
             * Summarized: when handling DF-typed arguments, ExecSize,
             * VertStride, and Width must be doubled.
             *
             * It applies to BayTrail too.
             */
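            /* Note: the width field is log2-encoded (BRW_WIDTH_2 == 1,
             * BRW_WIDTH_4 == 2, ...) and vstride is log2 + 1 for non-zero
             * strides, so incrementing either field doubles its effective
             * value.
             */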
            if (type_sz(reg->type) == 8) {
               brw_reg.width++;
               if (brw_reg.vstride > 0)
                  brw_reg.vstride++;
               assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
            }

            /* When converting from DF->F, we set the destination stride to 2
             * because each d2f conversion implicitly writes 2 floats, the
             * first one being the converted value. IVB/BYT actually writes
             * two F components per SIMD channel, and every other component
             * is filled with garbage.
             */
            if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
                type_sz(inst->dst.type) < 8) {
               assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
               brw_reg.hstride--;
            }
         }
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
    * region, but on IVB and BYT DF regions must be programmed in terms of
    * floats. A <0,2,1> region accomplishes this.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       type_sz(reg->type) == 8 &&
       brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
       brw_reg.width == BRW_WIDTH_1 &&
       brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
      brw_reg.width = BRW_WIDTH_2;
      brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);

   /* In the FS code generator, we are very careful to ensure that we always
    * set the right execution size so we don't need the EU code to "help" us
    * by trying to infer it.  Sometimes, it infers the wrong thing.
    */
   p->automatic_exec_sizes = false;
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->group)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   /* We assume render targets start at 0, because headerless FB write
    * messages set "Render Target Index" to 0.  Using a different binding
    * table index would make it impossible to use headerless messages.
    */
   const uint32_t surf_index = inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is 2 regs, g0 and g1 are the contents. g0 will be the
    * implied move; here we set up g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
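         /* f0.1 tracks the pixels that survived any discards; stash it in
          * the header so the render target write is masked to the live
          * pixels.
          */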
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
	 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
	 brw_MOV(p,
		 retype(payload, BRW_REGISTER_TYPE_UD),
		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
		   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
		   brw_imm_ud(0x1 << 11));
         }

	 if (inst->target > 0) {
	    /* Set the render target index for choosing BLEND_STATE. */
	    brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
		    brw_imm_ud(inst->target));
	 }

         /* Set computed stencil to render target */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

	 implied_header = brw_null_reg();
      } else {
	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
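      /* brw_JMPI returns a pointer into the instruction store; the index
       * saved here lets brw_land_fwd_jump() below patch the forward jump
       * once the non-AA write has been emitted.
       */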
      brw_pop_insn_state(p);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                               struct brw_reg payload)
{
   assert(inst->size_written % REG_SIZE == 0);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   /* We assume that render targets start at binding table index 0. */
   const unsigned surf_index = inst->target;

   gen9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);
   assert(reg.type == dst.type);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size <= 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
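      /* Reading only the low word of each dword is safe here because the
       * byte offsets involved always fit in 16 bits (the whole GRF file is
       * only 128 * 32 = 4096 bytes).
       */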

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless.  We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset.  Instead, for the sake of consistency, we'll just do the
       * add ourselves.  This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work.  It's just not worth it.
       */
      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));

      if (type_sz(reg.type) > 4 &&
          ((devinfo->gen == 7 && !devinfo->is_haswell) ||
           devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
         /* IVB has an issue (which we found empirically) where it reads two
          * address register components per channel for indirectly addressed
          * 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be used."
          *
          * To work around both of these, we do two integer MOVs instead of one
          * 64-bit MOV.  Because no double value should ever cross a register
          * boundary, it's safe to use the immediate offset in the indirect
          * here to handle adding 4 bytes to the offset and avoid the extra
          * ADD to the register file.
516
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

         if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
             !inst->get_next()->is_tail_sentinel() &&
             ((fs_inst *)inst->get_next())->mlen > 0) {
            /* From the Sandybridge PRM:
             *
             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
             *    instruction that “indexed/indirect” source AND is followed
             *    by a send, the instruction requires a “Switch”. This is to
             *    avoid race condition where send may dispatch before MRF is
             *    updated."
             */
            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
         }
      }
   }
}

void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

    /* WaClearTDRRegBeforeEOTForNonPS.
     *
     *   WA: Clear tdr register before send EOT in all non-PS shader kernels
     *
     *   mov(8) tdr0:ud 0x0:ud {NoMask}
     */
   if (inst->eot && p->devinfo->gen == 10) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, brw_tdr_reg(), brw_imm_uw(0));
      brw_pop_insn_state(p);
   }

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

bool
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];
   brw_inst *i[4];

   if (devinfo->gen >= 11) {
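      /* There is no PLN instruction on Gen11, so emulate it with a pair of
       * MADs through the accumulator: acc = R + x * P, then
       * dst = acc + y * Q, i.e. dst = x * P + y * Q + R per channel.
       */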
      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_NF);
      struct brw_reg dwP = suboffset(interp, 0);
      struct brw_reg dwQ = suboffset(interp, 1);
      struct brw_reg dwR = suboffset(interp, 3);

      brw_set_default_exec_size(p, BRW_EXECUTE_8);

      if (inst->exec_size == 8) {
         i[0] = brw_MAD(p,            acc, dwR, offset(delta_x, 0), dwP);
         i[1] = brw_MAD(p, offset(dst, 0), acc, offset(delta_y, 0), dwQ);

         brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);

         /* brw_set_default_saturate() is called before emitting
          * instructions, so the saturate bit is set in each instruction;
          * we need to unset it on the first instruction of each pair.
          */
         brw_inst_set_saturate(p->devinfo, i[0], false);
      } else {
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         i[0] = brw_MAD(p,            acc, dwR, offset(delta_x, 0), dwP);
         i[1] = brw_MAD(p, offset(dst, 0), acc, offset(delta_x, 1), dwQ);

         brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
         i[2] = brw_MAD(p,            acc, dwR, offset(delta_y, 0), dwP);
         i[3] = brw_MAD(p, offset(dst, 1), acc, offset(delta_y, 1), dwQ);

         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

         brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
         brw_inst_set_cond_modifier(p->devinfo, i[3], inst->conditional_mod);

         /* brw_set_default_saturate() is called before emitting
          * instructions, so the saturate bit is set in each instruction;
          * we need to unset it on the first instruction of each pair.
          */
         brw_inst_set_saturate(p->devinfo, i[0], false);
         brw_inst_set_saturate(p->devinfo, i[2], false);
      }

      return true;
   } else if (devinfo->has_pln &&
              (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);

      return false;
   } else {
      i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
      i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);

      brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);

      /* brw_set_default_saturate() is called before emitting instructions,
       * so the saturate bit is set in each instruction; we need to unset
       * it on the first instruction.
       */
      brw_inst_set_saturate(p->devinfo, i[0], false);

      return true;
   }
}

void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }
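   /* resinfo returns 4 channels (width, height, depth and mip count), so
    * the response length doubles from 4 GRFs in SIMD8 to 8 in SIMD16.
    */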

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
824
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
825 826 827 828
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
	 if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered in NIR */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
	 unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 /* Note that G45 and older determine shadow compare and dispatch width
	  * from message length for most messages.
	  */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
	 assert(inst->mlen == 7 || inst->mlen == 10);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
	 break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      case SHADER_OPCODE_TXS:
	 assert(inst->mlen == 3);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      default:
	 unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0 && devinfo->gen < 7) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         assert(inst->base_mrf != -1);
         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->size_written / REG_SIZE,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
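         /* The surface and sampler indices are the same, so pack the common
          * value into both byte fields at once: x * 0x101 == (x << 8) | x.
          */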
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
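      /* Keep only the 12 descriptor bits just assembled: 8 bits of binding
       * table index plus 4 bits of sampler index.
       */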
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->size_written / REG_SIZE,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* The visitor knows more than we do about the surface limit
       * required, so it has already done the marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                     DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(const fs_inst *inst,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (inst->opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = src;
   struct brw_reg src1 = src;

   src0.subnr   = sizeof(float);
   src0.vstride = vstride;
   src0.width   = width;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = vstride;
   src1.width   = width;
   src1.hstride = BRW_HORIZONTAL_STRIDE_0;
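
   /* With the fine-derivative regioning (vstride 2, width 2, hstride 0),
    * src0 reads (tr, tr, br, br, ...) and src1 reads (tl, tl, bl, bl, ...),
    * so the ADD below produces the DDX column shown in the comment above.
    */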

   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(const fs_inst *inst,
                           struct brw_reg dst, struct brw_reg src)
{
   if (inst->opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      if (devinfo->gen >= 11) {
         src = stride(src, 0, 2, 1);
         struct brw_reg src_0  = byte_offset(src,  0 * sizeof(float));
         struct brw_reg src_2  = byte_offset(src,  2 * sizeof(float));
         struct brw_reg src_4  = byte_offset(src,  4 * sizeof(float));
         struct brw_reg src_6  = byte_offset(src,  6 * sizeof(float));
         struct brw_reg src_8  = byte_offset(src,  8 * sizeof(float));
         struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float));
         struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float));
         struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float));

         struct brw_reg dst_0  = byte_offset(dst,  0 * sizeof(float));
         struct brw_reg dst_4  = byte_offset(dst,  4 * sizeof(float));
         struct brw_reg dst_8  = byte_offset(dst,  8 * sizeof(float));
         struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float));

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);

         brw_ADD(p, dst_0, negate(src_0), src_2);
         brw_ADD(p, dst_4, negate(src_4), src_6);

         if (inst->exec_size == 16) {
            brw_ADD(p, dst_8,  negate(src_8),  src_10);
            brw_ADD(p, dst_12, negate(src_12), src_14);
         }

         brw_pop_insn_state(p);
      } else {
         struct brw_reg src0 = stride(src, 4, 4, 1);
         struct brw_reg src1 = stride(src, 4, 4, 1);
         src0.swizzle = BRW_SWIZZLE_XYXY;
         src1.swizzle = BRW_SWIZZLE_ZWZW;

         brw_push_insn_state(p);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, dst, negate(src0), src1);
         brw_pop_insn_state(p);
      }
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = stride(src, 4, 4, 0);
      struct brw_reg src1 = stride(src, 4, 4, 0);
      src0.subnr = 0 * sizeof(float);
      src1.subnr = 2 * sizeof(float);

      brw_ADD(p, dst, negate(src0), src1);