/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <algorithm>
#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <stack>

#include "ac_shader_util.h"
#include "aco_ir.h"
#include "aco_builder.h"
#include "aco_interface.h"
#include "aco_instruction_selection.h"
#include "util/fast_idiv_by_const.h"
#include "util/memstream.h"

#include "ac_exp_param.h"
#include "sid.h"
#include "vulkan/radv_descriptor_set.h"
#include "vulkan/radv_shader.h"

namespace aco {
namespace {

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)

static void _isel_err(isel_context *ctx, const char *file, unsigned line,
                      const nir_instr *instr, const char *msg)
{
   char *out;
   size_t outsize;
   struct u_memstream mem;
   u_memstream_open(&mem, &out, &outsize);
   FILE *const memf = u_memstream_get(&mem);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   u_memstream_close(&mem);

   _aco_err(ctx->program, file, line, out);
   free(out);
}

struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};

struct loop_context {
   Block loop_exit;

   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};

static bool visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);

static void add_logical_edge(unsigned pred_idx, Block *succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}


static void add_linear_edge(unsigned pred_idx, Block *succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void add_edge(unsigned pred_idx, Block *succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void append_logical_start(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void append_logical_end(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
{
   uint32_t id = ctx->first_temp_id + def->index;
   return Temp(id, ctx->program->temp_rc[id]);
}

Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand base = Operand(0u))
{
   Builder bld(ctx->program, ctx->block);
   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
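   /* Computes, per lane, base plus the number of bits set in mask for all
    * lanes with a lower id. For wave64, the low and high halves of the mask
    * are handled by separate v_mbcnt_lo/v_mbcnt_hi instructions. */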

   if (ctx->program->wave_size == 32) {
      Operand mask_lo = mask.isUndefined() ? Operand(-1u) : mask;
      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
   }

   Operand mask_lo(-1u);
   Operand mask_hi(-1u);

   if (mask.isTemp()) {
      RegClass rc = RegClass(mask.regClass().type(), 1);
      Builder::Result mask_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
      mask_lo = Operand(mask_split.def(0).getTemp());
      mask_hi = Operand(mask_split.def(1).getTemp());
   } else if (mask.physReg() == exec) {
      mask_lo = Operand(exec_lo, s1);
      mask_hi = Operand(exec_hi, s1);
   }

   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);

   if (ctx->program->chip_class <= GFX7)
      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
   else
      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
}

Temp emit_wqm(Builder& bld, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
{
   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());
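   /* WQM (whole quad mode) is only meaningful in fragment shaders; for all
    * other stages the value is passed through unchanged. */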

   if (bld.program->stage != fragment_fs) {
      if (!dst.id())
         return src;

      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   bld.program->needs_wqm |= program_needs_wqm;
   return dst;
}

static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
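      /* The p_bpermute pseudo-instruction is expanded after register
       * allocation; the late-kill flags below keep the source registers
       * live until that expansion has read every lane. */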
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {

      /* GFX10 wave64 mode: emulate full-wave bpermute */
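      /* ds_bpermute can only exchange data within a half-wave of 32 lanes
       * here, so reads that cross the half-wave boundary go through a pair
       * of shared VGPRs, with same_half marking the lanes that can take the
       * fast path. */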
      Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
      Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      /* We need one pair of shared VGPRs:
       * Note that these have twice the allocation granularity of normal VGPRs */
      ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
{
   if (ctx->options->chip_class >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;
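      /* ds_swizzle's bit-mask mode reads, within each group of 32 lanes,
       * lane ((id & and_mask) | or_mask) ^ xor_mask; the cases below match
       * swizzles that a cheaper DPP modifier can express instead. */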

      uint16_t dpp_ctrl = 0xffff;

      // TODO: we could use DPP8 for some swizzles
      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
         unsigned res[4] = {0, 1, 2, 3};
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}

Temp as_vgpr(isel_context *ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}

//assumes a != 0xffffffff (so the increment step below cannot overflow)
void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
{
   assert(b != 0);
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
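   /* The constant division is computed as
    * (((a >> pre_shift) + increment) * multiplier) >> (32 + post_shift),
    * with the pre-shift, increment and post-shift steps omitted when they
    * are no-ops. */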

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.copy(Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.copy(bld.def(v1), Operand((uint32_t)info.multiplier)));
   }

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
   }
}

void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
}


Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = ctx->program->allocateTmp(rc);
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand(0u, component_size == 2);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
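   /* The loaded dwords are shifted right by 8 * (offset % 4) bits so that
    * the addressed bytes start at bit 0 of dst. */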
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
      }
      if (select != Temp())
         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};
      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand(0u);
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   /* if dst is vgpr - split the src and create a shrunk version according to the byte offset. */
   if (dst.type() == RegType::vgpr) {
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   /* if dst is sgpr - split the src, but move the original to sgpr. */
   } else if (skip) {
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
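   /* Broadcasts a scalar (SCC) boolean into a lane mask: all bits set when
    * val is true, zero otherwise. */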
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}

Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(bld, tmp, dst);
}

/**
 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 * src_bits and dst_bits are truncated.
 *
 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 * bit is indicated by src_bits in this case.
 *
 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
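 *
 * For example, convert_int(ctx, bld, v, 8, 32, true) sign-extends the low
 * byte of v into a 32-bit result.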
 */
Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool sign_extend, Temp dst=Temp())
{
   assert(!(sign_extend && dst_bits < src_bits) && "Shrinking integers is not supported for signed inputs");

   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);

   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
      /* Copy the raw value, leaving an undefined value in the upper bits for
       * the caller to handle appropriately */
      return bld.copy(Definition(dst), src);
   } else if (dst.bytes() < src.bytes()) {
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
   }

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc),
                 src, Operand(0u), Operand(src_bits), Operand((unsigned)sign_extend));
   } else if (ctx->options->chip_class >= GFX8) {
      assert(src_bits < 32);
      assert(src_bits != 8 || src.regClass() == v1b);
      assert(src_bits != 16 || src.regClass() == v2b);
      assert(dst_bits >= 16);
      aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
      sdwa->operands[0] = Operand(src);
      sdwa->definitions[0] = Definition(tmp);
      if (sign_extend)
         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
      else
         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
      bld.insert(std::move(sdwa));
   } else {
      assert(src_bits < 32);
      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
      aco_opcode opcode = sign_extend ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
      bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
   }

   if (dst_bits == 64) {
      if (sign_extend && dst.regClass() == s2) {
         Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (sign_extend && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
      }
   }

   return dst;
}

enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};

Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0)
      bld.copy(Definition(tmp), vec);
   else
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
                 Operand(swizzle), Operand(src_size), Operand((uint32_t)(mode == sgpr_extract_sext)));

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}

Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
{
   if (src.src.ssa->num_components == 1 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
   bool identity_swizzle = true;

   for (unsigned i = 0; identity_swizzle && i < size; i++) {
      if (src.swizzle[i] != i)
         identity_swizzle = false;
   }
   if (identity_swizzle)
      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));

   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      assert(size == 1);
      return extract_8_16_bit_sgpr_element(
         ctx, ctx->program->allocateTmp(s1), &src, sgpr_extract_undef);
   }

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return dst;
   }
}

Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src)
{
   /* returns v2b or v1 for vop3p usage.
    * The source expects exactly 2 16bit components
    * which are within the same dword
    */
   assert(src.src.ssa->bit_size == 16);
   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);

   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
   if (tmp.size() == 1)
      return tmp;

   /* the size is larger than 1 dword: check the swizzle */
   unsigned dword = src.swizzle[0] >> 1;

   /* extract a full dword if possible */
   if (tmp.bytes() >= (dword + 1) * 4) {
      return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
   } else {
      /* This must be a swizzled access to %a.zz where %a is v6b */
      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
      assert(tmp.regClass() == v6b && dword == 1);
      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
   }
}

uint32_t get_alu_src_ub(isel_context *ctx, nir_alu_instr *instr, int src_idx)
{
   nir_ssa_scalar scalar = nir_ssa_scalar{instr->src[src_idx].src.ssa,
                                          instr->src[src_idx].swizzle[0]};
   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
}

Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr, bool non_uniform=false)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr && !non_uniform)
      ptr = bld.as_uniform(ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)),
                     ptr, Operand((unsigned)ctx->options->address32_hi));
}

void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op,
                           Temp dst, bool writes_scc, uint8_t uses_ub = 0)
{
   aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
         if (src_ub <= 0xffff)
            sop2->operands[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            sop2->operands[i].set24bit(true);
      }
   }

   ctx->block->instructions.emplace_back(std::move(sop2));
}

void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                           bool commutative, bool swap_srcs=false,
                           bool flush_denorms = false, bool nuw = false,
                           uint8_t uses_ub = 0)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Operand op0(src0);
   Operand op1(src1);

   for (int i = 0; i < 2; i++) {
      uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
      if (uses_ub & (1 << i)) {
         if (src_ub <= 0xffff)
            bld.set16bit(i ? op1 : op0);
         else if (src_ub <= 0xffffff)
            bld.set24bit(i ? op1 : op0);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(op, bld.def(v1), op0, op1);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      if (nuw) {
         bld.nuw().vop2(op, Definition(dst), op0, op1);
      } else {
         bld.vop2(op, Definition(dst), op0, op1);
      }
   }
}

void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
                                   aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                            bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
{
   assert(num_sources == 2 || num_sources == 3);
   Temp src[3] = { Temp(0, v1), Temp(0, v1), Temp(0, v1) };
   bool has_sgpr = false;
   for (unsigned i = 0; i < num_sources; i++) {
      src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      Temp tmp;
      if (num_sources == 3)
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
      else
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
      if (dst.size() == 1)
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
      else
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(UINT64_C(0x3FF0000000000000)), tmp);
   } else if (num_sources == 3) {
      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
   } else {
      bld.vop3(op, Definition(dst), src[0], src[1]);
   }
}

Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr,
                                       aco_opcode op, Temp dst, bool swap_srcs=false)
{
   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   assert(instr->dest.dest.ssa.num_components == 2);

   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
   unsigned opsel_lo = (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
   unsigned opsel_hi = (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
   emit_split_vector(ctx, dst, 2);
   return res;
}

void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
            case aco_opcode::v_cmp_lt_f16:
               op = aco_opcode::v_cmp_gt_f16;
               break;
            case aco_opcode::v_cmp_ge_f16:
               op = aco_opcode::v_cmp_le_f16;
               break;
            case aco_opcode::v_cmp_lt_i16:
               op = aco_opcode::v_cmp_gt_i16;
               break;
            case aco_opcode::v_cmp_ge_i16:
               op = aco_opcode::v_cmp_le_i16;
               break;
            case aco_opcode::v_cmp_lt_u16:
               op = aco_opcode::v_cmp_gt_u16;
               break;
            case aco_opcode::v_cmp_ge_u16:
               op = aco_opcode::v_cmp_le_u16;
               break;
            case aco_opcode::v_cmp_lt_f32:
               op = aco_opcode::v_cmp_gt_f32;
               break;
            case aco_opcode::v_cmp_ge_f32:
               op = aco_opcode::v_cmp_le_f32;
               break;
            case aco_opcode::v_cmp_lt_i32:
               op = aco_opcode::v_cmp_gt_i32;
               break;
            case aco_opcode::v_cmp_ge_i32:
               op = aco_opcode::v_cmp_le_i32;
               break;
            case aco_opcode::v_cmp_lt_u32:
               op = aco_opcode::v_cmp_gt_u32;
               break;
            case aco_opcode::v_cmp_ge_u32:
               op = aco_opcode::v_cmp_le_u32;
               break;
            case aco_opcode::v_cmp_lt_f64:
               op = aco_opcode::v_cmp_gt_f64;
               break;
            case aco_opcode::v_cmp_ge_f64:
               op = aco_opcode::v_cmp_le_f64;
               break;
            case aco_opcode::v_cmp_lt_i64:
               op = aco_opcode::v_cmp_gt_i64;
               break;
            case aco_opcode::v_cmp_ge_i64:
               op = aco_opcode::v_cmp_le_i64;
               break;
            case aco_opcode::v_cmp_lt_u64:
               op = aco_opcode::v_cmp_gt_u64;
               break;
            case aco_opcode::v_cmp_ge_u64:
               op = aco_opcode::v_cmp_le_u64;
               break;
            default: /* eq and ne are commutative */
               break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}

void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
                     aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
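   /* Use the scalar (SALU) comparison when an opcode exists for this bit
    * size and the sources and result are all uniform; otherwise emit the
    * VALU form. */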
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes ||
                   nir_dest_is_divergent(instr->dest.dest) ||
                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2
    * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.copy(Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
                    aco_opcode op, uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
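   /* v_rcp_f32 and friends flush denormal inputs, so denormal values are
    * first scaled by 2^24 into the normal range and the result is corrected
    * with the op-specific 'undo' factor afterwards. */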
   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
                               as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}

void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}

Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
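   /* Truncation clears the mantissa bits that encode the fractional part,
    * selected via the unbiased exponent: exponents below 0 yield +-0 (only
    * the sign bit is kept) and exponents above 51 are already integral. */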
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));

   Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   add->vop3().neg[1] = true;

   return add->definitions[0].getTemp();
}

Temp uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
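   /* Saturating unsigned add: before GFX8 there is no clamp modifier on
    * VALU adds, so the carry-out is used to select 0xffffffff instead. */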
   if (bld.program->chip_class < GFX8) {
      Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand((uint32_t) -1), add.def(1).getTemp());
   }

   Builder::Result add(NULL);
   if (bld.program->chip_class >= GFX9) {
      add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
   } else {
      add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
   }
   add.instr->vop3().clamp = 1;
   return dst.getTemp();
}

void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
{
   if (!instr->dest.dest.is_ssa) {
      isel_err(&instr->instr, "nir alu dst not in ssa");
      abort();
   }
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
   switch(instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5: {
      std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
      unsigned num = instr->dest.dest.ssa.num_components;
      for (unsigned i = 0; i < num; ++i)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
               vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
            else
               vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         ctx->allocated_vec.emplace(dst.id(), elems);
      } else {
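         /* Sub-dword vectors in SGPRs are assembled by masking and shifting
          * the components into 32-bit words; on GFX9+ two 16-bit halves can
          * be packed directly with s_pack_ll_b32_b16. */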
         bool use_s_pack = ctx->program->chip_class >= GFX9;
         Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));

         std::array<Temp,NIR_MAX_VEC_COMPONENTS> packed;
         uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
         for (unsigned i = 0; i < num; i++) {
            unsigned packed_size = use_s_pack ? 16 : 32;
            unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
            unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
            if (nir_src_is_const(instr->src[i].src)) {
               const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
               continue;
            }

            if (offset != packed_size - instr->dest.dest.ssa.bit_size)
               elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);

            if (offset)
               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
                                   elems[i], Operand(offset));

            if (packed[idx].id())
               packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                                      elems[i], packed[idx]);
            else
               packed[idx] = elems[i];
         }

         if (use_s_pack) {
            for (unsigned i = 0; i < dst.size(); i++) {
               bool same = !!packed[i*2].id() == !!packed[i*2+1].id();

               if (packed[i*2].id() && packed[i*2+1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i*2], packed[i*2+1]);
               else if (packed[i*2+1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), Operand(const_vals[i * 2]), packed[i*2+1]);
               else if (packed[i*2].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i*2], Operand(const_vals[i * 2 + 1]));

               if (same)
                  const_vals[i] = const_vals[i*2] | (const_vals[i*2+1] << 16);
               else
                  const_vals[i] = 0;
            }
         }

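         /* Fold the remaining constant bits into the packed temporaries;
          * dwords that are entirely constant become a simple copy. */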
         for (unsigned i = 0; i < dst.size(); i++) {
            if (const_vals[i] && packed[i].id())
               packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                                    Operand(const_vals[i]), packed[i]);
            else if (!packed[i].id())
               packed[i] = bld.copy(bld.def(s1), Operand(const_vals[i]));
         }

         if (dst.size() == 1)
            bld.copy(Definition(dst), packed[0]);
         else if (dst.size() == 2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
         else
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], packed[2]);
      }
      break;
   }
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
         /* use size() instead of bytes() for 8/16-bit */
         assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
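         /* p_as_uniform is lowered to v_readfirstlane_b32 later; the value is
          * assumed to be uniform across the wave. */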
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
      } else {
         assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
         bld.copy(Definition(dst), src);
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->dest.dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         assert(dst.regClass() == bld.lm);
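         /* 1-bit booleans are lane masks, so the negated mask must still be
          * ANDed with exec to keep inactive lanes zero. */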
         /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
         Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
         bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iabs: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
      } else if (dst.regClass() == v1) {
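         /* iabs(x) = max(x, 0 - x) */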
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isign: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
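         /* isign(x) = min(max(x, -1), 1) */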
         Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
      } else if (dst.regClass() == s2) {
         Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
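         /* neg is -1 for negative inputs (arithmetic shift of the sign bit);
          * ORing in (src != 0) then yields +1 for positive ones. */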
         Temp neqz;
         if (ctx->program->chip_class >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
         else
            neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
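         /* v_med3_i32 with -1 and 1 clamps the source into [-1, 1], which is
          * exactly isign for 32-bit values. */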
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
      } else if (dst.regClass() == v2) {
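         /* 64-bit isign: build -1 from the sign bits of the high dword and
          * select between it and +1/0 using a 64-bit compare. */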
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imax: {
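      /* Integer min/max (this case and the three below) dispatch purely on
       * the destination register class: the VOP3 form of the 16-bit opcode on
       * GFX10+, the VOP2 form before that, packed 16-bit pairs, 32-bit VALU,
       * or SALU. */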
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");