/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "ac_exp_param.h"
#include "ac_shader_util.h"
#include "compiler/nir/nir_serialize.h"
#include "nir/tgsi_to_nir.h"
#include "si_build_pm4.h"
#include "sid.h"
#include "util/crc32.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
#include "util/mesa-sha1.h"
#include "util/u_async_debug.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "tgsi/tgsi_from_mesa.h"

/* SHADER_CACHE */

/**
 * Return the IR key for the shader cache.
 */
void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
                         unsigned char ir_sha1_cache_key[20])
{
   struct blob blob = {};
   unsigned ir_size;
   void *ir_binary;

   if (sel->nir_binary) {
      ir_binary = sel->nir_binary;
      ir_size = sel->nir_size;
   } else {
      assert(sel->nir);

      blob_init(&blob);
      nir_serialize(&blob, sel->nir, true);
      ir_binary = blob.data;
      ir_size = blob.size;
   }

   /* These settings affect the compilation, but they are not derived
    * from the input shader IR.
    */
   unsigned shader_variant_flags = 0;

   if (ngg)
      shader_variant_flags |= 1 << 0;
   if (sel->nir)
      shader_variant_flags |= 1 << 1;
   if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32)
      shader_variant_flags |= 1 << 2;
   if (sel->info.stage == MESA_SHADER_FRAGMENT &&
       /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */
       sel->info.base.fs.needs_quad_helper_invocations &&
       sel->info.base.fs.uses_discard &&
       sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
      shader_variant_flags |= 1 << 3;
   if (sel->info.stage == MESA_SHADER_VERTEX) {
      /* This varies depending on whether compute-based culling is enabled. */
      assert(sel->screen->num_vbos_in_user_sgprs <= 7);
      shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4;
   }
   if (sel->screen->options.no_infinite_interp)
      shader_variant_flags |= 1 << 7;
   if (sel->screen->options.clamp_div_by_zero)
      shader_variant_flags |= 1 << 8;
   if (sel->screen->debug_flags & DBG(GISEL))
      shader_variant_flags |= 1 << 9;
   if ((sel->info.stage == MESA_SHADER_VERTEX ||
        sel->info.stage == MESA_SHADER_TESS_EVAL ||
        sel->info.stage == MESA_SHADER_GEOMETRY) &&
       !es &&
       sel->screen->options.vrs2x2)
      shader_variant_flags |= 1 << 10;
   if (sel->screen->options.inline_uniforms)
      shader_variant_flags |= 1 << 11;
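   /* Quick reference for the bit layout above (derived from the code;
    * bits 4..6 hold a 3-bit count):
    *   bit 0     - NGG
    *   bit 1     - sel->nir is non-NULL (IR not yet serialized)
    *   bit 2     - wave32
    *   bit 3     - FS: correct derivatives after kill
    *   bits 4..6 - number of VBO descriptors in user SGPRs (VS only)
    *   bit 7     - no_infinite_interp
    *   bit 8     - clamp_div_by_zero
    *   bit 9     - GISEL
    *   bit 10    - vrs2x2 (non-ES VS/TES/GS)
    *   bit 11    - inline_uniforms
    */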

   struct mesa_sha1 ctx;
   _mesa_sha1_init(&ctx);
   _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
   _mesa_sha1_update(&ctx, ir_binary, ir_size);
   if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_EVAL ||
       sel->info.stage == MESA_SHADER_GEOMETRY)
      _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
   _mesa_sha1_final(&ctx, ir_sha1_cache_key);

   if (ir_binary == blob.data)
      blob_finish(&blob);
}

/** Copy "data" to "ptr" and return the next dword following copied data. */
static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size)
{
   /* data may be NULL if size == 0 */
   if (size)
      memcpy(ptr, data, size);
   ptr += DIV_ROUND_UP(size, 4);
   return ptr;
}

/** Read data from "ptr". Return the next dword following the data. */
static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
{
   memcpy(data, ptr, size);
   ptr += DIV_ROUND_UP(size, 4);
   return ptr;
}

/**
 * Write the size as uint followed by the data. Return the next dword
 * following the copied data.
 */
static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
{
   *ptr++ = size;
   return write_data(ptr, data, size);
}

/**
 * Read the size as uint followed by the data. Return both via parameters.
 * Return the next dword following the data.
 */
static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
{
   *size = *ptr++;
   assert(*data == NULL);
   if (!*size)
      return ptr;
   *data = malloc(*size);
   return read_data(ptr, *data, *size);
}

/**
 * Return the shader binary in a buffer. The first 4 bytes contain its size
 * as integer.
 */
static void *si_get_shader_binary(struct si_shader *shader)
{
   /* There is always a size of data followed by the data itself. */
   unsigned llvm_ir_size =
      shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0;

   /* Refuse to allocate overly large buffers and guard against integer
    * overflow. */
   if (shader->binary.elf_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4)
      return NULL;

   unsigned size = 4 + /* total size */
                   4 + /* CRC32 of the data below */
                   align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 +
                   align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4);
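   /* The resulting layout (everything dword-aligned, matching the
    * write_data/write_chunk calls below):
    *   dword 0 : total size in bytes
    *   dword 1 : CRC32 of everything that follows (size - 8 bytes)
    *   ...     : shader->config
    *   ...     : shader->info
    *   ...     : elf_size, ELF data
    *   ...     : llvm_ir_size, LLVM IR string (if any)
    */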
   void *buffer = CALLOC(1, size);
   uint32_t *ptr = (uint32_t *)buffer;

   if (!buffer)
      return NULL;

   *ptr++ = size;
   ptr++; /* CRC32 is calculated at the end. */

   ptr = write_data(ptr, &shader->config, sizeof(shader->config));
   ptr = write_data(ptr, &shader->info, sizeof(shader->info));
   ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size);
   ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size);
   assert((char *)ptr - (char *)buffer == size);

   /* Compute CRC32. */
   ptr = (uint32_t *)buffer;
   ptr++;
   *ptr = util_hash_crc32(ptr + 1, size - 8);

   return buffer;
}

static bool si_load_shader_binary(struct si_shader *shader, void *binary)
{
   uint32_t *ptr = (uint32_t *)binary;
   uint32_t size = *ptr++;
   uint32_t crc32 = *ptr++;
   unsigned chunk_size;
   unsigned elf_size;

   if (util_hash_crc32(ptr, size - 8) != crc32) {
      fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
      return false;
   }

   ptr = read_data(ptr, &shader->config, sizeof(shader->config));
   ptr = read_data(ptr, &shader->info, sizeof(shader->info));
   ptr = read_chunk(ptr, (void **)&shader->binary.elf_buffer, &elf_size);
   shader->binary.elf_size = elf_size;
   ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size);

   return true;
}

/**
 * Insert a shader into the cache. It's assumed the shader is not in the cache.
 * Use si_shader_cache_load_shader before calling this.
 */
void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
                                   struct si_shader *shader, bool insert_into_disk_cache)
{
   void *hw_binary;
   struct hash_entry *entry;
   uint8_t key[CACHE_KEY_SIZE];

   entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
   if (entry)
      return; /* already added */

   hw_binary = si_get_shader_binary(shader);
   if (!hw_binary)
      return;

   if (_mesa_hash_table_insert(sscreen->shader_cache, mem_dup(ir_sha1_cache_key, 20), hw_binary) ==
       NULL) {
      FREE(hw_binary);
      return;
   }

   if (sscreen->disk_shader_cache && insert_into_disk_cache) {
      disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key);
      disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL);
   }
}

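/**
 * Look for the shader in the in-memory cache first; on a miss, fall back to
 * the on-disk cache. A disk-cache hit is promoted into the in-memory cache
 * (with insert_into_disk_cache = false, so nothing is written back to disk).
 */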
bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
                                 struct si_shader *shader)
{
   struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);

   if (entry) {
      if (si_load_shader_binary(shader, entry->data)) {
         p_atomic_inc(&sscreen->num_memory_shader_cache_hits);
         return true;
      }
   }
   p_atomic_inc(&sscreen->num_memory_shader_cache_misses);

   if (!sscreen->disk_shader_cache)
      return false;

   unsigned char sha1[CACHE_KEY_SIZE];
   disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1);

   size_t binary_size;
   uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size);
   if (buffer) {
      if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) {
         if (si_load_shader_binary(shader, buffer)) {
            free(buffer);
            si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false);
            p_atomic_inc(&sscreen->num_disk_shader_cache_hits);
            return true;
         }
      } else {
         /* Something has gone wrong; discard the item from the cache and
          * rebuild/link from source.
          */
         assert(!"Invalid radeonsi shader disk cache item!");
         disk_cache_remove(sscreen->disk_shader_cache, sha1);
      }
   }

   free(buffer);
   p_atomic_inc(&sscreen->num_disk_shader_cache_misses);
   return false;
}

static uint32_t si_shader_cache_key_hash(const void *key)
{
   /* Take the first dword of SHA1. */
   return *(uint32_t *)key;
}

static bool si_shader_cache_key_equals(const void *a, const void *b)
{
   /* Compare SHA1s. */
   return memcmp(a, b, 20) == 0;
}

static void si_destroy_shader_cache_entry(struct hash_entry *entry)
{
   FREE((void *)entry->key);
   FREE(entry->data);
}

bool si_init_shader_cache(struct si_screen *sscreen)
{
   (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
   sscreen->shader_cache =
      _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals);

   return sscreen->shader_cache != NULL;
}

void si_destroy_shader_cache(struct si_screen *sscreen)
{
   if (sscreen->shader_cache)
      _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry);
   simple_mtx_destroy(&sscreen->shader_cache_mutex);
}

/* SHADER STATES */

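/* Compute the value of the MEM_ORDERED field of SPI_SHADER_PGM_RSRC1_* for
 * gfx10+; see the S_00B228_MEM_ORDERED / S_00B428_MEM_ORDERED users below.
 */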
bool si_shader_mem_ordered(struct si_shader *shader)
{
   if (shader->selector->screen->info.chip_class < GFX10)
      return false;

   const struct si_shader_info *info = &shader->selector->info;
   const struct si_shader_info *prev_info =
      shader->previous_stage_sel ? &shader->previous_stage_sel->info : NULL;

   bool sampler_or_bvh = info->uses_vmem_return_type_sampler_or_bvh;
   bool other = info->uses_vmem_return_type_other ||
                info->uses_indirect_descriptor ||
                shader->config.scratch_bytes_per_wave ||
                (info->stage == MESA_SHADER_FRAGMENT &&
                 (info->base.fs.uses_fbfetch_output ||
                  shader->key.part.ps.prolog.poly_stipple));

   if (prev_info) {
      sampler_or_bvh |= prev_info->uses_vmem_return_type_sampler_or_bvh;
      other |= prev_info->uses_vmem_return_type_other ||
               prev_info->uses_indirect_descriptor;
   }

   /* Return true if both types of VMEM that return something are used. */
   return sampler_or_bvh && other;
}

static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes,
                                 struct si_pm4_state *pm4)
{
   const struct si_shader_info *info = &tes->info;
   unsigned tes_prim_mode = info->base.tess.primitive_mode;
   unsigned tes_spacing = info->base.tess.spacing;
   bool tes_vertex_order_cw = !info->base.tess.ccw;
   bool tes_point_mode = info->base.tess.point_mode;
   unsigned type, partitioning, topology, distribution_mode;

   switch (tes_prim_mode) {
   case GL_LINES:
      type = V_028B6C_TESS_ISOLINE;
      break;
   case GL_TRIANGLES:
      type = V_028B6C_TESS_TRIANGLE;
      break;
   case GL_QUADS:
      type = V_028B6C_TESS_QUAD;
      break;
   default:
      assert(0);
      return;
   }

   switch (tes_spacing) {
   case TESS_SPACING_FRACTIONAL_ODD:
      partitioning = V_028B6C_PART_FRAC_ODD;
      break;
   case TESS_SPACING_FRACTIONAL_EVEN:
      partitioning = V_028B6C_PART_FRAC_EVEN;
      break;
   case TESS_SPACING_EQUAL:
      partitioning = V_028B6C_PART_INTEGER;
      break;
   default:
      assert(0);
      return;
   }

   if (tes_point_mode)
      topology = V_028B6C_OUTPUT_POINT;
   else if (tes_prim_mode == GL_LINES)
      topology = V_028B6C_OUTPUT_LINE;
   else if (tes_vertex_order_cw)
      /* for some reason, this must be the other way around */
      topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
   else
      topology = V_028B6C_OUTPUT_TRIANGLE_CW;

   if (sscreen->info.has_distributed_tess) {
      if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10)
         distribution_mode = V_028B6C_TRAPEZOIDS;
      else
         distribution_mode = V_028B6C_DONUTS;
   } else
      distribution_mode = V_028B6C_NO_DIST;

   assert(pm4->shader);
   pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
                               S_028B6C_TOPOLOGY(topology) |
                               S_028B6C_DISTRIBUTION_MODE(distribution_mode);
}

/* Polaris needs different VTX_REUSE_DEPTH settings depending on
 * whether the "fractional odd" tessellation spacing is used.
 *
 * Possible VGT configurations and which state should set the register:
 *
 *   Reg set in | VGT shader configuration   | Value
 * ------------------------------------------------------
 *     VS as VS | VS                         | 30
 *     VS as ES | ES -> GS -> VS             | 30
 *    TES as VS | LS -> HS -> VS             | 14 or 30
 *    TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30
 *
 * If "shader" is NULL, it's assumed it's not LS or GS copy shader.
 */
static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel,
                                         struct si_shader *shader, struct si_pm4_state *pm4)
{
   if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10)
      return;

   /* VS as VS, or VS as ES: */
   if ((sel->info.stage == MESA_SHADER_VERTEX &&
        (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
       /* TES as VS, or TES as ES: */
       sel->info.stage == MESA_SHADER_TESS_EVAL) {
      unsigned vtx_reuse_depth = 30;

      if (sel->info.stage == MESA_SHADER_TESS_EVAL &&
          sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD)
         vtx_reuse_depth = 14;

      assert(pm4->shader);
      pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
   }
}

static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
{
   if (shader->pm4)
      si_pm4_clear_state(shader->pm4);
   else
      shader->pm4 = CALLOC_STRUCT(si_pm4_state);

   if (shader->pm4) {
      shader->pm4->shader = shader;
      return shader->pm4;
   } else {
      fprintf(stderr, "radeonsi: Failed to create pm4 state.\n");
      return NULL;
   }
}

static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
                                         unsigned num_always_on_user_sgprs)
{
   struct si_shader_selector *vs =
      shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
   unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;

   /* 1 SGPR is reserved for the vertex buffer pointer. */
   assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);

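   /* Each VBO descriptor is a 128-bit buffer resource descriptor, i.e. it
    * occupies 4 consecutive user SGPRs.
    */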
   if (num_vbos_in_user_sgprs)
      return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;

   /* Add the pointer to VBO descriptors. */
   return num_always_on_user_sgprs + 1;
}

/* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. */
static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader,
                                        bool legacy_vs_prim_id)
{
   assert(shader->selector->info.stage == MESA_SHADER_VERTEX ||
          (shader->previous_stage_sel && shader->previous_stage_sel->info.stage == MESA_SHADER_VERTEX));

   /* GFX6-9 LS    (VertexID, RelAutoindex,                InstanceID / StepRate0(==1), ...).
    * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID,                    ...)
    * GFX10  LS    (VertexID, RelAutoindex,                UserVGPR1,                   InstanceID).
    * GFX10  ES,VS (VertexID, UserVGPR0,                   UserVGPR1 or VSPrimID,       UserVGPR2 or InstanceID)
    */
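   /* A returned value of N means the hardware loads input VGPRs 0..N, e.g.
    * 1 enables VertexID plus the second component listed above.
    */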
   bool is_ls = shader->selector->info.stage == MESA_SHADER_TESS_CTRL || shader->key.as_ls;

   if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid)
      return 3;
   else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id)
      return 2;
   else if (is_ls || shader->info.uses_instanceid)
      return 1;
   else
      return 0;
}

static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
{
   struct si_pm4_state *pm4;
   uint64_t va;

   assert(sscreen->info.chip_class <= GFX8);

   pm4 = si_get_shader_pm4_state(shader);
   if (!pm4)
      return;

   va = shader->bo->gpu_address;
   si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
   si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));

   shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
                          S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
                          S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
                          S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode);
   shader->config.rsrc2 =
      S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
      S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
}

static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
{
   struct si_pm4_state *pm4;
   uint64_t va;

   pm4 = si_get_shader_pm4_state(shader);
   if (!pm4)
      return;

   va = shader->bo->gpu_address;

   if (sscreen->info.chip_class >= GFX9) {
      if (sscreen->info.chip_class >= GFX10) {
         si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
         si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
      } else {
         si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
         si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40));
      }

      unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);

      shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) |
                             S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);

      if (sscreen->info.chip_class >= GFX10)
         shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
      else
         shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
   } else {
      si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
      si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40));

      shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) |
                             S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
   }
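
   /* Note: the VGPRS field below is in allocation granules, presumably 8
    * VGPRs per granule for wave32 and 4 for wave64, encoded as
    * (num_vgprs - 1) / granule; hence the wave-size-dependent divisor.
    */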

   si_pm4_set_reg(
      pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
      S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
         (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8)
                                           : 0) |
         S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) |
         S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) |
         S_00B428_FLOAT_MODE(shader->config.float_mode) |
         S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9
                                      ? si_get_vs_vgpr_comp_cnt(sscreen, shader, false)
                                      : 0));

   if (sscreen->info.chip_class <= GFX8) {
      si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
   }
}

static void si_emit_shader_es(struct si_context *sctx)
{
   struct si_shader *shader = sctx->queued.named.es->shader;
   if (!shader)
      return;

   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
                              SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
                              shader->selector->esgs_itemsize / 4);

   if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL)
      radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
                                 shader->vgt_tf_param);

   if (shader->vgt_vertex_reuse_block_cntl)
      radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
                                 SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
                                 shader->vgt_vertex_reuse_block_cntl);
   radeon_end_update_context_roll(sctx);
}

static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
{
   struct si_pm4_state *pm4;
   unsigned num_user_sgprs;
   unsigned vgpr_comp_cnt;
   uint64_t va;
   unsigned oc_lds_en;

   assert(sscreen->info.chip_class <= GFX8);

   pm4 = si_get_shader_pm4_state(shader);
   if (!pm4)
      return;

   pm4->atom.emit = si_emit_shader_es;
   va = shader->bo->gpu_address;

   if (shader->selector->info.stage == MESA_SHADER_VERTEX) {
      vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
      num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
   } else if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) {
      vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
      num_user_sgprs = SI_TES_NUM_USER_SGPR;
   } else
      unreachable("invalid shader selector type");

   oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0;

   si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
   si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
   si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
                  S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
                     S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) |
                     S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(1) |
                     S_00B328_FLOAT_MODE(shader->config.float_mode));
   si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
                  S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) |
                     S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));

   if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL)
      si_set_tesseval_regs(sscreen, shader->selector, pm4);

   polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
}

void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                      struct gfx9_gs_info *out)
{
   unsigned gs_num_invocations = MAX2(gs->info.base.gs.invocations, 1);
   unsigned input_prim = gs->info.base.gs.input_primitive;
   bool uses_adjacency =
      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;

   /* All these are in dwords: */
   /* We can't allow using the whole LDS, because GS waves compete with
    * other shader stages for LDS space. */
   const unsigned max_lds_size = 8 * 1024;
   const unsigned esgs_itemsize = es->esgs_itemsize / 4;
   unsigned esgs_lds_size;

   /* All these are per subgroup: */
   const unsigned max_out_prims = 32 * 1024;
   const unsigned max_es_verts = 255;
   const unsigned ideal_gs_prims = 64;
   unsigned max_gs_prims, gs_prims;
   unsigned min_es_verts, es_verts, worst_case_es_verts;

   if (uses_adjacency || gs_num_invocations > 1)
      max_gs_prims = 127 / gs_num_invocations;
   else
      max_gs_prims = 255;

   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
    * Make sure we don't go over the maximum value.
    */
   if (gs->info.base.gs.vertices_out > 0) {
      max_gs_prims =
         MIN2(max_gs_prims, max_out_prims / (gs->info.base.gs.vertices_out * gs_num_invocations));
   }
   assert(max_gs_prims > 0);

   /* If the primitive has adjacency, halve the number of vertices
    * that will be reused in multiple primitives.
    */
   min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);

   gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
   worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);

   /* Compute ESGS LDS size based on the worst case number of ES vertices
    * needed to create the target number of GS prims per subgroup.
    */
   esgs_lds_size = esgs_itemsize * worst_case_es_verts;

   /* If total LDS usage is too big, refactor partitions based on ratio
    * of ESGS item sizes.
    */
   if (esgs_lds_size > max_lds_size) {
      /* Our target GS Prims Per Subgroup was too large. Calculate
       * the maximum number of GS Prims Per Subgroup that will fit
       * into LDS, capped by the maximum that the hardware can support.
       */
      gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
      assert(gs_prims > 0);
      worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);

      esgs_lds_size = esgs_itemsize * worst_case_es_verts;
      assert(esgs_lds_size <= max_lds_size);
   }
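   /* A hypothetical example: for triangles (min_es_verts = 3, no adjacency,
    * 1 invocation) with esgs_itemsize = 64 dwords, the first pass gives
    * gs_prims = 64 and esgs_lds_size = 64 * 192 = 12288 > 8192, so the code
    * above shrinks gs_prims to 8192 / (64 * 3) = 42, which yields
    * worst_case_es_verts = 126 and esgs_lds_size = 8064 dwords.
    */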

   /* Now calculate remaining ESGS information. */
   if (esgs_lds_size)
      es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
   else
      es_verts = max_es_verts;

   /* Vertices for adjacency primitives are not always reused, so restore
    * it for ES_VERTS_PER_SUBGRP.
    */
   min_es_verts = gs->gs_input_verts_per_prim;

   /* For normal primitives, the VGT only checks if they are past the ES
    * verts per subgroup after allocating a full GS primitive and if they
    * are, kick off a new subgroup.  But if those additional ES verts are
    * unique (e.g. not reused) we need to make sure there is enough LDS
    * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
    */
   es_verts -= min_es_verts - 1;

   out->es_verts_per_subgroup = es_verts;
   out->gs_prims_per_subgroup = gs_prims;
   out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
   out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->info.base.gs.vertices_out;
   out->esgs_ring_size = esgs_lds_size;

   assert(out->max_prims_per_subgroup <= max_out_prims);
}

static void si_emit_shader_gs(struct si_context *sctx)
{
   struct si_shader *shader = sctx->queued.named.gs->shader;
   if (!shader)
      return;

   radeon_begin(&sctx->gfx_cs);

   /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
    * R_028A68_VGT_GSVS_RING_OFFSET_3 */
   radeon_opt_set_context_reg3(
      sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
      shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
      shader->ctx_reg.gs.vgt_gsvs_ring_offset_3);

   /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
   radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
                              SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
                              shader->ctx_reg.gs.vgt_gsvs_ring_itemsize);

   /* R_028B38_VGT_GS_MAX_VERT_OUT */
   radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
                              shader->ctx_reg.gs.vgt_gs_max_vert_out);

   /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
    * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
   radeon_opt_set_context_reg4(
      sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
      shader->ctx_reg.gs.vgt_gs_vert_itemsize, shader->ctx_reg.gs.vgt_gs_vert_itemsize_1,
      shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, shader->ctx_reg.gs.vgt_gs_vert_itemsize_3);

   /* R_028B90_VGT_GS_INSTANCE_CNT */
   radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
                              shader->ctx_reg.gs.vgt_gs_instance_cnt);

   if (sctx->chip_class >= GFX9) {
      /* R_028A44_VGT_GS_ONCHIP_CNTL */
      radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
                                 shader->ctx_reg.gs.vgt_gs_onchip_cntl);
      /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
      radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
                                 SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
                                 shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup);
      /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
      radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
                                 SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
                                 shader->ctx_reg.gs.vgt_esgs_ring_itemsize);

      if (shader->key.part.gs.es->info.stage == MESA_SHADER_TESS_EVAL)
         radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
                                    shader->vgt_tf_param);
      if (shader->vgt_vertex_reuse_block_cntl)
         radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
                                    SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
                                    shader->vgt_vertex_reuse_block_cntl);
   }
   radeon_end_update_context_roll(sctx);
}

static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;
   const ubyte *num_components = sel->info.num_stream_output_components;
   unsigned gs_num_invocations = sel->info.base.gs.invocations;
   struct si_pm4_state *pm4;
   uint64_t va;
   unsigned max_stream = util_last_bit(sel->info.base.gs.active_stream_mask);
   unsigned offset;

   pm4 = si_get_shader_pm4_state(shader);
   if (!pm4)
      return;

   pm4->atom.emit = si_emit_shader_gs;

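   /* The GSVS ring offsets are running totals (in dwords) of the per-stream
    * output size (components per vertex * vertices_out), accumulated stream
    * by stream; the final total also becomes the GSVS ring itemsize.
    */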
   offset = num_components[0] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;

   if (max_stream >= 2)
      offset += num_components[1] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;

   if (max_stream >= 3)
      offset += num_components[2] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;

   if (max_stream >= 4)
      offset += num_components[3] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;

   /* The GSVS_RING_ITEMSIZE register takes 15 bits */
   assert(offset < (1 << 15));

   shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->info.base.gs.vertices_out;

   shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
   shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 2) ? num_components[1] : 0;
   shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 3) ? num_components[2] : 0;
   shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 4) ? num_components[3] : 0;

   shader->ctx_reg.gs.vgt_gs_instance_cnt =
      S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);

   va = shader->bo->gpu_address;

   if (sscreen->info.chip_class >= GFX9) {
      unsigned input_prim = sel->info.base.gs.input_primitive;
      gl_shader_stage es_stage = shader->key.part.gs.es->info.stage;
      unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;

      if (es_stage == MESA_SHADER_VERTEX) {
         es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
      } else if (es_stage == MESA_SHADER_TESS_EVAL)
         es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
      else
         unreachable("invalid shader selector type");

      /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
       * VGPR[0:4] are always loaded.
       */
      if (sel->info.uses_invocationid)
         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
      else if (sel->info.uses_primid)
         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
      else if (input_prim >= PIPE_PRIM_TRIANGLES)
         gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
      else
         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */

      unsigned num_user_sgprs;
      if (es_stage == MESA_SHADER_VERTEX)
         num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
      else
         num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;

      if (sscreen->info.chip_class >= GFX10) {
         si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
         si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
      } else {
         si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
         si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
      }

      uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
                       S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                       S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) |
                       S_00B228_FLOAT_MODE(shader->config.float_mode) |
                       S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
      uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
                       S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
                       S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
                       S_00B22C_LDS_SIZE(shader->config.lds_size) |
                       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);

      if (sscreen->info.chip_class >= GFX10) {
         rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
      } else {
         rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8);
         rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
      }

      si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
      si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);

      if (sscreen->info.chip_class >= GFX10) {
         si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
                        S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
      }

      shader->ctx_reg.gs.vgt_gs_onchip_cntl =
         S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
         S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
         S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
      shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup =
         S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
      shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4;

      if (es_stage == MESA_SHADER_TESS_EVAL)
         si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);

      polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4);
   } else {
      si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
      si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));

      si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
                     S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
                        S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
                        S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode));
      si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                     S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
                        S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
   }
}

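/* GE_PC_ALLOC is a uconfig register, so it can't go through
 * radeon_opt_set_context_reg; track the last written value manually to skip
 * redundant writes.
 */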
static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
{
   enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
       sctx->tracked_regs.reg_value[reg] != value) {
      struct radeon_cmdbuf *cs = &sctx->gfx_cs;

      radeon_begin(cs);

      if (sctx->chip_class == GFX10) {
         /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
      }

      radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
      radeon_end();

      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
      sctx->tracked_regs.reg_value[reg] = value;
   }
}

/* Common tail code for NGG primitive shaders. */
static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader)
{
   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
                              SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
                              shader->ctx_reg.ngg.ge_max_output_per_subgroup);
   radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
                              shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
   radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
                              shader->ctx_reg.ngg.vgt_primitiveid_en);
   radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
                              shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
   radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
                              shader->ctx_reg.ngg.vgt_gs_instance_cnt);
   radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
                              SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
                              shader->ctx_reg.ngg.vgt_esgs_ring_itemsize);
   radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
                              shader->ctx_reg.ngg.spi_vs_out_config);
   radeon_opt_set_context_reg2(
      sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT,
      shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format);
   radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
                              shader->ctx_reg.ngg.pa_cl_vte_cntl);
   radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
                              shader->ctx_reg.ngg.pa_cl_ngg_cntl);

   radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
                                  SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
                                  SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
   radeon_end_update_context_roll(sctx);

   /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
   gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
}

static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
{
   struct si_shader *shader = sctx->queued.named.gs->shader;
   if (!shader)
      return;

   gfx10_emit_shader_ngg_tail(sctx, shader);
}

static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
{
   struct si_shader *shader = sctx->queued.named.gs->shader;
   if (!shader)
      return;

   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
                              shader->vgt_tf_param);
   radeon_end_update_context_roll(sctx);

   gfx10_emit_shader_ngg_tail(sctx, shader);
}

static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
{
   struct si_shader *shader = sctx->queued.named.gs->shader;
   if (!shader)
      return;

   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
                              shader->ctx_reg.ngg.vgt_gs_max_vert_out);
   radeon_end_update_context_roll(sctx);

   gfx10_emit_shader_ngg_tail(sctx, shader);
}

static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
{
   struct si_shader *shader = sctx->queued.named.gs->shader;

   if (!shader)
      return;

   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
                              shader->ctx_reg.ngg.vgt_gs_max_vert_out);
   radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
                              shader->vgt_tf_param);
   radeon_end_update_context_roll(sctx);

   gfx10_emit_shader_ngg_tail(sctx, shader);
}

unsigned si_get_input_prim(const struct si_shader_selector *gs)
{
   if (gs->info.stage == MESA_SHADER_GEOMETRY)
      return gs->info.base.gs.input_primitive;

   if (gs->info.stage == MESA_SHADER_TESS_EVAL) {
      if (gs->info.base.tess.point_mode)
         return PIPE_PRIM_POINTS;
      if (gs->info.base.tess.primitive_mode == GL_LINES)
         return PIPE_PRIM_LINES;
      return PIPE_PRIM_TRIANGLES;
   }

   /* TODO: Set this correctly if the primitive type is set in the shader key. */
   return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
}

static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
                                   const struct si_shader *shader, bool ngg)
{
   bool writes_psize = sel->info.writes_psize;

   if (shader)
      writes_psize &= !shader->key.opt.kill_pointsize;

   bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) ||
                       sel->screen->options.vrs2x2 ||
                       sel->info.writes_layer || sel->info.writes_viewport_index;
   return S_02881C_USE_VTX_POINT_SIZE(writes_psize) |
          S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
          S_02881C_USE_VTX_VRS_RATE(sel->screen->options.vrs2x2) |
          S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
          S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
          S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
          S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena);
}

/**
 * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader
 * in NGG mode.
 */
static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader)
{
   const struct si_shader_selector *gs_sel = shader->selector;
   const struct si_shader_info *gs_info = &gs_sel->info;
   const gl_shader_stage gs_stage = shader->selector->info.stage;
   const struct si_shader_selector *es_sel =
      shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
   const struct si_shader_info *es_info = &es_sel->info;
   const gl_shader_stage es_stage = es_sel->info.stage;
   unsigned num_user_sgprs;
   unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
   uint64_t va;
   bool window_space = gs_info->stage == MESA_SHADER_VERTEX ?
                          gs_info->base.vs.window_space_position : 0;
   bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
   unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
   unsigned input_prim = si_get_input_prim(gs_sel);
   bool break_wave_at_eoi = false;
   struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
   if (!pm4)
      return;

   if (es_stage == MESA_SHADER_TESS_EVAL) {
      pm4->atom.emit = gs_stage == MESA_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs
                                                       : gfx10_emit_shader_ngg_tess_nogs;
   } else {
      pm4->atom.emit = gs_stage == MESA_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs
                                                       : gfx10_emit_shader_ngg_notess_nogs;
   }

   va = shader->bo->gpu_address;

   if (es_stage == MESA_SHADER_VERTEX) {
      es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);

      if (es_info->base.vs.blit_sgprs_amd) {
         num_user_sgprs =
            SI_SGPR_VS_BLIT_DATA + es_info->base.vs.blit_sgprs_amd;
      } else {
         num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
      }
   } else {
      assert(es_stage == MESA_SHADER_TESS_EVAL);
      es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
      num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;

      if (es_enable_prim_id || gs_info->uses_primid)
         break_wave_at_eoi = true;
   }

   /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
    * VGPR[0:4] are always loaded.
    *
    * Vertex shaders always need to load VGPR3, because they need to
    * pass edge flags for decomposed primitives (such as quads) to the PA
    * for the GL_LINE polygon mode to skip rendering lines on inner edges.
    */
   if (gs_info->uses_invocationid ||
       (gs_stage == MESA_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
      gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
   else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
            (gs_stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
      gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
   else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
      gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
   else
      gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */

   unsigned wave_size = si_get_shader_wave_size(shader);

   si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
   si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
   si_pm4_set_reg(
      pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
         S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
         S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
         /* Disable the WGP mode on gfx10.3 because it can hang. (it happened on VanGogh)
          * Let's disable it on all chips that disable exactly 1 CU per SA for GS. */
         S_00B228_WGP_MODE(sscreen->info.chip_class == GFX10) |
         S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
   si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                  S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
                     S_00B22C_USER_SGPR(num_user_sgprs) |
                     S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
                     S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
                     S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
                     S_00B22C_LDS_SIZE(shader->config.lds_size));

   /* Determine LATE_ALLOC_GS. */
   unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
   unsigned late_alloc_wave64; /* The limit is per SA. */

   /* For Wave32, the hw will launch twice the number of late
    * alloc waves, so 1 == 2x wave32.
    *
    * Don't use late alloc for NGG on Navi14 due to a hw bug.
    */
   if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
      late_alloc_wave64 = 0;
   else if (num_cu_per_sh <= 6)
      late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
   else if (shader->key.opt.ngg_culling)
      late_alloc_wave64 = num_cu_per_sh * 10;
   else
      late_alloc_wave64 = num_cu_per_sh * 4;

   /* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
   if (sscreen->info.chip_class == GFX10)
      late_alloc_wave64 = MIN2(late_alloc_wave64, 64);

   /* Max number that fits into the register field. */
   late_alloc_wave64 = MIN2(late_alloc_wave64, 127);

   si_pm4_set_reg(
      pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
      S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));

   nparams = MAX2(shader->info.nr_param_exports, 1);
   shader->ctx_reg.ngg.spi_vs_out_config =
      S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
      S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);

   shader->ctx_reg.ngg.spi_shader_idx_format =
      S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP);
   shader->ctx_reg.ngg.spi_shader_pos_format =
      S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
      S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
                                                                  : V_02870C_SPI_SHADER_NONE) |
      S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
                                                                  : V_02870C_SPI_SHADER_NONE) |
      S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
                                                                  : V_02870C_SPI_SHADER_NONE);

   shader->ctx_reg.ngg.vgt_primitiveid_en =
      S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
      S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
                                        gs_sel->info.writes_primid);

   if (gs_stage == MESA_SHADER_GEOMETRY) {
      shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
      shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out;
   } else {
      shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1;
   }

   if (es_stage == MESA_SHADER_TESS_EVAL)
      si_set_tesseval_regs(sscreen, es_sel, pm4);