From f51b960af18f697a187fe2ed4918fb95495e5569 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 23 Nov 2023 18:05:10 -0500
Subject: [PATCH 01/38] radeonsi/gfx11: fix unaligned SET_CONTEXT_PAIRS_PACKED

It set an invalid register. Luckily it didn't cause any issues.

Fixes: 2ac6816b70d7bb - radeonsi/gfx11: use SET_CONTEXT_REG_PAIRS_PACKED for other states

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_build_pm4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index 6ce53255cfee2..24e799080706f 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -339,7 +339,7 @@
    if (__cs_context_reg_count >= 2) { \
       /* Align the count to 2 by duplicating the first register. */ \
       if (__cs_context_reg_count % 2 == 1) { \
-         gfx11_set_context_reg(__cs_context_regs[0].reg_offset[0] + SI_CONTEXT_REG_OFFSET, \
+         gfx11_set_context_reg(SI_CONTEXT_REG_OFFSET + __cs_context_regs[0].reg_offset[0] * 4, \
                                __cs_context_regs[0].reg_value[0]); \
       } \
       assert(__cs_context_reg_count % 2 == 0); \
-- 
GitLab


From 98e7a7123b8af55c2aa287a5f6071db980ccfd53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Nov 2023 04:49:30 -0500
Subject: [PATCH 02/38] radeonsi: don't set non-existent
 VGT_GS_MAX_PRIMS_PER_SUBGROUP on gfx10

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.h           | 2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.cpp | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 0d094b4ce9537..1f4fda8c11184 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -296,7 +296,7 @@ enum si_tracked_context_reg
    SI_TRACKED_VGT_REUSE_OFF,                 /* GFX6-8 (GFX9+ can reuse this slot) */
    SI_TRACKED_IA_MULTI_VGT_PARAM,            /* GFX6-8 (GFX9+ can reuse this slot) */
 
-   SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, /* GFX9-10 - the slots above can be reused */
+   SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, /* GFX9 - the slots above can be reused */
    SI_TRACKED_VGT_GS_ONCHIP_CNTL,            /* GFX9-10 - the slots above can be reused */
 
    SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,        /* GFX6-10 (GFX11+ can reuse this slot) */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 42d3e78af8dbb..d37ced356a7cb 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -958,9 +958,11 @@ static void si_emit_shader_gs(struct si_context *sctx, unsigned index)
       radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
                                  shader->gs.vgt_gs_onchip_cntl);
       /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
-      radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
-                                 SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
-                                 shader->gs.vgt_gs_max_prims_per_subgroup);
+      if (sctx->gfx_level == GFX9) {
+         radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+                                    SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+                                    shader->gs.vgt_gs_max_prims_per_subgroup);
+      }
 
       if (shader->key.ge.part.gs.es->stage == MESA_SHADER_TESS_EVAL)
          radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
-- 
GitLab


From c8411ddf17700bac9910b9cc1017805415fc0fdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 02:03:29 -0500
Subject: [PATCH 03/38] radeonsi: change the low-priority compiler queue to
 normal priority

I'm guessing that low priority could cause us to get optimized shaders later
than we need.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_pipe.c            | 11 +++++------
 src/gallium/drivers/radeonsi/si_pipe.h            |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.cpp |  4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index a65f8b7d307ea..5a2111d66985e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -448,7 +448,7 @@ static void si_set_debug_callback(struct pipe_context *ctx, const struct util_de
    struct si_screen *screen = sctx->screen;
 
    util_queue_finish(&screen->shader_compiler_queue);
-   util_queue_finish(&screen->shader_compiler_queue_low_priority);
+   util_queue_finish(&screen->shader_compiler_queue_opt_variants);
 
    if (cb)
       sctx->debug = *cb;
@@ -1004,7 +1004,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
    }
 
    util_queue_destroy(&sscreen->shader_compiler_queue);
-   util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
+   util_queue_destroy(&sscreen->shader_compiler_queue_opt_variants);
 
    /* Release the reference on glsl types of the compiler threads. */
    glsl_type_singleton_decref();
@@ -1340,7 +1340,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
    if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", num_slots,
                         num_comp_hi_threads,
                         UTIL_QUEUE_INIT_RESIZE_IF_FULL |
-                           UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
+                        UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
       si_destroy_shader_cache(sscreen);
       FREE(sscreen->nir_options);
       FREE(sscreen);
@@ -1348,11 +1348,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
       return NULL;
    }
 
-   if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", num_slots,
+   if (!util_queue_init(&sscreen->shader_compiler_queue_opt_variants, "sh_opt", num_slots,
                         num_comp_lo_threads,
                         UTIL_QUEUE_INIT_RESIZE_IF_FULL |
-                           UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
-                           UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY, NULL)) {
+                        UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
       si_destroy_shader_cache(sscreen);
       FREE(sscreen->nir_options);
       FREE(sscreen);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6277c448de209..07492b9469ce3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -703,7 +703,7 @@ struct si_screen {
     */
    struct ac_llvm_compiler *compiler[24]; /* used by the queue only */
 
-   struct util_queue shader_compiler_queue_low_priority;
+   struct util_queue shader_compiler_queue_opt_variants;
    /* Compiler instances for asynchronous shader compilation of optimized shader variants,
     * one for each thread of the low-priority shader compiler queue. */
    struct ac_llvm_compiler *compiler_lowp[10];
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index d37ced356a7cb..3405e754e8ea7 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2912,7 +2912,7 @@ current_not_ready:
    /* If it's an optimized shader, compile it asynchronously. */
    if (shader->is_optimized) {
       /* Compile it asynchronously. */
-      util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready,
+      util_queue_add_job(&sscreen->shader_compiler_queue_opt_variants, shader, &shader->ready,
                          si_build_shader_variant_low_priority, NULL, 0);
 
       /* Add only after the ready fence was reset, to guard against a
@@ -3693,7 +3693,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
    if (shader->is_optimized) {
-      util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready);
+      util_queue_drop_job(&sctx->screen->shader_compiler_queue_opt_variants, &shader->ready);
    }
 
    util_queue_fence_destroy(&shader->ready);
-- 
GitLab


From 613ea16aab501edc67115768b428673f1e05daae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 03:41:17 -0500
Subject: [PATCH 04/38] radeonsi: update shaders for blend state only if the
 shader key changed

Check if any key bit or state changed before setting do_update_shaders.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.c       |  1 -
 .../drivers/radeonsi/si_state_shaders.cpp     | 32 ++++++++++++++++---
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 063f2e805d5c1..f4237bccbee48 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -775,7 +775,6 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
        old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) {
       si_ps_key_update_framebuffer_blend_rasterizer(sctx);
       si_update_ps_inputs_read_or_disabled(sctx);
-      sctx->do_update_shaders = true;
    }
 
    if (sctx->screen->dpbb_allowed &&
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 3405e754e8ea7..20942729efa6a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2214,8 +2214,10 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
                     (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory);
    }
 
+   uint64_t ps_inputs_read_or_disabled;
+
    if (ps_disabled) {
-      sctx->ps_inputs_read_or_disabled = 0;
+      ps_inputs_read_or_disabled = 0;
    } else {
       uint64_t inputs_read = ps->info.inputs_read;
 
@@ -2227,7 +2229,12 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
             inputs_read |= BITFIELD64_BIT(SI_UNIQUE_SLOT_BFC1);
       }
 
-      sctx->ps_inputs_read_or_disabled = inputs_read;
+      ps_inputs_read_or_disabled = inputs_read;
+   }
+
+   if (sctx->ps_inputs_read_or_disabled != ps_inputs_read_or_disabled) {
+      sctx->ps_inputs_read_or_disabled = ps_inputs_read_or_disabled;
+      sctx->do_update_shaders = true;
    }
 }
 
@@ -2302,6 +2309,9 @@ void si_ps_key_update_framebuffer(struct si_context *sctx)
 void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx)
 {
    struct si_shader_selector *sel = sctx->shader.ps.cso;
+   if (!sel)
+      return;
+
    union si_shader_key *key = &sctx->shader.ps.key;
    struct si_state_blend *blend = sctx->queued.named.blend;
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
@@ -2309,8 +2319,14 @@ void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx)
                             sctx->framebuffer.nr_samples >= 2;
    unsigned need_src_alpha_4bit = blend->need_src_alpha_4bit;
 
-   if (!sel)
-      return;
+   /* Old key data for comparison. */
+   struct si_ps_epilog_bits old_epilog;
+   memcpy(&old_epilog, &key->ps.part.epilog, sizeof(old_epilog));
+   bool old_prefer_mono = key->ps.opt.prefer_mono;
+#ifndef NDEBUG
+   struct si_shader_key_ps old_key;
+   memcpy(&old_key, &key->ps, sizeof(old_key));
+#endif
 
    key->ps.part.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable;
    key->ps.part.epilog.alpha_to_coverage_via_mrtz =
@@ -2414,6 +2430,14 @@ void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx)
       key->ps.opt.prefer_mono = 1;
    else
       key->ps.opt.prefer_mono = 0;
+
+   /* Update shaders only if the key changed. */
+   if (memcmp(&key->ps.part.epilog, &old_epilog, sizeof(old_epilog)) ||
+       key->ps.opt.prefer_mono != old_prefer_mono) {
+      sctx->do_update_shaders = true;
+   } else {
+      assert(memcmp(&key->ps, &old_key, sizeof(old_key)) == 0);
+   }
 }
 
 void si_ps_key_update_rasterizer(struct si_context *sctx)
-- 
GitLab


From f9c4ac3477ba3beffbc1f12ba6f188ee332ba2e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 03:41:17 -0500
Subject: [PATCH 05/38] radeonsi: update shaders for rasterizer state only if
 the shader key changed

Check if any key bit changed before setting do_update_shaders.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.c       | 18 ++++++++++++------
 .../drivers/radeonsi/si_state_shaders.cpp     | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index f4237bccbee48..636ee6cf88322 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1251,24 +1251,30 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
    if (sctx->screen->dpbb_allowed && (old_rs->bottom_edge_rule != rs->bottom_edge_rule))
       si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
-   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
-       old_rs->rasterizer_discard != rs->rasterizer_discard ||
+   if (old_rs->rasterizer_discard != rs->rasterizer_discard ||
        old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
        old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side ||
        old_rs->multisample_enable != rs->multisample_enable ||
        old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
-       old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth ||
        old_rs->point_smooth != rs->point_smooth ||
        old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
-       old_rs->force_persample_interp != rs->force_persample_interp ||
-       old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) {
+       old_rs->force_persample_interp != rs->force_persample_interp) {
       si_ps_key_update_framebuffer_blend_rasterizer(sctx);
       si_ps_key_update_rasterizer(sctx);
       si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
       si_update_ps_inputs_read_or_disabled(sctx);
-      sctx->do_update_shaders = true;
    }
 
+   if (/* Used by si_get_vs_key_outputs in si_update_shaders: */
+       old_rs->clip_plane_enable != rs->clip_plane_enable ||
+       old_rs->polygon_mode_is_points != rs->polygon_mode_is_points ||
+       /* Used by si_ps_key_update_primtype_shader_rasterizer_framebuffer in si_update_shaders: */
+       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+       old_rs->poly_smooth != rs->poly_smooth ||
+       old_rs->line_smooth != rs->line_smooth ||
+       old_rs->point_smooth != rs->point_smooth)
+      sctx->do_update_shaders = true;
+
    if (old_rs->line_smooth != rs->line_smooth ||
        old_rs->poly_smooth != rs->poly_smooth ||
        old_rs->point_smooth != rs->point_smooth ||
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 20942729efa6a..7b193c9ad0872 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2449,9 +2449,18 @@ void si_ps_key_update_rasterizer(struct si_context *sctx)
    if (!sel)
       return;
 
+   bool old_color_two_side = key->ps.part.prolog.color_two_side;
+   bool old_flatshade_colors = key->ps.part.prolog.flatshade_colors;
+   bool old_clamp_color = key->ps.part.epilog.clamp_color;
+
    key->ps.part.prolog.color_two_side = rs->two_side && sel->info.colors_read;
    key->ps.part.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color;
    key->ps.part.epilog.clamp_color = rs->clamp_fragment_color;
+
+   if (key->ps.part.prolog.color_two_side != old_color_two_side ||
+       key->ps.part.prolog.flatshade_colors != old_flatshade_colors ||
+       key->ps.part.epilog.clamp_color != old_clamp_color)
+      sctx->do_update_shaders = true;
 }
 
 void si_ps_key_update_dsa(struct si_context *sctx)
@@ -2501,6 +2510,11 @@ void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *s
    if (!sel)
       return;
 
+   /* Old key data for comparison. */
+   struct si_ps_prolog_bits old_prolog;
+   memcpy(&old_prolog, &key->ps.part.prolog, sizeof(old_prolog));
+   bool old_interpolate_at_sample_force_center = key->ps.mono.interpolate_at_sample_force_center;
+
    bool uses_persp_center = sel->info.uses_persp_center ||
                             (!rs->flatshade && sel->info.uses_persp_center_color);
    bool uses_persp_centroid = sel->info.uses_persp_centroid ||
@@ -2548,6 +2562,11 @@ void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *s
       key->ps.part.prolog.bc_optimize_for_linear = 0;
       key->ps.mono.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample;
    }
+
+   /* Update shaders only if the key changed. */
+   if (memcmp(&key->ps.part.prolog, &old_prolog, sizeof(old_prolog)) ||
+       key->ps.mono.interpolate_at_sample_force_center != old_interpolate_at_sample_force_center)
+      sctx->do_update_shaders = true;
 }
 
 /* Compute the key for the hw shader variant */
-- 
GitLab


From 4ab5374ec361ce735e87d95550a72b9988c32f59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 04:18:15 -0500
Subject: [PATCH 06/38] radeonsi: clean up setting poly/line/stipple shader key
 bits

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 .../drivers/radeonsi/si_state_shaders.cpp     | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 7b193c9ad0872..23e9a126b5b6d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2475,16 +2475,20 @@ static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_co
    union si_shader_key *key = &sctx->shader.ps.key;
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
-   bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
-   bool is_line = util_prim_is_lines(sctx->current_rast_prim);
-
-   key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
-   key->ps.mono.poly_line_smoothing =
-      ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
-      sctx->framebuffer.nr_samples <= 1;
-
-   key->ps.mono.point_smoothing = rs->point_smooth &&
-                                  sctx->current_rast_prim == MESA_PRIM_POINTS;
+   if (sctx->current_rast_prim == MESA_PRIM_POINTS) {
+      key->ps.part.prolog.poly_stipple = 0;
+      key->ps.mono.poly_line_smoothing = 0;
+      key->ps.mono.point_smoothing = rs->point_smooth;
+   } else if (util_prim_is_lines(sctx->current_rast_prim)) {
+      key->ps.part.prolog.poly_stipple = 0;
+      key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1;
+      key->ps.mono.point_smoothing = 0;
+   } else {
+      /* Triangles. */
+      key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable;
+      key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1;
+      key->ps.mono.point_smoothing = 0;
+   }
 }
 
 void si_ps_key_update_sample_shading(struct si_context *sctx)
-- 
GitLab


From e2b817b948a3ad15f163135459b388aaef4eeec3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 04:56:56 -0500
Subject: [PATCH 07/38] radeonsi: rewrite how shader key bits dependent on
 current_rast_prim are updated

Don't set do_update_shaders every time current_rast_prim changes, which can
be EVERY DRAW. Instead, just update the shader key bits and set
do_update_shaders only if any bits are different.

When we bind a new rasterizer state, do the same.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_pipe.h        |  2 +-
 src/gallium/drivers/radeonsi/si_state.c       | 16 ++--
 src/gallium/drivers/radeonsi/si_state.h       |  1 +
 .../drivers/radeonsi/si_state_shaders.cpp     | 82 ++++++++++++-------
 4 files changed, 63 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 07492b9469ce3..901d7d1409833 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -2159,10 +2159,10 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim,
          si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
 
       sctx->current_rast_prim = rast_prim;
+      si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
       sctx->gs_out_prim = is_triangles ? V_028A6C_TRISTRIP :
                           is_lines ? V_028A6C_LINESTRIP :
                           is_rect ? V_028A6C_RECTLIST : V_028A6C_POINTLIST;
-      sctx->do_update_shaders = true;
       si_update_ngg_prim_state_sgpr(sctx, hw_vs, ngg);
    }
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 636ee6cf88322..7be00a921507e 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1265,14 +1265,15 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
       si_update_ps_inputs_read_or_disabled(sctx);
    }
 
-   if (/* Used by si_get_vs_key_outputs in si_update_shaders: */
-       old_rs->clip_plane_enable != rs->clip_plane_enable ||
-       old_rs->polygon_mode_is_points != rs->polygon_mode_is_points ||
-       /* Used by si_ps_key_update_primtype_shader_rasterizer_framebuffer in si_update_shaders: */
-       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
-       old_rs->poly_smooth != rs->poly_smooth ||
+   if (old_rs->point_smooth != rs->point_smooth ||
        old_rs->line_smooth != rs->line_smooth ||
-       old_rs->point_smooth != rs->point_smooth)
+       old_rs->poly_smooth != rs->poly_smooth ||
+       old_rs->polygon_mode_is_points != rs->polygon_mode_is_points ||
+       old_rs->poly_stipple_enable != rs->poly_stipple_enable)
+      si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
+
+   /* Used by si_get_vs_key_outputs in si_update_shaders: */
+   if (old_rs->clip_plane_enable != rs->clip_plane_enable)
       sctx->do_update_shaders = true;
 
    if (old_rs->line_smooth != rs->line_smooth ||
@@ -3166,6 +3167,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
    si_ps_key_update_framebuffer(sctx);
    si_ps_key_update_framebuffer_blend_rasterizer(sctx);
    si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
+   si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
    si_update_ps_inputs_read_or_disabled(sctx);
    sctx->do_update_shaders = true;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 1f4fda8c11184..6f2f146462f21 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -616,6 +616,7 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx);
 void si_update_vrs_flat_shading(struct si_context *sctx);
 unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key);
 bool si_update_ngg(struct si_context *sctx);
+void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx);
 void si_ps_key_update_framebuffer(struct si_context *sctx);
 void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx);
 void si_ps_key_update_rasterizer(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 23e9a126b5b6d..a7d74aa8c3386 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2238,6 +2238,47 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
    }
 }
 
+void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx)
+{
+   struct si_shader_ctx_state *hw_vs = si_get_vs(sctx);
+   if (!hw_vs->cso)
+      return;
+
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   union si_shader_key *vs_key = &hw_vs->key; /* could also be TES or GS before PS */
+   union si_shader_key *ps_key = &sctx->shader.ps.key;
+
+   bool old_kill_pointsize = vs_key->ge.opt.kill_pointsize;
+   bool old_poly_stipple = ps_key->ps.part.prolog.poly_stipple;
+   bool old_poly_line_smoothing = ps_key->ps.mono.poly_line_smoothing;
+   bool old_point_smoothing = ps_key->ps.mono.point_smoothing;
+
+   if (sctx->current_rast_prim == MESA_PRIM_POINTS) {
+      vs_key->ge.opt.kill_pointsize = 0;
+      ps_key->ps.part.prolog.poly_stipple = 0;
+      ps_key->ps.mono.poly_line_smoothing = 0;
+      ps_key->ps.mono.point_smoothing = rs->point_smooth;
+   } else if (util_prim_is_lines(sctx->current_rast_prim)) {
+      vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize;
+      ps_key->ps.part.prolog.poly_stipple = 0;
+      ps_key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1;
+      ps_key->ps.mono.point_smoothing = 0;
+   } else {
+      /* Triangles. */
+      vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize &&
+                                      !rs->polygon_mode_is_points;
+      ps_key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable;
+      ps_key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1;
+      ps_key->ps.mono.point_smoothing = 0;
+   }
+
+   if (vs_key->ge.opt.kill_pointsize != old_kill_pointsize ||
+       ps_key->ps.part.prolog.poly_stipple != old_poly_stipple ||
+       ps_key->ps.mono.poly_line_smoothing != old_poly_line_smoothing ||
+       ps_key->ps.mono.point_smoothing != old_point_smoothing)
+      sctx->do_update_shaders = true;
+}
+
 static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs,
                                   union si_shader_key *key)
 {
@@ -2247,15 +2288,12 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele
    uint64_t outputs_written = vs->info.outputs_written_before_ps;
    uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled;
 
+   key->ge.opt.kill_layer = vs->info.writes_layer &&
+                            sctx->framebuffer.state.layers <= 1;
    key->ge.opt.kill_outputs = ~linked & outputs_written;
    key->ge.opt.ngg_culling = sctx->ngg_culling;
    key->ge.mono.u.vs_export_prim_id = vs->stage != MESA_SHADER_GEOMETRY &&
                                       sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid;
-   key->ge.opt.kill_pointsize = vs->info.writes_psize &&
-                                sctx->current_rast_prim != MESA_PRIM_POINTS &&
-                                !sctx->queued.named.rasterizer->polygon_mode_is_points;
-   key->ge.opt.kill_layer = vs->info.writes_layer &&
-                            sctx->framebuffer.state.layers <= 1;
    key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask &&
                                   !sctx->streamout.enabled_mask;
 }
@@ -2268,7 +2306,6 @@ static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_se
    key->ge.opt.remove_streamout = 0;
    key->ge.opt.ngg_culling = 0;
    key->ge.mono.u.vs_export_prim_id = 0;
-   key->ge.opt.kill_pointsize = 0;
 }
 
 void si_ps_key_update_framebuffer(struct si_context *sctx)
@@ -2470,27 +2507,6 @@ void si_ps_key_update_dsa(struct si_context *sctx)
    key->ps.part.epilog.alpha_func = sctx->queued.named.dsa->alpha_func;
 }
 
-static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx)
-{
-   union si_shader_key *key = &sctx->shader.ps.key;
-   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-
-   if (sctx->current_rast_prim == MESA_PRIM_POINTS) {
-      key->ps.part.prolog.poly_stipple = 0;
-      key->ps.mono.poly_line_smoothing = 0;
-      key->ps.mono.point_smoothing = rs->point_smooth;
-   } else if (util_prim_is_lines(sctx->current_rast_prim)) {
-      key->ps.part.prolog.poly_stipple = 0;
-      key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1;
-      key->ps.mono.point_smoothing = 0;
-   } else {
-      /* Triangles. */
-      key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable;
-      key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1;
-      key->ps.mono.point_smoothing = 0;
-   }
-}
-
 void si_ps_key_update_sample_shading(struct si_context *sctx)
 {
    struct si_shader_selector *sel = sctx->shader.ps.cso;
@@ -2616,7 +2632,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
       }
       break;
    case MESA_SHADER_FRAGMENT:
-      si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx);
       break;
    default:
       assert(0);
@@ -3491,11 +3506,18 @@ static void si_update_last_vgt_stage_state(struct si_context *sctx,
                                            struct si_shader_selector *old_hw_vs,
                                            struct si_shader *old_hw_vs_variant)
 {
+   struct si_shader_ctx_state *hw_vs = si_get_vs(sctx);
+
    si_update_vs_viewport_state(sctx);
    si_update_streamout_state(sctx);
-   si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
-                       si_get_vs(sctx)->current);
+   si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, hw_vs->cso, hw_vs->current);
    si_update_rasterized_prim(sctx);
+
+   /* Clear kill_pointsize because we only want it to be set in the last shader before PS. */
+   sctx->shader.vs.key.ge.opt.kill_pointsize = 0;
+   sctx->shader.tes.key.ge.opt.kill_pointsize = 0;
+   sctx->shader.gs.key.ge.opt.kill_pointsize = 0;
+   si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
 }
 
 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
-- 
GitLab


From 53aa36772a1ab7764cc02498ccfec4a515393231 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 05:11:33 -0500
Subject: [PATCH 08/38] radeonsi: rewrite si_get_total_colormask as
 si_any_colorbuffer_written

The result is only used as bool.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_blit.c        |  2 +-
 src/gallium/drivers/radeonsi/si_pipe.h        | 20 +++++++------------
 .../drivers/radeonsi/si_state_shaders.cpp     |  4 ++--
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 230ea7bd27c7c..81142d7556086 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -733,7 +733,7 @@ static void si_check_render_feedback(struct si_context *sctx)
    /* There is no render feedback if color writes are disabled.
     * (e.g. a pixel shader with image stores)
     */
-   if (!si_get_total_colormask(sctx))
+   if (!si_any_colorbuffer_written(sctx))
       return;
 
    for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; ++i) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 901d7d1409833..5cf091451b657 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1972,24 +1972,18 @@ static inline unsigned si_get_ps_iter_samples(struct si_context *sctx)
    return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples);
 }
 
-static inline unsigned si_get_total_colormask(struct si_context *sctx)
+static inline bool si_any_colorbuffer_written(struct si_context *sctx)
 {
    if (sctx->queued.named.rasterizer->rasterizer_discard)
-      return 0;
+      return false;
 
    struct si_shader_selector *ps = sctx->shader.ps.cso;
-   if (!ps)
-      return 0;
-
-   unsigned colormask =
-      sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask;
-
-   if (!ps->info.color0_writes_all_cbufs)
-      colormask &= ps->info.colors_written_4bit;
-   else if (!ps->info.colors_written_4bit)
-      colormask = 0; /* color0 writes all cbufs, but it's not written */
+   if (!ps || !ps->info.colors_written_4bit)
+      return false;
 
-   return colormask;
+   return (sctx->framebuffer.colorbuf_enabled_4bit &
+           sctx->queued.named.blend->cb_target_enabled_4bit &
+           (ps->info.color0_writes_all_cbufs ? ~0 : ps->info.colors_written_4bit)) != 0;
 }
 
 #define UTIL_ALL_PRIM_LINE_MODES                                                                   \
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index a7d74aa8c3386..bc4af64780cd7 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2208,10 +2208,10 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
                             sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS ||
                             sctx->queued.named.rasterizer->poly_stipple_enable ||
                             sctx->queued.named.rasterizer->point_smooth;
-      unsigned ps_colormask = si_get_total_colormask(sctx);
 
       ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
-                    (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory);
+                    (!ps_modifies_zs && !ps->info.base.writes_memory &&
+                     !si_any_colorbuffer_written(sctx));
    }
 
    uint64_t ps_inputs_read_or_disabled;
-- 
GitLab


From bf7debee82c95f15f50f616c3c800eb6e19acff2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 05:26:44 -0500
Subject: [PATCH 09/38] radeonsi: in bind_{blend,rs}_state, only call 1 update
 function per if

Also don't use "key.ps.part.prolog.color_two_side" during updates
because it would depend on the order the update functions are called,
which is not a problem now, but it's a trap for the future.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.c       | 29 ++++++++++++-------
 .../drivers/radeonsi/si_state_shaders.cpp     |  2 +-
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 7be00a921507e..7671a66fcc40f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -772,10 +772,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
        old_blend->alpha_to_one != blend->alpha_to_one ||
        old_blend->dual_src_blend != blend->dual_src_blend ||
        old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
-       old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) {
+       old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
       si_ps_key_update_framebuffer_blend_rasterizer(sctx);
+
+   if (old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
+       old_blend->alpha_to_coverage != blend->alpha_to_coverage)
       si_update_ps_inputs_read_or_disabled(sctx);
-   }
 
    if (sctx->screen->dpbb_allowed &&
        (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
@@ -1251,19 +1253,24 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
    if (sctx->screen->dpbb_allowed && (old_rs->bottom_edge_rule != rs->bottom_edge_rule))
       si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
-   if (old_rs->rasterizer_discard != rs->rasterizer_discard ||
-       old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
-       old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side ||
-       old_rs->multisample_enable != rs->multisample_enable ||
-       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
-       old_rs->point_smooth != rs->point_smooth ||
-       old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
-       old_rs->force_persample_interp != rs->force_persample_interp) {
+   if (old_rs->multisample_enable != rs->multisample_enable)
       si_ps_key_update_framebuffer_blend_rasterizer(sctx);
+
+   if (old_rs->two_side != rs->two_side ||
+       old_rs->flatshade != rs->flatshade ||
+       old_rs->clamp_fragment_color != rs->clamp_fragment_color)
       si_ps_key_update_rasterizer(sctx);
+
+   if (old_rs->flatshade != rs->flatshade ||
+       old_rs->force_persample_interp != rs->force_persample_interp ||
+       old_rs->multisample_enable != rs->multisample_enable)
       si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
+
+   if (old_rs->rasterizer_discard != rs->rasterizer_discard ||
+       old_rs->two_side != rs->two_side ||
+       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+       old_rs->point_smooth != rs->point_smooth)
       si_update_ps_inputs_read_or_disabled(sctx);
-   }
 
    if (old_rs->point_smooth != rs->point_smooth ||
        old_rs->line_smooth != rs->line_smooth ||
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index bc4af64780cd7..e52f799fcc84d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2221,7 +2221,7 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
    } else {
       uint64_t inputs_read = ps->info.inputs_read;
 
-      if (sctx->shader.ps.key.ps.part.prolog.color_two_side) {
+      if (ps->info.colors_read && sctx->queued.named.rasterizer->two_side) {
          if (inputs_read & BITFIELD64_BIT(SI_UNIQUE_SLOT_COL0))
             inputs_read |= BITFIELD64_BIT(SI_UNIQUE_SLOT_BFC0);
 
-- 
GitLab


From 202285436027b5ba0519407b466240481660b545 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 10 Nov 2023 19:26:20 -0500
Subject: [PATCH 10/38] radeonsi/gfx11: skip si_set_streamout_enable because it
 has no effect

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state_streamout.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index 73a8213468174..b61cfb89a8a4a 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -359,6 +359,9 @@ static void si_emit_streamout_enable(struct si_context *sctx, unsigned index)
 
 static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 {
+   if (sctx->gfx_level >= GFX11)
+      return;
+
    bool old_strmout_en = si_get_strmout_en(sctx);
    unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
 
@@ -368,9 +371,8 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable)
       sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
       (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
 
-   if (sctx->gfx_level < GFX11 &&
-       ((old_strmout_en != si_get_strmout_en(sctx)) ||
-        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
+   if ((old_strmout_en != si_get_strmout_en(sctx)) ||
+       (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
       si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 }
 
-- 
GitLab


From 9e764596163a598a7b0b2082ff79d5cacd9e650c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 10 Nov 2023 19:27:37 -0500
Subject: [PATCH 11/38] radeonsi: execute streamout_begin after cache flushes

so that si_emit_streamout_begin can assume that cache flushes have
finished.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 6f2f146462f21..9441459cf2c68 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -185,8 +185,7 @@ union si_state_atoms {
       /* This must be first. */
       struct si_atom pm4_states[SI_NUM_STATES];
       struct si_atom gfx_add_all_to_bo_list;
-      struct si_atom streamout_begin;
-      struct si_atom streamout_enable; /* must be after streamout_begin */
+      struct si_atom streamout_enable;
       struct si_atom framebuffer;
       struct si_atom sample_locations;
       struct si_atom db_render_state;
@@ -210,6 +209,7 @@ union si_state_atoms {
       struct si_atom vgt_pipeline_state;
       struct si_atom tess_io_layout;
       struct si_atom cache_flush;
+      struct si_atom streamout_begin; /* this must be done after cache_flush */
       struct si_atom render_cond; /* this must be after cache_flush */
    } s;
    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
-- 
GitLab


From 1afe6f33212d9ae8aeffd9f2d464df85eb5facda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 10 Nov 2023 19:16:39 -0500
Subject: [PATCH 12/38] radeonsi: don't print the preamble state separately for
 GALLIUM_DDEBUG

because it's always printed as part of command buffers.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_debug.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index 5e62ec1b0f005..a6a8c8902a28d 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -392,12 +392,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
       last_trace_id = map[0];
 
    if (chunk->gfx_end != chunk->gfx_begin) {
-      if (chunk->gfx_begin == 0) {
-         if (ctx->cs_preamble_state)
-            ac_parse_ib(f, ctx->cs_preamble_state->pm4, ctx->cs_preamble_state->ndw, NULL, 0,
-                        "IB2: Init config", ctx->gfx_level, ctx->family, AMD_IP_GFX, NULL, NULL);
-      }
-
       if (scs->flushed) {
          ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin,
                      &last_trace_id, map ? 1 : 0, "IB", ctx->gfx_level, ctx->family, AMD_IP_GFX, NULL, NULL);
-- 
GitLab


From 55d81214c9d800c0667337808a82143ebab17c1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 07:06:29 -0500
Subject: [PATCH 13/38] radeonsi: replace gl_FrontFacing with a constant if one
 side is always culled

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 .../drivers/radeonsi/si_nir_lower_abi.c       |  5 ++++
 src/gallium/drivers/radeonsi/si_shader.c      |  6 +++-
 src/gallium/drivers/radeonsi/si_shader.h      |  3 ++
 src/gallium/drivers/radeonsi/si_state.c       | 15 ++++++++--
 src/gallium/drivers/radeonsi/si_state.h       |  1 +
 .../drivers/radeonsi/si_state_shaders.cpp     | 30 +++++++++++++++----
 6 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
index fdd29fdbbc5dc..e57d9590be691 100644
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -587,6 +587,11 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
    case nir_intrinsic_load_alpha_reference_amd:
       replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference);
       break;
+   case nir_intrinsic_load_front_face:
+      if (!key->ps.opt.force_front_face_input)
+         return false;
+      replacement = nir_imm_bool(b, key->ps.opt.force_front_face_input == 1);
+      break;
    case nir_intrinsic_load_barycentric_optimize_amd: {
       nir_def *prim_mask = ac_nir_load_arg(b, &args->ac, args->ac.prim_mask);
       /* enabled when bit 31 is set */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 1f24c78a57e37..89c4c7a93f127 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2645,6 +2645,10 @@ si_set_spi_ps_input_config(struct si_shader *shader)
    const struct si_shader_info *info = &sel->info;
    const union si_shader_key *key = &shader->key;
 
+   /* TODO: This should be determined from the final NIR instead of the input NIR,
+    * otherwise LLVM will have a performance advantage here because it determines
+    * VGPR inputs for each shader variant after LLVM optimizations.
+    */
    shader->config.spi_ps_input_ena =
       S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
       S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
@@ -2652,7 +2656,7 @@ si_set_spi_ps_input_config(struct si_shader *shader)
       S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
       S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
       S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
-      S_0286CC_FRONT_FACE_ENA(info->uses_frontface) |
+      S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
       S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
       S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index edf054999be03..f7dcc961c7602 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -797,6 +797,9 @@ struct si_shader_key_ps {
       unsigned prefer_mono : 1;
       unsigned inline_uniforms:1;
 
+      /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
+      int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
+
       /* This must be kept last to limit the number of variants
        * depending only on the uniform values.
        */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 7671a66fcc40f..2c633cf55e1e4 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1061,6 +1061,14 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
       }
    }
 
+   /* Force gl_FrontFacing to true or false if the other face is culled. */
+   if (util_bitcount(state->cull_face) == 1) {
+      if (state->cull_face & PIPE_FACE_FRONT)
+         rs->force_front_face_input = -1;
+      else
+         rs->force_front_face_input = 1;
+   }
+
    unsigned spi_interp_control_0 =
       S_0286D4_FLAT_SHADE_ENA(1) |
       S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
@@ -1256,8 +1264,7 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
    if (old_rs->multisample_enable != rs->multisample_enable)
       si_ps_key_update_framebuffer_blend_rasterizer(sctx);
 
-   if (old_rs->two_side != rs->two_side ||
-       old_rs->flatshade != rs->flatshade ||
+   if (old_rs->flatshade != rs->flatshade ||
        old_rs->clamp_fragment_color != rs->clamp_fragment_color)
       si_ps_key_update_rasterizer(sctx);
 
@@ -1276,7 +1283,9 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
        old_rs->line_smooth != rs->line_smooth ||
        old_rs->poly_smooth != rs->poly_smooth ||
        old_rs->polygon_mode_is_points != rs->polygon_mode_is_points ||
-       old_rs->poly_stipple_enable != rs->poly_stipple_enable)
+       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+       old_rs->two_side != rs->two_side ||
+       old_rs->force_front_face_input != rs->force_front_face_input)
       si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
 
    /* Used by si_get_vs_key_outputs in si_update_shaders: */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 9441459cf2c68..8b8f5dcaf1184 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -85,6 +85,7 @@ struct si_state_rasterizer {
    unsigned polygon_mode_is_points : 1;
    unsigned perpendicular_end_caps : 1;
    unsigned bottom_edge_rule : 1;
+   int force_front_face_input : 2;
 };
 
 struct si_dsa_stencil_ref_part {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index e52f799fcc84d..ef7672368eb9a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -1872,6 +1872,14 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
    assert(!shader->key.ps.part.prolog.force_linear_sample_interp ||
           (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
 
+   /* color_two_side always enables FRONT_FACE. Since st/mesa disables two-side colors if the back
+    * face is culled, the only case when both color_two_side and force_front_face_input can be set
+    * is when the front face is culled (which means force_front_face_input == -1).
+    */
+   assert(!shader->key.ps.opt.force_front_face_input || !G_0286CC_FRONT_FACE_ENA(input_ena) ||
+          (shader->key.ps.part.prolog.color_two_side &&
+           shader->key.ps.opt.force_front_face_input == -1));
+
    /* Validate cases when the optimizations are off (read as implications). */
    assert(shader->key.ps.part.prolog.bc_optimize_for_persp ||
           !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena));
@@ -2241,7 +2249,9 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
 void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx)
 {
    struct si_shader_ctx_state *hw_vs = si_get_vs(sctx);
-   if (!hw_vs->cso)
+   struct si_shader_selector *ps = sctx->shader.ps.cso;
+
+   if (!hw_vs->cso || !ps)
       return;
 
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
@@ -2249,33 +2259,44 @@ void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx)
    union si_shader_key *ps_key = &sctx->shader.ps.key;
 
    bool old_kill_pointsize = vs_key->ge.opt.kill_pointsize;
+   bool old_color_two_side = ps_key->ps.part.prolog.color_two_side;
    bool old_poly_stipple = ps_key->ps.part.prolog.poly_stipple;
    bool old_poly_line_smoothing = ps_key->ps.mono.poly_line_smoothing;
    bool old_point_smoothing = ps_key->ps.mono.point_smoothing;
+   int old_force_front_face_input = ps_key->ps.opt.force_front_face_input;
 
    if (sctx->current_rast_prim == MESA_PRIM_POINTS) {
       vs_key->ge.opt.kill_pointsize = 0;
+      ps_key->ps.part.prolog.color_two_side = 0;
       ps_key->ps.part.prolog.poly_stipple = 0;
       ps_key->ps.mono.poly_line_smoothing = 0;
       ps_key->ps.mono.point_smoothing = rs->point_smooth;
+      ps_key->ps.opt.force_front_face_input = ps->info.uses_frontface;
    } else if (util_prim_is_lines(sctx->current_rast_prim)) {
       vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize;
+      ps_key->ps.part.prolog.color_two_side = 0;
       ps_key->ps.part.prolog.poly_stipple = 0;
       ps_key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1;
       ps_key->ps.mono.point_smoothing = 0;
+      ps_key->ps.opt.force_front_face_input = ps->info.uses_frontface;
    } else {
       /* Triangles. */
       vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize &&
                                       !rs->polygon_mode_is_points;
+      ps_key->ps.part.prolog.color_two_side = rs->two_side && ps->info.colors_read;
       ps_key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable;
       ps_key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1;
       ps_key->ps.mono.point_smoothing = 0;
+      ps_key->ps.opt.force_front_face_input = rs->force_front_face_input &&
+                                              ps->info.uses_frontface;
    }
 
    if (vs_key->ge.opt.kill_pointsize != old_kill_pointsize ||
+       ps_key->ps.part.prolog.color_two_side != old_color_two_side ||
        ps_key->ps.part.prolog.poly_stipple != old_poly_stipple ||
        ps_key->ps.mono.poly_line_smoothing != old_poly_line_smoothing ||
-       ps_key->ps.mono.point_smoothing != old_point_smoothing)
+       ps_key->ps.mono.point_smoothing != old_point_smoothing ||
+       ps_key->ps.opt.force_front_face_input != old_force_front_face_input)
       sctx->do_update_shaders = true;
 }
 
@@ -2486,16 +2507,13 @@ void si_ps_key_update_rasterizer(struct si_context *sctx)
    if (!sel)
       return;
 
-   bool old_color_two_side = key->ps.part.prolog.color_two_side;
    bool old_flatshade_colors = key->ps.part.prolog.flatshade_colors;
    bool old_clamp_color = key->ps.part.epilog.clamp_color;
 
-   key->ps.part.prolog.color_two_side = rs->two_side && sel->info.colors_read;
    key->ps.part.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color;
    key->ps.part.epilog.clamp_color = rs->clamp_fragment_color;
 
-   if (key->ps.part.prolog.color_two_side != old_color_two_side ||
-       key->ps.part.prolog.flatshade_colors != old_flatshade_colors ||
+   if (key->ps.part.prolog.flatshade_colors != old_flatshade_colors ||
        key->ps.part.epilog.clamp_color != old_clamp_color)
       sctx->do_update_shaders = true;
 }
-- 
GitLab


From 7fa0ee15df200d4a0114804e43187d26a717eb5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 09:51:56 -0500
Subject: [PATCH 14/38] radeonsi: set OOB_SELECT for VBOs in
 si_create_vertex_elements

we can do this since the stride is in the CSO now

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.c        | 11 +++++++++--
 src/gallium/drivers/radeonsi/si_state_draw.cpp | 12 +-----------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 2c633cf55e1e4..9fc1ade843620 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -5328,7 +5328,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
             (sscreen->info.gfx_level == GFX6 || sscreen->info.gfx_level >= GFX10);
       bool opencode = sscreen->options.vs_fetch_always_opencode;
 
-      if (check_alignment && ((elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0 || elements[i].src_stride & 3))
+      if (check_alignment && ((elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0 ||
+                              elements[i].src_stride & 3))
          opencode = true;
 
       if (always_fix || check_alignment || opencode)
@@ -5357,7 +5358,13 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
          ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128;
          assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format);
          v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) |
-                             S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11);
+                             S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) |
+                             /* OOB_SELECT chooses the out-of-bounds check:
+                              *  - 1: index >= NUM_RECORDS (Structured)
+                              *  - 3: offset >= NUM_RECORDS (Raw)
+                              */
+                             S_008F0C_OOB_SELECT(v->src_stride[i] ? V_008F0C_OOB_SELECT_STRUCTURED
+                                                                  : V_008F0C_OOB_SELECT_RAW);
       } else {
          unsigned data_format, num_format;
          data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 95e1478c5cc05..f522f330dc2af 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1643,20 +1643,10 @@ static void ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems
    }
    assert(num_records >= 0 && num_records <= UINT_MAX);
 
-   uint32_t rsrc_word3 = velems->rsrc_word3[index];
-
-   /* OOB_SELECT chooses the out-of-bounds check:
-    *  - 1: index >= NUM_RECORDS (Structured)
-    *  - 3: offset >= NUM_RECORDS (Raw)
-    */
-   if (GFX_VERSION >= GFX10)
-      rsrc_word3 |= S_008F0C_OOB_SELECT(stride ? V_008F0C_OOB_SELECT_STRUCTURED
-                                               : V_008F0C_OOB_SELECT_RAW);
-
    desc[0] = va;
    desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
    desc[2] = num_records;
-   desc[3] = rsrc_word3;
+   desc[3] = velems->rsrc_word3[index];
 }
 
 #if GFX_VER == 6 /* declare this function only once because it supports all chips. */
-- 
GitLab


From 48ce5fbaa19e2afad97ef98f66b17326fef19c14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Nov 2023 10:10:24 -0500
Subject: [PATCH 15/38] radeonsi: group most vertex element fields

fix_fetch and vertex_buffer_index can't be grouped because we do
memcmp on those arrays

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state.c       | 34 ++++++++++---------
 src/gallium/drivers/radeonsi/si_state.h       | 16 +++++----
 .../drivers/radeonsi/si_state_draw.cpp        |  8 ++---
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 9fc1ade843620..77e4625dca1a2 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -5221,10 +5221,10 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
       first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
       channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
 
-      v->format_size[i] = desc->block.bits / 8;
-      v->src_offset[i] = elements[i].src_offset;
+      v->elem[i].format_size = desc->block.bits / 8;
+      v->elem[i].src_offset = elements[i].src_offset;
+      v->elem[i].stride = elements[i].src_stride;
       v->vertex_buffer_index[i] = vbo_index;
-      v->src_stride[i] = elements[i].src_stride;
 
       bool always_fix = false;
       union si_vs_fix_fetch fix_fetch;
@@ -5348,28 +5348,30 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
          v->vb_alignment_check_mask |= 1 << vbo_index;
       }
 
-      v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
-                         S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
-                         S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
-                         S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+      v->elem[i].rsrc_word3 = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+                              S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+                              S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+                              S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
 
       if (sscreen->info.gfx_level >= GFX10) {
          const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&sscreen->info)[elements[i].src_format];
          ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128;
          assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format);
-         v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) |
-                             S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) |
-                             /* OOB_SELECT chooses the out-of-bounds check:
-                              *  - 1: index >= NUM_RECORDS (Structured)
-                              *  - 3: offset >= NUM_RECORDS (Raw)
-                              */
-                             S_008F0C_OOB_SELECT(v->src_stride[i] ? V_008F0C_OOB_SELECT_STRUCTURED
-                                                                  : V_008F0C_OOB_SELECT_RAW);
+         v->elem[i].rsrc_word3 |=
+            S_008F0C_FORMAT(fmt->img_format) |
+            S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) |
+            /* OOB_SELECT chooses the out-of-bounds check:
+             *  - 1: index >= NUM_RECORDS (Structured)
+             *  - 3: offset >= NUM_RECORDS (Raw)
+             */
+            S_008F0C_OOB_SELECT(v->elem[i].stride ? V_008F0C_OOB_SELECT_STRUCTURED
+                                                  : V_008F0C_OOB_SELECT_RAW);
       } else {
          unsigned data_format, num_format;
          data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
          num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
-         v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
+         v->elem[i].rsrc_word3 |= S_008F0C_NUM_FORMAT(num_format) |
+                                  S_008F0C_DATA_FORMAT(data_format);
       }
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 8b8f5dcaf1184..0cee83c1632fd 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -126,12 +126,6 @@ struct si_stencil_ref {
 
 struct si_vertex_elements {
    struct si_resource *instance_divisor_factor_buffer;
-   uint32_t rsrc_word3[SI_MAX_ATTRIBS];
-   uint16_t src_offset[SI_MAX_ATTRIBS];
-   uint16_t src_stride[SI_MAX_ATTRIBS];
-   uint8_t fix_fetch[SI_MAX_ATTRIBS];
-   uint8_t format_size[SI_MAX_ATTRIBS];
-   uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
 
    /* Bitmask of elements that always need a fixup to be applied. */
    uint16_t fix_fetch_always;
@@ -158,6 +152,16 @@ struct si_vertex_elements {
    uint16_t vb_desc_list_alloc_size;
    uint16_t instance_divisor_is_one;     /* bitmask of inputs */
    uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
+
+   uint8_t fix_fetch[SI_MAX_ATTRIBS];
+   uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
+
+   struct {
+      uint32_t rsrc_word3;
+      uint16_t src_offset;
+      uint16_t stride;
+      uint8_t format_size;
+   } elem[SI_MAX_ATTRIBS];
 };
 
 union si_state {
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index f522f330dc2af..738968a878d1c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1626,7 +1626,7 @@ static void ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems
                                                uint32_t *desc) /* where to upload descriptors */
 {
    struct si_resource *buf = si_resource(vb->buffer.resource);
-   int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[index];
+   int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->elem[index].src_offset;
 
    if (!buf || offset >= buf->b.b.width0) {
       memset(desc, 0, 16);
@@ -1634,19 +1634,19 @@ static void ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems
    }
 
    uint64_t va = buf->gpu_address + offset;
-   unsigned stride = velems->src_stride[index];
+   unsigned stride = velems->elem[index].stride;
 
    int64_t num_records = (int64_t)buf->b.b.width0 - offset;
    if (GFX_VERSION != GFX8 && stride) {
       /* Round up by rounding down and adding 1 */
-      num_records = (num_records - velems->format_size[index]) / stride + 1;
+      num_records = (num_records - velems->elem[index].format_size) / stride + 1;
    }
    assert(num_records >= 0 && num_records <= UINT_MAX);
 
    desc[0] = va;
    desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
    desc[2] = num_records;
-   desc[3] = velems->rsrc_word3[index];
+   desc[3] = velems->elem[index].rsrc_word3;
 }
 
 #if GFX_VER == 6 /* declare this function only once because it supports all chips. */
-- 
GitLab


From 65b3b0b355de7644a3efd70143c66e4a6189024e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Nov 2023 07:40:54 -0500
Subject: [PATCH 16/38] radeonsi/gfx11: prefer Wave64 for PS without inputs for
 better VALU perf

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state_shaders.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index ef7672368eb9a..0ae0e4b6e7cbd 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -86,10 +86,12 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
    if (dbg_wave_size)
       return dbg_wave_size;
 
-   /* Pixel shaders without interp instructions don't suffer from reduced interpolation
+   /* Gfx10: Pixel shaders without interp instructions don't suffer from reduced interpolation
     * performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
+    *
+    * Gfx11: Prefer Wave64 to take advantage of doubled VALU performance.
     */
-   if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
+   if (sscreen->info.gfx_level < GFX11 && stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
       return 32;
 
    /* There are a few very rare cases where VS is better with Wave32, and there are no known
@@ -111,6 +113,8 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
    /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
     * while the other half is idling but occupying VGPRs, preventing other waves from launching.
     * Wave32 eliminates the idling half to allow the next wave to start.
+    *
+    * Gfx11: Wave32 continues to be faster with divergent loops despite worse VALU performance.
     */
    if (!merged_shader && info && info->has_divergent_loop)
       return 32;
-- 
GitLab


From f85488824e6abaf3a8ae36f8ca5016c05ba8a978 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Nov 2023 17:49:26 -0500
Subject: [PATCH 17/38] radeonsi/gfx11: disable the shader profile for Medical
 that forces Wave64

GFX10 should keep using it, but not GFX11.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.h          | 2 +-
 src/gallium/drivers/radeonsi/si_shader_info.c     | 2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.cpp | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index f7dcc961c7602..1df5f01b6410f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -302,7 +302,7 @@ enum
 #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)
 
 #define SI_PROFILE_WAVE32                    (1 << 0)
-#define SI_PROFILE_WAVE64                    (1 << 1)
+#define SI_PROFILE_GFX10_WAVE64              (1 << 1)
 /* bit gap */
 #define SI_PROFILE_VS_NO_BINNING             (1 << 3)
 #define SI_PROFILE_PS_NO_BINNING             (1 << 4)
diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c
index de2fc335f80a1..4141968d83e08 100644
--- a/src/gallium/drivers/radeonsi/si_shader_info.c
+++ b/src/gallium/drivers/radeonsi/si_shader_info.c
@@ -33,7 +33,7 @@ static struct si_shader_profile profiles[] =
        * probably due to interpolation performance.
        */
       {0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8},
-      SI_PROFILE_WAVE64,
+      SI_PROFILE_GFX10_WAVE64,
    },
    {
       /* Viewperf/Creo */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 0ae0e4b6e7cbd..f8638eda48081 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -67,7 +67,8 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
    if (info && info->options & SI_PROFILE_WAVE32)
       profile_wave_size = 32;
 
-   if (info && info->options & SI_PROFILE_WAVE64) {
+   if (info && info->options & SI_PROFILE_GFX10_WAVE64 &&
+       (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3)) {
       assert(!profile_wave_size);
       profile_wave_size = 64;
    }
-- 
GitLab


From 716b521515bc5fcdd7acfcb23202032e43bffae1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Nov 2023 17:49:26 -0500
Subject: [PATCH 18/38] radeonsi/gfx11: disable the shader profile for Medical
 that disables binning

GFX11 performs better with the default behavior.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.h          | 2 +-
 src/gallium/drivers/radeonsi/si_shader_info.c     | 2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.cpp | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 1df5f01b6410f..3c4f90e2dd187 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -305,7 +305,7 @@ enum
 #define SI_PROFILE_GFX10_WAVE64              (1 << 1)
 /* bit gap */
 #define SI_PROFILE_VS_NO_BINNING             (1 << 3)
-#define SI_PROFILE_PS_NO_BINNING             (1 << 4)
+#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING  (1 << 4)
 #define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)
 
 enum si_shader_dump_type {
diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c
index 4141968d83e08..b6af2badffc5d 100644
--- a/src/gallium/drivers/radeonsi/si_shader_info.c
+++ b/src/gallium/drivers/radeonsi/si_shader_info.c
@@ -26,7 +26,7 @@ static struct si_shader_profile profiles[] =
    {
       /* Viewperf/Medical */
       {0x4dce4331, 0x38f778d5, 0x1b75a717, 0x3e454fb9, 0xeb1527f0},
-      SI_PROFILE_PS_NO_BINNING,
+      SI_PROFILE_GFX9_GFX10_PS_NO_BINNING,
    },
    {
       /* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index f8638eda48081..ebb846d1e39d0 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -3773,7 +3773,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
    si_update_vrs_flat_shading(sctx);
 
    if (sctx->screen->dpbb_allowed) {
-      bool force_off = sel && sel->info.options & SI_PROFILE_PS_NO_BINNING;
+      bool force_off = sel && sel->info.options & SI_PROFILE_GFX9_GFX10_PS_NO_BINNING &&
+                       (sctx->gfx_level >= GFX9 && sctx->gfx_level <= GFX10_3);
 
       if (force_off != sctx->dpbb_force_off_profile_ps) {
          sctx->dpbb_force_off_profile_ps = force_off;
-- 
GitLab


From 257f07f499f8b7794f4e4ca49fda98ccdb10a396 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Nov 2023 18:18:17 -0500
Subject: [PATCH 19/38] radeonsi: clean up how debug flags and shader profiles
 determine the wave size

- remove DBG_W32_PS_DISCARD
- just return the wave size instead of setting local variables dbg_wave_size
  and profile_wave_size

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_pipe.c        |  1 -
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 -
 .../drivers/radeonsi/si_state_shaders.cpp     | 36 +++++--------------
 3 files changed, 8 insertions(+), 30 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 5a2111d66985e..c00fb1e923bfc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -56,7 +56,6 @@ static const struct debug_named_value radeonsi_debug_options[] = {
    /* Shader compiler options the shader cache should be aware of: */
    {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
    {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
-   {"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."},
    {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
    {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
    {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 5cf091451b657..20fdcf180eeec 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -196,7 +196,6 @@ enum
    /* Shader compiler options the shader cache should be aware of: */
    DBG_W32_GE,
    DBG_W32_PS,
-   DBG_W32_PS_DISCARD,
    DBG_W32_CS,
    DBG_W64_GE,
    DBG_W64_PS,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index ebb846d1e39d0..18646a5f00cf3 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -48,44 +48,24 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
         info->base.workgroup_size[2]) % 64 != 0)
       return 32;
 
-   /* Debug flags. */
-   unsigned dbg_wave_size = 0;
+   /* AMD_DEBUG wave flags override everything else. */
    if (sscreen->debug_flags &
        (stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) :
-        stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE)))
-      dbg_wave_size = 32;
+        stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) : DBG(W32_GE)))
+      return 32;
 
    if (sscreen->debug_flags &
        (stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) :
-        stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) {
-      assert(!dbg_wave_size);
-      dbg_wave_size = 64;
-   }
+        stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE)))
+      return 64;
 
    /* Shader profiles. */
-   unsigned profile_wave_size = 0;
    if (info && info->options & SI_PROFILE_WAVE32)
-      profile_wave_size = 32;
+      return 32;
 
    if (info && info->options & SI_PROFILE_GFX10_WAVE64 &&
-       (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3)) {
-      assert(!profile_wave_size);
-      profile_wave_size = 64;
-   }
-
-   if (profile_wave_size) {
-      /* Only debug flags override shader profiles. */
-      if (dbg_wave_size)
-         return dbg_wave_size;
-
-      return profile_wave_size;
-   }
-
-   /* Debug flags except w32psdiscard don't override the discard bug workaround,
-    * but they override everything else.
-    */
-   if (dbg_wave_size)
-      return dbg_wave_size;
+       (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3))
+      return 64;
 
    /* Gfx10: Pixel shaders without interp instructions don't suffer from reduced interpolation
     * performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
-- 
GitLab


From c77bcf00a36c0c6f2bc42a052de5152589f0a372 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Nov 2023 19:26:14 -0500
Subject: [PATCH 20/38] radeonsi/gfx11: prefer Wave64 for VS/TCS/TES/GS because
 it's slightly faster

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state_shaders.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 18646a5f00cf3..95ee06b33130d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -75,12 +75,16 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
    if (sscreen->info.gfx_level < GFX11 && stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
       return 32;
 
-   /* There are a few very rare cases where VS is better with Wave32, and there are no known
-    * cases where Wave64 is better.
+   /* Gfx10: There are a few very rare cases where VS is better with Wave32, and there are no
+    * known cases where Wave64 is better.
+    *
     * Wave32 is disabled for GFX10 when culling is active as a workaround for #6457. I don't
     * know why this helps.
+    *
+    * Gfx11: Prefer Wave64 because it's slightly better than Wave32.
     */
    if (stage <= MESA_SHADER_GEOMETRY &&
+       (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3) &&
        !(sscreen->info.gfx_level == GFX10 && shader && shader->key.ge.opt.ngg_culling))
       return 32;
 
-- 
GitLab


From e1e35112c859bff2b5ba02397ac548af5b8317d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 07:41:24 -0500
Subject: [PATCH 21/38] winsys/amdgpu: bypass GL2 for command buffers

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 485e14af6334f..7449a97ca3871 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -871,9 +871,13 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws,
    /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
     * The speed of writing to GTT WC is somewhere between no difference and very slow, while
     * VRAM being very slow a lot more often.
+    *
+    * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
+    * and doesn't have to wait for cached GL2 requests to be processed.
     */
    enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
-   unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING;
+   unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
+                    RADEON_FLAG_GL2_BYPASS;
 
    if (cs->ip_type == AMD_IP_GFX ||
        cs->ip_type == AMD_IP_COMPUTE ||
-- 
GitLab


From 4f2b794e98c63b5c6171f00cdc4c51141d8181a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 10:49:31 -0500
Subject: [PATCH 22/38] radeonsi: track NIR progress properly for optimizations
 in si_get_nir_shader

Just a small code size decrease in 12 shaders.

TOTALS FROM AFFECTED SHADERS (12/58918)
  SGPRS: 600.00 -> 600.00 (0.00 %)
  VGPRS: 528.00 -> 520.00 (-1.52 %)
  Spilled SGPRs: 0.00 -> 0.00 (0.00 %)
  Spilled VGPRs: 0.00 -> 0.00 (0.00 %)
  Private memory VGPRs: 0.00 -> 0.00 (0.00 %)
  Scratch size: 0.00 -> 0.00 (0.00 %) dwords per thread
  Code Size: 39772.00 -> 39688.00 (-0.21 %) bytes
  Max Waves: 180.00 -> 180.00 (0.00 %)
  Outputs: 0.00 -> 0.00 (0.00 %)
  Patch Outputs: 0.00 -> 0.00 (0.00 %)

Reviewed-by: Qiang Yu <yuq825@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c | 79 +++++++++++++-----------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 89c4c7a93f127..9f308ac28444b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2050,8 +2050,9 @@ static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void
    return true;
 }
 
-static void si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shader)
+static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shader)
 {
+   bool progress = false;
    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
 
    nir_builder builder = nir_builder_at(nir_before_impl(impl));
@@ -2123,12 +2124,14 @@ static void si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shade
          nir_def *is_front_face = nir_load_front_face(b, 1);
          colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
       }
+
+      progress = true;
    }
 
    /* lower nir_load_color0/1 to use the color value. */
-   nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
-                                nir_metadata_block_index | nir_metadata_dominance,
-                                colors);
+   return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
+                                       nir_metadata_block_index | nir_metadata_dominance,
+                                       colors) || progress;
 }
 
 static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
@@ -2181,6 +2184,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
    }
 
    bool progress = false;
+   bool late_opts = false;
 
    const char *original_name = NULL;
    if (unlikely(should_print_nir(nir))) {
@@ -2197,12 +2201,11 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
    if (sel->stage <= MESA_SHADER_GEOMETRY)
       NIR_PASS(progress, nir, si_nir_kill_outputs, key);
 
-   NIR_PASS(
-      _, nir, ac_nir_lower_tex,
-      &(ac_nir_lower_tex_options){
-         .gfx_level = sel->screen->info.gfx_level,
-         .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
-      });
+   NIR_PASS(progress, nir, ac_nir_lower_tex,
+            &(ac_nir_lower_tex_options){
+               .gfx_level = sel->screen->info.gfx_level,
+               .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
+            });
 
    if (nir->info.uses_resource_info_query)
       NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);
@@ -2253,10 +2256,8 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
        * TODO: The driver uses a linear search to find a shader variant. This
        * can be really slow if we get too many variants due to uniform inlining.
        */
-      NIR_PASS_V(nir, nir_inline_uniforms,
-                 nir->info.num_inlinable_uniforms,
-                 inlined_uniform_values,
-                 nir->info.inlinable_uniform_dw_offsets);
+      NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
+                 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
       progress = true;
    }
 
@@ -2292,8 +2293,11 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
    if (is_last_vgt_stage || is_legacy_gs)
       NIR_PASS(progress, nir, si_nir_clamp_vertex_color);
 
-   if (progress)
+   if (progress) {
       si_nir_opts(sel->screen, nir, true);
+      late_opts = true;
+      progress = false;
+   }
 
    /* Lower large variables that are always constant with load_constant intrinsics, which
     * get turned into PC-relative loads from a data section next to the shader.
@@ -2304,19 +2308,18 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
     * The pass crashes if there are dead temps of lowered IO interface types, so remove
     * them first.
     */
-   bool progress2 = false;
    NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
-   NIR_PASS(progress2, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
+   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
 
    /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
     * this should be done after that.
     */
-   progress2 |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
+   progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
 
    if (sel->stage == MESA_SHADER_VERTEX)
-      progress2 |= si_nir_lower_vs_inputs(nir, shader, args);
+      NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);
 
-   bool opt_offsets = si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);
+   progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);
 
    if (is_last_vgt_stage) {
       /* Assign param export indices. */
@@ -2328,7 +2331,6 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
       if (key->ge.as_ngg) {
          /* Lower last VGT NGG shader stage. */
          si_lower_ngg(shader, nir);
-         opt_offsets = true;
       } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) {
          /* Lower last VGT none-NGG VS/TES shader stage. */
          unsigned clip_cull_mask =
@@ -2346,12 +2348,14 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
                     key->ge.opt.kill_layer,
                     sel->screen->options.vrs2x2);
       }
+      progress = true;
    } else if (is_legacy_gs) {
       NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
+      progress = true;
    } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
       /* two-side color selection and interpolation */
       if (sel->info.colors_read)
-         NIR_PASS_V(nir, si_nir_lower_ps_color_input, shader);
+         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, shader);
 
       ac_nir_lower_ps_options options = {
          .gfx_level = sel->screen->info.gfx_level,
@@ -2383,33 +2387,34 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
       if (key->ps.part.prolog.poly_stipple)
          NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);
 
-      progress2 = true;
+      progress = true;
    }
 
-   NIR_PASS(progress2, nir, nir_opt_idiv_const, 8);
-   NIR_PASS(progress2, nir, nir_lower_idiv,
+   NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
+   NIR_PASS(progress, nir, nir_lower_idiv,
             &(nir_lower_idiv_options){
                .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
             });
 
-   NIR_PASS(progress2, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
+   NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
             si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
             &args->ac);
-   NIR_PASS(progress2, nir, si_nir_lower_abi, shader, args);
+   NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);
 
-   if (progress2 || opt_offsets)
+   if (progress) {
       si_nir_opts(sel->screen, nir, false);
-
-   if (opt_offsets) {
-      static const nir_opt_offsets_options offset_options = {
-         .uniform_max = 0,
-         .buffer_max = ~0,
-         .shared_max = ~0,
-      };
-      NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
+      progress = false;
+      late_opts = true;
    }
 
-   if (progress || progress2 || opt_offsets)
+   static const nir_opt_offsets_options offset_options = {
+      .uniform_max = 0,
+      .buffer_max = ~0,
+      .shared_max = ~0,
+   };
+   NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
+
+   if (late_opts)
       si_nir_late_opts(nir);
 
    /* aco only accept scalar const, must be done after si_nir_late_opts()
-- 
GitLab


From 00dd4d400ea283a9ebfe3fb674a48cacf5b0c484 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 11:01:42 -0500
Subject: [PATCH 23/38] ac,radeonsi: rename pos_inputs -> fragcoord_components

Reviewed-by: Qiang Yu <yuq825@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/amd/common/ac_shader_util.c          | 16 ++++++++--------
 src/amd/common/ac_shader_util.h          |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c |  6 +++---
 src/gallium/drivers/radeonsi/si_shader.h |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index 6013eb5811f05..24e530988434c 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -620,10 +620,10 @@ enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampl
 }
 
 unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
-                                  uint8_t *num_pos_inputs)
+                                  uint8_t *num_fragcoord_components)
 {
    unsigned num_input_vgprs = 0;
-   unsigned pos_inputs = 0;
+   unsigned fragcoord_components = 0;
 
    if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
       num_input_vgprs += 2;
@@ -643,19 +643,19 @@ unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
       num_input_vgprs += 1;
    if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) {
       num_input_vgprs += 1;
-      pos_inputs++;
+      fragcoord_components++;
    }
    if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) {
       num_input_vgprs += 1;
-      pos_inputs++;
+      fragcoord_components++;
    }
    if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) {
       num_input_vgprs += 1;
-      pos_inputs++;
+      fragcoord_components++;
    }
    if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) {
       num_input_vgprs += 1;
-      pos_inputs++;
+      fragcoord_components++;
    }
    if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr))
       num_input_vgprs += 1;
@@ -666,8 +666,8 @@ unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
    if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
       num_input_vgprs += 1;
 
-   if (num_pos_inputs)
-      *num_pos_inputs = pos_inputs;
+   if (num_fragcoord_components)
+      *num_fragcoord_components = fragcoord_components;
 
    return num_input_vgprs;
 }
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index e97cc0cddb4c7..e870262a10337 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -170,7 +170,7 @@ enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampl
                                    bool is_array);
 
 unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
-                                  uint8_t *num_pos_inputs);
+                                  uint8_t *num_fragcoord_components);
 
 uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9f308ac28444b..e920c8b355a50 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2903,7 +2903,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
    /* Calculate the number of fragment input VGPRs. */
    if (sel->stage == MESA_SHADER_FRAGMENT) {
       shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
-         &shader->config, &shader->info.num_ps_pos_inputs);
+         &shader->config, &shader->info.num_fragcoord_components);
    }
 
    si_calculate_max_simd_waves(shader);
@@ -3088,7 +3088,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke
        key->ps_prolog.states.force_persp_center_interp ||
        key->ps_prolog.states.force_linear_center_interp ||
        key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
-   key->ps_prolog.num_pos_inputs = shader->info.num_ps_pos_inputs;
+   key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;
 
    if (shader->key.ps.part.prolog.poly_stipple)
       shader->info.uses_vmem_load_other = true;
@@ -3673,7 +3673,7 @@ void si_get_ps_prolog_args(struct si_shader_args *args,
    /* skip LINE_STIPPLE_TEX */
 
    /* POS_X|Y|Z|W_FLOAT */
-   for (unsigned i = 0; i < key->ps_prolog.num_pos_inputs; i++)
+   for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
 
    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 3c4f90e2dd187..047ffc5c0a685 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -679,7 +679,7 @@ union si_shader_part_key {
       /* Color interpolation and two-side color selection. */
       unsigned colors_read : 8;       /* color input components read */
       unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
-      unsigned num_pos_inputs : 3;
+      unsigned num_fragcoord_components : 3;
       unsigned wqm : 1;
       char color_attr_index[2];
       signed char color_interp_vgpr_index[2]; /* -1 == constant */
@@ -823,7 +823,7 @@ struct si_shader_binary_info {
    uint8_t num_input_vgprs;
    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
    bool uses_vmem_sampler_or_bvh;
-   uint8_t num_ps_pos_inputs;
+   uint8_t num_fragcoord_components;
    bool uses_instanceid;
    uint8_t nr_pos_exports;
    uint8_t nr_param_exports;
-- 
GitLab


From 7d2faa88abb3be9935a2af1515199dc5aa96f61b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 13:29:49 -0500
Subject: [PATCH 24/38] nir,radeonsi: add FLAGS into load_vector_arg_amd to
 record color input usage

This will be needed for gathering color usage from lowered PS.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/compiler/nir/nir_intrinsics.py              |  8 ++++++--
 src/gallium/drivers/radeonsi/si_nir_lower_abi.c | 12 +++++++++---
 src/gallium/drivers/radeonsi/si_shader.h        |  4 ++++
 src/gallium/drivers/radeonsi/si_shader_info.c   |  8 ++++++++
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 380c153f0ddf8..8ac4ee3f5e604 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1589,8 +1589,12 @@ intrinsic("store_hit_attrib_amd", src_comp=[1], indices=[BASE])
 # Load forced VRS rates.
 intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
 
-intrinsic("load_scalar_arg_amd", dest_comp=0, bit_sizes=[32], indices=[BASE, ARG_UPPER_BOUND_U32_AMD], flags=[CAN_ELIMINATE, CAN_REORDER])
-intrinsic("load_vector_arg_amd", dest_comp=0, bit_sizes=[32], indices=[BASE, ARG_UPPER_BOUND_U32_AMD], flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("load_scalar_arg_amd", dest_comp=0, bit_sizes=[32],
+          indices=[BASE, ARG_UPPER_BOUND_U32_AMD],
+          flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("load_vector_arg_amd", dest_comp=0, bit_sizes=[32],
+          indices=[BASE, ARG_UPPER_BOUND_U32_AMD, FLAGS],
+          flags=[CAN_ELIMINATE, CAN_REORDER])
 store("scalar_arg_amd", [], [BASE])
 store("vector_arg_amd", [], [BASE])
 
diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
index e57d9590be691..c96572bd5a932 100644
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -616,9 +616,15 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
 
       nir_def *color[4];
       for (int i = 0; i < 4; i++) {
-         color[i] = colors_read & BITFIELD_BIT(start + i) ?
-            ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++) :
-            nir_undef(b, 1, 32);
+         if (colors_read & BITFIELD_BIT(start + i)) {
+            color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++);
+
+            nir_intrinsic_set_flags(nir_instr_as_intrinsic(color[i]->parent_instr),
+                                    SI_VECTOR_ARG_IS_COLOR |
+                                    SI_VECTOR_ARG_COLOR_COMPONENT(start + i));
+         } else {
+            color[i] = nir_undef(b, 1, 32);
+         }
       }
 
       replacement = nir_vec(b, color, 4);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 047ffc5c0a685..64f478b06d6a4 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -137,6 +137,10 @@ struct nir_lower_subgroups_options;
 /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
 #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
 
+#define SI_VECTOR_ARG_IS_COLOR               BITFIELD_BIT(0)
+#define SI_VECTOR_ARG_COLOR_COMPONENT(x)     (((x) & 0x7) << 1)
+#define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7)
+
 /* SGPR user data indices */
 enum
 {
diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c
index b6af2badffc5d..66fd766be973c 100644
--- a/src/gallium/drivers/radeonsi/si_shader_info.c
+++ b/src/gallium/drivers/radeonsi/si_shader_info.c
@@ -526,6 +526,14 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info
          }
          break;
       }
+      case nir_intrinsic_load_vector_arg_amd:
+         /* Non-monolithic lowered PS can have this. We need to record color usage. */
+         if (nir_intrinsic_flags(intr) & SI_VECTOR_ARG_IS_COLOR) {
+            /* The channel can be between 0 and 7. */
+            unsigned chan = SI_GET_VECTOR_ARG_COLOR_COMPONENT(nir_intrinsic_flags(intr));
+            info->colors_read |= BITFIELD_BIT(chan);
+         }
+         break;
       case nir_intrinsic_load_barycentric_at_offset:   /* uses center */
       case nir_intrinsic_load_barycentric_at_sample:   /* uses center */
          if (nir_intrinsic_interp_mode(intr) == INTERP_MODE_FLAT)
-- 
GitLab


From 94e0a1267e4ef1f2ef8b4cbdaea8e892b355db8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 13:33:04 -0500
Subject: [PATCH 25/38] radeonsi: change the signature of
 si_nir_lower_ps_color_input

This will be needed later.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index e920c8b355a50..c8438c4af90a8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2050,7 +2050,8 @@ static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void
    return true;
 }
 
-static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shader)
+static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
+                                        const struct si_shader_info *info)
 {
    bool progress = false;
    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
@@ -2058,24 +2059,21 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shade
    nir_builder builder = nir_builder_at(nir_before_impl(impl));
    nir_builder *b = &builder;
 
-   const struct si_shader_selector *sel = shader->selector;
-   const union si_shader_key *key = &shader->key;
-
    /* Build ready to be used colors at the beginning of the shader. */
    nir_def *colors[2] = {0};
    for (int i = 0; i < 2; i++) {
-      if (!(sel->info.colors_read & (0xf << (i * 4))))
+      if (!(info->colors_read & (0xf << (i * 4))))
          continue;
 
-      unsigned color_base = sel->info.color_attr_index[i];
+      unsigned color_base = info->color_attr_index[i];
       /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
        * otherwise it's at offset "num_inputs".
        */
-      unsigned back_color_base = sel->info.num_inputs;
-      if (i == 1 && (sel->info.colors_read & 0xf))
+      unsigned back_color_base = info->num_inputs;
+      if (i == 1 && (info->colors_read & 0xf))
          back_color_base += 1;
 
-      enum glsl_interp_mode interp_mode = sel->info.color_interpolate[i];
+      enum glsl_interp_mode interp_mode = info->color_interpolate[i];
       if (interp_mode == INTERP_MODE_COLOR) {
          interp_mode = key->ps.part.prolog.flatshade_colors ?
             INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
@@ -2092,7 +2090,7 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shade
          }
       } else {
          nir_intrinsic_op op = 0;
-         switch (sel->info.color_interpolate_loc[i]) {
+         switch (info->color_interpolate_loc[i]) {
          case TGSI_INTERPOLATE_LOC_CENTER:
             op = nir_intrinsic_load_barycentric_pixel;
             break;
@@ -2355,7 +2353,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
    } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
       /* two-side color selection and interpolation */
       if (sel->info.colors_read)
-         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, shader);
+         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info);
 
       ac_nir_lower_ps_options options = {
          .gfx_level = sel->screen->info.gfx_level,
-- 
GitLab


From 3faa9ba06be051973f6806ba534983fc130ea859 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 13:34:45 -0500
Subject: [PATCH 26/38] radeonsi: gather lowered color inputs for monolithic PS

This also adds missing io_semantics to the input loads that the gathering
expects.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 16 ++++++++++++----
 src/gallium/drivers/radeonsi/si_shader_info.c | 12 ++++++++++++
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index c8438c4af90a8..15844b2244162 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2082,11 +2082,15 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_k
       nir_def *back_color = NULL;
       if (interp_mode == INTERP_MODE_FLAT) {
          colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
-                                   .base = color_base);
+                                   .base = color_base,
+                                   .io_semantics.location = VARYING_SLOT_COL0 + i,
+                                   .io_semantics.num_slots = 1);
 
          if (key->ps.part.prolog.color_two_side) {
             back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
-                                        .base = back_color_base);
+                                        .base = back_color_base,
+                                        .io_semantics.location = VARYING_SLOT_BFC0 + i,
+                                        .io_semantics.num_slots = 1);
          }
       } else {
          nir_intrinsic_op op = 0;
@@ -2109,12 +2113,16 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_k
 
          colors[i] =
             nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
-                                        .base = color_base);
+                                        .base = color_base,
+                                        .io_semantics.location = VARYING_SLOT_COL0 + i,
+                                        .io_semantics.num_slots = 1);
 
          if (key->ps.part.prolog.color_two_side) {
             back_color =
                nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
-                                           .base = back_color_base);
+                                           .base = back_color_base,
+                                           .io_semantics.location = VARYING_SLOT_BFC0 + i,
+                                           .io_semantics.num_slots = 1);
          }
       }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c
index 66fd766be973c..f14ff069cbcf5 100644
--- a/src/gallium/drivers/radeonsi/si_shader_info.c
+++ b/src/gallium/drivers/radeonsi/si_shader_info.c
@@ -259,6 +259,18 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
    if (nir->info.stage != MESA_SHADER_VERTEX || !is_input)
       semantic = nir_intrinsic_io_semantics(intr).location;
 
+   if (nir->info.stage == MESA_SHADER_FRAGMENT && is_input) {
+      /* Gather color PS inputs. We can only get here after lowering colors in monolithic
+       * shaders. This must match what we do for nir_intrinsic_load_color0/1.
+       */
+      if (semantic == VARYING_SLOT_COL0 || semantic == VARYING_SLOT_COL1 ||
+          semantic == VARYING_SLOT_BFC0 || semantic == VARYING_SLOT_BFC1) {
+         unsigned index = semantic == VARYING_SLOT_COL1 || semantic == VARYING_SLOT_BFC1;
+         info->colors_read |= mask << (index * 4);
+         return;
+      }
+   }
+
    if (nir->info.stage == MESA_SHADER_FRAGMENT && !is_input) {
       /* Never use FRAG_RESULT_COLOR directly. */
       if (semantic == FRAG_RESULT_COLOR)
-- 
GitLab


From 197af036989d23ee1191d5b3703cf88fd0b0bbe0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 13:44:07 -0500
Subject: [PATCH 27/38] radeonsi: add PS input info into si_shader_binary_info

It will be modified to reflect PS inputs after uniform inlining.
For now, it's just a copy of selector->info.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 34 ++++++++++++++-----
 src/gallium/drivers/radeonsi/si_shader.h      |  4 +++
 .../drivers/radeonsi/si_state_shaders.cpp     |  3 +-
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 15844b2244162..2ca944f401cd6 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -30,10 +30,9 @@ static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *s
 /* Get the number of all interpolated inputs */
 unsigned si_get_ps_num_interp(struct si_shader *ps)
 {
-   struct si_shader_info *info = &ps->selector->info;
-   unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0);
+   unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
    unsigned num_interp =
-      ps->selector->info.num_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
+      ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
 
    assert(num_interp <= 32);
    return MIN2(num_interp, 32);
@@ -1178,7 +1177,6 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
 {
    struct si_screen *sscreen = shader->selector->screen;
    struct ac_shader_config *conf = &shader->config;
-   unsigned num_inputs = shader->selector->info.num_inputs;
    unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
    unsigned lds_per_wave = 0;
    unsigned max_simd_waves;
@@ -1198,7 +1196,8 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
        * Other stages don't know the size at compile time or don't
        * allocate LDS per wave, but instead they do it per thread group.
        */
-      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
+      lds_per_wave = conf->lds_size * lds_increment +
+                     align(shader->info.num_ps_inputs * 48, lds_increment);
       break;
    case MESA_SHADER_COMPUTE: {
          unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
@@ -2363,6 +2362,12 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
       if (sel->info.colors_read)
          NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info);
 
+      /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
+       * which can only occur with monolithic PS.
+       */
+      shader->info.num_ps_inputs = sel->info.num_inputs;
+      shader->info.ps_colors_read = sel->info.colors_read;
+
       ac_nir_lower_ps_options options = {
          .gfx_level = sel->screen->info.gfx_level,
          .family = sel->screen->info.family,
@@ -2449,6 +2454,17 @@ void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
 
    shader->info.uses_vmem_load_other |= info.uses_vmem_load_other;
    shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
+
+   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+      shader->info.num_ps_inputs = shader->selector->info.num_inputs;
+      shader->info.ps_colors_read = shader->selector->info.colors_read;
+
+      unsigned num_colors = !!(shader->selector->info.colors_read & 0x0f) +
+                            !!(shader->selector->info.colors_read & 0xf0);
+      unsigned max_interp = MIN2(shader->info.num_ps_inputs + num_colors, SI_NUM_INTERP);
+      memcpy(shader->info.ps_inputs, shader->selector->info.input,
+             max_interp * sizeof(info.input[0]));
+   }
 }
 
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
@@ -3085,7 +3101,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke
    memset(key, 0, sizeof(*key));
    key->ps_prolog.states = shader->key.ps.part.prolog;
    key->ps_prolog.wave32 = shader->wave_size == 32;
-   key->ps_prolog.colors_read = info->colors_read;
+   key->ps_prolog.colors_read = shader->info.ps_colors_read;
    key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
    key->ps_prolog.wqm =
       info->base.fs.needs_quad_helper_invocations &&
@@ -3099,12 +3115,12 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke
    if (shader->key.ps.part.prolog.poly_stipple)
       shader->info.uses_vmem_load_other = true;
 
-   if (info->colors_read) {
+   if (shader->info.ps_colors_read) {
       uint8_t *color = shader->selector->info.color_attr_index;
 
       if (shader->key.ps.part.prolog.color_two_side) {
          /* BCOLORs are stored after the last input. */
-         key->ps_prolog.num_interp_inputs = info->num_inputs;
+         key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
          shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
       }
 
@@ -3112,7 +3128,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke
          unsigned interp = info->color_interpolate[i];
          unsigned location = info->color_interpolate_loc[i];
 
-         if (!(info->colors_read & (0xf << i * 4)))
+         if (!(shader->info.ps_colors_read & (0xf << i * 4)))
             continue;
 
          key->ps_prolog.color_attr_index[i] = color[i];
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 64f478b06d6a4..f429ad2069908 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -127,6 +127,7 @@ struct nir_shader;
 struct nir_instr;
 struct nir_lower_subgroups_options;
 
+#define SI_NUM_INTERP     32
 #define SI_MAX_ATTRIBS    16
 #define SI_MAX_VS_OUTPUTS 40
 #define SI_USER_CLIP_PLANE_MASK  0x3F
@@ -823,6 +824,9 @@ union si_shader_key {
 struct si_shader_binary_info {
    uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
+   union si_input_info ps_inputs[SI_NUM_INTERP];
+   uint8_t num_ps_inputs;
+   uint8_t ps_colors_read;
    uint8_t num_input_sgprs;
    uint8_t num_input_vgprs;
    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 95ee06b33130d..6b7269322362b 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -4801,7 +4801,6 @@ template<int NUM_INTERP>
 static void si_emit_spi_map(struct si_context *sctx, unsigned index)
 {
    struct si_shader *ps = sctx->shader.ps.current;
-   struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
    unsigned spi_ps_input_cntl[NUM_INTERP];
 
    STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32);
@@ -4813,7 +4812,7 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index)
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
    for (unsigned i = 0; i < NUM_INTERP; i++) {
-      union si_input_info input = psinfo->input[i];
+      union si_input_info input = ps->info.ps_inputs[i];
       unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic];
       bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20;
 
-- 
GitLab


From 1d4402b02ad7cf98250e07c09dce8f65b42d1925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 15:32:48 -0500
Subject: [PATCH 28/38] radeonsi: don't include the PARAM_GEN input in
 si_shader_info

It's only produced by lowering point smoothing.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_nir_lower_abi.c | 4 +++-
 src/gallium/drivers/radeonsi/si_shader_info.c   | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
index c96572bd5a932..8a99e9682b9da 100644
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -637,7 +637,9 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
       /* Load point coordinates (x, y) which are written by the hw after the interpolated inputs */
       replacement = nir_load_interpolated_input(b, 2, 32, interp_param, nir_imm_int(b, 0),
                                                 .base = si_get_ps_num_interp(shader),
-                                                .component = 2);
+                                                .component = 2,
+                                                /* This tells si_nir_scan_shader that it's PARAM_GEN */
+                                                .io_semantics.no_varying = 1);
       break;
    }
    case nir_intrinsic_load_poly_line_smooth_enabled:
diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c
index f14ff069cbcf5..5f6007898db73 100644
--- a/src/gallium/drivers/radeonsi/si_shader_info.c
+++ b/src/gallium/drivers/radeonsi/si_shader_info.c
@@ -260,6 +260,10 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
       semantic = nir_intrinsic_io_semantics(intr).location;
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT && is_input) {
+      /* The PARAM_GEN input shouldn't be scanned. */
+      if (nir_intrinsic_io_semantics(intr).no_varying)
+         return;
+
       /* Gather color PS inputs. We can only get here after lowering colors in monolithic
        * shaders. This must match what we do for nir_intrinsic_load_color0/1.
        */
-- 
GitLab


From dbea0f51a392ce7b54051de1e63265004deff627 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 15:37:47 -0500
Subject: [PATCH 29/38] radeonsi: decrease NUM_INTERP if uniform inlining
 eliminated PS inputs

This should improve performance when that happens.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c | 40 +++++++++++++++---------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 2ca944f401cd6..aa95727d34916 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2236,12 +2236,11 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
        *   The storage for eliminated outputs is also not allocated.
        * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
        * - TCS output stores are eliminated
+       * - Eliminated PS inputs are removed from PS.NUM_INTERP.
        *
        * TODO: These are things the driver ignores in the final shader code
        * and relies on the default shader info.
        * - Other system values are not eliminated
-       * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
-       *   to remove holes
        * - uses_discard - if it changed to false
        * - writes_memory - if it changed to false
        * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
@@ -2358,15 +2357,27 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
       NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
       progress = true;
    } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
-      /* two-side color selection and interpolation */
-      if (sel->info.colors_read)
-         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info);
+      /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs,
+       * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs
+       * by renumbering them. This can only happen with monolithic PS. Colors are unaffected
+       * because they are still represented by nir_intrinsic_load_color0/1.
+       */
+      NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);
+
+      /* Two-side color selection and interpolation: Get the latest shader info because
+       * uniform inlining and colormask can fully eliminate color inputs.
+       */
+      struct si_shader_info info;
+      si_nir_scan_shader(sel->screen, nir, &info);
+
+      if (info.colors_read)
+         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);
 
       /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
        * which can only occur with monolithic PS.
        */
-      shader->info.num_ps_inputs = sel->info.num_inputs;
-      shader->info.ps_colors_read = sel->info.colors_read;
+      shader->info.num_ps_inputs = info.num_inputs;
+      shader->info.ps_colors_read = info.colors_read;
 
       ac_nir_lower_ps_options options = {
          .gfx_level = sel->screen->info.gfx_level,
@@ -2456,14 +2467,13 @@ void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
    shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-      shader->info.num_ps_inputs = shader->selector->info.num_inputs;
-      shader->info.ps_colors_read = shader->selector->info.colors_read;
-
-      unsigned num_colors = !!(shader->selector->info.colors_read & 0x0f) +
-                            !!(shader->selector->info.colors_read & 0xf0);
-      unsigned max_interp = MIN2(shader->info.num_ps_inputs + num_colors, SI_NUM_INTERP);
-      memcpy(shader->info.ps_inputs, shader->selector->info.input,
-             max_interp * sizeof(info.input[0]));
+      /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */
+      shader->info.num_ps_inputs = info.num_inputs;
+      shader->info.ps_colors_read = info.colors_read;
+
+      /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */
+      unsigned max_interp = MIN2(info.num_inputs + 2, SI_NUM_INTERP);
+      memcpy(shader->info.ps_inputs, info.input, max_interp * sizeof(info.input[0]));
    }
 }
 
-- 
GitLab


From bd700bace5f458212cac3fb1184a5b2cbc5c8c63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 16:18:38 -0500
Subject: [PATCH 30/38] radeonsi: update comments about uniform inlining

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index aa95727d34916..abb29fece9dc3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2232,22 +2232,22 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
        * - Register usage and code size decrease (obvious)
        * - Eliminated PS system values are disabled by LLVM
        *   (FragCoord, FrontFace, barycentrics)
-       * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
-       *   The storage for eliminated outputs is also not allocated.
+       * - VS/TES/GS param exports are eliminated if they are undef.
+       *   The param space for eliminated outputs is also not allocated.
        * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
        * - TCS output stores are eliminated
        * - Eliminated PS inputs are removed from PS.NUM_INTERP.
        *
        * TODO: These are things the driver ignores in the final shader code
        * and relies on the default shader info.
-       * - Other system values are not eliminated
+       * - System values in VS, TCS, TES, GS are not eliminated
        * - uses_discard - if it changed to false
        * - writes_memory - if it changed to false
        * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
        *   eliminated
        * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
        *   GS outputs are eliminated except for the temporary LDS.
-       *   Clip distances, gl_PointSize, and PS outputs are eliminated based
+       *   Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
        *   on current states, so we don't care about the shader code.
        *
        * TODO: Merged shaders don't inline uniforms for the first stage.
-- 
GitLab


From 6d2a7f53acfb219910fde175a4233bd5157937f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Nov 2023 17:25:42 -0500
Subject: [PATCH 31/38] radeonsi: decrease NUM_INTERP if export
 formats/colormask eliminated PS inputs

This adds a pass that removes output stores. It's called before NIR
optimizations that are before PS lowering, which will cause it to remove
PS inputs from register settings.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_shader.c | 88 ++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index abb29fece9dc3..3a9493af83acf 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1721,6 +1721,80 @@ static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
    return progress;
 }
 
+/* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */
+static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   /* No indirect indexing allowed. */
+   ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
+   assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
+
+   unsigned location = nir_intrinsic_io_semantics(intr).location;
+   const union si_shader_key *key = _key;
+
+   switch (location) {
+   case FRAG_RESULT_DEPTH:
+   case FRAG_RESULT_STENCIL:
+      return false;
+
+   case FRAG_RESULT_SAMPLE_MASK:
+      if (key->ps.part.epilog.kill_samplemask) {
+         nir_instr_remove(instr);
+         return true;
+      }
+      return false;
+   }
+
+   /* Color outputs. */
+   unsigned comp_mask = BITFIELD_MASK(intr->num_components);
+   assert(nir_intrinsic_component(intr) == 0);
+   unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);
+
+   /* If COLOR is broadcasted to multiple color buffers, combine their masks. */
+   if (location == FRAG_RESULT_COLOR) {
+      for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
+         cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
+   }
+
+   unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
+   unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;
+
+   if ((output_mask & comp_mask) == comp_mask)
+      return false;
+
+   if (!(output_mask & comp_mask)) {
+      nir_instr_remove(instr);
+      return true;
+   }
+
+   /* Fill disabled components with undef. */
+   b->cursor = nir_before_instr(instr);
+   nir_def *new_value = intr->src[0].ssa;
+   nir_def *undef = nir_undef(b, 1, new_value->bit_size);
+
+   unsigned kill_mask = ~output_mask & comp_mask;
+   u_foreach_bit(i, kill_mask) {
+      new_value = nir_vector_insert_imm(b, new_value, undef, i);
+   }
+
+   nir_src_rewrite(&intr->src[0], new_value);
+   return true;
+}
+
+static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
+{
+   assert(nir->info.stage == MESA_SHADER_FRAGMENT);
+   return nir_shader_instructions_pass(nir, kill_ps_outputs_cb,
+                                       nir_metadata_dominance |
+                                       nir_metadata_block_index, (void*)key);
+}
+
 static bool clamp_vertex_color_instr(nir_builder *b,
                                      nir_intrinsic_instr *intrin, void *state)
 {
@@ -2265,11 +2339,17 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
       progress = true;
    }
 
-   if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.poly_line_smoothing)
-      NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
+   if (sel->stage == MESA_SHADER_FRAGMENT) {
+      /* This uses the epilog key, so only monolithic shaders can call this. */
+      if (shader->is_monolithic)
+         NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);
+
+      if (key->ps.mono.poly_line_smoothing)
+         NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
 
-   if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.point_smoothing)
-      NIR_PASS(progress, nir, nir_lower_point_smooth);
+      if (key->ps.mono.point_smoothing)
+         NIR_PASS(progress, nir, nir_lower_point_smooth);
+   }
 
    /* This must be before si_nir_lower_resource. */
    if (!sel->screen->info.has_image_opcodes)
-- 
GitLab


From fb994f44d91a4b94738ea4ebb83aab1a257ef123 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 26 Nov 2023 09:23:10 -0500
Subject: [PATCH 32/38] util: make BITSET_TEST_RANGE_INSIDE_WORD take a value
 to compare with

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/util/bitset.h              | 8 ++++----
 src/util/tests/bitset_test.cpp | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/util/bitset.h b/src/util/bitset.h
index d5b5d6dae2d0d..cffbb73ecce54 100644
--- a/src/util/bitset.h
+++ b/src/util/bitset.h
@@ -205,11 +205,11 @@ __bitset_shl(BITSET_WORD *x, unsigned amount, unsigned n)
 #define BITSET_SHL(x, n)   \
    __bitset_shl(x, n, ARRAY_SIZE(x));
 
-/* bit range operations
+/* bit range operations (e=end is inclusive)
  */
-#define BITSET_TEST_RANGE_INSIDE_WORD(x, b, e) \
+#define BITSET_TEST_RANGE_INSIDE_WORD(x, b, e, mask) \
    (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
+   (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) == mask) : \
    (assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
 #define BITSET_SET_RANGE_INSIDE_WORD(x, b, e) \
    (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
@@ -227,7 +227,7 @@ __bitset_test_range(const BITSET_WORD *r, unsigned start, unsigned end)
    const unsigned start_mod = start % BITSET_WORDBITS;
 
    if (start_mod + size <= BITSET_WORDBITS) {
-      return BITSET_TEST_RANGE_INSIDE_WORD(r, start, end);
+      return !BITSET_TEST_RANGE_INSIDE_WORD(r, start, end, 0);
    } else {
       const unsigned first_size = BITSET_WORDBITS - start_mod;
 
diff --git a/src/util/tests/bitset_test.cpp b/src/util/tests/bitset_test.cpp
index ec3ba5104ab7d..5c69c5ac0b151 100644
--- a/src/util/tests/bitset_test.cpp
+++ b/src/util/tests/bitset_test.cpp
@@ -74,8 +74,8 @@ TEST(bitset, test_basic_range)
 
    const int max_set = 15;
    BITSET_SET_RANGE_INSIDE_WORD(mask128, 0, max_set);
-   EXPECT_EQ(BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set), true);
-   EXPECT_EQ(BITSET_TEST_RANGE_INSIDE_WORD(mask128, max_set + 1, max_set + 15), false);
+   EXPECT_EQ(!BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set, 0), true);
+   EXPECT_EQ(!BITSET_TEST_RANGE_INSIDE_WORD(mask128, max_set + 1, max_set + 15, 0), false);
    for (int i = 0; i < 128; i++) {
       if (i <= max_set)
          EXPECT_EQ(BITSET_TEST(mask128, i), true);
@@ -83,7 +83,7 @@ TEST(bitset, test_basic_range)
          EXPECT_EQ(BITSET_TEST(mask128, i), false);
    }
    BITSET_CLEAR_RANGE(mask128, 0, max_set);
-   EXPECT_EQ(BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set), false);
+   EXPECT_EQ(!BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set, 0), false);
    for (int i = 0; i < 128; i++) {
       EXPECT_EQ(BITSET_TEST(mask128, i), false);
    }
-- 
GitLab


From 17e01a9a9b743d89066ba0a42c841e9b7e7d0528 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 26 Nov 2023 09:23:41 -0500
Subject: [PATCH 33/38] radeonsi: merge context_reg_saved_mask and
 other_reg_saved_mask into a BITSET

There will be more than 64 context registers that we'll need to track,
so use BITSET for all of them.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_build_pm4.h   | 164 ++++++++++--------
 src/gallium/drivers/radeonsi/si_compute.c     |   6 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c      | 163 +++++++++--------
 src/gallium/drivers/radeonsi/si_state.h       |  22 ++-
 .../drivers/radeonsi/si_state_draw.cpp        |  29 ++--
 5 files changed, 195 insertions(+), 189 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index 24e799080706f..dad653720d446 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -68,88 +68,96 @@
    radeon_emit(value); \
 } while (0)
 
-#define radeon_opt_set_reg(reg, reg_enum, idx, value, prefix_name, packet, category) do { \
+#define radeon_opt_set_reg(reg, reg_enum, idx, value, prefix_name, packet) do { \
    unsigned __value = (value); \
-   if (!((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1) || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum)] != __value) { \
+   if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __value) { \
       radeon_set_reg(reg, idx, __value, prefix_name, packet); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_BIT(reg_enum); \
-      sctx->tracked_regs.category##_reg_value[(reg_enum)] = __value; \
+      BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __value; \
    } \
 } while (0)
 
 /* Set consecutive registers if any value is different. */
-#define radeon_opt_set_reg2(reg, reg_enum, v1, v2, prefix_name, packet, category) do { \
+#define radeon_opt_set_reg2(reg, reg_enum, v1, v2, prefix_name, packet) do { \
    unsigned __v1 = (v1), __v2 = (v2); \
-   if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x3) != 0x3 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2) { \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 1, 0x3) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2) { \
       radeon_set_reg_seq(reg, 2, 0, prefix_name, packet, 0); \
       radeon_emit(__v1); \
       radeon_emit(__v2); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 2); \
-      sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 1); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
    } \
 } while (0)
 
-#define radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, prefix_name, packet, category) do { \
+#define radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, prefix_name, packet) do { \
    unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3); \
-   if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x7) != 0x7 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3) { \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 2, 0x7) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3) { \
       radeon_set_reg_seq(reg, 3, 0, prefix_name, packet, 0); \
       radeon_emit(__v1); \
       radeon_emit(__v2); \
       radeon_emit(__v3); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 3); \
-      sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 2); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \
    } \
 } while (0)
 
-#define radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, packet, category) do { \
+#define radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, packet) do { \
    unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4); \
-   if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0xf) != 0xf || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4) { \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 3, 0xf) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4) { \
       radeon_set_reg_seq(reg, 4, 0, prefix_name, packet, 0); \
       radeon_emit(__v1); \
       radeon_emit(__v2); \
       radeon_emit(__v3); \
       radeon_emit(__v4); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 4); \
-      sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 3); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \
    } \
 } while (0)
 
-#define radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, prefix_name, packet, category) do { \
+#define radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, prefix_name, packet) do { \
    unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4), __v5 = (v5); \
-   if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1f) != 0x1f || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 4] != __v5) { \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 4, 0x1f) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 4] != __v5) { \
       radeon_set_reg_seq(reg, 5, 0, prefix_name, packet, 0); \
       radeon_emit(__v1); \
       radeon_emit(__v2); \
       radeon_emit(__v3); \
       radeon_emit(__v4); \
       radeon_emit(__v5); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 5); \
-      sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 4] = __v5; \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 4); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 4] = __v5; \
    } \
 } while (0)
 
@@ -174,22 +182,22 @@
    radeon_set_reg(reg, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_reg(_unused, reg, reg_enum, value) \
-   radeon_opt_set_reg(reg, reg_enum, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context)
+   radeon_opt_set_reg(reg, reg_enum, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_reg_idx(_unused, reg, reg_enum, idx, value) \
-   radeon_opt_set_reg(reg, reg_enum, idx, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context)
+   radeon_opt_set_reg(reg, reg_enum, idx, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_reg2(_unused, reg, reg_enum, v1, v2) \
-   radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context)
+   radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_reg3(_unused, reg, reg_enum, v1, v2, v3) \
-   radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context)
+   radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_reg4(_unused, reg, reg_enum, v1, v2, v3, v4) \
-   radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context)
+   radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_reg5(_unused, reg, reg_enum, v1, v2, v3, v4, v5) \
-   radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context)
+   radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
 #define radeon_opt_set_context_regn(_unused, reg, values, saved_values, num) \
    radeon_opt_set_regn(reg, values, saved_values, num, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
@@ -202,17 +210,17 @@
    radeon_set_reg(reg, 0, value, SI_SH, PKT3_SET_SH_REG)
 
 #define radeon_opt_set_sh_reg(_unused, reg, reg_enum, value) \
-   radeon_opt_set_reg(reg, reg_enum, 0, value, SI_SH, PKT3_SET_SH_REG, other)
+   radeon_opt_set_reg(reg, reg_enum, 0, value, SI_SH, PKT3_SET_SH_REG)
 
 #define radeon_opt_set_sh_reg2(_unused, reg, reg_enum, v1, v2) \
-   radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_SH, PKT3_SET_SH_REG, other)
+   radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_SH, PKT3_SET_SH_REG)
 
 #define radeon_opt_set_sh_reg3(_unused, reg, reg_enum, v1, v2, v3) \
-   radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_SH, PKT3_SET_SH_REG, other)
+   radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_SH, PKT3_SET_SH_REG)
 
 #define radeon_opt_set_sh_reg_idx(_unused, reg, reg_enum, idx, value) do { \
    assert(sctx->gfx_level >= GFX10); \
-   radeon_opt_set_reg(reg, reg_enum, idx, value, SI_SH, PKT3_SET_SH_REG_INDEX, other); \
+   radeon_opt_set_reg(reg, reg_enum, idx, value, SI_SH, PKT3_SET_SH_REG_INDEX); \
 } while (0)
 
 #define radeon_emit_32bit_pointer(_unused, va) do { \
@@ -236,7 +244,7 @@
    radeon_set_reg(reg, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG)
 
 #define radeon_opt_set_uconfig_reg(_unused, reg, reg_enum, value) \
-   radeon_opt_set_reg(reg, reg_enum, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG, other)
+   radeon_opt_set_reg(reg, reg_enum, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG)
 
 #define RESOLVE_PKT3_SET_UCONFIG_REG_INDEX \
    (GFX_VERSION >= GFX10 || (GFX_VERSION == GFX9 && sctx->screen->info.me_fw_version >= 26) ? \
@@ -246,7 +254,7 @@
    radeon_set_reg(reg, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX)
 
 #define radeon_opt_set_uconfig_reg_idx(_unused, _unused2, reg, reg_enum, idx, value) \
-   radeon_opt_set_reg(reg, reg_enum, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX, other)
+   radeon_opt_set_reg(reg, reg_enum, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX)
 
 #define radeon_set_privileged_config_reg(reg, value) do { \
    assert((reg) < CIK_UCONFIG_REG_OFFSET); \
@@ -268,35 +276,37 @@
    buffer[__i / 2].reg_value[__i % 2] = value; \
 } while (0)
 
-#define gfx11_opt_push_reg(reg, reg_enum, value, prefix_name, category, buffer, reg_count) do { \
+#define gfx11_opt_push_reg(reg, reg_enum, value, prefix_name, buffer, reg_count) do { \
    unsigned __value = value; \
-   if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1) != 0x1 || \
-       sctx->tracked_regs.category##_reg_value[reg_enum] != __value) { \
+   if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \
+       sctx->tracked_regs.reg_value[reg_enum] != __value) { \
       gfx11_push_reg(reg, __value, prefix_name, buffer, reg_count); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_BIT(reg_enum); \
-      sctx->tracked_regs.category##_reg_value[reg_enum] = __value; \
+      BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \
+      sctx->tracked_regs.reg_value[reg_enum] = __value; \
    } \
 } while (0)
 
-#define gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, category, buffer, reg_count) do { \
+#define gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, buffer, reg_count) do { \
    unsigned __v1 = (v1); \
    unsigned __v2 = (v2); \
    unsigned __v3 = (v3); \
    unsigned __v4 = (v4); \
-   if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0xf) != 0xf || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \
-       sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4) { \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 3, 0xf) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4) { \
       gfx11_push_reg((reg), __v1, prefix_name, buffer, reg_count); \
       gfx11_push_reg((reg) + 4, __v2, prefix_name, buffer, reg_count); \
       gfx11_push_reg((reg) + 8, __v3, prefix_name, buffer, reg_count); \
       gfx11_push_reg((reg) + 12, __v4, prefix_name, buffer, reg_count); \
-      sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE((reg_enum), 4); \
-      sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \
-      sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 3); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \
    } \
 } while (0)
 
@@ -310,11 +320,11 @@
                   sctx->num_buffered_compute_sh_regs)
 
 #define gfx11_opt_push_gfx_sh_reg(reg, reg_enum, value) \
-   gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, other, sctx->gfx11.buffered_gfx_sh_regs, \
+   gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, sctx->gfx11.buffered_gfx_sh_regs, \
                       sctx->num_buffered_gfx_sh_regs)
 
 #define gfx11_opt_push_compute_sh_reg(reg, reg_enum, value) \
-   gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, other, sctx->gfx11.buffered_compute_sh_regs, \
+   gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, sctx->gfx11.buffered_compute_sh_regs, \
                       sctx->num_buffered_compute_sh_regs)
 
 /* GFX11 packet building helpers for SET_CONTEXT_REG_PAIRS_PACKED.
@@ -328,11 +338,11 @@
    gfx11_push_reg(reg, value, SI_CONTEXT, __cs_context_regs, __cs_context_reg_count)
 
 #define gfx11_opt_set_context_reg(reg, reg_enum, value) \
-   gfx11_opt_push_reg(reg, reg_enum, value, SI_CONTEXT, context, __cs_context_regs, \
+   gfx11_opt_push_reg(reg, reg_enum, value, SI_CONTEXT, __cs_context_regs, \
                       __cs_context_reg_count)
 
 #define gfx11_opt_set_context_reg4(reg, reg_enum, v1, v2, v3, v4) \
-   gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, context, __cs_context_regs, \
+   gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, __cs_context_regs, \
                        __cs_context_reg_count)
 
 #define gfx11_end_packed_context_regs() do { \
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 21bda9d08cd5f..092b3e3038dbb 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -999,8 +999,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       si_compute_resources_add_all_to_bo_list(sctx);
 
    /* Skipping setting redundant registers on compute queues breaks compute. */
-   if (!sctx->has_graphics)
-      sctx->tracked_regs.other_reg_saved_mask = 0;
+   if (!sctx->has_graphics) {
+      BITSET_SET_RANGE(sctx->tracked_regs.reg_saved_mask,
+                       SI_FIRST_TRACKED_OTHER_REG, SI_NUM_ALL_TRACKED_REGS - 1);
+   }
 
    /* First emit registers. */
    bool prefetch;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 44e95f6eb41f9..1b6d0b8f44ce8 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -204,83 +204,83 @@ static void si_add_gds_to_buffer_list(struct si_context *sctx)
 
 void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
 {
-   STATIC_ASSERT(SI_NUM_TRACKED_CONTEXT_REGS <= sizeof(ctx->tracked_regs.context_reg_saved_mask) * 8);
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x5;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_EQAA] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x90000;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x2;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_SHADER_STAGES_EN] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_LS_HS_CONFIG] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_TF_PARAM] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x3;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_IA_MULTI_VGT_PARAM] = 0xff;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_MODE] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x1e;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0;
-
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0;
-   ctx->tracked_regs.context_reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0;
+   STATIC_ASSERT(SI_NUM_ALL_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved_mask) * 8);
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x5;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
+   ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x90000;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x2;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_SHADER_STAGES_EN] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_LS_HS_CONFIG] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x3;
+   ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_IA_MULTI_VGT_PARAM] = 0xff;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x1e;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0;
 
    /* Set all cleared context registers to saved. */
-   ctx->tracked_regs.context_reg_saved_mask = BITFIELD64_MASK(SI_NUM_TRACKED_CONTEXT_REGS);
+   BITSET_SET_RANGE(ctx->tracked_regs.reg_saved_mask, 0, SI_NUM_TRACKED_CONTEXT_REGS - 1);
 }
 
 void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper,
@@ -512,17 +512,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
       si_mark_atom_dirty(ctx, &ctx->atoms.s.vgt_pipeline_state);
       si_mark_atom_dirty(ctx, &ctx->atoms.s.tess_io_layout);
 
-      if (has_clear_state) {
+      /* Set all register values to unknown. */
+      BITSET_ZERO(ctx->tracked_regs.reg_saved_mask);
+
+      if (has_clear_state)
          si_set_tracked_regs_to_clear_state(ctx);
-      } else {
-         /* Set all register values to unknown. */
-         ctx->tracked_regs.context_reg_saved_mask = 0;
-      }
 
-      /* 0xffffffff is an impossible value to register SPI_PS_INPUT_CNTL_n */
+      /* 0xffffffff is an impossible value for SPI_PS_INPUT_CNTL_n registers */
       memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
-
-      ctx->tracked_regs.other_reg_saved_mask = 0; /* unknown values */
    }
 
    /* Invalidate various draw states so that they are emitted before
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 0cee83c1632fd..ac4833116d17e 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -9,6 +9,7 @@
 
 #include "si_pm4.h"
 #include "util/format/u_format.h"
+#include "util/bitset.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -237,9 +238,10 @@ struct si_shader_data {
    uint32_t sh_base[SI_NUM_SHADERS];
 };
 
-/* Context registers whose values are tracked by si_context. */
-enum si_tracked_context_reg
+/* Registers whose values are tracked by si_context. */
+enum si_tracked_reg
 {
+   /* CONTEXT registers. */
    /* 2 consecutive registers */
    SI_TRACKED_DB_RENDER_CONTROL,
    SI_TRACKED_DB_COUNT_CONTROL,
@@ -326,11 +328,10 @@ enum si_tracked_context_reg
    SI_TRACKED_CB_DCC_CONTROL,                /* GFX8-xx (TBD) */
 
    SI_NUM_TRACKED_CONTEXT_REGS,
-};
+   SI_FIRST_TRACKED_OTHER_REG = SI_NUM_TRACKED_CONTEXT_REGS,
 
-/* Non-context registers whose values are tracked by si_context. */
-enum si_tracked_other_reg {
-   SI_TRACKED_GE_PC_ALLOC,                   /* GFX10+ */
+   /* SH and UCONFIG registers. */
+   SI_TRACKED_GE_PC_ALLOC = SI_FIRST_TRACKED_OTHER_REG, /* GFX10+ */
    SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,       /* GFX7+ */
    SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,       /* GFX10+ */
    SI_TRACKED_VGT_GS_OUT_PRIM_TYPE_UCONFIG,  /* GFX11+ */
@@ -372,7 +373,7 @@ enum si_tracked_other_reg {
    SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO, /* GFX11+ */
    SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI, /* GFX11+ */
 
-   SI_NUM_TRACKED_OTHER_REGS,
+   SI_NUM_ALL_TRACKED_REGS,
 };
 
 /* For 3 draw constants: BaseVertex, DrawID, StartInstance */
@@ -383,12 +384,9 @@ enum si_tracked_other_reg {
 #define BASEVERTEX_DRAWID_STARTINSTANCE_MASK (BASEVERTEX_MASK | DRAWID_MASK | STARTINSTANCE_MASK)
 
 struct si_tracked_regs {
-   uint64_t context_reg_saved_mask;
-   uint32_t context_reg_value[SI_NUM_TRACKED_CONTEXT_REGS];
+   BITSET_DECLARE(reg_saved_mask, SI_NUM_ALL_TRACKED_REGS);
+   uint32_t reg_value[SI_NUM_ALL_TRACKED_REGS];
    uint32_t spi_ps_input_cntl[32];
-
-   uint32_t other_reg_saved_mask;
-   uint32_t other_reg_value[SI_NUM_TRACKED_OTHER_REGS];
 };
 
 /* Private read-write buffer slots. */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 738968a878d1c..bba9aae96ce89 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1041,7 +1041,7 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
    if (GFX_VERSION == GFX9) {
       /* Workaround for SpecviewPerf13 Catia hang on GFX9. */
       if (prim != sctx->last_prim)
-         sctx->tracked_regs.other_reg_saved_mask &= ~BITFIELD64_BIT(SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG);
+         BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG);
 
       radeon_opt_set_uconfig_reg_idx(sctx, GFX_VERSION, R_030960_IA_MULTI_VGT_PARAM,
                                      SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG,
@@ -1347,8 +1347,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
       }
 
       /* Invalidate tracked draw constants because DrawIndirect overwrites them. */
-      sctx->tracked_regs.other_reg_saved_mask &=
-         ~(BASEVERTEX_DRAWID_STARTINSTANCE_MASK << tracked_base_vertex_reg);
+      BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
+      BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */
+      BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 2); /* StartInstance */
       sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN;
 
       radeon_emit(PKT3(PKT3_SET_BASE, 2, 0));
@@ -1460,8 +1461,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
        */
       if (is_blit) {
          /* Re-emit draw constants after we leave u_blitter. */
-         sctx->tracked_regs.other_reg_saved_mask &=
-            ~(BASEVERTEX_DRAWID_STARTINSTANCE_MASK << tracked_base_vertex_reg);
+         BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
+         BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */
+         BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 2); /* StartInstance */
 
          /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
          radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);
@@ -1510,8 +1512,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
                }
                if (num_draws > 1) {
-                  sctx->tracked_regs.other_reg_saved_mask &=
-                     ~(BASEVERTEX_DRAWID_MASK << tracked_base_vertex_reg);
+                  BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
+                  BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */
                }
             } else {
                /* Only DrawID varies. */
@@ -1529,8 +1531,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
                }
                if (num_draws > 1) {
-                  sctx->tracked_regs.other_reg_saved_mask &=
-                     ~(DRAWID_MASK << tracked_base_vertex_reg);
+                  BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */
                }
             }
          } else {
@@ -1550,8 +1551,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
                }
                if (num_draws > 1) {
-                  sctx->tracked_regs.other_reg_saved_mask &=
-                     ~(BASEVERTEX_MASK << tracked_base_vertex_reg);
+                  BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
                }
             } else {
                /* DrawID and BaseVertex are constant. */
@@ -1594,8 +1594,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
                radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
             }
             if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) {
-               sctx->tracked_regs.other_reg_saved_mask &=
-                  ~(BASEVERTEX_DRAWID_MASK << tracked_base_vertex_reg);
+               BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
+               BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */
             }
          } else {
             for (unsigned i = 0; i < num_draws; i++) {
@@ -1607,8 +1607,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
                radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
             }
             if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) {
-               sctx->tracked_regs.other_reg_saved_mask &=
-                  ~(BASEVERTEX_MASK << tracked_base_vertex_reg);
+               BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
             }
          }
       }
-- 
GitLab


From cabf9277654a6dccd830d4c006c56573ab535f86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 26 Nov 2023 10:56:24 -0500
Subject: [PATCH 34/38] radeonsi: convert depth-stencil-alpha state to tracked
 registers

Some of these registers don't change, so we should not set them when they
don't. This reworks the DSA state to use a custom emit function and
eliminate redundant register changes.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_gfx_cs.c |   5 ++
 src/gallium/drivers/radeonsi/si_state.c  | 109 ++++++++++++++++-------
 src/gallium/drivers/radeonsi/si_state.h  |  19 +++-
 3 files changed, 101 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 1b6d0b8f44ce8..0f48775e0ee62 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -209,6 +209,11 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
    ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0;
    ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0;
 
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_CONTROL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_STENCIL_CONTROL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MIN] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MAX] = 0;
+
    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000;
    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 77e4625dca1a2..e6324b710f76d 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1403,73 +1403,60 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
                                  const struct pipe_depth_stencil_alpha_state *state)
 {
    struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
-   struct si_pm4_state *pm4 = &dsa->pm4;
-   unsigned db_depth_control;
-   uint32_t db_stencil_control = 0;
-
    if (!dsa) {
       return NULL;
    }
 
-   si_pm4_clear_state(pm4, (struct si_screen*)ctx->screen, false);
-
    dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
    dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
    dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
    dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;
 
-   db_depth_control =
+   dsa->db_depth_control =
       S_028800_Z_ENABLE(state->depth_enabled) | S_028800_Z_WRITE_ENABLE(state->depth_writemask) |
       S_028800_ZFUNC(state->depth_func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth_bounds_test);
 
    /* stencil */
    if (state->stencil[0].enabled) {
-      db_depth_control |= S_028800_STENCIL_ENABLE(1);
-      db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
-      db_stencil_control |=
+      dsa->db_depth_control |= S_028800_STENCIL_ENABLE(1);
+      dsa->db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
+      dsa->db_stencil_control |=
          S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
-      db_stencil_control |=
+      dsa->db_stencil_control |=
          S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
-      db_stencil_control |=
+      dsa->db_stencil_control |=
          S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));
 
       if (state->stencil[1].enabled) {
-         db_depth_control |= S_028800_BACKFACE_ENABLE(1);
-         db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
-         db_stencil_control |=
+         dsa->db_depth_control |= S_028800_BACKFACE_ENABLE(1);
+         dsa->db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
+         dsa->db_stencil_control |=
             S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
-         db_stencil_control |=
+         dsa->db_stencil_control |=
             S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
-         db_stencil_control |=
+         dsa->db_stencil_control |=
             S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
       }
    }
 
+   dsa->db_depth_bounds_min = fui(state->depth_bounds_min);
+   dsa->db_depth_bounds_max = fui(state->depth_bounds_max);
+
    /* alpha */
    if (state->alpha_enabled) {
       dsa->alpha_func = state->alpha_func;
-
-      si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
-                     fui(state->alpha_ref_value));
+      dsa->spi_shader_user_data_ps_alpha_ref = fui(state->alpha_ref_value);
    } else {
       dsa->alpha_func = PIPE_FUNC_ALWAYS;
    }
 
-   si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
-   if (state->stencil[0].enabled)
-      si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
-   if (state->depth_bounds_test) {
-      si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth_bounds_min));
-      si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth_bounds_max));
-   }
-   si_pm4_finalize(pm4);
-
    dsa->depth_enabled = state->depth_enabled;
    dsa->depth_write_enabled = state->depth_enabled && state->depth_writemask;
    dsa->stencil_enabled = state->stencil[0].enabled;
    dsa->stencil_write_enabled =
       (util_writes_stencil(&state->stencil[0]) || util_writes_stencil(&state->stencil[1]));
    dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled;
+   dsa->depth_bounds_enabled = state->depth_bounds_test;
 
    bool zfunc_is_ordered =
       state->depth_func == PIPE_FUNC_NEVER || state->depth_func == PIPE_FUNC_LESS ||
@@ -1496,6 +1483,68 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
    return dsa;
 }
 
+static void si_pm4_emit_dsa(struct si_context *sctx, unsigned index)
+{
+   struct si_state_dsa *state = sctx->queued.named.dsa;
+   assert(state && state != sctx->emitted.named.dsa);
+
+   if (sctx->screen->info.has_set_context_pairs_packed) {
+      radeon_begin(&sctx->gfx_cs);
+      gfx11_begin_packed_context_regs();
+      gfx11_opt_set_context_reg(R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL,
+                                state->db_depth_control);
+      if (state->stencil_enabled) {
+         gfx11_opt_set_context_reg(R_02842C_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL,
+                                   state->db_stencil_control);
+      }
+      if (state->depth_bounds_enabled) {
+         gfx11_opt_set_context_reg(R_028020_DB_DEPTH_BOUNDS_MIN, SI_TRACKED_DB_DEPTH_BOUNDS_MIN,
+                                   state->db_depth_bounds_min);
+         gfx11_opt_set_context_reg(R_028024_DB_DEPTH_BOUNDS_MAX, SI_TRACKED_DB_DEPTH_BOUNDS_MAX,
+                                   state->db_depth_bounds_max);
+      }
+      gfx11_end_packed_context_regs();
+
+      if (state->alpha_func != PIPE_FUNC_ALWAYS) {
+         if (sctx->screen->info.has_set_sh_pairs_packed) {
+            gfx11_opt_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
+                                      SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF,
+                                      state->spi_shader_user_data_ps_alpha_ref);
+         } else {
+            radeon_opt_set_sh_reg(sctx, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
+                                  SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF,
+                                  state->spi_shader_user_data_ps_alpha_ref);
+         }
+      }
+      radeon_end(); /* don't track context rolls on GFX11 */
+   } else {
+      radeon_begin(&sctx->gfx_cs);
+      radeon_opt_set_context_reg(sctx, R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL,
+                                 state->db_depth_control);
+      if (state->stencil_enabled) {
+         radeon_opt_set_context_reg(sctx, R_02842C_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL,
+                                    state->db_stencil_control);
+      }
+      if (state->depth_bounds_enabled) {
+         radeon_opt_set_context_reg2(sctx, R_028020_DB_DEPTH_BOUNDS_MIN,
+                                     SI_TRACKED_DB_DEPTH_BOUNDS_MIN,
+                                     state->db_depth_bounds_min,
+                                     state->db_depth_bounds_max);
+      }
+      radeon_end_update_context_roll();
+
+      if (state->alpha_func != PIPE_FUNC_ALWAYS) {
+         radeon_begin(&sctx->gfx_cs);
+         radeon_opt_set_sh_reg(sctx, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
+                               SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF,
+                               state->spi_shader_user_data_ps_alpha_ref);
+         radeon_end();
+      }
+   }
+
+   sctx->emitted.named.dsa = state;
+}
+
 static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
 {
    struct si_context *sctx = (struct si_context *)ctx;
@@ -5713,7 +5762,7 @@ void si_init_state_functions(struct si_context *sctx)
 {
    sctx->atoms.s.pm4_states[SI_STATE_IDX(blend)].emit = si_pm4_emit_state;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_state;
-   sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_state;
+   sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_dsa;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(poly_offset)].emit = si_pm4_emit_state;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(ls)].emit = si_pm4_emit_shader;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(hs)].emit = si_pm4_emit_shader;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index ac4833116d17e..189ae76c99d54 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -109,6 +109,13 @@ struct si_state_dsa {
    struct si_pm4_state pm4;
    struct si_dsa_stencil_ref_part stencil_ref;
 
+   /* Register values. */
+   unsigned db_depth_control;
+   unsigned db_stencil_control;
+   unsigned db_depth_bounds_min;
+   unsigned db_depth_bounds_max;
+   unsigned spi_shader_user_data_ps_alpha_ref;
+
    /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
    struct si_dsa_order_invariance order_invariance[2];
 
@@ -118,6 +125,7 @@ struct si_state_dsa {
    bool stencil_enabled : 1;
    bool stencil_write_enabled : 1;
    bool db_can_write : 1;
+   bool depth_bounds_enabled : 1;
 };
 
 struct si_stencil_ref {
@@ -226,8 +234,7 @@ union si_state_atoms {
 
 static inline uint64_t si_atoms_that_always_roll_context(void)
 {
-   return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
-          SI_STATE_BIT(poly_offset) |
+   return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(poly_offset) |
           SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
           SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color)|
           SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports)|
@@ -246,6 +253,12 @@ enum si_tracked_reg
    SI_TRACKED_DB_RENDER_CONTROL,
    SI_TRACKED_DB_COUNT_CONTROL,
 
+   SI_TRACKED_DB_DEPTH_CONTROL,
+   SI_TRACKED_DB_STENCIL_CONTROL,
+   /* 2 consecutive registers */
+   SI_TRACKED_DB_DEPTH_BOUNDS_MIN,
+   SI_TRACKED_DB_DEPTH_BOUNDS_MAX,
+
    /* 2 consecutive registers */
    SI_TRACKED_PA_SC_LINE_CNTL,
    SI_TRACKED_PA_SC_AA_CONFIG,
@@ -358,6 +371,8 @@ enum si_tracked_reg
    SI_TRACKED_SPI_SHADER_USER_DATA_VS__DRAWID,           /* GFX6-10 */
    SI_TRACKED_SPI_SHADER_USER_DATA_VS__START_INSTANCE,   /* GFX6-10 */
 
+   SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF,
+
    SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
    SI_TRACKED_COMPUTE_NUM_THREAD_X,
    SI_TRACKED_COMPUTE_NUM_THREAD_Y,
-- 
GitLab


From 11fcd58957e6023f97c41f57526c84efa2a74d08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 26 Nov 2023 10:56:24 -0500
Subject: [PATCH 35/38] radeonsi: convert rasterizer state to tracked registers

Most of these registers don't change, so we should not set them when they
don't. This reworks the rasterizer state to use a custom emit function and
eliminate redundant register changes. This required merging the poly_offset
state into the rasterizer state and change how the poly offset state is
updated.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_build_pm4.h |  31 ++
 src/gallium/drivers/radeonsi/si_gfx_cs.c    |  16 +
 src/gallium/drivers/radeonsi/si_pipe.h      |   1 +
 src/gallium/drivers/radeonsi/si_state.c     | 322 +++++++++++---------
 src/gallium/drivers/radeonsi/si_state.h     |  37 ++-
 5 files changed, 265 insertions(+), 142 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index dad653720d446..2242213d20d35 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -161,6 +161,34 @@
    } \
 } while (0)
 
+#define radeon_opt_set_reg6(reg, reg_enum, v1, v2, v3, v4, v5, v6, prefix_name, packet) do { \
+   unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4), __v5 = (v5), __v6 = (v6); \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 5, 0x3f) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 4] != __v5 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 5] != __v6) { \
+      radeon_set_reg_seq(reg, 6, 0, prefix_name, packet, 0); \
+      radeon_emit(__v1); \
+      radeon_emit(__v2); \
+      radeon_emit(__v3); \
+      radeon_emit(__v4); \
+      radeon_emit(__v5); \
+      radeon_emit(__v6); \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 5); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 4] = __v5; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 5] = __v6; \
+   } \
+} while (0)
+
 #define radeon_opt_set_regn(reg, values, saved_values, num, prefix_name, packet) do { \
    if (memcmp(values, saved_values, sizeof(uint32_t) * (num))) { \
       radeon_set_reg_seq(reg, num, 0, prefix_name, packet, 0); \
@@ -199,6 +227,9 @@
 #define radeon_opt_set_context_reg5(_unused, reg, reg_enum, v1, v2, v3, v4, v5) \
    radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
+#define radeon_opt_set_context_reg6(reg, reg_enum, v1, v2, v3, v4, v5, v6) \
+   radeon_opt_set_reg6(reg, reg_enum, v1, v2, v3, v4, v5, v6, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
+
 #define radeon_opt_set_context_regn(_unused, reg, values, saved_values, num) \
    radeon_opt_set_regn(reg, values, saved_values, num, SI_CONTEXT, PKT3_SET_CONTEXT_REG)
 
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 0f48775e0ee62..89c337b27802a 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -214,6 +214,21 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
    ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MIN] = 0;
    ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MAX] = 0;
 
+   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_INTERP_CONTROL_0] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POINT_SIZE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POINT_MINMAX] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_LINE_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_0] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SC_MODE_CNTL] = 0x4;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_EDGERULE] = 0xaa99aaaa;
+
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET] = 0;
+
    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000;
    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0;
 
@@ -252,6 +267,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x3;
    ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0;
    ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0;
+   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0;
    ctx->tracked_regs.reg_value[SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL] = 0;
 
    ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 20fdcf180eeec..1a122970021fd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -457,6 +457,7 @@ struct si_surface {
    bool color_is_int8 : 1;
    bool color_is_int10 : 1;
    bool dcc_incompatible : 1;
+   uint8_t db_format_index : 3;
 
    /* Color registers. */
    unsigned cb_color_info;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index e6324b710f76d..4d61823eab3c9 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -924,35 +924,6 @@ static void si_emit_clip_regs(struct si_context *sctx, unsigned index)
    }
 }
 
-/*
- * inferred state between framebuffer and rasterizer
- */
-static void si_update_poly_offset_state(struct si_context *sctx)
-{
-   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-
-   if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
-      si_pm4_bind_state(sctx, poly_offset, NULL);
-      return;
-   }
-
-   /* Use the user format, not db_render_format, so that the polygon
-    * offset behaves as expected by applications.
-    */
-   switch (sctx->framebuffer.state.zsbuf->texture->format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
-      break;
-   default: /* 24-bit */
-      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
-      break;
-   case PIPE_FORMAT_Z32_FLOAT:
-   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
-      break;
-   }
-}
-
 /*
  * Rasterizer
  */
@@ -976,16 +947,11 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
 {
    struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
    struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
-   struct si_pm4_state *pm4 = &rs->pm4;
-   unsigned tmp, i;
-   float psize_min, psize_max;
 
    if (!rs) {
       return NULL;
    }
 
-   si_pm4_clear_state(pm4, sscreen, false);
-
    rs->scissor_enable = state->scissor;
    rs->clip_halfz = state->clip_halfz;
    rs->two_side = state->light_twoside;
@@ -1069,20 +1035,19 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
          rs->force_front_face_input = 1;
    }
 
-   unsigned spi_interp_control_0 =
-      S_0286D4_FLAT_SHADE_ENA(1) |
-      S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
-      S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
-      S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
-      S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
-      S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
-      S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT);
-
-   si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, spi_interp_control_0);
+   rs->spi_interp_control_0 = S_0286D4_FLAT_SHADE_ENA(1) |
+                              S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
+                              S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+                              S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+                              S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+                              S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
+                              S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode !=
+                                                        PIPE_SPRITE_COORD_UPPER_LEFT);
 
    /* point size 12.4 fixed point */
-   tmp = (unsigned)(state->point_size * 8.0);
-   si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
+   float psize_min, psize_max;
+   unsigned tmp = (unsigned)(state->point_size * 8.0);
+   rs->pa_su_point_size = S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp);
 
    if (state->point_size_per_vertex) {
       psize_min = util_get_min_point_size(state);
@@ -1095,115 +1060,176 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
    rs->max_point_size = psize_max;
 
    /* Divide by two, because 0.5 = 1 pixel. */
-   si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
-                  S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) |
-                  S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)));
-   si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
-                  S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)));
-
-   si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0,
-                  S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
-                  S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth ||
-                                       state->line_smooth) |
-                  S_028A48_VPORT_SCISSOR_ENABLE(1) |
-                  S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.gfx_level >= GFX9));
+   rs->pa_su_point_minmax = S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) |
+                            S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2));
+   rs->pa_su_line_cntl = S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2));
+
+   rs->pa_sc_mode_cntl_0 = S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
+                           S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth ||
+                                                state->line_smooth) |
+                           S_028A48_VPORT_SCISSOR_ENABLE(1) |
+                           S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.gfx_level >= GFX9);
 
    bool polygon_mode_enabled =
       (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
       (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
 
-   si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
-                  S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
-                  S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-                  S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
-                  S_028814_FACE(!state->front_ccw) |
-                  S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
-                  S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
-                  S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
-                  S_028814_POLY_MODE(polygon_mode_enabled) |
-                  S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
-                  S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) |
-                  /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */
-                  S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 ?
-                                                   polygon_mode_enabled ||
-                                                   rs->perpendicular_end_caps : 0));
-
+   rs->pa_su_sc_mode_cntl = S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
+                            S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+                            S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+                            S_028814_FACE(!state->front_ccw) |
+                            S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
+                            S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
+                            S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
+                            S_028814_POLY_MODE(polygon_mode_enabled) |
+                            S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
+                            S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) |
+                            /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */
+                            S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 ?
+                                                             polygon_mode_enabled ||
+                                                             rs->perpendicular_end_caps : 0);
    if (sscreen->info.gfx_level >= GFX10) {
-      si_pm4_set_reg(pm4, R_028838_PA_CL_NGG_CNTL,
-                     S_028838_INDEX_BUF_EDGE_FLAG_ENA(rs->polygon_mode_is_points ||
-                                                      rs->polygon_mode_is_lines) |
-                     S_028838_VERTEX_REUSE_DEPTH(sscreen->info.gfx_level >= GFX10_3 ? 30 : 0));
+      rs->pa_cl_ngg_cntl = S_028838_INDEX_BUF_EDGE_FLAG_ENA(rs->polygon_mode_is_points ||
+                                                            rs->polygon_mode_is_lines) |
+                           S_028838_VERTEX_REUSE_DEPTH(sscreen->info.gfx_level >= GFX10_3 ? 30 : 0);
    }
 
    if (state->bottom_edge_rule) {
       /* OpenGL windows should set this. */
-      si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
-                     S_028230_ER_TRI(0xA) |
-                     S_028230_ER_POINT(0x5) |
-                     S_028230_ER_RECT(0x9) |
-                     S_028230_ER_LINE_LR(0x2A) |
-                     S_028230_ER_LINE_RL(0x2A) |
-                     S_028230_ER_LINE_TB(0xA) |
-                     S_028230_ER_LINE_BT(0xA));
+      rs->pa_sc_edgerule = S_028230_ER_TRI(0xA) |
+                           S_028230_ER_POINT(0x5) |
+                           S_028230_ER_RECT(0x9) |
+                           S_028230_ER_LINE_LR(0x2A) |
+                           S_028230_ER_LINE_RL(0x2A) |
+                           S_028230_ER_LINE_TB(0xA) |
+                           S_028230_ER_LINE_BT(0xA);
    } else {
       /* OpenGL FBOs and Direct3D should set this. */
-      si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
-                     S_028230_ER_TRI(0xA) |
-                     S_028230_ER_POINT(0x6) |
-                     S_028230_ER_RECT(0xA) |
-                     S_028230_ER_LINE_LR(0x19) |
-                     S_028230_ER_LINE_RL(0x25) |
-                     S_028230_ER_LINE_TB(0xA) |
-                     S_028230_ER_LINE_BT(0xA));
+      rs->pa_sc_edgerule = S_028230_ER_TRI(0xA) |
+                           S_028230_ER_POINT(0x6) |
+                           S_028230_ER_RECT(0xA) |
+                           S_028230_ER_LINE_LR(0x19) |
+                           S_028230_ER_LINE_RL(0x25) |
+                           S_028230_ER_LINE_TB(0xA) |
+                           S_028230_ER_LINE_BT(0xA);
    }
-   si_pm4_finalize(pm4);
 
-   if (!rs->uses_poly_offset)
-      return rs;
+   if (rs->uses_poly_offset) {
+      /* Calculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
+      rs->pa_su_poly_offset_clamp = fui(state->offset_clamp);
+      rs->pa_su_poly_offset_frontback_scale = fui(state->offset_scale * 16);
 
-   rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
-   if (!rs->pm4_poly_offset) {
-      FREE(rs);
-      return NULL;
+      if (!state->offset_units_unscaled) {
+         /* 16-bit zbuffer */
+         rs->pa_su_poly_offset_db_fmt_cntl[0] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
+         rs->pa_su_poly_offset_frontback_offset[0] = fui(state->offset_units * 4);
+
+         /* 24-bit zbuffer */
+         rs->pa_su_poly_offset_db_fmt_cntl[1] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
+         rs->pa_su_poly_offset_frontback_offset[1] = fui(state->offset_units * 2);
+
+         /* 32-bit zbuffer */
+         rs->pa_su_poly_offset_db_fmt_cntl[2] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) |
+                                                S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
+         rs->pa_su_poly_offset_frontback_offset[2] = fui(state->offset_units);
+      } else {
+         rs->pa_su_poly_offset_frontback_offset[0] = fui(state->offset_units);
+         rs->pa_su_poly_offset_frontback_offset[1] = fui(state->offset_units);
+         rs->pa_su_poly_offset_frontback_offset[2] = fui(state->offset_units);
+      }
    }
 
-   /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
-   for (i = 0; i < 3; i++) {
-      struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
-      float offset_units = state->offset_units;
-      float offset_scale = state->offset_scale * 16.0f;
-      uint32_t pa_su_poly_offset_db_fmt_cntl = 0;
+   return rs;
+}
 
-      si_pm4_clear_state(pm4, sscreen, false);
+static void si_pm4_emit_rasterizer(struct si_context *sctx, unsigned index)
+{
+   struct si_state_rasterizer *state = sctx->queued.named.rasterizer;
 
-      if (!state->offset_units_unscaled) {
-         switch (i) {
-         case 0: /* 16-bit zbuffer */
-            offset_units *= 4.0f;
-            pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
-            break;
-         case 1: /* 24-bit zbuffer */
-            offset_units *= 2.0f;
-            pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
-            break;
-         case 2: /* 32-bit zbuffer */
-            offset_units *= 1.0f;
-            pa_su_poly_offset_db_fmt_cntl =
-               S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
-            break;
-         }
+   if (sctx->screen->info.has_set_context_pairs_packed) {
+      radeon_begin(&sctx->gfx_cs);
+      gfx11_begin_packed_context_regs();
+      gfx11_opt_set_context_reg(R_0286D4_SPI_INTERP_CONTROL_0, SI_TRACKED_SPI_INTERP_CONTROL_0,
+                                state->spi_interp_control_0);
+      gfx11_opt_set_context_reg(R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE,
+                                state->pa_su_point_size);
+      gfx11_opt_set_context_reg(R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX,
+                                state->pa_su_point_minmax);
+      gfx11_opt_set_context_reg(R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL,
+                                state->pa_su_line_cntl);
+      gfx11_opt_set_context_reg(R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0,
+                                state->pa_sc_mode_cntl_0);
+      gfx11_opt_set_context_reg(R_028814_PA_SU_SC_MODE_CNTL, SI_TRACKED_PA_SU_SC_MODE_CNTL,
+                                state->pa_su_sc_mode_cntl);
+      gfx11_opt_set_context_reg(R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
+                                state->pa_cl_ngg_cntl);
+      gfx11_opt_set_context_reg(R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE,
+                                state->pa_sc_edgerule);
+
+      if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) {
+         unsigned db_format_index =
+            ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index;
+
+         gfx11_opt_set_context_reg(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
+                                   SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
+                                   state->pa_su_poly_offset_db_fmt_cntl[db_format_index]);
+         gfx11_opt_set_context_reg(R_028B7C_PA_SU_POLY_OFFSET_CLAMP,
+                                   SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP,
+                                   state->pa_su_poly_offset_clamp);
+         gfx11_opt_set_context_reg(R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE,
+                                   SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE,
+                                   state->pa_su_poly_offset_frontback_scale);
+         gfx11_opt_set_context_reg(R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET,
+                                   SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET,
+                                   state->pa_su_poly_offset_frontback_offset[db_format_index]);
+         gfx11_opt_set_context_reg(R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE,
+                                   SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE,
+                                   state->pa_su_poly_offset_frontback_scale);
+         gfx11_opt_set_context_reg(R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET,
+                                   SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET,
+                                   state->pa_su_poly_offset_frontback_offset[db_format_index]);
       }
-
-      si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
-      si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
-      si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale));
-      si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
-      si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
-      si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
-      si_pm4_finalize(pm4);
+      gfx11_end_packed_context_regs();
+      radeon_end(); /* don't track context rolls on GFX11 */
+   } else {
+      radeon_begin(&sctx->gfx_cs);
+      radeon_opt_set_context_reg(sctx, R_0286D4_SPI_INTERP_CONTROL_0,
+                                 SI_TRACKED_SPI_INTERP_CONTROL_0,
+                                 state->spi_interp_control_0);
+      radeon_opt_set_context_reg(sctx, R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE,
+                                 state->pa_su_point_size);
+      radeon_opt_set_context_reg(sctx, R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX,
+                                 state->pa_su_point_minmax);
+      radeon_opt_set_context_reg(sctx, R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL,
+                                 state->pa_su_line_cntl);
+      radeon_opt_set_context_reg(sctx, R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0,
+                                 state->pa_sc_mode_cntl_0);
+      radeon_opt_set_context_reg(sctx, R_028814_PA_SU_SC_MODE_CNTL,
+                                 SI_TRACKED_PA_SU_SC_MODE_CNTL, state->pa_su_sc_mode_cntl);
+      if (sctx->gfx_level >= GFX10) {
+         radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
+                                    state->pa_cl_ngg_cntl);
+      }
+      radeon_opt_set_context_reg(sctx, R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE,
+                                 state->pa_sc_edgerule);
+
+      if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) {
+         unsigned db_format_index =
+            ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index;
+
+         radeon_opt_set_context_reg6(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
+                                     SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
+                                     state->pa_su_poly_offset_db_fmt_cntl[db_format_index],
+                                     state->pa_su_poly_offset_clamp,
+                                     state->pa_su_poly_offset_frontback_scale,
+                                     state->pa_su_poly_offset_frontback_offset[db_format_index],
+                                     state->pa_su_poly_offset_frontback_scale,
+                                     state->pa_su_poly_offset_frontback_offset[db_format_index]);
+      }
+      radeon_end_update_context_roll();
    }
 
-   return rs;
+   sctx->emitted.named.rasterizer = state;
 }
 
 static void si_bind_rs_state(struct pipe_context *ctx, void *state)
@@ -1238,7 +1264,6 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
    SET_FIELD(sctx->current_vs_state, VS_STATE_CLAMP_VERTEX_COLOR, rs->clamp_vertex_color);
 
    si_pm4_bind_state(sctx, rasterizer, rs);
-   si_update_poly_offset_state(sctx);
 
    if (old_rs->scissor_enable != rs->scissor_enable)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
@@ -1311,7 +1336,6 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state)
    if (sctx->queued.named.rasterizer == state)
       si_bind_rs_state(ctx, sctx->discard_rasterizer_state);
 
-   FREE(rs->pm4_poly_offset);
    si_pm4_free_state(sctx, &rs->pm4, SI_STATE_IDX(rasterizer));
 }
 
@@ -2748,6 +2772,22 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su
    if (format == V_028040_Z_INVALID)
       PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
 
+   /* Use the original Z format, not db_render_format, so that the polygon offset behaves as
+    * expected by applications.
+    */
+   switch (tex->buffer.b.b.format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      surf->db_format_index = 0;
+      break;
+   default: /* 24-bit */
+      surf->db_format_index = 1;
+      break;
+   case PIPE_FORMAT_Z32_FLOAT:
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      surf->db_format_index = 2;
+      break;
+   }
+
    if (sctx->gfx_level >= GFX9) {
       surf->db_htile_data_base = 0;
       surf->db_htile_surface = 0;
@@ -2973,6 +3013,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
    bool old_has_stencil =
       old_has_zsbuf &&
       ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
+   uint8_t old_db_format_index =
+      old_has_zsbuf ?
+      ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index : -1;
    int i;
 
    /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
@@ -3162,10 +3205,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
       if (!sctx->framebuffer.min_bytes_per_pixel ||
           zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
          sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
+
+      /* Update polygon offset based on the Z format. */
+      if (sctx->queued.named.rasterizer->uses_poly_offset &&
+          surf->db_format_index != old_db_format_index)
+         (sctx)->dirty_atoms |= SI_STATE_BIT(rasterizer);
    }
 
    si_update_ps_colorbuf0_slot(sctx);
-   si_update_poly_offset_state(sctx);
    si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
    si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
 
@@ -5761,9 +5808,8 @@ void si_init_state_compute_functions(struct si_context *sctx)
 void si_init_state_functions(struct si_context *sctx)
 {
    sctx->atoms.s.pm4_states[SI_STATE_IDX(blend)].emit = si_pm4_emit_state;
-   sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_state;
+   sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_rasterizer;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_dsa;
-   sctx->atoms.s.pm4_states[SI_STATE_IDX(poly_offset)].emit = si_pm4_emit_state;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(ls)].emit = si_pm4_emit_shader;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(hs)].emit = si_pm4_emit_shader;
    sctx->atoms.s.pm4_states[SI_STATE_IDX(es)].emit = si_pm4_emit_shader;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 189ae76c99d54..096f94cee391f 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -54,8 +54,21 @@ struct si_state_blend {
 
 struct si_state_rasterizer {
    struct si_pm4_state pm4;
-   /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
-   struct si_pm4_state *pm4_poly_offset;
+
+   /* Register values. */
+   unsigned spi_interp_control_0;
+   unsigned pa_su_point_size;
+   unsigned pa_su_point_minmax;
+   unsigned pa_su_line_cntl;
+   unsigned pa_sc_mode_cntl_0;
+   unsigned pa_su_sc_mode_cntl;
+   unsigned pa_cl_ngg_cntl;
+   unsigned pa_sc_edgerule;
+   unsigned pa_su_poly_offset_db_fmt_cntl[3];
+   unsigned pa_su_poly_offset_clamp;
+   unsigned pa_su_poly_offset_frontback_scale;
+   unsigned pa_su_poly_offset_frontback_offset[3];
+
    unsigned pa_sc_line_stipple;
    unsigned pa_cl_clip_cntl;
    float line_width;
@@ -178,7 +191,6 @@ union si_state {
       struct si_state_blend *blend;
       struct si_state_rasterizer *rasterizer;
       struct si_state_dsa *dsa;
-      struct si_pm4_state *poly_offset;
       struct si_shader *ls;
       struct si_shader *hs;
       struct si_shader *es;
@@ -234,7 +246,7 @@ union si_state_atoms {
 
 static inline uint64_t si_atoms_that_always_roll_context(void)
 {
-   return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(poly_offset) |
+   return SI_STATE_BIT(blend) |
           SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
           SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color)|
           SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports)|
@@ -259,6 +271,22 @@ enum si_tracked_reg
    SI_TRACKED_DB_DEPTH_BOUNDS_MIN,
    SI_TRACKED_DB_DEPTH_BOUNDS_MAX,
 
+   SI_TRACKED_SPI_INTERP_CONTROL_0,
+   SI_TRACKED_PA_SU_POINT_SIZE,
+   SI_TRACKED_PA_SU_POINT_MINMAX,
+   SI_TRACKED_PA_SU_LINE_CNTL,
+   SI_TRACKED_PA_SC_MODE_CNTL_0,
+   SI_TRACKED_PA_SU_SC_MODE_CNTL,
+   SI_TRACKED_PA_SC_EDGERULE,
+
+   /* 6 consecutive registers */
+   SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
+   SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP,
+   SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE,
+   SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET,
+   SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE,
+   SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET,
+
    /* 2 consecutive registers */
    SI_TRACKED_PA_SC_LINE_CNTL,
    SI_TRACKED_PA_SC_AA_CONFIG,
@@ -304,6 +332,7 @@ enum si_tracked_reg
    SI_TRACKED_PA_SC_BINNER_CNTL_0,           /* GFX9+ */
    SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,    /* GFX10+ - the SMALL_PRIM_FILTER slot above can be reused */
    SI_TRACKED_GE_NGG_SUBGRP_CNTL,            /* GFX10+ */
+   SI_TRACKED_PA_CL_NGG_CNTL,                /* GFX10+ */
    SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL,    /* GFX10.3+ */
 
    /* 3 consecutive registers */
-- 
GitLab


From 2f281b39abe2650d914806e4ad1288bbb1c01567 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 1 Dec 2023 22:54:59 -0500
Subject: [PATCH 36/38] ac/gpu_info: fix printing radeon_info after adding VPE

Fixes: 3ec397819e7 - amd: add new hardware ip for vpe

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/amd/common/ac_gpu_info.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index f0f06b66e69d0..9ed47e04dac00 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1701,7 +1701,7 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
    fprintf(f, "    pcie_bandwidth = %1.1f GB/s\n", info->pcie_bandwidth_mbps / 1024.0);
    fprintf(f, "    clock_crystal_freq = %i KHz\n", info->clock_crystal_freq);
 
-   const char *ip_string[] = {
+   const char *ip_string[AMD_NUM_IP_TYPES] = {
       [AMD_IP_GFX] = "GFX",
       [AMD_IP_COMPUTE] = "COMP",
       [AMD_IP_SDMA] = "SDMA",
@@ -1711,6 +1711,7 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
       [AMD_IP_VCN_DEC] = "VCN_DEC",
       [AMD_IP_VCN_ENC] = (info->vcn_ip_version >= VCN_4_0_0) ? "VCN" : "VCN_ENC",
       [AMD_IP_VCN_JPEG] = "VCN_JPG",
+      [AMD_IP_VPE] = "VPE",
    };
 
    for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
-- 
GitLab


From 2e50b6677bf0d82dd4d21e8efb42a09ba8233a60 Mon Sep 17 00:00:00 2001
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Date: Thu, 7 Dec 2023 09:31:22 +0100
Subject: [PATCH 37/38] radeonsi: update guardband if
 vs_disables_clipping_viewport changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because the guardband state depends on this value.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_state_viewport.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 88b26580e9c01..b70900d51bf2e 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -610,6 +610,7 @@ void si_update_vs_viewport_state(struct si_context *ctx)
 
    if (ctx->vs_disables_clipping_viewport != vs_window_space) {
       ctx->vs_disables_clipping_viewport = vs_window_space;
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
       si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
    }
-- 
GitLab


From 57e658d0418c178ccacb8faf23f81a4ca37ae0aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 26 Nov 2023 17:03:45 -0500
Subject: [PATCH 38/38] radeonsi: rework how guardband registers are updated to
 decrease overhead

See the code comments.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307>
---
 src/gallium/drivers/radeonsi/si_pipe.h        | 58 +++++++++++++++++--
 src/gallium/drivers/radeonsi/si_state.c       |  9 ++-
 .../drivers/radeonsi/si_state_viewport.c      | 30 +++-------
 3 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 1a122970021fd..ac96d1cbfa71c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1255,6 +1255,23 @@ struct si_context {
    bool vs_disables_clipping_viewport;
    bool has_reset_been_notified;
 
+   /* The number of pixels outside the viewport that are not culled by the clipper.
+    * Normally, the clipper clips everything outside the viewport, however, points and lines
+    * can have vertices outside the viewport, but their edges can be inside the viewport. Those
+    * shouldn't be culled. The problem is that the register setting (PA_CL_GB_*_DISC_ADJ) that
+    * controls the discard distance, which depends on the point size and line width, applies to
+    * all primitive types, and we would have to set 0 distance for triangles and non-zero for
+    * points and lines whenever the primitive type changes, which would add overhead and cause
+    * context rolls.
+    *
+    * To reduce that, whenever the discard distance changes for points and lines, we keep it
+    * at that higher value up to a certain small number for all primitive types including all
+    * points and lines within a specific size. This is slightly inefficient, but it eliminates
+    * a lot of guardband state updates and context register changes.
+    */
+   float min_clip_discard_distance_watermark;
+   float current_clip_discard_distance;
+
    /* Precomputed IA_MULTI_VGT_PARAM */
    union si_vgt_param_key ia_multi_vgt_param_key;
    unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
@@ -2114,6 +2131,28 @@ void si_check_dirty_buffers_textures(struct si_context *sctx)
    }
 }
 
+static inline void si_set_clip_discard_distance(struct si_context *sctx, float distance)
+{
+   /* Determine whether the guardband registers change.
+    *
+    * When we see a value greater than min_clip_discard_distance_watermark, we increase it
+    * up to a certain number to eliminate those state changes next time they happen.
+    * See the comment at min_clip_discard_distance_watermark.
+    */
+   if (distance > sctx->min_clip_discard_distance_watermark) {
+      /* The maximum number was determined from Viewperf. The number is in units of half-pixels. */
+      sctx->min_clip_discard_distance_watermark = MIN2(distance, 6);
+
+      float old_distance = sctx->current_clip_discard_distance;
+      float new_distance = MAX2(distance, sctx->min_clip_discard_distance_watermark);
+
+      if (old_distance != new_distance) {
+         sctx->current_clip_discard_distance = new_distance;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+      }
+   }
+}
+
 /* Update these two GS_STATE fields. They depend on whatever the last shader before PS is
  * and the rasterizer state.
  *
@@ -2147,16 +2186,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim,
       bool is_rect = rast_prim == SI_PRIM_RECTANGLE_LIST;
       bool is_points = rast_prim == MESA_PRIM_POINTS;
       bool is_lines = util_prim_is_lines(rast_prim);
-      bool is_triangles = util_rast_prim_is_triangles(rast_prim);
 
-      if ((is_points || is_lines) != util_prim_is_points_or_lines(sctx->current_rast_prim))
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+      if (is_points) {
+         si_set_clip_discard_distance(sctx, sctx->queued.named.rasterizer->max_point_size);
+         sctx->gs_out_prim = V_028A6C_POINTLIST;
+      } else if (is_lines) {
+         si_set_clip_discard_distance(sctx, sctx->queued.named.rasterizer->line_width);
+         sctx->gs_out_prim = V_028A6C_LINESTRIP;
+      } else if (is_rect) {
+         /* Don't change the clip discard distance for rectangles. */
+         sctx->gs_out_prim = V_028A6C_RECTLIST;
+      } else {
+         si_set_clip_discard_distance(sctx, 0);
+         sctx->gs_out_prim = V_028A6C_TRISTRIP;
+      }
 
       sctx->current_rast_prim = rast_prim;
       si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
-      sctx->gs_out_prim = is_triangles ? V_028A6C_TRISTRIP :
-                          is_lines ? V_028A6C_LINESTRIP :
-                          is_rect ? V_028A6C_RECTLIST : V_028A6C_POINTLIST;
       si_update_ngg_prim_state_sgpr(sctx, hw_vs, ngg);
    }
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 4d61823eab3c9..0c47c40695c23 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1268,10 +1268,15 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
    if (old_rs->scissor_enable != rs->scissor_enable)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
 
-   if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size ||
-       old_rs->half_pixel_center != rs->half_pixel_center)
+   /* This never changes for OpenGL. */
+   if (old_rs->half_pixel_center != rs->half_pixel_center)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
 
+   if (util_prim_is_lines(sctx->current_rast_prim))
+      si_set_clip_discard_distance(sctx, rs->line_width);
+   else if (sctx->current_rast_prim == MESA_PRIM_POINTS)
+      si_set_clip_discard_distance(sctx, rs->max_point_size);
+
    if (old_rs->clip_halfz != rs->clip_halfz)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index b70900d51bf2e..e2de5ffe2f0be 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -249,7 +249,6 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index)
    struct si_signed_scissor vp_as_scissor;
    struct pipe_viewport_state vp;
    float left, top, right, bottom, max_range, guardband_x, guardband_y;
-   float discard_x, discard_y;
 
    if (sctx->vs_writes_viewport_index) {
       /* Shaders can draw to any viewport. Make a union of all
@@ -339,28 +338,17 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index)
    guardband_x = MIN2(-left, right);
    guardband_y = MIN2(-top, bottom);
 
-   discard_x = 1.0;
-   discard_y = 1.0;
+   float discard_x = 1.0;
+   float discard_y = 1.0;
+   float distance = sctx->current_clip_discard_distance;
 
-   if (unlikely(util_prim_is_points_or_lines(sctx->current_rast_prim))) {
-      /* When rendering wide points or lines, we need to be more
-       * conservative about when to discard them entirely. */
-      float pixels;
+   /* Add half the point size / line width */
+   discard_x += distance / (2.0 * vp.scale[0]);
+   discard_y += distance / (2.0 * vp.scale[1]);
 
-      if (sctx->current_rast_prim == MESA_PRIM_POINTS)
-         pixels = rs->max_point_size;
-      else
-         pixels = rs->line_width;
-
-      /* Add half the point size / line width */
-      discard_x += pixels / (2.0 * vp.scale[0]);
-      discard_y += pixels / (2.0 * vp.scale[1]);
-
-      /* Discard primitives that would lie entirely outside the clip
-       * region. */
-      discard_x = MIN2(discard_x, guardband_x);
-      discard_y = MIN2(discard_y, guardband_y);
-   }
+   /* Discard primitives that would lie entirely outside the viewport area. */
+   discard_x = MIN2(discard_x, guardband_x);
+   discard_y = MIN2(discard_y, guardband_y);
 
    unsigned pa_su_vtx_cntl = S_028BE4_PIX_CENTER(rs->half_pixel_center) |
                              S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) |
-- 
GitLab