From f51b960af18f697a187fe2ed4918fb95495e5569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Thu, 23 Nov 2023 18:05:10 -0500 Subject: [PATCH 01/38] radeonsi/gfx11: fix unaligned SET_CONTEXT_PAIRS_PACKED It set an invalid register. Luckily it didn't cause any issues. Fixes: 2ac6816b70d7bb - radeonsi/gfx11: use SET_CONTEXT_REG_PAIRS_PACKED for other states Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_build_pm4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 6ce53255cfee2..24e799080706f 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -339,7 +339,7 @@ if (__cs_context_reg_count >= 2) { \ /* Align the count to 2 by duplicating the first register. */ \ if (__cs_context_reg_count % 2 == 1) { \ - gfx11_set_context_reg(__cs_context_regs[0].reg_offset[0] + SI_CONTEXT_REG_OFFSET, \ + gfx11_set_context_reg(SI_CONTEXT_REG_OFFSET + __cs_context_regs[0].reg_offset[0] * 4, \ __cs_context_regs[0].reg_value[0]); \ } \ assert(__cs_context_reg_count % 2 == 0); \ -- GitLab From 98e7a7123b8af55c2aa287a5f6071db980ccfd53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 24 Nov 2023 04:49:30 -0500 Subject: [PATCH 02/38] radeonsi: don't set non-existent VGT_GS_MAX_PRIMS_PER_SUBGROUP on gfx10 Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.h | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 0d094b4ce9537..1f4fda8c11184 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -296,7 +296,7 @@ enum si_tracked_context_reg SI_TRACKED_VGT_REUSE_OFF, /* GFX6-8 (GFX9+ can reuse this slot) */ SI_TRACKED_IA_MULTI_VGT_PARAM, /* GFX6-8 (GFX9+ can reuse this slot) */ - SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, /* GFX9-10 - the slots above can be reused */ + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, /* GFX9 - the slots above can be reused */ SI_TRACKED_VGT_GS_ONCHIP_CNTL, /* GFX9-10 - the slots above can be reused */ SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, /* GFX6-10 (GFX11+ can reuse this slot) */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 42d3e78af8dbb..d37ced356a7cb 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -958,9 +958,11 @@ static void si_emit_shader_gs(struct si_context *sctx, unsigned index) radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, shader->gs.vgt_gs_onchip_cntl); /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */ - radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - shader->gs.vgt_gs_max_prims_per_subgroup); + if (sctx->gfx_level == GFX9) { + radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + shader->gs.vgt_gs_max_prims_per_subgroup); + } if (shader->key.ge.part.gs.es->stage == MESA_SHADER_TESS_EVAL) radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, -- GitLab From c8411ddf17700bac9910b9cc1017805415fc0fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 02:03:29 -0500 Subject: [PATCH 03/38] radeonsi: change the low-priority compiler queue to normal priority I'm guessing that low priority could cause us to get optimized shaders later than we need. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_pipe.c | 11 +++++------ src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a65f8b7d307ea..5a2111d66985e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -448,7 +448,7 @@ static void si_set_debug_callback(struct pipe_context *ctx, const struct util_de struct si_screen *screen = sctx->screen; util_queue_finish(&screen->shader_compiler_queue); - util_queue_finish(&screen->shader_compiler_queue_low_priority); + util_queue_finish(&screen->shader_compiler_queue_opt_variants); if (cb) sctx->debug = *cb; @@ -1004,7 +1004,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen) } util_queue_destroy(&sscreen->shader_compiler_queue); - util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); + util_queue_destroy(&sscreen->shader_compiler_queue_opt_variants); /* Release the reference on glsl types of the compiler threads. */ glsl_type_singleton_decref(); @@ -1340,7 +1340,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", num_slots, num_comp_hi_threads, UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { + UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { si_destroy_shader_cache(sscreen); FREE(sscreen->nir_options); FREE(sscreen); @@ -1348,11 +1348,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, return NULL; } - if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", num_slots, + if (!util_queue_init(&sscreen->shader_compiler_queue_opt_variants, "sh_opt", num_slots, num_comp_lo_threads, UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | - UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY, NULL)) { + UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { si_destroy_shader_cache(sscreen); FREE(sscreen->nir_options); FREE(sscreen); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 6277c448de209..07492b9469ce3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -703,7 +703,7 @@ struct si_screen { */ struct ac_llvm_compiler *compiler[24]; /* used by the queue only */ - struct util_queue shader_compiler_queue_low_priority; + struct util_queue shader_compiler_queue_opt_variants; /* Compiler instances for asynchronous shader compilation of optimized shader variants, * one for each thread of the low-priority shader compiler queue. */ struct ac_llvm_compiler *compiler_lowp[10]; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index d37ced356a7cb..3405e754e8ea7 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2912,7 +2912,7 @@ current_not_ready: /* If it's an optimized shader, compile it asynchronously. */ if (shader->is_optimized) { /* Compile it asynchronously. */ - util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready, + util_queue_add_job(&sscreen->shader_compiler_queue_opt_variants, shader, &shader->ready, si_build_shader_variant_low_priority, NULL, 0); /* Add only after the ready fence was reset, to guard against a @@ -3693,7 +3693,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) { if (shader->is_optimized) { - util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready); + util_queue_drop_job(&sctx->screen->shader_compiler_queue_opt_variants, &shader->ready); } util_queue_fence_destroy(&shader->ready); -- GitLab From 613ea16aab501edc67115768b428673f1e05daae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 03:41:17 -0500 Subject: [PATCH 04/38] radeonsi: update shaders for blend state only if the shader key changed Check if any key bit or state changed before setting do_update_shaders. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.c | 1 - .../drivers/radeonsi/si_state_shaders.cpp | 32 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 063f2e805d5c1..f4237bccbee48 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -775,7 +775,6 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { si_ps_key_update_framebuffer_blend_rasterizer(sctx); si_update_ps_inputs_read_or_disabled(sctx); - sctx->do_update_shaders = true; } if (sctx->screen->dpbb_allowed && diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 3405e754e8ea7..20942729efa6a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2214,8 +2214,10 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); } + uint64_t ps_inputs_read_or_disabled; + if (ps_disabled) { - sctx->ps_inputs_read_or_disabled = 0; + ps_inputs_read_or_disabled = 0; } else { uint64_t inputs_read = ps->info.inputs_read; @@ -2227,7 +2229,12 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) inputs_read |= BITFIELD64_BIT(SI_UNIQUE_SLOT_BFC1); } - sctx->ps_inputs_read_or_disabled = inputs_read; + ps_inputs_read_or_disabled = inputs_read; + } + + if (sctx->ps_inputs_read_or_disabled != ps_inputs_read_or_disabled) { + sctx->ps_inputs_read_or_disabled = ps_inputs_read_or_disabled; + sctx->do_update_shaders = true; } } @@ -2302,6 +2309,9 @@ void si_ps_key_update_framebuffer(struct si_context *sctx) void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; + if (!sel) + return; + union si_shader_key *key = &sctx->shader.ps.key; struct si_state_blend *blend = sctx->queued.named.blend; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; @@ -2309,8 +2319,14 @@ void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx) sctx->framebuffer.nr_samples >= 2; unsigned need_src_alpha_4bit = blend->need_src_alpha_4bit; - if (!sel) - return; + /* Old key data for comparison. */ + struct si_ps_epilog_bits old_epilog; + memcpy(&old_epilog, &key->ps.part.epilog, sizeof(old_epilog)); + bool old_prefer_mono = key->ps.opt.prefer_mono; +#ifndef NDEBUG + struct si_shader_key_ps old_key; + memcpy(&old_key, &key->ps, sizeof(old_key)); +#endif key->ps.part.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; key->ps.part.epilog.alpha_to_coverage_via_mrtz = @@ -2414,6 +2430,14 @@ void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx) key->ps.opt.prefer_mono = 1; else key->ps.opt.prefer_mono = 0; + + /* Update shaders only if the key changed. */ + if (memcmp(&key->ps.part.epilog, &old_epilog, sizeof(old_epilog)) || + key->ps.opt.prefer_mono != old_prefer_mono) { + sctx->do_update_shaders = true; + } else { + assert(memcmp(&key->ps, &old_key, sizeof(old_key)) == 0); + } } void si_ps_key_update_rasterizer(struct si_context *sctx) -- GitLab From f9c4ac3477ba3beffbc1f12ba6f188ee332ba2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 03:41:17 -0500 Subject: [PATCH 05/38] radeonsi: update shaders for rasterizer state only if the shader key changed Check if any key bit changed before setting do_update_shaders. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.c | 18 ++++++++++++------ .../drivers/radeonsi/si_state_shaders.cpp | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index f4237bccbee48..636ee6cf88322 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1251,24 +1251,30 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) if (sctx->screen->dpbb_allowed && (old_rs->bottom_edge_rule != rs->bottom_edge_rule)) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->rasterizer_discard != rs->rasterizer_discard || + if (old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side || old_rs->multisample_enable != rs->multisample_enable || old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || old_rs->point_smooth != rs->point_smooth || old_rs->clamp_fragment_color != rs->clamp_fragment_color || - old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { + old_rs->force_persample_interp != rs->force_persample_interp) { si_ps_key_update_framebuffer_blend_rasterizer(sctx); si_ps_key_update_rasterizer(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); si_update_ps_inputs_read_or_disabled(sctx); - sctx->do_update_shaders = true; } + if (/* Used by si_get_vs_key_outputs in si_update_shaders: */ + old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points || + /* Used by si_ps_key_update_primtype_shader_rasterizer_framebuffer in si_update_shaders: */ + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->line_smooth != rs->line_smooth || + old_rs->point_smooth != rs->point_smooth) + sctx->do_update_shaders = true; + if (old_rs->line_smooth != rs->line_smooth || old_rs->poly_smooth != rs->poly_smooth || old_rs->point_smooth != rs->point_smooth || diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 20942729efa6a..7b193c9ad0872 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2449,9 +2449,18 @@ void si_ps_key_update_rasterizer(struct si_context *sctx) if (!sel) return; + bool old_color_two_side = key->ps.part.prolog.color_two_side; + bool old_flatshade_colors = key->ps.part.prolog.flatshade_colors; + bool old_clamp_color = key->ps.part.epilog.clamp_color; + key->ps.part.prolog.color_two_side = rs->two_side && sel->info.colors_read; key->ps.part.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; key->ps.part.epilog.clamp_color = rs->clamp_fragment_color; + + if (key->ps.part.prolog.color_two_side != old_color_two_side || + key->ps.part.prolog.flatshade_colors != old_flatshade_colors || + key->ps.part.epilog.clamp_color != old_clamp_color) + sctx->do_update_shaders = true; } void si_ps_key_update_dsa(struct si_context *sctx) @@ -2501,6 +2510,11 @@ void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *s if (!sel) return; + /* Old key data for comparison. */ + struct si_ps_prolog_bits old_prolog; + memcpy(&old_prolog, &key->ps.part.prolog, sizeof(old_prolog)); + bool old_interpolate_at_sample_force_center = key->ps.mono.interpolate_at_sample_force_center; + bool uses_persp_center = sel->info.uses_persp_center || (!rs->flatshade && sel->info.uses_persp_center_color); bool uses_persp_centroid = sel->info.uses_persp_centroid || @@ -2548,6 +2562,11 @@ void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *s key->ps.part.prolog.bc_optimize_for_linear = 0; key->ps.mono.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; } + + /* Update shaders only if the key changed. */ + if (memcmp(&key->ps.part.prolog, &old_prolog, sizeof(old_prolog)) || + key->ps.mono.interpolate_at_sample_force_center != old_interpolate_at_sample_force_center) + sctx->do_update_shaders = true; } /* Compute the key for the hw shader variant */ -- GitLab From 4ab5374ec361ce735e87d95550a72b9988c32f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 04:18:15 -0500 Subject: [PATCH 06/38] radeonsi: clean up setting poly/line/stipple shader key bits Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- .../drivers/radeonsi/si_state_shaders.cpp | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 7b193c9ad0872..23e9a126b5b6d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2475,16 +2475,20 @@ static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_co union si_shader_key *key = &sctx->shader.ps.key; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); - - key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->ps.mono.poly_line_smoothing = - ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - - key->ps.mono.point_smoothing = rs->point_smooth && - sctx->current_rast_prim == MESA_PRIM_POINTS; + if (sctx->current_rast_prim == MESA_PRIM_POINTS) { + key->ps.part.prolog.poly_stipple = 0; + key->ps.mono.poly_line_smoothing = 0; + key->ps.mono.point_smoothing = rs->point_smooth; + } else if (util_prim_is_lines(sctx->current_rast_prim)) { + key->ps.part.prolog.poly_stipple = 0; + key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1; + key->ps.mono.point_smoothing = 0; + } else { + /* Triangles. */ + key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable; + key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1; + key->ps.mono.point_smoothing = 0; + } } void si_ps_key_update_sample_shading(struct si_context *sctx) -- GitLab From e2b817b948a3ad15f163135459b388aaef4eeec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 04:56:56 -0500 Subject: [PATCH 07/38] radeonsi: rewrite how shader key bits dependent on current_rast_prim are updated Don't set do_update_shaders every time current_rast_prim changes, which can be EVERY DRAW. Instead, just update the shader key bits and set do_update_shaders only if any bits are different. When we bind a new rasterizer state, do the same. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state.c | 16 ++-- src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_shaders.cpp | 82 ++++++++++++------- 4 files changed, 63 insertions(+), 38 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 07492b9469ce3..901d7d1409833 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -2159,10 +2159,10 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim, si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); sctx->current_rast_prim = rast_prim; + si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); sctx->gs_out_prim = is_triangles ? V_028A6C_TRISTRIP : is_lines ? V_028A6C_LINESTRIP : is_rect ? V_028A6C_RECTLIST : V_028A6C_POINTLIST; - sctx->do_update_shaders = true; si_update_ngg_prim_state_sgpr(sctx, hw_vs, ngg); } } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 636ee6cf88322..7be00a921507e 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1265,14 +1265,15 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) si_update_ps_inputs_read_or_disabled(sctx); } - if (/* Used by si_get_vs_key_outputs in si_update_shaders: */ - old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points || - /* Used by si_ps_key_update_primtype_shader_rasterizer_framebuffer in si_update_shaders: */ - old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->poly_smooth != rs->poly_smooth || + if (old_rs->point_smooth != rs->point_smooth || old_rs->line_smooth != rs->line_smooth || - old_rs->point_smooth != rs->point_smooth) + old_rs->poly_smooth != rs->poly_smooth || + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points || + old_rs->poly_stipple_enable != rs->poly_stipple_enable) + si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); + + /* Used by si_get_vs_key_outputs in si_update_shaders: */ + if (old_rs->clip_plane_enable != rs->clip_plane_enable) sctx->do_update_shaders = true; if (old_rs->line_smooth != rs->line_smooth || @@ -3166,6 +3167,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_ps_key_update_framebuffer(sctx); si_ps_key_update_framebuffer_blend_rasterizer(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 1f4fda8c11184..6f2f146462f21 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -616,6 +616,7 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); void si_update_vrs_flat_shading(struct si_context *sctx); unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key); bool si_update_ngg(struct si_context *sctx); +void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx); void si_ps_key_update_framebuffer(struct si_context *sctx); void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx); void si_ps_key_update_rasterizer(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 23e9a126b5b6d..a7d74aa8c3386 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2238,6 +2238,47 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) } } +void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx) +{ + struct si_shader_ctx_state *hw_vs = si_get_vs(sctx); + if (!hw_vs->cso) + return; + + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + union si_shader_key *vs_key = &hw_vs->key; /* could also be TES or GS before PS */ + union si_shader_key *ps_key = &sctx->shader.ps.key; + + bool old_kill_pointsize = vs_key->ge.opt.kill_pointsize; + bool old_poly_stipple = ps_key->ps.part.prolog.poly_stipple; + bool old_poly_line_smoothing = ps_key->ps.mono.poly_line_smoothing; + bool old_point_smoothing = ps_key->ps.mono.point_smoothing; + + if (sctx->current_rast_prim == MESA_PRIM_POINTS) { + vs_key->ge.opt.kill_pointsize = 0; + ps_key->ps.part.prolog.poly_stipple = 0; + ps_key->ps.mono.poly_line_smoothing = 0; + ps_key->ps.mono.point_smoothing = rs->point_smooth; + } else if (util_prim_is_lines(sctx->current_rast_prim)) { + vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize; + ps_key->ps.part.prolog.poly_stipple = 0; + ps_key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1; + ps_key->ps.mono.point_smoothing = 0; + } else { + /* Triangles. */ + vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize && + !rs->polygon_mode_is_points; + ps_key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable; + ps_key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1; + ps_key->ps.mono.point_smoothing = 0; + } + + if (vs_key->ge.opt.kill_pointsize != old_kill_pointsize || + ps_key->ps.part.prolog.poly_stipple != old_poly_stipple || + ps_key->ps.mono.poly_line_smoothing != old_poly_line_smoothing || + ps_key->ps.mono.point_smoothing != old_point_smoothing) + sctx->do_update_shaders = true; +} + static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, union si_shader_key *key) { @@ -2247,15 +2288,12 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele uint64_t outputs_written = vs->info.outputs_written_before_ps; uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; + key->ge.opt.kill_layer = vs->info.writes_layer && + sctx->framebuffer.state.layers <= 1; key->ge.opt.kill_outputs = ~linked & outputs_written; key->ge.opt.ngg_culling = sctx->ngg_culling; key->ge.mono.u.vs_export_prim_id = vs->stage != MESA_SHADER_GEOMETRY && sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; - key->ge.opt.kill_pointsize = vs->info.writes_psize && - sctx->current_rast_prim != MESA_PRIM_POINTS && - !sctx->queued.named.rasterizer->polygon_mode_is_points; - key->ge.opt.kill_layer = vs->info.writes_layer && - sctx->framebuffer.state.layers <= 1; key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask && !sctx->streamout.enabled_mask; } @@ -2268,7 +2306,6 @@ static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_se key->ge.opt.remove_streamout = 0; key->ge.opt.ngg_culling = 0; key->ge.mono.u.vs_export_prim_id = 0; - key->ge.opt.kill_pointsize = 0; } void si_ps_key_update_framebuffer(struct si_context *sctx) @@ -2470,27 +2507,6 @@ void si_ps_key_update_dsa(struct si_context *sctx) key->ps.part.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; } -static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx) -{ - union si_shader_key *key = &sctx->shader.ps.key; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - if (sctx->current_rast_prim == MESA_PRIM_POINTS) { - key->ps.part.prolog.poly_stipple = 0; - key->ps.mono.poly_line_smoothing = 0; - key->ps.mono.point_smoothing = rs->point_smooth; - } else if (util_prim_is_lines(sctx->current_rast_prim)) { - key->ps.part.prolog.poly_stipple = 0; - key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1; - key->ps.mono.point_smoothing = 0; - } else { - /* Triangles. */ - key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable; - key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1; - key->ps.mono.point_smoothing = 0; - } -} - void si_ps_key_update_sample_shading(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; @@ -2616,7 +2632,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh } break; case MESA_SHADER_FRAGMENT: - si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); break; default: assert(0); @@ -3491,11 +3506,18 @@ static void si_update_last_vgt_stage_state(struct si_context *sctx, struct si_shader_selector *old_hw_vs, struct si_shader *old_hw_vs_variant) { + struct si_shader_ctx_state *hw_vs = si_get_vs(sctx); + si_update_vs_viewport_state(sctx); si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, - si_get_vs(sctx)->current); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, hw_vs->cso, hw_vs->current); si_update_rasterized_prim(sctx); + + /* Clear kill_pointsize because we only want it to be set in the last shader before PS. */ + sctx->shader.vs.key.ge.opt.kill_pointsize = 0; + sctx->shader.tes.key.ge.opt.kill_pointsize = 0; + sctx->shader.gs.key.ge.opt.kill_pointsize = 0; + si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); } static void si_bind_vs_shader(struct pipe_context *ctx, void *state) -- GitLab From 53aa36772a1ab7764cc02498ccfec4a515393231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 05:11:33 -0500 Subject: [PATCH 08/38] radeonsi: rewrite si_get_total_colormask as si_any_colorbuffer_written The result is only used as bool. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_blit.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 20 +++++++------------ .../drivers/radeonsi/si_state_shaders.cpp | 4 ++-- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 230ea7bd27c7c..81142d7556086 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -733,7 +733,7 @@ static void si_check_render_feedback(struct si_context *sctx) /* There is no render feedback if color writes are disabled. * (e.g. a pixel shader with image stores) */ - if (!si_get_total_colormask(sctx)) + if (!si_any_colorbuffer_written(sctx)) return; for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; ++i) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 901d7d1409833..5cf091451b657 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1972,24 +1972,18 @@ static inline unsigned si_get_ps_iter_samples(struct si_context *sctx) return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples); } -static inline unsigned si_get_total_colormask(struct si_context *sctx) +static inline bool si_any_colorbuffer_written(struct si_context *sctx) { if (sctx->queued.named.rasterizer->rasterizer_discard) - return 0; + return false; struct si_shader_selector *ps = sctx->shader.ps.cso; - if (!ps) - return 0; - - unsigned colormask = - sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask; - - if (!ps->info.color0_writes_all_cbufs) - colormask &= ps->info.colors_written_4bit; - else if (!ps->info.colors_written_4bit) - colormask = 0; /* color0 writes all cbufs, but it's not written */ + if (!ps || !ps->info.colors_written_4bit) + return false; - return colormask; + return (sctx->framebuffer.colorbuf_enabled_4bit & + sctx->queued.named.blend->cb_target_enabled_4bit & + (ps->info.color0_writes_all_cbufs ? ~0 : ps->info.colors_written_4bit)) != 0; } #define UTIL_ALL_PRIM_LINE_MODES \ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index a7d74aa8c3386..bc4af64780cd7 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2208,10 +2208,10 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS || sctx->queued.named.rasterizer->poly_stipple_enable || sctx->queued.named.rasterizer->point_smooth; - unsigned ps_colormask = si_get_total_colormask(sctx); ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || - (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); + (!ps_modifies_zs && !ps->info.base.writes_memory && + !si_any_colorbuffer_written(sctx)); } uint64_t ps_inputs_read_or_disabled; -- GitLab From bf7debee82c95f15f50f616c3c800eb6e19acff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 05:26:44 -0500 Subject: [PATCH 09/38] radeonsi: in bind_{blend,rs}_state, only call 1 update function per if Also don't use "key.ps.part.prolog.color_two_side" during updates because it would depend on the order the update functions are called, which is not a problem now, but it's a trap for the future. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.c | 29 ++++++++++++------- .../drivers/radeonsi/si_state_shaders.cpp | 2 +- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 7be00a921507e..7671a66fcc40f 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -772,10 +772,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) si_ps_key_update_framebuffer_blend_rasterizer(sctx); + + if (old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || + old_blend->alpha_to_coverage != blend->alpha_to_coverage) si_update_ps_inputs_read_or_disabled(sctx); - } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -1251,19 +1253,24 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) if (sctx->screen->dpbb_allowed && (old_rs->bottom_edge_rule != rs->bottom_edge_rule)) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - if (old_rs->rasterizer_discard != rs->rasterizer_discard || - old_rs->sprite_coord_enable != rs->sprite_coord_enable || - old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side || - old_rs->multisample_enable != rs->multisample_enable || - old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->point_smooth != rs->point_smooth || - old_rs->clamp_fragment_color != rs->clamp_fragment_color || - old_rs->force_persample_interp != rs->force_persample_interp) { + if (old_rs->multisample_enable != rs->multisample_enable) si_ps_key_update_framebuffer_blend_rasterizer(sctx); + + if (old_rs->two_side != rs->two_side || + old_rs->flatshade != rs->flatshade || + old_rs->clamp_fragment_color != rs->clamp_fragment_color) si_ps_key_update_rasterizer(sctx); + + if (old_rs->flatshade != rs->flatshade || + old_rs->force_persample_interp != rs->force_persample_interp || + old_rs->multisample_enable != rs->multisample_enable) si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + + if (old_rs->rasterizer_discard != rs->rasterizer_discard || + old_rs->two_side != rs->two_side || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->point_smooth != rs->point_smooth) si_update_ps_inputs_read_or_disabled(sctx); - } if (old_rs->point_smooth != rs->point_smooth || old_rs->line_smooth != rs->line_smooth || diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index bc4af64780cd7..e52f799fcc84d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2221,7 +2221,7 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) } else { uint64_t inputs_read = ps->info.inputs_read; - if (sctx->shader.ps.key.ps.part.prolog.color_two_side) { + if (ps->info.colors_read && sctx->queued.named.rasterizer->two_side) { if (inputs_read & BITFIELD64_BIT(SI_UNIQUE_SLOT_COL0)) inputs_read |= BITFIELD64_BIT(SI_UNIQUE_SLOT_BFC0); -- GitLab From 202285436027b5ba0519407b466240481660b545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 10 Nov 2023 19:26:20 -0500 Subject: [PATCH 10/38] radeonsi/gfx11: skip si_set_streamout_enable because it has no effect Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state_streamout.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 73a8213468174..b61cfb89a8a4a 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -359,6 +359,9 @@ static void si_emit_streamout_enable(struct si_context *sctx, unsigned index) static void si_set_streamout_enable(struct si_context *sctx, bool enable) { + if (sctx->gfx_level >= GFX11) + return; + bool old_strmout_en = si_get_strmout_en(sctx); unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask; @@ -368,9 +371,8 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable) sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) | (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12); - if (sctx->gfx_level < GFX11 && - ((old_strmout_en != si_get_strmout_en(sctx)) || - (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) + if ((old_strmout_en != si_get_strmout_en(sctx)) || + (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)) si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); } -- GitLab From 9e764596163a598a7b0b2082ff79d5cacd9e650c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 10 Nov 2023 19:27:37 -0500 Subject: [PATCH 11/38] radeonsi: execute streamout_begin after cache flushes so that si_emit_streamout_begin can assume that cache flushes have finished. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6f2f146462f21..9441459cf2c68 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -185,8 +185,7 @@ union si_state_atoms { /* This must be first. */ struct si_atom pm4_states[SI_NUM_STATES]; struct si_atom gfx_add_all_to_bo_list; - struct si_atom streamout_begin; - struct si_atom streamout_enable; /* must be after streamout_begin */ + struct si_atom streamout_enable; struct si_atom framebuffer; struct si_atom sample_locations; struct si_atom db_render_state; @@ -210,6 +209,7 @@ union si_state_atoms { struct si_atom vgt_pipeline_state; struct si_atom tess_io_layout; struct si_atom cache_flush; + struct si_atom streamout_begin; /* this must be done after cache_flush */ struct si_atom render_cond; /* this must be after cache_flush */ } s; struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)]; -- GitLab From 1afe6f33212d9ae8aeffd9f2d464df85eb5facda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 10 Nov 2023 19:16:39 -0500 Subject: [PATCH 12/38] radeonsi: don't print the preamble state separately for GALLIUM_DDEBUG because it's always printed as part of command buffers. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_debug.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index 5e62ec1b0f005..a6a8c8902a28d 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -392,12 +392,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) last_trace_id = map[0]; if (chunk->gfx_end != chunk->gfx_begin) { - if (chunk->gfx_begin == 0) { - if (ctx->cs_preamble_state) - ac_parse_ib(f, ctx->cs_preamble_state->pm4, ctx->cs_preamble_state->ndw, NULL, 0, - "IB2: Init config", ctx->gfx_level, ctx->family, AMD_IP_GFX, NULL, NULL); - } - if (scs->flushed) { ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin, &last_trace_id, map ? 1 : 0, "IB", ctx->gfx_level, ctx->family, AMD_IP_GFX, NULL, NULL); -- GitLab From 55d81214c9d800c0667337808a82143ebab17c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 07:06:29 -0500 Subject: [PATCH 13/38] radeonsi: replace gl_FrontFacing with a constant if one side is always culled Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- .../drivers/radeonsi/si_nir_lower_abi.c | 5 ++++ src/gallium/drivers/radeonsi/si_shader.c | 6 +++- src/gallium/drivers/radeonsi/si_shader.h | 3 ++ src/gallium/drivers/radeonsi/si_state.c | 15 ++++++++-- src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_shaders.cpp | 30 +++++++++++++++---- 6 files changed, 50 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index fdd29fdbbc5dc..e57d9590be691 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -587,6 +587,11 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s case nir_intrinsic_load_alpha_reference_amd: replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference); break; + case nir_intrinsic_load_front_face: + if (!key->ps.opt.force_front_face_input) + return false; + replacement = nir_imm_bool(b, key->ps.opt.force_front_face_input == 1); + break; case nir_intrinsic_load_barycentric_optimize_amd: { nir_def *prim_mask = ac_nir_load_arg(b, &args->ac, args->ac.prim_mask); /* enabled when bit 31 is set */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 1f24c78a57e37..89c4c7a93f127 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2645,6 +2645,10 @@ si_set_spi_ps_input_config(struct si_shader *shader) const struct si_shader_info *info = &sel->info; const union si_shader_key *key = &shader->key; + /* TODO: This should be determined from the final NIR instead of the input NIR, + * otherwise LLVM will have a performance advantage here because it determines + * VGPR inputs for each shader variant after LLVM optimizations. + */ shader->config.spi_ps_input_ena = S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) | S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) | @@ -2652,7 +2656,7 @@ si_set_spi_ps_input_config(struct si_shader *shader) S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) | S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) | S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) | - S_0286CC_FRONT_FACE_ENA(info->uses_frontface) | + S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) | S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) | S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index edf054999be03..f7dcc961c7602 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -797,6 +797,9 @@ struct si_shader_key_ps { unsigned prefer_mono : 1; unsigned inline_uniforms:1; + /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */ + int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */ + /* This must be kept last to limit the number of variants * depending only on the uniform values. */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 7671a66fcc40f..2c633cf55e1e4 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1061,6 +1061,14 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast } } + /* Force gl_FrontFacing to true or false if the other face is culled. */ + if (util_bitcount(state->cull_face) == 1) { + if (state->cull_face & PIPE_FACE_FRONT) + rs->force_front_face_input = -1; + else + rs->force_front_face_input = 1; + } + unsigned spi_interp_control_0 = S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | @@ -1256,8 +1264,7 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) if (old_rs->multisample_enable != rs->multisample_enable) si_ps_key_update_framebuffer_blend_rasterizer(sctx); - if (old_rs->two_side != rs->two_side || - old_rs->flatshade != rs->flatshade || + if (old_rs->flatshade != rs->flatshade || old_rs->clamp_fragment_color != rs->clamp_fragment_color) si_ps_key_update_rasterizer(sctx); @@ -1276,7 +1283,9 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->line_smooth != rs->line_smooth || old_rs->poly_smooth != rs->poly_smooth || old_rs->polygon_mode_is_points != rs->polygon_mode_is_points || - old_rs->poly_stipple_enable != rs->poly_stipple_enable) + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->two_side != rs->two_side || + old_rs->force_front_face_input != rs->force_front_face_input) si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); /* Used by si_get_vs_key_outputs in si_update_shaders: */ diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 9441459cf2c68..8b8f5dcaf1184 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -85,6 +85,7 @@ struct si_state_rasterizer { unsigned polygon_mode_is_points : 1; unsigned perpendicular_end_caps : 1; unsigned bottom_edge_rule : 1; + int force_front_face_input : 2; }; struct si_dsa_stencil_ref_part { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index e52f799fcc84d..ef7672368eb9a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1872,6 +1872,14 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) assert(!shader->key.ps.part.prolog.force_linear_sample_interp || (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + /* color_two_side always enables FRONT_FACE. Since st/mesa disables two-side colors if the back + * face is culled, the only case when both color_two_side and force_front_face_input can be set + * is when the front face is culled (which means force_front_face_input == -1). + */ + assert(!shader->key.ps.opt.force_front_face_input || !G_0286CC_FRONT_FACE_ENA(input_ena) || + (shader->key.ps.part.prolog.color_two_side && + shader->key.ps.opt.force_front_face_input == -1)); + /* Validate cases when the optimizations are off (read as implications). */ assert(shader->key.ps.part.prolog.bc_optimize_for_persp || !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena)); @@ -2241,7 +2249,9 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx) { struct si_shader_ctx_state *hw_vs = si_get_vs(sctx); - if (!hw_vs->cso) + struct si_shader_selector *ps = sctx->shader.ps.cso; + + if (!hw_vs->cso || !ps) return; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; @@ -2249,33 +2259,44 @@ void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx) union si_shader_key *ps_key = &sctx->shader.ps.key; bool old_kill_pointsize = vs_key->ge.opt.kill_pointsize; + bool old_color_two_side = ps_key->ps.part.prolog.color_two_side; bool old_poly_stipple = ps_key->ps.part.prolog.poly_stipple; bool old_poly_line_smoothing = ps_key->ps.mono.poly_line_smoothing; bool old_point_smoothing = ps_key->ps.mono.point_smoothing; + int old_force_front_face_input = ps_key->ps.opt.force_front_face_input; if (sctx->current_rast_prim == MESA_PRIM_POINTS) { vs_key->ge.opt.kill_pointsize = 0; + ps_key->ps.part.prolog.color_two_side = 0; ps_key->ps.part.prolog.poly_stipple = 0; ps_key->ps.mono.poly_line_smoothing = 0; ps_key->ps.mono.point_smoothing = rs->point_smooth; + ps_key->ps.opt.force_front_face_input = ps->info.uses_frontface; } else if (util_prim_is_lines(sctx->current_rast_prim)) { vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize; + ps_key->ps.part.prolog.color_two_side = 0; ps_key->ps.part.prolog.poly_stipple = 0; ps_key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1; ps_key->ps.mono.point_smoothing = 0; + ps_key->ps.opt.force_front_face_input = ps->info.uses_frontface; } else { /* Triangles. */ vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize && !rs->polygon_mode_is_points; + ps_key->ps.part.prolog.color_two_side = rs->two_side && ps->info.colors_read; ps_key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable; ps_key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1; ps_key->ps.mono.point_smoothing = 0; + ps_key->ps.opt.force_front_face_input = rs->force_front_face_input && + ps->info.uses_frontface; } if (vs_key->ge.opt.kill_pointsize != old_kill_pointsize || + ps_key->ps.part.prolog.color_two_side != old_color_two_side || ps_key->ps.part.prolog.poly_stipple != old_poly_stipple || ps_key->ps.mono.poly_line_smoothing != old_poly_line_smoothing || - ps_key->ps.mono.point_smoothing != old_point_smoothing) + ps_key->ps.mono.point_smoothing != old_point_smoothing || + ps_key->ps.opt.force_front_face_input != old_force_front_face_input) sctx->do_update_shaders = true; } @@ -2486,16 +2507,13 @@ void si_ps_key_update_rasterizer(struct si_context *sctx) if (!sel) return; - bool old_color_two_side = key->ps.part.prolog.color_two_side; bool old_flatshade_colors = key->ps.part.prolog.flatshade_colors; bool old_clamp_color = key->ps.part.epilog.clamp_color; - key->ps.part.prolog.color_two_side = rs->two_side && sel->info.colors_read; key->ps.part.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; key->ps.part.epilog.clamp_color = rs->clamp_fragment_color; - if (key->ps.part.prolog.color_two_side != old_color_two_side || - key->ps.part.prolog.flatshade_colors != old_flatshade_colors || + if (key->ps.part.prolog.flatshade_colors != old_flatshade_colors || key->ps.part.epilog.clamp_color != old_clamp_color) sctx->do_update_shaders = true; } -- GitLab From 7fa0ee15df200d4a0114804e43187d26a717eb5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 09:51:56 -0500 Subject: [PATCH 14/38] radeonsi: set OOB_SELECT for VBOs in si_create_vertex_elements we can do this since the stride is in the CSO now Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.c | 11 +++++++++-- src/gallium/drivers/radeonsi/si_state_draw.cpp | 12 +----------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 2c633cf55e1e4..9fc1ade843620 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5328,7 +5328,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, (sscreen->info.gfx_level == GFX6 || sscreen->info.gfx_level >= GFX10); bool opencode = sscreen->options.vs_fetch_always_opencode; - if (check_alignment && ((elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0 || elements[i].src_stride & 3)) + if (check_alignment && ((elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0 || + elements[i].src_stride & 3)) opencode = true; if (always_fix || check_alignment || opencode) @@ -5357,7 +5358,13 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128; assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format); v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11); + S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) | + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + S_008F0C_OOB_SELECT(v->src_stride[i] ? V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); } else { unsigned data_format, num_format; data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 95e1478c5cc05..f522f330dc2af 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1643,20 +1643,10 @@ static void ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems } assert(num_records >= 0 && num_records <= UINT_MAX); - uint32_t rsrc_word3 = velems->rsrc_word3[index]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - if (GFX_VERSION >= GFX10) - rsrc_word3 |= S_008F0C_OOB_SELECT(stride ? V_008F0C_OOB_SELECT_STRUCTURED - : V_008F0C_OOB_SELECT_RAW); - desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); desc[2] = num_records; - desc[3] = rsrc_word3; + desc[3] = velems->rsrc_word3[index]; } #if GFX_VER == 6 /* declare this function only once because it supports all chips. */ -- GitLab From 48ce5fbaa19e2afad97ef98f66b17326fef19c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Mon, 20 Nov 2023 10:10:24 -0500 Subject: [PATCH 15/38] radeonsi: group most vertex element fields fix_fetch and vertex_buffer_index can't be grouped because we do memcmp on those arrays Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state.c | 34 ++++++++++--------- src/gallium/drivers/radeonsi/si_state.h | 16 +++++---- .../drivers/radeonsi/si_state_draw.cpp | 8 ++--- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 9fc1ade843620..77e4625dca1a2 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5221,10 +5221,10 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; - v->format_size[i] = desc->block.bits / 8; - v->src_offset[i] = elements[i].src_offset; + v->elem[i].format_size = desc->block.bits / 8; + v->elem[i].src_offset = elements[i].src_offset; + v->elem[i].stride = elements[i].src_stride; v->vertex_buffer_index[i] = vbo_index; - v->src_stride[i] = elements[i].src_stride; bool always_fix = false; union si_vs_fix_fetch fix_fetch; @@ -5348,28 +5348,30 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->vb_alignment_check_mask |= 1 << vbo_index; } - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + v->elem[i].rsrc_word3 = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); if (sscreen->info.gfx_level >= GFX10) { const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&sscreen->info)[elements[i].src_format]; ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128; assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format); - v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) | - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - S_008F0C_OOB_SELECT(v->src_stride[i] ? V_008F0C_OOB_SELECT_STRUCTURED - : V_008F0C_OOB_SELECT_RAW); + v->elem[i].rsrc_word3 |= + S_008F0C_FORMAT(fmt->img_format) | + S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) | + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + S_008F0C_OOB_SELECT(v->elem[i].stride ? V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); } else { unsigned data_format, num_format; data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); - v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + v->elem[i].rsrc_word3 |= S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); } } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 8b8f5dcaf1184..0cee83c1632fd 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -126,12 +126,6 @@ struct si_stencil_ref { struct si_vertex_elements { struct si_resource *instance_divisor_factor_buffer; - uint32_t rsrc_word3[SI_MAX_ATTRIBS]; - uint16_t src_offset[SI_MAX_ATTRIBS]; - uint16_t src_stride[SI_MAX_ATTRIBS]; - uint8_t fix_fetch[SI_MAX_ATTRIBS]; - uint8_t format_size[SI_MAX_ATTRIBS]; - uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; /* Bitmask of elements that always need a fixup to be applied. */ uint16_t fix_fetch_always; @@ -158,6 +152,16 @@ struct si_vertex_elements { uint16_t vb_desc_list_alloc_size; uint16_t instance_divisor_is_one; /* bitmask of inputs */ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ + + uint8_t fix_fetch[SI_MAX_ATTRIBS]; + uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; + + struct { + uint32_t rsrc_word3; + uint16_t src_offset; + uint16_t stride; + uint8_t format_size; + } elem[SI_MAX_ATTRIBS]; }; union si_state { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index f522f330dc2af..738968a878d1c 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1626,7 +1626,7 @@ static void ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems uint32_t *desc) /* where to upload descriptors */ { struct si_resource *buf = si_resource(vb->buffer.resource); - int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[index]; + int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->elem[index].src_offset; if (!buf || offset >= buf->b.b.width0) { memset(desc, 0, 16); @@ -1634,19 +1634,19 @@ static void ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems } uint64_t va = buf->gpu_address + offset; - unsigned stride = velems->src_stride[index]; + unsigned stride = velems->elem[index].stride; int64_t num_records = (int64_t)buf->b.b.width0 - offset; if (GFX_VERSION != GFX8 && stride) { /* Round up by rounding down and adding 1 */ - num_records = (num_records - velems->format_size[index]) / stride + 1; + num_records = (num_records - velems->elem[index].format_size) / stride + 1; } assert(num_records >= 0 && num_records <= UINT_MAX); desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); desc[2] = num_records; - desc[3] = velems->rsrc_word3[index]; + desc[3] = velems->elem[index].rsrc_word3; } #if GFX_VER == 6 /* declare this function only once because it supports all chips. */ -- GitLab From 65b3b0b355de7644a3efd70143c66e4a6189024e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 24 Nov 2023 07:40:54 -0500 Subject: [PATCH 16/38] radeonsi/gfx11: prefer Wave64 for PS without inputs for better VALU perf Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index ef7672368eb9a..0ae0e4b6e7cbd 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -86,10 +86,12 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha if (dbg_wave_size) return dbg_wave_size; - /* Pixel shaders without interp instructions don't suffer from reduced interpolation + /* Gfx10: Pixel shaders without interp instructions don't suffer from reduced interpolation * performance in Wave32, so use Wave32. This helps Piano and Voloplosion. + * + * Gfx11: Prefer Wave64 to take advantage of doubled VALU performance. */ - if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs) + if (sscreen->info.gfx_level < GFX11 && stage == MESA_SHADER_FRAGMENT && !info->num_inputs) return 32; /* There are a few very rare cases where VS is better with Wave32, and there are no known @@ -111,6 +113,8 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave * while the other half is idling but occupying VGPRs, preventing other waves from launching. * Wave32 eliminates the idling half to allow the next wave to start. + * + * Gfx11: Wave32 continues to be faster with divergent loops despite worse VALU performance. */ if (!merged_shader && info && info->has_divergent_loop) return 32; -- GitLab From f85488824e6abaf3a8ae36f8ca5016c05ba8a978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 24 Nov 2023 17:49:26 -0500 Subject: [PATCH 17/38] radeonsi/gfx11: disable the shader profile for Medical that forces Wave64 GFX10 should keep using it, but not GFX11. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.h | 2 +- src/gallium/drivers/radeonsi/si_shader_info.c | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index f7dcc961c7602..1df5f01b6410f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -302,7 +302,7 @@ enum #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff) #define SI_PROFILE_WAVE32 (1 << 0) -#define SI_PROFILE_WAVE64 (1 << 1) +#define SI_PROFILE_GFX10_WAVE64 (1 << 1) /* bit gap */ #define SI_PROFILE_VS_NO_BINNING (1 << 3) #define SI_PROFILE_PS_NO_BINNING (1 << 4) diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index de2fc335f80a1..4141968d83e08 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -33,7 +33,7 @@ static struct si_shader_profile profiles[] = * probably due to interpolation performance. */ {0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8}, - SI_PROFILE_WAVE64, + SI_PROFILE_GFX10_WAVE64, }, { /* Viewperf/Creo */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 0ae0e4b6e7cbd..f8638eda48081 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -67,7 +67,8 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha if (info && info->options & SI_PROFILE_WAVE32) profile_wave_size = 32; - if (info && info->options & SI_PROFILE_WAVE64) { + if (info && info->options & SI_PROFILE_GFX10_WAVE64 && + (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3)) { assert(!profile_wave_size); profile_wave_size = 64; } -- GitLab From 716b521515bc5fcdd7acfcb23202032e43bffae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 24 Nov 2023 17:49:26 -0500 Subject: [PATCH 18/38] radeonsi/gfx11: disable the shader profile for Medical that disables binning GFX11 performs better with the default behavior. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.h | 2 +- src/gallium/drivers/radeonsi/si_shader_info.c | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 1df5f01b6410f..3c4f90e2dd187 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -305,7 +305,7 @@ enum #define SI_PROFILE_GFX10_WAVE64 (1 << 1) /* bit gap */ #define SI_PROFILE_VS_NO_BINNING (1 << 3) -#define SI_PROFILE_PS_NO_BINNING (1 << 4) +#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING (1 << 4) #define SI_PROFILE_CLAMP_DIV_BY_ZERO (1 << 5) enum si_shader_dump_type { diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index 4141968d83e08..b6af2badffc5d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -26,7 +26,7 @@ static struct si_shader_profile profiles[] = { /* Viewperf/Medical */ {0x4dce4331, 0x38f778d5, 0x1b75a717, 0x3e454fb9, 0xeb1527f0}, - SI_PROFILE_PS_NO_BINNING, + SI_PROFILE_GFX9_GFX10_PS_NO_BINNING, }, { /* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index f8638eda48081..ebb846d1e39d0 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3773,7 +3773,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_update_vrs_flat_shading(sctx); if (sctx->screen->dpbb_allowed) { - bool force_off = sel && sel->info.options & SI_PROFILE_PS_NO_BINNING; + bool force_off = sel && sel->info.options & SI_PROFILE_GFX9_GFX10_PS_NO_BINNING && + (sctx->gfx_level >= GFX9 && sctx->gfx_level <= GFX10_3); if (force_off != sctx->dpbb_force_off_profile_ps) { sctx->dpbb_force_off_profile_ps = force_off; -- GitLab From 257f07f499f8b7794f4e4ca49fda98ccdb10a396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 24 Nov 2023 18:18:17 -0500 Subject: [PATCH 19/38] radeonsi: clean up how debug flags and shader profiles determine the wave size - remove DBG_W32_PS_DISCARD - just return the wave size instead of setting local variables dbg_wave_size and profile_wave_size Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_pipe.c | 1 - src/gallium/drivers/radeonsi/si_pipe.h | 1 - .../drivers/radeonsi/si_state_shaders.cpp | 36 +++++-------------- 3 files changed, 8 insertions(+), 30 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 5a2111d66985e..c00fb1e923bfc 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -56,7 +56,6 @@ static const struct debug_named_value radeonsi_debug_options[] = { /* Shader compiler options the shader cache should be aware of: */ {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."}, {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."}, - {"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."}, {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."}, {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."}, {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."}, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 5cf091451b657..20fdcf180eeec 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -196,7 +196,6 @@ enum /* Shader compiler options the shader cache should be aware of: */ DBG_W32_GE, DBG_W32_PS, - DBG_W32_PS_DISCARD, DBG_W32_CS, DBG_W64_GE, DBG_W64_PS, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index ebb846d1e39d0..18646a5f00cf3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -48,44 +48,24 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha info->base.workgroup_size[2]) % 64 != 0) return 32; - /* Debug flags. */ - unsigned dbg_wave_size = 0; + /* AMD_DEBUG wave flags override everything else. */ if (sscreen->debug_flags & (stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) : - stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE))) - dbg_wave_size = 32; + stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) : DBG(W32_GE))) + return 32; if (sscreen->debug_flags & (stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) : - stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) { - assert(!dbg_wave_size); - dbg_wave_size = 64; - } + stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) + return 64; /* Shader profiles. */ - unsigned profile_wave_size = 0; if (info && info->options & SI_PROFILE_WAVE32) - profile_wave_size = 32; + return 32; if (info && info->options & SI_PROFILE_GFX10_WAVE64 && - (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3)) { - assert(!profile_wave_size); - profile_wave_size = 64; - } - - if (profile_wave_size) { - /* Only debug flags override shader profiles. */ - if (dbg_wave_size) - return dbg_wave_size; - - return profile_wave_size; - } - - /* Debug flags except w32psdiscard don't override the discard bug workaround, - * but they override everything else. - */ - if (dbg_wave_size) - return dbg_wave_size; + (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3)) + return 64; /* Gfx10: Pixel shaders without interp instructions don't suffer from reduced interpolation * performance in Wave32, so use Wave32. This helps Piano and Voloplosion. -- GitLab From c77bcf00a36c0c6f2bc42a052de5152589f0a372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 24 Nov 2023 19:26:14 -0500 Subject: [PATCH 20/38] radeonsi/gfx11: prefer Wave64 for VS/TCS/TES/GS because it's slightly faster Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 18646a5f00cf3..95ee06b33130d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -75,12 +75,16 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha if (sscreen->info.gfx_level < GFX11 && stage == MESA_SHADER_FRAGMENT && !info->num_inputs) return 32; - /* There are a few very rare cases where VS is better with Wave32, and there are no known - * cases where Wave64 is better. + /* Gfx10: There are a few very rare cases where VS is better with Wave32, and there are no + * known cases where Wave64 is better. + * * Wave32 is disabled for GFX10 when culling is active as a workaround for #6457. I don't * know why this helps. + * + * Gfx11: Prefer Wave64 because it's slightly better than Wave32. */ if (stage <= MESA_SHADER_GEOMETRY && + (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3) && !(sscreen->info.gfx_level == GFX10 && shader && shader->key.ge.opt.ngg_culling)) return 32; -- GitLab From e1e35112c859bff2b5ba02397ac548af5b8317d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 07:41:24 -0500 Subject: [PATCH 21/38] winsys/amdgpu: bypass GL2 for command buffers Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 485e14af6334f..7449a97ca3871 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -871,9 +871,13 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU. * The speed of writing to GTT WC is somewhere between no difference and very slow, while * VRAM being very slow a lot more often. + * + * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency + * and doesn't have to wait for cached GL2 requests to be processed. */ enum radeon_bo_domain domain = RADEON_DOMAIN_GTT; - unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING; + unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_GL2_BYPASS; if (cs->ip_type == AMD_IP_GFX || cs->ip_type == AMD_IP_COMPUTE || -- GitLab From 4f2b794e98c63b5c6171f00cdc4c51141d8181a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 10:49:31 -0500 Subject: [PATCH 22/38] radeonsi: track NIR progress properly for optimizations in si_get_nir_shader Just a small code size decrease in 12 shaders. TOTALS FROM AFFECTED SHADERS (12/58918) SGPRS: 600.00 -> 600.00 (0.00 %) VGPRS: 528.00 -> 520.00 (-1.52 %) Spilled SGPRs: 0.00 -> 0.00 (0.00 %) Spilled VGPRs: 0.00 -> 0.00 (0.00 %) Private memory VGPRs: 0.00 -> 0.00 (0.00 %) Scratch size: 0.00 -> 0.00 (0.00 %) dwords per thread Code Size: 39772.00 -> 39688.00 (-0.21 %) bytes Max Waves: 180.00 -> 180.00 (0.00 %) Outputs: 0.00 -> 0.00 (0.00 %) Patch Outputs: 0.00 -> 0.00 (0.00 %) Reviewed-by: Qiang Yu <yuq825@gmail.com> Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 79 +++++++++++++----------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 89c4c7a93f127..9f308ac28444b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2050,8 +2050,9 @@ static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void return true; } -static void si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shader) +static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shader) { + bool progress = false; nir_function_impl *impl = nir_shader_get_entrypoint(nir); nir_builder builder = nir_builder_at(nir_before_impl(impl)); @@ -2123,12 +2124,14 @@ static void si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shade nir_def *is_front_face = nir_load_front_face(b, 1); colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color); } + + progress = true; } /* lower nir_load_color0/1 to use the color value. */ - nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic, - nir_metadata_block_index | nir_metadata_dominance, - colors); + return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic, + nir_metadata_block_index | nir_metadata_dominance, + colors) || progress; } static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args) @@ -2181,6 +2184,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, } bool progress = false; + bool late_opts = false; const char *original_name = NULL; if (unlikely(should_print_nir(nir))) { @@ -2197,12 +2201,11 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, if (sel->stage <= MESA_SHADER_GEOMETRY) NIR_PASS(progress, nir, si_nir_kill_outputs, key); - NIR_PASS( - _, nir, ac_nir_lower_tex, - &(ac_nir_lower_tex_options){ - .gfx_level = sel->screen->info.gfx_level, - .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord, - }); + NIR_PASS(progress, nir, ac_nir_lower_tex, + &(ac_nir_lower_tex_options){ + .gfx_level = sel->screen->info.gfx_level, + .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord, + }); if (nir->info.uses_resource_info_query) NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level); @@ -2253,10 +2256,8 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, * TODO: The driver uses a linear search to find a shader variant. This * can be really slow if we get too many variants due to uniform inlining. */ - NIR_PASS_V(nir, nir_inline_uniforms, - nir->info.num_inlinable_uniforms, - inlined_uniform_values, - nir->info.inlinable_uniform_dw_offsets); + NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms, + inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets); progress = true; } @@ -2292,8 +2293,11 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, if (is_last_vgt_stage || is_legacy_gs) NIR_PASS(progress, nir, si_nir_clamp_vertex_color); - if (progress) + if (progress) { si_nir_opts(sel->screen, nir, true); + late_opts = true; + progress = false; + } /* Lower large variables that are always constant with load_constant intrinsics, which * get turned into PC-relative loads from a data section next to the shader. @@ -2304,19 +2308,18 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, * The pass crashes if there are dead temps of lowered IO interface types, so remove * them first. */ - bool progress2 = false; NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL); - NIR_PASS(progress2, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16); + NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16); /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so * this should be done after that. */ - progress2 |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level); + progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level); if (sel->stage == MESA_SHADER_VERTEX) - progress2 |= si_nir_lower_vs_inputs(nir, shader, args); + NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args); - bool opt_offsets = si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs); + progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs); if (is_last_vgt_stage) { /* Assign param export indices. */ @@ -2328,7 +2331,6 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, if (key->ge.as_ngg) { /* Lower last VGT NGG shader stage. */ si_lower_ngg(shader, nir); - opt_offsets = true; } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) { /* Lower last VGT none-NGG VS/TES shader stage. */ unsigned clip_cull_mask = @@ -2346,12 +2348,14 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, key->ge.opt.kill_layer, sel->screen->options.vrs2x2); } + progress = true; } else if (is_legacy_gs) { NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info); + progress = true; } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) { /* two-side color selection and interpolation */ if (sel->info.colors_read) - NIR_PASS_V(nir, si_nir_lower_ps_color_input, shader); + NIR_PASS(progress, nir, si_nir_lower_ps_color_input, shader); ac_nir_lower_ps_options options = { .gfx_level = sel->screen->info.gfx_level, @@ -2383,33 +2387,34 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, if (key->ps.part.prolog.poly_stipple) NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args); - progress2 = true; + progress = true; } - NIR_PASS(progress2, nir, nir_opt_idiv_const, 8); - NIR_PASS(progress2, nir, nir_lower_idiv, + NIR_PASS(progress, nir, nir_opt_idiv_const, 8); + NIR_PASS(progress, nir, nir_lower_idiv, &(nir_lower_idiv_options){ .allow_fp16 = sel->screen->info.gfx_level >= GFX9, }); - NIR_PASS(progress2, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level, + NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level, si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level), &args->ac); - NIR_PASS(progress2, nir, si_nir_lower_abi, shader, args); + NIR_PASS(progress, nir, si_nir_lower_abi, shader, args); - if (progress2 || opt_offsets) + if (progress) { si_nir_opts(sel->screen, nir, false); - - if (opt_offsets) { - static const nir_opt_offsets_options offset_options = { - .uniform_max = 0, - .buffer_max = ~0, - .shared_max = ~0, - }; - NIR_PASS_V(nir, nir_opt_offsets, &offset_options); + progress = false; + late_opts = true; } - if (progress || progress2 || opt_offsets) + static const nir_opt_offsets_options offset_options = { + .uniform_max = 0, + .buffer_max = ~0, + .shared_max = ~0, + }; + NIR_PASS_V(nir, nir_opt_offsets, &offset_options); + + if (late_opts) si_nir_late_opts(nir); /* aco only accept scalar const, must be done after si_nir_late_opts() -- GitLab From 00dd4d400ea283a9ebfe3fb674a48cacf5b0c484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 11:01:42 -0500 Subject: [PATCH 23/38] ac,radeonsi: rename pos_inputs -> fragcoord_components Reviewed-by: Qiang Yu <yuq825@gmail.com> Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/amd/common/ac_shader_util.c | 16 ++++++++-------- src/amd/common/ac_shader_util.h | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 6 +++--- src/gallium/drivers/radeonsi/si_shader.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 6013eb5811f05..24e530988434c 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -620,10 +620,10 @@ enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampl } unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, - uint8_t *num_pos_inputs) + uint8_t *num_fragcoord_components) { unsigned num_input_vgprs = 0; - unsigned pos_inputs = 0; + unsigned fragcoord_components = 0; if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) num_input_vgprs += 2; @@ -643,19 +643,19 @@ unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, num_input_vgprs += 1; if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) { num_input_vgprs += 1; - pos_inputs++; + fragcoord_components++; } if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) { num_input_vgprs += 1; - pos_inputs++; + fragcoord_components++; } if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) { num_input_vgprs += 1; - pos_inputs++; + fragcoord_components++; } if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) { num_input_vgprs += 1; - pos_inputs++; + fragcoord_components++; } if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) num_input_vgprs += 1; @@ -666,8 +666,8 @@ unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) num_input_vgprs += 1; - if (num_pos_inputs) - *num_pos_inputs = pos_inputs; + if (num_fragcoord_components) + *num_fragcoord_components = fragcoord_components; return num_input_vgprs; } diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index e97cc0cddb4c7..e870262a10337 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -170,7 +170,7 @@ enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampl bool is_array); unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, - uint8_t *num_pos_inputs); + uint8_t *num_fragcoord_components); uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 9f308ac28444b..e920c8b355a50 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2903,7 +2903,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi /* Calculate the number of fragment input VGPRs. */ if (sel->stage == MESA_SHADER_FRAGMENT) { shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt( - &shader->config, &shader->info.num_ps_pos_inputs); + &shader->config, &shader->info.num_fragcoord_components); } si_calculate_max_simd_waves(shader); @@ -3088,7 +3088,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke key->ps_prolog.states.force_persp_center_interp || key->ps_prolog.states.force_linear_center_interp || key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); - key->ps_prolog.num_pos_inputs = shader->info.num_ps_pos_inputs; + key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components; if (shader->key.ps.part.prolog.poly_stipple) shader->info.uses_vmem_load_other = true; @@ -3673,7 +3673,7 @@ void si_get_ps_prolog_args(struct si_shader_args *args, /* skip LINE_STIPPLE_TEX */ /* POS_X|Y|Z|W_FLOAT */ - for (unsigned i = 0; i < key->ps_prolog.num_pos_inputs; i++) + for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++) ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 3c4f90e2dd187..047ffc5c0a685 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -679,7 +679,7 @@ union si_shader_part_key { /* Color interpolation and two-side color selection. */ unsigned colors_read : 8; /* color input components read */ unsigned num_interp_inputs : 5; /* BCOLOR is at this location */ - unsigned num_pos_inputs : 3; + unsigned num_fragcoord_components : 3; unsigned wqm : 1; char color_attr_index[2]; signed char color_interp_vgpr_index[2]; /* -1 == constant */ @@ -823,7 +823,7 @@ struct si_shader_binary_info { uint8_t num_input_vgprs; bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */ bool uses_vmem_sampler_or_bvh; - uint8_t num_ps_pos_inputs; + uint8_t num_fragcoord_components; bool uses_instanceid; uint8_t nr_pos_exports; uint8_t nr_param_exports; -- GitLab From 7d2faa88abb3be9935a2af1515199dc5aa96f61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 13:29:49 -0500 Subject: [PATCH 24/38] nir,radeonsi: add FLAGS into load_vector_arg_amd to record color input usage This will be needed for gathering color usage from lowered PS. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/compiler/nir/nir_intrinsics.py | 8 ++++++-- src/gallium/drivers/radeonsi/si_nir_lower_abi.c | 12 +++++++++--- src/gallium/drivers/radeonsi/si_shader.h | 4 ++++ src/gallium/drivers/radeonsi/si_shader_info.c | 8 ++++++++ 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 380c153f0ddf8..8ac4ee3f5e604 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1589,8 +1589,12 @@ intrinsic("store_hit_attrib_amd", src_comp=[1], indices=[BASE]) # Load forced VRS rates. intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER]) -intrinsic("load_scalar_arg_amd", dest_comp=0, bit_sizes=[32], indices=[BASE, ARG_UPPER_BOUND_U32_AMD], flags=[CAN_ELIMINATE, CAN_REORDER]) -intrinsic("load_vector_arg_amd", dest_comp=0, bit_sizes=[32], indices=[BASE, ARG_UPPER_BOUND_U32_AMD], flags=[CAN_ELIMINATE, CAN_REORDER]) +intrinsic("load_scalar_arg_amd", dest_comp=0, bit_sizes=[32], + indices=[BASE, ARG_UPPER_BOUND_U32_AMD], + flags=[CAN_ELIMINATE, CAN_REORDER]) +intrinsic("load_vector_arg_amd", dest_comp=0, bit_sizes=[32], + indices=[BASE, ARG_UPPER_BOUND_U32_AMD, FLAGS], + flags=[CAN_ELIMINATE, CAN_REORDER]) store("scalar_arg_amd", [], [BASE]) store("vector_arg_amd", [], [BASE]) diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index e57d9590be691..c96572bd5a932 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -616,9 +616,15 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s nir_def *color[4]; for (int i = 0; i < 4; i++) { - color[i] = colors_read & BITFIELD_BIT(start + i) ? - ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++) : - nir_undef(b, 1, 32); + if (colors_read & BITFIELD_BIT(start + i)) { + color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++); + + nir_intrinsic_set_flags(nir_instr_as_intrinsic(color[i]->parent_instr), + SI_VECTOR_ARG_IS_COLOR | + SI_VECTOR_ARG_COLOR_COMPONENT(start + i)); + } else { + color[i] = nir_undef(b, 1, 32); + } } replacement = nir_vec(b, color, 4); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 047ffc5c0a685..64f478b06d6a4 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -137,6 +137,10 @@ struct nir_lower_subgroups_options; /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */ #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 +#define SI_VECTOR_ARG_IS_COLOR BITFIELD_BIT(0) +#define SI_VECTOR_ARG_COLOR_COMPONENT(x) (((x) & 0x7) << 1) +#define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7) + /* SGPR user data indices */ enum { diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index b6af2badffc5d..66fd766be973c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -526,6 +526,14 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info } break; } + case nir_intrinsic_load_vector_arg_amd: + /* Non-monolithic lowered PS can have this. We need to record color usage. */ + if (nir_intrinsic_flags(intr) & SI_VECTOR_ARG_IS_COLOR) { + /* The channel can be between 0 and 7. */ + unsigned chan = SI_GET_VECTOR_ARG_COLOR_COMPONENT(nir_intrinsic_flags(intr)); + info->colors_read |= BITFIELD_BIT(chan); + } + break; case nir_intrinsic_load_barycentric_at_offset: /* uses center */ case nir_intrinsic_load_barycentric_at_sample: /* uses center */ if (nir_intrinsic_interp_mode(intr) == INTERP_MODE_FLAT) -- GitLab From 94e0a1267e4ef1f2ef8b4cbdaea8e892b355db8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 13:33:04 -0500 Subject: [PATCH 25/38] radeonsi: change the signature of si_nir_lower_ps_color_input This will be needed later. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e920c8b355a50..c8438c4af90a8 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2050,7 +2050,8 @@ static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void return true; } -static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shader) +static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key, + const struct si_shader_info *info) { bool progress = false; nir_function_impl *impl = nir_shader_get_entrypoint(nir); @@ -2058,24 +2059,21 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shade nir_builder builder = nir_builder_at(nir_before_impl(impl)); nir_builder *b = &builder; - const struct si_shader_selector *sel = shader->selector; - const union si_shader_key *key = &shader->key; - /* Build ready to be used colors at the beginning of the shader. */ nir_def *colors[2] = {0}; for (int i = 0; i < 2; i++) { - if (!(sel->info.colors_read & (0xf << (i * 4)))) + if (!(info->colors_read & (0xf << (i * 4)))) continue; - unsigned color_base = sel->info.color_attr_index[i]; + unsigned color_base = info->color_attr_index[i]; /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", * otherwise it's at offset "num_inputs". */ - unsigned back_color_base = sel->info.num_inputs; - if (i == 1 && (sel->info.colors_read & 0xf)) + unsigned back_color_base = info->num_inputs; + if (i == 1 && (info->colors_read & 0xf)) back_color_base += 1; - enum glsl_interp_mode interp_mode = sel->info.color_interpolate[i]; + enum glsl_interp_mode interp_mode = info->color_interpolate[i]; if (interp_mode == INTERP_MODE_COLOR) { interp_mode = key->ps.part.prolog.flatshade_colors ? INTERP_MODE_FLAT : INTERP_MODE_SMOOTH; @@ -2092,7 +2090,7 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, struct si_shader *shade } } else { nir_intrinsic_op op = 0; - switch (sel->info.color_interpolate_loc[i]) { + switch (info->color_interpolate_loc[i]) { case TGSI_INTERPOLATE_LOC_CENTER: op = nir_intrinsic_load_barycentric_pixel; break; @@ -2355,7 +2353,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) { /* two-side color selection and interpolation */ if (sel->info.colors_read) - NIR_PASS(progress, nir, si_nir_lower_ps_color_input, shader); + NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info); ac_nir_lower_ps_options options = { .gfx_level = sel->screen->info.gfx_level, -- GitLab From 3faa9ba06be051973f6806ba534983fc130ea859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 13:34:45 -0500 Subject: [PATCH 26/38] radeonsi: gather lowered color inputs for monolithic PS This also adds missing io_semantics to the input loads that the gathering expects. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 16 ++++++++++++---- src/gallium/drivers/radeonsi/si_shader_info.c | 12 ++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index c8438c4af90a8..15844b2244162 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2082,11 +2082,15 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_k nir_def *back_color = NULL; if (interp_mode == INTERP_MODE_FLAT) { colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0), - .base = color_base); + .base = color_base, + .io_semantics.location = VARYING_SLOT_COL0 + i, + .io_semantics.num_slots = 1); if (key->ps.part.prolog.color_two_side) { back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0), - .base = back_color_base); + .base = back_color_base, + .io_semantics.location = VARYING_SLOT_BFC0 + i, + .io_semantics.num_slots = 1); } } else { nir_intrinsic_op op = 0; @@ -2109,12 +2113,16 @@ static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_k colors[i] = nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0), - .base = color_base); + .base = color_base, + .io_semantics.location = VARYING_SLOT_COL0 + i, + .io_semantics.num_slots = 1); if (key->ps.part.prolog.color_two_side) { back_color = nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0), - .base = back_color_base); + .base = back_color_base, + .io_semantics.location = VARYING_SLOT_BFC0 + i, + .io_semantics.num_slots = 1); } } diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index 66fd766be973c..f14ff069cbcf5 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -259,6 +259,18 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info, if (nir->info.stage != MESA_SHADER_VERTEX || !is_input) semantic = nir_intrinsic_io_semantics(intr).location; + if (nir->info.stage == MESA_SHADER_FRAGMENT && is_input) { + /* Gather color PS inputs. We can only get here after lowering colors in monolithic + * shaders. This must match what we do for nir_intrinsic_load_color0/1. + */ + if (semantic == VARYING_SLOT_COL0 || semantic == VARYING_SLOT_COL1 || + semantic == VARYING_SLOT_BFC0 || semantic == VARYING_SLOT_BFC1) { + unsigned index = semantic == VARYING_SLOT_COL1 || semantic == VARYING_SLOT_BFC1; + info->colors_read |= mask << (index * 4); + return; + } + } + if (nir->info.stage == MESA_SHADER_FRAGMENT && !is_input) { /* Never use FRAG_RESULT_COLOR directly. */ if (semantic == FRAG_RESULT_COLOR) -- GitLab From 197af036989d23ee1191d5b3703cf88fd0b0bbe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 13:44:07 -0500 Subject: [PATCH 27/38] radeonsi: add PS input info into si_shader_binary_info It will be modified to reflect PS inputs after uniform inlining. For now, it's just a copy of selector->info. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 34 ++++++++++++++----- src/gallium/drivers/radeonsi/si_shader.h | 4 +++ .../drivers/radeonsi/si_state_shaders.cpp | 3 +- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 15844b2244162..2ca944f401cd6 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -30,10 +30,9 @@ static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *s /* Get the number of all interpolated inputs */ unsigned si_get_ps_num_interp(struct si_shader *ps) { - struct si_shader_info *info = &ps->selector->info; - unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0); + unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0); unsigned num_interp = - ps->selector->info.num_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0); + ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0); assert(num_interp <= 32); return MIN2(num_interp, 32); @@ -1178,7 +1177,6 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) { struct si_screen *sscreen = shader->selector->screen; struct ac_shader_config *conf = &shader->config; - unsigned num_inputs = shader->selector->info.num_inputs; unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage); unsigned lds_per_wave = 0; unsigned max_simd_waves; @@ -1198,7 +1196,8 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) * Other stages don't know the size at compile time or don't * allocate LDS per wave, but instead they do it per thread group. */ - lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment); + lds_per_wave = conf->lds_size * lds_increment + + align(shader->info.num_ps_inputs * 48, lds_increment); break; case MESA_SHADER_COMPUTE: { unsigned max_workgroup_size = si_get_max_workgroup_size(shader); @@ -2363,6 +2362,12 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, if (sel->info.colors_read) NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info); + /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped, + * which can only occur with monolithic PS. + */ + shader->info.num_ps_inputs = sel->info.num_inputs; + shader->info.ps_colors_read = sel->info.colors_read; + ac_nir_lower_ps_options options = { .gfx_level = sel->screen->info.gfx_level, .family = sel->screen->info.family, @@ -2449,6 +2454,17 @@ void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir) shader->info.uses_vmem_load_other |= info.uses_vmem_load_other; shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh; + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + shader->info.num_ps_inputs = shader->selector->info.num_inputs; + shader->info.ps_colors_read = shader->selector->info.colors_read; + + unsigned num_colors = !!(shader->selector->info.colors_read & 0x0f) + + !!(shader->selector->info.colors_read & 0xf0); + unsigned max_interp = MIN2(shader->info.num_ps_inputs + num_colors, SI_NUM_INTERP); + memcpy(shader->info.ps_inputs, shader->selector->info.input, + max_interp * sizeof(info.input[0])); + } } /* Generate code for the hardware VS shader stage to go with a geometry shader */ @@ -3085,7 +3101,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke memset(key, 0, sizeof(*key)); key->ps_prolog.states = shader->key.ps.part.prolog; key->ps_prolog.wave32 = shader->wave_size == 32; - key->ps_prolog.colors_read = info->colors_read; + key->ps_prolog.colors_read = shader->info.ps_colors_read; key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; key->ps_prolog.wqm = info->base.fs.needs_quad_helper_invocations && @@ -3099,12 +3115,12 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke if (shader->key.ps.part.prolog.poly_stipple) shader->info.uses_vmem_load_other = true; - if (info->colors_read) { + if (shader->info.ps_colors_read) { uint8_t *color = shader->selector->info.color_attr_index; if (shader->key.ps.part.prolog.color_two_side) { /* BCOLORs are stored after the last input. */ - key->ps_prolog.num_interp_inputs = info->num_inputs; + key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs; shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); } @@ -3112,7 +3128,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke unsigned interp = info->color_interpolate[i]; unsigned location = info->color_interpolate_loc[i]; - if (!(info->colors_read & (0xf << i * 4))) + if (!(shader->info.ps_colors_read & (0xf << i * 4))) continue; key->ps_prolog.color_attr_index[i] = color[i]; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 64f478b06d6a4..f429ad2069908 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -127,6 +127,7 @@ struct nir_shader; struct nir_instr; struct nir_lower_subgroups_options; +#define SI_NUM_INTERP 32 #define SI_MAX_ATTRIBS 16 #define SI_MAX_VS_OUTPUTS 40 #define SI_USER_CLIP_PLANE_MASK 0x3F @@ -823,6 +824,9 @@ union si_shader_key { struct si_shader_binary_info { uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS]; uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; + union si_input_info ps_inputs[SI_NUM_INTERP]; + uint8_t num_ps_inputs; + uint8_t ps_colors_read; uint8_t num_input_sgprs; uint8_t num_input_vgprs; bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 95ee06b33130d..6b7269322362b 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4801,7 +4801,6 @@ template<int NUM_INTERP> static void si_emit_spi_map(struct si_context *sctx, unsigned index) { struct si_shader *ps = sctx->shader.ps.current; - struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; unsigned spi_ps_input_cntl[NUM_INTERP]; STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32); @@ -4813,7 +4812,7 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index) struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; for (unsigned i = 0; i < NUM_INTERP; i++) { - union si_input_info input = psinfo->input[i]; + union si_input_info input = ps->info.ps_inputs[i]; unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic]; bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20; -- GitLab From 1d4402b02ad7cf98250e07c09dce8f65b42d1925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 15:32:48 -0500 Subject: [PATCH 28/38] radeonsi: don't include the PARAM_GEN input in si_shader_info It's only produced by lowering point smoothing. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_nir_lower_abi.c | 4 +++- src/gallium/drivers/radeonsi/si_shader_info.c | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index c96572bd5a932..8a99e9682b9da 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -637,7 +637,9 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s /* Load point coordinates (x, y) which are written by the hw after the interpolated inputs */ replacement = nir_load_interpolated_input(b, 2, 32, interp_param, nir_imm_int(b, 0), .base = si_get_ps_num_interp(shader), - .component = 2); + .component = 2, + /* This tells si_nir_scan_shader that it's PARAM_GEN */ + .io_semantics.no_varying = 1); break; } case nir_intrinsic_load_poly_line_smooth_enabled: diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index f14ff069cbcf5..5f6007898db73 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -260,6 +260,10 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info, semantic = nir_intrinsic_io_semantics(intr).location; if (nir->info.stage == MESA_SHADER_FRAGMENT && is_input) { + /* The PARAM_GEN input shouldn't be scanned. */ + if (nir_intrinsic_io_semantics(intr).no_varying) + return; + /* Gather color PS inputs. We can only get here after lowering colors in monolithic * shaders. This must match what we do for nir_intrinsic_load_color0/1. */ -- GitLab From dbea0f51a392ce7b54051de1e63265004deff627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 15:37:47 -0500 Subject: [PATCH 29/38] radeonsi: decrease NUM_INTERP if uniform inlining eliminated PS inputs This should improve performance when that happens. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 40 +++++++++++++++--------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 2ca944f401cd6..aa95727d34916 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2236,12 +2236,11 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, * The storage for eliminated outputs is also not allocated. * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM) * - TCS output stores are eliminated + * - Eliminated PS inputs are removed from PS.NUM_INTERP. * * TODO: These are things the driver ignores in the final shader code * and relies on the default shader info. * - Other system values are not eliminated - * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs - * to remove holes * - uses_discard - if it changed to false * - writes_memory - if it changed to false * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not @@ -2358,15 +2357,27 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info); progress = true; } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) { - /* two-side color selection and interpolation */ - if (sel->info.colors_read) - NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info); + /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs, + * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs + * by renumbering them. This can only happen with monolithic PS. Colors are unaffected + * because they are still represented by nir_intrinsic_load_color0/1. + */ + NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in); + + /* Two-side color selection and interpolation: Get the latest shader info because + * uniform inlining and colormask can fully eliminate color inputs. + */ + struct si_shader_info info; + si_nir_scan_shader(sel->screen, nir, &info); + + if (info.colors_read) + NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info); /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped, * which can only occur with monolithic PS. */ - shader->info.num_ps_inputs = sel->info.num_inputs; - shader->info.ps_colors_read = sel->info.colors_read; + shader->info.num_ps_inputs = info.num_inputs; + shader->info.ps_colors_read = info.colors_read; ac_nir_lower_ps_options options = { .gfx_level = sel->screen->info.gfx_level, @@ -2456,14 +2467,13 @@ void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir) shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh; if (nir->info.stage == MESA_SHADER_FRAGMENT) { - shader->info.num_ps_inputs = shader->selector->info.num_inputs; - shader->info.ps_colors_read = shader->selector->info.colors_read; - - unsigned num_colors = !!(shader->selector->info.colors_read & 0x0f) + - !!(shader->selector->info.colors_read & 0xf0); - unsigned max_interp = MIN2(shader->info.num_ps_inputs + num_colors, SI_NUM_INTERP); - memcpy(shader->info.ps_inputs, shader->selector->info.input, - max_interp * sizeof(info.input[0])); + /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */ + shader->info.num_ps_inputs = info.num_inputs; + shader->info.ps_colors_read = info.colors_read; + + /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */ + unsigned max_interp = MIN2(info.num_inputs + 2, SI_NUM_INTERP); + memcpy(shader->info.ps_inputs, info.input, max_interp * sizeof(info.input[0])); } } -- GitLab From bd700bace5f458212cac3fb1184a5b2cbc5c8c63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 16:18:38 -0500 Subject: [PATCH 30/38] radeonsi: update comments about uniform inlining Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index aa95727d34916..abb29fece9dc3 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2232,22 +2232,22 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, * - Register usage and code size decrease (obvious) * - Eliminated PS system values are disabled by LLVM * (FragCoord, FrontFace, barycentrics) - * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef. - * The storage for eliminated outputs is also not allocated. + * - VS/TES/GS param exports are eliminated if they are undef. + * The param space for eliminated outputs is also not allocated. * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM) * - TCS output stores are eliminated * - Eliminated PS inputs are removed from PS.NUM_INTERP. * * TODO: These are things the driver ignores in the final shader code * and relies on the default shader info. - * - Other system values are not eliminated + * - System values in VS, TCS, TES, GS are not eliminated * - uses_discard - if it changed to false * - writes_memory - if it changed to false * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not * eliminated * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS) * GS outputs are eliminated except for the temporary LDS. - * Clip distances, gl_PointSize, and PS outputs are eliminated based + * Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based * on current states, so we don't care about the shader code. * * TODO: Merged shaders don't inline uniforms for the first stage. -- GitLab From 6d2a7f53acfb219910fde175a4233bd5157937f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sat, 25 Nov 2023 17:25:42 -0500 Subject: [PATCH 31/38] radeonsi: decrease NUM_INTERP if export formats/colormask eliminated PS inputs This adds a pass that removes output stores. It's called before NIR optimizations that are before PS lowering, which will cause it to remove PS inputs from register settings. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_shader.c | 88 ++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index abb29fece9dc3..3a9493af83acf 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1721,6 +1721,80 @@ static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key) return progress; } +/* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */ +static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + /* No indirect indexing allowed. */ + ASSERTED nir_src offset = *nir_get_io_offset_src(intr); + assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0); + + unsigned location = nir_intrinsic_io_semantics(intr).location; + const union si_shader_key *key = _key; + + switch (location) { + case FRAG_RESULT_DEPTH: + case FRAG_RESULT_STENCIL: + return false; + + case FRAG_RESULT_SAMPLE_MASK: + if (key->ps.part.epilog.kill_samplemask) { + nir_instr_remove(instr); + return true; + } + return false; + } + + /* Color outputs. */ + unsigned comp_mask = BITFIELD_MASK(intr->num_components); + assert(nir_intrinsic_component(intr) == 0); + unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format); + + /* If COLOR is broadcasted to multiple color buffers, combine their masks. */ + if (location == FRAG_RESULT_COLOR) { + for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++) + cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf; + } + + unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0; + unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf; + + if ((output_mask & comp_mask) == comp_mask) + return false; + + if (!(output_mask & comp_mask)) { + nir_instr_remove(instr); + return true; + } + + /* Fill disabled components with undef. */ + b->cursor = nir_before_instr(instr); + nir_def *new_value = intr->src[0].ssa; + nir_def *undef = nir_undef(b, 1, new_value->bit_size); + + unsigned kill_mask = ~output_mask & comp_mask; + u_foreach_bit(i, kill_mask) { + new_value = nir_vector_insert_imm(b, new_value, undef, i); + } + + nir_src_rewrite(&intr->src[0], new_value); + return true; +} + +static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key) +{ + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + return nir_shader_instructions_pass(nir, kill_ps_outputs_cb, + nir_metadata_dominance | + nir_metadata_block_index, (void*)key); +} + static bool clamp_vertex_color_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) { @@ -2265,11 +2339,17 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, progress = true; } - if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.poly_line_smoothing) - NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES); + if (sel->stage == MESA_SHADER_FRAGMENT) { + /* This uses the epilog key, so only monolithic shaders can call this. */ + if (shader->is_monolithic) + NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key); + + if (key->ps.mono.poly_line_smoothing) + NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES); - if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.point_smoothing) - NIR_PASS(progress, nir, nir_lower_point_smooth); + if (key->ps.mono.point_smoothing) + NIR_PASS(progress, nir, nir_lower_point_smooth); + } /* This must be before si_nir_lower_resource. */ if (!sel->screen->info.has_image_opcodes) -- GitLab From fb994f44d91a4b94738ea4ebb83aab1a257ef123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sun, 26 Nov 2023 09:23:10 -0500 Subject: [PATCH 32/38] util: make BITSET_TEST_RANGE_INSIDE_WORD take a value to compare with Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/util/bitset.h | 8 ++++---- src/util/tests/bitset_test.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/util/bitset.h b/src/util/bitset.h index d5b5d6dae2d0d..cffbb73ecce54 100644 --- a/src/util/bitset.h +++ b/src/util/bitset.h @@ -205,11 +205,11 @@ __bitset_shl(BITSET_WORD *x, unsigned amount, unsigned n) #define BITSET_SHL(x, n) \ __bitset_shl(x, n, ARRAY_SIZE(x)); -/* bit range operations +/* bit range operations (e=end is inclusive) */ -#define BITSET_TEST_RANGE_INSIDE_WORD(x, b, e) \ +#define BITSET_TEST_RANGE_INSIDE_WORD(x, b, e, mask) \ (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \ - (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \ + (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) == mask) : \ (assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0)) #define BITSET_SET_RANGE_INSIDE_WORD(x, b, e) \ (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \ @@ -227,7 +227,7 @@ __bitset_test_range(const BITSET_WORD *r, unsigned start, unsigned end) const unsigned start_mod = start % BITSET_WORDBITS; if (start_mod + size <= BITSET_WORDBITS) { - return BITSET_TEST_RANGE_INSIDE_WORD(r, start, end); + return !BITSET_TEST_RANGE_INSIDE_WORD(r, start, end, 0); } else { const unsigned first_size = BITSET_WORDBITS - start_mod; diff --git a/src/util/tests/bitset_test.cpp b/src/util/tests/bitset_test.cpp index ec3ba5104ab7d..5c69c5ac0b151 100644 --- a/src/util/tests/bitset_test.cpp +++ b/src/util/tests/bitset_test.cpp @@ -74,8 +74,8 @@ TEST(bitset, test_basic_range) const int max_set = 15; BITSET_SET_RANGE_INSIDE_WORD(mask128, 0, max_set); - EXPECT_EQ(BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set), true); - EXPECT_EQ(BITSET_TEST_RANGE_INSIDE_WORD(mask128, max_set + 1, max_set + 15), false); + EXPECT_EQ(!BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set, 0), true); + EXPECT_EQ(!BITSET_TEST_RANGE_INSIDE_WORD(mask128, max_set + 1, max_set + 15, 0), false); for (int i = 0; i < 128; i++) { if (i <= max_set) EXPECT_EQ(BITSET_TEST(mask128, i), true); @@ -83,7 +83,7 @@ TEST(bitset, test_basic_range) EXPECT_EQ(BITSET_TEST(mask128, i), false); } BITSET_CLEAR_RANGE(mask128, 0, max_set); - EXPECT_EQ(BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set), false); + EXPECT_EQ(!BITSET_TEST_RANGE_INSIDE_WORD(mask128, 0, max_set, 0), false); for (int i = 0; i < 128; i++) { EXPECT_EQ(BITSET_TEST(mask128, i), false); } -- GitLab From 17e01a9a9b743d89066ba0a42c841e9b7e7d0528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sun, 26 Nov 2023 09:23:41 -0500 Subject: [PATCH 33/38] radeonsi: merge context_reg_saved_mask and other_reg_saved_mask into a BITSET There will be more than 64 context registers that we'll need to track, so use BITSET for all of them. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_build_pm4.h | 164 ++++++++++-------- src/gallium/drivers/radeonsi/si_compute.c | 6 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 163 +++++++++-------- src/gallium/drivers/radeonsi/si_state.h | 22 ++- .../drivers/radeonsi/si_state_draw.cpp | 29 ++-- 5 files changed, 195 insertions(+), 189 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 24e799080706f..dad653720d446 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -68,88 +68,96 @@ radeon_emit(value); \ } while (0) -#define radeon_opt_set_reg(reg, reg_enum, idx, value, prefix_name, packet, category) do { \ +#define radeon_opt_set_reg(reg, reg_enum, idx, value, prefix_name, packet) do { \ unsigned __value = (value); \ - if (!((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1) || \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] != __value) { \ + if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __value) { \ radeon_set_reg(reg, idx, __value, prefix_name, packet); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_BIT(reg_enum); \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] = __value; \ + BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __value; \ } \ } while (0) /* Set consecutive registers if any value is different. */ -#define radeon_opt_set_reg2(reg, reg_enum, v1, v2, prefix_name, packet, category) do { \ +#define radeon_opt_set_reg2(reg, reg_enum, v1, v2, prefix_name, packet) do { \ unsigned __v1 = (v1), __v2 = (v2); \ - if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x3) != 0x3 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2) { \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 1, 0x3) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2) { \ radeon_set_reg_seq(reg, 2, 0, prefix_name, packet, 0); \ radeon_emit(__v1); \ radeon_emit(__v2); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 2); \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 1); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ } \ } while (0) -#define radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, prefix_name, packet, category) do { \ +#define radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, prefix_name, packet) do { \ unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3); \ - if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x7) != 0x7 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3) { \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 2, 0x7) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3) { \ radeon_set_reg_seq(reg, 3, 0, prefix_name, packet, 0); \ radeon_emit(__v1); \ radeon_emit(__v2); \ radeon_emit(__v3); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 3); \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 2); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \ } \ } while (0) -#define radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, packet, category) do { \ +#define radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, packet) do { \ unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4); \ - if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0xf) != 0xf || \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4) { \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 3, 0xf) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4) { \ radeon_set_reg_seq(reg, 4, 0, prefix_name, packet, 0); \ radeon_emit(__v1); \ radeon_emit(__v2); \ radeon_emit(__v3); \ radeon_emit(__v4); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 4); \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 3); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \ } \ } while (0) -#define radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, prefix_name, packet, category) do { \ +#define radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, prefix_name, packet) do { \ unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4), __v5 = (v5); \ - if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1f) != 0x1f || \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 4] != __v5) { \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 4, 0x1f) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 4] != __v5) { \ radeon_set_reg_seq(reg, 5, 0, prefix_name, packet, 0); \ radeon_emit(__v1); \ radeon_emit(__v2); \ radeon_emit(__v3); \ radeon_emit(__v4); \ radeon_emit(__v5); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 5); \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 4] = __v5; \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 4); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \ + sctx->tracked_regs.reg_value[(reg_enum) + 4] = __v5; \ } \ } while (0) @@ -174,22 +182,22 @@ radeon_set_reg(reg, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_reg(_unused, reg, reg_enum, value) \ - radeon_opt_set_reg(reg, reg_enum, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) + radeon_opt_set_reg(reg, reg_enum, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_reg_idx(_unused, reg, reg_enum, idx, value) \ - radeon_opt_set_reg(reg, reg_enum, idx, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) + radeon_opt_set_reg(reg, reg_enum, idx, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_reg2(_unused, reg, reg_enum, v1, v2) \ - radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) + radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_reg3(_unused, reg, reg_enum, v1, v2, v3) \ - radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) + radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_reg4(_unused, reg, reg_enum, v1, v2, v3, v4) \ - radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) + radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_reg5(_unused, reg, reg_enum, v1, v2, v3, v4, v5) \ - radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) + radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG) #define radeon_opt_set_context_regn(_unused, reg, values, saved_values, num) \ radeon_opt_set_regn(reg, values, saved_values, num, SI_CONTEXT, PKT3_SET_CONTEXT_REG) @@ -202,17 +210,17 @@ radeon_set_reg(reg, 0, value, SI_SH, PKT3_SET_SH_REG) #define radeon_opt_set_sh_reg(_unused, reg, reg_enum, value) \ - radeon_opt_set_reg(reg, reg_enum, 0, value, SI_SH, PKT3_SET_SH_REG, other) + radeon_opt_set_reg(reg, reg_enum, 0, value, SI_SH, PKT3_SET_SH_REG) #define radeon_opt_set_sh_reg2(_unused, reg, reg_enum, v1, v2) \ - radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_SH, PKT3_SET_SH_REG, other) + radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_SH, PKT3_SET_SH_REG) #define radeon_opt_set_sh_reg3(_unused, reg, reg_enum, v1, v2, v3) \ - radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_SH, PKT3_SET_SH_REG, other) + radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_SH, PKT3_SET_SH_REG) #define radeon_opt_set_sh_reg_idx(_unused, reg, reg_enum, idx, value) do { \ assert(sctx->gfx_level >= GFX10); \ - radeon_opt_set_reg(reg, reg_enum, idx, value, SI_SH, PKT3_SET_SH_REG_INDEX, other); \ + radeon_opt_set_reg(reg, reg_enum, idx, value, SI_SH, PKT3_SET_SH_REG_INDEX); \ } while (0) #define radeon_emit_32bit_pointer(_unused, va) do { \ @@ -236,7 +244,7 @@ radeon_set_reg(reg, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG) #define radeon_opt_set_uconfig_reg(_unused, reg, reg_enum, value) \ - radeon_opt_set_reg(reg, reg_enum, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG, other) + radeon_opt_set_reg(reg, reg_enum, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG) #define RESOLVE_PKT3_SET_UCONFIG_REG_INDEX \ (GFX_VERSION >= GFX10 || (GFX_VERSION == GFX9 && sctx->screen->info.me_fw_version >= 26) ? \ @@ -246,7 +254,7 @@ radeon_set_reg(reg, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX) #define radeon_opt_set_uconfig_reg_idx(_unused, _unused2, reg, reg_enum, idx, value) \ - radeon_opt_set_reg(reg, reg_enum, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX, other) + radeon_opt_set_reg(reg, reg_enum, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX) #define radeon_set_privileged_config_reg(reg, value) do { \ assert((reg) < CIK_UCONFIG_REG_OFFSET); \ @@ -268,35 +276,37 @@ buffer[__i / 2].reg_value[__i % 2] = value; \ } while (0) -#define gfx11_opt_push_reg(reg, reg_enum, value, prefix_name, category, buffer, reg_count) do { \ +#define gfx11_opt_push_reg(reg, reg_enum, value, prefix_name, buffer, reg_count) do { \ unsigned __value = value; \ - if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1) != 0x1 || \ - sctx->tracked_regs.category##_reg_value[reg_enum] != __value) { \ + if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \ + sctx->tracked_regs.reg_value[reg_enum] != __value) { \ gfx11_push_reg(reg, __value, prefix_name, buffer, reg_count); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_BIT(reg_enum); \ - sctx->tracked_regs.category##_reg_value[reg_enum] = __value; \ + BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \ + sctx->tracked_regs.reg_value[reg_enum] = __value; \ } \ } while (0) -#define gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, category, buffer, reg_count) do { \ +#define gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, buffer, reg_count) do { \ unsigned __v1 = (v1); \ unsigned __v2 = (v2); \ unsigned __v3 = (v3); \ unsigned __v4 = (v4); \ - if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0xf) != 0xf || \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4) { \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 3, 0xf) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4) { \ gfx11_push_reg((reg), __v1, prefix_name, buffer, reg_count); \ gfx11_push_reg((reg) + 4, __v2, prefix_name, buffer, reg_count); \ gfx11_push_reg((reg) + 8, __v3, prefix_name, buffer, reg_count); \ gfx11_push_reg((reg) + 12, __v4, prefix_name, buffer, reg_count); \ - sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE((reg_enum), 4); \ - sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ - sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 3); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \ } \ } while (0) @@ -310,11 +320,11 @@ sctx->num_buffered_compute_sh_regs) #define gfx11_opt_push_gfx_sh_reg(reg, reg_enum, value) \ - gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, other, sctx->gfx11.buffered_gfx_sh_regs, \ + gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, sctx->gfx11.buffered_gfx_sh_regs, \ sctx->num_buffered_gfx_sh_regs) #define gfx11_opt_push_compute_sh_reg(reg, reg_enum, value) \ - gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, other, sctx->gfx11.buffered_compute_sh_regs, \ + gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, sctx->gfx11.buffered_compute_sh_regs, \ sctx->num_buffered_compute_sh_regs) /* GFX11 packet building helpers for SET_CONTEXT_REG_PAIRS_PACKED. @@ -328,11 +338,11 @@ gfx11_push_reg(reg, value, SI_CONTEXT, __cs_context_regs, __cs_context_reg_count) #define gfx11_opt_set_context_reg(reg, reg_enum, value) \ - gfx11_opt_push_reg(reg, reg_enum, value, SI_CONTEXT, context, __cs_context_regs, \ + gfx11_opt_push_reg(reg, reg_enum, value, SI_CONTEXT, __cs_context_regs, \ __cs_context_reg_count) #define gfx11_opt_set_context_reg4(reg, reg_enum, v1, v2, v3, v4) \ - gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, context, __cs_context_regs, \ + gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, __cs_context_regs, \ __cs_context_reg_count) #define gfx11_end_packed_context_regs() do { \ diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 21bda9d08cd5f..092b3e3038dbb 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -999,8 +999,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info si_compute_resources_add_all_to_bo_list(sctx); /* Skipping setting redundant registers on compute queues breaks compute. */ - if (!sctx->has_graphics) - sctx->tracked_regs.other_reg_saved_mask = 0; + if (!sctx->has_graphics) { + BITSET_SET_RANGE(sctx->tracked_regs.reg_saved_mask, + SI_FIRST_TRACKED_OTHER_REG, SI_NUM_ALL_TRACKED_REGS - 1); + } /* First emit registers. */ bool prefetch; diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 44e95f6eb41f9..1b6d0b8f44ce8 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -204,83 +204,83 @@ static void si_add_gds_to_buffer_list(struct si_context *sctx) void si_set_tracked_regs_to_clear_state(struct si_context *ctx) { - STATIC_ASSERT(SI_NUM_TRACKED_CONTEXT_REGS <= sizeof(ctx->tracked_regs.context_reg_saved_mask) * 8); - - ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x5; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_EQAA] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; - ctx->tracked_regs.context_reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x90000; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x2; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_SHADER_STAGES_EN] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_LS_HS_CONFIG] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_TF_PARAM] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x3; - ctx->tracked_regs.context_reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_IA_MULTI_VGT_PARAM] = 0xff; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_MODE] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x1e; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0; - - ctx->tracked_regs.context_reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0; - ctx->tracked_regs.context_reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0; + STATIC_ASSERT(SI_NUM_ALL_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved_mask) * 8); + + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x5; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; + + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x90000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x2; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_SHADER_STAGES_EN] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_LS_HS_CONFIG] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x3; + ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_IA_MULTI_VGT_PARAM] = 0xff; + + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x1e; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0; + + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0; /* Set all cleared context registers to saved. */ - ctx->tracked_regs.context_reg_saved_mask = BITFIELD64_MASK(SI_NUM_TRACKED_CONTEXT_REGS); + BITSET_SET_RANGE(ctx->tracked_regs.reg_saved_mask, 0, SI_NUM_TRACKED_CONTEXT_REGS - 1); } void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, @@ -512,17 +512,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) si_mark_atom_dirty(ctx, &ctx->atoms.s.vgt_pipeline_state); si_mark_atom_dirty(ctx, &ctx->atoms.s.tess_io_layout); - if (has_clear_state) { + /* Set all register values to unknown. */ + BITSET_ZERO(ctx->tracked_regs.reg_saved_mask); + + if (has_clear_state) si_set_tracked_regs_to_clear_state(ctx); - } else { - /* Set all register values to unknown. */ - ctx->tracked_regs.context_reg_saved_mask = 0; - } - /* 0xffffffff is an impossible value to register SPI_PS_INPUT_CNTL_n */ + /* 0xffffffff is an impossible value for SPI_PS_INPUT_CNTL_n registers */ memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32); - - ctx->tracked_regs.other_reg_saved_mask = 0; /* unknown values */ } /* Invalidate various draw states so that they are emitted before diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 0cee83c1632fd..ac4833116d17e 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -9,6 +9,7 @@ #include "si_pm4.h" #include "util/format/u_format.h" +#include "util/bitset.h" #ifdef __cplusplus extern "C" { @@ -237,9 +238,10 @@ struct si_shader_data { uint32_t sh_base[SI_NUM_SHADERS]; }; -/* Context registers whose values are tracked by si_context. */ -enum si_tracked_context_reg +/* Registers whose values are tracked by si_context. */ +enum si_tracked_reg { + /* CONTEXT registers. */ /* 2 consecutive registers */ SI_TRACKED_DB_RENDER_CONTROL, SI_TRACKED_DB_COUNT_CONTROL, @@ -326,11 +328,10 @@ enum si_tracked_context_reg SI_TRACKED_CB_DCC_CONTROL, /* GFX8-xx (TBD) */ SI_NUM_TRACKED_CONTEXT_REGS, -}; + SI_FIRST_TRACKED_OTHER_REG = SI_NUM_TRACKED_CONTEXT_REGS, -/* Non-context registers whose values are tracked by si_context. */ -enum si_tracked_other_reg { - SI_TRACKED_GE_PC_ALLOC, /* GFX10+ */ + /* SH and UCONFIG registers. */ + SI_TRACKED_GE_PC_ALLOC = SI_FIRST_TRACKED_OTHER_REG, /* GFX10+ */ SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, /* GFX7+ */ SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, /* GFX10+ */ SI_TRACKED_VGT_GS_OUT_PRIM_TYPE_UCONFIG, /* GFX11+ */ @@ -372,7 +373,7 @@ enum si_tracked_other_reg { SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO, /* GFX11+ */ SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI, /* GFX11+ */ - SI_NUM_TRACKED_OTHER_REGS, + SI_NUM_ALL_TRACKED_REGS, }; /* For 3 draw constants: BaseVertex, DrawID, StartInstance */ @@ -383,12 +384,9 @@ enum si_tracked_other_reg { #define BASEVERTEX_DRAWID_STARTINSTANCE_MASK (BASEVERTEX_MASK | DRAWID_MASK | STARTINSTANCE_MASK) struct si_tracked_regs { - uint64_t context_reg_saved_mask; - uint32_t context_reg_value[SI_NUM_TRACKED_CONTEXT_REGS]; + BITSET_DECLARE(reg_saved_mask, SI_NUM_ALL_TRACKED_REGS); + uint32_t reg_value[SI_NUM_ALL_TRACKED_REGS]; uint32_t spi_ps_input_cntl[32]; - - uint32_t other_reg_saved_mask; - uint32_t other_reg_value[SI_NUM_TRACKED_OTHER_REGS]; }; /* Private read-write buffer slots. */ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 738968a878d1c..bba9aae96ce89 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1041,7 +1041,7 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, if (GFX_VERSION == GFX9) { /* Workaround for SpecviewPerf13 Catia hang on GFX9. */ if (prim != sctx->last_prim) - sctx->tracked_regs.other_reg_saved_mask &= ~BITFIELD64_BIT(SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG); radeon_opt_set_uconfig_reg_idx(sctx, GFX_VERSION, R_030960_IA_MULTI_VGT_PARAM, SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG, @@ -1347,8 +1347,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } /* Invalidate tracked draw constants because DrawIndirect overwrites them. */ - sctx->tracked_regs.other_reg_saved_mask &= - ~(BASEVERTEX_DRAWID_STARTINSTANCE_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */ + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 2); /* StartInstance */ sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; radeon_emit(PKT3(PKT3_SET_BASE, 2, 0)); @@ -1460,8 +1461,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw */ if (is_blit) { /* Re-emit draw constants after we leave u_blitter. */ - sctx->tracked_regs.other_reg_saved_mask &= - ~(BASEVERTEX_DRAWID_STARTINSTANCE_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */ + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 2); /* StartInstance */ /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */ radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs); @@ -1510,8 +1512,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) { - sctx->tracked_regs.other_reg_saved_mask &= - ~(BASEVERTEX_DRAWID_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */ } } else { /* Only DrawID varies. */ @@ -1529,8 +1531,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) { - sctx->tracked_regs.other_reg_saved_mask &= - ~(DRAWID_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */ } } } else { @@ -1550,8 +1551,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) { - sctx->tracked_regs.other_reg_saved_mask &= - ~(BASEVERTEX_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ } } else { /* DrawID and BaseVertex are constant. */ @@ -1594,8 +1594,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); } if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) { - sctx->tracked_regs.other_reg_saved_mask &= - ~(BASEVERTEX_DRAWID_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */ } } else { for (unsigned i = 0; i < num_draws; i++) { @@ -1607,8 +1607,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); } if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) { - sctx->tracked_regs.other_reg_saved_mask &= - ~(BASEVERTEX_MASK << tracked_base_vertex_reg); + BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ } } } -- GitLab From cabf9277654a6dccd830d4c006c56573ab535f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sun, 26 Nov 2023 10:56:24 -0500 Subject: [PATCH 34/38] radeonsi: convert depth-stencil-alpha state to tracked registers Some of these registers don't change, so we should not set them when they don't. This reworks the DSA state to use a custom emit function and eliminate redundant register changes. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_gfx_cs.c | 5 ++ src/gallium/drivers/radeonsi/si_state.c | 109 ++++++++++++++++------- src/gallium/drivers/radeonsi/si_state.h | 19 +++- 3 files changed, 101 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 1b6d0b8f44ce8..0f48775e0ee62 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -209,6 +209,11 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_CONTROL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_STENCIL_CONTROL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MIN] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MAX] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 77e4625dca1a2..e6324b710f76d 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1403,73 +1403,60 @@ static void *si_create_dsa_state(struct pipe_context *ctx, const struct pipe_depth_stencil_alpha_state *state) { struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); - struct si_pm4_state *pm4 = &dsa->pm4; - unsigned db_depth_control; - uint32_t db_stencil_control = 0; - if (!dsa) { return NULL; } - si_pm4_clear_state(pm4, (struct si_screen*)ctx->screen, false); - dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; - db_depth_control = + dsa->db_depth_control = S_028800_Z_ENABLE(state->depth_enabled) | S_028800_Z_WRITE_ENABLE(state->depth_writemask) | S_028800_ZFUNC(state->depth_func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth_bounds_test); /* stencil */ if (state->stencil[0].enabled) { - db_depth_control |= S_028800_STENCIL_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); - db_stencil_control |= + dsa->db_depth_control |= S_028800_STENCIL_ENABLE(1); + dsa->db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); + dsa->db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); if (state->stencil[1].enabled) { - db_depth_control |= S_028800_BACKFACE_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); - db_stencil_control |= + dsa->db_depth_control |= S_028800_BACKFACE_ENABLE(1); + dsa->db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); + dsa->db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); - db_stencil_control |= + dsa->db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); } } + dsa->db_depth_bounds_min = fui(state->depth_bounds_min); + dsa->db_depth_bounds_max = fui(state->depth_bounds_max); + /* alpha */ if (state->alpha_enabled) { dsa->alpha_func = state->alpha_func; - - si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, - fui(state->alpha_ref_value)); + dsa->spi_shader_user_data_ps_alpha_ref = fui(state->alpha_ref_value); } else { dsa->alpha_func = PIPE_FUNC_ALWAYS; } - si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); - if (state->stencil[0].enabled) - si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); - if (state->depth_bounds_test) { - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth_bounds_min)); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth_bounds_max)); - } - si_pm4_finalize(pm4); - dsa->depth_enabled = state->depth_enabled; dsa->depth_write_enabled = state->depth_enabled && state->depth_writemask; dsa->stencil_enabled = state->stencil[0].enabled; dsa->stencil_write_enabled = (util_writes_stencil(&state->stencil[0]) || util_writes_stencil(&state->stencil[1])); dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled; + dsa->depth_bounds_enabled = state->depth_bounds_test; bool zfunc_is_ordered = state->depth_func == PIPE_FUNC_NEVER || state->depth_func == PIPE_FUNC_LESS || @@ -1496,6 +1483,68 @@ static void *si_create_dsa_state(struct pipe_context *ctx, return dsa; } +static void si_pm4_emit_dsa(struct si_context *sctx, unsigned index) +{ + struct si_state_dsa *state = sctx->queued.named.dsa; + assert(state && state != sctx->emitted.named.dsa); + + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, + state->db_depth_control); + if (state->stencil_enabled) { + gfx11_opt_set_context_reg(R_02842C_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL, + state->db_stencil_control); + } + if (state->depth_bounds_enabled) { + gfx11_opt_set_context_reg(R_028020_DB_DEPTH_BOUNDS_MIN, SI_TRACKED_DB_DEPTH_BOUNDS_MIN, + state->db_depth_bounds_min); + gfx11_opt_set_context_reg(R_028024_DB_DEPTH_BOUNDS_MAX, SI_TRACKED_DB_DEPTH_BOUNDS_MAX, + state->db_depth_bounds_max); + } + gfx11_end_packed_context_regs(); + + if (state->alpha_func != PIPE_FUNC_ALWAYS) { + if (sctx->screen->info.has_set_sh_pairs_packed) { + gfx11_opt_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + state->spi_shader_user_data_ps_alpha_ref); + } else { + radeon_opt_set_sh_reg(sctx, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + state->spi_shader_user_data_ps_alpha_ref); + } + } + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, + state->db_depth_control); + if (state->stencil_enabled) { + radeon_opt_set_context_reg(sctx, R_02842C_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL, + state->db_stencil_control); + } + if (state->depth_bounds_enabled) { + radeon_opt_set_context_reg2(sctx, R_028020_DB_DEPTH_BOUNDS_MIN, + SI_TRACKED_DB_DEPTH_BOUNDS_MIN, + state->db_depth_bounds_min, + state->db_depth_bounds_max); + } + radeon_end_update_context_roll(); + + if (state->alpha_func != PIPE_FUNC_ALWAYS) { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_sh_reg(sctx, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + state->spi_shader_user_data_ps_alpha_ref); + radeon_end(); + } + } + + sctx->emitted.named.dsa = state; +} + static void si_bind_dsa_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -5713,7 +5762,7 @@ void si_init_state_functions(struct si_context *sctx) { sctx->atoms.s.pm4_states[SI_STATE_IDX(blend)].emit = si_pm4_emit_state; sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_state; - sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_state; + sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_dsa; sctx->atoms.s.pm4_states[SI_STATE_IDX(poly_offset)].emit = si_pm4_emit_state; sctx->atoms.s.pm4_states[SI_STATE_IDX(ls)].emit = si_pm4_emit_shader; sctx->atoms.s.pm4_states[SI_STATE_IDX(hs)].emit = si_pm4_emit_shader; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index ac4833116d17e..189ae76c99d54 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -109,6 +109,13 @@ struct si_state_dsa { struct si_pm4_state pm4; struct si_dsa_stencil_ref_part stencil_ref; + /* Register values. */ + unsigned db_depth_control; + unsigned db_stencil_control; + unsigned db_depth_bounds_min; + unsigned db_depth_bounds_max; + unsigned spi_shader_user_data_ps_alpha_ref; + /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ struct si_dsa_order_invariance order_invariance[2]; @@ -118,6 +125,7 @@ struct si_state_dsa { bool stencil_enabled : 1; bool stencil_write_enabled : 1; bool db_can_write : 1; + bool depth_bounds_enabled : 1; }; struct si_stencil_ref { @@ -226,8 +234,7 @@ union si_state_atoms { static inline uint64_t si_atoms_that_always_roll_context(void) { - return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) | - SI_STATE_BIT(poly_offset) | + return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(poly_offset) | SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) | SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color)| SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports)| @@ -246,6 +253,12 @@ enum si_tracked_reg SI_TRACKED_DB_RENDER_CONTROL, SI_TRACKED_DB_COUNT_CONTROL, + SI_TRACKED_DB_DEPTH_CONTROL, + SI_TRACKED_DB_STENCIL_CONTROL, + /* 2 consecutive registers */ + SI_TRACKED_DB_DEPTH_BOUNDS_MIN, + SI_TRACKED_DB_DEPTH_BOUNDS_MAX, + /* 2 consecutive registers */ SI_TRACKED_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_AA_CONFIG, @@ -358,6 +371,8 @@ enum si_tracked_reg SI_TRACKED_SPI_SHADER_USER_DATA_VS__DRAWID, /* GFX6-10 */ SI_TRACKED_SPI_SHADER_USER_DATA_VS__START_INSTANCE, /* GFX6-10 */ + SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, + SI_TRACKED_COMPUTE_RESOURCE_LIMITS, SI_TRACKED_COMPUTE_NUM_THREAD_X, SI_TRACKED_COMPUTE_NUM_THREAD_Y, -- GitLab From 11fcd58957e6023f97c41f57526c84efa2a74d08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sun, 26 Nov 2023 10:56:24 -0500 Subject: [PATCH 35/38] radeonsi: convert rasterizer state to tracked registers Most of these registers don't change, so we should not set them when they don't. This reworks the rasterizer state to use a custom emit function and eliminate redundant register changes. This required merging the poly_offset state into the rasterizer state and change how the poly offset state is updated. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_build_pm4.h | 31 ++ src/gallium/drivers/radeonsi/si_gfx_cs.c | 16 + src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 322 +++++++++++--------- src/gallium/drivers/radeonsi/si_state.h | 37 ++- 5 files changed, 265 insertions(+), 142 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index dad653720d446..2242213d20d35 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -161,6 +161,34 @@ } \ } while (0) +#define radeon_opt_set_reg6(reg, reg_enum, v1, v2, v3, v4, v5, v6, prefix_name, packet) do { \ + unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4), __v5 = (v5), __v6 = (v6); \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 5, 0x3f) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 4] != __v5 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 5] != __v6) { \ + radeon_set_reg_seq(reg, 6, 0, prefix_name, packet, 0); \ + radeon_emit(__v1); \ + radeon_emit(__v2); \ + radeon_emit(__v3); \ + radeon_emit(__v4); \ + radeon_emit(__v5); \ + radeon_emit(__v6); \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 5); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \ + sctx->tracked_regs.reg_value[(reg_enum) + 4] = __v5; \ + sctx->tracked_regs.reg_value[(reg_enum) + 5] = __v6; \ + } \ +} while (0) + #define radeon_opt_set_regn(reg, values, saved_values, num, prefix_name, packet) do { \ if (memcmp(values, saved_values, sizeof(uint32_t) * (num))) { \ radeon_set_reg_seq(reg, num, 0, prefix_name, packet, 0); \ @@ -199,6 +227,9 @@ #define radeon_opt_set_context_reg5(_unused, reg, reg_enum, v1, v2, v3, v4, v5) \ radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG) +#define radeon_opt_set_context_reg6(reg, reg_enum, v1, v2, v3, v4, v5, v6) \ + radeon_opt_set_reg6(reg, reg_enum, v1, v2, v3, v4, v5, v6, SI_CONTEXT, PKT3_SET_CONTEXT_REG) + #define radeon_opt_set_context_regn(_unused, reg, values, saved_values, num) \ radeon_opt_set_regn(reg, values, saved_values, num, SI_CONTEXT, PKT3_SET_CONTEXT_REG) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 0f48775e0ee62..89c337b27802a 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -214,6 +214,21 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MIN] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_DB_DEPTH_BOUNDS_MAX] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_INTERP_CONTROL_0] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POINT_SIZE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POINT_MINMAX] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_LINE_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_0] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SC_MODE_CNTL] = 0x4; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_EDGERULE] = 0xaa99aaaa; + + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x1000; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0; @@ -252,6 +267,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x3; ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 20fdcf180eeec..1a122970021fd 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -457,6 +457,7 @@ struct si_surface { bool color_is_int8 : 1; bool color_is_int10 : 1; bool dcc_incompatible : 1; + uint8_t db_format_index : 3; /* Color registers. */ unsigned cb_color_info; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index e6324b710f76d..4d61823eab3c9 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -924,35 +924,6 @@ static void si_emit_clip_regs(struct si_context *sctx, unsigned index) } } -/* - * inferred state between framebuffer and rasterizer - */ -static void si_update_poly_offset_state(struct si_context *sctx) -{ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { - si_pm4_bind_state(sctx, poly_offset, NULL); - return; - } - - /* Use the user format, not db_render_format, so that the polygon - * offset behaves as expected by applications. - */ - switch (sctx->framebuffer.state.zsbuf->texture->format) { - case PIPE_FORMAT_Z16_UNORM: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); - break; - default: /* 24-bit */ - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); - break; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); - break; - } -} - /* * Rasterizer */ @@ -976,16 +947,11 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast { struct si_screen *sscreen = ((struct si_context *)ctx)->screen; struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); - struct si_pm4_state *pm4 = &rs->pm4; - unsigned tmp, i; - float psize_min, psize_max; if (!rs) { return NULL; } - si_pm4_clear_state(pm4, sscreen, false); - rs->scissor_enable = state->scissor; rs->clip_halfz = state->clip_halfz; rs->two_side = state->light_twoside; @@ -1069,20 +1035,19 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->force_front_face_input = 1; } - unsigned spi_interp_control_0 = - S_0286D4_FLAT_SHADE_ENA(1) | - S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | - S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | - S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | - S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | - S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | - S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT); - - si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, spi_interp_control_0); + rs->spi_interp_control_0 = S_0286D4_FLAT_SHADE_ENA(1) | + S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != + PIPE_SPRITE_COORD_UPPER_LEFT); /* point size 12.4 fixed point */ - tmp = (unsigned)(state->point_size * 8.0); - si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); + float psize_min, psize_max; + unsigned tmp = (unsigned)(state->point_size * 8.0); + rs->pa_su_point_size = S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp); if (state->point_size_per_vertex) { psize_min = util_get_min_point_size(state); @@ -1095,115 +1060,176 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->max_point_size = psize_max; /* Divide by two, because 0.5 = 1 pixel. */ - si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, - S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) | - S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2))); - si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2))); - - si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0, - S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | - S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || - state->line_smooth) | - S_028A48_VPORT_SCISSOR_ENABLE(1) | - S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.gfx_level >= GFX9)); + rs->pa_su_point_minmax = S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) | + S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)); + rs->pa_su_line_cntl = S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)); + + rs->pa_sc_mode_cntl_0 = S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | + S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || + state->line_smooth) | + S_028A48_VPORT_SCISSOR_ENABLE(1) | + S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.gfx_level >= GFX9); bool polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); - si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, - S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | - S_028814_FACE(!state->front_ccw) | - S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | - S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | - S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(polygon_mode_enabled) | - S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | - S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | - /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ - S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 ? - polygon_mode_enabled || - rs->perpendicular_end_caps : 0)); - + rs->pa_su_sc_mode_cntl = S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | + S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_FACE(!state->front_ccw) | + S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | + S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | + S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | + S_028814_POLY_MODE(polygon_mode_enabled) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | + /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ + S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 ? + polygon_mode_enabled || + rs->perpendicular_end_caps : 0); if (sscreen->info.gfx_level >= GFX10) { - si_pm4_set_reg(pm4, R_028838_PA_CL_NGG_CNTL, - S_028838_INDEX_BUF_EDGE_FLAG_ENA(rs->polygon_mode_is_points || - rs->polygon_mode_is_lines) | - S_028838_VERTEX_REUSE_DEPTH(sscreen->info.gfx_level >= GFX10_3 ? 30 : 0)); + rs->pa_cl_ngg_cntl = S_028838_INDEX_BUF_EDGE_FLAG_ENA(rs->polygon_mode_is_points || + rs->polygon_mode_is_lines) | + S_028838_VERTEX_REUSE_DEPTH(sscreen->info.gfx_level >= GFX10_3 ? 30 : 0); } if (state->bottom_edge_rule) { /* OpenGL windows should set this. */ - si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, - S_028230_ER_TRI(0xA) | - S_028230_ER_POINT(0x5) | - S_028230_ER_RECT(0x9) | - S_028230_ER_LINE_LR(0x2A) | - S_028230_ER_LINE_RL(0x2A) | - S_028230_ER_LINE_TB(0xA) | - S_028230_ER_LINE_BT(0xA)); + rs->pa_sc_edgerule = S_028230_ER_TRI(0xA) | + S_028230_ER_POINT(0x5) | + S_028230_ER_RECT(0x9) | + S_028230_ER_LINE_LR(0x2A) | + S_028230_ER_LINE_RL(0x2A) | + S_028230_ER_LINE_TB(0xA) | + S_028230_ER_LINE_BT(0xA); } else { /* OpenGL FBOs and Direct3D should set this. */ - si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, - S_028230_ER_TRI(0xA) | - S_028230_ER_POINT(0x6) | - S_028230_ER_RECT(0xA) | - S_028230_ER_LINE_LR(0x19) | - S_028230_ER_LINE_RL(0x25) | - S_028230_ER_LINE_TB(0xA) | - S_028230_ER_LINE_BT(0xA)); + rs->pa_sc_edgerule = S_028230_ER_TRI(0xA) | + S_028230_ER_POINT(0x6) | + S_028230_ER_RECT(0xA) | + S_028230_ER_LINE_LR(0x19) | + S_028230_ER_LINE_RL(0x25) | + S_028230_ER_LINE_TB(0xA) | + S_028230_ER_LINE_BT(0xA); } - si_pm4_finalize(pm4); - if (!rs->uses_poly_offset) - return rs; + if (rs->uses_poly_offset) { + /* Calculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ + rs->pa_su_poly_offset_clamp = fui(state->offset_clamp); + rs->pa_su_poly_offset_frontback_scale = fui(state->offset_scale * 16); - rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); - if (!rs->pm4_poly_offset) { - FREE(rs); - return NULL; + if (!state->offset_units_unscaled) { + /* 16-bit zbuffer */ + rs->pa_su_poly_offset_db_fmt_cntl[0] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + rs->pa_su_poly_offset_frontback_offset[0] = fui(state->offset_units * 4); + + /* 24-bit zbuffer */ + rs->pa_su_poly_offset_db_fmt_cntl[1] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); + rs->pa_su_poly_offset_frontback_offset[1] = fui(state->offset_units * 2); + + /* 32-bit zbuffer */ + rs->pa_su_poly_offset_db_fmt_cntl[2] = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | + S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + rs->pa_su_poly_offset_frontback_offset[2] = fui(state->offset_units); + } else { + rs->pa_su_poly_offset_frontback_offset[0] = fui(state->offset_units); + rs->pa_su_poly_offset_frontback_offset[1] = fui(state->offset_units); + rs->pa_su_poly_offset_frontback_offset[2] = fui(state->offset_units); + } } - /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ - for (i = 0; i < 3; i++) { - struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; - float offset_units = state->offset_units; - float offset_scale = state->offset_scale * 16.0f; - uint32_t pa_su_poly_offset_db_fmt_cntl = 0; + return rs; +} - si_pm4_clear_state(pm4, sscreen, false); +static void si_pm4_emit_rasterizer(struct si_context *sctx, unsigned index) +{ + struct si_state_rasterizer *state = sctx->queued.named.rasterizer; - if (!state->offset_units_unscaled) { - switch (i) { - case 0: /* 16-bit zbuffer */ - offset_units *= 4.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); - break; - case 1: /* 24-bit zbuffer */ - offset_units *= 2.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); - break; - case 2: /* 32-bit zbuffer */ - offset_units *= 1.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); - break; - } + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_0286D4_SPI_INTERP_CONTROL_0, SI_TRACKED_SPI_INTERP_CONTROL_0, + state->spi_interp_control_0); + gfx11_opt_set_context_reg(R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE, + state->pa_su_point_size); + gfx11_opt_set_context_reg(R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX, + state->pa_su_point_minmax); + gfx11_opt_set_context_reg(R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL, + state->pa_su_line_cntl); + gfx11_opt_set_context_reg(R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0, + state->pa_sc_mode_cntl_0); + gfx11_opt_set_context_reg(R_028814_PA_SU_SC_MODE_CNTL, SI_TRACKED_PA_SU_SC_MODE_CNTL, + state->pa_su_sc_mode_cntl); + gfx11_opt_set_context_reg(R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + state->pa_cl_ngg_cntl); + gfx11_opt_set_context_reg(R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE, + state->pa_sc_edgerule); + + if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) { + unsigned db_format_index = + ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index; + + gfx11_opt_set_context_reg(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + state->pa_su_poly_offset_db_fmt_cntl[db_format_index]); + gfx11_opt_set_context_reg(R_028B7C_PA_SU_POLY_OFFSET_CLAMP, + SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP, + state->pa_su_poly_offset_clamp); + gfx11_opt_set_context_reg(R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE, + state->pa_su_poly_offset_frontback_scale); + gfx11_opt_set_context_reg(R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + gfx11_opt_set_context_reg(R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE, + state->pa_su_poly_offset_frontback_scale); + gfx11_opt_set_context_reg(R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET, + state->pa_su_poly_offset_frontback_offset[db_format_index]); } - - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); - si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); - si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); - si_pm4_finalize(pm4); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_0286D4_SPI_INTERP_CONTROL_0, + SI_TRACKED_SPI_INTERP_CONTROL_0, + state->spi_interp_control_0); + radeon_opt_set_context_reg(sctx, R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE, + state->pa_su_point_size); + radeon_opt_set_context_reg(sctx, R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX, + state->pa_su_point_minmax); + radeon_opt_set_context_reg(sctx, R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL, + state->pa_su_line_cntl); + radeon_opt_set_context_reg(sctx, R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0, + state->pa_sc_mode_cntl_0); + radeon_opt_set_context_reg(sctx, R_028814_PA_SU_SC_MODE_CNTL, + SI_TRACKED_PA_SU_SC_MODE_CNTL, state->pa_su_sc_mode_cntl); + if (sctx->gfx_level >= GFX10) { + radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + state->pa_cl_ngg_cntl); + } + radeon_opt_set_context_reg(sctx, R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE, + state->pa_sc_edgerule); + + if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) { + unsigned db_format_index = + ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index; + + radeon_opt_set_context_reg6(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + state->pa_su_poly_offset_db_fmt_cntl[db_format_index], + state->pa_su_poly_offset_clamp, + state->pa_su_poly_offset_frontback_scale, + state->pa_su_poly_offset_frontback_offset[db_format_index], + state->pa_su_poly_offset_frontback_scale, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + } + radeon_end_update_context_roll(); } - return rs; + sctx->emitted.named.rasterizer = state; } static void si_bind_rs_state(struct pipe_context *ctx, void *state) @@ -1238,7 +1264,6 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) SET_FIELD(sctx->current_vs_state, VS_STATE_CLAMP_VERTEX_COLOR, rs->clamp_vertex_color); si_pm4_bind_state(sctx, rasterizer, rs); - si_update_poly_offset_state(sctx); if (old_rs->scissor_enable != rs->scissor_enable) si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); @@ -1311,7 +1336,6 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) if (sctx->queued.named.rasterizer == state) si_bind_rs_state(ctx, sctx->discard_rasterizer_state); - FREE(rs->pm4_poly_offset); si_pm4_free_state(sctx, &rs->pm4, SI_STATE_IDX(rasterizer)); } @@ -2748,6 +2772,22 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su if (format == V_028040_Z_INVALID) PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); + /* Use the original Z format, not db_render_format, so that the polygon offset behaves as + * expected by applications. + */ + switch (tex->buffer.b.b.format) { + case PIPE_FORMAT_Z16_UNORM: + surf->db_format_index = 0; + break; + default: /* 24-bit */ + surf->db_format_index = 1; + break; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + surf->db_format_index = 2; + break; + } + if (sctx->gfx_level >= GFX9) { surf->db_htile_data_base = 0; surf->db_htile_surface = 0; @@ -2973,6 +3013,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, bool old_has_stencil = old_has_zsbuf && ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; + uint8_t old_db_format_index = + old_has_zsbuf ? + ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index : -1; int i; /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs @@ -3162,10 +3205,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (!sctx->framebuffer.min_bytes_per_pixel || zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; + + /* Update polygon offset based on the Z format. */ + if (sctx->queued.named.rasterizer->uses_poly_offset && + surf->db_format_index != old_db_format_index) + (sctx)->dirty_atoms |= SI_STATE_BIT(rasterizer); } si_update_ps_colorbuf0_slot(sctx); - si_update_poly_offset_state(sctx); si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); @@ -5761,9 +5808,8 @@ void si_init_state_compute_functions(struct si_context *sctx) void si_init_state_functions(struct si_context *sctx) { sctx->atoms.s.pm4_states[SI_STATE_IDX(blend)].emit = si_pm4_emit_state; - sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_state; + sctx->atoms.s.pm4_states[SI_STATE_IDX(rasterizer)].emit = si_pm4_emit_rasterizer; sctx->atoms.s.pm4_states[SI_STATE_IDX(dsa)].emit = si_pm4_emit_dsa; - sctx->atoms.s.pm4_states[SI_STATE_IDX(poly_offset)].emit = si_pm4_emit_state; sctx->atoms.s.pm4_states[SI_STATE_IDX(ls)].emit = si_pm4_emit_shader; sctx->atoms.s.pm4_states[SI_STATE_IDX(hs)].emit = si_pm4_emit_shader; sctx->atoms.s.pm4_states[SI_STATE_IDX(es)].emit = si_pm4_emit_shader; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 189ae76c99d54..096f94cee391f 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -54,8 +54,21 @@ struct si_state_blend { struct si_state_rasterizer { struct si_pm4_state pm4; - /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */ - struct si_pm4_state *pm4_poly_offset; + + /* Register values. */ + unsigned spi_interp_control_0; + unsigned pa_su_point_size; + unsigned pa_su_point_minmax; + unsigned pa_su_line_cntl; + unsigned pa_sc_mode_cntl_0; + unsigned pa_su_sc_mode_cntl; + unsigned pa_cl_ngg_cntl; + unsigned pa_sc_edgerule; + unsigned pa_su_poly_offset_db_fmt_cntl[3]; + unsigned pa_su_poly_offset_clamp; + unsigned pa_su_poly_offset_frontback_scale; + unsigned pa_su_poly_offset_frontback_offset[3]; + unsigned pa_sc_line_stipple; unsigned pa_cl_clip_cntl; float line_width; @@ -178,7 +191,6 @@ union si_state { struct si_state_blend *blend; struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; - struct si_pm4_state *poly_offset; struct si_shader *ls; struct si_shader *hs; struct si_shader *es; @@ -234,7 +246,7 @@ union si_state_atoms { static inline uint64_t si_atoms_that_always_roll_context(void) { - return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(poly_offset) | + return SI_STATE_BIT(blend) | SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) | SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color)| SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports)| @@ -259,6 +271,22 @@ enum si_tracked_reg SI_TRACKED_DB_DEPTH_BOUNDS_MIN, SI_TRACKED_DB_DEPTH_BOUNDS_MAX, + SI_TRACKED_SPI_INTERP_CONTROL_0, + SI_TRACKED_PA_SU_POINT_SIZE, + SI_TRACKED_PA_SU_POINT_MINMAX, + SI_TRACKED_PA_SU_LINE_CNTL, + SI_TRACKED_PA_SC_MODE_CNTL_0, + SI_TRACKED_PA_SU_SC_MODE_CNTL, + SI_TRACKED_PA_SC_EDGERULE, + + /* 6 consecutive registers */ + SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET, + /* 2 consecutive registers */ SI_TRACKED_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_AA_CONFIG, @@ -304,6 +332,7 @@ enum si_tracked_reg SI_TRACKED_PA_SC_BINNER_CNTL_0, /* GFX9+ */ SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, /* GFX10+ - the SMALL_PRIM_FILTER slot above can be reused */ SI_TRACKED_GE_NGG_SUBGRP_CNTL, /* GFX10+ */ + SI_TRACKED_PA_CL_NGG_CNTL, /* GFX10+ */ SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, /* GFX10.3+ */ /* 3 consecutive registers */ -- GitLab From 2f281b39abe2650d914806e4ad1288bbb1c01567 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Fri, 1 Dec 2023 22:54:59 -0500 Subject: [PATCH 36/38] ac/gpu_info: fix printing radeon_info after adding VPE Fixes: 3ec397819e7 - amd: add new hardware ip for vpe Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/amd/common/ac_gpu_info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index f0f06b66e69d0..9ed47e04dac00 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1701,7 +1701,7 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f) fprintf(f, " pcie_bandwidth = %1.1f GB/s\n", info->pcie_bandwidth_mbps / 1024.0); fprintf(f, " clock_crystal_freq = %i KHz\n", info->clock_crystal_freq); - const char *ip_string[] = { + const char *ip_string[AMD_NUM_IP_TYPES] = { [AMD_IP_GFX] = "GFX", [AMD_IP_COMPUTE] = "COMP", [AMD_IP_SDMA] = "SDMA", @@ -1711,6 +1711,7 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f) [AMD_IP_VCN_DEC] = "VCN_DEC", [AMD_IP_VCN_ENC] = (info->vcn_ip_version >= VCN_4_0_0) ? "VCN" : "VCN_ENC", [AMD_IP_VCN_JPEG] = "VCN_JPG", + [AMD_IP_VPE] = "VPE", }; for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) { -- GitLab From 2e50b6677bf0d82dd4d21e8efb42a09ba8233a60 Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Date: Thu, 7 Dec 2023 09:31:22 +0100 Subject: [PATCH 37/38] radeonsi: update guardband if vs_disables_clipping_viewport changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because the guardband state depends on this value. Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_state_viewport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 88b26580e9c01..b70900d51bf2e 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -610,6 +610,7 @@ void si_update_vs_viewport_state(struct si_context *ctx) if (ctx->vs_disables_clipping_viewport != vs_window_space) { ctx->vs_disables_clipping_viewport = vs_window_space; + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); } -- GitLab From 57e658d0418c178ccacb8faf23f81a4ca37ae0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com> Date: Sun, 26 Nov 2023 17:03:45 -0500 Subject: [PATCH 38/38] radeonsi: rework how guardband registers are updated to decrease overhead See the code comments. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26307> --- src/gallium/drivers/radeonsi/si_pipe.h | 58 +++++++++++++++++-- src/gallium/drivers/radeonsi/si_state.c | 9 ++- .../drivers/radeonsi/si_state_viewport.c | 30 +++------- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 1a122970021fd..ac96d1cbfa71c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1255,6 +1255,23 @@ struct si_context { bool vs_disables_clipping_viewport; bool has_reset_been_notified; + /* The number of pixels outside the viewport that are not culled by the clipper. + * Normally, the clipper clips everything outside the viewport, however, points and lines + * can have vertices outside the viewport, but their edges can be inside the viewport. Those + * shouldn't be culled. The problem is that the register setting (PA_CL_GB_*_DISC_ADJ) that + * controls the discard distance, which depends on the point size and line width, applies to + * all primitive types, and we would have to set 0 distance for triangles and non-zero for + * points and lines whenever the primitive type changes, which would add overhead and cause + * context rolls. + * + * To reduce that, whenever the discard distance changes for points and lines, we keep it + * at that higher value up to a certain small number for all primitive types including all + * points and lines within a specific size. This is slightly inefficient, but it eliminates + * a lot of guardband state updates and context register changes. + */ + float min_clip_discard_distance_watermark; + float current_clip_discard_distance; + /* Precomputed IA_MULTI_VGT_PARAM */ union si_vgt_param_key ia_multi_vgt_param_key; unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES]; @@ -2114,6 +2131,28 @@ void si_check_dirty_buffers_textures(struct si_context *sctx) } } +static inline void si_set_clip_discard_distance(struct si_context *sctx, float distance) +{ + /* Determine whether the guardband registers change. + * + * When we see a value greater than min_clip_discard_distance_watermark, we increase it + * up to a certain number to eliminate those state changes next time they happen. + * See the comment at min_clip_discard_distance_watermark. + */ + if (distance > sctx->min_clip_discard_distance_watermark) { + /* The maximum number was determined from Viewperf. The number is in units of half-pixels. */ + sctx->min_clip_discard_distance_watermark = MIN2(distance, 6); + + float old_distance = sctx->current_clip_discard_distance; + float new_distance = MAX2(distance, sctx->min_clip_discard_distance_watermark); + + if (old_distance != new_distance) { + sctx->current_clip_discard_distance = new_distance; + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + } + } +} + /* Update these two GS_STATE fields. They depend on whatever the last shader before PS is * and the rasterizer state. * @@ -2147,16 +2186,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim, bool is_rect = rast_prim == SI_PRIM_RECTANGLE_LIST; bool is_points = rast_prim == MESA_PRIM_POINTS; bool is_lines = util_prim_is_lines(rast_prim); - bool is_triangles = util_rast_prim_is_triangles(rast_prim); - if ((is_points || is_lines) != util_prim_is_points_or_lines(sctx->current_rast_prim)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + if (is_points) { + si_set_clip_discard_distance(sctx, sctx->queued.named.rasterizer->max_point_size); + sctx->gs_out_prim = V_028A6C_POINTLIST; + } else if (is_lines) { + si_set_clip_discard_distance(sctx, sctx->queued.named.rasterizer->line_width); + sctx->gs_out_prim = V_028A6C_LINESTRIP; + } else if (is_rect) { + /* Don't change the clip discard distance for rectangles. */ + sctx->gs_out_prim = V_028A6C_RECTLIST; + } else { + si_set_clip_discard_distance(sctx, 0); + sctx->gs_out_prim = V_028A6C_TRISTRIP; + } sctx->current_rast_prim = rast_prim; si_vs_ps_key_update_rast_prim_smooth_stipple(sctx); - sctx->gs_out_prim = is_triangles ? V_028A6C_TRISTRIP : - is_lines ? V_028A6C_LINESTRIP : - is_rect ? V_028A6C_RECTLIST : V_028A6C_POINTLIST; si_update_ngg_prim_state_sgpr(sctx, hw_vs, ngg); } } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 4d61823eab3c9..0c47c40695c23 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1268,10 +1268,15 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) if (old_rs->scissor_enable != rs->scissor_enable) si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size || - old_rs->half_pixel_center != rs->half_pixel_center) + /* This never changes for OpenGL. */ + if (old_rs->half_pixel_center != rs->half_pixel_center) si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + if (util_prim_is_lines(sctx->current_rast_prim)) + si_set_clip_discard_distance(sctx, rs->line_width); + else if (sctx->current_rast_prim == MESA_PRIM_POINTS) + si_set_clip_discard_distance(sctx, rs->max_point_size); + if (old_rs->clip_halfz != rs->clip_halfz) si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index b70900d51bf2e..e2de5ffe2f0be 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -249,7 +249,6 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) struct si_signed_scissor vp_as_scissor; struct pipe_viewport_state vp; float left, top, right, bottom, max_range, guardband_x, guardband_y; - float discard_x, discard_y; if (sctx->vs_writes_viewport_index) { /* Shaders can draw to any viewport. Make a union of all @@ -339,28 +338,17 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) guardband_x = MIN2(-left, right); guardband_y = MIN2(-top, bottom); - discard_x = 1.0; - discard_y = 1.0; + float discard_x = 1.0; + float discard_y = 1.0; + float distance = sctx->current_clip_discard_distance; - if (unlikely(util_prim_is_points_or_lines(sctx->current_rast_prim))) { - /* When rendering wide points or lines, we need to be more - * conservative about when to discard them entirely. */ - float pixels; + /* Add half the point size / line width */ + discard_x += distance / (2.0 * vp.scale[0]); + discard_y += distance / (2.0 * vp.scale[1]); - if (sctx->current_rast_prim == MESA_PRIM_POINTS) - pixels = rs->max_point_size; - else - pixels = rs->line_width; - - /* Add half the point size / line width */ - discard_x += pixels / (2.0 * vp.scale[0]); - discard_y += pixels / (2.0 * vp.scale[1]); - - /* Discard primitives that would lie entirely outside the clip - * region. */ - discard_x = MIN2(discard_x, guardband_x); - discard_y = MIN2(discard_y, guardband_y); - } + /* Discard primitives that would lie entirely outside the viewport area. */ + discard_x = MIN2(discard_x, guardband_x); + discard_y = MIN2(discard_y, guardband_y); unsigned pa_su_vtx_cntl = S_028BE4_PIX_CENTER(rs->half_pixel_center) | S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) | -- GitLab