From 03b5a94258ead1bd867d67ba3ff326236d655276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 02:15:15 -0400 Subject: [PATCH 01/42] radeonsi: add const to the key parameter in si_shader_select_with_key The keys will match the current state, so we shouldn't change them. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.h | 2 +- src/gallium/drivers/radeonsi/si_state.h | 3 +- .../drivers/radeonsi/si_state_shaders.c | 35 +++++++++++++++---- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index acb5d6fe3cf3..e943347f3ba8 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -888,7 +888,7 @@ bool gfx10_is_ngg_passthrough(struct si_shader *shader); /* Return the pointer to the main shader part's pointer. */ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) + const struct si_shader_key *key) { if (key->as_ls) return &sel->main_shader_part_ls; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f26087367842..3f230bcf2716 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -576,7 +576,8 @@ void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const uint64_t *samplers_and_images); int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none); + const struct si_shader_key *key, int thread_index, + bool optimized_or_none); void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); unsigned si_get_input_prim(const struct 
si_shader_selector *gs); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index ccbc41951bb4..87300f8d071d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2146,7 +2146,7 @@ static const struct si_shader_key zeroed; static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key) + const struct si_shader_key *key) { struct si_shader **mainp = si_get_main_shader_part(sel, key); @@ -2177,6 +2177,16 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad return true; } +/* A helper to copy *key to *local_key and return local_key. */ +static const struct si_shader_key * +use_local_key_copy(const struct si_shader_key *key, struct si_shader_key *local_key) +{ + if (key != local_key) + memcpy(local_key, key, sizeof(*key)); + + return local_key; +} + /** * Select a shader variant according to the shader key. * @@ -2186,12 +2196,18 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad */ int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none) + const struct si_shader_key *key, int thread_index, + bool optimized_or_none) { struct si_shader_selector *sel = state->cso; struct si_shader_selector *previous_stage_sel = NULL; struct si_shader *current = state->current; struct si_shader *iter, *shader = NULL; + /* si_shader_select_with_key must not modify 'key' because it would affect future shaders. + * If we need to modify it for this specific shader (eg: to disable optimizations), we + * use a copy. + */ + struct si_shader_key local_key; again: /* Check if we don't need to change anything. 
@@ -2204,7 +2220,8 @@ again: if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto current_not_ready; } @@ -2243,9 +2260,10 @@ current_not_ready: key->opt.inlined_uniform_values, MAX_INLINABLE_UNIFORMS * 4) != 0) { if (variant_count++ > max_inline_uniforms_variants) { + key = use_local_key_copy(key, &local_key); /* Too many variants. Disable inlining for this shader. */ - key->opt.inline_uniforms = 0; - memset(key->opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); + local_key.opt.inline_uniforms = 0; + memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); simple_mtx_unlock(&sel->mutex); goto again; } @@ -2262,7 +2280,9 @@ current_not_ready: if (iter->is_optimized) { if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto again; } @@ -2383,7 +2403,8 @@ current_not_ready: } /* Use the default (unoptimized) shader for now. 
*/ - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); simple_mtx_unlock(&sel->mutex); if (sscreen->options.sync_compile) -- GitLab From 0b1fd84950c75e0e6a6052f2b787ada486abe398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 02:18:15 -0400 Subject: [PATCH 02/42] radeonsi: handle NO_OPT_VARIANT in si_shader_select_with_key so as not to change the keys in si_context Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_shaders.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 87300f8d071d..efb689e4a000 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2086,9 +2086,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh default: assert(0); } - - if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) - memset(&key->opt, 0, sizeof(key->opt)); } static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) @@ -2209,6 +2206,12 @@ int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_st */ struct si_shader_key local_key; + if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) { + /* Disable shader variant optimizations. */ + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); + } + again: /* Check if we don't need to change anything. 
* This path is also used for most shaders that don't need multiple -- GitLab From a912c804391dfa00019b1131a3fe896a246abda2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 02:30:38 -0400 Subject: [PATCH 03/42] radeonsi: sink memsets and disable uniform inlining in si_shader_selector_key to facilitate refactoring. Uniform inlining will be re-enabled later. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index efb689e4a000..1f620b5b00c3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1880,10 +1880,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh { struct si_context *sctx = (struct si_context *)ctx; - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - +#if 0 /* TODO: enable this */ unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms; if (num_inlinable_uniforms && sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) { @@ -1892,15 +1889,24 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh sctx->inlinable_uniforms[sel->pipe_shader_type], num_inlinable_uniforms * 4); } +#endif switch (sel->info.stage) { case MESA_SHADER_VERTEX: + memset(&key->part, 0, sizeof(key->part)); + memset(&key->mono, 0, sizeof(key->mono)); + memset(&key->opt, 0, sizeof(key->opt)); + si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) si_shader_selector_key_hw_vs(sctx, sel, key); break; case MESA_SHADER_TESS_CTRL: + memset(&key->part, 0, sizeof(key->part)); + memset(&key->mono, 0, sizeof(key->mono)); + memset(&key->opt, 
0, sizeof(key->opt)); + if (sctx->chip_class >= GFX9) { si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.tcs.ls_prolog); key->part.tcs.ls = sctx->shader.vs.cso; @@ -1931,10 +1937,18 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written; break; case MESA_SHADER_TESS_EVAL: + memset(&key->part, 0, sizeof(key->part)); + memset(&key->mono, 0, sizeof(key->mono)); + memset(&key->opt, 0, sizeof(key->opt)); + if (!sctx->shader.gs.cso) si_shader_selector_key_hw_vs(sctx, sel, key); break; case MESA_SHADER_GEOMETRY: + memset(&key->part, 0, sizeof(key->part)); + memset(&key->mono, 0, sizeof(key->mono)); + memset(&key->opt, 0, sizeof(key->opt)); + if (sctx->chip_class >= GFX9) { if (sctx->shader.tes.cso) { key->part.gs.es = sctx->shader.tes.cso; @@ -1953,6 +1967,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; case MESA_SHADER_FRAGMENT: { + memset(&key->part, 0, sizeof(key->part)); + memset(&key->mono, 0, sizeof(key->mono)); + memset(&key->opt, 0, sizeof(key->opt)); + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct si_state_blend *blend = sctx->queued.named.blend; -- GitLab From 46bda71a547afbde95f724e8af7527209479f220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 02:32:19 -0400 Subject: [PATCH 04/42] radeonsi: move PS shader key code into a separate function There is reordering and new comments. 
Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 286 ++++++++++-------- 1 file changed, 152 insertions(+), 134 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 1f620b5b00c3..f97798cb62e7 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1874,6 +1874,156 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad key->opt.kill_pointsize = 1; } +static void si_update_ps_shader_key(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + + memset(&key->part, 0, sizeof(key->part)); + memset(&key->mono, 0, sizeof(key->mono)); + memset(&key->opt, 0, sizeof(key->opt)); + + /** Framebuffer dependencies. */ + if (sel->info.color0_writes_all_cbufs && + sel->info.colors_written == 0x1) + key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. */ + key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->mono.u.ps.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->mono.u.ps.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } + + /** Framebuffer and blend dependencies. */ + /* Select the shader color format based on whether + * blending or alpha are needed. 
+ */ + struct si_state_blend *blend = sctx->queued.named.blend; + + key->part.ps.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->part.ps.epilog.spi_shader_col_format |= + (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; + } + + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. + */ + if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } + + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->part.ps.epilog.last_cbuf) { + key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; + key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; + } + + /* Eliminate shader code computing output values that are unused. + * This enables dead code elimination between shader parts. 
+ * Check if any output is eliminated. + */ + if (sel->colors_written_4bit & + ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) + key->opt.prefer_mono = 1; + + /** Primitive type and shader dependencies. */ + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); + + /** Blend and rasterizer dependencies. */ + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; + + /** Rasterizer dependencies. */ + key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; + key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; + + /** Primitive type, shader, and rasterizer dependencies. */ + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + + /** Primitive type, shader, rasterizer, and framebuffer dependencies. */ + key->part.ps.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; + + /** Sample shading dependencies. */ + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) + key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + + /** Framebuffer, rasterizer, and sample shading dependencies. 
*/ + bool uses_persp_center = sel->info.uses_persp_center || + (!rs->flatshade && sel->info.uses_persp_center_color); + bool uses_persp_centroid = sel->info.uses_persp_centroid || + (!rs->flatshade && sel->info.uses_persp_centroid_color); + bool uses_persp_sample = sel->info.uses_persp_sample || + (!rs->flatshade && sel->info.uses_persp_sample_color); + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = + uses_persp_center || uses_persp_centroid; + + key->part.ps.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.bc_optimize_for_persp = + uses_persp_center && uses_persp_centroid; + key->part.ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + } else { + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. */ + key->part.ps.prolog.force_persp_center_interp = uses_persp_center + + uses_persp_centroid + + uses_persp_sample > 1; + + key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > 1; + + if (sel->info.uses_interp_at_sample) + key->mono.u.ps.interpolate_at_sample_force_center = 1; + } + + /** DSA dependencies. 
*/ + key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; +} + /* Compute the key for the hw shader variant */ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, struct si_shader_key *key) @@ -1966,141 +2116,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh } key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; - case MESA_SHADER_FRAGMENT: { - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_state_blend *blend = sctx->queued.named.blend; - - if (sel->info.color0_writes_all_cbufs && - sel->info.colors_written == 0x1) - key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; - - /* Select the shader color format based on whether - * blending or alpha are needed. - */ - key->part.ps.epilog.spi_shader_col_format = - (blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend_alpha) | - (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend) | - (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_alpha) | - (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format); - key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; - - /* The output for dual source blending should have - * the same format as the first output. - */ - if (blend->dual_src_blend) { - key->part.ps.epilog.spi_shader_col_format |= - (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; - } - - /* If alpha-to-coverage is enabled, we have to export alpha - * even if there is no color buffer. 
- */ - if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) - key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; - - /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs - * to the range supported by the type if a channel has less - * than 16 bits and the export format is 16_ABGR. - */ - if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { - key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; - key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; - } - - /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->part.ps.epilog.last_cbuf) { - key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; - key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; - key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; - } - - /* Eliminate shader code computing output values that are unused. - * This enables dead code elimination between shader parts. - * Check if any output is eliminated. 
- */ - if (sel->colors_written_4bit & - ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) - key->opt.prefer_mono = 1; - - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); - - key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; - key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; - - key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; - - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->part.ps.epilog.poly_line_smoothing = - ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; - - if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) { - key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); - } - - bool uses_persp_center = sel->info.uses_persp_center || - (!rs->flatshade && sel->info.uses_persp_center_color); - bool uses_persp_centroid = sel->info.uses_persp_centroid || - (!rs->flatshade && sel->info.uses_persp_centroid_color); - bool uses_persp_sample = sel->info.uses_persp_sample || - (!rs->flatshade && sel->info.uses_persp_sample_color); - - if (rs->force_persample_interp && rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { - key->part.ps.prolog.force_persp_sample_interp = - uses_persp_center || uses_persp_centroid; - - key->part.ps.prolog.force_linear_sample_interp = - sel->info.uses_linear_center || sel->info.uses_linear_centroid; - } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { - key->part.ps.prolog.bc_optimize_for_persp = - uses_persp_center && uses_persp_centroid; - key->part.ps.prolog.bc_optimize_for_linear = - sel->info.uses_linear_center && sel->info.uses_linear_centroid; - } else { - /* Make sure SPI doesn't compute 
more than 1 pair - * of (i,j), which is the optimization here. */ - key->part.ps.prolog.force_persp_center_interp = uses_persp_center + - uses_persp_centroid + - uses_persp_sample > 1; - - key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + - sel->info.uses_linear_centroid + - sel->info.uses_linear_sample > 1; - - if (sel->info.uses_interp_at_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; - } - - key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; - - /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - struct pipe_resource *tex = cb0->texture; - - /* 1D textures are allocated and used as 2D on GFX9. */ - key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; - key->mono.u.ps.fbfetch_is_1D = - sctx->chip_class != GFX9 && - (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); - key->mono.u.ps.fbfetch_layered = - tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || - tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || - tex->target == PIPE_TEXTURE_3D; - } + case MESA_SHADER_FRAGMENT: + si_update_ps_shader_key(sctx); break; - } default: assert(0); } -- GitLab From 60580c04c00b0b2e3b992618642df5ee3d27fdbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 02:35:32 -0400 Subject: [PATCH 05/42] radeonsi: don't memset mono and opt in si_update_ps_shader_key Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_shaders.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index f97798cb62e7..30bd35a91285 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ 
b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1880,8 +1880,6 @@ static void si_update_ps_shader_key(struct si_context *sctx) struct si_shader_key *key = &sctx->shader.ps.key; memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); /** Framebuffer dependencies. */ if (sel->info.color0_writes_all_cbufs && @@ -1902,6 +1900,10 @@ static void si_update_ps_shader_key(struct si_context *sctx) tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || tex->target == PIPE_TEXTURE_3D; + } else { + key->mono.u.ps.fbfetch_msaa = 0; + key->mono.u.ps.fbfetch_is_1D = 0; + key->mono.u.ps.fbfetch_layered = 0; } /** Framebuffer and blend dependencies. */ @@ -1958,6 +1960,8 @@ static void si_update_ps_shader_key(struct si_context *sctx) if (sel->colors_written_4bit & ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) key->opt.prefer_mono = 1; + else + key->opt.prefer_mono = 0; /** Primitive type and shader dependencies. */ bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); @@ -1993,6 +1997,8 @@ static void si_update_ps_shader_key(struct si_context *sctx) bool uses_persp_sample = sel->info.uses_persp_sample || (!rs->flatshade && sel->info.uses_persp_sample_color); + key->mono.u.ps.interpolate_at_sample_force_center = 0; + if (rs->force_persample_interp && rs->multisample_enable && sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { key->part.ps.prolog.force_persp_sample_interp = @@ -2016,8 +2022,7 @@ static void si_update_ps_shader_key(struct si_context *sctx) sel->info.uses_linear_centroid + sel->info.uses_linear_sample > 1; - if (sel->info.uses_interp_at_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; + key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; } /** DSA dependencies. 
*/ -- GitLab From 59072ee4847dc2b61d67746611bac2d77fec4ea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 02:35:32 -0400 Subject: [PATCH 06/42] radeonsi: don't memset part in si_update_ps_shader_key Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 30bd35a91285..02cf1915db3a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1879,12 +1879,12 @@ static void si_update_ps_shader_key(struct si_context *sctx) struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; - memset(&key->part, 0, sizeof(key->part)); - /** Framebuffer dependencies. */ if (sel->info.color0_writes_all_cbufs && sel->info.colors_written == 0x1) key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + else + key->part.ps.epilog.last_cbuf = 0; /* ps_uses_fbfetch is true only if the color buffer is bound. */ if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { @@ -1988,6 +1988,8 @@ static void si_update_ps_shader_key(struct si_context *sctx) /** Sample shading dependencies. */ if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + else + key->part.ps.prolog.samplemask_log_ps_iter = 0; /** Framebuffer, rasterizer, and sample shading dependencies. 
*/ bool uses_persp_center = sel->info.uses_persp_center || @@ -1997,8 +1999,6 @@ static void si_update_ps_shader_key(struct si_context *sctx) bool uses_persp_sample = sel->info.uses_persp_sample || (!rs->flatshade && sel->info.uses_persp_sample_color); - key->mono.u.ps.interpolate_at_sample_force_center = 0; - if (rs->force_persample_interp && rs->multisample_enable && sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { key->part.ps.prolog.force_persp_sample_interp = @@ -2006,12 +2006,26 @@ static void si_update_ps_shader_key(struct si_context *sctx) key->part.ps.prolog.force_linear_sample_interp = sel->info.uses_linear_center || sel->info.uses_linear_centroid; + + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = 0; } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; key->part.ps.prolog.bc_optimize_for_persp = uses_persp_center && uses_persp_centroid; key->part.ps.prolog.bc_optimize_for_linear = sel->info.uses_linear_center && sel->info.uses_linear_centroid; + key->mono.u.ps.interpolate_at_sample_force_center = 0; } else { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + /* Make sure SPI doesn't compute more than 1 pair * of (i,j), which is the optimization here. 
*/ key->part.ps.prolog.force_persp_center_interp = uses_persp_center + @@ -2021,7 +2035,8 @@ static void si_update_ps_shader_key(struct si_context *sctx) key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + sel->info.uses_linear_centroid + sel->info.uses_linear_sample > 1; - + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; } -- GitLab From 00d1d947eabb66c05c67d00b87f6a0aa680ce5a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 03:04:31 -0400 Subject: [PATCH 07/42] radeonsi: divide si_update_ps_shader_key into many separate functions they will be used in bind functions etc. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 92 +++++++++++++++---- 1 file changed, 72 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 02cf1915db3a..69dc557f8310 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1874,12 +1874,14 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad key->opt.kill_pointsize = 1; } -static void si_update_ps_shader_key(struct si_context *sctx) +static void si_ps_key_update_framebuffer(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; - /** Framebuffer dependencies. 
*/ + if (!sel) + return; + if (sel->info.color0_writes_all_cbufs && sel->info.colors_written == 0x1) key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; @@ -1905,13 +1907,20 @@ static void si_update_ps_shader_key(struct si_context *sctx) key->mono.u.ps.fbfetch_is_1D = 0; key->mono.u.ps.fbfetch_layered = 0; } +} + +static void si_ps_key_update_framebuffer_blend(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + + if (!sel) + return; - /** Framebuffer and blend dependencies. */ /* Select the shader color format based on whether * blending or alpha are needed. */ - struct si_state_blend *blend = sctx->queued.named.blend; - key->part.ps.epilog.spi_shader_col_format = (blend->blend_enable_4bit & blend->need_src_alpha_4bit & sctx->framebuffer.spi_shader_col_format_blend_alpha) | @@ -1962,36 +1971,75 @@ static void si_update_ps_shader_key(struct si_context *sctx) key->opt.prefer_mono = 1; else key->opt.prefer_mono = 0; +} - /** Primitive type and shader dependencies. */ - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); - - /** Blend and rasterizer dependencies. */ +static void si_ps_key_update_blend_rasterizer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; +} + +static void si_ps_key_update_rasterizer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!sel) + return; - /** Rasterizer dependencies. 
*/ key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; +} - /** Primitive type, shader, and rasterizer dependencies. */ - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; +static void si_ps_key_update_dsa(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + + key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; +} + +static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); - /** Primitive type, shader, rasterizer, and framebuffer dependencies. */ + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && sctx->framebuffer.nr_samples <= 1; +} + +static void si_ps_key_update_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + + if (!sel) + return; - /** Sample shading dependencies. 
*/ if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); else key->part.ps.prolog.samplemask_log_ps_iter = 0; +} + +static void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!sel) + return; - /** Framebuffer, rasterizer, and sample shading dependencies. */ bool uses_persp_center = sel->info.uses_persp_center || (!rs->flatshade && sel->info.uses_persp_center_color); bool uses_persp_centroid = sel->info.uses_persp_centroid || @@ -2039,9 +2087,6 @@ static void si_update_ps_shader_key(struct si_context *sctx) key->part.ps.prolog.bc_optimize_for_linear = 0; key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; } - - /** DSA dependencies. */ - key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; } /* Compute the key for the hw shader variant */ @@ -2137,7 +2182,14 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; case MESA_SHADER_FRAGMENT: - si_update_ps_shader_key(sctx); + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_dsa(sctx); + si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); break; default: assert(0); -- GitLab From 7e3c03bc6ae906a93b4a6df88a3a47196337962d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 03:10:46 -0400 Subject: [PATCH 08/42] radeonsi: ignore blitter when computing the PS shader key it doesn't have any 
effect Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_shaders.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 69dc557f8310..6d467d9fa8f3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1889,7 +1889,7 @@ static void si_ps_key_update_framebuffer(struct si_context *sctx) key->part.ps.epilog.last_cbuf = 0; /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { + if (sctx->ps_uses_fbfetch) { struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; struct pipe_resource *tex = cb0->texture; -- GitLab From dbdde903bb91992c66be6fdd069fdde94b3b9d9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 03:21:55 -0400 Subject: [PATCH 09/42] radeonsi: update most of the PS shader key in set & bind functions This decreases overhead of si_update_shaders and overall driver overhead. There is only one function that depends on the rasterized primitive type, and thus it can't be moved to set & bind functions. 
Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++-- src/gallium/drivers/radeonsi/si_state.h | 7 +++++ .../drivers/radeonsi/si_state_shaders.c | 29 ++++++++++--------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 0ce9f50d2505..cb4bf2d8ebf8 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -709,8 +709,11 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -1119,8 +1122,12 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || old_rs->clamp_fragment_color != rs->clamp_fragment_color || old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; + } } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -1336,8 +1343,10 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (old_dsa->alpha_func != dsa->alpha_func) + if 
(old_dsa->alpha_func != dsa->alpha_func) { + si_ps_key_update_dsa(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || @@ -2983,6 +2992,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { @@ -3635,6 +3647,9 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) return; sctx->ps_iter_samples = min_samples; + + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; si_update_ps_iter_samples(sctx); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 3f230bcf2716..8d42a186bea1 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -582,6 +582,13 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); +void si_ps_key_update_framebuffer(struct si_context *sctx); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx); +void si_ps_key_update_blend_rasterizer(struct si_context *sctx); +void si_ps_key_update_rasterizer(struct si_context *sctx); +void si_ps_key_update_dsa(struct si_context *sctx); +void si_ps_key_update_sample_shading(struct si_context *sctx); +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); /* si_state_draw.c */ void si_init_draw_functions_GFX6(struct si_context *sctx); diff --git 
a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 6d467d9fa8f3..c5e424ca9754 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1874,7 +1874,7 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad key->opt.kill_pointsize = 1; } -static void si_ps_key_update_framebuffer(struct si_context *sctx) +void si_ps_key_update_framebuffer(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; @@ -1909,7 +1909,7 @@ static void si_ps_key_update_framebuffer(struct si_context *sctx) } } -static void si_ps_key_update_framebuffer_blend(struct si_context *sctx) +void si_ps_key_update_framebuffer_blend(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; @@ -1973,7 +1973,7 @@ static void si_ps_key_update_framebuffer_blend(struct si_context *sctx) key->opt.prefer_mono = 0; } -static void si_ps_key_update_blend_rasterizer(struct si_context *sctx) +void si_ps_key_update_blend_rasterizer(struct si_context *sctx) { struct si_shader_key *key = &sctx->shader.ps.key; struct si_state_blend *blend = sctx->queued.named.blend; @@ -1982,7 +1982,7 @@ static void si_ps_key_update_blend_rasterizer(struct si_context *sctx) key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; } -static void si_ps_key_update_rasterizer(struct si_context *sctx) +void si_ps_key_update_rasterizer(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; @@ -1996,7 +1996,7 @@ static void si_ps_key_update_rasterizer(struct si_context *sctx) key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; } -static void si_ps_key_update_dsa(struct si_context *sctx) +void si_ps_key_update_dsa(struct si_context *sctx) { struct 
si_shader_key *key = &sctx->shader.ps.key; @@ -2017,7 +2017,7 @@ static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_co sctx->framebuffer.nr_samples <= 1; } -static void si_ps_key_update_sample_shading(struct si_context *sctx) +void si_ps_key_update_sample_shading(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; @@ -2031,7 +2031,7 @@ static void si_ps_key_update_sample_shading(struct si_context *sctx) key->part.ps.prolog.samplemask_log_ps_iter = 0; } -static void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) { struct si_shader_selector *sel = sctx->shader.ps.cso; struct si_shader_key *key = &sctx->shader.ps.key; @@ -2182,14 +2182,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; case MESA_SHADER_FRAGMENT: - si_ps_key_update_framebuffer(sctx); - si_ps_key_update_framebuffer_blend(sctx); - si_ps_key_update_blend_rasterizer(sctx); - si_ps_key_update_rasterizer(sctx); - si_ps_key_update_dsa(sctx); si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); - si_ps_key_update_sample_shading(sctx); - si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); break; default: assert(0); @@ -3348,6 +3341,14 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } si_update_ps_colorbuf0_slot(sctx); + + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_dsa(sctx); + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader 
*shader) -- GitLab From 74a0c9bd519f88176b2d504422651144acacf85a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 06:56:06 -0400 Subject: [PATCH 10/42] radeonsi: clean up and clear VS shader key fields related to outputs Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c5e424ca9754..8369699746d8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1822,8 +1822,8 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto key->mono.vs_fetch_opencode = opencode; } -static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key) +static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) { struct si_shader_selector *ps = sctx->shader.ps.cso; @@ -1861,17 +1861,27 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad if (vs->info.stage != MESA_SHADER_GEOMETRY) { key->opt.ngg_culling = sctx->ngg_culling; - - if (sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; + key->mono.u.vs_export_prim_id = sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; + } else { + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; } /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. 
*/ - if (sctx->chip_class >= GFX10 && - vs->info.writes_psize && - sctx->current_rast_prim != PIPE_PRIM_POINTS && - !sctx->queued.named.rasterizer->polygon_mode_is_points) - key->opt.kill_pointsize = 1; + key->opt.kill_pointsize = sctx->chip_class >= GFX10 && + vs->info.writes_psize && + sctx->current_rast_prim != PIPE_PRIM_POINTS && + !sctx->queued.named.rasterizer->polygon_mode_is_points; +} + +static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) +{ + key->opt.kill_clip_distances = 0; + key->opt.kill_outputs = 0; + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; + key->opt.kill_pointsize = 0; } void si_ps_key_update_framebuffer(struct si_context *sctx) @@ -2115,7 +2125,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) - si_shader_selector_key_hw_vs(sctx, sel, key); + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); break; case MESA_SHADER_TESS_CTRL: memset(&key->part, 0, sizeof(key->part)); @@ -2157,7 +2169,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh memset(&key->opt, 0, sizeof(key->opt)); if (!sctx->shader.gs.cso) - si_shader_selector_key_hw_vs(sctx, sel, key); + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); break; case MESA_SHADER_GEOMETRY: memset(&key->part, 0, sizeof(key->part)); @@ -2174,7 +2188,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh /* Only NGG can eliminate GS outputs, because the code is shared with VS. */ if (sctx->ngg) - si_shader_selector_key_hw_vs(sctx, sel, key); + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); /* This enables jumping over the VS prolog for GS-only waves. 
*/ key->opt.prefer_mono = 1; -- GitLab From aed93eb991a2b5fe61b764259ad5d524ff3ca103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 06:16:07 -0400 Subject: [PATCH 11/42] radeonsi: update the VS shader key in set & bind functions and remove memsets This decreases overhead of si_update_shaders and overall driver overhead. The VS shader key portion related to VS inputs is updated in set & bind functions. Other fields related to outputs are still updated in si_shader_selector_key. Now that all modified fields are set to 0 when not needed, remove the memsets. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state.c | 8 ++- src/gallium/drivers/radeonsi/si_state.h | 5 +- .../drivers/radeonsi/si_state_shaders.c | 64 +++++++++++-------- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index cb4bf2d8ebf8..cf63ddd333ac 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -4890,8 +4890,10 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) * src_offset alignment, which is reflected in fix_fetch_opencode. */ old->fix_fetch_opencode != v->fix_fetch_opencode || memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * - MAX2(old->count, v->count))) + MAX2(old->count, v->count))) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } if (v->instance_divisor_is_fetched) { struct pipe_constant_buffer cb; @@ -4987,8 +4989,10 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, * be the case in well-behaved applications anyway.
*/ if ((sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) + (unaligned | orig_unaligned) & updated_mask)) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } } /* diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 8d42a186bea1..285b74dddc16 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -578,8 +578,9 @@ int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_st struct si_compiler_ctx_state *compiler_state, const struct si_shader_key *key, int thread_index, bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); +void si_vs_key_update_inputs(struct si_context *sctx); +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); void si_ps_key_update_framebuffer(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8369699746d8..5d6667e6e57c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1785,16 +1785,31 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader } } -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key) +static void si_clear_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - if (vs->info.base.vs.blit_sgprs_amd) - return; + prolog_key->instance_divisor_is_one = 0; + prolog_key->instance_divisor_is_fetched = 0; + 
key->mono.vs_fetch_opencode = 0; + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); +} +void si_vs_key_update_inputs(struct si_context *sctx) +{ + struct si_shader_selector *vs = sctx->shader.vs.cso; struct si_vertex_elements *elts = sctx->vertex_elements; + struct si_shader_key *key = &sctx->shader.vs.key; - prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; - prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + if (!vs) + return; + + if (vs->info.base.vs.blit_sgprs_amd) { + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); + return; + } + + key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; + key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; unsigned count_mask = (1 << vs->info.num_inputs) - 1; unsigned fix = elts->fix_fetch_always & count_mask; @@ -1815,6 +1830,8 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto } } + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); + while (fix) { unsigned i = u_bit_scan(&fix); key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; @@ -1822,6 +1839,17 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto key->mono.vs_fetch_opencode = opencode; } +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) +{ + prolog_key->instance_divisor_is_one = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_fetched; + + key->mono.vs_fetch_opencode = sctx->shader.vs.key.mono.vs_fetch_opencode; + memcpy(key->mono.vs_fix_fetch, sctx->shader.vs.key.mono.vs_fix_fetch, + sizeof(key->mono.vs_fix_fetch)); +} + static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, struct si_shader_key *key) { @@ -2118,24 +2146,14 @@ static inline void 
si_shader_selector_key(struct pipe_context *ctx, struct si_sh switch (sel->info.stage) { case MESA_SHADER_VERTEX: - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - - si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); - if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) si_get_vs_key_outputs(sctx, sel, key); else si_clear_vs_key_outputs(sctx, sel, key); break; case MESA_SHADER_TESS_CTRL: - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - if (sctx->chip_class >= GFX9) { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.tcs.ls_prolog); + si_get_vs_key_inputs(sctx, key, &key->part.tcs.ls_prolog); key->part.tcs.ls = sctx->shader.vs.cso; /* When the LS VGPR fix is needed, monolithic shaders @@ -2164,25 +2182,18 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written; break; case MESA_SHADER_TESS_EVAL: - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - if (!sctx->shader.gs.cso) si_get_vs_key_outputs(sctx, sel, key); else si_clear_vs_key_outputs(sctx, sel, key); break; case MESA_SHADER_GEOMETRY: - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - if (sctx->chip_class >= GFX9) { if (sctx->shader.tes.cso) { + si_clear_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); key->part.gs.es = sctx->shader.tes.cso; } else { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.gs.vs_prolog); + si_get_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); key->part.gs.es = sctx->shader.vs.cso; } @@ -3194,6 +3205,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) si_update_clip_regs(sctx, 
old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, si_get_vs(sctx)->current); si_update_rasterized_prim(sctx); + si_vs_key_update_inputs(sctx); } static void si_update_tess_uses_prim_id(struct si_context *sctx) -- GitLab From 6d1ab77a8f8a013c30a79d968db70fa13b8214dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 06:33:44 -0400 Subject: [PATCH 12/42] radeonsi: rewrite inlinable uniform states for shader keys in si_context directly update the shader keys in si_context Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_descriptors.c | 32 +++++++++++++------ src/gallium/drivers/radeonsi/si_pipe.h | 2 -- src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_shaders.c | 15 +-------- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 93c9f28bee7d..7b0a9da74974 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1225,6 +1225,19 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res sctx->descriptors_dirty |= 1u << descriptors_idx; } +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader) +{ + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (sctx->shaders[shader].key.opt.inline_uniforms) { + sctx->shaders[shader].key.opt.inline_uniforms = false; + memset(sctx->shaders[shader].key.opt.inlined_uniform_values, 0, + sizeof(sctx->shaders[shader].key.opt.inlined_uniform_values)); + sctx->do_update_shaders = true; + } +} + static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, uint slot, bool take_ownership, const struct pipe_constant_buffer *input) @@ -1244,10 +1257,8 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad si_resource(input->buffer)->bind_history |= 
PIPE_BIND_CONSTANT_BUFFER; } - if (slot == 0) { - /* Invalidate current inlinable uniforms. */ - sctx->inlinable_uniforms_valid_mask &= ~(1 << shader); - } + if (slot == 0) + si_invalidate_inlinable_uniforms(sctx, shader); } slot = si_get_constbuf_slot(slot); @@ -1262,10 +1273,13 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; - if (!(sctx->inlinable_uniforms_valid_mask & BITFIELD_BIT(shader))) { + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (!sctx->shaders[shader].key.opt.inline_uniforms) { /* It's the first time we set the constants. Always update shaders. */ - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); - sctx->inlinable_uniforms_valid_mask |= BITFIELD_BIT(shader); + sctx->shaders[shader].key.opt.inline_uniforms = true; + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); sctx->do_update_shaders = true; return; } @@ -1273,8 +1287,8 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, /* We have already set inlinable constants for this shader. Update the shader only if * the constants are being changed so as not to update shaders needlessly. 
*/ - if (memcmp(sctx->inlinable_uniforms[shader], values, num_values * 4)) { - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); + if (memcmp(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4)) { + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); sctx->do_update_shaders = true; } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 057fd9aef55b..92762bc881ab 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1054,8 +1054,6 @@ struct si_context { unsigned descriptors_dirty; unsigned shader_pointers_dirty; unsigned shader_needs_decompress_mask; - unsigned inlinable_uniforms_valid_mask; - uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS]; struct si_buffer_resources internal_bindings; struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; struct si_samplers samplers[SI_NUM_SHADERS]; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 285b74dddc16..c84db4e3addf 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -488,6 +488,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture /* restrict decreases overhead of si_set_sampler_view_desc ~8x. 
*/ bool is_stencil, uint16_t access, uint32_t * restrict state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader); void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, struct pipe_constant_buffer *cbuf); void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 5d6667e6e57c..8c2e91b2ebcc 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2133,17 +2133,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh { struct si_context *sctx = (struct si_context *)ctx; -#if 0 /* TODO: enable this */ - unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms; - if (num_inlinable_uniforms && - sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) { - key->opt.inline_uniforms = true; - memcpy(key->opt.inlined_uniform_values, - sctx->inlinable_uniforms[sel->pipe_shader_type], - num_inlinable_uniforms * 4); - } -#endif - switch (sel->info.stage) { case MESA_SHADER_VERTEX: if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) @@ -3174,9 +3163,7 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha si_shader_uses_bindless_images(sctx->shader.tcs.cso) || si_shader_uses_bindless_images(sctx->shader.tes.cso); - /* Invalidate inlinable uniforms. 
*/ - sctx->inlinable_uniforms_valid_mask &= ~(1 << type); - + si_invalidate_inlinable_uniforms(sctx, type); sctx->do_update_shaders = true; } -- GitLab From 35a42377cd2aeceaadf149a7e9bedebc6019d902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 07:31:29 -0400 Subject: [PATCH 13/42] radeonsi: move si_shader_io_get_unique_index calls out of si_get_vs_key_outputs Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_shaders.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8c2e91b2ebcc..e7797f977021 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1874,11 +1874,6 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele uint64_t outputs_written = vs->outputs_written_before_ps; uint64_t inputs_read = 0; - /* Ignore outputs that are not passed from VS to PS. */ - outputs_written &= ~((1ull << si_shader_io_get_unique_index(VARYING_SLOT_POS, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_PSIZ, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_CLIP_VERTEX, true))); - if (!ps_disabled) { inputs_read = ps->inputs_read; } @@ -2881,8 +2876,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_EDGE) { sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); - sel->outputs_written_before_ps |= 1ull - << si_shader_io_get_unique_index(semantic, true); + + /* Ignore outputs that are not passed from VS to PS. 
*/ + if (semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX) { + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(semantic, true); + } } } } -- GitLab From 76fe6a024276572c25c91dd3d9825e079f7f916c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 07:47:32 -0400 Subject: [PATCH 14/42] radeonsi: move PS inputs_read computation out of si_get_vs_key_outputs to reduce overhead of si_update_shaders Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 4 ++++ src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_shaders.c | 23 ++++++++++--------- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 92762bc881ab..9f6383965ccc 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1037,6 +1037,7 @@ struct si_context { struct si_cs_shader_state cs_shader_state; /* shader information */ + uint64_t ps_inputs_read_or_disabled; struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; unsigned sprite_coord_enable; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index cf63ddd333ac..be84e9b50cda 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -712,6 +712,7 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { si_ps_key_update_framebuffer_blend(sctx); si_ps_key_update_blend_rasterizer(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; } @@ -1126,6 +1127,7 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) si_ps_key_update_blend_rasterizer(sctx); 
si_ps_key_update_rasterizer(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; } } @@ -1345,6 +1347,7 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) if (old_dsa->alpha_func != dsa->alpha_func) { si_ps_key_update_dsa(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; } @@ -2995,6 +2998,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_ps_key_update_framebuffer(sctx); si_ps_key_update_framebuffer_blend(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index c84db4e3addf..a1b3e421b570 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -582,6 +582,7 @@ int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_st void si_vs_key_update_inputs(struct si_context *sctx); void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); void si_ps_key_update_framebuffer(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index e7797f977021..19e887cb4a70 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1850,13 +1850,10 @@ void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, sizeof(key->mono.vs_fix_fetch)); } -static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, - struct 
si_shader_key *key) +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) { struct si_shader_selector *ps = sctx->shader.ps.cso; - key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; - /* Find out if PS is disabled. */ bool ps_disabled = true; if (ps) { @@ -1870,15 +1867,18 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); } - /* Find out which VS outputs aren't used by the PS. */ - uint64_t outputs_written = vs->outputs_written_before_ps; - uint64_t inputs_read = 0; + sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read; +} - if (!ps_disabled) { - inputs_read = ps->inputs_read; - } +static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) +{ - uint64_t linked = outputs_written & inputs_read; + key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; + + /* Find out which VS outputs aren't used by the PS. 
*/ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; key->opt.kill_outputs = ~linked & outputs_written; @@ -3365,6 +3365,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_ps_key_update_dsa(sctx); si_ps_key_update_sample_shading(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) -- GitLab From dff6dc031617f7903155e611563cbf8a93342ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 08:23:50 -0400 Subject: [PATCH 15/42] radeonsi: unset SI_PREFETCH_* only when we unbind pm4 shader states Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 19e887cb4a70..210597f95e0f 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -4201,9 +4201,12 @@ bool si_update_shaders(struct si_context *sctx) } } } else { - if (sctx->chip_class <= GFX8) + if (sctx->chip_class <= GFX8) { si_pm4_bind_state(sctx, ls, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + } si_pm4_bind_state(sctx, hs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; } /* Update GS. 
*/ @@ -4219,12 +4222,16 @@ bool si_update_shaders(struct si_context *sctx) return false; } else { si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; } } else { if (!sctx->ngg) { si_pm4_bind_state(sctx, gs, NULL); - if (sctx->chip_class <= GFX8) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + if (sctx->chip_class <= GFX8) { si_pm4_bind_state(sctx, es, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + } } } @@ -4238,6 +4245,7 @@ bool si_update_shaders(struct si_context *sctx) if (sctx->ngg) { si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; } else { si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); } @@ -4358,33 +4366,21 @@ bool si_update_shaders(struct si_context *sctx) if (sctx->chip_class >= GFX7) { if (si_pm4_state_enabled_and_changed(sctx, ls)) sctx->prefetch_L2_mask |= SI_PREFETCH_LS; - else if (!sctx->queued.named.ls) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; if (si_pm4_state_enabled_and_changed(sctx, hs)) sctx->prefetch_L2_mask |= SI_PREFETCH_HS; - else if (!sctx->queued.named.hs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; if (si_pm4_state_enabled_and_changed(sctx, es)) sctx->prefetch_L2_mask |= SI_PREFETCH_ES; - else if (!sctx->queued.named.es) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; if (si_pm4_state_enabled_and_changed(sctx, gs)) sctx->prefetch_L2_mask |= SI_PREFETCH_GS; - else if (!sctx->queued.named.gs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; if (si_pm4_state_enabled_and_changed(sctx, vs)) sctx->prefetch_L2_mask |= SI_PREFETCH_VS; - else if (!sctx->queued.named.vs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; if (si_pm4_state_enabled_and_changed(sctx, ps)) sctx->prefetch_L2_mask |= SI_PREFETCH_PS; - else if (!sctx->queued.named.ps) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; } sctx->do_update_shaders = false; -- GitLab From 98d07e1928af458f0832904752b86d061239bd05 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 08:44:49 -0400 Subject: [PATCH 16/42] radeonsi: make si_update_shaders a C++ template in si_state_draw.cpp This reduces driver overhead. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 3 + src/gallium/drivers/radeonsi/si_state.h | 6 +- .../drivers/radeonsi/si_state_draw.cpp | 272 ++++++++++++++++- .../drivers/radeonsi/si_state_shaders.c | 287 +----------------- 4 files changed, 285 insertions(+), 283 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9f6383965ccc..598bc1d2fdba 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1578,6 +1578,9 @@ bool si_init_thread_trace(struct si_context *sctx); void si_destroy_thread_trace(struct si_context *sctx); void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs); +/* si_state_shaders.c */ +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key); + /* * common helpers */ diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index a1b3e421b570..581df046d311 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -564,7 +564,6 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache); bool si_shader_mem_ordered(struct si_shader *shader); -bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); @@ -579,6 +578,8 @@ int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_st struct 
si_compiler_ctx_state *compiler_state, const struct si_shader_key *key, int thread_index, bool optimized_or_none); +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, + struct si_compiler_ctx_state *compiler_state); void si_vs_key_update_inputs(struct si_context *sctx); void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); @@ -592,6 +593,9 @@ void si_ps_key_update_rasterizer(struct si_context *sctx); void si_ps_key_update_dsa(struct si_context *sctx); void si_ps_key_update_sample_shading(struct si_context *sctx); void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); +void si_init_tess_factor_ring(struct si_context *sctx); +bool si_update_gs_ring_buffers(struct si_context *sctx); +bool si_update_spi_tmpring_size(struct si_context *sctx); /* si_state_draw.c */ void si_init_draw_functions_GFX6(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index f8084e8a4821..b115dd1b619d 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -47,6 +47,276 @@ /* special primitive types */ #define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX +template<chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> +static bool si_update_shaders(struct si_context *sctx) +{ + struct pipe_context *ctx = (struct pipe_context *)sctx; + struct si_compiler_ctx_state compiler_state; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader *old_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; + unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0; + struct si_shader *old_ps = sctx->shader.ps.current; + unsigned old_spi_shader_col_format = + old_ps ?
old_ps->key.part.ps.epilog.spi_shader_col_format : 0; + int r; + + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + /* Update TCS and TES. */ + if (HAS_TESS) { + if (!sctx->tess_rings) { + si_init_tess_factor_ring(sctx); + if (!sctx->tess_rings) + return false; + } + + if (sctx->shader.tcs.cso) { + r = si_shader_select(ctx, &sctx->shader.tcs, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4); + } else { + if (!sctx->fixed_func_tcs_shader.cso) { + sctx->fixed_func_tcs_shader.cso = + (struct si_shader_selector*)si_create_fixed_func_tcs(sctx); + if (!sctx->fixed_func_tcs_shader.cso) + return false; + } + + r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); + } + + if (!HAS_GS || GFX_VERSION <= GFX8) { + r = si_shader_select(ctx, &sctx->shader.tes, &compiler_state); + if (r) + return false; + + if (HAS_GS) { + /* TES as ES */ + assert(GFX_VERSION <= GFX8); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); + } else if (NGG) { + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); + } else { + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); + } + } + } else { + if (GFX_VERSION <= GFX8) { + si_pm4_bind_state(sctx, ls, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + } + si_pm4_bind_state(sctx, hs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; + } + + /* Update GS. 
*/ + if (HAS_GS) { + r = si_shader_select(ctx, &sctx->shader.gs, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); + if (!NGG) { + si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4); + + if (!si_update_gs_ring_buffers(sctx)) + return false; + } else { + si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + } + } else { + if (!NGG) { + si_pm4_bind_state(sctx, gs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + if (GFX_VERSION <= GFX8) { + si_pm4_bind_state(sctx, es, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + } + } + } + + /* Update VS. */ + if ((!HAS_TESS && !HAS_GS) || GFX_VERSION <= GFX8) { + r = si_shader_select(ctx, &sctx->shader.vs, &compiler_state); + if (r) + return false; + + if (!HAS_TESS && !HAS_GS) { + if (NGG) { + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + } else { + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); + } + } else if (HAS_TESS) { + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); + } else { + assert(HAS_GS); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); + } + } + + sctx->vs_uses_base_instance = + sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance : + sctx->queued.named.hs ? sctx->queued.named.hs->shader->uses_base_instance : + sctx->shader.gs.current->uses_base_instance; + + union si_vgt_stages_key key; + key.index = 0; + + /* Update VGT_SHADER_CONFIG. */ + if (HAS_TESS) + key.u.tess = 1; + if (HAS_GS) + key.u.gs = 1; + + if (NGG) { + struct si_shader *vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; + + key.u.ngg = 1; + key.u.streamout = !!si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso->so.num_outputs; + /* These must be done after the shader variant is selected. 
*/ + key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); + key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + } + + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; + if (unlikely(!*pm4)) + *pm4 = si_build_vgt_shader_config(sctx->screen, key); + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); + + if (old_kill_clip_distances != si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.kill_clip_distances) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + r = si_shader_select(ctx, &sctx->shader.ps, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); + + unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); + + if (si_pm4_state_changed(sctx, ps) || + (!NGG && si_pm4_state_changed(sctx, vs)) || + (NGG && si_pm4_state_changed(sctx, gs)) || + sctx->sprite_coord_enable != rs->sprite_coord_enable || + sctx->flatshade != rs->flatshade) { + sctx->sprite_coord_enable = rs->sprite_coord_enable; + sctx->flatshade = rs->flatshade; + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + } + + if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) && + si_pm4_state_changed(sctx, ps) && + (!old_ps || old_spi_shader_col_format != + sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } + + if (sctx->smoothing_enabled != + sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) { + sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing; + si_mark_atom_dirty(sctx, 
&sctx->atoms.s.msaa_config); + + /* NGG cull state uses smoothing_enabled. */ + if (GFX_VERSION >= GFX10 && sctx->screen->use_ngg_culling) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); + + if (GFX_VERSION == GFX6) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + if (sctx->framebuffer.nr_samples <= 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + if (GFX_VERSION >= GFX10_3) { + struct si_shader_info *info = &sctx->shader.ps.cso->info; + bool allow_flat_shading = info->allow_flat_shading; + + if (allow_flat_shading && + (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || + (!rs->flatshade && info->uses_interp_color))) + allow_flat_shading = false; + + if (sctx->allow_flat_shading != allow_flat_shading) { + sctx->allow_flat_shading = allow_flat_shading; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + } + + if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { + /* Pretend the bound shaders form a vk pipeline */ + uint32_t pipeline_code_hash = 0; + uint64_t base_address = ~0; + + for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + struct si_shader *shader = sctx->shaders[i].current; + if (sctx->shaders[i].cso && shader) { + pipeline_code_hash = _mesa_hash_data_with_seed( + shader->binary.elf_buffer, + shader->binary.elf_size, + pipeline_code_hash); + base_address = MIN2(base_address, + shader->bo->gpu_address); + } + } + + struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; + if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { + si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false); + } + + si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0); + } + + if ((GFX_VERSION <= GFX8 && + (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, es))) || + si_pm4_state_enabled_and_changed(sctx, hs) || si_pm4_state_enabled_and_changed(sctx, gs) || + 
si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { + if (!si_update_spi_tmpring_size(sctx)) + return false; + + if (GFX_VERSION >= GFX7) { + if (GFX_VERSION <= GFX8 && HAS_TESS && si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + + if (HAS_TESS && si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + + if (GFX_VERSION <= GFX8 && HAS_GS && si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + + if ((HAS_GS || NGG) && si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; + + if (!NGG && si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + } + } + + sctx->do_update_shaders = false; + return true; +} + ALWAYS_INLINE static unsigned si_conv_pipe_prim(unsigned mode) { @@ -1886,7 +2156,7 @@ static void si_draw_vbo(struct pipe_context *ctx, } if (unlikely(sctx->do_update_shaders)) { - if (unlikely(!si_update_shaders(sctx))) { + if (unlikely(!(si_update_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) { DRAW_CLEANUP; return; } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 210597f95e0f..703987e74275 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2554,8 +2554,8 @@ current_not_ready: return shader->compilation_failed ?
-1 : 0; } -static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state) +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, + struct si_compiler_ctx_state *compiler_state) { struct si_context *sctx = (struct si_context *)ctx; @@ -3629,7 +3629,7 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) } /* Initialize state related to ESGS / GSVS ring buffers */ -static bool si_update_gs_ring_buffers(struct si_context *sctx) +bool si_update_gs_ring_buffers(struct si_context *sctx) { struct si_shader_selector *es = sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso; @@ -3904,7 +3904,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return true; } -static bool si_update_spi_tmpring_size(struct si_context *sctx) +bool si_update_spi_tmpring_size(struct si_context *sctx) { /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. * There are 2 cases to handle: @@ -3968,7 +3968,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) return true; } -static void si_init_tess_factor_ring(struct si_context *sctx) +void si_init_tess_factor_ring(struct si_context *sctx) { assert(!sctx->tess_rings); assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); @@ -4068,8 +4068,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } -static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, - union si_vgt_stages_key key) +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key) { struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); uint32_t stages = 0; @@ -4113,280 +4112,6 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, return pm4; } -static void si_update_vgt_shader_config(struct si_context *sctx) -{ - union 
si_vgt_stages_key key; - key.index = 0; - - if (sctx->shader.tes.cso) - key.u.tess = 1; - if (sctx->shader.gs.cso) - key.u.gs = 1; - - if (sctx->ngg) { - struct si_shader *vs = si_get_vs(sctx)->current; - - key.u.ngg = 1; - key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs; - /* These must be done after the shader variant is selected. */ - key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); - key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); - } - - struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; - - if (unlikely(!*pm4)) - *pm4 = si_build_vgt_shader_config(sctx->screen, key); - si_pm4_bind_state(sctx, vgt_shader_config, *pm4); -} - -bool si_update_shaders(struct si_context *sctx) -{ - struct pipe_context *ctx = (struct pipe_context *)sctx; - struct si_compiler_ctx_state compiler_state; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader *old_vs = si_get_vs(sctx)->current; - unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0; - struct si_shader *old_ps = sctx->shader.ps.current; - unsigned old_spi_shader_col_format = - old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; - int r; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - /* Update TCS and TES. 
*/ - if (sctx->shader.tes.cso) { - if (!sctx->tess_rings) { - si_init_tess_factor_ring(sctx); - if (!sctx->tess_rings) - return false; - } - - if (sctx->shader.tcs.cso) { - r = si_shader_select(ctx, &sctx->shader.tcs, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4); - } else { - if (!sctx->fixed_func_tcs_shader.cso) { - sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx); - if (!sctx->fixed_func_tcs_shader.cso) - return false; - } - - r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); - } - - if (!sctx->shader.gs.cso || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->shader.tes, &compiler_state); - if (r) - return false; - - if (sctx->shader.gs.cso) { - /* TES as ES */ - assert(sctx->chip_class <= GFX8); - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); - } else if (sctx->ngg) { - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); - } else { - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); - } - } - } else { - if (sctx->chip_class <= GFX8) { - si_pm4_bind_state(sctx, ls, NULL); - sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; - } - si_pm4_bind_state(sctx, hs, NULL); - sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; - } - - /* Update GS. 
*/ - if (sctx->shader.gs.cso) { - r = si_shader_select(ctx, &sctx->shader.gs, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); - if (!sctx->ngg) { - si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4); - - if (!si_update_gs_ring_buffers(sctx)) - return false; - } else { - si_pm4_bind_state(sctx, vs, NULL); - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; - } - } else { - if (!sctx->ngg) { - si_pm4_bind_state(sctx, gs, NULL); - sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; - if (sctx->chip_class <= GFX8) { - si_pm4_bind_state(sctx, es, NULL); - sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; - } - } - } - - /* Update VS. */ - if ((!sctx->shader.tes.cso && !sctx->shader.gs.cso) || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->shader.vs, &compiler_state); - if (r) - return false; - - if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) { - if (sctx->ngg) { - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); - si_pm4_bind_state(sctx, vs, NULL); - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; - } else { - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); - } - } else if (sctx->shader.tes.cso) { - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); - } else { - assert(sctx->shader.gs.cso); - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); - } - } - - sctx->vs_uses_base_instance = - sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance : - sctx->queued.named.hs ? 
sctx->queued.named.hs->shader->uses_base_instance : - sctx->shader.gs.current->uses_base_instance; - - si_update_vgt_shader_config(sctx); - - if (old_kill_clip_distances != si_get_vs(sctx)->current->key.opt.kill_clip_distances) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - if (sctx->shader.ps.cso) { - unsigned db_shader_control; - - r = si_shader_select(ctx, &sctx->shader.ps, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); - - db_shader_control = sctx->shader.ps.cso->db_shader_control | - S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); - - if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) || - (sctx->ngg && si_pm4_state_changed(sctx, gs)) || - sctx->sprite_coord_enable != rs->sprite_coord_enable || - sctx->flatshade != rs->flatshade) { - sctx->sprite_coord_enable = rs->sprite_coord_enable; - sctx->flatshade = rs->flatshade; - si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); - } - - if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) && - (!old_ps || old_spi_shader_col_format != - sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (sctx->ps_db_shader_control != db_shader_control) { - sctx->ps_db_shader_control = db_shader_control; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - - if (sctx->smoothing_enabled != - sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) { - sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing; - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - /* NGG cull state uses smoothing_enabled. 
*/ - if (sctx->screen->use_ngg_culling) - si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); - - if (sctx->chip_class == GFX6) - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - if (sctx->framebuffer.nr_samples <= 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - if (sctx->chip_class >= GFX10_3) { - struct si_shader_info *info = &sctx->shader.ps.cso->info; - bool allow_flat_shading = info->allow_flat_shading; - - if (allow_flat_shading && - (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || - (!rs->flatshade && info->uses_interp_color))) - allow_flat_shading = false; - - if (sctx->allow_flat_shading != allow_flat_shading) { - sctx->allow_flat_shading = allow_flat_shading; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - } - } - - if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { - /* Pretend the bound shaders form a vk pipeline */ - uint32_t pipeline_code_hash = 0; - uint64_t base_address = ~0; - - for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { - struct si_shader *shader = sctx->shaders[i].current; - if (sctx->shaders[i].cso && shader) { - pipeline_code_hash = _mesa_hash_data_with_seed( - shader->binary.elf_buffer, - shader->binary.elf_size, - pipeline_code_hash); - base_address = MIN2(base_address, - shader->bo->gpu_address); - } - } - - struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { - si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false); - } - - si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0); - } - - if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) || - si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) || - si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { - if (!si_update_spi_tmpring_size(sctx)) - return 
false; - } - - if (sctx->chip_class >= GFX7) { - if (si_pm4_state_enabled_and_changed(sctx, ls)) - sctx->prefetch_L2_mask |= SI_PREFETCH_LS; - - if (si_pm4_state_enabled_and_changed(sctx, hs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_HS; - - if (si_pm4_state_enabled_and_changed(sctx, es)) - sctx->prefetch_L2_mask |= SI_PREFETCH_ES; - - if (si_pm4_state_enabled_and_changed(sctx, gs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_GS; - - if (si_pm4_state_enabled_and_changed(sctx, vs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_VS; - - if (si_pm4_state_enabled_and_changed(sctx, ps)) - sctx->prefetch_L2_mask |= SI_PREFETCH_PS; - } - - sctx->do_update_shaders = false; - return true; -} - static void si_emit_scratch_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; -- GitLab From 2dfe01db01552bd0f4b5380e1edc308b52134813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 09:14:48 -0400 Subject: [PATCH 17/42] radeonsi: optimize scratch buffer size updates using C++ template arguments Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state.h | 2 +- .../drivers/radeonsi/si_state_draw.cpp | 29 ++++++++++++++++++- .../drivers/radeonsi/si_state_shaders.c | 18 +----------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 581df046d311..f750ea4bbe1b 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -595,7 +595,7 @@ void si_ps_key_update_sample_shading(struct si_context *sctx); void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); void si_init_tess_factor_ring(struct si_context *sctx); bool si_update_gs_ring_buffers(struct si_context *sctx); -bool si_update_spi_tmpring_size(struct si_context *sctx); +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes); /* si_state_draw.c */ void 
si_init_draw_functions_GFX6(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index b115dd1b619d..0940e69dd38d 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -289,7 +289,34 @@ static bool si_update_shaders(struct si_context *sctx) (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, es))) || si_pm4_state_enabled_and_changed(sctx, hs) || si_pm4_state_enabled_and_changed(sctx, gs) || si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { - if (!si_update_spi_tmpring_size(sctx)) + unsigned scratch_size = 0; + + if (HAS_TESS) { + if (GFX_VERSION <= GFX8) /* LS */ + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->queued.named.hs->shader->config.scratch_bytes_per_wave); + + if (HAS_GS) { + if (GFX_VERSION <= GFX8) /* ES */ + scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave); + } else { + scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave); + } + } else if (HAS_GS) { + if (GFX_VERSION <= GFX8) /* ES */ + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave); + } else { + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + } + + scratch_size = MAX2(scratch_size, sctx->shader.ps.current->config.scratch_bytes_per_wave); + + if (scratch_size && !si_update_spi_tmpring_size(sctx, scratch_size)) return false; if (GFX_VERSION >= GFX7) { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c index 703987e74275..7e4543d87011 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3832,11 +3832,6 @@ static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *s return 1; } -static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) -{ - return shader ? shader->config.scratch_bytes_per_wave : 0; -} - static struct si_shader *si_get_tcs_current(struct si_context *sctx) { if (!sctx->shader.tes.cso) @@ -3904,7 +3899,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return true; } -bool si_update_spi_tmpring_size(struct si_context *sctx) +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. * There are 2 cases to handle: @@ -3919,17 +3914,6 @@ bool si_update_spi_tmpring_size(struct si_context *sctx) * Otherwise, the number of waves that can use scratch is * SPI_TMPRING_SIZE.WAVES. 
*/ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.ps.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.gs.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.vs.current)); - - if (sctx->shader.tes.cso) { - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.tes.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); - } - sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; -- GitLab From 70220a18967888446aedf5848201cad72ef8d8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 11:08:44 -0400 Subject: [PATCH 18/42] radeonsi: check flatshade and sprite_coord_enable for spi_map in bind_rs_state it doesn't need to be in si_update_shaders Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 2 -- src/gallium/drivers/radeonsi/si_state.c | 4 ++++ src/gallium/drivers/radeonsi/si_state_draw.cpp | 7 +------ src/gallium/drivers/radeonsi/si_state_shaders.c | 5 +++-- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 598bc1d2fdba..c65d7dbdf086 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1040,9 +1040,7 @@ struct si_context { uint64_t ps_inputs_read_or_disabled; struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; - unsigned sprite_coord_enable; unsigned cs_max_waves_per_sh; - bool flatshade; bool do_update_shaders; bool compute_shaderbuf_sgprs_dirty; bool compute_image_sgprs_dirty; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index be84e9b50cda..3abe32afc7e7 100644 --- 
a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1114,6 +1114,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade) + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 0940e69dd38d..c53c7c18a3dc 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -208,13 +208,8 @@ static bool si_update_shaders(struct si_context *sctx) if (si_pm4_state_changed(sctx, ps) || (!NGG && si_pm4_state_changed(sctx, vs)) || - (NGG && si_pm4_state_changed(sctx, gs)) || - sctx->sprite_coord_enable != rs->sprite_coord_enable || - sctx->flatshade != rs->flatshade) { - sctx->sprite_coord_enable = rs->sprite_coord_enable; - sctx->flatshade = rs->flatshade; + (NGG && si_pm4_state_changed(sctx, gs))) si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); - } if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) && si_pm4_state_changed(sctx, ps) && diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 7e4543d87011..55708b096459 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3478,16 +3478,17 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader * ubyte fp16_lo_hi_mask) { struct si_shader_info *vsinfo = &vs->selector->info; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned 
offset, ps_input_cntl = 0; if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && sctx->flatshade) || + (interpolate == INTERP_MODE_COLOR && rs->flatshade) || semantic == VARYING_SLOT_PRIMITIVE_ID) ps_input_cntl |= S_028644_FLAT_SHADE(1); if (semantic == VARYING_SLOT_PNTC || (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - sctx->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { + rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); if (fp16_lo_hi_mask & 0x1) { ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | -- GitLab From eed149aa7cae51207cb28b3d489db0471167561e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 11:14:50 -0400 Subject: [PATCH 19/42] radeonsi: move DB_SHADER_CONTROL update for PS out of si_update_shaders It only depends on the pixel shader CSO and alpha test. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state.c | 1 + src/gallium/drivers/radeonsi/si_state.h | 1 + src/gallium/drivers/radeonsi/si_state_draw.cpp | 10 ---------- src/gallium/drivers/radeonsi/si_state_shaders.c | 17 +++++++++++++++++ 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 3abe32afc7e7..abed98ed8aee 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1352,6 +1352,7 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) if (old_dsa->alpha_func != dsa->alpha_func) { si_ps_key_update_dsa(sctx); si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); sctx->do_update_shaders = true; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f750ea4bbe1b..595358067c09 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ 
b/src/gallium/drivers/radeonsi/si_state.h @@ -584,6 +584,7 @@ void si_vs_key_update_inputs(struct si_context *sctx); void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); +void si_update_ps_kill_enable(struct si_context *sctx); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); void si_ps_key_update_framebuffer(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index c53c7c18a3dc..1c4e6aa2aa39 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -203,9 +203,6 @@ static bool si_update_shaders(struct si_context *sctx) return false; si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); - unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | - S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); - if (si_pm4_state_changed(sctx, ps) || (!NGG && si_pm4_state_changed(sctx, vs)) || (NGG && si_pm4_state_changed(sctx, gs))) @@ -217,13 +214,6 @@ static bool si_update_shaders(struct si_context *sctx) sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format)) si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - if (sctx->ps_db_shader_control != db_shader_control) { - sctx->ps_db_shader_control = db_shader_control; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - if (sctx->smoothing_enabled != sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) { sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 
55708b096459..b0c7751f5830 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3329,6 +3329,22 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) si_update_rasterized_prim(sctx); } +void si_update_ps_kill_enable(struct si_context *sctx) +{ + if (!sctx->shader.ps.cso) + return; + + unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } +} + static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -3366,6 +3382,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_ps_key_update_sample_shading(sctx); si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) -- GitLab From 5a131566b15fb699681cc01c4329bacbaa75997b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 11:22:07 -0400 Subject: [PATCH 20/42] radeonsi: move flat shading VRS enablement out of si_update_shaders Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state.c | 6 ++++++ src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_draw.cpp | 16 --------------- .../drivers/radeonsi/si_state_shaders.c | 20 +++++++++++++++++++ 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index abed98ed8aee..4d278cfb7707 100644 --- 
a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1134,6 +1134,12 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; } + + if (old_rs->line_smooth != rs->line_smooth || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->flatshade != rs->flatshade) + si_update_vrs_flat_shading(sctx); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 595358067c09..0e0a0f955fc4 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -585,6 +585,7 @@ void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); void si_update_ps_kill_enable(struct si_context *sctx); +void si_update_vrs_flat_shading(struct si_context *sctx); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); void si_ps_key_update_framebuffer(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 1c4e6aa2aa39..e66dc69bd3f4 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -52,7 +52,6 @@ static bool si_update_shaders(struct si_context *sctx) { struct pipe_context *ctx = (struct pipe_context *)sctx; struct si_compiler_ctx_state compiler_state; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct si_shader *old_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; unsigned old_kill_clip_distances = old_vs ? 
old_vs->key.opt.kill_clip_distances : 0; struct si_shader *old_ps = sctx->shader.ps.current; @@ -230,21 +229,6 @@ static bool si_update_shaders(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } - if (GFX_VERSION >= GFX10_3) { - struct si_shader_info *info = &sctx->shader.ps.cso->info; - bool allow_flat_shading = info->allow_flat_shading; - - if (allow_flat_shading && - (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || - (!rs->flatshade && info->uses_interp_color))) - allow_flat_shading = false; - - if (sctx->allow_flat_shading != allow_flat_shading) { - sctx->allow_flat_shading = allow_flat_shading; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - } - if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { /* Pretend the bound shaders form a vk pipeline */ uint32_t pipeline_code_hash = 0; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index b0c7751f5830..358ae4a91d88 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3345,6 +3345,25 @@ void si_update_ps_kill_enable(struct si_context *sctx) } } +void si_update_vrs_flat_shading(struct si_context *sctx) +{ + if (sctx->chip_class >= GFX10_3 && sctx->shader.ps.cso) { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_info *info = &sctx->shader.ps.cso->info; + bool allow_flat_shading = info->allow_flat_shading; + + if (allow_flat_shading && + (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || + (!rs->flatshade && info->uses_interp_color))) + allow_flat_shading = false; + + if (sctx->allow_flat_shading != allow_flat_shading) { + sctx->allow_flat_shading = allow_flat_shading; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + } +} + static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct 
si_context *)ctx; @@ -3383,6 +3402,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); si_update_ps_inputs_read_or_disabled(sctx); si_update_ps_kill_enable(sctx); + si_update_vrs_flat_shading(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) -- GitLab From 7a20110ad3469e0090312311fd9462b013721e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 11:27:53 -0400 Subject: [PATCH 21/42] radeonsi: precompute si_vgt_stages_key for NGG in si_shader to remove this overhead from si_update_shaders Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 29 ------------------ src/gallium/drivers/radeonsi/si_shader.h | 30 +++++++++++++++++++ .../drivers/radeonsi/si_state_draw.cpp | 12 ++------ .../drivers/radeonsi/si_state_shaders.c | 6 ++++ 4 files changed, 38 insertions(+), 39 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index c65d7dbdf086..3823803b3724 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -837,35 +837,6 @@ union si_vgt_param_key { uint16_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) - -/* The VGT_SHADER_STAGES key used to index the table of precomputed values. - * Some fields are set by state-change calls, most are set by draw_vbo. 
- */ -union si_vgt_stages_key { - struct { -#if UTIL_ARCH_LITTLE_ENDIAN - uint8_t tess : 1; - uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg : 1; /* gfx10+ */ - uint8_t streamout : 1; /* only used with NGG */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* UTIL_ARCH_BIG_ENDIAN */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; - uint8_t streamout : 1; - uint8_t ngg : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t gs : 1; - uint8_t tess : 1; -#endif - } u; - uint8_t index; -}; - struct si_texture_handle { unsigned desc_slot; bool desc_dirty; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index e943347f3ba8..b14a9a27f28f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -729,6 +729,35 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; +#define SI_NUM_VGT_STAGES_KEY_BITS 6 +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) + +/* The VGT_SHADER_STAGES key used to index the table of precomputed values. + * Some fields are set by state-change calls, most are set by draw_vbo. 
+ */ +union si_vgt_stages_key { + struct { +#if UTIL_ARCH_LITTLE_ENDIAN + uint8_t tess : 1; + uint8_t gs : 1; + uint8_t ngg_gs_fast_launch : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg : 1; /* gfx10+ */ + uint8_t streamout : 1; /* only used with NGG */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; +#else /* UTIL_ARCH_BIG_ENDIAN */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; + uint8_t streamout : 1; + uint8_t ngg : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg_gs_fast_launch : 1; + uint8_t gs : 1; + uint8_t tess : 1; +#endif + } u; + uint8_t index; +}; + struct si_shader { struct si_compiler_ctx_state compiler_ctx_state; @@ -812,6 +841,7 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + union si_vgt_stages_key vgt_stages; } ngg; struct { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index e66dc69bd3f4..4b76d0cd6c5d 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -178,16 +178,8 @@ static bool si_update_shaders(struct si_context *sctx) key.u.tess = 1; if (HAS_GS) key.u.gs = 1; - - if (NGG) { - struct si_shader *vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; - - key.u.ngg = 1; - key.u.streamout = !!si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso->so.num_outputs; - /* These must be done after the shader variant is selected. 
*/ - key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); - key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); - } + if (NGG) + key.index |= si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ctx_reg.ngg.vgt_stages.index; struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; if (unlikely(!*pm4)) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 358ae4a91d88..d04f538052a5 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1346,6 +1346,12 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); } + + shader->ctx_reg.ngg.vgt_stages.u.ngg = 1; + shader->ctx_reg.ngg.vgt_stages.u.streamout = !!gs_sel->so.num_outputs; + shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader); + shader->ctx_reg.ngg.vgt_stages.u.ngg_gs_fast_launch = + !!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } static void si_emit_shader_vs(struct si_context *sctx) -- GitLab From a65f99b2d165b6b0612fc03e4ff479c6f79d5087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 11:47:47 -0400 Subject: [PATCH 22/42] radeonsi: deduplicate si_compiler_ctx_state initialization to remove it from si_update_shaders Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state.h | 6 ++---- .../drivers/radeonsi/si_state_draw.cpp | 20 ++++++------------ .../drivers/radeonsi/si_state_shaders.c | 21 ++++++++++++------- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 0e0a0f955fc4..7555a1525640 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++
b/src/gallium/drivers/radeonsi/si_state.h @@ -574,12 +574,10 @@ void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage, util_queue_execute_func execute); void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers, uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, const struct si_shader_key *key, int thread_index, bool optimized_or_none); -int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state); +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state); void si_vs_key_update_inputs(struct si_context *sctx); void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 4b76d0cd6c5d..81d2fdccd5cb 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -51,7 +51,6 @@ template current; unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0; struct si_shader *old_ps = sctx->shader.ps.current; @@ -59,13 +58,6 @@ static bool si_update_shaders(struct si_context *sctx) old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; int r; - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - /* Update TCS and TES. 
*/ if (HAS_TESS) { if (!sctx->tess_rings) { @@ -75,7 +67,7 @@ static bool si_update_shaders(struct si_context *sctx) } if (sctx->shader.tcs.cso) { - r = si_shader_select(ctx, &sctx->shader.tcs, &compiler_state); + r = si_shader_select(ctx, &sctx->shader.tcs); if (r) return false; si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4); @@ -87,14 +79,14 @@ static bool si_update_shaders(struct si_context *sctx) return false; } - r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, &compiler_state); + r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader); if (r) return false; si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); } if (!HAS_GS || GFX_VERSION <= GFX8) { - r = si_shader_select(ctx, &sctx->shader.tes, &compiler_state); + r = si_shader_select(ctx, &sctx->shader.tes); if (r) return false; @@ -119,7 +111,7 @@ static bool si_update_shaders(struct si_context *sctx) /* Update GS. */ if (HAS_GS) { - r = si_shader_select(ctx, &sctx->shader.gs, &compiler_state); + r = si_shader_select(ctx, &sctx->shader.gs); if (r) return false; si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); @@ -145,7 +137,7 @@ static bool si_update_shaders(struct si_context *sctx) /* Update VS. 
*/ if ((!HAS_TESS && !HAS_GS) || GFX_VERSION <= GFX8) { - r = si_shader_select(ctx, &sctx->shader.vs, &compiler_state); + r = si_shader_select(ctx, &sctx->shader.vs); if (r) return false; @@ -189,7 +181,7 @@ static bool si_update_shaders(struct si_context *sctx) if (old_kill_clip_distances != si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.kill_clip_distances) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - r = si_shader_select(ctx, &sctx->shader.ps, &compiler_state); + r = si_shader_select(ctx, &sctx->shader.ps); if (r) return false; si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index d04f538052a5..c5f01e70cfb8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2309,11 +2309,11 @@ use_local_key_copy(const struct si_shader_key *key, struct si_shader_key *local_ * the compilation isn't finished, don't select any * shader and return an error. 
*/ -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, const struct si_shader_key *key, int thread_index, bool optimized_or_none) { + struct si_screen *sscreen = sctx->screen; struct si_shader_selector *sel = state->cso; struct si_shader_selector *previous_stage_sel = NULL; struct si_shader *current = state->current; @@ -2428,9 +2428,14 @@ current_not_ready: util_queue_fence_init(&shader->ready); + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + shader->selector = sel; shader->key = *key; - shader->compiler_ctx_state = *compiler_state; + shader->compiler_ctx_state.compiler = &sctx->compiler; + shader->compiler_ctx_state.debug = sctx->debug; + shader->compiler_ctx_state.is_debug_context = sctx->is_debug; /* If this is a merged shader, get the first shader's selector. */ if (sscreen->info.chip_class >= GFX9) { @@ -2476,12 +2481,13 @@ current_not_ready: } simple_mtx_lock(&previous_stage_sel->mutex); - ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state, + &shader1_key); simple_mtx_unlock(&previous_stage_sel->mutex); } if (ok) { - ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); + ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, key); } if (!ok) { @@ -2560,13 +2566,12 @@ current_not_ready: return shader->compilation_failed ? 
-1 : 0; } -int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state) +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state) { struct si_context *sctx = (struct si_context *)ctx; si_shader_selector_key(ctx, state->cso, &state->key); - return si_shader_select_with_key(sctx->screen, state, compiler_state, &state->key, -1, false); + return si_shader_select_with_key(sctx, state, &state->key, -1, false); } static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, -- GitLab From 3ea3621b8d4f608f64961a3bd7fb184e317390d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 19:37:25 -0400 Subject: [PATCH 23/42] radeonsi: determine num_vbos_in_user_sgprs from template arguments in draw_vbo Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_blit.c | 6 ++++-- src/gallium/drivers/radeonsi/si_descriptors.c | 12 ++++++++---- src/gallium/drivers/radeonsi/si_pipe.c | 5 ----- src/gallium/drivers/radeonsi/si_pipe.h | 15 ++++++++++++++- src/gallium/drivers/radeonsi/si_state.c | 3 ++- src/gallium/drivers/radeonsi/si_state_draw.cpp | 3 +-- src/gallium/drivers/radeonsi/si_state_shaders.c | 3 ++- 7 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 00081dd6391b..6de37ab2bd89 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -98,11 +98,13 @@ void si_blitter_end(struct si_context *sctx) /* Restore shader pointers because the VS blit shader changed all * non-global VS user SGPRs. 
*/ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 7b0a9da74974..0a563b3caa0f 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1954,11 +1954,13 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); if (shader == PIPE_SHADER_VERTEX) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; } si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); @@ -1966,12 +1968,14 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad void si_shader_pointers_mark_dirty(struct si_context *sctx) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - 
sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index e77eb0e4de27..619008209ba3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1155,11 +1155,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; - /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't - * have to allocate and count references for the upload buffer. - */ - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && sscreen->info.family != CHIP_CARRIZO && diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 3823803b3724..efa853751b75 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -518,7 +518,6 @@ struct si_screen { unsigned width, unsigned height, unsigned depth, uint32_t *state, uint32_t *fmask_state); - unsigned num_vbos_in_user_sgprs; unsigned max_memory_usage_kb; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; @@ -1959,6 +1958,20 @@ static inline unsigned si_get_num_coverage_samples(struct si_context *sctx) return 1; } +static unsigned ALWAYS_INLINE +si_num_vbos_in_user_sgprs_inline(enum chip_class chip_class) +{ + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. 
+ */ + return chip_class >= GFX9 ? 5 : 1; +} + +static inline unsigned si_num_vbos_in_user_sgprs(struct si_screen *sscreen) +{ + return si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 4d278cfb7707..8180201bd28d 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -4680,8 +4680,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->count = count; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sscreen); unsigned alloc_count = - count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0; + count > num_vbos_in_user_sgprs ? count - num_vbos_in_user_sgprs : 0; v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 81d2fdccd5cb..c95e3938f36f 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1605,6 +1605,7 @@ template num_vertex_elements; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION); bool pointer_dirty, user_sgprs_dirty; assert(count <= SI_MAX_ATTRIBS); @@ -1641,7 +1642,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) } unsigned first_vb_use_mask = velems->first_vb_use_mask; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; for (unsigned i = 0; i < count; i++) { struct pipe_vertex_buffer *vb; @@ -1706,7 +1706,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) if (pointer_dirty || user_sgprs_dirty) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned num_vbos_in_user_sgprs = 
sctx->screen->num_vbos_in_user_sgprs; unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); assert(count); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c5f01e70cfb8..221b9a431539 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2861,7 +2861,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd ? sel->info.num_inputs : 0; - sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs); /* The prolog is a no-op if there are no inputs. */ sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && -- GitLab From 5cdbbcc2abb4442c24571f777964f41e7cd95d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 00:28:57 -0400 Subject: [PATCH 24/42] radeonsi: eliminate a not-found conditional for PrimID in si_get_ps_input_cntl Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader_nir.c | 11 +++++++++++ src/gallium/drivers/radeonsi/si_state_shaders.c | 6 +----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 832dd4fed5ca..8da24a4bd68b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -483,6 +483,17 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf scan_instruction(nir, info, instr); } + if (info->stage == MESA_SHADER_VERTEX || info->stage == MESA_SHADER_TESS_EVAL) { + /* Add the PrimitiveID output, but don't 
increment num_outputs. + * The driver inserts PrimitiveID only when it's used by the pixel shader, + * and si_emit_spi_map uses this unconditionally when such a pixel shader is used. + */ + info->output_semantic[info->num_outputs] = VARYING_SLOT_PRIMITIVE_ID; + info->output_semantic_to_slot[VARYING_SLOT_PRIMITIVE_ID] = info->num_outputs; + info->output_type[info->num_outputs] = nir_type_uint32; + info->output_usagemask[info->num_outputs] = 0x1; + } + if (nir->info.stage == MESA_SHADER_FRAGMENT) { info->allow_flat_shading = !(info->uses_persp_center || info->uses_persp_centroid || info->uses_persp_sample || info->uses_linear_center || diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 221b9a431539..b217ec7eb54c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3571,11 +3571,7 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader * S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); } } else { - /* VS output not found. */ - if (semantic == VARYING_SLOT_PRIMITIVE_ID) { - /* PrimID is written after the last output when HW VS is used. */ - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { /* No corresponding output found, load defaults into input. * Don't set any other bits. 
* (FLAT_SHADE=1 completely changes behavior) */ -- GitLab From b59bb9c07ae47f9f59bd7fcad6af77c591bb3611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 00:12:05 -0400 Subject: [PATCH 25/42] radeonsi: force flat for PrimID early in si_nir_scan_shader Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader_nir.c | 6 +++++- src/gallium/drivers/radeonsi/si_state_shaders.c | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 8da24a4bd68b..0da9054b561c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -114,7 +114,11 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr unsigned loc = driver_location + i; info->input_semantic[loc] = semantic + i; - info->input_interpolate[loc] = interp; + + if (semantic == SYSTEM_VALUE_PRIMITIVE_ID) + info->input_interpolate[loc] = INTERP_MODE_FLAT; + else + info->input_interpolate[loc] = interp; if (mask) { info->input_usage_mask[loc] |= mask; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index b217ec7eb54c..bec6524c4514 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3531,8 +3531,7 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader * unsigned offset, ps_input_cntl = 0; if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && rs->flatshade) || - semantic == VARYING_SLOT_PRIMITIVE_ID) + (interpolate == INTERP_MODE_COLOR && rs->flatshade)) ps_input_cntl |= S_028644_FLAT_SHADE(1); if (semantic == VARYING_SLOT_PNTC || -- GitLab From 11d1309d8276768aca3f02ddd7cc57cdb323113c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 10 Aug 2021 23:50:09 -0400 
Subject: [PATCH 26/42] radeonsi: restructure si_get_ps_input_cntl for future refactoring Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index bec6524c4514..c8235ddf15fe 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3530,20 +3530,6 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader * struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned offset, ps_input_cntl = 0; - if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && rs->flatshade)) - ps_input_cntl |= S_028644_FLAT_SHADE(1); - - if (semantic == VARYING_SLOT_PNTC || - (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { - ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); - if (fp16_lo_hi_mask & 0x1) { - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_ATTR0_VALID(1); - } - } - int vs_slot = vsinfo->output_semantic_to_slot[semantic]; if (vs_slot >= 0) { offset = vs->info.vs_output_param_offset[vs_slot]; @@ -3551,16 +3537,23 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader * if (offset <= AC_EXP_PARAM_OFFSET_31) { /* The input is loaded from parameter memory. */ ps_input_cntl |= S_028644_OFFSET(offset); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + + if (interpolate == INTERP_MODE_FLAT || + (interpolate == INTERP_MODE_COLOR && rs->flatshade)) { + ps_input_cntl |= S_028644_FLAT_SHADE(1); + } + } else { /* The input is a DEFAULT_VAL constant. 
*/ assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); + /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); } - if (fp16_lo_hi_mask && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + if (fp16_lo_hi_mask) { assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | @@ -3570,14 +3563,21 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader * S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); } } else { - if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* No corresponding output found, load defaults into input. - * Don't set any other bits. - * (FLAT_SHADE=1 completely changes behavior) */ - ps_input_cntl = S_028644_OFFSET(0x20); - /* D3D 9 behaviour. GL is undefined */ - if (semantic == VARYING_SLOT_COL0) - ps_input_cntl |= S_028644_DEFAULT_VAL(3); + /* No corresponding output found, load defaults into input. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + /* D3D 9 behaviour for COLOR0. GL is undefined */ + S_028644_DEFAULT_VAL(semantic == VARYING_SLOT_COL1 ? 3 : 0); + } + + if (semantic == VARYING_SLOT_PNTC || + (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && + rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { + /* Overwrite the whole value for sprite coordinates. 
*/ + ps_input_cntl = S_028644_OFFSET(0) | + S_028644_PT_SPRITE_TEX(1); + if (fp16_lo_hi_mask & 0x1) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); } } -- GitLab From 46802f7b608b7e6f809033f671aedce8e93064a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 00:48:17 -0400 Subject: [PATCH 27/42] radeonsi: interleave si_shader_info::input_* in memory for faster emit_spi_map Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.h | 15 ++++++++++---- .../drivers/radeonsi/si_shader_llvm_gs.c | 2 +- .../drivers/radeonsi/si_shader_llvm_tess.c | 4 ++-- .../drivers/radeonsi/si_shader_llvm_vs.c | 4 ++-- src/gallium/drivers/radeonsi/si_shader_nir.c | 20 +++++++++---------- .../drivers/radeonsi/si_state_shaders.c | 12 +++++------ 6 files changed, 32 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b14a9a27f28f..292bd0d2a07f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -323,6 +323,16 @@ enum si_color_output_type { SI_TYPE_UINT16, }; +union si_input_info { + struct { + ubyte semantic; + ubyte interpolate; + ubyte fp16_lo_hi_valid; + ubyte usage_mask; + }; + uint32_t _unused; /* this just forces 4-byte alignment */ +}; + struct si_shader_info { shader_info base; @@ -330,10 +340,7 @@ struct si_shader_info { ubyte num_inputs; ubyte num_outputs; - ubyte input_semantic[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte input_fp16_lo_hi_valid[PIPE_MAX_SHADER_INPUTS]; + union si_input_info input[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c 
b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index b4a3b8a8aadc..a9ab0c549f3d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -52,7 +52,7 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in unsigned param; LLVMValueRef value; - param = si_shader_io_get_unique_index(info->input_semantic[input_index], false); + param = si_shader_io_get_unique_index(info->input[input_index].semantic, false); /* GFX9 has the ESGS ring in LDS. */ if (ctx->screen->info.chip_class >= GFX9) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index c79c475506d7..68e3fc18e218 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -390,7 +390,7 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMType ubyte semantic; if (load_input) { - semantic = info->input_semantic[driver_location]; + semantic = info->input[driver_location].semantic; } else { semantic = info->output_semantic[driver_location]; } @@ -448,7 +448,7 @@ static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef struct si_shader_info *info = &ctx->shader->selector->info; LLVMValueRef base, addr; - ubyte semantic = info->input_semantic[driver_location]; + ubyte semantic = info->input[driver_location].semantic; assert((semantic >= VARYING_SLOT_PATCH0 || semantic == VARYING_SLOT_TESS_LEVEL_INNER || diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index b6bfa6fe09d4..cf57a6e77e81 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -107,7 +107,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L * ... which is what we must prevent at all cost. 
*/ const bool can_speculate = false; - unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32; + unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32; LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32; LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32; unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; @@ -157,7 +157,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L return; } - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + unsigned required_channels = util_last_bit(info->input[input_index].usage_mask); if (required_channels == 0) { for (unsigned i = 0; i < 4; ++i) out[i] = LLVMGetUndef(ctx->ac.f32); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 0da9054b561c..5de678b62d43 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -108,25 +108,25 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr unsigned num_slots = indirect ? 
nir_intrinsic_io_semantics(intr).num_slots : 1; if (is_input) { - assert(driver_location + num_slots <= ARRAY_SIZE(info->input_usage_mask)); + assert(driver_location + num_slots <= ARRAY_SIZE(info->input)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; - info->input_semantic[loc] = semantic + i; + info->input[loc].semantic = semantic + i; if (semantic == SYSTEM_VALUE_PRIMITIVE_ID) - info->input_interpolate[loc] = INTERP_MODE_FLAT; + info->input[loc].interpolate = INTERP_MODE_FLAT; else - info->input_interpolate[loc] = interp; + info->input[loc].interpolate = interp; if (mask) { - info->input_usage_mask[loc] |= mask; + info->input[loc].usage_mask |= mask; if (bit_size == 16) { if (nir_intrinsic_io_semantics(intr).high_16bits) - info->input_fp16_lo_hi_valid[loc] |= 0x2; + info->input[loc].fp16_lo_hi_valid |= 0x2; else - info->input_fp16_lo_hi_valid[loc] |= 0x1; + info->input[loc].fp16_lo_hi_valid |= 0x1; } info->num_inputs = MAX2(info->num_inputs, loc + 1); } @@ -517,9 +517,9 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf if (nir->info.stage == MESA_SHADER_FRAGMENT) { for (unsigned i = 0; i < 2; i++) { if ((info->colors_read >> (i * 4)) & 0xf) { - info->input_semantic[info->num_inputs] = VARYING_SLOT_COL0 + i; - info->input_interpolate[info->num_inputs] = info->color_interpolate[i]; - info->input_usage_mask[info->num_inputs] = info->colors_read >> (i * 4); + info->input[info->num_inputs].semantic = VARYING_SLOT_COL0 + i; + info->input[info->num_inputs].interpolate = info->color_interpolate[i]; + info->input[info->num_inputs].usage_mask = info->colors_read >> (i * 4); info->num_inputs++; } } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c8235ddf15fe..8d9068bf8432 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2962,7 +2962,7 @@ static void 
*si_create_shader_selector(struct pipe_context *ctx, case MESA_SHADER_FRAGMENT: for (i = 0; i < sel->info.num_inputs; i++) { - unsigned semantic = sel->info.input_semantic[i]; + unsigned semantic = sel->info.input[i].semantic; if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_PNTC) { @@ -2975,9 +2975,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->colors_written_4bit |= 0xf << (4 * i); for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input_semantic[i] == VARYING_SLOT_COL0) + if (sel->info.input[i].semantic == VARYING_SLOT_COL0) sel->color_attr_index[0] = i; - else if (sel->info.input_semantic[i] == VARYING_SLOT_COL1) + else if (sel->info.input[i].semantic == VARYING_SLOT_COL1) sel->color_attr_index[1] = i; } break; @@ -3605,9 +3605,9 @@ static void si_emit_spi_map(struct si_context *sctx) assert(num_interp > 0); for (i = 0; i < psinfo->num_inputs; i++) { - unsigned semantic = psinfo->input_semantic[i]; - unsigned interpolate = psinfo->input_interpolate[i]; - ubyte fp16_lo_hi_mask = psinfo->input_fp16_lo_hi_valid[i]; + unsigned semantic = psinfo->input[i].semantic; + unsigned interpolate = psinfo->input[i].interpolate; + ubyte fp16_lo_hi_mask = psinfo->input[i].fp16_lo_hi_valid; spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, interpolate, fp16_lo_hi_mask); -- GitLab From 57f9452b466734b9fc9fd92bc8c508ef9973a540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 00:56:51 -0400 Subject: [PATCH 28/42] radeonsi: precompute num_interp for si_emit_spi_map Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.h | 1 + src/gallium/drivers/radeonsi/si_state_shaders.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 292bd0d2a07f..44946891a191 100644 --- 
a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -869,6 +869,7 @@ struct si_shader { unsigned spi_shader_z_format; unsigned spi_shader_col_format; unsigned cb_shader_mask; + unsigned num_interp; } ps; } ctx_reg; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8d9068bf8432..faf144461af2 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1721,10 +1721,13 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.ps.spi_ps_input_ena = input_ena; shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + unsigned num_interp = si_get_ps_num_interp(shader); + /* Set interpolation controls. */ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) | S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + shader->ctx_reg.ps.num_interp = num_interp; shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; shader->ctx_reg.ps.spi_shader_z_format = @@ -3589,7 +3592,7 @@ static void si_emit_spi_map(struct si_context *sctx) struct si_shader *ps = sctx->shader.ps.current; struct si_shader *vs; struct si_shader_info *psinfo = ps ? 
&ps->selector->info : NULL; - unsigned i, num_interp, num_written = 0; + unsigned i, num_written = 0; unsigned spi_ps_input_cntl[32]; if (!ps || !ps->selector->info.num_inputs) @@ -3601,9 +3604,6 @@ static void si_emit_spi_map(struct si_context *sctx) else vs = si_get_vs(sctx)->current; - num_interp = si_get_ps_num_interp(ps); - assert(num_interp > 0); - for (i = 0; i < psinfo->num_inputs; i++) { unsigned semantic = psinfo->input[i].semantic; unsigned interpolate = psinfo->input[i].interpolate; @@ -3624,6 +3624,9 @@ static void si_emit_spi_map(struct si_context *sctx) false); } } + + unsigned num_interp = ps->ctx_reg.ps.num_interp; + assert(num_interp > 0); assert(num_interp == num_written); /* R_028644_SPI_PS_INPUT_CNTL_0 */ -- GitLab From 5f090891e94948e2227a2cba3f3bd80b2580cd45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 02:29:47 -0400 Subject: [PATCH 29/42] radeonsi: simplify si_emit_spi_map for back-face colors Just precompute what we need in si_shader_info. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader_nir.c | 27 ++++++++++++------- .../drivers/radeonsi/si_state_shaders.c | 17 +++--------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 5de678b62d43..c3ac1dbef049 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -511,16 +511,25 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION)); - } - /* Add color inputs to the list of inputs. 
*/ - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - for (unsigned i = 0; i < 2; i++) { - if ((info->colors_read >> (i * 4)) & 0xf) { - info->input[info->num_inputs].semantic = VARYING_SLOT_COL0 + i; - info->input[info->num_inputs].interpolate = info->color_interpolate[i]; - info->input[info->num_inputs].usage_mask = info->colors_read >> (i * 4); - info->num_inputs++; + /* Add both front and back color inputs. */ + unsigned num_inputs_with_colors = info->num_inputs; + for (unsigned back = 0; back < 2; back++) { + for (unsigned i = 0; i < 2; i++) { + if ((info->colors_read >> (i * 4)) & 0xf) { + unsigned index = num_inputs_with_colors; + + info->input[index].semantic = (back ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + i; + info->input[index].interpolate = info->color_interpolate[i]; + info->input[index].usage_mask = info->colors_read >> (i * 4); + num_inputs_with_colors++; + + /* Back-face colors don't increment num_inputs. si_emit_spi_map will use + * back-face colors conditionally only when they are needed.
+ */ + if (!back) + info->num_inputs = num_inputs_with_colors; + } } } } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index faf144461af2..7c41bcbed264 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3604,7 +3604,9 @@ static void si_emit_spi_map(struct si_context *sctx) else vs = si_get_vs(sctx)->current; - for (i = 0; i < psinfo->num_inputs; i++) { + unsigned num_interp = ps->ctx_reg.ps.num_interp; + + for (i = 0; i < num_interp; i++) { unsigned semantic = psinfo->input[i].semantic; unsigned interpolate = psinfo->input[i].interpolate; ubyte fp16_lo_hi_mask = psinfo->input[i].fp16_lo_hi_valid; @@ -3613,19 +3615,6 @@ static void si_emit_spi_map(struct si_context *sctx) fp16_lo_hi_mask); } - if (ps->key.part.ps.prolog.color_two_side) { - for (i = 0; i < 2; i++) { - if (!(psinfo->colors_read & (0xf << (i * 4)))) - continue; - - unsigned semantic = VARYING_SLOT_BFC0 + i; - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, - psinfo->color_interpolate[i], - false); - } - } - - unsigned num_interp = ps->ctx_reg.ps.num_interp; assert(num_interp > 0); assert(num_interp == num_written); -- GitLab From 3264372539a38bb1df58bf6eef9c16d7d58f32df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 02:54:08 -0400 Subject: [PATCH 30/42] radeonsi: inline si_get_ps_input_cntl because it has only one use Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_shaders.c | 119 ++++++++---------- 1 file changed, 55 insertions(+), 64 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 7c41bcbed264..e2f61ebe7c96 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3525,68 +3525,6 @@ static void 
si_delete_shader_selector(struct pipe_context *ctx, void *state) si_shader_selector_reference(sctx, &sel, NULL); } -static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, - unsigned semantic, enum glsl_interp_mode interpolate, - ubyte fp16_lo_hi_mask) -{ - struct si_shader_info *vsinfo = &vs->selector->info; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned offset, ps_input_cntl = 0; - - int vs_slot = vsinfo->output_semantic_to_slot[semantic]; - if (vs_slot >= 0) { - offset = vs->info.vs_output_param_offset[vs_slot]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - - if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && rs->flatshade)) { - ps_input_cntl |= S_028644_FLAT_SHADE(1); - } - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - - /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. */ - ps_input_cntl = S_028644_OFFSET(0x20) | - S_028644_DEFAULT_VAL(offset); - } - - if (fp16_lo_hi_mask) { - assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); - - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | - S_028644_DEFAULT_VAL_ATTR1(0) | - S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ - S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); - } - } else { - /* No corresponding output found, load defaults into input. */ - ps_input_cntl = S_028644_OFFSET(0x20) | - /* D3D 9 behaviour for COLOR0. GL is undefined */ - S_028644_DEFAULT_VAL(semantic == VARYING_SLOT_COL1 ? 
3 : 0); - } - - if (semantic == VARYING_SLOT_PNTC || - (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { - /* Overwrite the whole value for sprite coordinates. */ - ps_input_cntl = S_028644_OFFSET(0) | - S_028644_PT_SPRITE_TEX(1); - if (fp16_lo_hi_mask & 0x1) { - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_ATTR0_VALID(1); - } - } - - return ps_input_cntl; -} - static void si_emit_spi_map(struct si_context *sctx) { struct si_shader *ps = sctx->shader.ps.current; @@ -3605,14 +3543,67 @@ static void si_emit_spi_map(struct si_context *sctx) vs = si_get_vs(sctx)->current; unsigned num_interp = ps->ctx_reg.ps.num_interp; + struct si_shader_info *vsinfo = &vs->selector->info; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; for (i = 0; i < num_interp; i++) { unsigned semantic = psinfo->input[i].semantic; unsigned interpolate = psinfo->input[i].interpolate; ubyte fp16_lo_hi_mask = psinfo->input[i].fp16_lo_hi_valid; + unsigned ps_input_cntl = 0; + + int vs_slot = vsinfo->output_semantic_to_slot[semantic]; + if (vs_slot >= 0) { + unsigned offset = vs->info.vs_output_param_offset[vs_slot]; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl |= S_028644_OFFSET(offset); + + if (interpolate == INTERP_MODE_FLAT || + (interpolate == INTERP_MODE_COLOR && rs->flatshade)) { + ps_input_cntl |= S_028644_FLAT_SHADE(1); + } + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. 
*/ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + if (fp16_lo_hi_mask) { + assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); + + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | + S_028644_DEFAULT_VAL_ATTR1(0) | + S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ + S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); + } + } else { + /* No corresponding output found, load defaults into input. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + /* D3D 9 behaviour for COLOR0. GL is undefined */ + S_028644_DEFAULT_VAL(semantic == VARYING_SLOT_COL1 ? 3 : 0); + } + + if (semantic == VARYING_SLOT_PNTC || + (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && + rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { + /* Overwrite the whole value for sprite coordinates. */ + ps_input_cntl = S_028644_OFFSET(0) | + S_028644_PT_SPRITE_TEX(1); + if (fp16_lo_hi_mask & 0x1) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } + } - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, interpolate, - fp16_lo_hi_mask); + spi_ps_input_cntl[num_written++] = ps_input_cntl; } assert(num_interp > 0); -- GitLab From dba914de85d48847a52896cd12489c6af498fdc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 03:07:03 -0400 Subject: [PATCH 31/42] radeonsi: unroll loops in si_emit_spi_map using 33 C++ template instantiations Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.c | 1 + src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_draw.cpp | 139 +++++++++++++++++- .../drivers/radeonsi/si_state_shaders.c | 94 ------------ 5 files changed, 141 insertions(+), 95 deletions(-) diff --git 
a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 619008209ba3..d16cba67368d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -571,6 +571,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_state_functions(sctx); si_init_streamout_functions(sctx); si_init_viewport_functions(sctx); + si_init_spi_map_functions(sctx); sctx->blitter = util_blitter_create(&sctx->b); if (sctx->blitter == NULL) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index efa853751b75..926af88fd875 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1224,6 +1224,7 @@ struct si_context { pipe_draw_vbo_func draw_vbo[2][2][2]; /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ pipe_draw_vbo_func real_draw_vbo; + void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ struct ac_thread_trace_data *thread_trace; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 7555a1525640..895e280bb460 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -604,6 +604,7 @@ void si_init_draw_functions_GFX8(struct si_context *sctx); void si_init_draw_functions_GFX9(struct si_context *sctx); void si_init_draw_functions_GFX10(struct si_context *sctx); void si_init_draw_functions_GFX10_3(struct si_context *sctx); +void si_init_spi_map_functions(struct si_context *sctx); /* si_state_msaa.c */ void si_init_msaa_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index c95e3938f36f..6f5f13ca06bd 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "ac_exp_param.h" #include "ac_sqtt.h" #include "si_build_pm4.h" #include "util/u_index_modify.h" @@ -47,6 +48,95 @@ /* special primitive types */ #define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX +template +static void si_emit_spi_map(struct si_context *sctx) +{ + struct si_shader *ps = sctx->shader.ps.current; + struct si_shader *vs; + struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; + unsigned spi_ps_input_cntl[NUM_INTERP]; + + STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32); + + if (!NUM_INTERP) + return; + + /* With legacy GS, only the GS copy shader contains information about param exports. */ + if (sctx->shader.gs.cso && !sctx->ngg) + vs = sctx->shader.gs.cso->gs_copy_shader; + else + vs = si_get_vs(sctx)->current; + + struct si_shader_info *vsinfo = &vs->selector->info; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + for (unsigned i = 0; i < NUM_INTERP; i++) { + union si_input_info input = psinfo->input[i]; + unsigned ps_input_cntl = 0; + + int vs_slot = vsinfo->output_semantic_to_slot[input.semantic]; + if (vs_slot >= 0) { + unsigned offset = vs->info.vs_output_param_offset[vs_slot]; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl |= S_028644_OFFSET(offset); + + if (input.interpolate == INTERP_MODE_FLAT || + (input.interpolate == INTERP_MODE_COLOR && rs->flatshade)) { + ps_input_cntl |= S_028644_FLAT_SHADE(1); + } + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. 
*/ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + if (input.fp16_lo_hi_valid) { + assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); + + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | + S_028644_DEFAULT_VAL_ATTR1(0) | + S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ + S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2)); + } + } else { + /* No corresponding output found, load defaults into input. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + /* D3D 9 behaviour for COLOR0. GL is undefined */ + S_028644_DEFAULT_VAL(input.semantic == VARYING_SLOT_COL1 ? 3 : 0); + } + + if (input.semantic == VARYING_SLOT_PNTC || + (input.semantic >= VARYING_SLOT_TEX0 && input.semantic <= VARYING_SLOT_TEX7 && + rs->sprite_coord_enable & (1 << (input.semantic - VARYING_SLOT_TEX0)))) { + /* Overwrite the whole value for sprite coordinates. */ + ps_input_cntl = S_028644_OFFSET(0) | + S_028644_PT_SPRITE_TEX(1); + if (input.fp16_lo_hi_valid & 0x1) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } + } + + spi_ps_input_cntl[i] = ps_input_cntl; + } + + /* R_028644_SPI_PS_INPUT_CNTL_0 */ + /* Dota 2: Only ~16% of SPI map updates set different values. */ + /* Talos: Only ~9% of SPI map updates set different values. 
*/ + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); + radeon_end_update_context_roll(sctx); +} + template static bool si_update_shaders(struct si_context *sctx) { @@ -188,8 +278,10 @@ static bool si_update_shaders(struct si_context *sctx) if (si_pm4_state_changed(sctx, ps) || (!NGG && si_pm4_state_changed(sctx, vs)) || - (NGG && si_pm4_state_changed(sctx, gs))) + (NGG && si_pm4_state_changed(sctx, gs))) { + sctx->atoms.s.spi_map.emit = sctx->emit_spi_map[sctx->shader.ps.current->ctx_reg.ps.num_interp]; si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + } if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) && si_pm4_state_changed(sctx, ps) && @@ -2413,3 +2505,48 @@ void GFX(si_init_draw_functions_)(struct si_context *sctx) si_init_ia_multi_vgt_param_table(sctx); } + +#if GFX_VER == 6 /* declare this function only once because it supports all chips. */ + +extern "C" +void si_init_spi_map_functions(struct si_context *sctx) +{ + /* This unrolls the loops in si_emit_spi_map and inlines memcmp and memcpys. + * It improves performance for viewperf/snx. 
+ */ + sctx->emit_spi_map[0] = si_emit_spi_map<0>; + sctx->emit_spi_map[1] = si_emit_spi_map<1>; + sctx->emit_spi_map[2] = si_emit_spi_map<2>; + sctx->emit_spi_map[3] = si_emit_spi_map<3>; + sctx->emit_spi_map[4] = si_emit_spi_map<4>; + sctx->emit_spi_map[5] = si_emit_spi_map<5>; + sctx->emit_spi_map[6] = si_emit_spi_map<6>; + sctx->emit_spi_map[7] = si_emit_spi_map<7>; + sctx->emit_spi_map[8] = si_emit_spi_map<8>; + sctx->emit_spi_map[9] = si_emit_spi_map<9>; + sctx->emit_spi_map[10] = si_emit_spi_map<10>; + sctx->emit_spi_map[11] = si_emit_spi_map<11>; + sctx->emit_spi_map[12] = si_emit_spi_map<12>; + sctx->emit_spi_map[13] = si_emit_spi_map<13>; + sctx->emit_spi_map[14] = si_emit_spi_map<14>; + sctx->emit_spi_map[15] = si_emit_spi_map<15>; + sctx->emit_spi_map[16] = si_emit_spi_map<16>; + sctx->emit_spi_map[17] = si_emit_spi_map<17>; + sctx->emit_spi_map[18] = si_emit_spi_map<18>; + sctx->emit_spi_map[19] = si_emit_spi_map<19>; + sctx->emit_spi_map[20] = si_emit_spi_map<20>; + sctx->emit_spi_map[21] = si_emit_spi_map<21>; + sctx->emit_spi_map[22] = si_emit_spi_map<22>; + sctx->emit_spi_map[23] = si_emit_spi_map<23>; + sctx->emit_spi_map[24] = si_emit_spi_map<24>; + sctx->emit_spi_map[25] = si_emit_spi_map<25>; + sctx->emit_spi_map[26] = si_emit_spi_map<26>; + sctx->emit_spi_map[27] = si_emit_spi_map<27>; + sctx->emit_spi_map[28] = si_emit_spi_map<28>; + sctx->emit_spi_map[29] = si_emit_spi_map<29>; + sctx->emit_spi_map[30] = si_emit_spi_map<30>; + sctx->emit_spi_map[31] = si_emit_spi_map<31>; + sctx->emit_spi_map[32] = si_emit_spi_map<32>; +} + +#endif diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index e2f61ebe7c96..fac6b4888252 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3525,99 +3525,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) si_shader_selector_reference(sctx, &sel, NULL); } -static 
void si_emit_spi_map(struct si_context *sctx) -{ - struct si_shader *ps = sctx->shader.ps.current; - struct si_shader *vs; - struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; - unsigned i, num_written = 0; - unsigned spi_ps_input_cntl[32]; - - if (!ps || !ps->selector->info.num_inputs) - return; - - /* With legacy GS, only the GS copy shader contains information about param exports. */ - if (sctx->shader.gs.cso && !sctx->ngg) - vs = sctx->shader.gs.cso->gs_copy_shader; - else - vs = si_get_vs(sctx)->current; - - unsigned num_interp = ps->ctx_reg.ps.num_interp; - struct si_shader_info *vsinfo = &vs->selector->info; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - for (i = 0; i < num_interp; i++) { - unsigned semantic = psinfo->input[i].semantic; - unsigned interpolate = psinfo->input[i].interpolate; - ubyte fp16_lo_hi_mask = psinfo->input[i].fp16_lo_hi_valid; - unsigned ps_input_cntl = 0; - - int vs_slot = vsinfo->output_semantic_to_slot[semantic]; - if (vs_slot >= 0) { - unsigned offset = vs->info.vs_output_param_offset[vs_slot]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - - if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && rs->flatshade)) { - ps_input_cntl |= S_028644_FLAT_SHADE(1); - } - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - - /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. 
*/ - ps_input_cntl = S_028644_OFFSET(0x20) | - S_028644_DEFAULT_VAL(offset); - } - - if (fp16_lo_hi_mask) { - assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); - - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | - S_028644_DEFAULT_VAL_ATTR1(0) | - S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ - S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); - } - } else { - /* No corresponding output found, load defaults into input. */ - ps_input_cntl = S_028644_OFFSET(0x20) | - /* D3D 9 behaviour for COLOR0. GL is undefined */ - S_028644_DEFAULT_VAL(semantic == VARYING_SLOT_COL1 ? 3 : 0); - } - - if (semantic == VARYING_SLOT_PNTC || - (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { - /* Overwrite the whole value for sprite coordinates. */ - ps_input_cntl = S_028644_OFFSET(0) | - S_028644_PT_SPRITE_TEX(1); - if (fp16_lo_hi_mask & 0x1) { - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_ATTR0_VALID(1); - } - } - - spi_ps_input_cntl[num_written++] = ps_input_cntl; - } - - assert(num_interp > 0); - assert(num_interp == num_written); - - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, num_interp); - radeon_end_update_context_roll(sctx); -} - /** * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. 
*/ @@ -4146,7 +4053,6 @@ void si_init_screen_live_shader_cache(struct si_screen *sscreen) void si_init_shader_functions(struct si_context *sctx) { - sctx->atoms.s.spi_map.emit = si_emit_spi_map; sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; sctx->b.create_vs_state = si_create_shader; -- GitLab From 5824ab569e1130fb1cb7f096714672e2f9fdef4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 03:54:14 -0400 Subject: [PATCH 32/42] radeonsi: precompute more spi_map code This replaces vs_output_param_offset by vs_output_ps_input_cntl, which is easier to use. For geometry shaders, vs_output_ps_input_cntl is stored in the GS si_shader structure, not gs_copy_shader. This requires that gs_copy_shader compilation is finished before the GS main shader part, so that GS can initialize vs_output_ps_input_cntl using the compiled GS copy shader. output_semantic_to_slot becomes unused, so it's removed. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.c | 47 +++++++++++++++++-- src/gallium/drivers/radeonsi/si_shader.h | 8 +++- src/gallium/drivers/radeonsi/si_shader_llvm.c | 1 + .../drivers/radeonsi/si_shader_llvm_vs.c | 4 ++ src/gallium/drivers/radeonsi/si_shader_nir.c | 5 -- .../drivers/radeonsi/si_state_draw.cpp | 47 +++---------------- .../drivers/radeonsi/si_state_shaders.c | 34 +++++++------- 7 files changed, 80 insertions(+), 66 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b3d84eec8feb..b381d5d09a34 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1433,8 +1433,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi si_dump_streamout(&sel->so); } - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, - sizeof(shader->info.vs_output_param_offset)); + /* Initialize vs_output_ps_input_cntl to default. 
*/ + for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++) + shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; + shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; shader->info.uses_instanceid = sel->info.uses_instanceid; @@ -1445,6 +1447,43 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) return false; + /* Compute vs_output_ps_input_cntl. */ + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !shader->key.as_ls && !shader->key.as_es) { + ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; + + if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) + vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + + /* VS and TES should also set primitive ID output if it's used. */ + unsigned num_outputs_with_prim_id = sel->info.num_outputs + + shader->key.mono.u.vs_export_prim_id; + + for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { + unsigned semantic = sel->info.output_semantic[i]; + unsigned offset = vs_output_param_offset[i]; + unsigned ps_input_cntl; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl = S_028644_OFFSET(offset); + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* OFFSET=0x20 means that DEFAULT_VAL is used. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; + } + } + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. 
*/ if (sel->info.stage == MESA_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; @@ -2002,8 +2041,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler shader->info.num_input_vgprs = mainp->info.num_input_vgprs; shader->info.face_vgpr_index = mainp->info.face_vgpr_index; shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); + memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, + sizeof(mainp->info.vs_output_ps_input_cntl)); shader->info.uses_instanceid = mainp->info.uses_instanceid; shader->info.nr_pos_exports = mainp->info.nr_pos_exports; shader->info.nr_param_exports = mainp->info.nr_param_exports; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 44946891a191..de0dc232e12f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -158,6 +158,12 @@ struct si_context; #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) +#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) +#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) +#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000 +/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */ +#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 + /* SGPR user data indices */ enum { @@ -342,7 +348,6 @@ struct si_shader_info { ubyte num_outputs; union si_input_info input[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; - char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; @@ -707,6 +712,7 @@ struct si_shader_key { /* GCN-specific shader info. 
*/ struct si_shader_binary_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; signed char face_vgpr_index; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index c975581fe4f7..083d73fca752 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_nir_to_llvm.h" #include "ac_rtld.h" #include "si_pipe.h" diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index cf57a6e77e81..d35c296c2195 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -26,6 +26,7 @@ #include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" +#include "ac_exp_param.h" static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { @@ -452,6 +453,9 @@ static void si_prepare_param_exports(struct si_shader_context *ctx, struct si_shader *shader = ctx->shader; unsigned param_count = 0; + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, + sizeof(shader->info.vs_output_param_offset)); + for (unsigned i = 0; i < noutput; i++) { unsigned semantic = outputs[i].semantic; diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index c3ac1dbef049..ed07fa7e0a78 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -134,13 +134,11 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } else { /* Outputs. 
*/ assert(driver_location + num_slots <= ARRAY_SIZE(info->output_usagemask)); - assert(semantic + num_slots < ARRAY_SIZE(info->output_semantic_to_slot)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; info->output_semantic[loc] = semantic + i; - info->output_semantic_to_slot[semantic + i] = loc; if (is_output_load) { /* Output loads have only a few things that we need to track. */ @@ -479,8 +477,6 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf info->writes_position = nir->info.outputs_written & VARYING_BIT_POS; } - memset(info->output_semantic_to_slot, -1, sizeof(info->output_semantic_to_slot)); - func = (struct nir_function *)exec_list_get_head_const(&nir->functions); nir_foreach_block (block, func->impl) { nir_foreach_instr (instr, block) @@ -493,7 +489,6 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf * and si_emit_spi_map uses this unconditionally when such a pixel shader is used. */ info->output_semantic[info->num_outputs] = VARYING_SLOT_PRIMITIVE_ID; - info->output_semantic_to_slot[VARYING_SLOT_PRIMITIVE_ID] = info->num_outputs; info->output_type[info->num_outputs] = nir_type_uint32; info->output_usagemask[info->num_outputs] = 0x1; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 6f5f13ca06bd..b6ce4fd7174a 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -52,7 +52,6 @@ template static void si_emit_spi_map(struct si_context *sctx) { struct si_shader *ps = sctx->shader.ps.current; - struct si_shader *vs; struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; unsigned spi_ps_input_cntl[NUM_INTERP]; @@ -61,56 +60,24 @@ static void si_emit_spi_map(struct si_context *sctx) if (!NUM_INTERP) return; - /* With legacy GS, only the GS copy shader contains information about param exports. 
*/ - if (sctx->shader.gs.cso && !sctx->ngg) - vs = sctx->shader.gs.cso->gs_copy_shader; - else - vs = si_get_vs(sctx)->current; - - struct si_shader_info *vsinfo = &vs->selector->info; + struct si_shader *vs = si_get_vs(sctx)->current; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; for (unsigned i = 0; i < NUM_INTERP; i++) { union si_input_info input = psinfo->input[i]; - unsigned ps_input_cntl = 0; - - int vs_slot = vsinfo->output_semantic_to_slot[input.semantic]; - if (vs_slot >= 0) { - unsigned offset = vs->info.vs_output_param_offset[vs_slot]; + unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic]; + bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20; - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - - if (input.interpolate == INTERP_MODE_FLAT || - (input.interpolate == INTERP_MODE_COLOR && rs->flatshade)) { - ps_input_cntl |= S_028644_FLAT_SHADE(1); - } - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - - /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. 
*/ - ps_input_cntl = S_028644_OFFSET(0x20) | - S_028644_DEFAULT_VAL(offset); - } + if (non_default_val) { + if (input.interpolate == INTERP_MODE_FLAT || + (input.interpolate == INTERP_MODE_COLOR && rs->flatshade)) + ps_input_cntl |= S_028644_FLAT_SHADE(1); if (input.fp16_lo_hi_valid) { - assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | - S_028644_DEFAULT_VAL_ATTR1(0) | S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2)); } - } else { - /* No corresponding output found, load defaults into input. */ - ps_input_cntl = S_028644_OFFSET(0x20) | - /* D3D 9 behaviour for COLOR0. GL is undefined */ - S_028644_DEFAULT_VAL(input.semantic == VARYING_SLOT_COL1 ? 3 : 0); } if (input.semantic == VARYING_SLOT_PNTC || diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index fac6b4888252..02fc51b5b597 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2632,6 +2632,19 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind if (!compiler->passes) si_init_compiler(sscreen, compiler); + /* The GS copy shader is always pre-compiled. */ + if (sel->info.stage == MESA_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + /* Serialize NIR to save memory. Monolithic shader variants * have to deserialize NIR before compilation. 
*/ @@ -2716,14 +2729,16 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind unsigned i; for (i = 0; i < sel->info.num_outputs; i++) { - unsigned offset = shader->info.vs_output_param_offset[i]; + unsigned semantic = sel->info.output_semantic[i]; + unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic]; - if (offset <= AC_EXP_PARAM_OFFSET_31) + /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */ + if (G_028644_OFFSET(ps_input_cntl) != 0x20) continue; - unsigned semantic = sel->info.output_semantic[i]; unsigned id; + /* Remove the output from the mask. */ if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_POS && semantic != VARYING_SLOT_PSIZ && @@ -2736,19 +2751,6 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind } } - /* The GS copy shader is always pre-compiled. */ - if (sel->info.stage == MESA_SHADER_GEOMETRY && - (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - /* Free NIR. We only keep serialized NIR after this point. 
*/ if (sel->nir) { ralloc_free(sel->nir); -- GitLab From fb04378e769f0eaee694490b935e39eeb619e285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 12:41:52 -0400 Subject: [PATCH 33/42] radeonsi: set prefer_mono outside of si_shader_selector_key Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.c | 12 ++++++++++++ src/gallium/drivers/radeonsi/si_state_shaders.c | 9 +-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d16cba67368d..a2e8dfc27676 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -717,6 +717,18 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_cp_reg_shadowing(sctx); } + /* Set immutable fields of shader keys. */ + if (sctx->chip_class >= GFX9) { + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * This also enables jumping over the VS prolog for HS-only waves. + */ + sctx->shader.tcs.key.opt.prefer_mono = 1; + + /* This enables jumping over the VS prolog for GS-only waves. 
*/ + sctx->shader.gs.key.opt.prefer_mono = 1; + } + si_begin_new_gfx_cs(sctx, true); assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 02fc51b5b597..5bcef1376d9c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1819,6 +1819,7 @@ void si_vs_key_update_inputs(struct si_context *sctx) key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + key->opt.prefer_mono = elts->instance_divisor_is_fetched; unsigned count_mask = (1 << vs->info.num_inputs) - 1; unsigned fix = elts->fix_fetch_always & count_mask; @@ -2157,11 +2158,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh */ key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; - /* The LS output / HS input layout can be communicated - * directly instead of via user SGPRs for merged LS-HS. - * This also enables jumping over the VS prolog for HS-only waves. - */ - key->opt.prefer_mono = 1; key->opt.same_patch_vertices = sctx->same_patch_vertices; } @@ -2195,9 +2191,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh si_get_vs_key_outputs(sctx, sel, key); else si_clear_vs_key_outputs(sctx, sel, key); - - /* This enables jumping over the VS prolog for GS-only waves. 
*/ - key->opt.prefer_mono = 1; } key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; -- GitLab From 99c5e03986294e3aa90be6dc656080d8304d3313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 12:53:46 -0400 Subject: [PATCH 34/42] radeonsi: move setting most TCS shader key fields out of si_shader_selector_key Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.c | 5 ++++ src/gallium/drivers/radeonsi/si_pipe.h | 2 -- .../drivers/radeonsi/si_state_draw.cpp | 12 +++++--- .../drivers/radeonsi/si_state_shaders.c | 30 +++++++------------ 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a2e8dfc27676..2b3400dc8004 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -722,6 +722,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign /* The LS output / HS input layout can be communicated * directly instead of via user SGPRs for merged LS-HS. * This also enables jumping over the VS prolog for HS-only waves. + * + * When the LS VGPR fix is needed, monolithic shaders can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs */ sctx->shader.tcs.key.opt.prefer_mono = 1; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 926af88fd875..c03f7ac6b2ee 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1078,9 +1078,7 @@ struct si_context { /* Emitted draw state. 
*/ bool gs_tri_strip_adj_fix : 1; - bool ls_vgpr_fix : 1; bool ngg : 1; - bool same_patch_vertices : 1; uint8_t ngg_culling; unsigned last_index_size; int last_base_vertex; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index b6ce4fd7174a..f9c60eaba7b6 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -134,6 +134,9 @@ static bool si_update_shaders(struct si_context *sctx) (struct si_shader_selector*)si_create_fixed_func_tcs(sctx); if (!sctx->fixed_func_tcs_shader.cso) return false; + + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.invoc0_tess_factors_are_def = + sctx->fixed_func_tcs_shader.cso->info.tessfactors_are_def_in_all_invocs; } r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader); @@ -1973,8 +1976,8 @@ static void si_draw_vbo(struct pipe_context *ctx, GFX_VERSION >= GFX9 && tcs && sctx->patch_vertices == tcs->info.base.tess.tcs_vertices_out; - if (sctx->same_patch_vertices != same_patch_vertices) { - sctx->same_patch_vertices = same_patch_vertices; + if (sctx->shader.tcs.key.opt.same_patch_vertices != same_patch_vertices) { + sctx->shader.tcs.key.opt.same_patch_vertices = same_patch_vertices; sctx->do_update_shaders = true; } @@ -1989,8 +1992,9 @@ static void si_draw_vbo(struct pipe_context *ctx, bool ls_vgpr_fix = tcs && sctx->patch_vertices > tcs->info.base.tess.tcs_vertices_out; - if (ls_vgpr_fix != sctx->ls_vgpr_fix) { - sctx->ls_vgpr_fix = ls_vgpr_fix; + if (ls_vgpr_fix != sctx->shader.tcs.key.part.tcs.ls_prolog.ls_vgpr_fix) { + sctx->shader.tcs.key.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix; + sctx->fixed_func_tcs_shader.key.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix; sctx->do_update_shaders = true; } } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 5bcef1376d9c..9effd12b63a7 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ 
b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2149,26 +2149,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh if (sctx->chip_class >= GFX9) { si_get_vs_key_inputs(sctx, key, &key->part.tcs.ls_prolog); key->part.tcs.ls = sctx->shader.vs.cso; - - /* When the LS VGPR fix is needed, monolithic shaders - * can: - * - avoid initializing EXEC in both the LS prolog - * and the LS main part when !vs_needs_prolog - * - remove the fixup for unused input VGPRs - */ - key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; - - key->opt.same_patch_vertices = sctx->same_patch_vertices; } - - key->part.tcs.epilog.prim_mode = - sctx->shader.tes.cso->info.base.tess.primitive_mode; - key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->info.tessfactors_are_def_in_all_invocs; - key->part.tcs.epilog.tes_reads_tess_factors = sctx->shader.tes.cso->info.reads_tess_factors; - - if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written; break; case MESA_SHADER_TESS_EVAL: if (!sctx->shader.gs.cso) @@ -3192,6 +3173,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->shader.vs.current = sel ? sel->first_variant : NULL; sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0; sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false; + sctx->fixed_func_tcs_shader.key.mono.u.ff_tcs_inputs_to_copy = sel ? sel->outputs_written : 0; if (si_update_ngg(sctx)) si_shader_change_notify(sctx); @@ -3299,6 +3281,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) sctx->shader.tcs.cso = sel; sctx->shader.tcs.current = sel ? sel->first_variant : NULL; + sctx->shader.tcs.key.part.tcs.epilog.invoc0_tess_factors_are_def = + sel ? 
sel->info.tessfactors_are_def_in_all_invocs : 0; si_update_tess_uses_prim_id(sctx); si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL); @@ -3323,6 +3307,14 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; si_update_tess_uses_prim_id(sctx); + sctx->shader.tcs.key.part.tcs.epilog.prim_mode = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.prim_mode = + sel ? sel->info.base.tess.primitive_mode : 0; + + sctx->shader.tcs.key.part.tcs.epilog.tes_reads_tess_factors = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.tes_reads_tess_factors = + sel ? sel->info.reads_tess_factors : 0; + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL); si_select_draw_vbo(sctx); sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ -- GitLab From 73c82570cbfc305adbe734ae5e723ca5dba3fce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 11 Aug 2021 12:53:46 -0400 Subject: [PATCH 35/42] radeonsi: move setting one GS shader key field out of si_shader_selector_key Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 1 - src/gallium/drivers/radeonsi/si_state_draw.cpp | 4 ++-- src/gallium/drivers/radeonsi/si_state_shaders.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index c03f7ac6b2ee..60a589573759 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1077,7 +1077,6 @@ struct si_context { bool allow_flat_shading : 1; /* Emitted draw state. 
*/ - bool gs_tri_strip_adj_fix : 1; bool ngg : 1; uint8_t ngg_culling; unsigned last_index_size; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index f9c60eaba7b6..021f309691d2 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -2028,8 +2028,8 @@ static void si_draw_vbo(struct pipe_context *ctx, bool gs_tri_strip_adj_fix = !HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { - sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; + if (gs_tri_strip_adj_fix != sctx->shader.gs.key.part.gs.prolog.tri_strip_adj_fix) { + sctx->shader.gs.key.part.gs.prolog.tri_strip_adj_fix = gs_tri_strip_adj_fix; sctx->do_update_shaders = true; } } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9effd12b63a7..75b382d03126 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2173,7 +2173,6 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh else si_clear_vs_key_outputs(sctx, sel, key); } - key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; case MESA_SHADER_FRAGMENT: si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); -- GitLab From 3df035d08cc45a1e0045c88d68bd19e8da3ea16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 12 Aug 2021 09:39:04 -0400 Subject: [PATCH 36/42] radeonsi: put si_pm4_state at the beginning of si_shader instead of allocating it separately. This removes pointer indirections. 
Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pm4.c | 6 +- src/gallium/drivers/radeonsi/si_pm4.h | 2 +- src/gallium/drivers/radeonsi/si_shader.h | 3 +- src/gallium/drivers/radeonsi/si_state.h | 12 +- .../drivers/radeonsi/si_state_draw.cpp | 32 ++-- .../drivers/radeonsi/si_state_shaders.c | 171 ++++++++---------- 6 files changed, 107 insertions(+), 119 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 22b6e3ad5185..8213fe381905 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -117,8 +117,8 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (state->shader) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, state->shader->bo, + if (state->is_shader) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); } @@ -139,7 +139,7 @@ void si_pm4_reset_emitted(struct si_context *sctx, bool first_cs) for (unsigned i = 0; i < SI_NUM_STATES; i++) { struct si_pm4_state *state = sctx->emitted.array[i]; - if (state && state->shader) { + if (state && state->is_shader) { sctx->emitted.array[i] = NULL; sctx->dirty_states |= 1 << i; } diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index 06909ff1a910..03f79e0ba30e 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -54,7 +54,7 @@ struct si_pm4_state { uint32_t pm4[SI_PM4_MAX_DW]; /* For shader states only */ - struct si_shader *shader; + bool is_shader; struct si_atom atom; }; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index de0dc232e12f..0bbd20157e05 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -138,6 +138,7 @@ #include "util/u_inlines.h" #include 
"util/u_live_shader_cache.h" #include "util/u_queue.h" +#include "si_pm4.h" #include @@ -772,6 +773,7 @@ union si_vgt_stages_key { }; struct si_shader { + struct si_pm4_state pm4; /* base class */ struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; @@ -783,7 +785,6 @@ struct si_shader { struct si_shader_part *prolog2; struct si_shader_part *epilog; - struct si_pm4_state *pm4; struct si_resource *bo; struct si_resource *scratch_bo; struct si_shader_key key; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 895e280bb460..34fbf43f90a9 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -183,13 +183,13 @@ union si_state { struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; + struct si_shader *ls; + struct si_shader *hs; + struct si_shader *es; + struct si_shader *gs; struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; + struct si_shader *vs; + struct si_shader *ps; } named; struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)]; }; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 021f309691d2..623d99f74525 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -127,7 +127,7 @@ static bool si_update_shaders(struct si_context *sctx) r = si_shader_select(ctx, &sctx->shader.tcs); if (r) return false; - si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4); + si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current); } else { if (!sctx->fixed_func_tcs_shader.cso) { sctx->fixed_func_tcs_shader.cso = @@ -142,7 +142,7 @@ static bool si_update_shaders(struct si_context *sctx) r = 
si_shader_select(ctx, &sctx->fixed_func_tcs_shader); if (r) return false; - si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); + si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current); } if (!HAS_GS || GFX_VERSION <= GFX8) { @@ -153,11 +153,11 @@ static bool si_update_shaders(struct si_context *sctx) if (HAS_GS) { /* TES as ES */ assert(GFX_VERSION <= GFX8); - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); } else if (NGG) { - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); } else { - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); } } } else { @@ -174,9 +174,9 @@ static bool si_update_shaders(struct si_context *sctx) r = si_shader_select(ctx, &sctx->shader.gs); if (r) return false; - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); if (!NGG) { - si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader); if (!si_update_gs_ring_buffers(sctx)) return false; @@ -203,23 +203,23 @@ static bool si_update_shaders(struct si_context *sctx) if (!HAS_TESS && !HAS_GS) { if (NGG) { - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); si_pm4_bind_state(sctx, vs, NULL); sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; } else { - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); } } else if (HAS_TESS) { - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); } else { assert(HAS_GS); - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); } } 
sctx->vs_uses_base_instance = sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance : - sctx->queued.named.hs ? sctx->queued.named.hs->shader->uses_base_instance : + sctx->queued.named.hs ? sctx->queued.named.hs->uses_base_instance : sctx->shader.gs.current->uses_base_instance; union si_vgt_stages_key key; @@ -244,7 +244,7 @@ static bool si_update_shaders(struct si_context *sctx) r = si_shader_select(ctx, &sctx->shader.ps); if (r) return false; - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); if (si_pm4_state_changed(sctx, ps) || (!NGG && si_pm4_state_changed(sctx, vs)) || @@ -310,7 +310,7 @@ static bool si_update_shaders(struct si_context *sctx) if (GFX_VERSION <= GFX8) /* LS */ scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); - scratch_size = MAX2(scratch_size, sctx->queued.named.hs->shader->config.scratch_bytes_per_wave); + scratch_size = MAX2(scratch_size, sctx->queued.named.hs->config.scratch_bytes_per_wave); if (HAS_GS) { if (GFX_VERSION <= GFX8) /* ES */ @@ -383,9 +383,9 @@ static unsigned si_conv_pipe_prim(unsigned mode) return prim_conv[mode]; } -static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) +static void si_prefetch_shader_async(struct si_context *sctx, struct si_shader *shader) { - struct pipe_resource *bo = &state->shader->bo->b.b; + struct pipe_resource *bo = &shader->bo->b.b; si_cp_dma_prefetch(sctx, bo, 0, bo->width0); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 75b382d03126..636c3184236e 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -373,7 +373,7 @@ bool si_shader_mem_ordered(struct si_shader *shader) } static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, - struct si_pm4_state *pm4) + struct 
si_shader *shader) { const struct si_shader_info *info = &tes->info; unsigned tes_prim_mode = info->base.tess.primitive_mode; @@ -430,10 +430,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad } else distribution_mode = V_028B6C_NO_DIST; - assert(pm4->shader); - pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | - S_028B6C_TOPOLOGY(topology) | - S_028B6C_DISTRIBUTION_MODE(distribution_mode); + shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -447,18 +446,16 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad * VS as ES | ES -> GS -> VS | 30 * TES as VS | LS -> HS -> VS | 14 or 30 * TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30 - * - * If "shader" is NULL, it's assumed it's not LS or GS copy shader. */ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, - struct si_shader *shader, struct si_pm4_state *pm4) + struct si_shader *shader) { if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) return; /* VS as VS, or VS as ES: */ if ((sel->info.stage == MESA_SHADER_VERTEX && - (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) || + (!shader->key.as_ls && !shader->is_gs_copy_shader)) || /* TES as VS, or TES as ES: */ sel->info.stage == MESA_SHADER_TESS_EVAL) { unsigned vtx_reuse_depth = 30; @@ -467,25 +464,15 @@ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_sh sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD) vtx_reuse_depth = 14; - assert(pm4->shader); - pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; } } static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) { - if 
(shader->pm4) - si_pm4_clear_state(shader->pm4); - else - shader->pm4 = CALLOC_STRUCT(si_pm4_state); - - if (shader->pm4) { - shader->pm4->shader = shader; - return shader->pm4; - } else { - fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); - return NULL; - } + si_pm4_clear_state(&shader->pm4); + shader->pm4.is_shader = true; + return &shader->pm4; } static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, @@ -616,7 +603,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_es(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.es->shader; + struct si_shader *shader = sctx->queued.named.es; if (!shader) return; @@ -677,9 +664,9 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, @@ -777,7 +764,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * static void si_emit_shader_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -953,9 +940,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); + si_set_tesseval_regs(sscreen, shader->key.part.gs.es, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); + 
polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader); } else { if (sscreen->info.chip_class >= GFX7) { si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, @@ -1046,7 +1033,7 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1055,7 +1042,7 @@ static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1069,7 +1056,7 @@ static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1083,7 +1070,7 @@ static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1267,7 +1254,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader } if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, es_sel, pm4); + si_set_tesseval_regs(sscreen, es_sel, shader); shader->ctx_reg.ngg.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | @@ -1356,7 +1343,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader static void si_emit_shader_vs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.vs->shader; + struct si_shader *shader = sctx->queued.named.vs; if (!shader) return; 
@@ -1556,9 +1543,9 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } static unsigned si_get_ps_num_interp(struct si_shader *ps) @@ -1593,7 +1580,7 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) static void si_emit_shader_ps(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.ps->shader; + struct si_shader *shader = sctx->queued.named.ps; if (!shader) return; @@ -3414,55 +3401,55 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) util_queue_fence_destroy(&shader->ready); - if (shader->pm4) { - /* If destroyed shaders were not unbound, the next compiled - * shader variant could get the same pointer address and so - * binding it to the same shader stage would be considered - * a no-op, causing random behavior. 
- */ - switch (shader->selector->info.stage) { - case MESA_SHADER_VERTEX: - if (shader->key.as_ls) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ls)); - } else if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_TESS_CTRL: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(hs)); - break; - case MESA_SHADER_TESS_EVAL: - if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - else - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - break; - case MESA_SHADER_FRAGMENT: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ps)); - break; - default:; + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. 
+ */ + int state_index = -1; + + switch (shader->selector->info.stage) { + case MESA_SHADER_VERTEX: + if (shader->key.as_ls) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(ls); + } else if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); + } + break; + case MESA_SHADER_TESS_CTRL: + state_index = SI_STATE_IDX(hs); + break; + case MESA_SHADER_TESS_EVAL: + if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); } + break; + case MESA_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + state_index = SI_STATE_IDX(vs); + else + state_index = SI_STATE_IDX(gs); + break; + case MESA_SHADER_FRAGMENT: + state_index = SI_STATE_IDX(ps); + break; + default:; } si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); si_shader_destroy(shader); - free(shader); + si_pm4_free_state(sctx, &shader->pm4, state_index); } static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) @@ -3775,19 +3762,19 @@ static bool si_update_scratch_relocs(struct si_context *sctx) if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); r = si_update_scratch_buffer(sctx, sctx->shader.gs.current); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); r = si_update_scratch_buffer(sctx, tcs); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, hs, tcs->pm4); + si_pm4_bind_state(sctx, hs, tcs); /* VS can be bound as LS, ES, or VS. 
*/ r = si_update_scratch_buffer(sctx, sctx->shader.vs.current); @@ -3795,13 +3782,13 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.vs.current->key.as_ls) - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); } /* TES can be bound as ES or VS. */ @@ -3810,11 +3797,11 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.tes.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); else if (sctx->shader.tes.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); } return true; -- GitLab From edb5fa4d59de8ef68deaedb27ca2798b57bc28a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 18 Aug 2021 13:05:16 -0400 Subject: [PATCH 37/42] radeonsi: eliminate redundant SPI_SHADER_PGM_RSRC3/4_GS register writes They don't change much. 
Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_build_pm4.h | 10 ++++ src/gallium/drivers/radeonsi/si_gfx_cs.c | 2 +- src/gallium/drivers/radeonsi/si_shader.h | 4 ++ src/gallium/drivers/radeonsi/si_state.h | 3 ++ .../drivers/radeonsi/si_state_shaders.c | 49 +++++++++++++------ 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index b96c9201fb7e..40e430bd407f 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -259,6 +259,16 @@ } \ } while (0) +#define radeon_opt_set_sh_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_sh_reg(cs, offset, __value); \ + sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + #define radeon_set_privileged_config_reg(cs, reg, value) do { \ assert((reg) < CIK_UCONFIG_REG_OFFSET); \ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \ diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 1081b238bcda..80e9f760e09b 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -294,7 +294,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ /* Set all cleared context registers to saved. 
*/ - ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->tracked_regs.reg_saved = BITFIELD64_MASK(SI_TRACKED_GE_PC_ALLOC); ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 0bbd20157e05..fa32c8ed705f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -839,6 +839,8 @@ struct si_shader { unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_max_prims_per_subgroup; unsigned vgt_esgs_ring_itemsize; + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; } gs; struct { @@ -855,6 +857,8 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; union si_vgt_stages_key vgt_stages; } ngg; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 34fbf43f90a9..cd1bd6328515 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -342,7 +342,10 @@ enum si_tracked_reg SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + /* Non-context registers: */ SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, SI_NUM_TRACKED_REGS, }; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 636c3184236e..66a68711cba0 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -819,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); } radeon_end_update_context_roll(sctx); + + /* These don't cause any context rolls. 
*/ + radeon_begin_again(&sctx->gfx_cs); + if (sctx->chip_class >= GFX7) { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + } + if (sctx->chip_class >= GFX10) { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -923,13 +937,11 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | @@ -944,10 +956,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader); } else { - if (sscreen->info.chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -1029,6 +1040,15 @@ static void 
gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); + + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) @@ -1218,12 +1238,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg( - pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config = -- GitLab From 2d8dfb91116e36d9fc8ce3665fef0d75f0dfa47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 18 Aug 2021 15:27:47 -0400 Subject: [PATCH 38/42] radeonsi: convert gfx10_emit_ge_pc_alloc to radeon_opt_set_uconfig_reg Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_build_pm4.h | 10 ++++++ .../drivers/radeonsi/si_state_shaders.c | 31 ++++++------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git 
a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 40e430bd407f..76949b7de54b 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -269,6 +269,16 @@ } \ } while (0) +#define radeon_opt_set_uconfig_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_uconfig_reg(cs, offset, __value); \ + sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + #define radeon_set_privileged_config_reg(cs, reg, value) do { \ assert((reg) < CIK_UCONFIG_REG_OFFSET); \ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 66a68711cba0..76d96a561521 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -973,23 +973,6 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) } } -static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) -{ - enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); - radeon_end(); - - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } -} - bool gfx10_is_ngg_passthrough(struct si_shader *shader) { struct si_shader_selector *sel = shader->selector; @@ -1038,10 +1021,10 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); radeon_end_update_context_roll(sctx); - /* GE_PC_ALLOC is not a context register, so 
it doesn't cause a context roll. */ - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); - + /* These don't cause a context roll. */ radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.ngg.ge_pc_alloc); radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); @@ -1413,8 +1396,12 @@ static void si_emit_shader_vs(struct si_context *sctx) radeon_end_update_context_roll(sctx); /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); + if (sctx->chip_class >= GFX10) { + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.vs.ge_pc_alloc); + radeon_end(); + } } /** -- GitLab From b330c7cb2a3dd6c6e1e05a76182896cf93473818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 31 Aug 2021 21:45:21 -0400 Subject: [PATCH 39/42] radeonsi: use a trick to extract and pack edgeflags using fewer instructions This removes 4 instructions from the prim export packing. 
Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/llvm/ac_llvm_build.c | 4 +-- src/amd/llvm/ac_llvm_build.h | 2 +- src/amd/vulkan/radv_nir_to_llvm.c | 9 ++---- .../drivers/radeonsi/gfx10_shader_ngg.c | 29 +++++++------------ 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 0f07d90327a9..cf0148c1477a 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -4627,13 +4627,11 @@ LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ng LLVMBuilderRef builder = ctx->builder; LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); + result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, ""); for (unsigned i = 0; i < prim->num_vertices; ++i) { tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), ""); result = LLVMBuildOr(builder, result, tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); - tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); - result = LLVMBuildOr(builder, result, tmp, ""); } return result; } diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index 0bdab2872faf..cdbec2060086 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -581,7 +581,7 @@ struct ac_ngg_prim { unsigned num_vertices; LLVMValueRef isnull; LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; + LLVMValueRef edgeflags; LLVMValueRef passthrough; }; diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index dc357f29ac3d..2990e7c3c056 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -1654,13 +1654,8 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) } else { prim.num_vertices = num_vertices; prim.isnull = ctx->ac.i1false; + prim.edgeflags = 
ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args->ac); memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); - - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - } } ac_build_export_prim(&ctx->ac, &prim); @@ -1926,11 +1921,11 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = ngg_gs_vertex_ptr(ctx, tid); flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; } /* Geometry shaders output triangle strips, but NGG expects triangles. */ diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index c789c282df52..dc00d1008d71 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -70,17 +70,6 @@ static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); } -static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index) -{ - if (ctx->stage == MESA_SHADER_VERTEX) { - LLVMValueRef tmp; - tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); - return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); - } - return ctx->ac.i1false; -} - /** * Return the number of vertices as a constant in \p num_vertices, * and return a more precise value as LLVMValueRef from the function. 
@@ -190,19 +179,23 @@ void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef use ngg_get_vertices_per_prim(ctx, &prim.num_vertices); prim.isnull = ctx->ac.i1false; + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + for (unsigned i = 0; i < 3; ++i) prim.index[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); - for (unsigned i = 0; i < prim.num_vertices; ++i) { - prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); + if (ctx->shader->selector->info.writes_edgeflag) { + LLVMValueRef edgeflags = ctx->ac.i32_0; - if (ctx->shader->selector->info.writes_edgeflag) { + for (unsigned i = 0; i < prim.num_vertices; ++i) { LLVMValueRef edge; edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); - edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); - prim.edgeflag[i] = edge; + edge = LLVMBuildZExt(ctx->ac.builder, edge, ctx->ac.i32, ""); + edge = LLVMBuildShl(ctx->ac.builder, edge, LLVMConstInt(ctx->ac.i32, 9 + i*10, 0), ""); + edgeflags = LLVMBuildOr(ctx->ac.builder, edgeflags, edge, ""); } + prim.edgeflags = LLVMBuildAnd(ctx->ac.builder, prim.edgeflags, edgeflags, ""); } ac_build_export_prim(&ctx->ac, &prim); @@ -1159,12 +1152,12 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) struct ac_ngg_prim prim = {}; prim.num_vertices = 3; prim.isnull = ctx->ac.i1false; + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); for (unsigned vtx = 0; vtx < 3; vtx++) { prim.index[vtx] = LLVMBuildLoad( builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), ""); prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); - prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); } /* Set the new GS input VGPR. 
*/ @@ -1909,11 +1902,11 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) tmp = ngg_gs_vertex_ptr(ctx, tid); flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; } /* Geometry shaders output triangle strips, but NGG expects triangles. */ -- GitLab From 0b5c1537aa003a7f7a9f78af4dc9d9a8b7412679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 22 Aug 2021 16:32:50 -0400 Subject: [PATCH 40/42] radeonsi: don't set edgeflags for TES and blit VS they are disabled (TES) or have no effect (blit VS) Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index dc00d1008d71..7a92ab91febc 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -179,7 +179,12 @@ void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef use ngg_get_vertices_per_prim(ctx, &prim.num_vertices); prim.isnull = ctx->ac.i1false; - prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + + if (ctx->stage == MESA_SHADER_VERTEX && + !ctx->shader->selector->info.base.vs.blit_sgprs_amd) + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + else + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < 3; ++i) prim.index[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); @@ -1152,7 +1157,11 @@ void gfx10_emit_ngg_culling_epilogue(struct 
ac_shader_abi *abi) struct ac_ngg_prim prim = {}; prim.num_vertices = 3; prim.isnull = ctx->ac.i1false; - prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + + if (ctx->stage == MESA_SHADER_VERTEX) + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + else + prim.edgeflags = ctx->ac.i32_0; for (unsigned vtx = 0; vtx < 3; vtx++) { prim.index[vtx] = LLVMBuildLoad( -- GitLab From 70c975fd1366571d065e14724ecb35bd232a27b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 30 Aug 2021 02:21:19 -0400 Subject: [PATCH 41/42] radeonsi: fix incorrect comments about VGT_SHADER_STAGES_EN Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_draw.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 623d99f74525..763dfac8c4de 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -225,7 +225,7 @@ static bool si_update_shaders(struct si_context *sctx) union si_vgt_stages_key key; key.index = 0; - /* Update VGT_SHADER_CONFIG. */ + /* Update VGT_SHADER_STAGES_EN. 
*/ if (HAS_TESS) key.u.tess = 1; if (HAS_GS) -- GitLab From 7b4427b199f5aea6952d779eceeab5f7e8bae05f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 1 Sep 2021 03:43:34 -0400 Subject: [PATCH 42/42] radeonsi: enable NGG passthrough when LDS is used, document the real constraints Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 2 +- src/gallium/drivers/radeonsi/si_shader_llvm.c | 6 +----- src/gallium/drivers/radeonsi/si_state_shaders.c | 12 +++++++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 7a92ab91febc..69f18f9fe202 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -1339,7 +1339,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) LLVMValueRef is_es_thread = si_is_es_thread(ctx); LLVMValueRef vtxindex[3]; - if (ctx->shader->key.opt.ngg_culling) { + if (ctx->shader->key.opt.ngg_culling || gfx10_is_ngg_passthrough(ctx->shader)) { for (unsigned i = 0; i < 3; ++i) vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[0], 10 * i, 9); } else { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 083d73fca752..1a1dd07a507f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -896,12 +896,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* Unconditionally declare scratch space base for streamout and * vertex compaction. Whether space is actually allocated is * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. 
*/ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); + si_llvm_declare_esgs_ring(ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 76d96a561521..3ecbd5664a6d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -983,9 +983,15 @@ bool gfx10_is_ngg_passthrough(struct si_shader *shader) if (sel->screen->use_ngg_culling) return false; - return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->info.stage != MESA_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); + /* The definition of NGG passthrough is: + * - user GS is turned off (no amplification, no GS instancing, and no culling) + * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1) + * - vertex indices are packed into 1 VGPR + * - Dimgrey and later chips can optionally skip the gs_alloc_req message + * + * NGG passthrough still allows the use of LDS. + */ + return sel->info.stage != MESA_SHADER_GEOMETRY && !shader->key.opt.ngg_culling; } /* Common tail code for NGG primitive shaders. */ -- GitLab