diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 0f07d90327a93ad8715043023499cfc83c6f4267..cf0148c1477aefab06629190623a2c4d0c4e8cef 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -4627,13 +4627,11 @@ LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ng LLVMBuilderRef builder = ctx->builder; LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); + result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, ""); for (unsigned i = 0; i < prim->num_vertices; ++i) { tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), ""); result = LLVMBuildOr(builder, result, tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); - tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); - result = LLVMBuildOr(builder, result, tmp, ""); } return result; } diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index 0bdab2872faf500110ae73a292726084c9c30783..cdbec20600866a8f37aeceb32750209e240ce59f 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -581,7 +581,7 @@ struct ac_ngg_prim { unsigned num_vertices; LLVMValueRef isnull; LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; + LLVMValueRef edgeflags; LLVMValueRef passthrough; }; diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index dc357f29ac3db7e4f943001225121b842f9ecee5..2990e7c3c05629234d58eb2e40df41885852b720 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -1654,13 +1654,8 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) } else { prim.num_vertices = num_vertices; prim.isnull = ctx->ac.i1false; + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args->ac); memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); - - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - } } ac_build_export_prim(&ctx->ac, &prim); @@ -1926,11 +1921,11 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = ngg_gs_vertex_ptr(ctx, tid); flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; } /* Geometry shaders output triangle strips, but NGG expects triangles. */ diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index c789c282df52e5fcda444d184671ca834b5fe1b7..69f18f9fe20239a4786904e77d75c747f4a299ec 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -70,17 +70,6 @@ static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); } -static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index) -{ - if (ctx->stage == MESA_SHADER_VERTEX) { - LLVMValueRef tmp; - tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); - return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); - } - return ctx->ac.i1false; -} - /** * Return the number of vertices as a constant in \p num_vertices, * and return a more precise value as LLVMValueRef from the function. @@ -190,19 +179,28 @@ void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef use ngg_get_vertices_per_prim(ctx, &prim.num_vertices); prim.isnull = ctx->ac.i1false; + + if (ctx->stage == MESA_SHADER_VERTEX && + !ctx->shader->selector->info.base.vs.blit_sgprs_amd) + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + else + prim.edgeflags = ctx->ac.i32_0; + for (unsigned i = 0; i < 3; ++i) prim.index[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); - for (unsigned i = 0; i < prim.num_vertices; ++i) { - prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); + if (ctx->shader->selector->info.writes_edgeflag) { + LLVMValueRef edgeflags = ctx->ac.i32_0; - if (ctx->shader->selector->info.writes_edgeflag) { + for (unsigned i = 0; i < prim.num_vertices; ++i) { LLVMValueRef edge; edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); - edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); - prim.edgeflag[i] = edge; + edge = LLVMBuildZExt(ctx->ac.builder, edge, ctx->ac.i32, ""); + edge = LLVMBuildShl(ctx->ac.builder, edge, LLVMConstInt(ctx->ac.i32, 9 + i*10, 0), ""); + edgeflags = LLVMBuildOr(ctx->ac.builder, edgeflags, edge, ""); } + prim.edgeflags = LLVMBuildAnd(ctx->ac.builder, prim.edgeflags, edgeflags, ""); } ac_build_export_prim(&ctx->ac, &prim); @@ -1160,11 +1158,15 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) prim.num_vertices = 3; prim.isnull = ctx->ac.i1false; + if (ctx->stage == MESA_SHADER_VERTEX) + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + else + prim.edgeflags = ctx->ac.i32_0; + for (unsigned vtx = 0; vtx < 3; vtx++) { prim.index[vtx] = LLVMBuildLoad( builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), ""); prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); - prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); } /* Set the new GS input VGPR. */ @@ -1337,7 +1339,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) LLVMValueRef is_es_thread = si_is_es_thread(ctx); LLVMValueRef vtxindex[3]; - if (ctx->shader->key.opt.ngg_culling) { + if (ctx->shader->key.opt.ngg_culling || gfx10_is_ngg_passthrough(ctx->shader)) { for (unsigned i = 0; i < 3; ++i) vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[0], 10 * i, 9); } else { @@ -1909,11 +1911,11 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) tmp = ngg_gs_vertex_ptr(ctx, tid); flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; } /* Geometry shaders output triangle strips, but NGG expects triangles. */ diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 00081dd6391b627802acda916a44b12d42f57ced..6de37ab2bd89bc91e877346a50b9e780e8cfa349 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -98,11 +98,13 @@ void si_blitter_end(struct si_context *sctx) /* Restore shader pointers because the VS blit shader changed all * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index b96c9201fb7edfdec1ba4a774901f3d235300f7f..76949b7de54b7114a4532bab7a8f793d86e0cc7a 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -259,6 +259,26 @@ } \ } while (0) +#define radeon_opt_set_sh_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_sh_reg(cs, offset, __value); \ + sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + +#define radeon_opt_set_uconfig_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_uconfig_reg(cs, offset, __value); \ + sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + #define radeon_set_privileged_config_reg(cs, reg, value) do { \ assert((reg) < CIK_UCONFIG_REG_OFFSET); \ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \ diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 93c9f28bee7d43e07349affe71b8c4a402f0a3ac..0a563b3caa0f256cbe8fda0d2cf3312a7c05d276 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1225,6 +1225,19 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res sctx->descriptors_dirty |= 1u << descriptors_idx; } +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader) +{ + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (sctx->shaders[shader].key.opt.inline_uniforms) { + sctx->shaders[shader].key.opt.inline_uniforms = false; + memset(sctx->shaders[shader].key.opt.inlined_uniform_values, 0, + sizeof(sctx->shaders[shader].key.opt.inlined_uniform_values)); + sctx->do_update_shaders = true; + } +} + static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, uint slot, bool take_ownership, const struct pipe_constant_buffer *input) @@ -1244,10 +1257,8 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; } - if (slot == 0) { - /* Invalidate current inlinable uniforms. */ - sctx->inlinable_uniforms_valid_mask &= ~(1 << shader); - } + if (slot == 0) + si_invalidate_inlinable_uniforms(sctx, shader); } slot = si_get_constbuf_slot(slot); @@ -1262,10 +1273,13 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; - if (!(sctx->inlinable_uniforms_valid_mask & BITFIELD_BIT(shader))) { + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (!sctx->shaders[shader].key.opt.inline_uniforms) { /* It's the first time we set the constants. Always update shaders. */ - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); - sctx->inlinable_uniforms_valid_mask |= BITFIELD_BIT(shader); + sctx->shaders[shader].key.opt.inline_uniforms = true; + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); sctx->do_update_shaders = true; return; } @@ -1273,8 +1287,8 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, /* We have already set inlinable constants for this shader. Update the shader only if * the constants are being changed so as not to update shaders needlessly. */ - if (memcmp(sctx->inlinable_uniforms[shader], values, num_values * 4)) { - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); + if (memcmp(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4)) { + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); sctx->do_update_shaders = true; } } @@ -1940,11 +1954,13 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); if (shader == PIPE_SHADER_VERTEX) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; } si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); @@ -1952,12 +1968,14 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad void si_shader_pointers_mark_dirty(struct si_context *sctx) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 1081b238bcdaacdb10a506a41a62aabb9450e486..80e9f760e09b02e6ea26774aa4c1ec2653bbffae 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -294,7 +294,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ /* Set all cleared context registers to saved. */ - ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->tracked_regs.reg_saved = BITFIELD64_MASK(SI_TRACKED_GE_PC_ALLOC); ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index e77eb0e4de27c893aed7734f6a18f8295e3c84f9..2b3400dc8004805cc19379af3a6682718e0b8574 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -571,6 +571,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_state_functions(sctx); si_init_streamout_functions(sctx); si_init_viewport_functions(sctx); + si_init_spi_map_functions(sctx); sctx->blitter = util_blitter_create(&sctx->b); if (sctx->blitter == NULL) @@ -716,6 +717,23 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_cp_reg_shadowing(sctx); } + /* Set immutable fields of shader keys. */ + if (sctx->chip_class >= GFX9) { + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * This also enables jumping over the VS prolog for HS-only waves. + * + * When the LS VGPR fix is needed, monolithic shaders can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + sctx->shader.tcs.key.opt.prefer_mono = 1; + + /* This enables jumping over the VS prolog for GS-only waves. */ + sctx->shader.gs.key.opt.prefer_mono = 1; + } + si_begin_new_gfx_cs(sctx, true); assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size); @@ -1155,11 +1173,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; - /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't - * have to allocate and count references for the upload buffer. - */ - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && sscreen->info.family != CHIP_CARRIZO && diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 057fd9aef55b41fa8e70801befd327537d702afc..60a58957375953ad183ef58ce065a1d72b009ddf 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -518,7 +518,6 @@ struct si_screen { unsigned width, unsigned height, unsigned depth, uint32_t *state, uint32_t *fmask_state); - unsigned num_vbos_in_user_sgprs; unsigned max_memory_usage_kb; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; @@ -837,35 +836,6 @@ union si_vgt_param_key { uint16_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) - -/* The VGT_SHADER_STAGES key used to index the table of precomputed values. - * Some fields are set by state-change calls, most are set by draw_vbo. - */ -union si_vgt_stages_key { - struct { -#if UTIL_ARCH_LITTLE_ENDIAN - uint8_t tess : 1; - uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg : 1; /* gfx10+ */ - uint8_t streamout : 1; /* only used with NGG */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* UTIL_ARCH_BIG_ENDIAN */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; - uint8_t streamout : 1; - uint8_t ngg : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t gs : 1; - uint8_t tess : 1; -#endif - } u; - uint8_t index; -}; - struct si_texture_handle { unsigned desc_slot; bool desc_dirty; @@ -1037,11 +1007,10 @@ struct si_context { struct si_cs_shader_state cs_shader_state; /* shader information */ + uint64_t ps_inputs_read_or_disabled; struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; - unsigned sprite_coord_enable; unsigned cs_max_waves_per_sh; - bool flatshade; bool do_update_shaders; bool compute_shaderbuf_sgprs_dirty; bool compute_image_sgprs_dirty; @@ -1054,8 +1023,6 @@ struct si_context { unsigned descriptors_dirty; unsigned shader_pointers_dirty; unsigned shader_needs_decompress_mask; - unsigned inlinable_uniforms_valid_mask; - uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS]; struct si_buffer_resources internal_bindings; struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; struct si_samplers samplers[SI_NUM_SHADERS]; @@ -1110,10 +1077,7 @@ struct si_context { bool allow_flat_shading : 1; /* Emitted draw state. */ - bool gs_tri_strip_adj_fix : 1; - bool ls_vgpr_fix : 1; bool ngg : 1; - bool same_patch_vertices : 1; uint8_t ngg_culling; unsigned last_index_size; int last_base_vertex; @@ -1257,6 +1221,7 @@ struct si_context { pipe_draw_vbo_func draw_vbo[2][2][2]; /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ pipe_draw_vbo_func real_draw_vbo; + void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ struct ac_thread_trace_data *thread_trace; @@ -1579,6 +1544,9 @@ bool si_init_thread_trace(struct si_context *sctx); void si_destroy_thread_trace(struct si_context *sctx); void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs); +/* si_state_shaders.c */ +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key); + /* * common helpers */ @@ -1988,6 +1956,20 @@ static inline unsigned si_get_num_coverage_samples(struct si_context *sctx) return 1; } +static unsigned ALWAYS_INLINE +si_num_vbos_in_user_sgprs_inline(enum chip_class chip_class) +{ + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. + */ + return chip_class >= GFX9 ? 5 : 1; +} + +static inline unsigned si_num_vbos_in_user_sgprs(struct si_screen *sscreen) +{ + return si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 22b6e3ad5185a32b5d8a01ed0fd809055a0f5777..8213fe3819052a311a67c14a4dce74909cdb3bdd 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -117,8 +117,8 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (state->shader) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, state->shader->bo, + if (state->is_shader) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); } @@ -139,7 +139,7 @@ void si_pm4_reset_emitted(struct si_context *sctx, bool first_cs) for (unsigned i = 0; i < SI_NUM_STATES; i++) { struct si_pm4_state *state = sctx->emitted.array[i]; - if (state && state->shader) { + if (state && state->is_shader) { sctx->emitted.array[i] = NULL; sctx->dirty_states |= 1 << i; } diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index 06909ff1a910294e05da88cfd1ed581c829fe158..03f79e0ba30ebf8b06be3c4afceebc5337d0fb19 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -54,7 +54,7 @@ struct si_pm4_state { uint32_t pm4[SI_PM4_MAX_DW]; /* For shader states only */ - struct si_shader *shader; + bool is_shader; struct si_atom atom; }; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b3d84eec8feb30cd04efd5d2293700a92961999b..b381d5d09a344f81afb1d75c10b5be1f7ed9c40f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1433,8 +1433,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi si_dump_streamout(&sel->so); } - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, - sizeof(shader->info.vs_output_param_offset)); + /* Initialize vs_output_ps_input_cntl to default. */ + for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++) + shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; + shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; shader->info.uses_instanceid = sel->info.uses_instanceid; @@ -1445,6 +1447,43 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) return false; + /* Compute vs_output_ps_input_cntl. */ + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !shader->key.as_ls && !shader->key.as_es) { + ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; + + if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) + vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + + /* VS and TES should also set primitive ID output if it's used. */ + unsigned num_outputs_with_prim_id = sel->info.num_outputs + + shader->key.mono.u.vs_export_prim_id; + + for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { + unsigned semantic = sel->info.output_semantic[i]; + unsigned offset = vs_output_param_offset[i]; + unsigned ps_input_cntl; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl = S_028644_OFFSET(offset); + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* OFFSET=0x20 means that DEFAULT_VAL is used. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; + } + } + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ if (sel->info.stage == MESA_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; @@ -2002,8 +2041,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler shader->info.num_input_vgprs = mainp->info.num_input_vgprs; shader->info.face_vgpr_index = mainp->info.face_vgpr_index; shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); + memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, + sizeof(mainp->info.vs_output_ps_input_cntl)); shader->info.uses_instanceid = mainp->info.uses_instanceid; shader->info.nr_pos_exports = mainp->info.nr_pos_exports; shader->info.nr_param_exports = mainp->info.nr_param_exports; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index acb5d6fe3cf33fe22c5643a2a408aafabadf6789..fa32c8ed705f30b5791a37e79219edacaca1440d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -138,6 +138,7 @@ #include "util/u_inlines.h" #include "util/u_live_shader_cache.h" #include "util/u_queue.h" +#include "si_pm4.h" #include @@ -158,6 +159,12 @@ struct si_context; #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) +#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) +#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) +#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000 +/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */ +#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 + /* SGPR user data indices */ enum { @@ -323,6 +330,16 @@ enum si_color_output_type { SI_TYPE_UINT16, }; +union si_input_info { + struct { + ubyte semantic; + ubyte interpolate; + ubyte fp16_lo_hi_valid; + ubyte usage_mask; + }; + uint32_t _unused; /* this just forces 4-byte alignment */ +}; + struct si_shader_info { shader_info base; @@ -330,12 +347,8 @@ struct si_shader_info { ubyte num_inputs; ubyte num_outputs; - ubyte input_semantic[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte input_fp16_lo_hi_valid[PIPE_MAX_SHADER_INPUTS]; + union si_input_info input[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; - char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; @@ -700,6 +713,7 @@ struct si_shader_key { /* GCN-specific shader info. */ struct si_shader_binary_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; signed char face_vgpr_index; @@ -729,7 +743,37 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; +#define SI_NUM_VGT_STAGES_KEY_BITS 6 +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) + +/* The VGT_SHADER_STAGES key used to index the table of precomputed values. + * Some fields are set by state-change calls, most are set by draw_vbo. + */ +union si_vgt_stages_key { + struct { +#if UTIL_ARCH_LITTLE_ENDIAN + uint8_t tess : 1; + uint8_t gs : 1; + uint8_t ngg_gs_fast_launch : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg : 1; /* gfx10+ */ + uint8_t streamout : 1; /* only used with NGG */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; +#else /* UTIL_ARCH_BIG_ENDIAN */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; + uint8_t streamout : 1; + uint8_t ngg : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg_gs_fast_launch : 1; + uint8_t gs : 1; + uint8_t tess : 1; +#endif + } u; + uint8_t index; +}; + struct si_shader { + struct si_pm4_state pm4; /* base class */ struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; @@ -741,7 +785,6 @@ struct si_shader { struct si_shader_part *prolog2; struct si_shader_part *epilog; - struct si_pm4_state *pm4; struct si_resource *bo; struct si_resource *scratch_bo; struct si_shader_key key; @@ -796,6 +839,8 @@ struct si_shader { unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_max_prims_per_subgroup; unsigned vgt_esgs_ring_itemsize; + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; } gs; struct { @@ -812,6 +857,9 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; + union si_vgt_stages_key vgt_stages; } ngg; struct { @@ -832,6 +880,7 @@ struct si_shader { unsigned spi_shader_z_format; unsigned spi_shader_col_format; unsigned cb_shader_mask; + unsigned num_interp; } ps; } ctx_reg; @@ -888,7 +937,7 @@ bool gfx10_is_ngg_passthrough(struct si_shader *shader); /* Return the pointer to the main shader part's pointer. */ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) + const struct si_shader_key *key) { if (key->as_ls) return &sel->main_shader_part_ls; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index c975581fe4f78c53164e3d54c694a1e2c24aedaa..1a1dd07a507f918d36e8a1a33ec0532dcac5d450 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_nir_to_llvm.h" #include "ac_rtld.h" #include "si_pipe.h" @@ -895,12 +896,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* Unconditionally declare scratch space base for streamout and * vertex compaction. Whether space is actually allocated is * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. */ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); + si_llvm_declare_esgs_ring(ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index b4a3b8a8aadc4929c20776cbeb3359de7d47fce0..a9ab0c549f3d0b76ec55c3236436fea01a7b22e4 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -52,7 +52,7 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in unsigned param; LLVMValueRef value; - param = si_shader_io_get_unique_index(info->input_semantic[input_index], false); + param = si_shader_io_get_unique_index(info->input[input_index].semantic, false); /* GFX9 has the ESGS ring in LDS. */ if (ctx->screen->info.chip_class >= GFX9) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index c79c475506d7438d6f2444fa17c6f6d6d35e52f3..68e3fc18e2184984e60b0e34e2436468a890d034 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -390,7 +390,7 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMType ubyte semantic; if (load_input) { - semantic = info->input_semantic[driver_location]; + semantic = info->input[driver_location].semantic; } else { semantic = info->output_semantic[driver_location]; } @@ -448,7 +448,7 @@ static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef struct si_shader_info *info = &ctx->shader->selector->info; LLVMValueRef base, addr; - ubyte semantic = info->input_semantic[driver_location]; + ubyte semantic = info->input[driver_location].semantic; assert((semantic >= VARYING_SLOT_PATCH0 || semantic == VARYING_SLOT_TESS_LEVEL_INNER || diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index b6bfa6fe09d412c22916e48144668508b0ee1ccc..d35c296c2195d37e40a5ec1aa0f837b142d25d71 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -26,6 +26,7 @@ #include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" +#include "ac_exp_param.h" static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { @@ -107,7 +108,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L * ... which is what we must prevent at all cost. */ const bool can_speculate = false; - unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32; + unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32; LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32; LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32; unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; @@ -157,7 +158,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L return; } - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + unsigned required_channels = util_last_bit(info->input[input_index].usage_mask); if (required_channels == 0) { for (unsigned i = 0; i < 4; ++i) out[i] = LLVMGetUndef(ctx->ac.f32); @@ -452,6 +453,9 @@ static void si_prepare_param_exports(struct si_shader_context *ctx, struct si_shader *shader = ctx->shader; unsigned param_count = 0; + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, + sizeof(shader->info.vs_output_param_offset)); + for (unsigned i = 0; i < noutput; i++) { unsigned semantic = outputs[i].semantic; diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 832dd4fed5ca354e122ff40a804fc68525a6655e..ed07fa7e0a78d6dcb6b57de8bfbb14d4526dcb5d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -108,21 +108,25 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr unsigned num_slots = indirect ? nir_intrinsic_io_semantics(intr).num_slots : 1; if (is_input) { - assert(driver_location + num_slots <= ARRAY_SIZE(info->input_usage_mask)); + assert(driver_location + num_slots <= ARRAY_SIZE(info->input)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; - info->input_semantic[loc] = semantic + i; - info->input_interpolate[loc] = interp; + info->input[loc].semantic = semantic + i; + + if (semantic == SYSTEM_VALUE_PRIMITIVE_ID) + info->input[loc].interpolate = INTERP_MODE_FLAT; + else + info->input[loc].interpolate = interp; if (mask) { - info->input_usage_mask[loc] |= mask; + info->input[loc].usage_mask |= mask; if (bit_size == 16) { if (nir_intrinsic_io_semantics(intr).high_16bits) - info->input_fp16_lo_hi_valid[loc] |= 0x2; + info->input[loc].fp16_lo_hi_valid |= 0x2; else - info->input_fp16_lo_hi_valid[loc] |= 0x1; + info->input[loc].fp16_lo_hi_valid |= 0x1; } info->num_inputs = MAX2(info->num_inputs, loc + 1); } @@ -130,13 +134,11 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } else { /* Outputs. */ assert(driver_location + num_slots <= ARRAY_SIZE(info->output_usagemask)); - assert(semantic + num_slots < ARRAY_SIZE(info->output_semantic_to_slot)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; info->output_semantic[loc] = semantic + i; - info->output_semantic_to_slot[semantic + i] = loc; if (is_output_load) { /* Output loads have only a few things that we need to track. */ @@ -475,14 +477,22 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf info->writes_position = nir->info.outputs_written & VARYING_BIT_POS; } - memset(info->output_semantic_to_slot, -1, sizeof(info->output_semantic_to_slot)); - func = (struct nir_function *)exec_list_get_head_const(&nir->functions); nir_foreach_block (block, func->impl) { nir_foreach_instr (instr, block) scan_instruction(nir, info, instr); } + if (info->stage == MESA_SHADER_VERTEX || info->stage == MESA_SHADER_TESS_EVAL) { + /* Add the PrimitiveID output, but don't increment num_outputs. + * The driver inserts PrimitiveID only when it's used by the pixel shader, + * and si_emit_spi_map uses this unconditionally when such a pixel shader is used. + */ + info->output_semantic[info->num_outputs] = VARYING_SLOT_PRIMITIVE_ID; + info->output_type[info->num_outputs] = nir_type_uint32; + info->output_usagemask[info->num_outputs] = 0x1; + } + if (nir->info.stage == MESA_SHADER_FRAGMENT) { info->allow_flat_shading = !(info->uses_persp_center || info->uses_persp_centroid || info->uses_persp_sample || info->uses_linear_center || @@ -496,16 +506,25 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION)); - } - /* Add color inputs to the list of inputs. */ - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - for (unsigned i = 0; i < 2; i++) { - if ((info->colors_read >> (i * 4)) & 0xf) { - info->input_semantic[info->num_inputs] = VARYING_SLOT_COL0 + i; - info->input_interpolate[info->num_inputs] = info->color_interpolate[i]; - info->input_usage_mask[info->num_inputs] = info->colors_read >> (i * 4); - info->num_inputs++; + /* Add both front and back color inputs. */ + unsigned num_inputs_with_colors = info->num_inputs; + for (unsigned back = 0; back < 2; back++) { + for (unsigned i = 0; i < 2; i++) { + if ((info->colors_read >> (i * 4)) & 0xf) { + unsigned index = num_inputs_with_colors; + + info->input[index].semantic = (back ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + i; + info->input[index].interpolate = info->color_interpolate[i]; + info->input[index].usage_mask = info->colors_read >> (i * 4); + num_inputs_with_colors++; + + /* Back-face color don't increment num_inputs. si_emit_spi_map will use + * back-face colors conditionally only when they are needed. + */ + if (!back) + info->num_inputs = num_inputs_with_colors; + } } } } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 0ce9f50d25055e025ae1347016ad3756ec16d2bb..8180201bd28d03697c4697c6fcd5458c0194268c 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -709,8 +709,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -1110,6 +1114,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade) + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || @@ -1119,8 +1127,19 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || old_rs->clamp_fragment_color != rs->clamp_fragment_color || old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } + + if (old_rs->line_smooth != rs->line_smooth || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->flatshade != rs->flatshade) + si_update_vrs_flat_shading(sctx); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -1336,8 +1355,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (old_dsa->alpha_func != dsa->alpha_func) + if (old_dsa->alpha_func != dsa->alpha_func) { + si_ps_key_update_dsa(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || @@ -2983,6 +3006,10 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { @@ -3635,6 +3662,9 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) return; sctx->ps_iter_samples = min_samples; + + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; si_update_ps_iter_samples(sctx); @@ -4650,8 +4680,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->count = count; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sscreen); unsigned alloc_count = - count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0; + count > num_vbos_in_user_sgprs ? count - num_vbos_in_user_sgprs : 0; v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { @@ -4875,8 +4906,10 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) * src_offset alignment, which is reflected in fix_fetch_opencode. */ old->fix_fetch_opencode != v->fix_fetch_opencode || memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * - MAX2(old->count, v->count))) + MAX2(old->count, v->count))) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } if (v->instance_divisor_is_fetched) { struct pipe_constant_buffer cb; @@ -4972,8 +5005,10 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, * be the case in well-behaved applications anyway. */ if ((sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) + (unaligned | orig_unaligned) & updated_mask)) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } } /* diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f2608736784231c00729a063ed8c9f9929b1ac13..cd1bd6328515b5b65546363d62a03f69e6667fe7 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -183,13 +183,13 @@ union si_state { struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; + struct si_shader *ls; + struct si_shader *hs; + struct si_shader *es; + struct si_shader *gs; struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; + struct si_shader *vs; + struct si_shader *ps; } named; struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)]; }; @@ -342,7 +342,10 @@ enum si_tracked_reg SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + /* Non-context registers: */ SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, SI_NUM_TRACKED_REGS, }; @@ -488,6 +491,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ bool is_stencil, uint16_t access, uint32_t * restrict state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader); void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, struct pipe_constant_buffer *cbuf); void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, @@ -563,7 +567,6 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache); bool si_shader_mem_ordered(struct si_shader *shader); -bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); @@ -574,13 +577,28 @@ void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage, util_queue_execute_func execute); void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers, uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const struct si_shader_key *key, int thread_index, + bool optimized_or_none); +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state); +void si_vs_key_update_inputs(struct si_context *sctx); +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key); +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); +void si_update_ps_kill_enable(struct si_context *sctx); +void si_update_vrs_flat_shading(struct si_context *sctx); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); +void si_ps_key_update_framebuffer(struct si_context *sctx); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx); +void si_ps_key_update_blend_rasterizer(struct si_context *sctx); +void si_ps_key_update_rasterizer(struct si_context *sctx); +void si_ps_key_update_dsa(struct si_context *sctx); +void si_ps_key_update_sample_shading(struct si_context *sctx); +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); +void si_init_tess_factor_ring(struct si_context *sctx); +bool si_update_gs_ring_buffers(struct si_context *sctx); +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes); /* si_state_draw.c */ void si_init_draw_functions_GFX6(struct si_context *sctx); @@ -589,6 +607,7 @@ void si_init_draw_functions_GFX8(struct si_context *sctx); void si_init_draw_functions_GFX9(struct si_context *sctx); void si_init_draw_functions_GFX10(struct si_context *sctx); void si_init_draw_functions_GFX10_3(struct si_context *sctx); +void si_init_spi_map_functions(struct si_context *sctx); /* si_state_msaa.c */ void si_init_msaa_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index f8084e8a4821783f84a841fbaa5fd5834cb18992..763dfac8c4de1cddd405da211d18d53f3e861bdf 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_sqtt.h" #include "si_build_pm4.h" #include "util/u_index_modify.h" @@ -47,6 +48,317 @@ /* special primitive types */ #define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX +template +static void si_emit_spi_map(struct si_context *sctx) +{ + struct si_shader *ps = sctx->shader.ps.current; + struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; + unsigned spi_ps_input_cntl[NUM_INTERP]; + + STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32); + + if (!NUM_INTERP) + return; + + struct si_shader *vs = si_get_vs(sctx)->current; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + for (unsigned i = 0; i < NUM_INTERP; i++) { + union si_input_info input = psinfo->input[i]; + unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic]; + bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20; + + if (non_default_val) { + if (input.interpolate == INTERP_MODE_FLAT || + (input.interpolate == INTERP_MODE_COLOR && rs->flatshade)) + ps_input_cntl |= S_028644_FLAT_SHADE(1); + + if (input.fp16_lo_hi_valid) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ + S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2)); + } + } + + if (input.semantic == VARYING_SLOT_PNTC || + (input.semantic >= VARYING_SLOT_TEX0 && input.semantic <= VARYING_SLOT_TEX7 && + rs->sprite_coord_enable & (1 << (input.semantic - VARYING_SLOT_TEX0)))) { + /* Overwrite the whole value for sprite coordinates. */ + ps_input_cntl = S_028644_OFFSET(0) | + S_028644_PT_SPRITE_TEX(1); + if (input.fp16_lo_hi_valid & 0x1) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } + } + + spi_ps_input_cntl[i] = ps_input_cntl; + } + + /* R_028644_SPI_PS_INPUT_CNTL_0 */ + /* Dota 2: Only ~16% of SPI map updates set different values. */ + /* Talos: Only ~9% of SPI map updates set different values. */ + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); + radeon_end_update_context_roll(sctx); +} + +template +static bool si_update_shaders(struct si_context *sctx) +{ + struct pipe_context *ctx = (struct pipe_context *)sctx; + struct si_shader *old_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; + unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0; + struct si_shader *old_ps = sctx->shader.ps.current; + unsigned old_spi_shader_col_format = + old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; + int r; + + /* Update TCS and TES. */ + if (HAS_TESS) { + if (!sctx->tess_rings) { + si_init_tess_factor_ring(sctx); + if (!sctx->tess_rings) + return false; + } + + if (sctx->shader.tcs.cso) { + r = si_shader_select(ctx, &sctx->shader.tcs); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current); + } else { + if (!sctx->fixed_func_tcs_shader.cso) { + sctx->fixed_func_tcs_shader.cso = + (struct si_shader_selector*)si_create_fixed_func_tcs(sctx); + if (!sctx->fixed_func_tcs_shader.cso) + return false; + + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.invoc0_tess_factors_are_def = + sctx->fixed_func_tcs_shader.cso->info.tessfactors_are_def_in_all_invocs; + } + + r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current); + } + + if (!HAS_GS || GFX_VERSION <= GFX8) { + r = si_shader_select(ctx, &sctx->shader.tes); + if (r) + return false; + + if (HAS_GS) { + /* TES as ES */ + assert(GFX_VERSION <= GFX8); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); + } else if (NGG) { + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); + } else { + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); + } + } + } else { + if (GFX_VERSION <= GFX8) { + si_pm4_bind_state(sctx, ls, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + } + si_pm4_bind_state(sctx, hs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; + } + + /* Update GS. */ + if (HAS_GS) { + r = si_shader_select(ctx, &sctx->shader.gs); + if (r) + return false; + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); + if (!NGG) { + si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader); + + if (!si_update_gs_ring_buffers(sctx)) + return false; + } else { + si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + } + } else { + if (!NGG) { + si_pm4_bind_state(sctx, gs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + if (GFX_VERSION <= GFX8) { + si_pm4_bind_state(sctx, es, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + } + } + } + + /* Update VS. */ + if ((!HAS_TESS && !HAS_GS) || GFX_VERSION <= GFX8) { + r = si_shader_select(ctx, &sctx->shader.vs); + if (r) + return false; + + if (!HAS_TESS && !HAS_GS) { + if (NGG) { + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); + si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + } else { + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); + } + } else if (HAS_TESS) { + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); + } else { + assert(HAS_GS); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); + } + } + + sctx->vs_uses_base_instance = + sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance : + sctx->queued.named.hs ? sctx->queued.named.hs->uses_base_instance : + sctx->shader.gs.current->uses_base_instance; + + union si_vgt_stages_key key; + key.index = 0; + + /* Update VGT_SHADER_STAGES_EN. */ + if (HAS_TESS) + key.u.tess = 1; + if (HAS_GS) + key.u.gs = 1; + if (NGG) + key.index |= si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ctx_reg.ngg.vgt_stages.index; + + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; + if (unlikely(!*pm4)) + *pm4 = si_build_vgt_shader_config(sctx->screen, key); + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); + + if (old_kill_clip_distances != si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.kill_clip_distances) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + r = si_shader_select(ctx, &sctx->shader.ps); + if (r) + return false; + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); + + if (si_pm4_state_changed(sctx, ps) || + (!NGG && si_pm4_state_changed(sctx, vs)) || + (NGG && si_pm4_state_changed(sctx, gs))) { + sctx->atoms.s.spi_map.emit = sctx->emit_spi_map[sctx->shader.ps.current->ctx_reg.ps.num_interp]; + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + } + + if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) && + si_pm4_state_changed(sctx, ps) && + (!old_ps || old_spi_shader_col_format != + sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->smoothing_enabled != + sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) { + sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing; + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + /* NGG cull state uses smoothing_enabled. */ + if (GFX_VERSION >= GFX10 && sctx->screen->use_ngg_culling) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); + + if (GFX_VERSION == GFX6) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + if (sctx->framebuffer.nr_samples <= 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { + /* Pretend the bound shaders form a vk pipeline */ + uint32_t pipeline_code_hash = 0; + uint64_t base_address = ~0; + + for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + struct si_shader *shader = sctx->shaders[i].current; + if (sctx->shaders[i].cso && shader) { + pipeline_code_hash = _mesa_hash_data_with_seed( + shader->binary.elf_buffer, + shader->binary.elf_size, + pipeline_code_hash); + base_address = MIN2(base_address, + shader->bo->gpu_address); + } + } + + struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; + if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { + si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false); + } + + si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0); + } + + if ((GFX_VERSION <= GFX8 && + (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, es))) || + si_pm4_state_enabled_and_changed(sctx, hs) || si_pm4_state_enabled_and_changed(sctx, gs) || + si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { + unsigned scratch_size = 0; + + if (HAS_TESS) { + if (GFX_VERSION <= GFX8) /* LS */ + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->queued.named.hs->config.scratch_bytes_per_wave); + + if (HAS_GS) { + if (GFX_VERSION <= GFX8) /* ES */ + scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave); + } else { + scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave); + } + } else if (HAS_GS) { + if (GFX_VERSION <= GFX8) /* ES */ + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave); + } else { + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + } + + scratch_size = MAX2(scratch_size, sctx->shader.ps.current->config.scratch_bytes_per_wave); + + if (scratch_size && !si_update_spi_tmpring_size(sctx, scratch_size)) + return false; + + if (GFX_VERSION >= GFX7) { + if (GFX_VERSION <= GFX8 && HAS_TESS && si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + + if (HAS_TESS && si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + + if (GFX_VERSION <= GFX8 && HAS_GS && si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + + if ((HAS_GS || NGG) && si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; + + if (!NGG && si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + } + } + + sctx->do_update_shaders = false; + return true; +} + ALWAYS_INLINE static unsigned si_conv_pipe_prim(unsigned mode) { @@ -71,9 +383,9 @@ static unsigned si_conv_pipe_prim(unsigned mode) return prim_conv[mode]; } -static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) +static void si_prefetch_shader_async(struct si_context *sctx, struct si_shader *shader) { - struct pipe_resource *bo = &state->shader->bo->b.b; + struct pipe_resource *bo = &shader->bo->b.b; si_cp_dma_prefetch(sctx, bo, 0, bo->width0); } @@ -1355,6 +1667,7 @@ template num_vertex_elements; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION); bool pointer_dirty, user_sgprs_dirty; assert(count <= SI_MAX_ATTRIBS); @@ -1391,7 +1704,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) } unsigned first_vb_use_mask = velems->first_vb_use_mask; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; for (unsigned i = 0; i < count; i++) { struct pipe_vertex_buffer *vb; @@ -1456,7 +1768,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) if (pointer_dirty || user_sgprs_dirty) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); assert(count); @@ -1665,8 +1976,8 @@ static void si_draw_vbo(struct pipe_context *ctx, GFX_VERSION >= GFX9 && tcs && sctx->patch_vertices == tcs->info.base.tess.tcs_vertices_out; - if (sctx->same_patch_vertices != same_patch_vertices) { - sctx->same_patch_vertices = same_patch_vertices; + if (sctx->shader.tcs.key.opt.same_patch_vertices != same_patch_vertices) { + sctx->shader.tcs.key.opt.same_patch_vertices = same_patch_vertices; sctx->do_update_shaders = true; } @@ -1681,8 +1992,9 @@ static void si_draw_vbo(struct pipe_context *ctx, bool ls_vgpr_fix = tcs && sctx->patch_vertices > tcs->info.base.tess.tcs_vertices_out; - if (ls_vgpr_fix != sctx->ls_vgpr_fix) { - sctx->ls_vgpr_fix = ls_vgpr_fix; + if (ls_vgpr_fix != sctx->shader.tcs.key.part.tcs.ls_prolog.ls_vgpr_fix) { + sctx->shader.tcs.key.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix; + sctx->fixed_func_tcs_shader.key.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix; sctx->do_update_shaders = true; } } @@ -1716,8 +2028,8 @@ static void si_draw_vbo(struct pipe_context *ctx, bool gs_tri_strip_adj_fix = !HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { - sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; + if (gs_tri_strip_adj_fix != sctx->shader.gs.key.part.gs.prolog.tri_strip_adj_fix) { + sctx->shader.gs.key.part.gs.prolog.tri_strip_adj_fix = gs_tri_strip_adj_fix; sctx->do_update_shaders = true; } } @@ -1886,7 +2198,7 @@ static void si_draw_vbo(struct pipe_context *ctx, } if (unlikely(sctx->do_update_shaders)) { - if (unlikely(!si_update_shaders(sctx))) { + if (unlikely(!(si_update_shaders(sctx)))) { DRAW_CLEANUP; return; } @@ -2164,3 +2476,48 @@ void GFX(si_init_draw_functions_)(struct si_context *sctx) si_init_ia_multi_vgt_param_table(sctx); } + +#if GFX_VER == 6 /* declare this function only once because it supports all chips. */ + +extern "C" +void si_init_spi_map_functions(struct si_context *sctx) +{ + /* This unrolls the loops in si_emit_spi_map and inlines memcmp and memcpys. + * It improves performance for viewperf/snx. + */ + sctx->emit_spi_map[0] = si_emit_spi_map<0>; + sctx->emit_spi_map[1] = si_emit_spi_map<1>; + sctx->emit_spi_map[2] = si_emit_spi_map<2>; + sctx->emit_spi_map[3] = si_emit_spi_map<3>; + sctx->emit_spi_map[4] = si_emit_spi_map<4>; + sctx->emit_spi_map[5] = si_emit_spi_map<5>; + sctx->emit_spi_map[6] = si_emit_spi_map<6>; + sctx->emit_spi_map[7] = si_emit_spi_map<7>; + sctx->emit_spi_map[8] = si_emit_spi_map<8>; + sctx->emit_spi_map[9] = si_emit_spi_map<9>; + sctx->emit_spi_map[10] = si_emit_spi_map<10>; + sctx->emit_spi_map[11] = si_emit_spi_map<11>; + sctx->emit_spi_map[12] = si_emit_spi_map<12>; + sctx->emit_spi_map[13] = si_emit_spi_map<13>; + sctx->emit_spi_map[14] = si_emit_spi_map<14>; + sctx->emit_spi_map[15] = si_emit_spi_map<15>; + sctx->emit_spi_map[16] = si_emit_spi_map<16>; + sctx->emit_spi_map[17] = si_emit_spi_map<17>; + sctx->emit_spi_map[18] = si_emit_spi_map<18>; + sctx->emit_spi_map[19] = si_emit_spi_map<19>; + sctx->emit_spi_map[20] = si_emit_spi_map<20>; + sctx->emit_spi_map[21] = si_emit_spi_map<21>; + sctx->emit_spi_map[22] = si_emit_spi_map<22>; + sctx->emit_spi_map[23] = si_emit_spi_map<23>; + sctx->emit_spi_map[24] = si_emit_spi_map<24>; + sctx->emit_spi_map[25] = si_emit_spi_map<25>; + sctx->emit_spi_map[26] = si_emit_spi_map<26>; + sctx->emit_spi_map[27] = si_emit_spi_map<27>; + sctx->emit_spi_map[28] = si_emit_spi_map<28>; + sctx->emit_spi_map[29] = si_emit_spi_map<29>; + sctx->emit_spi_map[30] = si_emit_spi_map<30>; + sctx->emit_spi_map[31] = si_emit_spi_map<31>; + sctx->emit_spi_map[32] = si_emit_spi_map<32>; +} + +#endif diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index ccbc41951bb427ea5f9f27ff88ab8d896f2cad5f..3ecbd5664a6d8e0ee8433a63063d57205b3efa39 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -373,7 +373,7 @@ bool si_shader_mem_ordered(struct si_shader *shader) } static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, - struct si_pm4_state *pm4) + struct si_shader *shader) { const struct si_shader_info *info = &tes->info; unsigned tes_prim_mode = info->base.tess.primitive_mode; @@ -430,10 +430,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad } else distribution_mode = V_028B6C_NO_DIST; - assert(pm4->shader); - pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | - S_028B6C_TOPOLOGY(topology) | - S_028B6C_DISTRIBUTION_MODE(distribution_mode); + shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -447,18 +446,16 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad * VS as ES | ES -> GS -> VS | 30 * TES as VS | LS -> HS -> VS | 14 or 30 * TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30 - * - * If "shader" is NULL, it's assumed it's not LS or GS copy shader. */ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, - struct si_shader *shader, struct si_pm4_state *pm4) + struct si_shader *shader) { if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) return; /* VS as VS, or VS as ES: */ if ((sel->info.stage == MESA_SHADER_VERTEX && - (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) || + (!shader->key.as_ls && !shader->is_gs_copy_shader)) || /* TES as VS, or TES as ES: */ sel->info.stage == MESA_SHADER_TESS_EVAL) { unsigned vtx_reuse_depth = 30; @@ -467,25 +464,15 @@ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_sh sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD) vtx_reuse_depth = 14; - assert(pm4->shader); - pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; } } static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) { - if (shader->pm4) - si_pm4_clear_state(shader->pm4); - else - shader->pm4 = CALLOC_STRUCT(si_pm4_state); - - if (shader->pm4) { - shader->pm4->shader = shader; - return shader->pm4; - } else { - fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); - return NULL; - } + si_pm4_clear_state(&shader->pm4); + shader->pm4.is_shader = true; + return &shader->pm4; } static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, @@ -616,7 +603,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_es(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.es->shader; + struct si_shader *shader = sctx->queued.named.es; if (!shader) return; @@ -677,9 +664,9 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, @@ -777,7 +764,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * static void si_emit_shader_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -832,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); } radeon_end_update_context_roll(sctx); + + /* These don't cause any context rolls. */ + radeon_begin_again(&sctx->gfx_cs); + if (sctx->chip_class >= GFX7) { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + } + if (sctx->chip_class >= GFX10) { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -936,13 +937,11 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | @@ -953,14 +952,13 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); + si_set_tesseval_regs(sscreen, shader->key.part.gs.es, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader); } else { - if (sscreen->info.chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -975,23 +973,6 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) } } -static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) -{ - enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); - radeon_end(); - - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } -} - bool gfx10_is_ngg_passthrough(struct si_shader *shader) { struct si_shader_selector *sel = shader->selector; @@ -1002,9 +983,15 @@ bool gfx10_is_ngg_passthrough(struct si_shader *shader) if (sel->screen->use_ngg_culling) return false; - return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->info.stage != MESA_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); + /* The definition of NGG passthrough is: + * - user GS is turned off (no amplification, no GS instancing, and no culling) + * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1) + * - vertex indices are packed into 1 VGPR + * - Dimgrey and later chips can optionally skip the gs_alloc_req message + * + * NGG passthrough still allows the use of LDS. + */ + return sel->info.stage != MESA_SHADER_GEOMETRY && !shader->key.opt.ngg_culling; } /* Common tail code for NGG primitive shaders. */ @@ -1040,13 +1027,22 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); radeon_end_update_context_roll(sctx); - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); + /* These don't cause a context roll. */ + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.ngg.ge_pc_alloc); + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1055,7 +1051,7 @@ static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1069,7 +1065,7 @@ static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1083,7 +1079,7 @@ static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1231,12 +1227,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg( - pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config = @@ -1267,7 +1262,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader } if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, es_sel, pm4); + si_set_tesseval_regs(sscreen, es_sel, shader); shader->ctx_reg.ngg.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | @@ -1346,11 +1341,17 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); } + + shader->ctx_reg.ngg.vgt_stages.u.ngg = 1; + shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs; + shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader); + shader->ctx_reg.ngg.vgt_stages.u.ngg_gs_fast_launch = + !!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } static void si_emit_shader_vs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.vs->shader; + struct si_shader *shader = sctx->queued.named.vs; if (!shader) return; @@ -1401,8 +1402,12 @@ static void si_emit_shader_vs(struct si_context *sctx) radeon_end_update_context_roll(sctx); /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); + if (sctx->chip_class >= GFX10) { + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.vs.ge_pc_alloc); + radeon_end(); + } } /** @@ -1550,9 +1555,9 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } static unsigned si_get_ps_num_interp(struct si_shader *ps) @@ -1587,7 +1592,7 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) static void si_emit_shader_ps(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.ps->shader; + struct si_shader *shader = sctx->queued.named.ps; if (!shader) return; @@ -1715,10 +1720,13 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.ps.spi_ps_input_ena = input_ena; shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + unsigned num_interp = si_get_ps_num_interp(shader); + /* Set interpolation controls. */ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) | S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + shader->ctx_reg.ps.num_interp = num_interp; shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; shader->ctx_reg.ps.spi_shader_z_format = @@ -1785,16 +1793,32 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader } } -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key) +static void si_clear_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - if (vs->info.base.vs.blit_sgprs_amd) - return; + prolog_key->instance_divisor_is_one = 0; + prolog_key->instance_divisor_is_fetched = 0; + key->mono.vs_fetch_opencode = 0; + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); +} +void si_vs_key_update_inputs(struct si_context *sctx) +{ + struct si_shader_selector *vs = sctx->shader.vs.cso; struct si_vertex_elements *elts = sctx->vertex_elements; + struct si_shader_key *key = &sctx->shader.vs.key; - prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; - prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + if (!vs) + return; + + if (vs->info.base.vs.blit_sgprs_amd) { + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); + return; + } + + key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; + key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + key->opt.prefer_mono = elts->instance_divisor_is_fetched; unsigned count_mask = (1 << vs->info.num_inputs) - 1; unsigned fix = elts->fix_fetch_always & count_mask; @@ -1815,6 +1839,8 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto } } + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); + while (fix) { unsigned i = u_bit_scan(&fix); key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; @@ -1822,12 +1848,20 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto key->mono.vs_fetch_opencode = opencode; } -static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key) +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - struct si_shader_selector *ps = sctx->shader.ps.cso; + prolog_key->instance_divisor_is_one = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_fetched; - key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; + key->mono.vs_fetch_opencode = sctx->shader.vs.key.mono.vs_fetch_opencode; + memcpy(key->mono.vs_fix_fetch, sctx->shader.vs.key.mono.vs_fix_fetch, + sizeof(key->mono.vs_fix_fetch)); +} + +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) +{ + struct si_shader_selector *ps = sctx->shader.ps.cso; /* Find out if PS is disabled. */ bool ps_disabled = true; @@ -1842,36 +1876,259 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); } - /* Find out which VS outputs aren't used by the PS. */ - uint64_t outputs_written = vs->outputs_written_before_ps; - uint64_t inputs_read = 0; + sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read; +} - /* Ignore outputs that are not passed from VS to PS. */ - outputs_written &= ~((1ull << si_shader_io_get_unique_index(VARYING_SLOT_POS, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_PSIZ, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_CLIP_VERTEX, true))); +static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) +{ - if (!ps_disabled) { - inputs_read = ps->inputs_read; - } + key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; - uint64_t linked = outputs_written & inputs_read; + /* Find out which VS outputs aren't used by the PS. */ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; key->opt.kill_outputs = ~linked & outputs_written; if (vs->info.stage != MESA_SHADER_GEOMETRY) { key->opt.ngg_culling = sctx->ngg_culling; - - if (sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; + key->mono.u.vs_export_prim_id = sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; + } else { + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; } /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. */ - if (sctx->chip_class >= GFX10 && - vs->info.writes_psize && - sctx->current_rast_prim != PIPE_PRIM_POINTS && - !sctx->queued.named.rasterizer->polygon_mode_is_points) - key->opt.kill_pointsize = 1; + key->opt.kill_pointsize = sctx->chip_class >= GFX10 && + vs->info.writes_psize && + sctx->current_rast_prim != PIPE_PRIM_POINTS && + !sctx->queued.named.rasterizer->polygon_mode_is_points; +} + +static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) +{ + key->opt.kill_clip_distances = 0; + key->opt.kill_outputs = 0; + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; + key->opt.kill_pointsize = 0; +} + +void si_ps_key_update_framebuffer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + + if (!sel) + return; + + if (sel->info.color0_writes_all_cbufs && + sel->info.colors_written == 0x1) + key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + else + key->part.ps.epilog.last_cbuf = 0; + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. */ + key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->mono.u.ps.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->mono.u.ps.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } else { + key->mono.u.ps.fbfetch_msaa = 0; + key->mono.u.ps.fbfetch_is_1D = 0; + key->mono.u.ps.fbfetch_layered = 0; + } +} + +void si_ps_key_update_framebuffer_blend(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + + if (!sel) + return; + + /* Select the shader color format based on whether + * blending or alpha are needed. + */ + key->part.ps.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->part.ps.epilog.spi_shader_col_format |= + (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; + } + + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. + */ + if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } + + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->part.ps.epilog.last_cbuf) { + key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; + key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; + } + + /* Eliminate shader code computing output values that are unused. + * This enables dead code elimination between shader parts. + * Check if any output is eliminated. + */ + if (sel->colors_written_4bit & + ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) + key->opt.prefer_mono = 1; + else + key->opt.prefer_mono = 0; +} + +void si_ps_key_update_blend_rasterizer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; +} + +void si_ps_key_update_rasterizer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!sel) + return; + + key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; + key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; +} + +void si_ps_key_update_dsa(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + + key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; +} + +static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); + + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->part.ps.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; +} + +void si_ps_key_update_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + + if (!sel) + return; + + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) + key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + else + key->part.ps.prolog.samplemask_log_ps_iter = 0; +} + +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!sel) + return; + + bool uses_persp_center = sel->info.uses_persp_center || + (!rs->flatshade && sel->info.uses_persp_center_color); + bool uses_persp_centroid = sel->info.uses_persp_centroid || + (!rs->flatshade && sel->info.uses_persp_centroid_color); + bool uses_persp_sample = sel->info.uses_persp_sample || + (!rs->flatshade && sel->info.uses_persp_sample_color); + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = + uses_persp_center || uses_persp_centroid; + + key->part.ps.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = 0; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = + uses_persp_center && uses_persp_centroid; + key->part.ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + key->mono.u.ps.interpolate_at_sample_force_center = 0; + } else { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. */ + key->part.ps.prolog.force_persp_center_interp = uses_persp_center + + uses_persp_centroid + + uses_persp_sample > 1; + + key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > 1; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; + } } /* Compute the key for the hw shader variant */ @@ -1880,215 +2137,48 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh { struct si_context *sctx = (struct si_context *)ctx; - memset(&key->part, 0, sizeof(key->part)); - memset(&key->mono, 0, sizeof(key->mono)); - memset(&key->opt, 0, sizeof(key->opt)); - - unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms; - if (num_inlinable_uniforms && - sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) { - key->opt.inline_uniforms = true; - memcpy(key->opt.inlined_uniform_values, - sctx->inlinable_uniforms[sel->pipe_shader_type], - num_inlinable_uniforms * 4); - } - switch (sel->info.stage) { case MESA_SHADER_VERTEX: - si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); - if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) - si_shader_selector_key_hw_vs(sctx, sel, key); + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); break; case MESA_SHADER_TESS_CTRL: if (sctx->chip_class >= GFX9) { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.tcs.ls_prolog); + si_get_vs_key_inputs(sctx, key, &key->part.tcs.ls_prolog); key->part.tcs.ls = sctx->shader.vs.cso; - - /* When the LS VGPR fix is needed, monolithic shaders - * can: - * - avoid initializing EXEC in both the LS prolog - * and the LS main part when !vs_needs_prolog - * - remove the fixup for unused input VGPRs - */ - key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; - - /* The LS output / HS input layout can be communicated - * directly instead of via user SGPRs for merged LS-HS. - * This also enables jumping over the VS prolog for HS-only waves. - */ - key->opt.prefer_mono = 1; - key->opt.same_patch_vertices = sctx->same_patch_vertices; } - - key->part.tcs.epilog.prim_mode = - sctx->shader.tes.cso->info.base.tess.primitive_mode; - key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->info.tessfactors_are_def_in_all_invocs; - key->part.tcs.epilog.tes_reads_tess_factors = sctx->shader.tes.cso->info.reads_tess_factors; - - if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written; break; case MESA_SHADER_TESS_EVAL: if (!sctx->shader.gs.cso) - si_shader_selector_key_hw_vs(sctx, sel, key); + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); break; case MESA_SHADER_GEOMETRY: if (sctx->chip_class >= GFX9) { if (sctx->shader.tes.cso) { + si_clear_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); key->part.gs.es = sctx->shader.tes.cso; } else { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.gs.vs_prolog); + si_get_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); key->part.gs.es = sctx->shader.vs.cso; } /* Only NGG can eliminate GS outputs, because the code is shared with VS. */ if (sctx->ngg) - si_shader_selector_key_hw_vs(sctx, sel, key); - - /* This enables jumping over the VS prolog for GS-only waves. */ - key->opt.prefer_mono = 1; + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); } - key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; - case MESA_SHADER_FRAGMENT: { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_state_blend *blend = sctx->queued.named.blend; - - if (sel->info.color0_writes_all_cbufs && - sel->info.colors_written == 0x1) - key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; - - /* Select the shader color format based on whether - * blending or alpha are needed. - */ - key->part.ps.epilog.spi_shader_col_format = - (blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend_alpha) | - (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend) | - (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_alpha) | - (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format); - key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; - - /* The output for dual source blending should have - * the same format as the first output. - */ - if (blend->dual_src_blend) { - key->part.ps.epilog.spi_shader_col_format |= - (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; - } - - /* If alpha-to-coverage is enabled, we have to export alpha - * even if there is no color buffer. - */ - if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) - key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; - - /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs - * to the range supported by the type if a channel has less - * than 16 bits and the export format is 16_ABGR. - */ - if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { - key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; - key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; - } - - /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->part.ps.epilog.last_cbuf) { - key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; - key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; - key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; - } - - /* Eliminate shader code computing output values that are unused. - * This enables dead code elimination between shader parts. - * Check if any output is eliminated. - */ - if (sel->colors_written_4bit & - ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) - key->opt.prefer_mono = 1; - - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); - - key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; - key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; - - key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; - - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->part.ps.epilog.poly_line_smoothing = - ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; - - if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) { - key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); - } - - bool uses_persp_center = sel->info.uses_persp_center || - (!rs->flatshade && sel->info.uses_persp_center_color); - bool uses_persp_centroid = sel->info.uses_persp_centroid || - (!rs->flatshade && sel->info.uses_persp_centroid_color); - bool uses_persp_sample = sel->info.uses_persp_sample || - (!rs->flatshade && sel->info.uses_persp_sample_color); - - if (rs->force_persample_interp && rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { - key->part.ps.prolog.force_persp_sample_interp = - uses_persp_center || uses_persp_centroid; - - key->part.ps.prolog.force_linear_sample_interp = - sel->info.uses_linear_center || sel->info.uses_linear_centroid; - } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { - key->part.ps.prolog.bc_optimize_for_persp = - uses_persp_center && uses_persp_centroid; - key->part.ps.prolog.bc_optimize_for_linear = - sel->info.uses_linear_center && sel->info.uses_linear_centroid; - } else { - /* Make sure SPI doesn't compute more than 1 pair - * of (i,j), which is the optimization here. */ - key->part.ps.prolog.force_persp_center_interp = uses_persp_center + - uses_persp_centroid + - uses_persp_sample > 1; - - key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + - sel->info.uses_linear_centroid + - sel->info.uses_linear_sample > 1; - - if (sel->info.uses_interp_at_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; - } - - key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; - - /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - struct pipe_resource *tex = cb0->texture; - - /* 1D textures are allocated and used as 2D on GFX9. */ - key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; - key->mono.u.ps.fbfetch_is_1D = - sctx->chip_class != GFX9 && - (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); - key->mono.u.ps.fbfetch_layered = - tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || - tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || - tex->target == PIPE_TEXTURE_3D; - } + case MESA_SHADER_FRAGMENT: + si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); break; - } default: assert(0); } - - if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) - memset(&key->opt, 0, sizeof(key->opt)); } static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) @@ -2146,7 +2236,7 @@ static const struct si_shader_key zeroed; static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key) + const struct si_shader_key *key) { struct si_shader **mainp = si_get_main_shader_part(sel, key); @@ -2177,6 +2267,16 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad return true; } +/* A helper to copy *key to *local_key and return local_key. */ +static const struct si_shader_key * +use_local_key_copy(const struct si_shader_key *key, struct si_shader_key *local_key) +{ + if (key != local_key) + memcpy(local_key, key, sizeof(*key)); + + return local_key; +} + /** * Select a shader variant according to the shader key. * @@ -2184,14 +2284,26 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad * the compilation isn't finished, don't select any * shader and return an error. */ -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none) +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const struct si_shader_key *key, int thread_index, + bool optimized_or_none) { + struct si_screen *sscreen = sctx->screen; struct si_shader_selector *sel = state->cso; struct si_shader_selector *previous_stage_sel = NULL; struct si_shader *current = state->current; struct si_shader *iter, *shader = NULL; + /* si_shader_select_with_key must not modify 'key' because it would affect future shaders. + * If we need to modify it for this specific shader (eg: to disable optimizations), we + * use a copy. + */ + struct si_shader_key local_key; + + if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) { + /* Disable shader variant optimizations. */ + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); + } again: /* Check if we don't need to change anything. @@ -2204,7 +2316,8 @@ again: if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto current_not_ready; } @@ -2243,9 +2356,10 @@ current_not_ready: key->opt.inlined_uniform_values, MAX_INLINABLE_UNIFORMS * 4) != 0) { if (variant_count++ > max_inline_uniforms_variants) { + key = use_local_key_copy(key, &local_key); /* Too many variants. Disable inlining for this shader. */ - key->opt.inline_uniforms = 0; - memset(key->opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); + local_key.opt.inline_uniforms = 0; + memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); simple_mtx_unlock(&sel->mutex); goto again; } @@ -2262,7 +2376,9 @@ current_not_ready: if (iter->is_optimized) { if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto again; } @@ -2287,9 +2403,14 @@ current_not_ready: util_queue_fence_init(&shader->ready); + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + shader->selector = sel; shader->key = *key; - shader->compiler_ctx_state = *compiler_state; + shader->compiler_ctx_state.compiler = &sctx->compiler; + shader->compiler_ctx_state.debug = sctx->debug; + shader->compiler_ctx_state.is_debug_context = sctx->is_debug; /* If this is a merged shader, get the first shader's selector. */ if (sscreen->info.chip_class >= GFX9) { @@ -2335,12 +2456,13 @@ current_not_ready: } simple_mtx_lock(&previous_stage_sel->mutex); - ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state, + &shader1_key); simple_mtx_unlock(&previous_stage_sel->mutex); } if (ok) { - ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); + ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, key); } if (!ok) { @@ -2383,7 +2505,8 @@ current_not_ready: } /* Use the default (unoptimized) shader for now. */ - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); simple_mtx_unlock(&sel->mutex); if (sscreen->options.sync_compile) @@ -2418,13 +2541,12 @@ current_not_ready: return shader->compilation_failed ? -1 : 0; } -static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state) +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state) { struct si_context *sctx = (struct si_context *)ctx; si_shader_selector_key(ctx, state->cso, &state->key); - return si_shader_select_with_key(sctx->screen, state, compiler_state, &state->key, -1, false); + return si_shader_select_with_key(sctx, state, &state->key, -1, false); } static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, @@ -2482,6 +2604,19 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind if (!compiler->passes) si_init_compiler(sscreen, compiler); + /* The GS copy shader is always pre-compiled. */ + if (sel->info.stage == MESA_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + /* Serialize NIR to save memory. Monolithic shader variants * have to deserialize NIR before compilation. */ @@ -2566,14 +2701,16 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind unsigned i; for (i = 0; i < sel->info.num_outputs; i++) { - unsigned offset = shader->info.vs_output_param_offset[i]; + unsigned semantic = sel->info.output_semantic[i]; + unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic]; - if (offset <= AC_EXP_PARAM_OFFSET_31) + /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */ + if (G_028644_OFFSET(ps_input_cntl) != 0x20) continue; - unsigned semantic = sel->info.output_semantic[i]; unsigned id; + /* Remove the output from the mask. */ if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_POS && semantic != VARYING_SLOT_PSIZ && @@ -2586,19 +2723,6 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind } } - /* The GS copy shader is always pre-compiled. */ - if (sel->info.stage == MESA_SHADER_GEOMETRY && - (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - /* Free NIR. We only keep serialized NIR after this point. */ if (sel->nir) { ralloc_free(sel->nir); @@ -2714,7 +2838,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd ? sel->info.num_inputs : 0; - sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs); /* The prolog is a no-op if there are no inputs. */ sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && @@ -2740,8 +2865,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_EDGE) { sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); - sel->outputs_written_before_ps |= 1ull - << si_shader_io_get_unique_index(semantic, true); + + /* Ignore outputs that are not passed from VS to PS. */ + if (semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX) { + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(semantic, true); + } } } } @@ -2808,7 +2939,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, case MESA_SHADER_FRAGMENT: for (i = 0; i < sel->info.num_inputs; i++) { - unsigned semantic = sel->info.input_semantic[i]; + unsigned semantic = sel->info.input[i].semantic; if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_PNTC) { @@ -2821,9 +2952,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->colors_written_4bit |= 0xf << (4 * i); for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input_semantic[i] == VARYING_SLOT_COL0) + if (sel->info.input[i].semantic == VARYING_SLOT_COL0) sel->color_attr_index[0] = i; - else if (sel->info.input_semantic[i] == VARYING_SLOT_COL1) + else if (sel->info.input[i].semantic == VARYING_SLOT_COL1) sel->color_attr_index[1] = i; } break; @@ -3022,9 +3153,7 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha si_shader_uses_bindless_images(sctx->shader.tcs.cso) || si_shader_uses_bindless_images(sctx->shader.tes.cso); - /* Invalidate inlinable uniforms. */ - sctx->inlinable_uniforms_valid_mask &= ~(1 << type); - + si_invalidate_inlinable_uniforms(sctx, type); sctx->do_update_shaders = true; } @@ -3042,6 +3171,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->shader.vs.current = sel ? sel->first_variant : NULL; sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0; sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false; + sctx->fixed_func_tcs_shader.key.mono.u.ff_tcs_inputs_to_copy = sel ? sel->outputs_written : 0; if (si_update_ngg(sctx)) si_shader_change_notify(sctx); @@ -3053,6 +3183,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, si_get_vs(sctx)->current); si_update_rasterized_prim(sctx); + si_vs_key_update_inputs(sctx); } static void si_update_tess_uses_prim_id(struct si_context *sctx) @@ -3148,6 +3279,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) sctx->shader.tcs.cso = sel; sctx->shader.tcs.current = sel ? sel->first_variant : NULL; + sctx->shader.tcs.key.part.tcs.epilog.invoc0_tess_factors_are_def = + sel ? sel->info.tessfactors_are_def_in_all_invocs : 0; si_update_tess_uses_prim_id(sctx); si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL); @@ -3172,6 +3305,14 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; si_update_tess_uses_prim_id(sctx); + sctx->shader.tcs.key.part.tcs.epilog.prim_mode = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.prim_mode = + sel ? sel->info.base.tess.primitive_mode : 0; + + sctx->shader.tcs.key.part.tcs.epilog.tes_reads_tess_factors = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.tes_reads_tess_factors = + sel ? sel->info.reads_tess_factors : 0; + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL); si_select_draw_vbo(sctx); sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ @@ -3188,6 +3329,41 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) si_update_rasterized_prim(sctx); } +void si_update_ps_kill_enable(struct si_context *sctx) +{ + if (!sctx->shader.ps.cso) + return; + + unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } +} + +void si_update_vrs_flat_shading(struct si_context *sctx) +{ + if (sctx->chip_class >= GFX10_3 && sctx->shader.ps.cso) { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_info *info = &sctx->shader.ps.cso->info; + bool allow_flat_shading = info->allow_flat_shading; + + if (allow_flat_shading && + (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || + (!rs->flatshade && info->uses_interp_color))) + allow_flat_shading = false; + + if (sctx->allow_flat_shading != allow_flat_shading) { + sctx->allow_flat_shading = allow_flat_shading; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + } +} + static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -3216,6 +3392,17 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } si_update_ps_colorbuf0_slot(sctx); + + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_dsa(sctx); + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); + si_update_vrs_flat_shading(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) @@ -3226,55 +3413,55 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) util_queue_fence_destroy(&shader->ready); - if (shader->pm4) { - /* If destroyed shaders were not unbound, the next compiled - * shader variant could get the same pointer address and so - * binding it to the same shader stage would be considered - * a no-op, causing random behavior. - */ - switch (shader->selector->info.stage) { - case MESA_SHADER_VERTEX: - if (shader->key.as_ls) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ls)); - } else if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_TESS_CTRL: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(hs)); - break; - case MESA_SHADER_TESS_EVAL: - if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - else - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - break; - case MESA_SHADER_FRAGMENT: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ps)); - break; - default:; + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. + */ + int state_index = -1; + + switch (shader->selector->info.stage) { + case MESA_SHADER_VERTEX: + if (shader->key.as_ls) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(ls); + } else if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); + } + break; + case MESA_SHADER_TESS_CTRL: + state_index = SI_STATE_IDX(hs); + break; + case MESA_SHADER_TESS_EVAL: + if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); } + break; + case MESA_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + state_index = SI_STATE_IDX(vs); + else + state_index = SI_STATE_IDX(gs); + break; + case MESA_SHADER_FRAGMENT: + state_index = SI_STATE_IDX(ps); + break; + default:; } si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); si_shader_destroy(shader); - free(shader); + si_pm4_free_state(sctx, &shader->pm4, state_index); } static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) @@ -3323,123 +3510,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) si_shader_selector_reference(sctx, &sel, NULL); } -static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, - unsigned semantic, enum glsl_interp_mode interpolate, - ubyte fp16_lo_hi_mask) -{ - struct si_shader_info *vsinfo = &vs->selector->info; - unsigned offset, ps_input_cntl = 0; - - if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && sctx->flatshade) || - semantic == VARYING_SLOT_PRIMITIVE_ID) - ps_input_cntl |= S_028644_FLAT_SHADE(1); - - if (semantic == VARYING_SLOT_PNTC || - (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - sctx->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { - ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); - if (fp16_lo_hi_mask & 0x1) { - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_ATTR0_VALID(1); - } - } - - int vs_slot = vsinfo->output_semantic_to_slot[semantic]; - if (vs_slot >= 0) { - offset = vs->info.vs_output_param_offset[vs_slot]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - - ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); - } - - if (fp16_lo_hi_mask && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); - - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | - S_028644_DEFAULT_VAL_ATTR1(0) | - S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ - S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); - } - } else { - /* VS output not found. */ - if (semantic == VARYING_SLOT_PRIMITIVE_ID) { - /* PrimID is written after the last output when HW VS is used. */ - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* No corresponding output found, load defaults into input. - * Don't set any other bits. - * (FLAT_SHADE=1 completely changes behavior) */ - ps_input_cntl = S_028644_OFFSET(0x20); - /* D3D 9 behaviour. GL is undefined */ - if (semantic == VARYING_SLOT_COL0) - ps_input_cntl |= S_028644_DEFAULT_VAL(3); - } - } - - return ps_input_cntl; -} - -static void si_emit_spi_map(struct si_context *sctx) -{ - struct si_shader *ps = sctx->shader.ps.current; - struct si_shader *vs; - struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; - unsigned i, num_interp, num_written = 0; - unsigned spi_ps_input_cntl[32]; - - if (!ps || !ps->selector->info.num_inputs) - return; - - /* With legacy GS, only the GS copy shader contains information about param exports. */ - if (sctx->shader.gs.cso && !sctx->ngg) - vs = sctx->shader.gs.cso->gs_copy_shader; - else - vs = si_get_vs(sctx)->current; - - num_interp = si_get_ps_num_interp(ps); - assert(num_interp > 0); - - for (i = 0; i < psinfo->num_inputs; i++) { - unsigned semantic = psinfo->input_semantic[i]; - unsigned interpolate = psinfo->input_interpolate[i]; - ubyte fp16_lo_hi_mask = psinfo->input_fp16_lo_hi_valid[i]; - - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, interpolate, - fp16_lo_hi_mask); - } - - if (ps->key.part.ps.prolog.color_two_side) { - for (i = 0; i < 2; i++) { - if (!(psinfo->colors_read & (0xf << (i * 4)))) - continue; - - unsigned semantic = VARYING_SLOT_BFC0 + i; - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, - psinfo->color_interpolate[i], - false); - } - } - assert(num_interp == num_written); - - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, num_interp); - radeon_end_update_context_roll(sctx); -} - /** * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. */ @@ -3479,7 +3549,7 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) } /* Initialize state related to ESGS / GSVS ring buffers */ -static bool si_update_gs_ring_buffers(struct si_context *sctx) +bool si_update_gs_ring_buffers(struct si_context *sctx) { struct si_shader_selector *es = sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso; @@ -3682,11 +3752,6 @@ static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *s return 1; } -static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) -{ - return shader ? shader->config.scratch_bytes_per_wave : 0; -} - static struct si_shader *si_get_tcs_current(struct si_context *sctx) { if (!sctx->shader.tes.cso) @@ -3709,19 +3774,19 @@ static bool si_update_scratch_relocs(struct si_context *sctx) if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); r = si_update_scratch_buffer(sctx, sctx->shader.gs.current); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); r = si_update_scratch_buffer(sctx, tcs); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, hs, tcs->pm4); + si_pm4_bind_state(sctx, hs, tcs); /* VS can be bound as LS, ES, or VS. */ r = si_update_scratch_buffer(sctx, sctx->shader.vs.current); @@ -3729,13 +3794,13 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.vs.current->key.as_ls) - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); } /* TES can be bound as ES or VS. */ @@ -3744,17 +3809,17 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.tes.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); else if (sctx->shader.tes.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); } return true; } -static bool si_update_spi_tmpring_size(struct si_context *sctx) +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. * There are 2 cases to handle: @@ -3769,17 +3834,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) * Otherwise, the number of waves that can use scratch is * SPI_TMPRING_SIZE.WAVES. */ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.ps.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.gs.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.vs.current)); - - if (sctx->shader.tes.cso) { - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.tes.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); - } - sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; @@ -3818,7 +3872,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) return true; } -static void si_init_tess_factor_ring(struct si_context *sctx) +void si_init_tess_factor_ring(struct si_context *sctx) { assert(!sctx->tess_rings); assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); @@ -3918,8 +3972,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } -static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, - union si_vgt_stages_key key) +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key) { struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); uint32_t stages = 0; @@ -3963,284 +4016,6 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, return pm4; } -static void si_update_vgt_shader_config(struct si_context *sctx) -{ - union si_vgt_stages_key key; - key.index = 0; - - if (sctx->shader.tes.cso) - key.u.tess = 1; - if (sctx->shader.gs.cso) - key.u.gs = 1; - - if (sctx->ngg) { - struct si_shader *vs = si_get_vs(sctx)->current; - - key.u.ngg = 1; - key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs; - /* These must be done after the shader variant is selected. */ - key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); - key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); - } - - struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; - - if (unlikely(!*pm4)) - *pm4 = si_build_vgt_shader_config(sctx->screen, key); - si_pm4_bind_state(sctx, vgt_shader_config, *pm4); -} - -bool si_update_shaders(struct si_context *sctx) -{ - struct pipe_context *ctx = (struct pipe_context *)sctx; - struct si_compiler_ctx_state compiler_state; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader *old_vs = si_get_vs(sctx)->current; - unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0; - struct si_shader *old_ps = sctx->shader.ps.current; - unsigned old_spi_shader_col_format = - old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; - int r; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - /* Update TCS and TES. */ - if (sctx->shader.tes.cso) { - if (!sctx->tess_rings) { - si_init_tess_factor_ring(sctx); - if (!sctx->tess_rings) - return false; - } - - if (sctx->shader.tcs.cso) { - r = si_shader_select(ctx, &sctx->shader.tcs, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4); - } else { - if (!sctx->fixed_func_tcs_shader.cso) { - sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx); - if (!sctx->fixed_func_tcs_shader.cso) - return false; - } - - r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); - } - - if (!sctx->shader.gs.cso || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->shader.tes, &compiler_state); - if (r) - return false; - - if (sctx->shader.gs.cso) { - /* TES as ES */ - assert(sctx->chip_class <= GFX8); - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); - } else if (sctx->ngg) { - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); - } else { - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); - } - } - } else { - if (sctx->chip_class <= GFX8) - si_pm4_bind_state(sctx, ls, NULL); - si_pm4_bind_state(sctx, hs, NULL); - } - - /* Update GS. */ - if (sctx->shader.gs.cso) { - r = si_shader_select(ctx, &sctx->shader.gs, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); - if (!sctx->ngg) { - si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4); - - if (!si_update_gs_ring_buffers(sctx)) - return false; - } else { - si_pm4_bind_state(sctx, vs, NULL); - } - } else { - if (!sctx->ngg) { - si_pm4_bind_state(sctx, gs, NULL); - if (sctx->chip_class <= GFX8) - si_pm4_bind_state(sctx, es, NULL); - } - } - - /* Update VS. */ - if ((!sctx->shader.tes.cso && !sctx->shader.gs.cso) || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->shader.vs, &compiler_state); - if (r) - return false; - - if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) { - if (sctx->ngg) { - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); - si_pm4_bind_state(sctx, vs, NULL); - } else { - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); - } - } else if (sctx->shader.tes.cso) { - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); - } else { - assert(sctx->shader.gs.cso); - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); - } - } - - sctx->vs_uses_base_instance = - sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance : - sctx->queued.named.hs ? sctx->queued.named.hs->shader->uses_base_instance : - sctx->shader.gs.current->uses_base_instance; - - si_update_vgt_shader_config(sctx); - - if (old_kill_clip_distances != si_get_vs(sctx)->current->key.opt.kill_clip_distances) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - if (sctx->shader.ps.cso) { - unsigned db_shader_control; - - r = si_shader_select(ctx, &sctx->shader.ps, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); - - db_shader_control = sctx->shader.ps.cso->db_shader_control | - S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); - - if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) || - (sctx->ngg && si_pm4_state_changed(sctx, gs)) || - sctx->sprite_coord_enable != rs->sprite_coord_enable || - sctx->flatshade != rs->flatshade) { - sctx->sprite_coord_enable = rs->sprite_coord_enable; - sctx->flatshade = rs->flatshade; - si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); - } - - if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) && - (!old_ps || old_spi_shader_col_format != - sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (sctx->ps_db_shader_control != db_shader_control) { - sctx->ps_db_shader_control = db_shader_control; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - - if (sctx->smoothing_enabled != - sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) { - sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing; - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - /* NGG cull state uses smoothing_enabled. */ - if (sctx->screen->use_ngg_culling) - si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); - - if (sctx->chip_class == GFX6) - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - if (sctx->framebuffer.nr_samples <= 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - if (sctx->chip_class >= GFX10_3) { - struct si_shader_info *info = &sctx->shader.ps.cso->info; - bool allow_flat_shading = info->allow_flat_shading; - - if (allow_flat_shading && - (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || - (!rs->flatshade && info->uses_interp_color))) - allow_flat_shading = false; - - if (sctx->allow_flat_shading != allow_flat_shading) { - sctx->allow_flat_shading = allow_flat_shading; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - } - } - - if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { - /* Pretend the bound shaders form a vk pipeline */ - uint32_t pipeline_code_hash = 0; - uint64_t base_address = ~0; - - for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { - struct si_shader *shader = sctx->shaders[i].current; - if (sctx->shaders[i].cso && shader) { - pipeline_code_hash = _mesa_hash_data_with_seed( - shader->binary.elf_buffer, - shader->binary.elf_size, - pipeline_code_hash); - base_address = MIN2(base_address, - shader->bo->gpu_address); - } - } - - struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { - si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false); - } - - si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0); - } - - if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) || - si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) || - si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { - if (!si_update_spi_tmpring_size(sctx)) - return false; - } - - if (sctx->chip_class >= GFX7) { - if (si_pm4_state_enabled_and_changed(sctx, ls)) - sctx->prefetch_L2_mask |= SI_PREFETCH_LS; - else if (!sctx->queued.named.ls) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; - - if (si_pm4_state_enabled_and_changed(sctx, hs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_HS; - else if (!sctx->queued.named.hs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; - - if (si_pm4_state_enabled_and_changed(sctx, es)) - sctx->prefetch_L2_mask |= SI_PREFETCH_ES; - else if (!sctx->queued.named.es) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; - - if (si_pm4_state_enabled_and_changed(sctx, gs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_GS; - else if (!sctx->queued.named.gs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; - - if (si_pm4_state_enabled_and_changed(sctx, vs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_VS; - else if (!sctx->queued.named.vs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; - - if (si_pm4_state_enabled_and_changed(sctx, ps)) - sctx->prefetch_L2_mask |= SI_PREFETCH_PS; - else if (!sctx->queued.named.ps) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; - } - - sctx->do_update_shaders = false; - return true; -} - static void si_emit_scratch_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -4263,7 +4038,6 @@ void si_init_screen_live_shader_cache(struct si_screen *sscreen) void si_init_shader_functions(struct si_context *sctx) { - sctx->atoms.s.spi_map.emit = si_emit_spi_map; sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; sctx->b.create_vs_state = si_create_shader;