Commit 9fecac09 authored by Marek Olšák's avatar Marek Olšák Committed by Marge Bot
Browse files

radeonsi/gfx11: scattered register deltas



Acked-by: Pierre-Eric Pelloux-Prayer's avatarPierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <!16328>
parent afc110a1
This commit is part of merge request !16328. Comments created here will be created in the context of that merge request.
......@@ -1181,7 +1181,10 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->has_stable_pstate = info->drm_minor >= 45;
if (info->chip_class >= GFX9 && info->has_graphics) {
if (info->chip_class >= GFX11) {
info->pc_lines = 1024;
info->pbb_max_alloc_count = 255; /* minimum is 2, maximum is 256 */
} else if (info->chip_class >= GFX9 && info->has_graphics) {
unsigned pc_lines = 0;
switch (info->family) {
......
......@@ -452,6 +452,17 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
if (sctx->chip_class < GFX11)
radeon_set_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3, 0);
}
if (sctx->chip_class >= GFX11) {
radeon_set_sh_reg_seq(R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, 4);
radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE4 */
radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE5 */
radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE6 */
radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE7 */
radeon_set_sh_reg(R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, 64);
}
radeon_end();
}
......
......@@ -1224,7 +1224,9 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
*/
unsigned max_offchip_buffers_per_se;
if (sscreen->info.chip_class >= GFX10)
if (sscreen->info.chip_class >= GFX11)
max_offchip_buffers_per_se = 256; /* TODO: we could decrease this to reduce memory/cache usage */
else if (sscreen->info.chip_class >= GFX10)
max_offchip_buffers_per_se = 128;
/* Only certain chips can use the maximum value. */
else if (sscreen->info.family == CHIP_VEGA12 || sscreen->info.family == CHIP_VEGA20)
......@@ -1249,7 +1251,12 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->tess_factor_ring_size = 48 * 1024 * sscreen->info.max_se;
sscreen->tess_offchip_ring_size = max_offchip_buffers * sscreen->tess_offchip_block_dw_size * 4;
if (sscreen->info.chip_class >= GFX10_3) {
if (sscreen->info.chip_class >= GFX11) {
/* OFFCHIP_BUFFERING is per SE. */
sscreen->vgt_hs_offchip_param =
S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers_per_se - 1) |
S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity);
} else if (sscreen->info.chip_class >= GFX10_3) {
sscreen->vgt_hs_offchip_param =
S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers - 1) |
S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity);
......
......@@ -122,7 +122,8 @@ static void si_emit_cb_render_state(struct si_context *sctx)
S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->chip_class < GFX11 &&
sctx->screen->info.has_dcc_constant_encode));
}
}
......@@ -636,7 +637,8 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]);
/* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE)
if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE ||
(sctx->chip_class == GFX11 && blend->blend_enable_4bit))
color_control |= S_028808_DISABLE_DUAL_QUAD(1);
}
......@@ -2658,9 +2660,11 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su
z_info = S_028038_FORMAT(format) |
S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
S_028038_SW_MODE(tex->surface.u.gfx9.swizzle_mode) |
S_028038_MAXMIP(tex->buffer.b.b.last_level);
S_028038_MAXMIP(tex->buffer.b.b.last_level) |
S_028040_ITERATE_256(sctx->chip_class >= GFX11);
s_info = S_02803C_FORMAT(stencil_format) |
S_02803C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode);
S_02803C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode) |
S_028044_ITERATE_256(sctx->chip_class >= GFX11);
if (sctx->chip_class == GFX9) {
surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.epitch);
......@@ -3441,8 +3445,12 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7);
radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
if (sctx->chip_class >= GFX11) {
radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 6);
} else {
radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7);
radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
}
radeon_emit(db_z_info | /* DB_Z_INFO */
S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */
......@@ -3524,7 +3532,11 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
else
radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 2);
radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
/* Gfx11 only: DB_Z_INFO.NUM_SAMPLES should always match the framebuffer samples.
* It affects VRS and occlusion queries if depth and stencil are not bound.
*/
radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID) | /* DB_Z_INFO */
S_028040_NUM_SAMPLES(sctx->chip_class == GFX11 ? sctx->framebuffer.log_samples : 0));
radeon_emit(S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
}
......@@ -5516,10 +5528,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
if (sctx->chip_class < GFX11) {
si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
}
}
si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
......@@ -5540,8 +5555,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
}
if (sctx->chip_class <= GFX7 || !has_clear_state) {
si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
if (sctx->chip_class < GFX11) {
si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
}
/* CLEAR_STATE doesn't clear these correctly on certain generations.
* I don't know why. Deduced by trial and error.
......@@ -5567,7 +5584,8 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
if (sctx->chip_class >= GFX7) {
ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) |
S_00B01C_WAVE_LIMIT(0x3F),
S_00B01C_WAVE_LIMIT(0x3F) |
S_00B01C_LDS_GROUP_SIZE(sctx->chip_class >= GFX11),
C_00B01C_CU_EN, 0, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
}
......@@ -5676,10 +5694,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);
si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0);
si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0);
si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0);
si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0);
si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0);
si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0);
si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0);
si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0);
......@@ -5692,7 +5706,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
S_00B0C0_SOFT_GROUPING_EN(1) |
S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
/* Enable CMASK/HTILE/DCC caching in L2 for small chips. */
unsigned meta_write_policy, meta_read_policy;
......@@ -5745,9 +5758,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
* the size of the PC minus the largest possible allocation for
* a single primitive shader subgroup.
*/
si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
/* Reuse for legacy (non-NGG) only. */
si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL,
S_028C50_MAX_DEALLOCS_IN_WAVE(sctx->chip_class >= GFX11 ? 16 : 512));
if (sctx->chip_class < GFX11) {
/* Reuse for legacy (non-NGG) only. */
si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
}
if (!has_clear_state) {
si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
......@@ -5773,6 +5790,12 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info,
(void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0);
si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0);
si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0);
}
if (sctx->chip_class >= GFX10_3) {
......@@ -5792,6 +5815,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
}
if (sctx->chip_class >= GFX11) {
si_pm4_set_reg(pm4, R_028C54_PA_SC_BINNER_CNTL_2, 0);
si_pm4_set_reg(pm4, R_028620_PA_RATE_CNTL,
S_028620_VERTEX_RATE(2) | S_028620_PRIM_RATE(1));
/* We must wait for idle before changing the SPI attribute ring registers. */
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
......@@ -5802,6 +5829,9 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
si_pm4_set_reg(pm4, R_031110_SPI_GS_THROTTLE_CNTL1, 0x12355123);
si_pm4_set_reg(pm4, R_031114_SPI_GS_THROTTLE_CNTL2, 0x1544D);
assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi);
/* The PS will read inputs from this address. */
......
......@@ -493,7 +493,11 @@ void si_emit_dpbb_state(struct si_context *sctx)
}
/* Tunable parameters. */
unsigned fpovs_per_batch = 63; /* allowed range: [0, 255], 0 = unlimited */
/* Allowed range:
* gfx9-10: [0, 255] (0 = unlimited)
* gfx11: [1, 255] (255 = unlimited)
*/
unsigned fpovs_per_batch = 63;
/* Emit registers. */
struct uvec2 bin_size_extend = {};
......
......@@ -1109,7 +1109,10 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (NGG || HAS_GS))) {
radeon_set_context_reg(R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
if (GFX_VERSION >= GFX11)
radeon_set_uconfig_reg(R_030998_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
else
radeon_set_context_reg(R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
sctx->last_gs_out_prim = gs_out_prim;
}
......
......@@ -1186,8 +1186,10 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
shader->ctx_reg.ngg.vgt_primitiveid_en);
radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
if (sctx->chip_class < GFX11) {
radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
}
radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
shader->ctx_reg.ngg.vgt_gs_instance_cnt);
radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
......@@ -1467,10 +1469,6 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
if (es_stage == MESA_SHADER_TESS_EVAL)
si_set_tesseval_regs(sscreen, es_sel, shader);
shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
shader->ctx_reg.ngg.ge_max_output_per_subgroup =
S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) |
......@@ -1516,6 +1514,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
}
/* On gfx10, the GE only checks against the maximum number of ES verts after
......@@ -1929,6 +1932,10 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) |
S_0286D8_PS_W32_EN(shader->wave_size == 32);
/* Workaround when there are no PS inputs but LDS is used. */
if (sscreen->info.chip_class == GFX11 && !num_interp && shader->config.lds_size)
spi_ps_in_control |= S_0286D8_PARAM_GEN(1);
shader->ctx_reg.ps.num_interp = num_interp;
shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment