diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index f534b5c2e5e9431a83b29565c8c7ed19814a841b..3acc1c69a0a809ff5f19416be17784fa90a472a6 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -422,7 +422,8 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - scratch_needed, 256); + scratch_needed, + sctx->screen->info.pte_fragment_size); if (!sctx->compute_scratch_buffer) return false; @@ -531,9 +532,13 @@ static bool si_switch_compute_shader(struct si_context *sctx, COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x " "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2); + sctx->max_seen_compute_scratch_bytes_per_wave = + MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, + config->scratch_bytes_per_wave); + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, S_00B860_WAVES(sctx->scratch_waves) - | S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10)); + | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); sctx->cs_shader_state.emitted_program = program; sctx->cs_shader_state.offset = offset; diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 373fd4ffa7cb0e1a6a5469b95c23b1b360a02df9..34e6d344486fbc1c5a4abdcab1043728a61cf812 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -977,7 +977,7 @@ static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, sctx->index_ring_size_per_ib * 2, - 2 * 1024 * 1024); + sctx->screen->info.pte_fragment_size); if (!sctx->index_ring) return false; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b0495f0841b7b8880a5ed433240e116cd26c4481..86aee2e0824894cf1c65af308fcc0e0afed84c56 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1078,6 +1078,8 @@ struct si_context { struct si_resource *scratch_buffer; unsigned scratch_waves; unsigned spi_tmpring_size; + unsigned max_seen_scratch_bytes_per_wave; + unsigned max_seen_compute_scratch_bytes_per_wave; struct si_resource *compute_scratch_buffer; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 86ca4bac57a508c1471068748fd1574e899dfb91..a1e4a5b9cee28fa0c559d4a5092d8477da1d2e30 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -715,6 +715,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) static void si_delete_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + + if (sctx->queued.named.blend == state) + si_bind_blend_state(ctx, sctx->noop_blend); + si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); } @@ -1091,7 +1095,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; if (sctx->queued.named.rasterizer == state) - si_pm4_bind_state(sctx, poly_offset, NULL); + si_bind_rs_state(ctx, sctx->discard_rasterizer_state); FREE(rs->pm4_poly_offset); si_pm4_delete_state(sctx, rasterizer, rs); @@ -1335,6 +1339,10 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) static void si_delete_dsa_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + + if (sctx->queued.named.dsa == state) + si_bind_dsa_state(ctx, sctx->noop_dsa); + si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 365eed2f84e1dbe49f7c9ddc2f41ff07a30357e3..d6fa1f1858219af9a8778ad4adf28f2d23f41271 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3487,7 +3487,8 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - esgs_ring_size, alignment); + esgs_ring_size, + sctx->screen->info.pte_fragment_size); if (!sctx->esgs_ring) return false; } @@ -3498,7 +3499,8 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - gsvs_ring_size, alignment); + gsvs_ring_size, + sctx->screen->info.pte_fragment_size); if (!sctx->gsvs_ring) return false; } @@ -3623,11 +3625,6 @@ static int si_update_scratch_buffer(struct si_context *sctx, return 1; } -static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) -{ - return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0; -} - static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) { return shader ? shader->config.scratch_bytes_per_wave : 0; @@ -3642,23 +3639,6 @@ static struct si_shader *si_get_tcs_current(struct si_context *sctx) sctx->fixed_func_tcs_shader.current; } -static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) -{ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); - - if (sctx->tes_shader.cso) { - struct si_shader *tcs = si_get_tcs_current(sctx); - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(tcs)); - } - return bytes; -} - static bool si_update_scratch_relocs(struct si_context *sctx) { struct si_shader *tcs = si_get_tcs_current(sctx); @@ -3720,24 +3700,49 @@ static bool si_update_scratch_relocs(struct si_context *sctx) static bool si_update_spi_tmpring_size(struct si_context *sctx) { - unsigned current_scratch_buffer_size = - si_get_current_scratch_buffer_size(sctx); - unsigned scratch_bytes_per_wave = - si_get_max_scratch_bytes_per_wave(sctx); - unsigned scratch_needed_size = scratch_bytes_per_wave * - sctx->scratch_waves; + /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. + * There are 2 cases to handle: + * + * - If the current needed size is less than the maximum seen size, + * use the maximum seen size, so that WAVESIZE remains the same. + * + * - If the current needed size is greater than the maximum seen size, + * the scratch buffer is reallocated, so we can increase WAVESIZE. + * + * Shaders that set SCRATCH_EN=0 don't allocate scratch space. + * Otherwise, the number of waves that can use scratch is + * SPI_TMPRING_SIZE.WAVES. + */ + unsigned bytes = 0; + + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); + + if (sctx->tes_shader.cso) { + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); + } + + sctx->max_seen_scratch_bytes_per_wave = + MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); + + unsigned scratch_needed_size = + sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; unsigned spi_tmpring_size; if (scratch_needed_size > 0) { - if (scratch_needed_size > current_scratch_buffer_size) { + if (!sctx->scratch_buffer || + scratch_needed_size > sctx->scratch_buffer->b.b.width0) { /* Create a bigger scratch buffer */ si_resource_reference(&sctx->scratch_buffer, NULL); sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_needed_size, 256); + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + scratch_needed_size, + sctx->screen->info.pte_fragment_size); if (!sctx->scratch_buffer) return false; @@ -3755,7 +3760,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) "scratch size should already be aligned correctly."); spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | - S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10); + S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); if (spi_tmpring_size != sctx->spi_tmpring_size) { sctx->spi_tmpring_size = spi_tmpring_size; si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);