Commit 4fa7a69a authored by Timur Kristóf
Browse files

radeonsi: Calculate the tess LDS offsets instead of using an argument.



This commit gets rid of the tcs_out_lds_offsets argument.
Instead, it calculates the patch0 offset and the patch data offset
in the shader. LLVM generates overall better code with this change.
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
parent 6b53f7f9
Pipeline #279914 waiting for manual action with stages
......@@ -425,7 +425,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
declare_global_desc_pointers(ctx);
declare_per_stage_desc_pointers(ctx, true);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
......@@ -460,7 +459,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
declare_vs_specific_input_sgprs(ctx);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
if (ctx->stage == MESA_SHADER_VERTEX)
declare_vb_descriptor_input_sgprs(ctx);
......
......@@ -191,7 +191,6 @@ enum
/* GFX6-8: TCS only */
GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
GFX6_SGPR_TCS_OUT_OFFSETS,
GFX6_SGPR_TCS_OUT_LAYOUT,
GFX6_SGPR_TCS_IN_LAYOUT,
GFX6_TCS_NUM_USER_SGPR,
......@@ -203,7 +202,6 @@ enum
/* GFX9: Merged LS-HS (VS-TCS) only. */
GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
GFX9_SGPR_TCS_OUT_OFFSETS,
GFX9_SGPR_TCS_OUT_LAYOUT,
GFX9_TCS_NUM_USER_SGPR,
......
......@@ -118,12 +118,6 @@ struct si_shader_context {
struct ac_arg tcs_offchip_layout;
/* API TCS */
/* Offsets where TCS outputs and TCS patch outputs live in LDS:
* [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 = 64K (TODO: not enough bits)
* [16:31] = TCS output patch0 offset for per-patch / 16
* max = (NUM_PATCHES + 1) * 32*32 = 66624 (TODO: not enough bits)
*/
struct ac_arg tcs_out_lds_offsets;
/* Layout of TCS outputs / TES inputs:
* [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
* max = 32*32*4 + 32*4 = 4224
......
......@@ -99,14 +99,21 @@ static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
/* Build the LLVM value for the offset of TCS output patch 0.
 *
 * NOTE(review): this is a diff hunk whose +/- markers were stripped, so the
 * old and the new body appear back to back. The first return statement
 * (which reads ctx->tcs_out_lds_offsets) looks like the pre-commit body that
 * the commit removes; the statements after it look like the replacement.
 * Read literally, everything after the first return is unreachable.
 */
static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
/* Old body: unpack the field at (0, 16) of the tcs_out_lds_offsets SGPR and
 * multiply it by 4. The driver side packed this field as
 * output_patch0_offset / 16.
 */
return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
LLVMConstInt(ctx->ac.i32, 4, 0), "");
/* New body: compute the offset in the shader instead of reading an argument.
 * Presumably a size in dwords, going by the variable name - confirm against
 * get_tcs_in_patch_stride().
 */
LLVMValueRef in_patch_dw_size = get_tcs_in_patch_stride(ctx);
/* The driver stores (num_patches - 1) in the low 6 bits of offchip_layout,
 * so 1 is added back after unpacking.
 */
LLVMValueRef num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
num_patches = LLVMBuildAdd(ctx->ac.builder, num_patches, ctx->ac.i32_1, "");
/* num_patches * input patch size, mirroring the driver-side
 * output_patch0_offset = input_patch_size * num_patches.
 */
return LLVMBuildMul(ctx->ac.builder, num_patches, in_patch_dw_size, "");
}
/* Build the LLVM value for the offset of the per-patch output data of
 * TCS output patch 0.
 *
 * NOTE(review): this is a diff hunk whose +/- markers were stripped, so the
 * old and the new body appear back to back. The first return statement
 * (which reads ctx->tcs_out_lds_offsets) looks like the pre-commit body that
 * the commit removes; the statements after it look like the replacement.
 * Read literally, everything after the first return is unreachable.
 */
static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
/* Old body: unpack the field at (16, 16) of the tcs_out_lds_offsets SGPR and
 * multiply it by 4. The driver side packed this field as
 * perpatch_output_offset / 16.
 */
return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
LLVMConstInt(ctx->ac.i32, 4, 0), "");
/* New body: derive the offset from shader info instead of an argument. */
const struct si_shader_info *info = &ctx->shader->selector->info;
unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
/* Both factors are plain C unsigned values, so the product is emitted as an
 * LLVM constant. Presumably this is the per-vertex output size of one patch
 * in dwords - confirm against get_tcs_out_vertex_dw_stride_constant().
 */
LLVMValueRef patch_data_dw_offset = LLVMConstInt(ctx->ac.i32, tcs_out_vertices * vertex_dw_stride, 0);
LLVMValueRef patch0_dw_offset = get_tcs_out_patch0_offset(ctx);
/* patch0 offset + per-vertex part, mirroring the driver-side
 * perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size.
 */
return LLVMBuildAdd(ctx->ac.builder, patch0_dw_offset, patch_data_dw_offset, "");
}
static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
......@@ -941,7 +948,6 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
......@@ -1039,7 +1045,6 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
} else {
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
......@@ -1047,7 +1052,6 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
......
......@@ -333,16 +333,11 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
sctx->last_num_patches = *num_patches;
unsigned output_patch0_offset = input_patch_size * *num_patches;
unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
/* Compute userdata SGPRs. */
assert(((input_vertex_size / 4) & ~0xff) == 0);
assert(((output_vertex_size / 4) & ~0xff) == 0);
assert(((input_patch_size / 4) & ~0x1fff) == 0);
assert(((output_patch_size / 4) & ~0x1fff) == 0);
assert(((output_patch0_offset / 16) & ~0xffff) == 0);
assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
assert(num_tcs_input_cp <= 32);
assert(num_tcs_output_cp <= 32);
assert(*num_patches <= 64);
......@@ -355,7 +350,6 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
unsigned tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
unsigned tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
unsigned tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
unsigned offchip_layout =
(*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |
((pervertex_output_patch_size * *num_patches) << 11);
......@@ -395,9 +389,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
/* Set userdata SGPRs for merged LS-HS. */
radeon_set_sh_reg_seq(
cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 2);
radeon_emit(cs, offchip_layout);
radeon_emit(cs, tcs_out_offsets);
radeon_emit(cs, tcs_out_layout);
} else {
unsigned ls_rsrc2 = ls_current->config.rsrc2;
......@@ -415,9 +408,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
/* Set userdata SGPRs for TCS. */
radeon_set_sh_reg_seq(
cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
radeon_emit(cs, offchip_layout);
radeon_emit(cs, tcs_out_offsets);
radeon_emit(cs, tcs_out_layout);
radeon_emit(cs, tcs_in_layout);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment