diff --git a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
index a0f7e5e2fb857c0b53f7d3ddbfbef8e08f1ef4c8..5055d8dd5f4a75415cc68d9c808747f722b99893 100644
--- a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
+++ b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
@@ -26,6 +26,7 @@
 
 #include "nir.h"
 #include "pipe/p_state.h"
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/hash_table.h"
 
@@ -383,21 +384,24 @@ create_shared_block(struct ntv_context *ctx, unsigned shared_size)
 }
 
 static inline unsigned char
-reserve_slot(struct ntv_context *ctx)
+reserve_slot(struct ntv_context *ctx, unsigned num_slots)
 {
    /* TODO: this should actually be clamped to the limits value as in the table
     * in 14.1.4 of the vulkan spec, though there's not really any recourse
     * other than aborting if we do hit it...
     */
-   assert(ctx->shader_slots_reserved < MAX_VARYING);
-   return ctx->shader_slots_reserved++;
+   assert(ctx->shader_slots_reserved + num_slots <= MAX_VARYING);
+   unsigned ret = ctx->shader_slots_reserved;
+   ctx->shader_slots_reserved += num_slots;
+   return ret;
 }
 
 static inline unsigned
-handle_slot(struct ntv_context *ctx, unsigned slot)
+handle_slot(struct ntv_context *ctx, unsigned slot, unsigned num_slots)
 {
+   assert(num_slots);
    if (ctx->shader_slot_map[slot] == SLOT_UNSET)
-      ctx->shader_slot_map[slot] = reserve_slot(ctx);
+      ctx->shader_slot_map[slot] = reserve_slot(ctx, num_slots);
    slot = ctx->shader_slot_map[slot];
    assert(slot < MAX_VARYING);
    return slot;
@@ -420,7 +424,7 @@ handle_handle_slot(struct ntv_context *ctx, struct nir_variable *var, bool outpu
         (!output && ctx->stage == MESA_SHADER_TESS_EVAL))) {
       return var->data.location - VARYING_SLOT_VAR0;
    }
-   return handle_slot(ctx, var->data.location);
+   return handle_slot(ctx, var->data.location, glsl_count_vec4_slots(var->type, false, false));
 }
 
 static SpvId
@@ -465,7 +469,7 @@ emit_input(struct ntv_context *ctx, struct nir_variable *var)
       HANDLE_EMIT_BUILTIN(FACE, FrontFacing);
 
       default:
-         slot = handle_slot(ctx, slot);
+         slot = handle_slot(ctx, slot, glsl_count_vec4_slots(var->type, false, false));
          spirv_builder_emit_location(&ctx->builder, var_id, slot);
       }
    if (var->data.centroid)
@@ -617,6 +621,14 @@ emit_output(struct ntv_context *ctx, struct nir_variable *var)
    if (var->data.patch)
      spirv_builder_emit_decoration(&ctx->builder, var_id, SpvDecorationPatch);
 
+   if (var->data.explicit_xfb_buffer) {
+      spirv_builder_emit_offset(&ctx->builder, var_id, var->data.offset);
+      spirv_builder_emit_xfb_buffer(&ctx->builder, var_id, var->data.xfb.buffer);
+      spirv_builder_emit_xfb_stride(&ctx->builder, var_id, var->data.xfb.stride);
+      if (var->data.stream)
+         spirv_builder_emit_stream(&ctx->builder, var_id, var->data.stream);
+   }
+
    _mesa_hash_table_insert(ctx->vars, var, (void *)(intptr_t)var_id);
 
    assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces));
@@ -1246,6 +1258,7 @@ get_output_type(struct ntv_context *ctx, unsigned register_index, unsigned num_c
 static void
 emit_so_info(struct ntv_context *ctx, const struct zink_so_info *so_info)
 {
+   unsigned output = 0;
    for (unsigned i = 0; i < so_info->so_info.num_outputs; i++) {
       struct pipe_stream_output so_output = so_info->so_info.output[i];
       unsigned slot = so_info->so_info_slots[i] << 2 | so_output.start_component;
@@ -1257,7 +1270,7 @@ emit_so_info(struct ntv_context *ctx, const struct zink_so_info *so_info)
                                             SpvStorageClassOutput);
       char name[10];
 
-      snprintf(name, 10, "xfb%d", i);
+      snprintf(name, 10, "xfb%d", output);
       spirv_builder_emit_name(&ctx->builder, var_id, name);
       spirv_builder_emit_offset(&ctx->builder, var_id, (so_output.dst_offset * 4));
       spirv_builder_emit_xfb_buffer(&ctx->builder, var_id, so_output.output_buffer);
@@ -1269,8 +1282,8 @@ emit_so_info(struct ntv_context *ctx, const struct zink_so_info *so_info)
        * so we need to ensure that the new xfb location slot doesn't conflict with any previously-emitted
        * outputs.
        */
-      uint32_t location = ctx->shader_slots_reserved + i;
-      assert(location < VARYING_SLOT_VAR0);
+      uint32_t location = ctx->shader_slots_reserved + output;
+      assert(location < VARYING_SLOT_VAR0 && ctx->shader_slot_map[location] == SLOT_UNSET);
       spirv_builder_emit_location(&ctx->builder, var_id, location);
 
       /* note: gl_ClipDistance[4] can the 0-indexed member of VARYING_SLOT_CLIP_DIST1 here,
@@ -1285,6 +1298,7 @@ emit_so_info(struct ntv_context *ctx, const struct zink_so_info *so_info)
 
       assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces));
       ctx->entry_ifaces[ctx->num_entry_ifaces++] = var_id;
+      output += align(so_output.num_components, 4) / 4;
    }
 }
 
@@ -1309,7 +1323,6 @@ emit_so_outputs(struct ntv_context *ctx,
       while (!output)
          output = ctx->outputs[location--];
       location++;
-      assert(orig_location - location < 8);
 
       SpvId output_type = ctx->so_output_types[location];
       const struct glsl_type *out_type = ctx->so_output_gl_types[location];
@@ -1344,12 +1357,18 @@ emit_so_outputs(struct ntv_context *ctx,
          * and re-pack them into the desired output type
          */
         for (unsigned c = 0; c < so_output.num_components; c++) {
-            uint32_t member[] = { so_output.start_component + c };
-            SpvId base_type = get_glsl_type(ctx, glsl_without_array(out_type));
+            uint32_t member[2];
+            unsigned member_idx = 0;
+            if (glsl_type_is_matrix(out_type)) {
+               member_idx = 1;
+               member[0] = so_output.register_index;
+            }
+            member[member_idx] = so_output.start_component + c;
+            SpvId base_type = get_glsl_basetype(ctx, glsl_get_base_type(glsl_without_array_or_matrix(out_type)));
            if (slot == VARYING_SLOT_CLIP_DIST1)
-               member[0] += 4;
-            components[c] = spirv_builder_emit_composite_extract(&ctx->builder, base_type, src, member, 1);
+               member[member_idx] += 4;
+            components[c] = spirv_builder_emit_composite_extract(&ctx->builder, base_type, src, member, 1 + member_idx);
         }
         result = spirv_builder_emit_composite_construct(&ctx->builder, type, components, so_output.num_components);
      }
@@ -3809,7 +3828,7 @@ nir_to_spirv(struct nir_shader *s, const struct zink_so_info *so_info,
    default:
       break;
    }
-   if (so_info && so_info->so_info.num_outputs) {
+   if (s->info.has_transform_feedback_varyings) {
      spirv_builder_emit_cap(&ctx.builder, SpvCapabilityTransformFeedback);
      spirv_builder_emit_exec_mode(&ctx.builder, entry_point,
                                   SpvExecutionModeXfb);
diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index 62ba11fd4a910e11af78bf229050e4bef6e5fc8b..6d3688b0163cead5c1652c16a1a03b9d7ef76147 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -402,13 +402,13 @@ check_psiz(struct nir_shader *s)
    return false;
 }
 
-/* semi-copied from iris */
 static void
-update_so_info(struct zink_shader *sh,
+update_so_info(struct zink_shader *zs, const struct pipe_stream_output_info *so_info,
                uint64_t outputs_written, bool have_psiz)
 {
    uint8_t reverse_map[64] = {};
    unsigned slot = 0;
+   /* semi-copied from iris */
    while (outputs_written) {
       int bit = u_bit_scan64(&outputs_written);
       /* PSIZ from nir_lower_point_size_mov breaks stream output, so always skip it */
@@ -417,11 +417,40 @@ update_so_info(struct zink_shader *sh,
       reverse_map[slot++] = bit;
   }
 
-   for (unsigned i = 0; i < sh->streamout.so_info.num_outputs; i++) {
-      struct pipe_stream_output *output = &sh->streamout.so_info.output[i];
+   nir_foreach_shader_out_variable(var, zs->nir)
+      var->data.explicit_xfb_buffer = 0;
+
+   bool inlined[64] = {0};
+   for (unsigned i = 0; i < so_info->num_outputs; i++) {
+      const struct pipe_stream_output *output = &so_info->output[i];
+      unsigned slot = reverse_map[output->register_index];
+      /* always set stride to be used during draw */
+      zs->streamout.so_info.stride[output->output_buffer] = so_info->stride[output->output_buffer];
+      if ((zs->nir->info.stage != MESA_SHADER_GEOMETRY || util_bitcount(zs->nir->info.gs.active_stream_mask) == 1) &&
+          !output->start_component) {
+         nir_variable *var = NULL;
+         while (!var)
+            var = nir_find_variable_with_location(zs->nir, nir_var_shader_out, slot--);
+         slot++;
+         if (inlined[slot])
+            continue;
+         assert(var && var->data.location == slot);
+         /* if this is the entire variable, try to blast it out during the initial declaration */
+         if (glsl_get_components(var->type) == output->num_components) {
+            var->data.explicit_xfb_buffer = 1;
+            var->data.xfb.buffer = output->output_buffer;
+            var->data.xfb.stride = so_info->stride[output->output_buffer] * 4;
+            var->data.offset = output->dst_offset * 4;
+            var->data.stream = output->stream;
+            inlined[slot] = true;
+            continue;
+         }
+      }
+      zs->streamout.so_info.output[zs->streamout.so_info.num_outputs] = *output;
       /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */
-      sh->streamout.so_info_slots[i] = reverse_map[output->register_index];
+      zs->streamout.so_info_slots[zs->streamout.so_info.num_outputs++] = reverse_map[output->register_index];
    }
+   zs->streamout.have_xfb = !!zs->streamout.so_info.num_outputs;
 }
 
 VkShaderModule
@@ -434,7 +463,7 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, struct z
    /* TODO: use a separate mem ctx here for ralloc */
    if (zs->nir->info.stage < MESA_SHADER_FRAGMENT) {
      if (zink_vs_key(key)->last_vertex_stage) {
-         if (zs->streamout.so_info_slots)
+         if (zs->streamout.have_xfb)
            streamout = &zs->streamout;
 
         if (!zink_vs_key(key)->clip_halfz) {
@@ -547,6 +576,11 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
 
    if (nir->info.stage == MESA_SHADER_VERTEX)
       create_vs_pushconst(nir);
+   else if (nir->info.stage == MESA_SHADER_TESS_CTRL ||
+            nir->info.stage == MESA_SHADER_TESS_EVAL) {
+      NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out, UINT_MAX);
+      NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
+   }
 
    /* only do uniforms -> ubo if we have uniforms, otherwise we're just
    * screwing with the bindings for no reason
@@ -657,12 +691,8 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
   }
 
    ret->nir = nir;
-   if (so_info) {
-      memcpy(&ret->streamout.so_info, so_info, sizeof(struct pipe_stream_output_info));
-      ret->streamout.so_info_slots = malloc(so_info->num_outputs * sizeof(unsigned int));
-      assert(ret->streamout.so_info_slots);
-      update_so_info(ret, nir->info.outputs_written, have_psiz);
-   }
+   if (so_info && nir->info.outputs_written && nir->info.has_transform_feedback_varyings)
+      update_so_info(ret, so_info, nir->info.outputs_written, have_psiz);
 
    return ret;
 }
@@ -688,7 +718,6 @@ zink_shader_free(struct zink_context *ctx, struct zink_shader *shader)
       }
    }
    _mesa_set_destroy(shader->programs, NULL);
-   free(shader->streamout.so_info_slots);
    ralloc_free(shader->nir);
    FREE(shader);
 }
diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h
index 80b2b88fdf324f141a20c677f75c69f570c1cfdf..e61d8ae45b7e1da431bc42c9fd56f8c61a57c740 100644
--- a/src/gallium/drivers/zink/zink_compiler.h
+++ b/src/gallium/drivers/zink/zink_compiler.h
@@ -50,7 +50,8 @@
 struct set;
 struct tgsi_token;
 
 struct zink_so_info {
    struct pipe_stream_output_info so_info;
-   unsigned *so_info_slots;
+   unsigned so_info_slots[PIPE_MAX_SO_OUTPUTS];
+   bool have_xfb;
 };
diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c
index 94203dd3f314d4c2f7f19be8f871c93d9987091f..aa6ce49c3f9d709b75ccb41f1f39e9d28ba959e3 100644
--- a/src/gallium/drivers/zink/zink_program.c
+++ b/src/gallium/drivers/zink/zink_program.c
@@ -415,15 +415,66 @@ static void
 init_slot_map(struct zink_context *ctx, struct zink_gfx_program *prog)
 {
    unsigned existing_shaders = 0;
+   bool needs_new_map = false;
 
-   /* if there's a case where we'll be reusing any shaders, we need to reuse the slot map too */
+   /* if there's a case where we'll be reusing any shaders, we need to (maybe) reuse the slot map too */
    if (ctx->curr_program) {
       for (int i = 0; i < ZINK_SHADER_COUNT; ++i) {
          if (ctx->curr_program->shaders[i])
            existing_shaders |= 1 << i;
      }
+      /* if there's reserved slots, check whether we have enough remaining slots */
+      if (ctx->curr_program->shader_slots_reserved) {
+         uint64_t max_outputs = 0;
+         uint32_t num_xfb_outputs = 0;
+         for (int i = 0; i < ZINK_SHADER_COUNT; ++i) {
+            if (i != PIPE_SHADER_TESS_CTRL &&
+                i != PIPE_SHADER_FRAGMENT &&
+                ctx->gfx_stages[i]) {
+               uint32_t user_outputs = ctx->gfx_stages[i]->nir->info.outputs_written >> 32;
+               uint32_t builtin_outputs = ctx->gfx_stages[i]->nir->info.outputs_written;
+               num_xfb_outputs = MAX2(num_xfb_outputs, ctx->gfx_stages[i]->streamout.so_info.num_outputs);
+               unsigned user_outputs_count = 0;
+               /* check builtins first */
+               u_foreach_bit(slot, builtin_outputs) {
+                  switch (slot) {
+                  /* none of these require slot map entries */
+                  case VARYING_SLOT_POS:
+                  case VARYING_SLOT_PSIZ:
+                  case VARYING_SLOT_LAYER:
+                  case VARYING_SLOT_PRIMITIVE_ID:
+                  case VARYING_SLOT_CULL_DIST0:
+                  case VARYING_SLOT_CLIP_DIST0:
+                  case VARYING_SLOT_VIEWPORT:
+                  case VARYING_SLOT_TESS_LEVEL_INNER:
+                  case VARYING_SLOT_TESS_LEVEL_OUTER:
+                     break;
+                  default:
+                     /* remaining legacy builtins only require 1 slot each */
+                     if (ctx->curr_program->shader_slot_map[slot] == -1)
+                        user_outputs_count++;
+                     break;
+                  }
+               }
+               u_foreach_bit(slot, user_outputs) {
+                  if (ctx->curr_program->shader_slot_map[slot] == -1) {
+                     /* user variables can span multiple slots */
+                     nir_variable *var = nir_find_variable_with_location(ctx->gfx_stages[i]->nir,
+                                                                         nir_var_shader_out, slot);
+                     assert(var);
+                     user_outputs_count += glsl_count_vec4_slots(var->type, false, false);
+                  }
+               }
+               max_outputs = MAX2(max_outputs, user_outputs_count);
+            }
+         }
+         /* slot map can only hold 32 entries, so dump this one if we'll exceed that */
+         if (ctx->curr_program->shader_slots_reserved + max_outputs + num_xfb_outputs > 32)
+            needs_new_map = true;
+      }
    }
-   if (ctx->dirty_shader_stages == existing_shaders || !existing_shaders) {
+
+   if (needs_new_map || ctx->dirty_shader_stages == existing_shaders || !existing_shaders) {
       /* all shaders are being recompiled: new slot map */
       memset(prog->shader_slot_map, -1, sizeof(prog->shader_slot_map));
       /* we need the slot map to match up, so we can't reuse the previous cache if we can't guarantee