diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index f34b5691a781a3758ffa56993ab1b889674a2c4c..27da22fd2e8350456d327ec03fd0da0fd817c06b 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -10865,7 +10865,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Tem
    /* VS/TES: we infer the vertex and primitive count from arguments
     * GS: the caller needs to supply them
     */
-   assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY
+   assert((ctx->stage & sw_gs)
           ? (vtx_cnt.id() && prm_cnt.id())
           : (!vtx_cnt.id() && !prm_cnt.id()));
 
@@ -11330,9 +11330,18 @@ void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg,
    begin_divergent_if_then(ctx, &ic, is_vtx_export_thread);
    bld.reset(ctx->block);
 
-   /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
-   Operand m = load_lds_size_m0(bld);
-   Temp exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
+   /* The index of the vertex that the current thread will export. */
+   Temp exported_vtx_idx;
+
+   if (ctx->ngg_gs_early_alloc) {
+      /* No vertex compaction necessary, the thread can export its own vertex. */
+      exported_vtx_idx = tid_in_tg;
+   } else {
+      /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
+      Operand m = load_lds_size_m0(bld);
+      exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
+   }
+
    /* Get the LDS address of the vertex that the current thread must export. */
    Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx);
 
@@ -11367,6 +11376,19 @@ void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg,
    end_divergent_if(ctx, &ic);
 }
 
+void ngg_gs_prelude(isel_context *ctx)
+{
+   if (!ctx->ngg_gs_early_alloc)
+      return;
+
+   /* We know the GS writes the maximum possible number of vertices, so
+    * it's likely that most threads need to export a primitive, too.
+    * Thus, we won't have to worry about primitive compaction here.
+    */
+   Temp num_max_vertices = ngg_max_vertex_count(ctx);
+   ngg_emit_sendmsg_gs_alloc_req(ctx, num_max_vertices, num_max_vertices);
+}
+
 void ngg_gs_finale(isel_context *ctx)
 {
    if_context ic;
@@ -11391,19 +11413,33 @@ void ngg_gs_finale(isel_context *ctx)
     */
    Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0));
 
-   /* Perform a workgroup reduction and exclusive scan. */
-   std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
-   bld.reset(ctx->block);
    /* Total number of vertices emitted by the workgroup. */
-   Temp wg_vtx_cnt = wg_scan.first;
+   Temp wg_vtx_cnt;
    /* ID of the thread which will export the current thread's vertex. */
-   Temp exporter_tid_in_tg = wg_scan.second;
-   /* Skip all exports when possible. */
-   Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
-   max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
+   Temp exporter_tid_in_tg;
+
+   if (ctx->ngg_gs_early_alloc) {
+      /* There is no need for a scan or vertex compaction, we know that
+       * the GS writes all possible vertices so each thread can export its own vertex.
+       */
+      wg_vtx_cnt = max_vtxcnt;
+      exporter_tid_in_tg = tid_in_tg;
+   } else {
+      /* Perform a workgroup reduction and exclusive scan. */
+      std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
+      bld.reset(ctx->block);
+      /* Total number of vertices emitted by the workgroup. */
+      wg_vtx_cnt = wg_scan.first;
+      /* ID of the thread which will export the current thread's vertex. */
+      exporter_tid_in_tg = wg_scan.second;
+      /* Skip all exports when possible. */
+      Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
+      max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
+
+      ngg_emit_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
+      ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
+   }
 
-   ngg_emit_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
-   ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
    ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0);
    ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr);
 }
@@ -11440,6 +11476,8 @@ void select_program(Program *program,
 
       if (ngg_no_gs)
         ngg_nogs_prelude(&ctx);
+      else if (!i && ngg_gs)
+        ngg_gs_prelude(&ctx);
 
       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
       nir_function_impl *func = nir_shader_get_entrypoint(nir);
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 69ef7809405c8144f4f60dd2978cbfbabaa68d92..360f7db38b39cabc222ea7b0dc41962bcb65f15a 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -94,11 +94,14 @@ struct isel_context {
 
    /* GS inputs */
    bool ngg_nogs_early_prim_export = false;
+   bool ngg_gs_early_alloc = false;
    Temp gs_wave_id;
    unsigned ngg_gs_emit_addr = 0;
    unsigned ngg_gs_emit_vtx_bytes = 0;
    unsigned ngg_gs_scratch_addr = 0;
    unsigned ngg_gs_primflags_offset = 0;
+   int ngg_gs_const_vtxcnt[4];
+   int ngg_gs_const_prmcnt[4];
 
    /* VS output information */
    bool export_clip_dists;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index a3030346c4e77e83e49a304ab9d33f44c0c53657..352f40b94696ddd29e96f3f438ef1e6b032100e9 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -516,6 +516,10 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
 
       /* Make sure we have enough room for emitted GS vertices */
       assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0);
+
+      /* See if the number of vertices and primitives are compile-time known */
+      nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u);
+      ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
    }
 
    if (ctx->stage & sw_vs)