diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 19fe7f9c6bf99227160b87e0b0d5acdf7d8366db..efe06a891246b5fdc03a402791f4e1ed34ea2519 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -6805,7 +6805,114 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
 }
 
-void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
+unsigned gs_outprim_vertices(unsigned outprim)
+{
+   switch (outprim) {
+   case 0: /* GL_POINTS */
+      return 1;
+   case 3: /* GL_LINE_STRIP */
+      return 2;
+   case 5: /* GL_TRIANGLE_STRIP */
+      return 3;
+   default:
+      unreachable("Unsupported GS output primitive type.");
+   }
+}
+
+void ngg_visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp emit_vertex_idx = get_ssa_temp(ctx, instr->src[0].ssa);
+   Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, emit_vertex_idx);
+   unsigned stream = nir_intrinsic_stream_id(instr);
+   unsigned out_idx = 0;
+
+   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
+      if (ctx->program->info->gs.output_streams[i] != stream) {
+         continue;
+      } else if (!ctx->outputs.mask[i] && ctx->program->info->gs.output_usage_mask[i]) {
+         /* The GS can write this output, but it's empty for the current vertex. */
+         out_idx++;
+         continue;
+      }
+
+      uint32_t wrmask = ctx->program->info->gs.output_usage_mask[i] &
+                        ctx->outputs.mask[i];
+
+      /* Clear output for the next vertex. */
+      ctx->outputs.mask[i] = 0;
+
+      if (!wrmask)
+         continue;
+
+      for (unsigned j = 0; j < 4; j++) {
+         if (wrmask & (1 << j)) {
+            Temp elem = ctx->outputs.temps[i * 4u + j];
+            store_lds(ctx, elem.bytes(), elem, 0x1u, emit_vertex_addr, out_idx * 4u, 4u);
+         }
+
+         out_idx++;
+      }
+   }
+
+   /* Calculate per-vertex primitive flags based on current and total vertex count per primitive:
+    *   bit 0: whether this vertex finishes a primitive
+    *   bit 1: whether the primitive is odd (if we are emitting triangle strips, otherwise always 0)
+    *   bit 2: always 1 (so that we can use it for determining vertex liveness)
+    */
+   unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
+   bool calc_odd = stream == 0 && total_vtx_per_prim == 3;
+   Temp prim_flag;
+
+   if (nir_src_is_const(instr->src[1])) {
+      uint8_t current_vtx_per_prim = nir_src_as_uint(instr->src[1]);
+      uint8_t completes_prim = (current_vtx_per_prim >= (total_vtx_per_prim - 1)) ? 1 : 0;
+      uint8_t odd = calc_odd & current_vtx_per_prim;
+      uint8_t flag = completes_prim | (odd << 1) | (1 << 2);
+      prim_flag = bld.copy(bld.def(v1b), Operand(flag));
+   } else if (!instr->src[1].ssa->divergent) {
+      Temp current_vtx_per_prim = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
+      Temp completes_prim = bld.sopc(aco_opcode::s_cmp_le_u32, bld.def(s1, scc), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
+      prim_flag = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0b101u), Operand(0b100u), bld.scc(completes_prim));
+      if (calc_odd) {
+         Temp odd = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), current_vtx_per_prim, Operand(0u));
+         prim_flag = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1), bld.def(s1, scc), odd, prim_flag);
+      }
+   } else {
+      Temp current_vtx_per_prim = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+      Temp completes_prim = bld.vopc(aco_opcode::v_cmp_le_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
+      prim_flag = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0b100u), Operand(0b101u), Operand(completes_prim));
+      if (calc_odd) {
+         Temp odd = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), current_vtx_per_prim);
+         prim_flag = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), odd, Operand(1u), prim_flag);
+      }
+   }
+
+   /* Store the per-vertex primitive flags at the end of the vertex data */
+   prim_flag = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), as_vgpr(ctx, prim_flag), Operand(0u));
+   unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
+   store_lds(ctx, 1, prim_flag, 1u, emit_vertex_addr, primflag_offset, 1);
+}
+
+void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream);
+
+void ngg_visit_set_vertex_and_primitive_count(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   unsigned stream = nir_intrinsic_stream_id(instr);
+   if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
+      return;
+
+   /* Clear the primitive flags of non-emitted GS vertices. */
+   if (!nir_src_is_const(instr->src[0]) || nir_src_as_uint(instr->src[0]) < ctx->shader->info.gs.vertices_out) {
+      Temp vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
+      ngg_gs_clear_primflags(ctx, vtx_cnt, stream);
+   }
+
+   /* TODO: also make use of the primitive count */
+}
+
+void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
+{
    Builder bld(ctx->program, ctx->block);
 
    unsigned stream = nir_intrinsic_stream_id(instr);
@@ -8033,16 +8140,23 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_emit_vertex_with_counter: {
-      visit_emit_vertex_with_counter(ctx, instr);
+      if (ctx->stage & hw_ngg_gs)
+         ngg_visit_emit_vertex_with_counter(ctx, instr);
+      else
+         visit_emit_vertex_with_counter(ctx, instr);
       break;
    }
    case nir_intrinsic_end_primitive_with_counter: {
-      unsigned stream = nir_intrinsic_stream_id(instr);
-      bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
+      if ((ctx->stage & hw_ngg_gs) == 0) {
+         unsigned stream = nir_intrinsic_stream_id(instr);
+         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
+      }
       break;
    }
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      /* unused, the HW keeps track of this for us */
+      if (ctx->stage & hw_ngg_gs)
+         ngg_visit_set_vertex_and_primitive_count(ctx, instr);
+      /* unused in the legacy pipeline, the HW keeps track of this for us */
       break;
    }
    default:
@@ -11017,6 +11131,213 @@ std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp s
    return std::make_pair(wg_reduction, wg_excl);
 }
 
+void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream)
+{
+   loop_context lc;
+   if_context ic;
+   Builder bld(ctx->program, ctx->block);
+   Temp zero = bld.copy(bld.def(v1b), Operand(uint8_t(0)));
+   Temp counter_init = bld.copy(bld.def(v1), as_vgpr(ctx, vtx_cnt));
+
+   begin_loop(ctx, &lc);
+
+   Temp incremented_counter = bld.tmp(counter_init.regClass());
+   bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
+   Temp counter = bld.pseudo(aco_opcode::p_phi, bld.def(counter_init.regClass()), Operand(counter_init), incremented_counter);
+   bld.reset(ctx->block);
+   Temp break_cond = bld.vopc(aco_opcode::v_cmp_le_u32, bld.def(bld.lm), Operand(ctx->shader->info.gs.vertices_out), counter);
+
+   /* Break when vertices_out <= counter */
+   begin_divergent_if_then(ctx, &ic, break_cond);
+   emit_loop_break(ctx);
+   begin_divergent_if_else(ctx, &ic);
+   end_divergent_if(ctx, &ic);
+   bld.reset(ctx->block);
+
+   /* Store zero to the primitive flag of the current vertex for the current stream */
+   Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, counter);
+   unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
+   store_lds(ctx, 1, zero, 0xf, emit_vertex_addr, primflag_offset, 1);
+
+   /* Increment counter */
+   bld.vadd32(Definition(incremented_counter), counter, Operand(1u));
+
+   end_loop(ctx, &lc);
+}
+
+Temp ngg_gs_load_prim_flag_0(isel_context *ctx, Temp tid_in_tg, Temp max_vtxcnt, Temp vertex_lds_addr)
+{
+   if_context ic;
+   Builder bld(ctx->program, ctx->block);
+
+   Temp is_vertex_emit_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_vtxcnt, tid_in_tg);
+   begin_divergent_if_then(ctx, &ic, is_vertex_emit_thread);
+   bld.reset(ctx->block);
+
+   Operand m = load_lds_size_m0(bld);
+   Temp prim_flag_0 = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset);
+
+   begin_divergent_if_else(ctx, &ic);
+   end_divergent_if(ctx, &ic);
+
+   bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
+   prim_flag_0 = bld.pseudo(aco_opcode::p_phi, bld.def(prim_flag_0.regClass()), Operand(prim_flag_0), Operand(0u));
+
+   return prim_flag_0;
+}
+
+void ngg_gs_setup_vertex_compaction(isel_context *ctx, Temp vertex_live, Temp tid_in_tg, Temp exporter_tid_in_tg)
+{
+   if_context ic;
+   Builder bld(ctx->program, ctx->block);
+   assert(vertex_live.regClass() == bld.lm);
+
+   begin_divergent_if_then(ctx, &ic, vertex_live);
+   bld.reset(ctx->block);
+
+   /* Setup the vertex compaction.
+    * Save the current thread's id for the thread which will export the current vertex.
+    * We reuse stream 1 of the primitive flag of the other thread's vertex for storing this.
+    */
+   Temp export_thread_lds_addr = ngg_gs_vertex_lds_addr(ctx, exporter_tid_in_tg);
+   tid_in_tg = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tid_in_tg, Operand(0u));
+   store_lds(ctx, 1u, tid_in_tg, 1u, export_thread_lds_addr, ctx->ngg_gs_primflags_offset + 1u, 1u);
+
+   begin_divergent_if_else(ctx, &ic);
+   end_divergent_if(ctx, &ic);
+   bld.reset(ctx->block);
+
+   /* Wait for all waves to setup the vertex compaction. */
+   create_workgroup_barrier(bld);
+}
+
+void ngg_gs_export_primitives(isel_context *ctx, Temp max_prmcnt, Temp tid_in_tg, Temp exporter_tid_in_tg,
+                              Temp prim_flag_0)
+{
+   if_context ic;
+   Builder bld(ctx->program, ctx->block);
+   unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
+   assert(total_vtx_per_prim <= 3);
+
+   Temp is_prim_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_prmcnt, tid_in_tg);
+   begin_divergent_if_then(ctx, &ic, is_prim_export_thread);
+   bld.reset(ctx->block);
+
+   Temp is_null_prim = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(-1u), prim_flag_0);
+   Temp indices[3];
+
+   indices[total_vtx_per_prim - 1] = exporter_tid_in_tg;
+   if (total_vtx_per_prim >= 2)
+      indices[total_vtx_per_prim - 2] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(1u));
+   if (total_vtx_per_prim == 3)
+      indices[total_vtx_per_prim - 3] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(2u));
+
+   if (total_vtx_per_prim == 3) {
+      /* API GS outputs triangle strips, but NGG HW needs triangles.
+       * We already have triangles due to how we set the primitive flags, but we need to
+       * make sure the vertex order is such that the front/back is correct and the provoking vertex is kept.
+       */
+
+      /* If the primitive is odd, this will increment indices[1] and decrement indices[2] */
+      Temp is_odd = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), Operand(prim_flag_0), Operand(1u), Operand(1u));
+      indices[1] = bld.vadd32(bld.def(v1), indices[1], Operand(is_odd));
+      indices[2] = bld.vsub32(bld.def(v1), indices[2], Operand(is_odd));
+   }
+
+   ngg_emit_prim_export(ctx, total_vtx_per_prim, indices, is_null_prim);
+
+   begin_divergent_if_else(ctx, &ic);
+   end_divergent_if(ctx, &ic);
+}
+
+void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg, Temp vertex_lds_addr)
+{
+   if_context ic;
+   Builder bld(ctx->program, ctx->block);
+
+   /* See if the current thread has to export a vertex. */
+   Temp is_vtx_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), wg_vtx_cnt, tid_in_tg);
+   begin_divergent_if_then(ctx, &ic, is_vtx_export_thread);
+   bld.reset(ctx->block);
+
+   /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
+   Operand m = load_lds_size_m0(bld);
+   Temp exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
+   /* Get the LDS address of the vertex that the current thread must export. */
+   Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx);
+
+   /* Read the vertex attributes from LDS. */
+   unsigned out_idx = 0;
+   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
+      if (ctx->program->info->gs.output_streams[i] != 0)
+         continue;
+
+      /* Set the output mask to the GS output usage mask. */
+      unsigned rdmask =
+         ctx->outputs.mask[i] =
+         ctx->program->info->gs.output_usage_mask[i];
+
+      if (!rdmask)
+         continue;
+
+      for (unsigned j = 0; j < 4; j++) {
+         if (rdmask & (1 << j))
+            ctx->outputs.temps[i * 4u + j] =
+               load_lds(ctx, 4u, bld.tmp(v1), exported_vtx_addr, out_idx * 4u, 4u);
+
+         out_idx++;
+      }
+   }
+
+   /* Export the vertex parameters. */
+   create_vs_exports(ctx);
+   ctx->block->kind |= block_kind_export_end;
+
+   begin_divergent_if_else(ctx, &ic);
+   end_divergent_if(ctx, &ic);
+}
+
+void ngg_gs_finale(isel_context *ctx)
+{
+   if_context ic;
+   Builder bld(ctx->program, ctx->block);
+
+   /* Wait for all waves to reach the epilogue. */
+   create_workgroup_barrier(bld);
+
+   /* Thread ID in the entire threadgroup */
+   Temp tid_in_tg = thread_id_in_threadgroup(ctx);
+   /* Number of threads that may need to export a vertex or primitive. */
+   Temp max_vtxcnt = ngg_max_vertex_count(ctx);
+   /* LDS address of the vertex corresponding to the current thread. */
+   Temp vertex_lds_addr = ngg_gs_vertex_lds_addr(ctx, tid_in_tg);
+   /* Primitive flag from stream 0 of the vertex corresponding to the current thread. */
+   Temp prim_flag_0 = ngg_gs_load_prim_flag_0(ctx, tid_in_tg, max_vtxcnt, vertex_lds_addr);
+
+   bld.reset(ctx->block);
+
+   /* NIR already filters out incomplete primitives and vertices,
+    * so any vertex whose primitive flag is non-zero is considered live/valid.
+    */
+   Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0));
+
+   /* Perform a workgroup reduction and exclusive scan. */
+   std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
+   bld.reset(ctx->block);
+   /* Total number of vertices emitted by the workgroup. */
+   Temp wg_vtx_cnt = wg_scan.first;
+   /* ID of the thread which will export the current thread's vertex. */
+   Temp exporter_tid_in_tg = wg_scan.second;
+   /* Skip all exports when possible. */
+   Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
+   max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
+
+   ngg_emit_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
+   ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
+   ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0);
+   ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr);
+}
+
 } /* end namespace */
 
 void select_program(Program *program,
@@ -11028,6 +11349,7 @@ void select_program(Program *program,
    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
    if_context ic_merged_wave_info;
    bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
+   bool ngg_gs = ctx.stage == ngg_vertex_geometry_gs || ctx.stage == ngg_tess_eval_geometry_gs;
 
    for (unsigned i = 0; i < shader_count; i++) {
       nir_shader *nir = shaders[i];
@@ -11088,7 +11410,7 @@ void select_program(Program *program,
          ctx.block->kind |= block_kind_export_end;
       } else if (ngg_no_gs && ctx.ngg_nogs_early_prim_export) {
          ngg_nogs_export_vertices(&ctx);
-      } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+      } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
          Builder bld(ctx.program, ctx.block);
          bld.barrier(aco_opcode::p_barrier,
                      memory_sync_info(storage_vmem_output, semantic_release, scope_device));
@@ -11109,6 +11431,8 @@ void select_program(Program *program,
 
       if (ngg_no_gs && !ctx.ngg_nogs_early_prim_export)
          ngg_nogs_late_export_finale(&ctx);
+      else if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
+         ngg_gs_finale(&ctx);
 
       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
          /* Outputs of the previous stage are inputs to the next stage */