Commit a737470a authored by Daniel Schürmann
Browse files

ac: set .align_mul = 16u/8u for load/store_shared

Since the shared variables are now written and loaded per slot,
load_shared can use an align_mul of 16 instead of 4.
For store_shared, .align_mul = 16u causes additional copies,
so it is only raised from 4 to 8u.

Totals from 135 (0.09% of 149839) affected shaders: (GFX10.3)
VGPRs: 6504 -> 6776 (+4.18%); split: -0.12%, +4.31%
CodeSize: 505684 -> 479276 (-5.22%); split: -5.36%, +0.13%
MaxWaves: 2926 -> 2854 (-2.46%); split: +0.07%, -2.53%
Instrs: 89882 -> 87780 (-2.34%); split: -3.41%, +1.08%
Latency: 321525 -> 313024 (-2.64%); split: -3.13%, +0.48%
InvThroughput: 96611 -> 96225 (-0.40%); split: -2.87%, +2.47%
Copies: 7501 -> 10076 (+34.33%)
PreVGPRs: 5113 -> 5171 (+1.13%); split: -0.04%, +1.17%
parent 2f5b37e5
Pipeline #340722 waiting for manual action with stages
...@@ -597,7 +597,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri ...@@ -597,7 +597,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
} }
/* Store the output to LDS */ /* Store the output to LDS */
nir_build_store_shared(b, out_val, gs_emit_vtx_addr, .base = packed_location * 16, .align_mul = 4, .write_mask = write_mask); nir_build_store_shared(b, out_val, gs_emit_vtx_addr, .base = packed_location * 16, .align_mul = 8u, .write_mask = write_mask);
} }
/* Calculate and store per-vertex primitive flags based on vertex counts: /* Calculate and store per-vertex primitive flags based on vertex counts:
...@@ -739,7 +739,7 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def ...@@ -739,7 +739,7 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot))); unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
nir_io_semantics io_sem = { .location = slot, .num_slots = 1 }; nir_io_semantics io_sem = { .location = slot, .num_slots = 1 };
nir_ssa_def *load = nir_build_load_shared(b, 4, 32, exported_out_vtx_lds_addr, .base = packed_location * 16u, .align_mul = 4u); nir_ssa_def *load = nir_build_load_shared(b, 4, 32, exported_out_vtx_lds_addr, .base = packed_location * 16u, .align_mul = 16u);
for (unsigned comp = 0; comp < 4; ++comp) { for (unsigned comp = 0; comp < 4; ++comp) {
gs_output_component_info *info = &s->output_component_info[slot][comp]; gs_output_component_info *info = &s->output_component_info[slot][comp];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment