Commit 91f8f828 authored by Rhys Perry's avatar Rhys Perry Committed by Marge Bot
Browse files

radv,aco: use all attributes in a binding to obtain an alignment for fetch



Instead of assuming scalar alignment for an attribute, we can use the
required alignment of other attributes in a binding to expect a higher
one.

This uses the alignment of all attributes in the pipeline, not just the
ones loaded. This can create slightly better code, but could break
pipelines which relied on unused (and unaligned) attributes not being
loaded. I don't think such pipelines are allowed by the spec.

fossil-db (Sienna Cichlid):
Totals from 44350 (30.32% of 146267) affected shaders:
VGPRs: 1694464 -> 1700616 (+0.36%); split: -0.08%, +0.44%
CodeSize: 60207184 -> 58093836 (-3.51%); split: -3.51%, +0.00%
MaxWaves: 1175998 -> 1174948 (-0.09%); split: +0.02%, -0.11%
Instrs: 11763444 -> 11458952 (-2.59%); split: -2.60%, +0.01%
Latency: 70679612 -> 67062215 (-5.12%); split: -5.27%, +0.15%
InvThroughput: 11482495 -> 11362911 (-1.04%); split: -1.20%, +0.16%
VClause: 359459 -> 343248 (-4.51%); split: -6.36%, +1.85%
SClause: 422404 -> 419229 (-0.75%); split: -1.17%, +0.42%
Copies: 754384 -> 764368 (+1.32%); split: -1.74%, +3.06%
Branches: 197472 -> 197474 (+0.00%); split: -0.03%, +0.03%
PreVGPRs: 1215348 -> 1215503 (+0.01%)
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <mesa/mesa!9007>
parent d41faa69
......@@ -4592,23 +4592,25 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr
}
/* Check whether a single typed buffer fetch of `channels` components is safe
 * for this attribute, i.e. whether it can be done without risking a
 * misaligned access.
 *
 * ctx           - instruction-selection context (provides chip_class)
 * vtx_info      - per-format data (channel byte size, channel count)
 * offset        - attribute byte offset within the vertex binding
 * binding_align - known minimum alignment of the binding, derived from the
 *                 required alignment of ALL attributes in the binding (may be
 *                 0 when unknown; treated as 1)
 * channels      - number of components the fetch would load
 *
 * Returns true when a combined fetch of `channels` components is allowed.
 */
bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
                             unsigned offset, unsigned binding_align, unsigned channels)
{
   unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;

   /* vec3 fetches are only supported for 4-byte channels. */
   if (vtx_info->chan_byte_size != 4 && channels == 3)
      return false;

   /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
    * alignment issues that trigger memory violations and eventually a GPU
    * hang. This can happen if the stride (static or dynamic) is unaligned and
    * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO
    * offset is 2 for R16G16B16A16_SNORM).
    *
    * GFX7-GFX9 tolerate scalar-aligned fetches, so no further checks are
    * needed there. Elsewhere, require both the attribute offset and the
    * binding alignment to be multiples of the fetch size; MAX2(binding_align,
    * 1) guards against a zero (unknown) alignment.
    */
   return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
          (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
}
uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
unsigned offset, unsigned stride, unsigned *channels)
unsigned offset, unsigned stride, unsigned *channels,
unsigned binding_align)
{
if (!vtx_info->chan_byte_size) {
*channels = vtx_info->num_channels;
......@@ -4616,10 +4618,11 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_
}
unsigned num_channels = *channels;
if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
unsigned new_channels = num_channels + 1;
/* first, assume more loads is worse and try using a larger data format */
while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
while (new_channels <= 4 &&
!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
new_channels++;
/* don't make the attribute potentially out-of-bounds */
if (offset + new_channels * vtx_info->chan_byte_size > stride)
......@@ -4629,7 +4632,8 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_
if (new_channels == 5) {
/* then try decreasing load size (at the cost of more loads) */
new_channels = *channels;
while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
while (new_channels > 1 &&
!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
new_channels--;
}
......@@ -4708,6 +4712,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
enum ac_fetch_format alpha_adjust = ctx->options->key.vs.alpha_adjust[location];
unsigned dfmt = attrib_format & 0xf;
......@@ -4776,7 +4781,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
vtx_info->chan_byte_size == 4;
unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
if (!use_mubuf) {
fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component,
binding_align);
} else {
if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
/* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
......
......@@ -2585,6 +2585,17 @@ radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline,
key.vertex_attribute_bindings[location] = desc->binding;
key.vertex_attribute_offsets[location] = desc->offset;
const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
unsigned attrib_align =
dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;
/* If desc->offset is misaligned, then the buffer offset must be too. Just
* skip updating vertex_binding_align in this case.
*/
if (desc->offset % attrib_align == 0)
key.vertex_binding_align[desc->binding] =
MAX2(key.vertex_binding_align[desc->binding], attrib_align);
if (!uses_dynamic_stride) {
/* From the Vulkan spec 1.2.157:
*
......@@ -2708,6 +2719,8 @@ radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key
keys[MESA_SHADER_VERTEX].vs.vertex_attribute_strides[i] = key->vertex_attribute_strides[i];
keys[MESA_SHADER_VERTEX].vs.alpha_adjust[i] = key->vertex_alpha_adjust[i];
}
for (unsigned i = 0; i < MAX_VBS; ++i)
keys[MESA_SHADER_VERTEX].vs.vertex_binding_align[i] = key->vertex_binding_align[i];
keys[MESA_SHADER_VERTEX].vs.outprim = si_conv_prim_to_gs_out(key->topology);
keys[MESA_SHADER_VERTEX].vs.provoking_vtx_last = key->provoking_vtx_last;
......
......@@ -357,6 +357,7 @@ struct radv_pipeline_key {
uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS];
uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS];
uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS];
uint8_t vertex_binding_align[MAX_VBS];
enum ac_fetch_format vertex_alpha_adjust[MAX_VERTEX_ATTRIBS];
uint32_t vertex_post_shuffle;
unsigned tess_input_vertices;
......
......@@ -66,6 +66,7 @@ struct radv_vs_variant_key {
uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS];
uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS];
uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS];
uint8_t vertex_binding_align[MAX_VBS];
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
* so we may need to fix it up. */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment