From abfd4651ed683dec2cd11c15f9ce95e0a31fa72c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 6 Nov 2019 11:19:00 -0600 Subject: [PATCH 01/15] anv/pipeline: Assume layout != NULL In the early days of the driver we allowed layout to be VK_NULL_HANDLE and used that for some internal pipelines when we wanted to be lazy. Vulkan doesn't actually allow NULL layouts, however, so there's no reason to have this check. Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_pipeline.c | 40 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 4992d290ee76..7e8a825a7d69 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -712,27 +712,25 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, nir_address_format_64bit_global); /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ - if (layout) { - anv_nir_apply_pipeline_layout(pdevice, - pipeline->device->robust_buffer_access, - layout, nir, prog_data, - &stage->bind_map); - - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, - nir_address_format_32bit_index_offset); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, - anv_nir_ssbo_addr_format(pdevice, - pipeline->device->robust_buffer_access)); - - NIR_PASS_V(nir, nir_opt_constant_folding); - - /* We don't support non-uniform UBOs and non-uniform SSBO access is - * handled naturally by falling back to A64 messages. - */ - NIR_PASS_V(nir, nir_lower_non_uniform_access, - nir_lower_non_uniform_texture_access | - nir_lower_non_uniform_image_access); - } + anv_nir_apply_pipeline_layout(pdevice, + pipeline->device->robust_buffer_access, + layout, nir, prog_data, + &stage->bind_map); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, + nir_address_format_32bit_index_offset); + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, + anv_nir_ssbo_addr_format(pdevice, + pipeline->device->robust_buffer_access)); + + NIR_PASS_V(nir, nir_opt_constant_folding); + + /* We don't support non-uniform UBOs and non-uniform SSBO access is + * handled naturally by falling back to A64 messages. 
+ */ + NIR_PASS_V(nir, nir_lower_non_uniform_access, + nir_lower_non_uniform_texture_access | + nir_lower_non_uniform_image_access); if (nir->info.stage != MESA_SHADER_COMPUTE) brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); -- GitLab From 0a02f2a27844f30c7c5048c36d75a626cab13838 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Oct 2019 10:25:48 -0500 Subject: [PATCH 02/15] genxml: Mark everything in genX_pack.h always_inline Reviewed-by: Lionel Landwerlin --- src/intel/genxml/gen_pack_header.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/intel/genxml/gen_pack_header.py b/src/intel/genxml/gen_pack_header.py index 118cc6330aff..2795e5905bef 100644 --- a/src/intel/genxml/gen_pack_header.py +++ b/src/intel/genxml/gen_pack_header.py @@ -69,13 +69,13 @@ union __gen_value { uint32_t dw; }; -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_mbo(uint32_t start, uint32_t end) { return (~0ull >> (64 - (end - start + 1))) << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_uint(uint64_t v, uint32_t start, NDEBUG_UNUSED uint32_t end) { __gen_validate_value(v); @@ -91,7 +91,7 @@ __gen_uint(uint64_t v, uint32_t start, NDEBUG_UNUSED uint32_t end) return v << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_sint(int64_t v, uint32_t start, uint32_t end) { const int width = end - start + 1; @@ -111,7 +111,7 @@ __gen_sint(int64_t v, uint32_t start, uint32_t end) return (v & mask) << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_offset(uint64_t v, NDEBUG_UNUSED uint32_t start, NDEBUG_UNUSED uint32_t end) { __gen_validate_value(v); @@ -124,14 +124,14 @@ __gen_offset(uint64_t v, NDEBUG_UNUSED uint32_t start, NDEBUG_UNUSED uint32_t en return v; } -static inline uint32_t +static inline __attribute__((always_inline)) uint32_t __gen_float(float v) { __gen_validate_value(v); return ((union __gen_value) { .f = (v) }).dw; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) { __gen_validate_value(v); @@ -150,7 +150,7 @@ __gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) return (int_val & mask) << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_ufixed(float v, uint32_t start, NDEBUG_UNUSED uint32_t end, uint32_t fract_bits) { __gen_validate_value(v); @@ -618,7 +618,7 @@ class Parser(object): def emit_pack_function(self, name, group): name = self.gen_prefix(name) print(textwrap.dedent("""\ - static inline void + static inline __attribute__((always_inline)) void %s_pack(__attribute__((unused)) __gen_user_data *data, %s__attribute__((unused)) void * restrict dst, %s__attribute__((unused)) const struct %s * restrict values) -- GitLab From fa120cb31cd8802b2a3a758acb1d911b5cbb22a2 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Oct 2019 16:57:29 -0500 Subject: [PATCH 03/15] anv: Input attachments are always single-plane Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/genX_cmd_buffer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 5342cd7860f8..67ba0307195c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2282,10 +2282,11 @@ emit_binding_table(struct 
anv_cmd_buffer *cmd_buffer, /* For depth and stencil input attachments, we treat it like any * old texture that a user may have bound. */ + assert(desc->image_view->n_planes == 1); struct anv_surface_state sstate = (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? - desc->image_view->planes[binding->plane].general_sampler_surface_state : - desc->image_view->planes[binding->plane].optimal_sampler_surface_state; + desc->image_view->planes[0].general_sampler_surface_state : + desc->image_view->planes[0].optimal_sampler_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); if (need_client_mem_relocs) -- GitLab From 0709c0f6b40b1e365104b248464ffefa746b5052 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Oct 2019 14:09:39 -0500 Subject: [PATCH 04/15] anv: Flatten descriptor bindings in anv_nir_apply_pipeline_layout This lets us stop tracking the pipeline layout. It also means less indirection on a very hot path. As an extra bonus, we can make some of our data structures smaller. No measurable CPU overhead improvement. Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_cmd_buffer.c | 8 ---- .../vulkan/anv_nir_apply_pipeline_layout.c | 39 ++++++++++------ src/intel/vulkan/anv_pipeline.c | 3 -- src/intel/vulkan/anv_private.h | 34 ++++++++------ src/intel/vulkan/genX_cmd_buffer.c | 45 ++++--------------- src/intel/vulkan/genX_pipeline.c | 1 - 6 files changed, 54 insertions(+), 76 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 59a281d60532..4bf2725dccce 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -614,14 +614,6 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; } - - /* Pipeline layout objects are required to live at least while any command - * buffers that use them are in recording state. We need to grab a reference - * to the pipeline layout being bound here so we can compute correct dynamic - * offsets for VK_DESCRIPTOR_TYPE_*_DYNAMIC in dynamic_offset_for_binding() - * when we record draw commands that come after this. - */ - pipe_state->layout = layout; } void anv_CmdBindDescriptorSets( diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index 1d13aa604924..c4717ea7665f 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -1137,7 +1137,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, map->surface_to_descriptor[map->surface_count] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, - .binding = s, + .index = s, }; state.set[s].desc_offset = map->surface_count; map->surface_count++; @@ -1226,16 +1226,28 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, state.set[set].surface_offsets[b] = BINDLESS_OFFSET; } else { state.set[set].surface_offsets[b] = map->surface_count; - struct anv_sampler **samplers = binding->immutable_samplers; - for (unsigned i = 0; i < binding->array_size; i++) { - uint8_t planes = samplers ? samplers[i]->n_planes : 1; - for (uint8_t p = 0; p < planes; p++) { + if (binding->dynamic_offset_index < 0) { + struct anv_sampler **samplers = binding->immutable_samplers; + for (unsigned i = 0; i < binding->array_size; i++) { + uint8_t planes = samplers ? 
samplers[i]->n_planes : 1; + for (uint8_t p = 0; p < planes; p++) { + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .plane = p, + }; + } + } + } else { + for (unsigned i = 0; i < binding->array_size; i++) { map->surface_to_descriptor[map->surface_count++] = (struct anv_pipeline_binding) { .set = set, - .binding = b, - .index = i, - .plane = p, + .index = binding->descriptor_index + i, + .dynamic_offset_index = + layout->set[set].dynamic_offset_start + + binding->dynamic_offset_index + i, }; } } @@ -1264,8 +1276,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, map->sampler_to_descriptor[map->sampler_count++] = (struct anv_pipeline_binding) { .set = set, - .binding = b, - .index = i, + .index = binding->descriptor_index + i, .plane = p, }; } @@ -1294,8 +1305,9 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, const uint32_t set = var->data.descriptor_set; const uint32_t binding = var->data.binding; - const uint32_t array_size = - layout->set[set].layout->binding[binding].array_size; + struct anv_descriptor_set_binding_layout *bind_layout = + &layout->set[set].layout->binding[binding]; + const uint32_t array_size = bind_layout->array_size; if (state.set[set].use_count[binding] == 0) continue; @@ -1307,8 +1319,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, &map->surface_to_descriptor[state.set[set].surface_offsets[binding]]; for (unsigned i = 0; i < array_size; i++) { assert(pipe_binding[i].set == set); - assert(pipe_binding[i].binding == binding); - assert(pipe_binding[i].index == i); + assert(pipe_binding[i].index == bind_layout->descriptor_index + i); if (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 7e8a825a7d69..0f582e2801ed 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -926,14 +926,12 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, if (stage->key.wm.color_outputs_valid & BITFIELD_BIT(rt)) { rt_bindings[rt] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, .index = rt, }; } else { /* Setup a null render target */ rt_bindings[rt] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, .index = UINT32_MAX, }; } @@ -943,7 +941,6 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, /* Setup a null render target */ rt_bindings[0] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, .index = UINT32_MAX, }; num_rt_bindings = 1; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 0b1e97c64b6b..5b8ac491ce57 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2012,25 +2012,32 @@ anv_descriptor_set_destroy(struct anv_device *device, #define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX struct anv_pipeline_binding { - /* The descriptor set this surface corresponds to. The special value of - * ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS indicates that the offset refers - * to a color attachment and not a regular descriptor. + /** Index in the descriptor set + * + * This is a flattened index; the descriptor set layout is already taken + * into account. 
*/ - uint8_t set; + uint32_t index; - /* Binding in the descriptor set */ - uint32_t binding; + /** The descriptor set this surface corresponds to. + * + * The special ANV_DESCRIPTOR_SET_* values above indicates that this + * binding is not a normal descriptor set but something else. + */ + uint8_t set; - /* Index in the binding */ - uint32_t index; + union { + /** Plane in the binding index for images */ + uint8_t plane; - /* Plane in the binding index */ - uint8_t plane; + /** Input attachment index (relative to the subpass) */ + uint8_t input_attachment_index; - /* Input attachment index (relative to the subpass) */ - uint8_t input_attachment_index; + /** Dynamic offset index (for dynamic UBOs and SSBOs) */ + uint8_t dynamic_offset_index; + }; - /* For a storage image, whether it is write-only */ + /** For a storage image, whether it is write-only */ bool write_only; }; @@ -2468,7 +2475,6 @@ struct anv_attachment_state { */ struct anv_cmd_pipeline_state { struct anv_pipeline *pipeline; - struct anv_pipeline_layout *layout; struct anv_descriptor_set *descriptors[MAX_SETS]; uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 67ba0307195c..b52aabdd984c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2072,34 +2072,6 @@ cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; } -static const struct anv_descriptor * -anv_descriptor_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - const uint32_t offset = - set->layout->binding[binding->binding].descriptor_index; - return &set->descriptors[offset + binding->index]; -} - -static uint32_t -dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - - uint32_t dynamic_offset_idx = - pipe_state->layout->set[binding->set].dynamic_offset_start + - set->layout->binding[binding->binding].dynamic_offset_index + - binding->index; - - return pipe_state->dynamic_offsets[dynamic_offset_idx]; -} - static struct anv_address anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, struct anv_descriptor_set *set) @@ -2179,7 +2151,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { /* Color attachment binding */ assert(stage == MESA_SHADER_FRAGMENT); - assert(binding->binding == 0); if (binding->index < subpass->color_count) { const unsigned att = subpass->color_attachments[binding->index].attachment; @@ -2247,7 +2218,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, * given by binding->binding. (Yes, that's confusing.) 
*/ struct anv_descriptor_set *set = - pipe_state->descriptors[binding->binding]; + pipe_state->descriptors[binding->index]; assert(set->desc_mem.alloc_size); assert(set->desc_surface_state.alloc_size); bt_map[s] = set->desc_surface_state.offset + state_offset; @@ -2257,7 +2228,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, } const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; switch (desc->type) { case VK_DESCRIPTOR_TYPE_SAMPLER: @@ -2339,7 +2310,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { /* Compute the offset within the buffer */ uint32_t dynamic_offset = - dynamic_offset_for_binding(pipe_state, binding); + pipe_state->dynamic_offsets[binding->dynamic_offset_index]; uint64_t offset = desc->offset + dynamic_offset; /* Clamp to the buffer size */ offset = MIN2(offset, desc->buffer->size); @@ -2413,7 +2384,7 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, for (uint32_t s = 0; s < map->sampler_count; s++) { struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) @@ -2608,7 +2579,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, * confusing.) */ struct anv_descriptor_set *set = - gfx_state->base.descriptors[binding->binding]; + gfx_state->base.descriptors[binding->index]; struct anv_address desc_buffer_addr = anv_descriptor_set_address(cmd_buffer, set); const unsigned desc_buffer_size = set->desc_mem.alloc_size; @@ -2618,8 +2589,10 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, read_addr = anv_address_add(desc_buffer_addr, range->start * 32); } else { + struct anv_descriptor_set *set = + gfx_state->base.descriptors[binding->set]; const struct anv_descriptor *desc = - anv_descriptor_for_binding(&gfx_state->base, binding); + &set->descriptors[binding->index]; if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { read_len = MIN2(range->length, @@ -2630,7 +2603,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); uint32_t dynamic_offset = - dynamic_offset_for_binding(&gfx_state->base, binding); + gfx_state->base.dynamic_offsets[binding->dynamic_offset_index]; uint32_t buf_offset = MIN2(desc->offset + dynamic_offset, desc->buffer->size); uint32_t buf_range = diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 5594726e4a29..1d35c99001b8 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -1111,7 +1111,6 @@ emit_cb_state(struct anv_pipeline *pipeline, continue; } - assert(binding->binding == 0); const VkPipelineColorBlendAttachmentState *a = &info->pAttachments[binding->index]; -- GitLab From ebad00d9e7d38647614eaa4800d7ef7dfbfd5767 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 6 Nov 2019 14:13:44 -0600 Subject: [PATCH 05/15] anv: Delete dead shader constant pushing code As of 2d78e55a8c5481, nir_intrinsic_load_constant with a constant offset is constant-folded so we should never end up with any that trigger brw_nir_analyze_ubo_ranges. 
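To spell out the reasoning (an illustrative sketch, not code from this series): a shader can contain both forms of the intrinsic,

   x = load_constant(16)      /* constant offset: now folded to an immediate */
   y = load_constant(i * 4)   /* non-constant offset: survives to the lowering */

and since the first form is constant-folded away, only non-constant offsets reach lower_load_constant. brw_nir_analyze_ubo_ranges can only build push ranges for constant offsets, so the shader-constants push range can no longer be generated, which is what makes the code deleted here unreachable.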
Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_nir_apply_pipeline_layout.c | 5 +++++ src/intel/vulkan/genX_cmd_buffer.c | 15 ++------------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index c4717ea7665f..da64ea0bdf9b 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -782,6 +782,11 @@ lower_load_constant(nir_intrinsic_instr *intrin, b->cursor = nir_before_instr(&intrin->instr); + /* Any constant-offset load_constant instructions should have been removed + * by constant folding. + */ + assert(!nir_src_is_const(intrin->src[0])); + nir_ssa_def *index = nir_imm_int(b, state->constants_offset); nir_ssa_def *offset = nir_iadd(b, nir_ssa_for_src(b, intrin->src[0], 1), nir_imm_int(b, nir_intrinsic_base(intrin))); diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index b52aabdd984c..d5477ee43281 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2561,19 +2561,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, struct anv_address read_addr; uint32_t read_len; - if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { - struct anv_address constant_data = { - .bo = pipeline->device->dynamic_state_pool.block_pool.bo, - .offset = pipeline->shaders[stage]->constant_data.offset, - }; - unsigned constant_data_size = - pipeline->shaders[stage]->constant_data_size; - - read_len = MIN2(range->length, - DIV_ROUND_UP(constant_data_size, 32) - range->start); - read_addr = anv_address_add(constant_data, - range->start * 32); - } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { /* This is a descriptor set buffer so the set index is * actually given by binding->binding. (Yes, that's * confusing.) @@ -2589,6 +2577,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, read_addr = anv_address_add(desc_buffer_addr, range->start * 32); } else { + assert(binding->set < MAX_SETS); struct anv_descriptor_set *set = gfx_state->base.descriptors[binding->set]; const struct anv_descriptor *desc = -- GitLab From 4b392ced2d744fccffe95490ff57e6b41033c266 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 8 Nov 2019 09:33:07 -0600 Subject: [PATCH 06/15] anv: Stop bounds-checking pushed UBOs The bounds checking is actually less safe than just pushing the data. If the bounds checking actually ever kicks in and it's not on the last UBO push range, then the shrinking will cause all subsequent ranges to be pushed to the wrong place in the GRF. One of the behaviors we definitely don't want is for OOB UBO access to result in completely unrelated UBOs returning garbage values. It's safer to just push the UBOs as-requested. If we're really concerned about robustness, we can emit shader code to do bounds checking which should be stupid cheap (a CMP followed by SEL). 
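For illustration, such a check could be emitted in NIR roughly as follows (a hand-written sketch, not code from this series; b, data, offset and range_size stand for values the lowering code would already have at hand):

   /* One CMP and one SEL per 32-bit load: an out-of-bounds offset yields
    * zero instead of data from an unrelated push range.
    */
   nir_ssa_def *in_bounds = nir_ult(b, offset, nir_imm_int(b, range_size));
   nir_ssa_def *checked = nir_bcsel(b, in_bounds, data, nir_imm_int(b, 0));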
Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/genX_cmd_buffer.c | 38 ++++++++---------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index d5477ee43281..a0df116b05cf 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2559,8 +2559,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, const struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[surface]; - struct anv_address read_addr; - uint32_t read_len; + struct anv_address addr; if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { /* This is a descriptor set buffer so the set index is * actually given by binding->binding. (Yes, that's * confusing.) */ struct anv_descriptor_set *set = gfx_state->base.descriptors[binding->index]; - struct anv_address desc_buffer_addr = - anv_descriptor_set_address(cmd_buffer, set); - const unsigned desc_buffer_size = set->desc_mem.alloc_size; - - read_len = MIN2(range->length, - DIV_ROUND_UP(desc_buffer_size, 32) - range->start); - read_addr = anv_address_add(desc_buffer_addr, - range->start * 32); + + addr = anv_descriptor_set_address(cmd_buffer, set); } else { assert(binding->set < MAX_SETS); struct anv_descriptor_set *set = @@ -2584,32 +2577,21 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, &set->descriptors[binding->index]; if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - read_len = MIN2(range->length, - DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start); - read_addr = anv_address_add(desc->buffer_view->address, - range->start * 32); + addr = desc->buffer_view->address; } else { assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); uint32_t dynamic_offset = gfx_state->base.dynamic_offsets[binding->dynamic_offset_index]; - uint32_t buf_offset = - MIN2(desc->offset + dynamic_offset, desc->buffer->size); - uint32_t buf_range = - MIN2(desc->range, desc->buffer->size - buf_offset); - - read_len = MIN2(range->length, - DIV_ROUND_UP(buf_range, 32) - range->start); - read_addr = anv_address_add(desc->buffer->address, - buf_offset + range->start * 32); + addr = anv_address_add(desc->buffer->address, + desc->offset + dynamic_offset); } } - if (read_len > 0) { - c.ConstantBody.Buffer[n] = read_addr; - c.ConstantBody.ReadLength[n] = read_len; - n--; - } + c.ConstantBody.Buffer[n] = + anv_address_add(addr, range->start * 32); + c.ConstantBody.ReadLength[n] = range->length; + n--; } struct anv_state state = -- GitLab From aecde235198f6c8dccb0d26b6397f1efb5e22bfe Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 8 Nov 2019 09:42:30 -0600 Subject: [PATCH 07/15] anv: Pre-compute push ranges for graphics pipelines It turns out that emitting push constants is one of the hottest paths in the driver and ANY work we do there costs us. By pre-computing things a bit ahead of time, we shave 5% off the runtime of a CPU-limited example running with the Dawn WebGPU implementation.
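As a worked example of the table this builds (hypothetical shader): with two pushable UBO ranges plus a client push constant range, the new anv_compute_push_layout pass fills the four 3DSTATE_CONSTANT_* slots from the top down,

   push_ranges[4] = { <unused>, push constants, UBO range B, UBO range A }

so slot 0 is only ever used when slot 3 is, satisfying the Skylake PRM restriction quoted in the pass below.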
Reviewed-by: Lionel Landwerlin --- src/intel/Makefile.sources | 1 + src/intel/vulkan/anv_nir.h | 4 + .../vulkan/anv_nir_compute_push_layout.c | 78 ++++++++++++++++ src/intel/vulkan/anv_pipeline.c | 4 + src/intel/vulkan/anv_pipeline_cache.c | 3 + src/intel/vulkan/anv_private.h | 20 +++++ src/intel/vulkan/genX_cmd_buffer.c | 90 ++++++------------- src/intel/vulkan/meson.build | 1 + 8 files changed, 137 insertions(+), 64 deletions(-) create mode 100644 src/intel/vulkan/anv_nir_compute_push_layout.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index f50b7c991d5b..b3861af32f55 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -258,6 +258,7 @@ VULKAN_FILES := \ vulkan/anv_nir.h \ vulkan/anv_nir_add_base_work_group_id.c \ vulkan/anv_nir_apply_pipeline_layout.c \ + vulkan/anv_nir_compute_push_layout.c \ vulkan/anv_nir_lower_multiview.c \ vulkan/anv_nir_lower_push_constants.c \ vulkan/anv_nir_lower_ycbcr_textures.c \ diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index 844e5b0bfd40..8977599607e5 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -59,6 +59,10 @@ void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, struct brw_stage_prog_data *prog_data, struct anv_pipeline_bind_map *map); +void anv_compute_push_layout(const struct anv_physical_device *pdevice, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map); + bool anv_nir_add_base_work_group_id(nir_shader *shader, struct brw_cs_prog_data *prog_data); diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c new file mode 100644 index 000000000000..72bff5544bc3 --- /dev/null +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -0,0 +1,78 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_nir.h" +#include "compiler/brw_nir.h" + +void +anv_compute_push_layout(const struct anv_physical_device *pdevice, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map) +{ + struct anv_push_range push_constant_range = { + .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, + .length = DIV_ROUND_UP(prog_data->nr_params, 8), + }; + + if (pdevice->info.gen >= 8 || pdevice->info.is_haswell) { + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + int n = 3; + + for (int i = 3; i >= 0; i--) { + const struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; + if (ubo_range->length == 0) + continue; + + const struct anv_pipeline_binding *binding = + &map->surface_to_descriptor[ubo_range->block]; + + map->push_ranges[n--] = (struct anv_push_range) { + .set = binding->set, + .index = binding->index, + .dynamic_offset_index = binding->dynamic_offset_index, + .start = ubo_range->start, + .length = ubo_range->length, + }; + } + + if (push_constant_range.length > 0) + map->push_ranges[n--] = push_constant_range; + } else { + /* For Ivy Bridge, the push constants packets have a different + * rule that would require us to iterate in the other direction + * and possibly mess around with dynamic state base address. + * Don't bother; just emit regular push constants at n = 0. + */ + map->push_ranges[0] = push_constant_range; + } +} diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 0f582e2801ed..bcc62b77a3d5 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -1394,6 +1394,10 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline, goto fail; } + anv_compute_push_layout(&pipeline->device->instance->physicalDevice, + &stages[s].prog_data.base, + &stages[s].bind_map); + struct anv_shader_bin *bin = anv_device_upload_kernel(pipeline->device, cache, &stages[s].cache_key, diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c index e1d48b879b07..a4294e1eb609 100644 --- a/src/intel/vulkan/anv_pipeline_cache.c +++ b/src/intel/vulkan/anv_pipeline_cache.c @@ -169,6 +169,8 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader, blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor, shader->bind_map.sampler_count * sizeof(*shader->bind_map.sampler_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.push_ranges, + sizeof(shader->bind_map.push_ranges)); return !blob->out_of_memory; } @@ -212,6 +214,7 @@ anv_shader_bin_create_from_blob(struct anv_device *device, bind_map.sampler_to_descriptor = (void *) blob_read_bytes(blob, bind_map.sampler_count * sizeof(*bind_map.sampler_to_descriptor)); + blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges)); if (blob->overrun) return NULL; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 5b8ac491ce57..82e582269de2 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2006,6 +2006,7 @@ anv_descriptor_set_destroy(struct anv_device *device, struct anv_descriptor_pool *pool, struct anv_descriptor_set *set); +#define 
ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4) #define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3) #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2) #define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1) @@ -2041,6 +2042,23 @@ struct anv_pipeline_binding { bool write_only; }; +struct anv_push_range { + /** Index in the descriptor set */ + uint32_t index; + + /** Descriptor set index */ + uint8_t set; + + /** Dynamic offset index (for dynamic UBOs) */ + uint8_t dynamic_offset_index; + + /** Start offset in units of 32B */ + uint8_t start; + + /** Range in units of 32B */ + uint8_t length; +}; + struct anv_pipeline_layout { struct { struct anv_descriptor_set_layout *layout; @@ -2912,6 +2930,8 @@ struct anv_pipeline_bind_map { struct anv_pipeline_binding * surface_to_descriptor; struct anv_pipeline_binding * sampler_to_descriptor; + + struct anv_push_range push_ranges[4]; }; struct anv_shader_bin_key { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index a0df116b05cf..4f35df86fc19 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2528,98 +2528,60 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, c._3DCommandSubOpcode = push_constant_opcodes[stage]; if (anv_pipeline_has_stage(pipeline, stage)) { -#if GEN_GEN >= 8 || GEN_IS_HASWELL - const struct brw_stage_prog_data *prog_data = - pipeline->shaders[stage]->prog_data; const struct anv_pipeline_bind_map *bind_map = &pipeline->shaders[stage]->bind_map; - /* The Skylake PRM contains the following restriction: - * - * "The driver must ensure The following case does not occur - * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with - * buffer 3 read length equal to zero committed followed by a - * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to - * zero committed." - * - * To avoid this, we program the buffers in the highest slots. - * This way, slot 0 is only used if slot 3 is also used. - */ - int n = 3; - - for (int i = 3; i >= 0; i--) { - const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; if (range->length == 0) continue; - const unsigned surface = - prog_data->binding_table.ubo_start + range->block; - - assert(surface <= bind_map->surface_count); - const struct anv_pipeline_binding *binding = - &bind_map->surface_to_descriptor[surface]; - struct anv_address addr; - if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { /* This is a descriptor set buffer so the set index is * actually given by binding->binding. (Yes, that's * confusing.) 
*/ struct anv_descriptor_set *set = - gfx_state->base.descriptors[binding->index]; - + gfx_state->base.descriptors[range->index]; addr = anv_descriptor_set_address(cmd_buffer, set); - } else { - assert(binding->set < MAX_SETS); + break; + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: { + struct anv_state state = + anv_cmd_buffer_push_constants(cmd_buffer, stage); + addr = (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + break; + } + + default: { + assert(range->set < MAX_SETS); struct anv_descriptor_set *set = - gfx_state->base.descriptors[binding->set]; + gfx_state->base.descriptors[range->set]; const struct anv_descriptor *desc = - &set->descriptors[binding->index]; + &set->descriptors[range->index]; if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { addr = desc->buffer_view->address; } else { assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); - uint32_t dynamic_offset = - gfx_state->base.dynamic_offsets[binding->dynamic_offset_index]; + gfx_state->base.dynamic_offsets[range->dynamic_offset_index]; addr = anv_address_add(desc->buffer->address, desc->offset + dynamic_offset); } } + } - c.ConstantBody.Buffer[n] = + c.ConstantBody.ReadLength[i] = range->length; + c.ConstantBody.Buffer[i] = anv_address_add(addr, range->start * 32); - c.ConstantBody.ReadLength[n] = range->length; - n--; - } - - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); - - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[n] = (struct anv_address) { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = state.offset, - }; - c.ConstantBody.ReadLength[n] = - DIV_ROUND_UP(state.alloc_size, 32); } -#else - /* For Ivy Bridge, the push constants packets have a different - * rule that would require us to iterate in the other direction - * and possibly mess around with dynamic state base address. - * Don't bother; just emit regular push constants at n = 0. - */ - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); - - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[0].offset = state.offset, - c.ConstantBody.ReadLength[0] = - DIV_ROUND_UP(state.alloc_size, 32); - } -#endif } } diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index c21d7bd25074..31127aabf11e 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -114,6 +114,7 @@ libanv_files = files( 'anv_nir.h', 'anv_nir_add_base_work_group_id.c', 'anv_nir_apply_pipeline_layout.c', + 'anv_nir_compute_push_layout.c', 'anv_nir_lower_multiview.c', 'anv_nir_lower_push_constants.c', 'anv_nir_lower_ycbcr_textures.c', -- GitLab From d1c4e64a69e49c64148529024ecb700d18d3c1c8 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Oct 2019 15:57:52 -0500 Subject: [PATCH 08/15] intel/compiler: Add a flag to avoid compacting push constants In vec4, we can just not run the pass. In fs, things are a bit more deeply intertwined. 
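Concretely, when compact_params is false the fs backend now skips the repacking and keeps the driver's layout via identity mappings (condensed from the diff that follows):

   /* Dummy push/pull arrays: prog_data->param keeps the layout the
    * driver built; nothing is reordered or demoted to pull constants.
    */
   for (unsigned u = 0; u < uniforms; u++)
      push_constant_loc[u] = u;
   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));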
Reviewed-by: Lionel Landwerlin --- src/gallium/drivers/iris/iris_screen.c | 1 + src/intel/compiler/brw_compiler.h | 6 + src/intel/compiler/brw_fs.cpp | 303 ++++++++++++----------- src/intel/compiler/brw_vec4.cpp | 3 + src/intel/vulkan/anv_device.c | 1 + src/mesa/drivers/dri/i965/intel_screen.c | 1 + 6 files changed, 170 insertions(+), 145 deletions(-) diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c index f1bb03de8f94..cbefa9dfb136 100644 --- a/src/gallium/drivers/iris/iris_screen.c +++ b/src/gallium/drivers/iris/iris_screen.c @@ -673,6 +673,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config) screen->compiler->shader_perf_log = iris_shader_perf_log; screen->compiler->supports_pull_constants = false; screen->compiler->supports_shader_constants = true; + screen->compiler->compact_params = false; iris_disk_cache_init(screen); diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index dd2de2094e71..6e6610a26af5 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -119,6 +119,12 @@ struct brw_compiler { * whether nir_opt_large_constants will be run. */ bool supports_shader_constants; + + /** + * Whether or not the driver wants uniform params to be compacted by the + * back-end compiler. + */ + bool compact_params; }; /** diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 37a90d0d4890..0d3d6137058f 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2307,159 +2307,190 @@ fs_visitor::assign_constant_locations() return; } - struct uniform_slot_info slots[uniforms]; - memset(slots, 0, sizeof(slots)); + if (compiler->compact_params) { + struct uniform_slot_info slots[uniforms]; + memset(slots, 0, sizeof(slots)); - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - for (int i = 0 ; i < inst->sources; i++) { - if (inst->src[i].file != UNIFORM) - continue; + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + for (int i = 0 ; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; - /* NIR tightly packs things so the uniform number might not be - * aligned (if we have a double right after a float, for instance). - * This is fine because the process of re-arranging them will ensure - * that things are properly aligned. The offset into that uniform, - * however, must be aligned. - * - * In Vulkan, we have explicit offsets but everything is crammed - * into a single "variable" so inst->src[i].nr will always be 0. - * Everything will be properly aligned relative to that one base. - */ - assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); + /* NIR tightly packs things so the uniform number might not be + * aligned (if we have a double right after a float, for + * instance). This is fine because the process of re-arranging + * them will ensure that things are properly aligned. The offset + * into that uniform, however, must be aligned. + * + * In Vulkan, we have explicit offsets but everything is crammed + * into a single "variable" so inst->src[i].nr will always be 0. + * Everything will be properly aligned relative to that one base. 
+ */ + assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); - unsigned u = inst->src[i].nr + - inst->src[i].offset / UNIFORM_SLOT_SIZE; + unsigned u = inst->src[i].nr + + inst->src[i].offset / UNIFORM_SLOT_SIZE; - if (u >= uniforms) - continue; + if (u >= uniforms) + continue; - unsigned slots_read; - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { - slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); - } else { - unsigned bytes_read = inst->components_read(i) * - type_sz(inst->src[i].type); - slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); - } + unsigned slots_read; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); + } else { + unsigned bytes_read = inst->components_read(i) * + type_sz(inst->src[i].type); + slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); + } - assert(u + slots_read <= uniforms); - mark_uniform_slots_read(&slots[u], slots_read, - type_sz(inst->src[i].type)); + assert(u + slots_read <= uniforms); + mark_uniform_slots_read(&slots[u], slots_read, + type_sz(inst->src[i].type)); + } } - } - int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); + int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); - /* Only allow 16 registers (128 uniform components) as push constants. - * - * Just demote the end of the list. We could probably do better - * here, demoting things that are rarely used in the program first. - * - * If changing this value, note the limitation about total_regs in - * brw_curbe.c. - */ - unsigned int max_push_components = 16 * 8; - if (subgroup_id_index >= 0) - max_push_components--; /* Save a slot for the thread ID */ + /* Only allow 16 registers (128 uniform components) as push constants. + * + * Just demote the end of the list. We could probably do better + * here, demoting things that are rarely used in the program first. + * + * If changing this value, note the limitation about total_regs in + * brw_curbe.c. + */ + unsigned int max_push_components = 16 * 8; + if (subgroup_id_index >= 0) + max_push_components--; /* Save a slot for the thread ID */ - /* We push small arrays, but no bigger than 16 floats. This is big enough - * for a vec4 but hopefully not large enough to push out other stuff. We - * should probably use a better heuristic at some point. - */ - const unsigned int max_chunk_size = 16; + /* We push small arrays, but no bigger than 16 floats. This is big + * enough for a vec4 but hopefully not large enough to push out other + * stuff. We should probably use a better heuristic at some point. 
+ */ + const unsigned int max_chunk_size = 16; - unsigned int num_push_constants = 0; - unsigned int num_pull_constants = 0; + unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; - push_constant_loc = ralloc_array(mem_ctx, int, uniforms); - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); + push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - /* Default to -1 meaning no location */ - memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); - memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); + /* Default to -1 meaning no location */ + memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); + memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); - int chunk_start = -1; - struct cplx_align align; - for (unsigned u = 0; u < uniforms; u++) { - if (!slots[u].is_live) { - assert(chunk_start == -1); - continue; - } + int chunk_start = -1; + struct cplx_align align; + for (unsigned u = 0; u < uniforms; u++) { + if (!slots[u].is_live) { + assert(chunk_start == -1); + continue; + } - /* Skip subgroup_id_index to put it in the last push register. */ - if (subgroup_id_index == (int)u) - continue; + /* Skip subgroup_id_index to put it in the last push register. */ + if (subgroup_id_index == (int)u) + continue; - if (chunk_start == -1) { - chunk_start = u; - align = slots[u].align; - } else { - /* Offset into the chunk */ - unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; + if (chunk_start == -1) { + chunk_start = u; + align = slots[u].align; + } else { + /* Offset into the chunk */ + unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; - /* Shift the slot alignment down by the chunk offset so it is - * comparable with the base chunk alignment. - */ - struct cplx_align slot_align = slots[u].align; - slot_align.offset = - (slot_align.offset - chunk_offset) & (align.mul - 1); + /* Shift the slot alignment down by the chunk offset so it is + * comparable with the base chunk alignment. 
+ */ + struct cplx_align slot_align = slots[u].align; + slot_align.offset = + (slot_align.offset - chunk_offset) & (align.mul - 1); - align = cplx_align_combine(align, slot_align); - } + align = cplx_align_combine(align, slot_align); + } - /* Sanity check the alignment */ - cplx_align_assert_sane(align); + /* Sanity check the alignment */ + cplx_align_assert_sane(align); - if (slots[u].contiguous) - continue; + if (slots[u].contiguous) + continue; - /* Adjust the alignment to be in terms of slots, not bytes */ - assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); - assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); - align.mul /= UNIFORM_SLOT_SIZE; - align.offset /= UNIFORM_SLOT_SIZE; - - unsigned push_start_align = cplx_align_apply(align, num_push_constants); - unsigned chunk_size = u - chunk_start + 1; - if ((!compiler->supports_pull_constants && u < UBO_START) || - (chunk_size < max_chunk_size && - push_start_align + chunk_size <= max_push_components)) { - /* Align up the number of push constants */ - num_push_constants = push_start_align; - for (unsigned i = 0; i < chunk_size; i++) - push_constant_loc[chunk_start + i] = num_push_constants++; - } else { - /* We need to pull this one */ - num_pull_constants = cplx_align_apply(align, num_pull_constants); - for (unsigned i = 0; i < chunk_size; i++) - pull_constant_loc[chunk_start + i] = num_pull_constants++; + /* Adjust the alignment to be in terms of slots, not bytes */ + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); + align.mul /= UNIFORM_SLOT_SIZE; + align.offset /= UNIFORM_SLOT_SIZE; + + unsigned push_start_align = cplx_align_apply(align, num_push_constants); + unsigned chunk_size = u - chunk_start + 1; + if ((!compiler->supports_pull_constants && u < UBO_START) || + (chunk_size < max_chunk_size && + push_start_align + chunk_size <= max_push_components)) { + /* Align up the number of push constants */ + num_push_constants = push_start_align; + for (unsigned i = 0; i < chunk_size; i++) + push_constant_loc[chunk_start + i] = num_push_constants++; + } else { + /* We need to pull this one */ + num_pull_constants = cplx_align_apply(align, num_pull_constants); + for (unsigned i = 0; i < chunk_size; i++) + pull_constant_loc[chunk_start + i] = num_pull_constants++; + } + + /* Reset the chunk and start again */ + chunk_start = -1; } - /* Reset the chunk and start again */ - chunk_start = -1; - } + /* Add the CS local thread ID uniform at the end of the push constants */ + if (subgroup_id_index >= 0) + push_constant_loc[subgroup_id_index] = num_push_constants++; - /* Add the CS local thread ID uniform at the end of the push constants */ - if (subgroup_id_index >= 0) - push_constant_loc[subgroup_id_index] = num_push_constants++; + /* As the uniforms are going to be reordered, stash the old array and + * create two new arrays for push/pull params. 
+ */ + uint32_t *param = stage_prog_data->param; + stage_prog_data->nr_params = num_push_constants; + if (num_push_constants) { + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, + num_push_constants); + } else { + stage_prog_data->param = NULL; + } + assert(stage_prog_data->nr_pull_params == 0); + assert(stage_prog_data->pull_param == NULL); + if (num_pull_constants > 0) { + stage_prog_data->nr_pull_params = num_pull_constants; + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, + num_pull_constants); + } - /* As the uniforms are going to be reordered, stash the old array and - * create two new arrays for push/pull params. - */ - uint32_t *param = stage_prog_data->param; - stage_prog_data->nr_params = num_push_constants; - if (num_push_constants) { - stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, - num_push_constants); + /* Up until now, the param[] array has been indexed by reg + offset + * of UNIFORM registers. Move pull constants into pull_param[] and + * condense param[] to only contain the uniforms we chose to push. + * + * NOTE: Because we are condensing the params[] array, we know that + * push_constant_loc[i] <= i and we can do it in one smooth loop without + * having to make a copy. + */ + for (unsigned int i = 0; i < uniforms; i++) { + uint32_t value = param[i]; + if (pull_constant_loc[i] != -1) { + stage_prog_data->pull_param[pull_constant_loc[i]] = value; + } else if (push_constant_loc[i] != -1) { + stage_prog_data->param[push_constant_loc[i]] = value; + } + } + ralloc_free(param); } else { - stage_prog_data->param = NULL; - } - assert(stage_prog_data->nr_pull_params == 0); - assert(stage_prog_data->pull_param == NULL); - if (num_pull_constants > 0) { - stage_prog_data->nr_pull_params = num_pull_constants; - stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, - num_pull_constants); + /* If we don't want to compact anything, just set up dummy push/pull + * arrays. All the rest of the compiler cares about are these arrays. + */ + push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); + + for (unsigned u = 0; u < uniforms; u++) + push_constant_loc[u] = u; + + memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); } /* Now that we know how many regular uniforms we'll push, reduce the @@ -2475,24 +2506,6 @@ fs_visitor::assign_constant_locations() push_length += range->length; } assert(push_length <= 64); - - /* Up until now, the param[] array has been indexed by reg + offset - * of UNIFORM registers. Move pull constants into pull_param[] and - * condense param[] to only contain the uniforms we chose to push. - * - * NOTE: Because we are condensing the params[] array, we know that - * push_constant_loc[i] <= i and we can do it in one smooth loop without - * having to make a copy. 
- */ - for (unsigned int i = 0; i < uniforms; i++) { - uint32_t value = param[i]; - if (pull_constant_loc[i] != -1) { - stage_prog_data->pull_param[pull_constant_loc[i]] = value; - } else if (push_constant_loc[i] != -1) { - stage_prog_data->param[push_constant_loc[i]] = value; - } - } - ralloc_free(param); } bool diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index 9fa946ccdb48..fb4cf365cb45 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -633,6 +633,9 @@ set_push_constant_loc(const int nr_uniforms, int *new_uniform_count, void vec4_visitor::pack_uniform_registers() { + if (!compiler->compact_params) + return; + uint8_t chans_used[this->uniforms]; int new_loc[this->uniforms]; int new_chan[this->uniforms]; diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 28a652867d96..0fc507c12a80 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -560,6 +560,7 @@ anv_physical_device_init(struct anv_physical_device *device, device->compiler->constant_buffer_0_is_relative = device->info.gen < 8 || !device->has_context_isolation; device->compiler->supports_shader_constants = true; + device->compiler->compact_params = true; /* Broadwell PRM says: * diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index 6040161d8f96..c690a8ede7c9 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -2799,6 +2799,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen) !(screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION); screen->compiler->supports_pull_constants = true; + screen->compiler->compact_params = true; screen->has_exec_fence = intel_get_boolean(screen, I915_PARAM_HAS_EXEC_FENCE); -- GitLab From ca91ab801522c7a760562fe32d0a2a4b7b333876 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 6 Nov 2019 10:59:15 -0600 Subject: [PATCH 09/15] anv: Re-arrange push constant data a bit This moves the compute stuff into a anv_push_constants::cs sub-struct. It also moves dynamic offsets into the push constants. This means we have to duplicate the data per-stage but that doesn't seem like the end of the world and one day we may wish to make dynamic offsets per-stage anyway. 
Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_cmd_buffer.c | 25 +++++++++++++++---------- src/intel/vulkan/anv_private.h | 23 ++++++++++++++++++----- src/intel/vulkan/genX_cmd_buffer.c | 21 +++++++++++++-------- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 4bf2725dccce..12ab3a1f7285 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -586,13 +586,18 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer, uint32_t dynamic_offset_start = layout->set[set_index].dynamic_offset_start; - /* Assert that everything is in range */ - assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); - assert(dynamic_offset_start + set_layout->dynamic_offset_count <= - ARRAY_SIZE(pipe_state->dynamic_offsets)); + anv_foreach_stage(stage, set_layout->shader_stages) { + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; - typed_memcpy(&pipe_state->dynamic_offsets[dynamic_offset_start], - *dynamic_offsets, set_layout->dynamic_offset_count); + /* Assert that everything is in range */ + assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); + assert(dynamic_offset_start + set_layout->dynamic_offset_count <= + ARRAY_SIZE(push->dynamic_offsets)); + + typed_memcpy(&push->dynamic_offsets[dynamic_offset_start], + *dynamic_offsets, set_layout->dynamic_offset_count); + } *dynamic_offsets += set_layout->dynamic_offset_count; *dynamic_offset_count -= set_layout->dynamic_offset_count; @@ -749,11 +754,11 @@ anv_push_constant_value(const struct anv_cmd_pipeline_state *state, case BRW_PARAM_BUILTIN_ZERO: return 0; case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X: - return data->base_work_group_id[0]; + return data->cs.base_work_group_id[0]; case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y: - return data->base_work_group_id[1]; + return data->cs.base_work_group_id[1]; case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z: - return data->base_work_group_id[2]; + return data->cs.base_work_group_id[2]; default: unreachable("Invalid param builtin"); } @@ -767,7 +772,7 @@ anv_push_constant_value(const struct anv_cmd_pipeline_state *state, } else if (ANV_PARAM_IS_DYN_OFFSET(param)) { unsigned idx = ANV_PARAM_DYN_OFFSET_IDX(param); assert(idx < MAX_DYNAMIC_BUFFERS); - return state->dynamic_offsets[idx]; + return data->dynamic_offsets[idx]; } assert(!"Invalid param"); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 82e582269de2..771268db156c 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2371,11 +2371,26 @@ struct anv_xfb_binding { #define ANV_PARAM_DYN_OFFSET_IDX(param) ((param) & 0xffff) struct anv_push_constants { - /* Push constant data provided by the client through vkPushConstants */ + /** Push constant data provided by the client through vkPushConstants */ uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE]; - /* Used for vkCmdDispatchBase */ - uint32_t base_work_group_id[3]; + /** Dynamic offsets for dynamic UBOs and SSBOs */ + uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; + + struct { + /** Base workgroup ID + * + * Used for vkCmdDispatchBase. + */ + uint32_t base_work_group_id[3]; + + /** Subgroup ID + * + * This is never set by software but is implicitly filled out when + * uploading the push constants for compute shaders. 
+ */ + uint32_t subgroup_id; + } cs; }; struct anv_dynamic_state { @@ -2495,8 +2510,6 @@ struct anv_cmd_pipeline_state { struct anv_pipeline *pipeline; struct anv_descriptor_set *descriptors[MAX_SETS]; - uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; - struct anv_push_descriptor_set *push_descriptors[MAX_SETS]; }; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 4f35df86fc19..567e556f5a5e 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2309,8 +2309,11 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { /* Compute the offset within the buffer */ + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + uint32_t dynamic_offset = - pipe_state->dynamic_offsets[binding->dynamic_offset_index]; + push->dynamic_offsets[binding->dynamic_offset_index]; uint64_t offset = desc->offset + dynamic_offset; /* Clamp to the buffer size */ offset = MIN2(offset, desc->buffer->size); @@ -2570,8 +2573,10 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, addr = desc->buffer_view->address; } else { assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; uint32_t dynamic_offset = - gfx_state->base.dynamic_offsets[range->dynamic_offset_index]; + push->dynamic_offsets[range->dynamic_offset_index]; addr = anv_address_add(desc->buffer->address, desc->offset + dynamic_offset); } @@ -3590,12 +3595,12 @@ anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer, struct anv_push_constants *push = &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; - if (push->base_work_group_id[0] != baseGroupX || - push->base_work_group_id[1] != baseGroupY || - push->base_work_group_id[2] != baseGroupZ) { - push->base_work_group_id[0] = baseGroupX; - push->base_work_group_id[1] = baseGroupY; - push->base_work_group_id[2] = baseGroupZ; + if (push->cs.base_work_group_id[0] != baseGroupX || + push->cs.base_work_group_id[1] != baseGroupY || + push->cs.base_work_group_id[2] != baseGroupZ) { + push->cs.base_work_group_id[0] = baseGroupX; + push->cs.base_work_group_id[1] = baseGroupY; + push->cs.base_work_group_id[2] = baseGroupZ; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } -- GitLab From 9baa33cef01f0e1fe221379d78387fe9e8517c74 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 7 Nov 2019 17:16:14 -0600 Subject: [PATCH 10/15] anv: Rework push constant handling This substantially reworks both the state setup side of push constant handling and the pipeline compile side. The fundamental change here is that we're no longer respecting the prog_data::param array and instead are just instructing the back-end compiler to leave the array alone. This makes the state setup side substantially simpler because we can now just memcpy the whole block of push constants and don't have to upload one DWORD at a time. This also means that we can compute the full push constant layout up-front and just trust the back-end compiler to not mess with it. Maybe one day we'll decide that the back-end compiler can do useful things there again but for now, this is functionally no different from what we had before this commit and makes the NIR handling cleaner. 
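For illustration only, the upload-side simplification boils down to the
difference between the two hypothetical helpers below; gather_param()
stands in for the per-DWORD lookup that used to walk prog_data->param[]:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Before: one table-driven lookup per DWORD of push data */
    static void
    upload_per_dword(uint32_t *dst, const uint32_t *param,
                     unsigned nr_params,
                     uint32_t (*gather_param)(uint32_t))
    {
       for (unsigned i = 0; i < nr_params; i++)
          dst[i] = gather_param(param[i]);
    }

    /* After: the layout is computed up-front at compile time, so a
     * single memcpy uploads the whole block.
     */
    static void
    upload_block(void *dst, const void *push_data, size_t size)
    {
       memcpy(dst, push_data, size);
    }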
Reviewed-by: Lionel Landwerlin --- src/intel/Makefile.sources | 1 - src/intel/vulkan/anv_cmd_buffer.c | 96 +++---------- src/intel/vulkan/anv_device.c | 2 +- src/intel/vulkan/anv_nir.h | 16 ++- .../vulkan/anv_nir_add_base_work_group_id.c | 25 +--- .../vulkan/anv_nir_apply_pipeline_layout.c | 30 ++-- .../vulkan/anv_nir_compute_push_layout.c | 129 +++++++++++++++++- .../vulkan/anv_nir_lower_push_constants.c | 49 ------- src/intel/vulkan/anv_pipeline.c | 44 ++---- src/intel/vulkan/anv_private.h | 11 +- src/intel/vulkan/meson.build | 1 - 11 files changed, 176 insertions(+), 228 deletions(-) delete mode 100644 src/intel/vulkan/anv_nir_lower_push_constants.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index b3861af32f55..252a0cb5320d 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -260,7 +260,6 @@ VULKAN_FILES := \ vulkan/anv_nir_apply_pipeline_layout.c \ vulkan/anv_nir_compute_push_layout.c \ vulkan/anv_nir_lower_multiview.c \ - vulkan/anv_nir_lower_push_constants.c \ vulkan/anv_nir_lower_ycbcr_textures.c \ vulkan/anv_pass.c \ vulkan/anv_perf.c \ diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 12ab3a1f7285..f14225b963ae 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -745,71 +745,18 @@ anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer, return state; } -static uint32_t -anv_push_constant_value(const struct anv_cmd_pipeline_state *state, - const struct anv_push_constants *data, uint32_t param) -{ - if (BRW_PARAM_IS_BUILTIN(param)) { - switch (param) { - case BRW_PARAM_BUILTIN_ZERO: - return 0; - case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X: - return data->cs.base_work_group_id[0]; - case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y: - return data->cs.base_work_group_id[1]; - case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z: - return data->cs.base_work_group_id[2]; - default: - unreachable("Invalid param builtin"); - } - } else if (ANV_PARAM_IS_PUSH(param)) { - uint32_t offset = ANV_PARAM_PUSH_OFFSET(param); - assert(offset % sizeof(uint32_t) == 0); - if (offset < sizeof(data->client_data)) - return *(uint32_t *)((uint8_t *)data + offset); - else - return 0; - } else if (ANV_PARAM_IS_DYN_OFFSET(param)) { - unsigned idx = ANV_PARAM_DYN_OFFSET_IDX(param); - assert(idx < MAX_DYNAMIC_BUFFERS); - return data->dynamic_offsets[idx]; - } - - assert(!"Invalid param"); - return 0; -} - struct anv_state anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer, gl_shader_stage stage) { - struct anv_cmd_pipeline_state *pipeline_state = &cmd_buffer->state.gfx.base; - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; - - /* If we don't have this stage, bail. */ - if (!anv_pipeline_has_stage(pipeline, stage)) - return (struct anv_state) { .offset = 0 }; - struct anv_push_constants *data = &cmd_buffer->state.push_constants[stage]; - const struct brw_stage_prog_data *prog_data = - pipeline->shaders[stage]->prog_data; - - /* If we don't actually have any push constants, bail. 
*/ - if (prog_data == NULL || prog_data->nr_params == 0) - return (struct anv_state) { .offset = 0 }; struct anv_state state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, - prog_data->nr_params * sizeof(float), + sizeof(struct anv_push_constants), 32 /* bottom 5 bits MBZ */); - - /* Walk through the param array and fill the buffer with data */ - uint32_t *u32_map = state.map; - for (unsigned i = 0; i < prog_data->nr_params; i++) { - u32_map[i] = anv_push_constant_value(pipeline_state, data, - prog_data->param[i]); - } + memcpy(state.map, data, sizeof(struct anv_push_constants)); return state; } @@ -817,14 +764,13 @@ anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer, struct anv_state anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) { - struct anv_cmd_pipeline_state *pipeline_state = &cmd_buffer->state.compute.base; struct anv_push_constants *data = &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + const struct anv_push_range *range = + &pipeline->shaders[MESA_SHADER_COMPUTE]->bind_map.push_ranges[0]; - /* If we don't actually have any push constants, bail. */ if (cs_prog_data->push.total.size == 0) return (struct anv_state) { .offset = 0 }; @@ -837,33 +783,25 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) aligned_total_push_constants_size, push_constant_alignment); - /* Walk through the param array and fill the buffer with data */ - uint32_t *u32_map = state.map; + void *dst = state.map; + const void *src = (char *)data + (range->start * 32); if (cs_prog_data->push.cross_thread.size > 0) { - for (unsigned i = 0; - i < cs_prog_data->push.cross_thread.dwords; - i++) { - assert(prog_data->param[i] != BRW_PARAM_BUILTIN_SUBGROUP_ID); - u32_map[i] = anv_push_constant_value(pipeline_state, data, - prog_data->param[i]); - } + memcpy(dst, src, cs_prog_data->push.cross_thread.size); + dst += cs_prog_data->push.cross_thread.size; + src += cs_prog_data->push.cross_thread.size; } if (cs_prog_data->push.per_thread.size > 0) { for (unsigned t = 0; t < cs_prog_data->threads; t++) { - unsigned dst = - 8 * (cs_prog_data->push.per_thread.regs * t + - cs_prog_data->push.cross_thread.regs); - unsigned src = cs_prog_data->push.cross_thread.dwords; - for ( ; src < prog_data->nr_params; src++, dst++) { - if (prog_data->param[src] == BRW_PARAM_BUILTIN_SUBGROUP_ID) { - u32_map[dst] = t; - } else { - u32_map[dst] = anv_push_constant_value(pipeline_state, data, - prog_data->param[src]); - } - } + memcpy(dst, src, cs_prog_data->push.per_thread.size); + + uint32_t *subgroup_id = dst + + offsetof(struct anv_push_constants, cs.subgroup_id) - + (range->start * 32 + cs_prog_data->push.cross_thread.size); + *subgroup_id = t; + + dst += cs_prog_data->push.per_thread.size; } } diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 0fc507c12a80..250f75e99367 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -560,7 +560,7 @@ anv_physical_device_init(struct anv_physical_device *device, device->compiler->constant_buffer_0_is_relative = device->info.gen < 8 || !device->has_context_isolation; device->compiler->supports_shader_constants = true; - device->compiler->compact_params = true; + device->compiler->compact_params = false; /* Broadwell PRM says: * diff --git a/src/intel/vulkan/anv_nir.h 
b/src/intel/vulkan/anv_nir.h index 8977599607e5..361023e1593f 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -31,8 +31,6 @@ extern "C" { #endif -void anv_nir_lower_push_constants(nir_shader *shader); - bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask); bool anv_nir_lower_ycbcr_textures(nir_shader *shader, @@ -59,12 +57,16 @@ void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, struct brw_stage_prog_data *prog_data, struct anv_pipeline_bind_map *map); -void anv_compute_push_layout(const struct anv_physical_device *pdevice, - struct brw_stage_prog_data *prog_data, - struct anv_pipeline_bind_map *map); +void anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, + nir_shader *nir, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx); + +void anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map); -bool anv_nir_add_base_work_group_id(nir_shader *shader, - struct brw_cs_prog_data *prog_data); +bool anv_nir_add_base_work_group_id(nir_shader *shader); #ifdef __cplusplus } diff --git a/src/intel/vulkan/anv_nir_add_base_work_group_id.c b/src/intel/vulkan/anv_nir_add_base_work_group_id.c index a7c290bb52fd..38892b96ec54 100644 --- a/src/intel/vulkan/anv_nir_add_base_work_group_id.c +++ b/src/intel/vulkan/anv_nir_add_base_work_group_id.c @@ -26,13 +26,11 @@ #include "compiler/brw_compiler.h" bool -anv_nir_add_base_work_group_id(nir_shader *shader, - struct brw_cs_prog_data *prog_data) +anv_nir_add_base_work_group_id(nir_shader *shader) { assert(shader->info.stage == MESA_SHADER_COMPUTE); nir_builder b; - int base_id_offset = -1; bool progress = false; nir_foreach_function(function, shader) { if (!function->impl) @@ -51,27 +49,14 @@ anv_nir_add_base_work_group_id(nir_shader *shader, b.cursor = nir_after_instr(&load_id->instr); - if (base_id_offset < 0) { - /* If we don't have a set of BASE_WORK_GROUP_ID params, - * add them. 
- */ - assert(shader->num_uniforms == prog_data->base.nr_params * 4); - uint32_t *param = - brw_stage_prog_data_add_params(&prog_data->base, 3); - param[0] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X; - param[1] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y; - param[2] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z; - - base_id_offset = shader->num_uniforms; - shader->num_uniforms += 12; - } - nir_intrinsic_instr *load_base = - nir_intrinsic_instr_create(shader, nir_intrinsic_load_uniform); + nir_intrinsic_instr_create(shader, nir_intrinsic_load_push_constant); load_base->num_components = 3; load_base->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_ssa_dest_init(&load_base->instr, &load_base->dest, 3, 32, NULL); - nir_intrinsic_set_base(load_base, base_id_offset); + nir_intrinsic_set_base(load_base, + offsetof(struct anv_push_constants, + cs.base_work_group_id)); nir_intrinsic_set_range(load_base, 3 * sizeof(uint32_t)); nir_builder_instr_insert(&b, &load_base->instr); diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index da64ea0bdf9b..6ebc02cfdd78 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -46,9 +46,8 @@ struct apply_pipeline_layout_state { /* Place to flag lowered instructions so we don't lower them twice */ struct set *lowered_instrs; - int dynamic_offset_uniform_start; - bool uses_constants; + bool has_dynamic_buffers; uint8_t constants_offset; struct { bool desc_buffer_used; @@ -564,7 +563,7 @@ lower_load_vulkan_descriptor(nir_intrinsic_instr *intrin, if (!state->add_bounds_checks) desc = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); - if (state->dynamic_offset_uniform_start >= 0) { + if (state->has_dynamic_buffers) { /* This shader has dynamic offsets and we have no way of knowing * (save from the dynamic offset base index) if this buffer has a * dynamic offset. 
@@ -598,8 +597,10 @@ lower_load_vulkan_descriptor(nir_intrinsic_instr *intrin, } nir_intrinsic_instr *dyn_load = - nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); - nir_intrinsic_set_base(dyn_load, state->dynamic_offset_uniform_start); + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_push_constant); + nir_intrinsic_set_base(dyn_load, offsetof(struct anv_push_constants, + dynamic_offsets)); nir_intrinsic_set_range(dyn_load, MAX_DYNAMIC_BUFFERS * 4); dyn_load->src[0] = nir_src_for_ssa(nir_imul_imm(b, dyn_offset_idx, 4)); dyn_load->num_components = 1; @@ -1119,7 +1120,6 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, .add_bounds_checks = robust_buffer_access, .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access), .lowered_instrs = _mesa_pointer_set_create(mem_ctx), - .dynamic_offset_uniform_start = -1, }; for (unsigned s = 0; s < layout->num_sets; s++) { @@ -1209,18 +1209,16 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, qsort(infos, used_binding_count, sizeof(struct binding_info), compare_binding_infos); - bool have_dynamic_buffers = false; - for (unsigned i = 0; i < used_binding_count; i++) { unsigned set = infos[i].set, b = infos[i].binding; struct anv_descriptor_set_binding_layout *binding = &layout->set[set].layout->binding[b]; - if (binding->dynamic_offset_index >= 0) - have_dynamic_buffers = true; - const uint32_t array_size = binding->array_size; + if (binding->dynamic_offset_index >= 0) + state.has_dynamic_buffers = true; + if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) { if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE || anv_descriptor_requires_bindless(pdevice, binding, false)) { @@ -1290,16 +1288,6 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, } } - if (have_dynamic_buffers) { - state.dynamic_offset_uniform_start = shader->num_uniforms; - uint32_t *param = brw_stage_prog_data_add_params(prog_data, - MAX_DYNAMIC_BUFFERS); - for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) - param[i] = ANV_PARAM_DYN_OFFSET(i); - shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 4; - assert(shader->num_uniforms == prog_data->nr_params * 4); - } - nir_foreach_variable(var, &shader->uniforms) { const struct glsl_type *glsl_type = glsl_without_array(var->type); diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index 72bff5544bc3..e624a900d9c0 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -25,16 +25,111 @@ #include "compiler/brw_nir.h" void -anv_compute_push_layout(const struct anv_physical_device *pdevice, - struct brw_stage_prog_data *prog_data, - struct anv_pipeline_bind_map *map) +anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, + nir_shader *nir, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx) { + memset(map->push_ranges, 0, sizeof(map->push_ranges)); + + unsigned push_start = UINT_MAX, push_end = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_push_constant) + continue; + + unsigned base = nir_intrinsic_base(intrin); + unsigned range = nir_intrinsic_range(intrin); + 
push_start = MIN2(push_start, base); + push_end = MAX2(push_end, base + range); + } + } + } + + const bool has_push_intrinsic = push_start <= push_end; + + if (nir->info.stage == MESA_SHADER_COMPUTE) { + /* For compute shaders, we always have to have the subgroup ID. The + * back-end compiler will "helpfully" add it for us in the last push + * constant slot. Yes, there is an off-by-one error here but that's + * because the back-end will add it so we want to claim the number of + * push constants one dword less than the full amount including + * gl_SubgroupId. + */ + assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id)); + push_end = offsetof(struct anv_push_constants, cs.subgroup_id); + } + + /* Align push_start down to a 32B boundary and make it no larger than + * push_end (no push constants is indicated by push_start = UINT_MAX). + */ + push_start = MIN2(push_start, push_end); + push_start &= ~31u; + + if (has_push_intrinsic) { + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_push_constant) + continue; + + intrin->intrinsic = nir_intrinsic_load_uniform; + nir_intrinsic_set_base(intrin, + nir_intrinsic_base(intrin) - + push_start); + } + } + } + } + + /* For vec4 our push data size needs to be aligned to a vec4 and for + * scalar, it needs to be aligned to a DWORD. + */ + const unsigned align = + pdevice->compiler->scalar_stage[nir->info.stage] ? 4 : 16; + nir->num_uniforms = ALIGN(push_end - push_start, align); + prog_data->nr_params = nir->num_uniforms / 4; + prog_data->param = ralloc_array(mem_ctx, uint32_t, prog_data->nr_params); + struct anv_push_range push_constant_range = { .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, - .length = DIV_ROUND_UP(prog_data->nr_params, 8), + .start = push_start / 32, + .length = DIV_ROUND_UP(push_end - push_start, 32), }; - if (pdevice->info.gen >= 8 || pdevice->info.is_haswell) { + if ((pdevice->info.gen >= 8 || pdevice->info.is_haswell) && + nir->info.stage != MESA_SHADER_COMPUTE) { + brw_nir_analyze_ubo_ranges(pdevice->compiler, nir, NULL, + prog_data->ubo_ranges); + + /* We can push at most 64 registers worth of data. The back-end + * compiler would do this fixup for us but we'd like to calculate + * the push constant layout ourselves. + */ + unsigned total_push_regs = push_constant_range.length; + for (unsigned i = 0; i < 4; i++) { + if (total_push_regs + prog_data->ubo_ranges[i].length > 64) + prog_data->ubo_ranges[i].length = 64 - total_push_regs; + total_push_regs += prog_data->ubo_ranges[i].length; + } + assert(total_push_regs <= 64); + /* The Skylake PRM contains the following restriction: * * "The driver must ensure The following case does not occur @@ -72,7 +167,31 @@ anv_compute_push_layout(const struct anv_physical_device *pdevice, * rule that would require us to iterate in the other direction * and possibly mess around with dynamic state base address. * Don't bother; just emit regular push constants at n = 0. + * + * In the compute case, we don't have multiple push ranges so it's + * better to just provide one in push_ranges[0]. 
*/ map->push_ranges[0] = push_constant_range; } } + +void +anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map) +{ +#ifndef NDEBUG + unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8); + for (unsigned i = 0; i < 4; i++) + prog_data_push_size += prog_data->ubo_ranges[i].length; + + unsigned bind_map_push_size = 0; + for (unsigned i = 0; i < 4; i++) + bind_map_push_size += map->push_ranges[i].length; + + /* We could go through everything again but it should be enough to assert + * that they push the same number of registers. This should alert us if + * the back-end compiler decides to re-arrange stuff or shrink a range. + */ + assert(prog_data_push_size == bind_map_push_size); +#endif +} diff --git a/src/intel/vulkan/anv_nir_lower_push_constants.c b/src/intel/vulkan/anv_nir_lower_push_constants.c deleted file mode 100644 index ad60d0c824e9..000000000000 --- a/src/intel/vulkan/anv_nir_lower_push_constants.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "anv_nir.h" - -void -anv_nir_lower_push_constants(nir_shader *shader) -{ - nir_foreach_function(function, shader) { - if (!function->impl) - continue; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - /* TODO: Handle indirect push constants */ - if (intrin->intrinsic != nir_intrinsic_load_push_constant) - continue; - - /* We just turn them into uniform loads */ - intrin->intrinsic = nir_intrinsic_load_uniform; - } - } - } -} diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index bcc62b77a3d5..144fcf46df37 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -672,37 +672,11 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, NIR_PASS_V(nir, anv_nir_lower_ycbcr_textures, layout); - NIR_PASS_V(nir, anv_nir_lower_push_constants); - if (nir->info.stage != MESA_SHADER_COMPUTE) NIR_PASS_V(nir, anv_nir_lower_multiview, pipeline->subpass->view_mask); nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); - if (nir->num_uniforms > 0) { - assert(prog_data->nr_params == 0); - - /* If the shader uses any push constants at all, we'll just give - * them the maximum possible number - */ - assert(nir->num_uniforms <= MAX_PUSH_CONSTANTS_SIZE); - nir->num_uniforms = MAX_PUSH_CONSTANTS_SIZE; - prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float); - prog_data->param = ralloc_array(mem_ctx, uint32_t, prog_data->nr_params); - - /* We now set the param values to be offsets into a - * anv_push_constant_data structure. Since the compiler doesn't - * actually dereference any of the gl_constant_value pointers in the - * params array, it doesn't really matter what we put here. 
- */ - struct anv_push_constants *null_data = NULL; - /* Fill out the push constants section of the param array */ - for (unsigned i = 0; i < MAX_PUSH_CONSTANTS_SIZE / sizeof(float); i++) { - prog_data->param[i] = ANV_PARAM_PUSH( - (uintptr_t)&null_data->client_data[i * sizeof(float)]); - } - } - if (nir->info.num_ssbos > 0 || nir->info.num_images > 0) pipeline->needs_data_cache = true; @@ -732,10 +706,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access); - if (nir->info.stage != MESA_SHADER_COMPUTE) - brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); - - assert(nir->num_uniforms == prog_data->nr_params * 4); + anv_nir_compute_push_layout(pdevice, nir, prog_data, + &stage->bind_map, mem_ctx); stage->nir = nir; } @@ -1394,9 +1366,8 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline, goto fail; } - anv_compute_push_layout(&pipeline->device->instance->physicalDevice, - &stages[s].prog_data.base, - &stages[s].bind_map); + anv_nir_validate_push_layout(&stages[s].prog_data.base, + &stages[s].bind_map); struct anv_shader_bin *bin = anv_device_upload_kernel(pipeline->device, cache, @@ -1559,10 +1530,9 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } - anv_pipeline_lower_nir(pipeline, mem_ctx, &stage, layout); + NIR_PASS_V(stage.nir, anv_nir_add_base_work_group_id); - NIR_PASS_V(stage.nir, anv_nir_add_base_work_group_id, - &stage.prog_data.cs); + anv_pipeline_lower_nir(pipeline, mem_ctx, &stage, layout); NIR_PASS_V(stage.nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_type_info); @@ -1578,6 +1548,8 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } + anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map); + const unsigned code_size = stage.prog_data.base.program_size; bin = anv_device_upload_kernel(pipeline->device, cache, &stage.cache_key, sizeof(stage.cache_key), diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 771268db156c..3ad4a5971e20 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2362,14 +2362,6 @@ struct anv_xfb_binding { VkDeviceSize size; }; -#define ANV_PARAM_PUSH(offset) ((1 << 16) | (uint32_t)(offset)) -#define ANV_PARAM_IS_PUSH(param) ((uint32_t)(param) >> 16 == 1) -#define ANV_PARAM_PUSH_OFFSET(param) ((param) & 0xffff) - -#define ANV_PARAM_DYN_OFFSET(offset) ((2 << 16) | (uint32_t)(offset)) -#define ANV_PARAM_IS_DYN_OFFSET(param) ((uint32_t)(param) >> 16 == 2) -#define ANV_PARAM_DYN_OFFSET_IDX(param) ((param) & 0xffff) - struct anv_push_constants { /** Push constant data provided by the client through vkPushConstants */ uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE]; @@ -2390,6 +2382,9 @@ struct anv_push_constants { * uploading the push constants for compute shaders. 
*/ uint32_t subgroup_id; + + /** Pad out to a multiple of 32 bytes */ + uint32_t pad[4]; } cs; }; diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 31127aabf11e..b8d2f1e9e870 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -116,7 +116,6 @@ libanv_files = files( 'anv_nir_apply_pipeline_layout.c', 'anv_nir_compute_push_layout.c', 'anv_nir_lower_multiview.c', - 'anv_nir_lower_push_constants.c', 'anv_nir_lower_ycbcr_textures.c', 'anv_pass.c', 'anv_perf.c', -- GitLab From ca8117b5d544f9580d05e9416abd03446e285e16 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 7 Nov 2019 14:39:28 -0600 Subject: [PATCH 11/15] anv: Use a switch statement for binding table setup It theoretically could be more efficient but the real point here is that it's no longer really a matter of dealing with special cases and then the "real" thing. The way we're handling binding tables, it's more of a multi-step process and a switch is more natural. Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/genX_cmd_buffer.c | 244 +++++++++++++++-------------- 1 file changed, 127 insertions(+), 117 deletions(-) diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 567e556f5a5e..9f143180d73a 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2148,7 +2148,8 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, struct anv_state surface_state; - if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { + switch (binding->set) { + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: /* Color attachment binding */ assert(stage == MESA_SHADER_FRAGMENT); if (binding->index < subpass->color_count) { @@ -2171,8 +2172,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, } bt_map[s] = surface_state.offset + state_offset; - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { + break; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2191,12 +2193,14 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, bt_map[s] = surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, surface_state, constant_data); - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS) { + break; + } + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { /* This is always the first binding for compute shaders */ assert(stage == MESA_SHADER_COMPUTE && s == 0); if (!get_cs_prog_data(pipeline)->uses_num_work_groups) - continue; + break; struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2212,8 +2216,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, add_surface_reloc(cmd_buffer, surface_state, cmd_buffer->state.compute.num_workgroups); } - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + break; + } + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { /* This is a descriptor set buffer so the set index is actually * given by binding->binding. (Yes, that's confusing.) 
*/ @@ -2224,134 +2230,138 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, bt_map[s] = set->desc_surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, set->desc_surface_state, anv_descriptor_set_address(cmd_buffer, set)); - continue; + break; } - const struct anv_descriptor *desc = - &pipe_state->descriptors[binding->set]->descriptors[binding->index]; + default: { + assert(binding->set < MAX_SETS); + const struct anv_descriptor *desc = + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; - switch (desc->type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - /* Nothing for us to do here */ - continue; + switch (desc->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* Nothing for us to do here */ + continue; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { - struct anv_surface_state sstate = - (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? - desc->image_view->planes[binding->plane].general_sampler_surface_state : - desc->image_view->planes[binding->plane].optimal_sampler_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) - add_surface_state_relocs(cmd_buffer, sstate); - break; - } - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - assert(stage == MESA_SHADER_FRAGMENT); - if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { - /* For depth and stencil input attachments, we treat it like any - * old texture that a user may have bound. - */ - assert(desc->image_view->n_planes == 1); + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { struct anv_surface_state sstate = (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? - desc->image_view->planes[0].general_sampler_surface_state : - desc->image_view->planes[0].optimal_sampler_surface_state; + desc->image_view->planes[binding->plane].general_sampler_surface_state : + desc->image_view->planes[binding->plane].optimal_sampler_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); if (need_client_mem_relocs) add_surface_state_relocs(cmd_buffer, sstate); - } else { - /* For color input attachments, we create the surface state at - * vkBeginRenderPass time so that we can include aux and clear - * color information. - */ - assert(binding->input_attachment_index < subpass->input_count); - const unsigned subpass_att = binding->input_attachment_index; - const unsigned att = subpass->input_attachments[subpass_att].attachment; - surface_state = cmd_buffer->state.attachments[att].input.state; + break; } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { - struct anv_surface_state sstate = (binding->write_only) - ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state - : desc->image_view->planes[binding->plane].storage_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) - add_surface_state_relocs(cmd_buffer, sstate); - break; - } + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + assert(stage == MESA_SHADER_FRAGMENT); + if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { + /* For depth and stencil input attachments, we treat it like any + * old texture that a user may have bound. + */ + assert(desc->image_view->n_planes == 1); + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 
+ desc->image_view->planes[0].general_sampler_surface_state : + desc->image_view->planes[0].optimal_sampler_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + /* For color input attachments, we create the surface state at + * vkBeginRenderPass time so that we can include aux and clear + * color information. + */ + assert(binding->input_attachment_index < subpass->input_count); + const unsigned subpass_att = binding->input_attachment_index; + const unsigned att = subpass->input_attachments[subpass_att].attachment; + surface_state = cmd_buffer->state.attachments[att].input.state; + } + break; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - surface_state = desc->buffer_view->surface_state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) { - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + struct anv_surface_state sstate = (binding->write_only) + ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state + : desc->image_view->planes[binding->plane].storage_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + break; } - break; - - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - /* If the shader never does any UBO pulls (this is a fairly common - * case) then we don't need to fill out those binding table entries. - * The real cost savings here is that we don't have to build the - * surface state for them which is surprisingly expensive when it's - * on the hot-path. - */ - if (!bin->prog_data->has_ubo_pull) - continue; - /* Fall through */ - - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { - /* Compute the offset within the buffer */ - struct anv_push_constants *push = - &cmd_buffer->state.push_constants[stage]; - - uint32_t dynamic_offset = - push->dynamic_offsets[binding->dynamic_offset_index]; - uint64_t offset = desc->offset + dynamic_offset; - /* Clamp to the buffer size */ - offset = MIN2(offset, desc->buffer->size); - /* Clamp the range to the buffer size */ - uint32_t range = MIN2(desc->range, desc->buffer->size - offset); - struct anv_address address = - anv_address_add(desc->buffer->address, offset); - - surface_state = - anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); - enum isl_format format = - anv_isl_format_for_descriptor_type(desc->type); + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + surface_state = desc->buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + /* If the shader never does any UBO pulls (this is a fairly common + * case) then we don't need to fill out those binding table entries. + * The real cost savings here is that we don't have to build the + * surface state for them which is surprisingly expensive when it's + * on the hot-path. 
+ */ + if (!bin->prog_data->has_ubo_pull) + continue; + /* Fall through */ + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Compute the offset within the buffer */ + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + + uint32_t dynamic_offset = + push->dynamic_offsets[binding->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->size - offset); + + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(desc->type); + + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); + break; + } - anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, address, range, 1); - if (need_client_mem_relocs) - add_surface_reloc(cmd_buffer, surface_state, address); - break; - } + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + surface_state = (binding->write_only) + ? desc->buffer_view->writeonly_storage_surface_state + : desc->buffer_view->storage_surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - surface_state = (binding->write_only) - ? desc->buffer_view->writeonly_storage_surface_state - : desc->buffer_view->storage_surface_state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) { - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); + default: + assert(!"Invalid descriptor type"); + continue; } + bt_map[s] = surface_state.offset + state_offset; break; - - default: - assert(!"Invalid descriptor type"); - continue; } - - bt_map[s] = surface_state.offset + state_offset; + } } return VK_SUCCESS; -- GitLab From 22f16ff54a4a23a9903e837d37e9d3d838e535f1 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 7 Nov 2019 11:44:08 -0600 Subject: [PATCH 12/15] anv: More carefully dirty state in BindDescriptorSets Instead of dirtying all graphics or all compute based on binding point, we're now much more careful. We first check to see if the actual descriptor set changed and then only dirty the stages used by that descriptor set. For dynamic offsets, we keep a bitfield per-stage of which offsets are actually used in that stage and we only dirty push constants and descriptors if that stage has dynamic offsets AND those offsets actually change. 
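A minimal sketch of that per-stage check (illustrative, not driver
code; ffs() from <strings.h> plays the role of u_bit_scan() here):

    #include <stdint.h>
    #include <strings.h>

    /* Update only the offsets this stage actually uses and report
     * whether any of them changed; the caller dirties the stage's push
     * constants and descriptors only on a nonzero return.
     */
    static int
    update_stage_offsets(uint32_t *push_offsets, uint32_t used_mask,
                         const uint32_t *new_offsets)
    {
       int dirty = 0;
       while (used_mask) {
          int i = ffs(used_mask) - 1;
          used_mask &= used_mask - 1;   /* clear the lowest set bit */
          if (push_offsets[i] != new_offsets[i]) {
             push_offsets[i] = new_offsets[i];
             dirty = 1;
          }
       }
       return dirty;
    }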
Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_cmd_buffer.c | 56 ++++++++++++++++----------- src/intel/vulkan/anv_descriptor_set.c | 9 +++++ src/intel/vulkan/anv_private.h | 3 ++ src/intel/vulkan/genX_cmd_buffer.c | 5 +++ 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index f14225b963ae..2a555f1d2be1 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -572,53 +572,65 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer, struct anv_descriptor_set_layout *set_layout = layout->set[set_index].layout; - struct anv_cmd_pipeline_state *pipe_state; + VkShaderStageFlags stages = set_layout->shader_stages & + (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? + VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS); + + VkShaderStageFlags dirty_stages = 0; if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - pipe_state = &cmd_buffer->state.compute.base; + if (cmd_buffer->state.compute.base.descriptors[set_index] != set) { + cmd_buffer->state.compute.base.descriptors[set_index] = set; + dirty_stages |= stages; + } } else { assert(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS); - pipe_state = &cmd_buffer->state.gfx.base; + if (cmd_buffer->state.gfx.base.descriptors[set_index] != set) { + cmd_buffer->state.gfx.base.descriptors[set_index] = set; + dirty_stages |= stages; + } } - pipe_state->descriptors[set_index] = set; + + /* If it's a push descriptor set, we have to flag things as dirty + * regardless of whether or not the CPU-side data structure changed as we + * may have edited in-place. + */ + if (set->pool == NULL) + dirty_stages |= stages; if (dynamic_offsets) { if (set_layout->dynamic_offset_count > 0) { uint32_t dynamic_offset_start = layout->set[set_index].dynamic_offset_start; - anv_foreach_stage(stage, set_layout->shader_stages) { + anv_foreach_stage(stage, stages) { struct anv_push_constants *push = &cmd_buffer->state.push_constants[stage]; + uint32_t *push_offsets = + &push->dynamic_offsets[dynamic_offset_start]; /* Assert that everything is in range */ assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); assert(dynamic_offset_start + set_layout->dynamic_offset_count <= ARRAY_SIZE(push->dynamic_offsets)); - typed_memcpy(&push->dynamic_offsets[dynamic_offset_start], - *dynamic_offsets, set_layout->dynamic_offset_count); + unsigned mask = set_layout->stage_dynamic_offsets[stage]; + STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= sizeof(mask) * 8); + while (mask) { + int i = u_bit_scan(&mask); + if (push_offsets[i] != (*dynamic_offsets)[i]) { + push_offsets[i] = (*dynamic_offsets)[i]; + dirty_stages |= mesa_to_vk_shader_stage(stage); + } + } } *dynamic_offsets += set_layout->dynamic_offset_count; *dynamic_offset_count -= set_layout->dynamic_offset_count; - - if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - cmd_buffer->state.push_constants_dirty |= - VK_SHADER_STAGE_COMPUTE_BIT; - } else { - cmd_buffer->state.push_constants_dirty |= - VK_SHADER_STAGE_ALL_GRAPHICS; - } } } - if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; - } else { - assert(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS); - cmd_buffer->state.descriptors_dirty |= - set_layout->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; - } + cmd_buffer->state.descriptors_dirty |= dirty_stages; + cmd_buffer->state.push_constants_dirty |= dirty_stages; } void anv_CmdBindDescriptorSets( diff --git 
a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index 2f7b32b1b82f..af8fa40a0a96 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -469,7 +469,15 @@ VkResult anv_CreateDescriptorSetLayout( case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: set_layout->binding[b].dynamic_offset_index = dynamic_offset_count; + anv_foreach_stage(s, binding->stageFlags) { + STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= + sizeof(set_layout->stage_dynamic_offsets[s]) * 8); + set_layout->stage_dynamic_offsets[s] |= + BITFIELD_RANGE(set_layout->binding[b].dynamic_offset_index, + binding->descriptorCount); + } dynamic_offset_count += binding->descriptorCount; + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); break; default: @@ -603,6 +611,7 @@ VkResult anv_CreatePipelineLayout( dynamic_offset_count += set_layout->binding[b].array_size; } } + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 3ad4a5971e20..ec403acd4166 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1803,6 +1803,9 @@ struct anv_descriptor_set_layout { /* Number of dynamic offsets used by this descriptor set */ uint16_t dynamic_offset_count; + /* For each shader stage, which offsets apply to that stage */ + uint16_t stage_dynamic_offsets[MESA_SHADER_STAGES]; + /* Size of the descriptor buffer for this descriptor set */ uint32_t descriptor_buffer_size; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 9f143180d73a..d4ed49263de4 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -3544,6 +3544,11 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); + + /* The workgroup size of the pipeline affects our push constant layout + * so flag push constants as dirty if we change the pipeline. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || -- GitLab From 98dc179c1e094ab42346b23fe046ebb719b66ed4 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 7 Nov 2019 11:28:47 -0600 Subject: [PATCH 13/15] anv: More carefully dirty state in BindPipeline Instead of blindly dirtying descriptors and push constants the moment we see a pipeline change, check to see if it actually changes the bind layout or push constant layout. This doubles the runtime performance of one CPU-limited example running with the Dawn WebGPU implementation when running on my laptop. NOTE: This effectively reverts beca63c6c07. While it was a nice optimization, it was based on prog_data and we can't do that anymore once we start allowing the same binding table to be used with multiple different pipelines. 
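The detection itself is just a compare-then-copy over per-stage hashes
of the bind maps; the core pattern, extracted from the diff below into
standalone form:

    #include <stdbool.h>
    #include <string.h>

    /* Copy src over dst only if it differs; report whether it did */
    static bool
    mem_update(void *dst, const void *src, size_t size)
    {
       if (memcmp(dst, src, size) == 0)
          return false;

       memcpy(dst, src, size);
       return true;
    }

Each stage keeps 20-byte SHA-1s of its surface, sampler, and push maps,
so binding a pipeline with identical layouts costs three memcmps
instead of a full descriptor and push constant re-emit.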
Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_cmd_buffer.c | 49 +++++++++++++++++-- .../vulkan/anv_nir_apply_pipeline_layout.c | 13 +++++ .../vulkan/anv_nir_compute_push_layout.c | 9 ++++ src/intel/vulkan/anv_pipeline.c | 6 +++ src/intel/vulkan/anv_pipeline_cache.c | 9 ++++ src/intel/vulkan/anv_private.h | 16 +++++- src/intel/vulkan/genX_cmd_buffer.c | 24 ++------- 7 files changed, 101 insertions(+), 25 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 2a555f1d2be1..6ba3cd633320 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -380,6 +380,34 @@ anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer) cmd_buffer); } +static bool +mem_update(void *dst, const void *src, size_t size) +{ + if (memcmp(dst, src, size) == 0) + return false; + + memcpy(dst, src, size); + return true; +} + +static void +set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + const struct anv_pipeline_bind_map *map) +{ + if (mem_update(cmd_buffer->state.surface_sha1s[stage], + map->surface_sha1, sizeof(map->surface_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + if (mem_update(cmd_buffer->state.sampler_sha1s[stage], + map->sampler_sha1, sizeof(map->sampler_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + if (mem_update(cmd_buffer->state.push_sha1s[stage], + map->push_sha1, sizeof(map->push_sha1))) + cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage); +} + void anv_CmdBindPipeline( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -389,19 +417,30 @@ void anv_CmdBindPipeline( ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); switch (pipelineBindPoint) { - case VK_PIPELINE_BIND_POINT_COMPUTE: + case VK_PIPELINE_BIND_POINT_COMPUTE: { + if (cmd_buffer->state.compute.base.pipeline == pipeline) + return; + cmd_buffer->state.compute.base.pipeline = pipeline; cmd_buffer->state.compute.pipeline_dirty = true; - cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; - cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[MESA_SHADER_COMPUTE]->bind_map; + set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, bind_map); break; + } case VK_PIPELINE_BIND_POINT_GRAPHICS: + if (cmd_buffer->state.gfx.base.pipeline == pipeline) + return; + cmd_buffer->state.gfx.base.pipeline = pipeline; cmd_buffer->state.gfx.vb_dirty |= pipeline->vb_used; cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; - cmd_buffer->state.push_constants_dirty |= pipeline->active_stages; - cmd_buffer->state.descriptors_dirty |= pipeline->active_stages; + + anv_foreach_stage(stage, pipeline->active_stages) { + set_dirty_for_bind_map(cmd_buffer, stage, + &pipeline->shaders[stage]->bind_map); + } /* Apply the dynamic state from the pipeline */ cmd_buffer->state.gfx.dirty |= diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index 6ebc02cfdd78..f37b672543b7 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -25,6 +25,7 @@ #include "program/prog_parameter.h" #include "nir/nir_builder.h" #include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" #include "util/set.h" /* Sampler tables don't actually have a maximum size but we pick one just so @@ -1318,6 +1319,7 @@ 
anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, dim == GLSL_SAMPLER_DIM_SUBPASS_MS) pipe_binding[i].input_attachment_index = var->data.index + i; + /* NOTE: This is a uint8_t so we really do need to != 0 here */ pipe_binding[i].write_only = (var->data.image.access & ACCESS_NON_READABLE) != 0; } @@ -1367,4 +1369,15 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, } ralloc_free(mem_ctx); + + /* Now that we're done computing the surface and sampler portions of the + * bind map, hash them. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. + */ + _mesa_sha1_compute(map->surface_to_descriptor, + map->surface_count * sizeof(struct anv_pipeline_binding), + map->surface_sha1); + _mesa_sha1_compute(map->sampler_to_descriptor, + map->sampler_count * sizeof(struct anv_pipeline_binding), + map->sampler_sha1); } diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index e624a900d9c0..0b696fbc9e77 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -23,6 +23,7 @@ #include "anv_nir.h" #include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" void anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, @@ -173,6 +174,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, */ map->push_ranges[0] = push_constant_range; } + + /* Now that we're done computing the push constant portion of the + * bind map, hash it. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. + */ + _mesa_sha1_compute(map->push_ranges, + sizeof(map->push_ranges), + map->push_sha1); } void diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 144fcf46df37..aee3b2219379 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -1550,6 +1550,12 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map); + if (!stage.prog_data.cs.uses_num_work_groups) { + assert(stage.bind_map.surface_to_descriptor[0].set == + ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS); + stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL; + } + const unsigned code_size = stage.prog_data.base.program_size; bin = anv_device_upload_kernel(pipeline->device, cache, &stage.cache_key, sizeof(stage.cache_key), diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c index a4294e1eb609..396b75f1aa4f 100644 --- a/src/intel/vulkan/anv_pipeline_cache.c +++ b/src/intel/vulkan/anv_pipeline_cache.c @@ -161,6 +161,12 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader, blob_write_uint32(blob, 0); } + blob_write_bytes(blob, shader->bind_map.surface_sha1, + sizeof(shader->bind_map.surface_sha1)); + blob_write_bytes(blob, shader->bind_map.sampler_sha1, + sizeof(shader->bind_map.sampler_sha1)); + blob_write_bytes(blob, shader->bind_map.push_sha1, + sizeof(shader->bind_map.push_sha1)); blob_write_uint32(blob, shader->bind_map.surface_count); blob_write_uint32(blob, shader->bind_map.sampler_count); blob_write_bytes(blob, shader->bind_map.surface_to_descriptor, @@ -206,6 +212,9 @@ anv_shader_bin_create_from_blob(struct anv_device *device, xfb_info = blob_read_bytes(blob, xfb_size); struct anv_pipeline_bind_map bind_map; + blob_copy_bytes(blob, bind_map.surface_sha1, 
sizeof(bind_map.surface_sha1)); + blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1)); + blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1)); bind_map.surface_count = blob_read_uint32(blob); bind_map.sampler_count = blob_read_uint32(blob); bind_map.surface_to_descriptor = (void *) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index ec403acd4166..7c144d7d6c3c 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2009,6 +2009,7 @@ anv_descriptor_set_destroy(struct anv_device *device, struct anv_descriptor_pool *pool, struct anv_descriptor_set *set); +#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5) #define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4) #define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3) #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2) @@ -2042,7 +2043,12 @@ struct anv_pipeline_binding { }; /** For a storage image, whether it is write-only */ - bool write_only; + uint8_t write_only; + + /** Pad to 64 bits so that there are no holes and we can safely memcmp + * assuming POD zero-initialization. + */ + uint8_t pad; }; struct anv_push_range { @@ -2575,6 +2581,10 @@ struct anv_cmd_state { struct anv_state binding_tables[MESA_SHADER_STAGES]; struct anv_state samplers[MESA_SHADER_STAGES]; + unsigned char sampler_sha1s[MESA_SHADER_STAGES][20]; + unsigned char surface_sha1s[MESA_SHADER_STAGES][20]; + unsigned char push_sha1s[MESA_SHADER_STAGES][20]; + /** * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top * of any command buffer it is disabled by disabling it in EndCommandBuffer @@ -2936,6 +2946,10 @@ mesa_to_vk_shader_stage(gl_shader_stage mesa_stage) __tmp &= ~(1 << (stage))) struct anv_pipeline_bind_map { + unsigned char surface_sha1[20]; + unsigned char sampler_sha1[20]; + unsigned char push_sha1[20]; + uint32_t surface_count; uint32_t sampler_count; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index d4ed49263de4..a30f67527434 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2122,8 +2122,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, return VK_SUCCESS; } - struct anv_shader_bin *bin = pipeline->shaders[stage]; - struct anv_pipeline_bind_map *map = &bin->bind_map; + struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; if (map->surface_count == 0) { *bt_state = (struct anv_state) { 0, }; return VK_SUCCESS; @@ -2149,6 +2148,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, struct anv_state surface_state; switch (binding->set) { + case ANV_DESCRIPTOR_SET_NULL: + bt_map[s] = 0; + break; + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: /* Color attachment binding */ assert(stage == MESA_SHADER_FRAGMENT); @@ -2199,8 +2202,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { /* This is always the first binding for compute shaders */ assert(stage == MESA_SHADER_COMPUTE && s == 0); - if (!get_cs_prog_data(pipeline)->uses_num_work_groups) - break; struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2305,16 +2306,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - /* If the shader never does any UBO pulls (this is a fairly common - * case) then we don't need to fill out those binding table entries. 
From bc9d7836bc6a448d0328f090b8d538411f8aa1a0 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 7 Nov 2019 14:02:09 -0600
Subject: [PATCH 14/15] anv: Use an anv_state for the next binding table

This is a bit more natural because we're already getting an anv_state
in most places in the pipeline.  The important part here, however, is
that we're no longer calling anv_block_pool_map on every
alloc_binding_table call.  While it's probably pretty cheap, it is
potentially a linear walk over the list of BOs, and it was showing up
in profiles.

Reviewed-by: Lionel Landwerlin
---
 src/intel/vulkan/anv_batch_chain.c | 25 ++++++++++++++-----------
 src/intel/vulkan/anv_private.h     |  2 +-
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 61ee1417b20c..9180f9083795 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -709,20 +709,18 @@ anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t entries, uint32_t *state_offset)
 {
    struct anv_device *device = cmd_buffer->device;
-   struct anv_state_pool *state_pool = &device->surface_state_pool;
    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
-   struct anv_state state;
 
-   state.alloc_size = align_u32(entries * 4, 32);
+   uint32_t bt_size = align_u32(entries * 4, 32);
 
-   if (cmd_buffer->bt_next + state.alloc_size > state_pool->block_size)
+   struct anv_state state = cmd_buffer->bt_next;
+   if (bt_size > state.alloc_size)
       return (struct anv_state) { 0 };
 
-   state.offset = cmd_buffer->bt_next;
-   state.map = anv_block_pool_map(&anv_binding_table_pool(device)->block_pool,
-                                  bt_block->offset + state.offset);
-
-   cmd_buffer->bt_next += state.alloc_size;
+   state.alloc_size = bt_size;
+   cmd_buffer->bt_next.offset += bt_size;
+   cmd_buffer->bt_next.map += bt_size;
+   cmd_buffer->bt_next.alloc_size -= bt_size;
 
    if (device->instance->physicalDevice.use_softpin) {
       assert(bt_block->offset >= 0);
@@ -762,7 +760,12 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
    }
 
    *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
-   cmd_buffer->bt_next = 0;
+
+   /* The bt_next state is a rolling state (we update it as we suballocate
+    * from it) which is relative to the start of the binding table block.
+    */
+   cmd_buffer->bt_next = *bt_block;
+   cmd_buffer->bt_next.offset = 0;
 
    return VK_SUCCESS;
 }
@@ -871,7 +874,7 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
       anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
    }
    assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
-   cmd_buffer->bt_next = 0;
+   cmd_buffer->bt_next = ANV_STATE_NULL;
 
    anv_reloc_list_clear(&cmd_buffer->surface_relocs);
    cmd_buffer->last_ss_pool_center = 0;
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 7c144d7d6c3c..8fa0d74f5aa9 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2671,7 +2671,7 @@ struct anv_cmd_buffer {
     * initialized by anv_cmd_buffer_init_batch_bo_chain()
    */
    struct u_vector                              bt_block_states;
-   uint32_t                                     bt_next;
+   struct anv_state                             bt_next;
 
    struct anv_reloc_list                        surface_relocs;
    /** Last seen surface state block pool center bo offset */
-- 
GitLab
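
To make the new suballocation concrete: a binding table with N entries
needs align_u32(N * 4, 32) bytes, and bt_next now carries offset, CPU map,
and remaining size together, so carving a table out of the current block is
three additions and no pool lookup. A standalone toy version of the same
pattern (all names invented for illustration; the real code above signals
block exhaustion the same way):

   #include <stdint.h>

   struct toy_state {
      uint32_t offset;     /* offset within the binding table block */
      uint32_t alloc_size; /* remaining bytes (pool) or size (result) */
      char *map;           /* CPU pointer matching 'offset' */
   };

   /* Carve 'size' bytes off the front of 'pool'.  A zero state tells the
    * caller the block is exhausted and a fresh one must be allocated.
    */
   static struct toy_state
   toy_suballoc(struct toy_state *pool, uint32_t size)
   {
      if (size > pool->alloc_size)
         return (struct toy_state) { 0 };

      struct toy_state state = *pool;
      state.alloc_size = size;

      pool->offset += size;
      pool->map += size;
      pool->alloc_size -= size;
      return state;
   }

With a 4096-byte block, a 10-entry table rounds up to 64 bytes, so the
first call returns offset 0 and leaves 4032 bytes for the next table.
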
From fdaf8144a8bf65afa7dc66b8d827da38e27a850a Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 7 Nov 2019 22:05:21 -0600
Subject: [PATCH 15/15] anv: Emit a NULL vertex for zero base_vertex/instance

If both are zero (the common case), we can emit a null vertex buffer
rather than emitting a vertex buffer with zeros in it.  The packing of
the VERTEX_BUFFER_STATE is faster because no relocation is emitted, and
we avoid creating the vertex buffer, which means one less
anv_state_stream_alloc.

Reviewed-by: Lionel Landwerlin
---
 src/intel/vulkan/genX_cmd_buffer.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index a30f67527434..610ada98760c 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2807,7 +2807,8 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
          .VertexBufferIndex = index,
          .AddressModifyEnable = true,
          .BufferPitch = 0,
-         .MOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo),
+         .MOCS = addr.bo ? anv_mocs_for_bo(cmd_buffer->device, addr.bo) : 0,
+         .NullVertexBuffer = size == 0,
 #if (GEN_GEN >= 8)
          .BufferStartingAddress = addr,
          .BufferSize = size
@@ -2822,25 +2823,29 @@ static void
 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
                              struct anv_address addr)
 {
-   emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX);
+   emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
 }
 
 static void
 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
                           uint32_t base_vertex, uint32_t base_instance)
 {
-   struct anv_state id_state =
-      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
+   if (base_vertex == 0 && base_instance == 0) {
+      emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
+   } else {
+      struct anv_state id_state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
 
-   ((uint32_t *)id_state.map)[0] = base_vertex;
-   ((uint32_t *)id_state.map)[1] = base_instance;
+      ((uint32_t *)id_state.map)[0] = base_vertex;
+      ((uint32_t *)id_state.map)[1] = base_instance;
 
-   struct anv_address addr = {
-      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
-      .offset = id_state.offset,
-   };
+      struct anv_address addr = {
+         .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+         .offset = id_state.offset,
+      };
 
-   emit_base_vertex_instance_bo(cmd_buffer, addr);
+      emit_base_vertex_instance_bo(cmd_buffer, addr);
+   }
 }
 
 static void
-- 
GitLab
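
For context on when the fast path above fires: the draw entrypoints only
emit this vertex buffer when the compiled vertex shader actually reads
gl_BaseVertex or gl_BaseInstance, so a typical vkCmdDraw with
firstVertex == 0 and firstInstance == 0 now packs a null
VERTEX_BUFFER_STATE (whose reads return zero) instead of allocating and
filling an 8-byte buffer. A sketch of such a call site; the prog-data flag
names are assumptions based on the driver's conventions, not lines from
this series:

   /* Hypothetical call site in genX(CmdDraw): skip the SVGS vertex buffer
    * entirely unless the VS reads the base values, and let zero bases take
    * the ANV_NULL_ADDRESS path added above.
    */
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   if (vs_prog_data->uses_firstvertex || vs_prog_data->uses_baseinstance)
      emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
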