Commit 380d4904 authored by Connor Abbott's avatar Connor Abbott
Browse files

tu: Read some input attachments directly

It can happen that the user reads an input attachment as the first use
of that attachment. In that case there are no subpass dependencies
required at all, because there could be a pipeline barrier before the
renderpass instead, and in any case we assume that dependencies whose
destination is the first subpass only need to be executed once, outside the
renderpass. The result is that we only do a CACHE_INVALIDATE once
before the entire renderpass, but it's actually required after each GMEM
load, because input attachments read GMEM through UCHE, and the writes done
by the GMEM load make any previously cached UCHE contents stale.

While we could add the missing CACHE_INVALIDATE "by hand" somehow, it
turns out it's actually just as easy to do an optimization the blob
does, where it simply doesn't patch those input attachments and reads
them directly instead. This means we can skip allocating memory in GMEM
for them entirely in some circumstances.

This fixes e.g.
dEQP-VK.api.copy_and_blit.core.resolve_image.whole_array_image.4_bit
with TU_DEBUG=forcebin.

Part-of: <mesa/mesa!12213>
parent 11ac7d9e
Pipeline #378614 waiting for manual action with stages
......@@ -11,13 +11,6 @@ KHR-GL33.transform_feedback.draw_xfb_stream_instanced_test,Crash
KHR-GL33.transform_feedback.query_vertex_interleaved_test,Fail
KHR-GL33.transform_feedback.query_vertex_separate_test,Fail
# Fails with TU_DEBUG=forcebin
# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12213
dEQP-VK.api.copy_and_blit.core.resolve_image.whole_array_image.4_bit,Fail
dEQP-VK.api.copy_and_blit.core.resolve_image.whole_array_image_one_region.4_bit,Fail
dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_array_image.4_bit,Fail
dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_array_image_one_region.4_bit,Fail
# "Fail (createInstance returned VK_ERROR_INITIALIZATION_FAILED)"
# happens inside the loader on anholt's debian system, and there are various
# likely-looking fixes in later versions of the loader.
......
......@@ -1056,7 +1056,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
gmem_offset = att->gmem_offset_stencil;
}
if (!gmem)
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
continue;
/* patched for gmem */
......@@ -2924,6 +2924,9 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
cmd->state.cache.pending_flush_bits;
cmd->state.renderpass_cache.flush_bits = 0;
if (pass->subpasses[0].feedback_invalidate)
cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
/* Track LRZ valid state */
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
......@@ -3011,6 +3014,9 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
/* Handle dependencies for the next subpass */
tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
if (cmd->state.subpass->feedback_invalidate)
cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
/* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
tu6_emit_zs(cmd, cmd->state.subpass, cs);
tu6_emit_mrt(cmd, cmd->state.subpass, cs);
......
......@@ -334,6 +334,75 @@ tu_render_pass_add_implicit_deps(struct tu_render_pass *pass,
}
}
/* If an input attachment is used without an intervening write to the same
 * attachment, then we can just use the original image, even in GMEM mode.
 * This is an optimization, but it's also important because it allows us to
 * avoid having to invalidate UCHE at the beginning of each tile due to it
 * becoming invalid. The only reads of GMEM via UCHE should be after an
 * earlier subpass modified it, which only works if there's already an
 * appropriate dependency that will add the CACHE_INVALIDATE anyway. We
 * don't consider this in the dependency code, so this is also required for
 * correctness.
 */
static void
tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
{
   /* A render pass with no attachments has nothing to patch, and declaring
    * a zero-length VLA below would be undefined behavior, so bail out early.
    */
   if (pass->attachment_count == 0)
      return;

   /* written[a] becomes true once attachment 'a' has been written by some
    * subpass as a color, resolve, or depth/stencil attachment.
    */
   bool written[pass->attachment_count];
   memset(written, 0, sizeof(written));

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      struct tu_subpass *subpass = &pass->subpasses[i];

      /* An input attachment only needs to be patched to read GMEM if an
       * earlier subpass already wrote it as an attachment; otherwise it can
       * be read directly from the original image.
       */
      for (unsigned j = 0; j < subpass->input_count; j++) {
         uint32_t a = subpass->input_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         subpass->input_attachments[j].patch_input_gmem = written[a];
      }

      for (unsigned j = 0; j < subpass->color_count; j++) {
         uint32_t a = subpass->color_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         written[a] = true;

         for (unsigned k = 0; k < subpass->input_count; k++) {
            if (subpass->input_attachments[k].attachment == a &&
                !subpass->input_attachments[k].patch_input_gmem) {
               /* For render feedback loops, we have no idea whether the use
                * as a color attachment or input attachment will come first,
                * so we have to always use GMEM in case the color attachment
                * comes first and defensively invalidate UCHE in case the
                * input attachment comes first.
                */
               subpass->feedback_invalidate = true;
               subpass->input_attachments[k].patch_input_gmem = true;
            }
         }
      }

      for (unsigned j = 0; j < subpass->resolve_count; j++) {
         uint32_t a = subpass->resolve_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         written[a] = true;
      }

      if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
         written[subpass->depth_stencil_attachment.attachment] = true;

         /* Same feedback-loop handling as for color attachments above. */
         for (unsigned k = 0; k < subpass->input_count; k++) {
            if (subpass->input_attachments[k].attachment ==
                   subpass->depth_stencil_attachment.attachment &&
                !subpass->input_attachments[k].patch_input_gmem) {
               subpass->feedback_invalidate = true;
               subpass->input_attachments[k].patch_input_gmem = true;
            }
         }
      }
   }
}
static void update_samples(struct tu_subpass *subpass,
VkSampleCountFlagBits samples)
{
......@@ -584,8 +653,10 @@ tu_CreateRenderPass2(VkDevice _device,
for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
uint32_t a = desc->pInputAttachments[j].attachment;
subpass->input_attachments[j].attachment = a;
if (a != VK_ATTACHMENT_UNUSED)
pass->attachments[a].gmem_offset = 0;
/* Note: attachments only used as input attachments will be read
* directly instead of through gmem, so we don't mark input
* attachments as needing gmem.
*/
}
}
......@@ -635,6 +706,8 @@ tu_CreateRenderPass2(VkDevice _device,
}
}
tu_render_pass_patch_input_gmem(pass);
/* disable unused attachments */
for (uint32_t i = 0; i < pass->attachment_count; i++) {
struct tu_render_pass_attachment *att = &pass->attachments[i];
......
......@@ -1542,6 +1542,12 @@ struct tu_subpass_barrier {
/* A single attachment reference within a subpass (input, color, resolve,
 * or depth/stencil slot).
 */
struct tu_subpass_attachment
{
uint32_t attachment;
/* For input attachments: true if references to this attachment must be
 * patched to point at its GMEM copy when rendering in GMEM mode. This is
 * false when no earlier subpass has written the attachment, in which case
 * the input attachment is read directly from the original image instead.
 */
bool patch_input_gmem;
};
struct tu_subpass
......@@ -1550,6 +1556,10 @@ struct tu_subpass
uint32_t color_count;
uint32_t resolve_count;
bool resolve_depth_stencil;
/* True if we must invalidate UCHE thanks to a feedback loop. */
bool feedback_invalidate;
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment