From 7948c4b0b43a2a0c15c834f1e066c1e5262146b8 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 28 Jul 2021 14:27:59 +0200 Subject: [PATCH 1/2] tu: Make tile stores use a dedicated CS We were trying to calculate how much space they need, That was already difficult and one of the most opaque and hard-to-verify uses of sub_cs, but it will become even more difficult with the 3D path. What's worse is that sometimes we have to touch that path when we start touching registers that would affect rasterization, and there's no indication that you have to then recalculate the size etc. Just rip this out and start keeping a separate CS for it instead. Note that this adds a small amount of memory wastage and extra buffers (at worst one buffer per command buffer). Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 33 ++++++++++------------------ src/freedreno/vulkan/tu_private.h | 3 +-- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index f9bf3e14f616..45eeea2fbdfb 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1169,7 +1169,7 @@ static void tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { /* Do any resolves of the last subpass. These are handled in the - * tile_store_ib in the gmem path. + * tile_store_cs in the gmem path. */ tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass); @@ -1242,7 +1242,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); } - tu_cs_emit_ib(cs, &cmd->state.tile_store_ib); + tu_cs_emit_call(cs, &cmd->tile_store_cs); tu_cs_sanity_check(cs); } @@ -1299,25 +1299,6 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd) tu6_sysmem_render_end(cmd, &cmd->cs); } -static void -tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd) -{ - const uint32_t tile_store_space = 7 + (35 * 2) * cmd->state.pass->attachment_count; - struct tu_cs sub_cs; - - VkResult result = - tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs); - if (result != VK_SUCCESS) { - cmd->record_result = result; - return; - } - - /* emit to tile-store sub_cs */ - tu6_emit_tile_store(cmd, &sub_cs); - - cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs); -} - static VkResult tu_create_cmd_buffer(struct tu_device *device, struct tu_cmd_pool *pool, @@ -1349,6 +1330,7 @@ tu_create_cmd_buffer(struct tu_device *device, tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096); + tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048); tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048); @@ -1364,6 +1346,7 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) tu_cs_finish(&cmd_buffer->cs); tu_cs_finish(&cmd_buffer->draw_cs); + tu_cs_finish(&cmd_buffer->tile_store_cs); tu_cs_finish(&cmd_buffer->draw_epilogue_cs); tu_cs_finish(&cmd_buffer->sub_cs); @@ -1377,6 +1360,7 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) tu_cs_reset(&cmd_buffer->cs); tu_cs_reset(&cmd_buffer->draw_cs); + tu_cs_reset(&cmd_buffer->tile_store_cs); tu_cs_reset(&cmd_buffer->draw_epilogue_cs); tu_cs_reset(&cmd_buffer->sub_cs); @@ -1510,6 +1494,7 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, tu_cs_begin(&cmd_buffer->cs); tu_cs_begin(&cmd_buffer->draw_cs); + tu_cs_begin(&cmd_buffer->tile_store_cs); tu_cs_begin(&cmd_buffer->draw_epilogue_cs); /* setup initial configuration into command buffer */ @@ -2051,6 +2036,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer) tu_cs_end(&cmd_buffer->cs); tu_cs_end(&cmd_buffer->draw_cs); + tu_cs_end(&cmd_buffer->tile_store_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE; @@ -2927,7 +2913,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, cmd->state.framebuffer = fb; cmd->state.render_area = pRenderPassBegin->renderArea; - tu_cmd_prepare_tile_store_ib(cmd); + tu6_emit_tile_store(cmd, &cmd->tile_store_cs); /* Note: because this is external, any flushes will happen before draw_cs * gets called. However deferred flushes could have to happen later as part @@ -4364,6 +4350,7 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); tu_cs_end(&cmd_buffer->draw_cs); + tu_cs_end(&cmd_buffer->tile_store_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); if (use_sysmem_rendering(cmd_buffer)) @@ -4382,6 +4369,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, rendered */ tu_cs_discard_entries(&cmd_buffer->draw_cs); tu_cs_begin(&cmd_buffer->draw_cs); + tu_cs_discard_entries(&cmd_buffer->tile_store_cs); + tu_cs_begin(&cmd_buffer->tile_store_cs); tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs); tu_cs_begin(&cmd_buffer->draw_epilogue_cs); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index c0d4e8ef899a..153e1880e0a0 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -974,8 +974,6 @@ struct tu_cmd_state const struct tu_framebuffer *framebuffer; VkRect2D render_area; - struct tu_cs_entry tile_store_ib; - bool xfb_used; bool has_tess; bool has_subpass_predication; @@ -1034,6 +1032,7 @@ struct tu_cmd_buffer struct tu_cs cs; struct tu_cs draw_cs; + struct tu_cs tile_store_cs; struct tu_cs draw_epilogue_cs; struct tu_cs sub_cs; -- GitLab From b157a5d0d68ee8a1b4cb862a56b97bd881841413 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 28 Jul 2021 14:42:08 +0200 Subject: [PATCH 2/2] tu: Implement non-aligned multisample GMEM STORE_OP_STORE We have to a bit careful here when disabling draw states. This also necessitates moving the actual recording of the stores to the end so that we set the dirty flag correctly. Closes: #4462 Part-of: --- .../ci/deqp-freedreno-a630-fails.txt | 3 - src/freedreno/vulkan/tu_clear_blit.c | 113 +++++++++++++++--- src/freedreno/vulkan/tu_cmd_buffer.c | 13 +- src/freedreno/vulkan/tu_private.h | 2 + 4 files changed, 104 insertions(+), 27 deletions(-) diff --git a/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/src/freedreno/ci/deqp-freedreno-a630-fails.txt index 463a09a07267..0de081814755 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-fails.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-fails.txt @@ -17,9 +17,6 @@ dEQP-VK.api.device_init.create_instance_device_intentional_alloc_fail,Fail dEQP-VK.compute.basic.max_local_size_x,Crash dEQP-VK.compute.basic.max_local_size_y,Crash -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4462 -dEQP-VK.pipeline.framebuffer_attachment.diff_attachments_2d_19x27_32x32_ms,Fail - # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3019 # should be fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7745 dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index fd8e75b13f2d..5ecf11ce530a 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -894,6 +894,36 @@ r3d_src_buffer(struct tu_cmd_buffer *cmd, r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); } +static void +r3d_src_gmem(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_image_view *iview, + VkFormat format, + uint32_t gmem_offset, + uint32_t cpp) +{ + uint32_t desc[A6XX_TEX_CONST_DWORDS]; + memcpy(desc, iview->descriptor, sizeof(desc)); + + /* patch the format so that depth/stencil get the right format */ + desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK; + desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt); + + /* patched for gmem */ + desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); + desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); + desc[2] = + A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | + A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); + desc[3] = 0; + desc[4] = cmd->device->physical_device->gmem_base + gmem_offset; + desc[5] = A6XX_TEX_CONST_5_DEPTH(1); + for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) + desc[i] = 0; + + r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); +} + static void r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) { @@ -2733,6 +2763,42 @@ store_cp_blit(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); } +static void +store_3d_blit(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_image_view *iview, + uint32_t dst_samples, + bool separate_stencil, + VkFormat format, + const VkRect2D *render_area, + uint32_t gmem_offset, + uint32_t cpp) +{ + r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, + iview->ubwc_enabled, dst_samples); + + r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); + + if (separate_stencil) + r3d_dst_stencil(cs, iview, 0); + else + r3d_dst(cs, iview, 0); + + r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp); + + /* sync GMEM writes with CACHE. */ + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + + r3d_run(cmd, cs); + + /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to + * sysmem, and we generally assume that GMEM renderpasses leave their + * results in sysmem, so we need to flush manually here. The 3d blit path + * writes to depth images as a color RT, so there's no need to flush depth. + */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); +} + void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2782,26 +2848,39 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, return; } - if (dst->samples > 1) { - /* I guess we need to use shader path in this case? - * need a testcase which fails because of this - */ - tu_finishme("unaligned store of msaa attachment\n"); - return; - } - - r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); - VkFormat format = src->format; if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) format = VK_FORMAT_D32_SFLOAT; - if (dst->store) { - store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format, - src->gmem_offset, src->cpp); - } - if (dst->store_stencil) { - store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT, - src->gmem_offset_stencil, src->samples); + if (dst->samples > 1) { + /* If we hit this path, we have to disable draw states after every tile + * instead of once at the end of the renderpass, so that they aren't + * executed when calling CP_DRAW. + * + * TODO: store a flag somewhere so we don't do this more than once and + * don't do it after the renderpass when this happens. + */ + if (dst->store || dst->store_stencil) + tu_disable_draw_states(cmd, cs); + + if (dst->store) { + store_3d_blit(cmd, cs, iview, dst->samples, resolve_d32s8_s8, format, + render_area, src->gmem_offset, src->cpp); + } + if (dst->store_stencil) { + store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT, + render_area, src->gmem_offset, src->samples); + } + } else { + r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); + + if (dst->store) { + store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format, + src->gmem_offset, src->cpp); + } + if (dst->store_stencil) { + store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT, + src->gmem_offset_stencil, src->samples); + } } } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 45eeea2fbdfb..47aa21789fa2 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -684,7 +684,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } } -static void +void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); @@ -2913,8 +2913,6 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, cmd->state.framebuffer = fb; cmd->state.render_area = pRenderPassBegin->renderArea; - tu6_emit_tile_store(cmd, &cmd->tile_store_cs); - /* Note: because this is external, any flushes will happen before draw_cs * gets called. However deferred flushes could have to happen later as part * of the subpass. @@ -4349,6 +4347,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs); + tu_cs_end(&cmd_buffer->draw_cs); tu_cs_end(&cmd_buffer->tile_store_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); @@ -4358,10 +4358,9 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, else tu_cmd_render_tiles(cmd_buffer); - /* outside of renderpasses we assume all draw states are disabled - * we can do this in the main cs because no resolve/store commands - * should use a draw command (TODO: this will change if unaligned - * GMEM stores are supported) + /* Outside of renderpasses we assume all draw states are disabled. We do + * this outside the draw CS for the normal case where 3d gmem stores aren't + * used. */ tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 153e1880e0a0..5400072c859c 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -1239,6 +1239,8 @@ void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_ void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); +void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + struct tu_pvtmem_config { uint64_t iova; uint32_t per_fiber_size; -- GitLab