Commit e1dc3ab7 authored by Samuel Pitoiset

radv/gfx10: allocate GDS/OA buffer objects for NGG streamout

This allocates two buffer objects (BOs) for GFX10 NGG streamout: one in the GDS domain and one in the OA domain.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
parent 957c3436
......@@ -337,6 +337,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->esgs_ring_size_needed = 0;
cmd_buffer->gsvs_ring_size_needed = 0;
cmd_buffer->tess_rings_needed = false;
cmd_buffer->gds_needed = false;
cmd_buffer->sample_positions_needed = false;
if (cmd_buffer->upload.upload_bo)
......@@ -5815,6 +5816,9 @@ radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
((old_streamout_enabled != so->streamout_enabled) ||
(old_hw_enabled_mask != so->hw_enabled_mask)))
radv_emit_streamout_enable(cmd_buffer);
if (cmd_buffer->device->physical_device->use_ngg_streamout)
cmd_buffer->gds_needed = true;
}
static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
......
......@@ -1746,6 +1746,10 @@ radv_queue_finish(struct radv_queue *queue)
queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
if (queue->tess_rings_bo)
queue->device->ws->buffer_destroy(queue->tess_rings_bo);
if (queue->gds_bo)
queue->device->ws->buffer_destroy(queue->gds_bo);
if (queue->gds_oa_bo)
queue->device->ws->buffer_destroy(queue->gds_oa_bo);
if (queue->compute_scratch_bo)
queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
}
......@@ -2598,6 +2602,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
uint32_t esgs_ring_size,
uint32_t gsvs_ring_size,
bool needs_tess_rings,
bool needs_gds,
bool needs_sample_positions,
struct radeon_cmdbuf **initial_full_flush_preamble_cs,
struct radeon_cmdbuf **initial_preamble_cs,
......@@ -2609,8 +2614,10 @@ radv_get_preamble_cs(struct radv_queue *queue,
struct radeon_winsys_bo *esgs_ring_bo = NULL;
struct radeon_winsys_bo *gsvs_ring_bo = NULL;
struct radeon_winsys_bo *tess_rings_bo = NULL;
struct radeon_winsys_bo *gds_bo = NULL;
struct radeon_winsys_bo *gds_oa_bo = NULL;
struct radeon_cmdbuf *dest_cs[3] = {0};
bool add_tess_rings = false, add_sample_positions = false;
bool add_tess_rings = false, add_gds = false, add_sample_positions = false;
unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
unsigned max_offchip_buffers;
unsigned hs_offchip_param = 0;
......@@ -2620,6 +2627,10 @@ radv_get_preamble_cs(struct radv_queue *queue,
if (needs_tess_rings)
add_tess_rings = true;
}
if (!queue->has_gds) {
if (needs_gds)
add_gds = true;
}
if (!queue->has_sample_positions) {
if (needs_sample_positions)
add_sample_positions = true;
......@@ -2635,7 +2646,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
compute_scratch_size <= queue->compute_scratch_size &&
esgs_ring_size <= queue->esgs_ring_size &&
gsvs_ring_size <= queue->gsvs_ring_size &&
!add_tess_rings && !add_sample_positions &&
!add_tess_rings && !add_gds && !add_sample_positions &&
queue->initial_preamble_cs) {
*initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs;
*initial_preamble_cs = queue->initial_preamble_cs;
......@@ -2711,6 +2722,32 @@ radv_get_preamble_cs(struct radv_queue *queue,
tess_rings_bo = queue->tess_rings_bo;
}
if (add_gds) {
assert(queue->device->physical_device->rad_info.chip_class >= GFX10);
/* 4 streamout GDS counters.
* We need 256B (64 dw) of GDS, otherwise streamout hangs.
*/
gds_bo = queue->device->ws->buffer_create(queue->device->ws,
256, 4,
RADEON_DOMAIN_GDS,
ring_bo_flags,
RADV_BO_PRIORITY_SCRATCH);
if (!gds_bo)
goto fail;
gds_oa_bo = queue->device->ws->buffer_create(queue->device->ws,
4, 1,
RADEON_DOMAIN_OA,
ring_bo_flags,
RADV_BO_PRIORITY_SCRATCH);
if (!gds_oa_bo)
goto fail;
} else {
gds_bo = queue->gds_bo;
gds_oa_bo = queue->gds_oa_bo;
}
if (scratch_bo != queue->scratch_bo ||
esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo ||
......@@ -2801,6 +2838,11 @@ radv_get_preamble_cs(struct radv_queue *queue,
radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
radv_emit_compute_scratch(queue, cs, compute_scratch_bo);
if (gds_bo)
radv_cs_add_buffer(queue->device->ws, cs, gds_bo);
if (gds_oa_bo)
radv_cs_add_buffer(queue->device->ws, cs, gds_oa_bo);
if (i == 0) {
si_cs_emit_cache_flush(cs,
queue->device->physical_device->rad_info.chip_class,
......@@ -2876,6 +2918,14 @@ radv_get_preamble_cs(struct radv_queue *queue,
queue->has_tess_rings = true;
}
if (gds_bo != queue->gds_bo) {
queue->gds_bo = gds_bo;
queue->has_gds = true;
}
if (gds_oa_bo != queue->gds_oa_bo)
queue->gds_oa_bo = gds_oa_bo;
if (descriptor_bo != queue->descriptor_bo) {
if (queue->descriptor_bo)
queue->device->ws->buffer_destroy(queue->descriptor_bo);
......@@ -2908,6 +2958,11 @@ fail:
queue->device->ws->buffer_destroy(gsvs_ring_bo);
if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo)
queue->device->ws->buffer_destroy(tess_rings_bo);
if (gds_bo && gds_bo != queue->gds_bo)
queue->device->ws->buffer_destroy(gds_bo);
if (gds_oa_bo && gds_oa_bo != queue->gds_oa_bo)
queue->device->ws->buffer_destroy(gds_oa_bo);
return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
......@@ -3070,6 +3125,7 @@ VkResult radv_QueueSubmit(
VkResult result;
bool fence_emitted = false;
bool tess_rings_needed = false;
bool gds_needed = false;
bool sample_positions_needed = false;
/* Do this first so failing to allocate scratch buffers can't result in
......@@ -3085,14 +3141,16 @@ VkResult radv_QueueSubmit(
esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
tess_rings_needed |= cmd_buffer->tess_rings_needed;
gds_needed |= cmd_buffer->gds_needed;
sample_positions_needed |= cmd_buffer->sample_positions_needed;
}
}
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
esgs_ring_size, gsvs_ring_size, tess_rings_needed,
sample_positions_needed, &initial_flush_preamble_cs,
&initial_preamble_cs, &continue_preamble_cs);
gds_needed, sample_positions_needed,
&initial_flush_preamble_cs,
&initial_preamble_cs, &continue_preamble_cs);
if (result != VK_SUCCESS)
return result;
......
......@@ -663,6 +663,7 @@ struct radv_queue {
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
bool has_tess_rings;
bool has_gds;
bool has_sample_positions;
struct radeon_winsys_bo *scratch_bo;
......@@ -671,6 +672,8 @@ struct radv_queue {
struct radeon_winsys_bo *esgs_ring_bo;
struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo;
struct radeon_winsys_bo *gds_bo;
struct radeon_winsys_bo *gds_oa_bo;
struct radeon_cmdbuf *initial_preamble_cs;
struct radeon_cmdbuf *initial_full_flush_preamble_cs;
struct radeon_cmdbuf *continue_preamble_cs;
......@@ -1223,6 +1226,7 @@ struct radv_cmd_buffer {
uint32_t esgs_ring_size_needed;
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
bool gds_needed; /* for GFX10 streamout */
bool sample_positions_needed;
VkResult record_result;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment