Commit 46e52df3 authored by Dave Airlie
Browse files

radv: add tessellation ring allocation support. (v2)



This patch adds support for the offchip rings for storing
tessellation factors and attribute data.

It includes the register setup for the TF (tessellation factor) ring.

v2: always do tess ring size calcs (Bas)
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
parent bbfb62df
......@@ -221,6 +221,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->compute_scratch_size_needed = 0;
cmd_buffer->esgs_ring_size_needed = 0;
cmd_buffer->gsvs_ring_size_needed = 0;
cmd_buffer->tess_rings_needed = false;
if (cmd_buffer->upload.upload_bo)
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
......@@ -1903,6 +1904,9 @@ void radv_CmdBindPipeline(
if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
if (radv_pipeline_has_tess(pipeline))
cmd_buffer->tess_rings_needed = true;
if (radv_pipeline_has_gs(pipeline)) {
struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
AC_UD_SCRATCH_RING_OFFSETS);
......@@ -2070,6 +2074,8 @@ void radv_CmdExecuteCommands(
primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
if (secondary->tess_rings_needed)
primary->tess_rings_needed = true;
if (secondary->ring_offsets_idx != -1) {
if (primary->ring_offsets_idx == -1)
......
......@@ -845,6 +845,10 @@ radv_queue_finish(struct radv_queue *queue)
queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
if (queue->gsvs_ring_bo)
queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
if (queue->tess_factor_ring_bo)
queue->device->ws->buffer_destroy(queue->tess_factor_ring_bo);
if (queue->tess_offchip_ring_bo)
queue->device->ws->buffer_destroy(queue->tess_offchip_ring_bo);
if (queue->compute_scratch_bo)
queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
}
......@@ -1182,20 +1186,29 @@ static void radv_dump_trace(struct radv_device *device,
}
static void
fill_geom_rings(struct radv_queue *queue,
uint32_t *map,
uint32_t esgs_ring_size,
struct radeon_winsys_bo *esgs_ring_bo,
uint32_t gsvs_ring_size,
struct radeon_winsys_bo *gsvs_ring_bo)
fill_geom_tess_rings(struct radv_queue *queue,
uint32_t *map,
uint32_t esgs_ring_size,
struct radeon_winsys_bo *esgs_ring_bo,
uint32_t gsvs_ring_size,
struct radeon_winsys_bo *gsvs_ring_bo,
uint32_t tess_factor_ring_size,
struct radeon_winsys_bo *tess_factor_ring_bo,
uint32_t tess_offchip_ring_size,
struct radeon_winsys_bo *tess_offchip_ring_bo)
{
uint64_t esgs_va = 0, gsvs_va = 0;
uint64_t tess_factor_va = 0, tess_offchip_va = 0;
uint32_t *desc = &map[4];
if (esgs_ring_bo)
esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
if (gsvs_ring_bo)
gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);
if (tess_factor_ring_bo)
tess_factor_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo);
if (tess_offchip_ring_bo)
tess_offchip_va = queue->device->ws->buffer_get_va(tess_offchip_ring_bo);
/* stride 0, num records - size, add tid, swizzle, elsize4,
index stride 64 */
......@@ -1270,6 +1283,88 @@ fill_geom_rings(struct radv_queue *queue,
S_008F0C_ELEMENT_SIZE(1) |
S_008F0C_INDEX_STRIDE(1) |
S_008F0C_ADD_TID_ENABLE(true);
desc += 4;
desc[0] = tess_factor_va;
desc[1] = S_008F04_BASE_ADDRESS_HI(tess_factor_va >> 32) |
S_008F04_STRIDE(0) |
S_008F04_SWIZZLE_ENABLE(false);
desc[2] = tess_factor_ring_size;
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
S_008F0C_ELEMENT_SIZE(0) |
S_008F0C_INDEX_STRIDE(0) |
S_008F0C_ADD_TID_ENABLE(false);
desc += 4;
desc[0] = tess_offchip_va;
desc[1] = S_008F04_BASE_ADDRESS_HI(tess_offchip_va >> 32) |
S_008F04_STRIDE(0) |
S_008F04_SWIZZLE_ENABLE(false);
desc[2] = tess_offchip_ring_size;
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
S_008F0C_ELEMENT_SIZE(0) |
S_008F0C_INDEX_STRIDE(0) |
S_008F0C_ADD_TID_ENABLE(false);
}
/*
 * Compute the value that callers program into VGT_HS_OFFCHIP_PARAM.
 *
 * device:                supplies the chip class, chip family, shader-engine
 *                        count (max_se) and the off-chip block size in dwords
 *                        (tess_offchip_block_dw_size).
 * max_offchip_buffers_p: receives the clamped number of off-chip buffers.
 *                        This is the count *before* the VI+ decrement that is
 *                        applied to the register field below.
 *
 * Returns the packed register value: buffering plus granularity on CIK and
 * newer, buffering only on older chips.
 */
static unsigned
radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buffers_p)
{
	unsigned buffers_per_se = 64;
	unsigned buffer_count;
	unsigned buffer_cap;
	unsigned granularity;

	/* CIK and newer get twice as many buffers per SE, except on
	 * Carrizo and Stoney. */
	if (device->physical_device->rad_info.chip_class >= CIK &&
	    device->physical_device->rad_info.family != CHIP_CARRIZO &&
	    device->physical_device->rad_info.family != CHIP_STONEY)
		buffers_per_se = 128;

	buffer_count = buffers_per_se * device->physical_device->rad_info.max_se;

	switch (device->tess_offchip_block_dw_size) {
	case 4096:
		granularity = V_03093C_X_4K_DWORDS;
		break;
	default:
		/* Only 4096 and 8192 are handled explicitly; any other value
		 * trips the assert and is then treated like 8192. */
		assert(0);
		/* fall through */
	case 8192:
		granularity = V_03093C_X_8K_DWORDS;
		break;
	}

	/* Per-generation upper bound on the total buffer count
	 * (presumably a hardware limit -- confirm against register docs). */
	if (device->physical_device->rad_info.chip_class == SI)
		buffer_cap = 126;
	else if (device->physical_device->rad_info.chip_class == CIK)
		buffer_cap = 508;
	else
		buffer_cap = 512;
	buffer_count = MIN2(buffer_count, buffer_cap);

	/* Report the clamped count to the caller before any adjustment. */
	*max_offchip_buffers_p = buffer_count;

	/* Chips older than CIK use the 0x0089B0 field layout, which carries
	 * no granularity field. */
	if (device->physical_device->rad_info.chip_class < CIK)
		return S_0089B0_OFFCHIP_BUFFERING(buffer_count);

	/* On VI and newer the field is programmed with count - 1. */
	if (device->physical_device->rad_info.chip_class >= VI)
		buffer_count--;

	return S_03093C_OFFCHIP_BUFFERING(buffer_count) |
	       S_03093C_OFFCHIP_GRANULARITY(granularity);
}
static VkResult
......@@ -1278,6 +1373,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
uint32_t compute_scratch_size,
uint32_t esgs_ring_size,
uint32_t gsvs_ring_size,
bool needs_tess_rings,
struct radeon_winsys_cs **initial_preamble_cs,
struct radeon_winsys_cs **continue_preamble_cs)
{
......@@ -1286,12 +1382,28 @@ radv_get_preamble_cs(struct radv_queue *queue,
struct radeon_winsys_bo *compute_scratch_bo = NULL;
struct radeon_winsys_bo *esgs_ring_bo = NULL;
struct radeon_winsys_bo *gsvs_ring_bo = NULL;
struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
struct radeon_winsys_cs *dest_cs[2] = {0};
bool add_tess_rings = false;
unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
unsigned max_offchip_buffers;
unsigned hs_offchip_param = 0;
if (!queue->has_tess_rings) {
if (needs_tess_rings)
add_tess_rings = true;
}
tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se;
hs_offchip_param = radv_get_hs_offchip_param(queue->device,
&max_offchip_buffers);
tess_offchip_ring_size = max_offchip_buffers *
queue->device->tess_offchip_block_dw_size * 4;
if (scratch_size <= queue->scratch_size &&
compute_scratch_size <= queue->compute_scratch_size &&
esgs_ring_size <= queue->esgs_ring_size &&
gsvs_ring_size <= queue->gsvs_ring_size &&
!add_tess_rings &&
queue->initial_preamble_cs) {
*initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
......@@ -1349,12 +1461,35 @@ radv_get_preamble_cs(struct radv_queue *queue,
gsvs_ring_size = queue->gsvs_ring_size;
}
if (add_tess_rings) {
tess_factor_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
tess_factor_ring_size,
256,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_CPU_ACCESS);
if (!tess_factor_ring_bo)
goto fail;
tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
tess_offchip_ring_size,
256,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_CPU_ACCESS);
if (!tess_offchip_ring_bo)
goto fail;
} else {
tess_factor_ring_bo = queue->tess_factor_ring_bo;
tess_offchip_ring_bo = queue->tess_offchip_ring_bo;
}
if (scratch_bo != queue->scratch_bo ||
esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo) {
gsvs_ring_bo != queue->gsvs_ring_bo ||
tess_factor_ring_bo != queue->tess_factor_ring_bo ||
tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
uint32_t size = 0;
if (gsvs_ring_bo || esgs_ring_bo)
size = 80; /* 2 dword + 2 padding + 4 dword * 4 */
if (gsvs_ring_bo || esgs_ring_bo ||
tess_factor_ring_bo || tess_offchip_ring_bo)
size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
else if (scratch_bo)
size = 8; /* 2 dword */
......@@ -1386,6 +1521,12 @@ radv_get_preamble_cs(struct radv_queue *queue,
if (gsvs_ring_bo)
queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
if (tess_factor_ring_bo)
queue->device->ws->cs_add_buffer(cs, tess_factor_ring_bo, 8);
if (tess_offchip_ring_bo)
queue->device->ws->cs_add_buffer(cs, tess_offchip_ring_bo, 8);
if (descriptor_bo)
queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
......@@ -1400,18 +1541,24 @@ radv_get_preamble_cs(struct radv_queue *queue,
map[1] = rsrc1;
}
if (esgs_ring_bo || gsvs_ring_bo)
fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo)
fill_geom_tess_rings(queue, map,
esgs_ring_size, esgs_ring_bo,
gsvs_ring_size, gsvs_ring_bo,
tess_factor_ring_size, tess_factor_ring_bo,
tess_offchip_ring_size, tess_offchip_ring_bo);
queue->device->ws->buffer_unmap(descriptor_bo);
}
if (esgs_ring_bo || gsvs_ring_bo) {
if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
}
if (esgs_ring_bo || gsvs_ring_bo) {
if (queue->device->physical_device->rad_info.chip_class >= CIK) {
radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
radeon_emit(cs, esgs_ring_size >> 8);
......@@ -1423,6 +1570,24 @@ radv_get_preamble_cs(struct radv_queue *queue,
}
}
if (tess_factor_ring_bo) {
uint64_t tf_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo);
if (queue->device->physical_device->rad_info.chip_class >= CIK) {
radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
S_030938_SIZE(tess_factor_ring_size / 4));
radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
tf_va >> 8);
radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
} else {
radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
S_008988_SIZE(tess_factor_ring_size / 4));
radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
tf_va >> 8);
radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
hs_offchip_param);
}
}
if (descriptor_bo) {
uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
R_00B130_SPI_SHADER_USER_DATA_VS_0,
......@@ -1504,6 +1669,15 @@ radv_get_preamble_cs(struct radv_queue *queue,
queue->gsvs_ring_size = gsvs_ring_size;
}
if (tess_factor_ring_bo != queue->tess_factor_ring_bo) {
queue->tess_factor_ring_bo = tess_factor_ring_bo;
}
if (tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
queue->tess_offchip_ring_bo = tess_offchip_ring_bo;
queue->has_tess_rings = true;
}
if (descriptor_bo != queue->descriptor_bo) {
if (queue->descriptor_bo)
queue->device->ws->buffer_destroy(queue->descriptor_bo);
......@@ -1530,6 +1704,10 @@ fail:
queue->device->ws->buffer_destroy(esgs_ring_bo);
if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
queue->device->ws->buffer_destroy(gsvs_ring_bo);
if (tess_factor_ring_bo && tess_factor_ring_bo != queue->tess_factor_ring_bo)
queue->device->ws->buffer_destroy(tess_factor_ring_bo);
if (tess_offchip_ring_bo && tess_offchip_ring_bo != queue->tess_offchip_ring_bo)
queue->device->ws->buffer_destroy(tess_offchip_ring_bo);
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
......@@ -1551,6 +1729,7 @@ VkResult radv_QueueSubmit(
struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
VkResult result;
bool fence_emitted = false;
bool tess_rings_needed = false;
/* Do this first so failing to allocate scratch buffers can't result in
* partially executed submissions. */
......@@ -1564,11 +1743,12 @@ VkResult radv_QueueSubmit(
cmd_buffer->compute_scratch_size_needed);
esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
tess_rings_needed |= cmd_buffer->tess_rings_needed;
}
}
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
esgs_ring_size, gsvs_ring_size,
esgs_ring_size, gsvs_ring_size, tess_rings_needed,
&initial_preamble_cs, &continue_preamble_cs);
if (result != VK_SUCCESS)
return result;
......
......@@ -459,12 +459,15 @@ struct radv_queue {
uint32_t compute_scratch_size;
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
bool has_tess_rings;
struct radeon_winsys_bo *scratch_bo;
struct radeon_winsys_bo *descriptor_bo;
struct radeon_winsys_bo *compute_scratch_bo;
struct radeon_winsys_bo *esgs_ring_bo;
struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_bo *tess_factor_ring_bo;
struct radeon_winsys_bo *tess_offchip_ring_bo;
struct radeon_winsys_cs *initial_preamble_cs;
struct radeon_winsys_cs *continue_preamble_cs;
};
......@@ -744,6 +747,7 @@ struct radv_cmd_buffer {
uint32_t compute_scratch_size_needed;
uint32_t esgs_ring_size_needed;
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
int ring_offsets_idx; /* just used for verification */
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment