diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 4ad73280320309cfbfe27734bfd0bcfdd9858bc6..e9df8c433d748bc48449e6e1dffcc4700c7b3673 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1027,7 +1027,8 @@ panfrost_analyze_sysvals(struct panfrost_shader_state *ss) unsigned dirty_shader = PAN_DIRTY_STAGE_RENDERER | PAN_DIRTY_STAGE_CONST; - for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { + for (unsigned i = 0; i < ss->info.sysvals.push_count + + ss->info.sysvals.ubo_count; ++i) { switch (PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[i])) { case PAN_SYSVAL_VIEWPORT_SCALE: case PAN_SYSVAL_VIEWPORT_OFFSET: @@ -1075,92 +1076,105 @@ panfrost_analyze_sysvals(struct panfrost_shader_state *ss) ss->dirty_shader = dirty_shader; } +static void +panfrost_upload_sysval(struct panfrost_batch *batch, + enum pipe_shader_type st, + int sysval, + struct sysval_uniform *uniform, + mali_ptr gpu) +{ + switch (PAN_SYSVAL_TYPE(sysval)) { + case PAN_SYSVAL_VIEWPORT_SCALE: + panfrost_upload_viewport_scale_sysval(batch, uniform); + break; + case PAN_SYSVAL_VIEWPORT_OFFSET: + panfrost_upload_viewport_offset_sysval(batch, uniform); + break; + case PAN_SYSVAL_TEXTURE_SIZE: + panfrost_upload_txs_sysval(batch, st, + PAN_SYSVAL_ID(sysval), + uniform); + break; + case PAN_SYSVAL_SSBO: + panfrost_upload_ssbo_sysval(batch, st, + PAN_SYSVAL_ID(sysval), + uniform); + break; + case PAN_SYSVAL_NUM_WORK_GROUPS: + for (unsigned j = 0; j < 3; j++) + batch->num_wg_sysval[j] = gpu + (j * 4); + panfrost_upload_num_work_groups_sysval(batch, uniform); + break; + case PAN_SYSVAL_LOCAL_GROUP_SIZE: + panfrost_upload_local_group_size_sysval(batch, uniform); + break; + case PAN_SYSVAL_WORK_DIM: + panfrost_upload_work_dim_sysval(batch, uniform); + break; + case PAN_SYSVAL_SAMPLER: + panfrost_upload_sampler_sysval(batch, st, + PAN_SYSVAL_ID(sysval), + uniform); + break; + case 
PAN_SYSVAL_IMAGE_SIZE: + panfrost_upload_image_size_sysval(batch, st, + PAN_SYSVAL_ID(sysval), + uniform); + break; + case PAN_SYSVAL_SAMPLE_POSITIONS: + panfrost_upload_sample_positions_sysval(batch, uniform); + break; + case PAN_SYSVAL_MULTISAMPLED: + panfrost_upload_multisampled_sysval(batch, uniform); + break; + case PAN_SYSVAL_RT_CONVERSION: + panfrost_upload_rt_conversion_sysval(batch, + PAN_SYSVAL_ID(sysval), + uniform); + break; + case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: + batch->ctx->first_vertex_sysval_ptr = gpu; + batch->ctx->base_vertex_sysval_ptr = + batch->ctx->first_vertex_sysval_ptr + 4; + batch->ctx->base_instance_sysval_ptr = + batch->ctx->first_vertex_sysval_ptr + 8; + + uniform->u[0] = batch->ctx->offset_start; + uniform->u[1] = batch->ctx->base_vertex; + uniform->u[2] = batch->ctx->base_instance; + break; + case PAN_SYSVAL_DRAWID: + uniform->u[0] = batch->ctx->drawid; + break; + default: + assert(0); + } +} + static void panfrost_upload_sysvals(struct panfrost_batch *batch, - const struct panfrost_ptr *ptr, + const struct panfrost_ptr *ptr_push, + const struct panfrost_ptr *ptr_ubo, struct panfrost_shader_state *ss, enum pipe_shader_type st) { - struct sysval_uniform *uniforms = ptr->cpu; + unsigned push_count = ss->info.sysvals.push_count; + unsigned ubo_count = ss->info.sysvals.ubo_count; + + struct sysval_uniform *uniforms = ptr_push->cpu; - for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { + for (unsigned i = 0; i < push_count; ++i) { int sysval = ss->info.sysvals.sysvals[i]; + mali_ptr gpu = ptr_push->gpu + (i * sizeof(*uniforms)); + panfrost_upload_sysval(batch, st, sysval, &uniforms[i], gpu); + } - switch (PAN_SYSVAL_TYPE(sysval)) { - case PAN_SYSVAL_VIEWPORT_SCALE: - panfrost_upload_viewport_scale_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_VIEWPORT_OFFSET: - panfrost_upload_viewport_offset_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_TEXTURE_SIZE: - panfrost_upload_txs_sysval(batch, st, - 
PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_SSBO: - panfrost_upload_ssbo_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_NUM_WORK_GROUPS: - for (unsigned j = 0; j < 3; j++) { - batch->num_wg_sysval[j] = - ptr->gpu + (i * sizeof(*uniforms)) + (j * 4); - } - panfrost_upload_num_work_groups_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_LOCAL_GROUP_SIZE: - panfrost_upload_local_group_size_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_WORK_DIM: - panfrost_upload_work_dim_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_SAMPLER: - panfrost_upload_sampler_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_IMAGE_SIZE: - panfrost_upload_image_size_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_SAMPLE_POSITIONS: - panfrost_upload_sample_positions_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_MULTISAMPLED: - panfrost_upload_multisampled_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_RT_CONVERSION: - panfrost_upload_rt_conversion_sysval(batch, - PAN_SYSVAL_ID(sysval), &uniforms[i]); - break; - case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: - batch->ctx->first_vertex_sysval_ptr = - ptr->gpu + (i * sizeof(*uniforms)); - batch->ctx->base_vertex_sysval_ptr = - batch->ctx->first_vertex_sysval_ptr + 4; - batch->ctx->base_instance_sysval_ptr = - batch->ctx->first_vertex_sysval_ptr + 8; - - uniforms[i].u[0] = batch->ctx->offset_start; - uniforms[i].u[1] = batch->ctx->base_vertex; - uniforms[i].u[2] = batch->ctx->base_instance; - break; - case PAN_SYSVAL_DRAWID: - uniforms[i].u[0] = batch->ctx->drawid; - break; - default: - assert(0); - } + uniforms = ptr_ubo->cpu; + + for (unsigned i = 0; i < ubo_count; ++i) { + int sysval = ss->info.sysvals.sysvals[push_count + i]; + mali_ptr gpu = ptr_ubo->gpu + (i * sizeof(*uniforms)); + panfrost_upload_sysval(batch, st, sysval, &uniforms[i], gpu); } } @@ -1198,18 
+1212,16 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage]; struct panfrost_shader_state *ss = &all->variants[all->active_variant]; - /* Allocate room for the sysval and the uniforms */ - size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; - struct panfrost_ptr transfer = - pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); - - /* Upload sysvals requested by the shader */ - panfrost_upload_sysvals(batch, &transfer, ss, stage); + /* Allocate room for the sysvals and the uniforms */ + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.ubo_count; + struct panfrost_ptr sysval_ptr = {0}; + if (sys_size) + sysval_ptr = pan_pool_alloc_aligned(&batch->pool.base, + sys_size, 16); /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage); unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0); - unsigned sysval_ubo = sys_size ? 
ubo_count : ~0; struct panfrost_ptr ubos = pan_pool_alloc_desc_array(&batch->pool.base, @@ -1218,16 +1230,7 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, uint64_t *ubo_ptr = (uint64_t *) ubos.cpu; - /* Upload sysval as a final UBO */ - - if (sys_size) { - pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) { - cfg.entries = DIV_ROUND_UP(sys_size, 16); - cfg.pointer = transfer.gpu; - } - } - - /* The rest are honest-to-goodness UBOs */ + /* Upload all used UBOs other than sysvals */ u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) { size_t usz = buf->cb[ubo].buffer_size; @@ -1248,64 +1251,42 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, } } - if (ss->info.push.count == 0) - return ubos.gpu; - /* Copy push constants required by the shader */ - struct panfrost_ptr push_transfer = - pan_pool_alloc_aligned(&batch->pool.base, - ss->info.push.count * 4, 16); + struct panfrost_ptr push_transfer = {0}; + if (ss->info.push.count) + push_transfer = pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16); uint32_t *push_cpu = (uint32_t *) push_transfer.cpu; *push_constants = push_transfer.gpu; - for (unsigned i = 0; i < ss->info.push.count; ++i) { - struct panfrost_ubo_word src = ss->info.push.words[i]; - - if (src.ubo == sysval_ubo) { - unsigned sysval_idx = src.offset / 16; - unsigned sysval_comp = (src.offset % 16) / 4; - unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]); - mali_ptr ptr = push_transfer.gpu + (4 * i); - - switch (sysval_type) { - case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: - switch (sysval_comp) { - case 0: - batch->ctx->first_vertex_sysval_ptr = ptr; - break; - case 1: - batch->ctx->base_vertex_sysval_ptr = ptr; - break; - case 2: - batch->ctx->base_instance_sysval_ptr = ptr; - break; - case 3: - /* Spurious (Midgard doesn't pack) */ - break; - default: - unreachable("Invalid vertex/instance offset component\n"); - } - break; - - case PAN_SYSVAL_NUM_WORK_GROUPS: - 
batch->num_wg_sysval[sysval_comp] = ptr; - break; + /* Upload sysvals requested by the shader */ + panfrost_upload_sysvals(batch, &push_transfer, &sysval_ptr, ss, stage); - default: - break; - } + /* Upload sysval as a final UBO */ + if (sys_size) { + unsigned sysval_push = ss->info.sysvals.push_count; + pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) { + cfg.entries = DIV_ROUND_UP(sys_size, 16) + sysval_push; + cfg.pointer = sysval_ptr.gpu - sysval_push * 16; } + } + + unsigned sys_push = ss->info.sysvals.push_count * 4; + push_cpu += sys_push; + + for (unsigned i = sys_push ? 1 : 0; i < ss->info.push.num_ranges; ++i) { + struct panfrost_ubo_range src = ss->info.push.ranges[i]; + /* Map the UBO, this should be cheap. However this is reading * from write-combine memory which is _very_ slow. It might pay - * off to upload sysvals to a staging buffer on the CPU on the - * assumption sysvals will get pushed (TODO) */ + * off to upload UBOs to a staging buffer on the CPU on the + * assumption they will get pushed. */ + + const void *mapped_ubo = panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); - const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : - panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + memcpy(push_cpu, (uint8_t *) mapped_ubo + src.offset, src.size * 4); - /* TODO: Is there any benefit to combining ranges */ - memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); + push_cpu += src.size; } return ubos.gpu; diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c index caaa5ee872fd065c5b993c528a7d01bfbdb78b95..57c4e576792faed2fd404f8f7cc7a507572a15a0 100644 --- a/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -91,8 +91,12 @@ bi_analyze_ranges(bi_context *ctx) * sophisticated. Select from the last UBO first to prioritize sysvals. 
*/ static void -bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis) +bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis, + unsigned sysval_ubo) { + /* The sysval push range must be first */ + assert(sysval_ubo == analysis->nr_blocks - 1); + for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { struct bi_ubo_block *block = &analysis->blocks[ubo]; @@ -102,18 +106,17 @@ bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis) /* Don't push something we don't access */ if (range == 0) continue; + /* We want a single push range for sysvals, pretend + * there are no holes between sysvals */ + if (ubo == sysval_ubo) + range = 4; + /* Don't push more than possible */ if (push->count > PAN_MAX_PUSH - range) return; - for (unsigned offs = 0; offs < range; ++offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = (r + offs) * 4 - }; - - push->words[push->count++] = word; - } + for (unsigned offs = 0; offs < range; ++offs) + pan_add_pushed_ubo(push, ubo, (r + offs) * 4); /* Mark it as pushed so we can rewrite */ BITSET_SET(block->pushed, r); @@ -133,8 +136,13 @@ bi_opt_push_ubo(bi_context *ctx) /* This pass only runs once */ assert(ctx->info->push.count == 0); + unsigned sysval_ubo = + MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos); + struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); - bi_pick_ubo(&ctx->info->push, &analysis); + bi_pick_ubo(&ctx->info->push, &analysis, sysval_ubo); + + pan_set_sysval_push(ctx->info, sysval_ubo); ctx->ubo_mask = 0; diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index b3b7192eb4fe1231384ae0b8a3f0d6c4cd329011..28f7427d426ea3964311db69f8ba0dfd533f4a62 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -378,7 +378,7 @@ bi_make_vec_to(bi_builder *b, bi_index final_dst, } } -static bi_instr * +static void bi_load_sysval_to(bi_builder *b, bi_index 
dest, int sysval, unsigned nr_components, unsigned offset) { @@ -388,11 +388,18 @@ bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, pan_lookup_sysval(b->shader->sysval_to_id, &b->shader->info->sysvals, sysval); - unsigned idx = (uniform * 16) + offset; + unsigned idx = uniform * 16; + + unsigned offset_words = offset / 4; + + bi_index load_dest = bi_temp(b->shader); + bi_load_to(b, (nr_components + offset_words) * 32, load_dest, + bi_imm_u32(idx), + bi_imm_u32(sysval_ubo), BI_SEG_UBO); - return bi_load_to(b, nr_components * 32, dest, - bi_imm_u32(idx), - bi_imm_u32(sysval_ubo), BI_SEG_UBO); + for (unsigned i = 0; i < nr_components; ++i) + bi_mov_i32_to(b, bi_word(dest, i), + bi_word(load_dest, i + offset_words)); } static void diff --git a/src/panfrost/lib/pan_indirect_dispatch.c b/src/panfrost/lib/pan_indirect_dispatch.c index 79e055bece8bb7703b9ec4abfd86f8db34bffd74..32bce553622937e4c0c2586330f3cd8a039df292 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.c +++ b/src/panfrost/lib/pan_indirect_dispatch.c @@ -105,8 +105,11 @@ get_push_uniforms(struct pan_pool *pool, uint32_t *out = push_consts_buf.cpu; uint8_t *in = (uint8_t *)inputs; - for (unsigned i = 0; i < dev->indirect_dispatch.push.count; ++i) - memcpy(out + i, in + dev->indirect_dispatch.push.words[i].offset, 4); + for (unsigned i = 0; i < dev->indirect_dispatch.push.num_ranges; ++i) { + struct panfrost_ubo_range src = dev->indirect_dispatch.push.ranges[i]; + memcpy(out, in + src.offset, src.size * 4); + out += src.size; + } return push_consts_buf.gpu; } @@ -232,7 +235,8 @@ pan_indirect_dispatch_init(struct panfrost_device *dev) assert(!shader_info.tls_size); assert(!shader_info.wls_size); - assert(!shader_info.sysvals.sysval_count); + assert(!shader_info.sysvals.push_count); + assert(!shader_info.sysvals.ubo_count); dev->indirect_dispatch.bin = panfrost_bo_create(dev, binary.size, PAN_BO_EXECUTE, diff --git a/src/panfrost/lib/pan_indirect_draw.c b/src/panfrost/lib/pan_indirect_draw.c index 
39eef07b05767179798bda7a6b2db0cac7b0bb51..51dc09d619782f51422fd9c835b9beebd1cb15fc 100644 --- a/src/panfrost/lib/pan_indirect_draw.c +++ b/src/panfrost/lib/pan_indirect_draw.c @@ -1085,7 +1085,8 @@ create_indirect_draw_shader(struct panfrost_device *dev, assert(!shader_info.tls_size); assert(!shader_info.wls_size); - assert(!shader_info.sysvals.sysval_count); + assert(!shader_info.sysvals.push_count); + assert(!shader_info.sysvals.ubo_count); unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search); struct pan_indirect_draw_shader *draw_shader = @@ -1175,8 +1176,11 @@ get_push_uniforms(struct pan_pool *pool, uint32_t *out = push_consts_buf.cpu; uint8_t *in = (uint8_t *)inputs; - for (unsigned i = 0; i < shader->push.count; ++i) - memcpy(out + i, in + shader->push.words[i].offset, 4); + for (unsigned i = 0; i < shader->push.num_ranges; ++i) { + struct panfrost_ubo_range src = shader->push.ranges[i]; + memcpy(out, in + src.offset, src.size * 4); + out += src.size; + } return push_consts_buf.gpu; } diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c index f1d4d4402da5019396e120bbfc1fafb4cf545a88..a80d5ba586ff701a520d1db37b78c98517a5af67 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -277,7 +277,7 @@ pan_shader_compile(const struct panfrost_device *dev, info->outputs_written = s->info.outputs_written; /* Sysvals have dedicated UBO */ - if (info->sysvals.sysval_count) + if (info->sysvals.ubo_count) info->ubo_count = MAX2(s->info.num_ubos + 1, inputs->sysval_ubo + 1); else info->ubo_count = s->info.num_ubos; diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c index 704cfc874662c616535c5373b65d9b0dbb403ec8..e4e706d647a27dda9a65a898fe5f67b3b72d659f 100644 --- a/src/panfrost/midgard/midgard_ra.c +++ b/src/panfrost/midgard/midgard_ra.c @@ -1042,7 +1042,10 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) unsigned idx = (23 - 
SSA_REG_FROM_FIXED(ins->src[i])) * 4; assert(idx < ctx->info->push.count); - ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo); + struct panfrost_ubo_range word = + pan_index_pushed_ubo(&ctx->info->push, idx); + + ctx->ubo_mask |= BITSET_BIT(word.ubo); midgard_instruction ld = { .type = TAG_LOAD_STORE_4, @@ -1055,11 +1058,10 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) .load_store = { .index_reg = REGISTER_LDST_ZERO, }, - .constants.u32[0] = ctx->info->push.words[idx].offset + .constants.u32[0] = word.offset }; - midgard_pack_ubo_index_imm(&ld.load_store, - ctx->info->push.words[idx].ubo); + midgard_pack_ubo_index_imm(&ld.load_store, word.ubo); mir_insert_instruction_before_scheduled(ctx, block, before, ld); @@ -1068,7 +1070,25 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) } } - ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4); + unsigned new_push_count = new_cutoff * 4; + if (new_push_count && new_push_count < ctx->info->push.count) { + struct panfrost_ubo_range last_pushed = + pan_index_pushed_ubo(&ctx->info->push, new_push_count - 1); + + ctx->info->push.num_ranges = last_pushed.index + 1; + + ctx->info->push.ranges[last_pushed.index].size = + (last_pushed.offset - + ctx->info->push.ranges[last_pushed.index].offset) / 4 + 1; + } + + ctx->info->push.count = MIN2(ctx->info->push.count, new_push_count); + + unsigned push_sysvals = ctx->info->sysvals.push_count; + if (push_sysvals > new_cutoff) { + ctx->info->sysvals.ubo_count += push_sysvals - new_cutoff; + ctx->info->sysvals.push_count = new_cutoff; + } } /* Run register allocation in a loop, spilling until we succeed */ diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c index f987b7f17fd4f9f1cfaeed25b05233ff79179040..f1b78ccba285deaf3fc2049b8b8ec00ac8b71f2f 100644 --- a/src/panfrost/midgard/midgard_schedule.c +++ b/src/panfrost/midgard/midgard_schedule.c @@ -978,7 +978,7 @@ 
mir_schedule_condition(compiler_context *ctx, midgard_instruction *cond = mir_schedule_comparison( ctx, instructions, predicate, worklist, count, last->src[condition_index], - vector, last->swizzle[2], last); + vector, last->swizzle[condition_index], last); /* We have exclusive reign over this (possibly move) conditional * instruction. We can rewrite into a pipeline conditional register */ diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c index 5d7045a9c799d8802d03d6efe0dcf64086a6aaeb..7fd1a08b37b8b700b7e5390e660b354f8226af63 100644 --- a/src/panfrost/midgard/mir_promote_uniforms.c +++ b/src/panfrost/midgard/mir_promote_uniforms.c @@ -46,8 +46,14 @@ mir_is_ubo(midgard_instruction *ins) static bool mir_is_direct_aligned_ubo(midgard_instruction *ins) { - return mir_is_ubo(ins) && - !(ins->constants.u32[0] & 0xF) && + if (!mir_is_ubo(ins)) + return false; + + unsigned offset = ins->constants.u32[0]; + unsigned size = util_logbase2_ceil(mir_bytemask(ins)); + + return !(offset & 0x3) && + ((offset & 0xf) + size <= 16) && (ins->src[1] == ~0) && (ins->src[2] == ~0); } @@ -96,10 +102,13 @@ mir_analyze_ranges(compiler_context *ctx) * sophisticated. Select from the last UBO first to prioritize sysvals. 
*/ static void -mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords) +mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords, unsigned sysval_ubo) { unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4); + /* The sysval push range must be first */ + assert(sysval_ubo == analysis->nr_blocks - 1); + for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { struct mir_ubo_block *block = &analysis->blocks[ubo]; @@ -109,14 +118,8 @@ mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, if (push->count > max_words - 4) return; - for (unsigned offs = 0; offs < 4; ++offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = (vec4 * 16) + (offs * 4) - }; - - push->words[push->count++] = word; - } + for (unsigned offs = 0; offs < 4; ++offs) + pan_add_pushed_ubo(push, ubo, (vec4 * 16) + (offs * 4)); /* Mark it as pushed so we can rewrite */ BITSET_SET(block->pushed, vec4); @@ -276,10 +279,15 @@ midgard_promote_uniforms(compiler_context *ctx) unsigned work_count = mir_work_heuristic(ctx, &analysis); unsigned promoted_count = 24 - work_count; + unsigned sysval_ubo = + MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos); + /* Ensure we are 16 byte aligned to avoid underallocations */ - mir_pick_ubo(&ctx->info->push, &analysis, promoted_count); + mir_pick_ubo(&ctx->info->push, &analysis, promoted_count, sysval_ubo); ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4); + pan_set_sysval_push(ctx->info, sysval_ubo); + /* First, figure out special indices a priori so we don't recompute a lot */ BITSET_WORD *special = mir_special_indices(ctx); @@ -290,6 +298,7 @@ midgard_promote_uniforms(compiler_context *ctx) unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); unsigned qword = ins->constants.u32[0] / 16; + unsigned offset = ins->constants.u32[0] % 16; if (!mir_is_direct_aligned_ubo(ins)) { if (ins->src[1] == ~0) @@ -318,6 +327,13 
@@ midgard_promote_uniforms(compiler_context *ctx) assert(address < promoted_count); unsigned promoted = SSA_FIXED_REGISTER(uniform_reg); + unsigned comps = mir_components_for_type(ins->dest_type); + unsigned swz_offset = offset * comps / 16; + + unsigned swizzle[MIR_VEC_COMPONENTS]; + for (unsigned i = 0; i < comps - swz_offset; ++i) + swizzle[i] = i + swz_offset; + /* We do need the move for safety for a non-SSA dest, or if * we're being fed into a special class */ @@ -331,12 +347,13 @@ midgard_promote_uniforms(compiler_context *ctx) midgard_instruction mov = v_mov(promoted, ins->dest); mov.dest_type = nir_type_uint | type_size; mov.src_types[1] = mov.dest_type; + memcpy(mov.swizzle[1], swizzle, sizeof(swizzle)); uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size); mir_set_bytemask(&mov, rounded); mir_insert_instruction_before(ctx, ins, mov); } else { - mir_rewrite_index_src(ctx, ins->dest, promoted); + mir_rewrite_index_src_swizzle(ctx, ins->dest, promoted, swizzle); } mir_remove_instruction(ins); diff --git a/src/panfrost/util/pan_ir.c b/src/panfrost/util/pan_ir.c index c469274933fb062363efa0fd95fee4b948d2bb27..15dae46135bbac88c2ac1f7ac6c3e4b1ba0d6157 100644 --- a/src/panfrost/util/pan_ir.c +++ b/src/panfrost/util/pan_ir.c @@ -135,16 +135,87 @@ pan_print_alu_type(nir_alu_type t, FILE *fp) unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs) { - struct panfrost_ubo_word word = { + unsigned count = 0; + + for (unsigned i = 0; i < push->num_ranges; ++i) { + struct panfrost_ubo_range range = push->ranges[i]; + + count += range.size; + + if (range.ubo != ubo) + continue; + + unsigned start = range.offset; + unsigned end = range.offset + (range.size * 4); + + if (offs >= start && offs < end) + return count - (end - offs) / 4; + } + + unreachable("UBO not pushed"); + +} + +void +pan_add_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs) +{ + ++push->count; + + if (push->num_ranges) { + 
struct panfrost_ubo_range *prev; + prev = &push->ranges[push->num_ranges - 1]; + + if (prev->ubo == ubo && + prev->offset + prev->size * 4 == offs) { + ++prev->size; + return; + } + } + + struct panfrost_ubo_range range = { .ubo = ubo, - .offset = offs + .size = 1, + .offset = offs, }; - for (unsigned i = 0; i < push->count; ++i) { - if (memcmp(push->words + i, &word, sizeof(word)) == 0) - return i; + push->ranges[push->num_ranges++] = range; +} + +struct panfrost_ubo_range +pan_index_pushed_ubo(struct panfrost_ubo_push *push, unsigned push_word) +{ + assert(push_word < push->count); + + for (unsigned i = 0; i < push->num_ranges; ++i) { + struct panfrost_ubo_range range = push->ranges[i]; + + if (range.size > push_word) { + return (struct panfrost_ubo_range){ + .ubo = range.ubo, + .offset = range.offset + push_word * 4, + .index = i, + }; + } + + push_word -= range.size; } - unreachable("UBO not pushed"); + unreachable("Invalid panfrost_ubo_push state"); +} + +void +pan_set_sysval_push(struct pan_shader_info *info, unsigned sysval_ubo) +{ + if (info->push.num_ranges && + info->push.ranges[0].ubo == sysval_ubo) { + + unsigned pushed_sysvals = info->push.ranges[0].size / 4; + info->sysvals.ubo_count -= pushed_sysvals; + info->sysvals.push_count += pushed_sysvals; + + /* The sysval upload code can only handle a single range */ + assert(!(info->push.num_ranges > 1 && + info->push.ranges[1].ubo == sysval_ubo)); + } } diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 1ff240b307052bd9fa7556b4682acaf2c8c474f2..ef425e4c0d03887394262ed91625bb78372aafe7 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -77,9 +77,11 @@ enum { }; struct panfrost_sysvals { - /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ + /* The mapping of sysvals to uniforms, and the count of pushed and ubo + * sysvals */ unsigned sysvals[MAX_SYSVAL_COUNT]; - unsigned sysval_count; + unsigned push_count; + unsigned ubo_count; 
}; /* Technically Midgard could go up to 92 in a pathological case but we don't @@ -91,14 +93,19 @@ struct panfrost_sysvals { /* Architectural invariants (Midgard and Bifrost): UBO must be <= 2^16 bytes so * an offset to a word must be < 2^16. There are less than 2^8 UBOs */ -struct panfrost_ubo_word { - uint16_t ubo; +struct panfrost_ubo_range { + uint8_t ubo; + union { + uint8_t size; + uint8_t index; + }; uint16_t offset; }; struct panfrost_ubo_push { unsigned count; - struct panfrost_ubo_word words[PAN_MAX_PUSH]; + unsigned num_ranges; + struct panfrost_ubo_range ranges[PAN_MAX_PUSH]; }; /* Helper for searching the above. Note this is O(N) to the number of pushed @@ -107,6 +114,15 @@ struct panfrost_ubo_push { unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); +void +pan_add_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); + +/* Get a panfrost_ubo_range struct corresponding to the index in the push + * constants buffer, with the range index pointing to the range. 
*/ + +struct panfrost_ubo_range +pan_index_pushed_ubo(struct panfrost_ubo_push *push, unsigned push_word); + struct hash_table_u64 * panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx); @@ -334,6 +350,8 @@ void pan_print_alu_type(nir_alu_type t, FILE *fp); bool pan_has_source_mod(nir_alu_src *src, nir_op op); bool pan_has_dest_mod(nir_dest **dest, nir_op op); +void pan_set_sysval_push(struct pan_shader_info *info, unsigned sysval_ubo); + /* NIR passes to do some backend-specific lowering */ #define PAN_WRITEOUT_C 1 diff --git a/src/panfrost/util/pan_sysval.c b/src/panfrost/util/pan_sysval.c index 80a509f5b8b007c0fee61d68bb59c74e7fa0982a..4bdbceb1f446f3478181871bfeac6c7ad785951b 100644 --- a/src/panfrost/util/pan_sysval.c +++ b/src/panfrost/util/pan_sysval.c @@ -146,7 +146,7 @@ pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, /* Else assign */ - unsigned id = sysvals->sysval_count++; + unsigned id = sysvals->ubo_count++; assert(id < MAX_SYSVAL_COUNT); _mesa_hash_table_u64_insert(sysval_to_id, sysval, (void *) ((uintptr_t) id + 1)); sysvals->sysvals[id] = sysval; @@ -157,6 +157,7 @@ pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, struct hash_table_u64 * panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx) { - sysvals->sysval_count = 0; + sysvals->push_count = 0; + sysvals->ubo_count = 0; return _mesa_hash_table_u64_create(memctx); } diff --git a/src/panfrost/vulkan/panvk_cmd_buffer.c b/src/panfrost/vulkan/panvk_cmd_buffer.c index 3da978b4837bdb81cfbb48c5e6a0c9eccf8e8029..5441224413845607fca08c599df156b16b770525 100644 --- a/src/panfrost/vulkan/panvk_cmd_buffer.c +++ b/src/panfrost/vulkan/panvk_cmd_buffer.c @@ -984,7 +984,7 @@ panvk_cmd_prepare_sysvals(struct panvk_cmd_buffer *cmdbuf) return; for (unsigned i = 0; i < ARRAY_SIZE(desc_state->sysvals); i++) { - unsigned sysval_count = pipeline->sysvals[i].ids.sysval_count; + unsigned sysval_count = pipeline->sysvals[i].ids.ubo_count; if (!sysval_count || 
(desc_state->sysvals[i] && !(cmdbuf->state.dirty & pipeline->sysvals[i].dirty_mask))) @@ -994,7 +994,7 @@ panvk_cmd_prepare_sysvals(struct panvk_cmd_buffer *cmdbuf) pan_pool_alloc_aligned(&cmdbuf->desc_pool.base, sysval_count * 16, 16); union panvk_sysval_data *data = sysvals.cpu; - for (unsigned s = 0; s < pipeline->sysvals[i].ids.sysval_count; s++) { + for (unsigned s = 0; s < pipeline->sysvals[i].ids.ubo_count; s++) { panvk_cmd_upload_sysval(cmdbuf, pipeline->sysvals[i].ids.sysvals[s], &data[s]); } diff --git a/src/panfrost/vulkan/panvk_cs.c b/src/panfrost/vulkan/panvk_cs.c index aa8dd5a286dd6083449c7b1b825e174401a2a374..495fc9846da987b9b6152560b30ff8a5f4fba405 100644 --- a/src/panfrost/vulkan/panvk_cs.c +++ b/src/panfrost/vulkan/panvk_cs.c @@ -264,13 +264,13 @@ panvk_emit_ubos(const struct panvk_pipeline *pipeline, } for (unsigned i = 0; i < ARRAY_SIZE(pipeline->sysvals); i++) { - if (!pipeline->sysvals[i].ids.sysval_count) + if (!pipeline->sysvals[i].ids.ubo_count) continue; pan_pack(&ubos[pipeline->sysvals[i].ubo_idx], UNIFORM_BUFFER, cfg) { cfg.pointer = pipeline->sysvals[i].ubo ? 
: state->sysvals[i]; - cfg.entries = pipeline->sysvals[i].ids.sysval_count; + cfg.entries = pipeline->sysvals[i].ids.ubo_count; } } } diff --git a/src/panfrost/vulkan/panvk_pipeline.c b/src/panfrost/vulkan/panvk_pipeline.c index e51e9ec9464822c37ba2bb2fe834787d2a92c31c..7e55a5666586edec62292b0a15b262866895e609 100644 --- a/src/panfrost/vulkan/panvk_pipeline.c +++ b/src/panfrost/vulkan/panvk_pipeline.c @@ -136,7 +136,7 @@ panvk_pipeline_builder_compile_shaders(struct panvk_pipeline_builder *builder, if (!shader) return VK_ERROR_OUT_OF_HOST_MEMORY; - if (shader->info.sysvals.sysval_count) + if (shader->info.sysvals.ubo_count) sysval_ubo++; builder->shaders[stage] = shader; @@ -219,11 +219,11 @@ panvk_pipeline_builder_alloc_static_state_bo(struct panvk_pipeline_builder *buil for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { const struct panvk_shader *shader = builder->shaders[i]; - if (!shader || !shader->info.sysvals.sysval_count) + if (!shader || !shader->info.sysvals.ubo_count) continue; bool static_sysvals = true; - for (unsigned s = 0; s < shader->info.sysvals.sysval_count; s++) { + for (unsigned s = 0; s < shader->info.sysvals.ubo_count; s++) { unsigned id = shader->info.sysvals.sysvals[i]; static_sysvals &= panvk_pipeline_static_sysval(pipeline, id); switch (PAN_SYSVAL_TYPE(id)) { @@ -243,7 +243,7 @@ panvk_pipeline_builder_alloc_static_state_bo(struct panvk_pipeline_builder *buil bo_size = ALIGN_POT(bo_size, 16); builder->stages[i].sysvals_offset = bo_size; - bo_size += shader->info.sysvals.sysval_count * 16; + bo_size += shader->info.sysvals.ubo_count * 16; } if (bo_size) { @@ -282,7 +282,7 @@ panvk_pipeline_builder_init_sysvals(struct panvk_pipeline_builder *builder, pipeline->sysvals[stage].ids = shader->info.sysvals; pipeline->sysvals[stage].ubo_idx = shader->sysval_ubo; - if (!shader->info.sysvals.sysval_count || + if (!shader->info.sysvals.ubo_count || builder->stages[stage].sysvals_offset == ~0) return; @@ -292,7 +292,7 @@ 
panvk_pipeline_builder_init_sysvals(struct panvk_pipeline_builder *builder, pipeline->sysvals[stage].ubo = pipeline->state_bo->ptr.gpu + builder->stages[stage].sysvals_offset; - for (unsigned i = 0; i < shader->info.sysvals.sysval_count; i++) { + for (unsigned i = 0; i < shader->info.sysvals.ubo_count; i++) { unsigned id = shader->info.sysvals.sysvals[i]; panvk_pipeline_builder_upload_sysval(builder, @@ -347,13 +347,13 @@ panvk_pipeline_builder_init_shaders(struct panvk_pipeline_builder *builder, pipeline->num_ubos = builder->layout->num_ubos; for (unsigned i = 0; i < ARRAY_SIZE(pipeline->sysvals); i++) { - if (pipeline->sysvals[i].ids.sysval_count) + if (pipeline->sysvals[i].ids.ubo_count) pipeline->num_ubos = MAX2(pipeline->num_ubos, pipeline->sysvals[i].ubo_idx + 1); } pipeline->num_sysvals = 0; for (unsigned i = 0; i < ARRAY_SIZE(pipeline->sysvals); i++) - pipeline->num_sysvals += pipeline->sysvals[i].ids.sysval_count; + pipeline->num_sysvals += pipeline->sysvals[i].ids.ubo_count; } diff --git a/src/panfrost/vulkan/panvk_shader.c b/src/panfrost/vulkan/panvk_shader.c index 756ad24e011a55580ea5d306199eb8c51be9f529..4d7fcc4f51abb5c700a8d8026ff95e5b7c727392 100644 --- a/src/panfrost/vulkan/panvk_shader.c +++ b/src/panfrost/vulkan/panvk_shader.c @@ -402,7 +402,7 @@ panvk_shader_create(struct panvk_device *dev, /* Patch the descriptor count */ shader->info.ubo_count = - shader->info.sysvals.sysval_count ? sysval_ubo + 1 : layout->num_ubos; + shader->info.sysvals.ubo_count ? sysval_ubo + 1 : layout->num_ubos; shader->info.sampler_count = layout->num_samplers; shader->info.texture_count = layout->num_textures;