Commit 85e745f2 authored by Alyssa Rosenzweig 💜

panfrost: Integrate kernel names for tiler FBD

These names are from the replay workaround in kbase; they begin to shine
some light on the meaning of these fields. In particular, we now
understand why the "tiler_meta" field has the effect it does on
performance in certain scenes (controlling tile granularity).
Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
parent 1a7caac9
......@@ -2,6 +2,7 @@
* © Copyright 2017-2018 Alyssa Rosenzweig
* © Copyright 2017-2018 Connor Abbott
* © Copyright 2017-2018 Lyude Paul
* © Copyright 2019 Collabora
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
......@@ -1362,16 +1363,16 @@ struct mali_single_framebuffer {
u32 zero6[7];
/* Very weird format, see generation code in trans_builder.c */
u32 resolution_check;
u32 tiler_resolution_check;
u32 tiler_flags;
u64 unknown_address_1; /* Pointing towards... a zero buffer? */
u64 unknown_address_2;
/* Guesses? */
mali_ptr tiler_scratch_start; /* Pointing towards... a zero buffer? */
mali_ptr tiler_scratch_middle;
/* See mali_kbase_replay.c */
u64 tiler_heap_free;
u64 tiler_heap_end;
mali_ptr tiler_heap_free;
mali_ptr tiler_heap_end;
/* More below this, maybe */
} __attribute__((packed));
......@@ -1519,18 +1520,29 @@ struct bifrost_framebuffer {
u32 clear_stencil : 8;
u32 unk3 : 24; // = 0x100
float clear_depth;
mali_ptr tiler_meta;
/* 0x40 */
/* Tiler section begins here */
u32 tiler_unknown;
/* Name known from the replay workaround in the kernel. What exactly is
* flagged here is less known. We do know that (tiler_flags & 0x1ff)
* specifies a mask of hierarchy weights, which explains some of the
* performance mysteries around setting it. We also know (1 << 16)
* should be set, but there's no explanation in the kernel why. */
u32 tiler_flags;
/* Note: these are guesses! */
mali_ptr tiler_scratch_start;
mali_ptr tiler_scratch_middle;
/* These are not, since we see symmetry with replay jobs which name these explicitly */
mali_ptr tiler_heap_start;
/* These are not, since we see symmetry with replay
* jobs which name these explicitly */
mali_ptr tiler_heap_start; /* tiler heap_free_address */
mali_ptr tiler_heap_end;
u64 zero9, zero10, zero11, zero12;
u32 tiler_weights[8];
/* optional: struct bifrost_fb_extra extra */
/* struct bifrost_render_target rts[] */
......
......@@ -107,7 +107,7 @@ panfrost_set_framebuffer_resolution(struct mali_single_framebuffer *fb, int w, i
* The formula itself was discovered mostly by manual bruteforce and
* aggressive algebraic simplification. */
fb->resolution_check = ((w + h) / 3) << 4;
fb->tiler_resolution_check = ((w + h) / 3) << 4;
}
struct mali_single_framebuffer
......@@ -118,8 +118,8 @@ panfrost_emit_sfbd(struct panfrost_context *ctx)
.format = 0x30000000,
.clear_flags = 0x1000,
.unknown_address_0 = ctx->scratchpad.gpu,
.unknown_address_1 = ctx->misc_0.gpu,
.unknown_address_2 = ctx->misc_0.gpu + 40960,
.tiler_scratch_start = ctx->misc_0.gpu,
.tiler_scratch_middle = ctx->misc_0.gpu + 40960,
.tiler_flags = 0xf0,
.tiler_heap_free = ctx->tiler_heap.gpu,
.tiler_heap_end = ctx->tiler_heap.gpu + ctx->tiler_heap.size,
......@@ -134,28 +134,22 @@ struct bifrost_framebuffer
panfrost_emit_mfbd(struct panfrost_context *ctx)
{
struct bifrost_framebuffer framebuffer = {
/* It is not yet clear what tiler_meta means or how it's
* calculated, but we can tell the lower 32-bits are a
* (monotonically increasing?) function of tile count and
* geometry complexity; I suspect it defines a memory size of
* some kind? for the tiler. It's really unclear at the
* moment... but to add to the confusion, the hardware is happy
* enough to accept a zero in this field, so we don't even have
* to worry about it right now.
*
* The byte (just after the 32-bit mark) is much more
* interesting. The higher nibble I've only ever seen as 0xF,
* but the lower one I've seen as 0x0 or 0xF, and it's not
* obvious what the difference is. But what -is- obvious is
* that when the lower nibble is zero, performance is severely
* degraded compared to when the lower nibble is set.
* Evidently, that nibble enables some sort of fast path,
* perhaps relating to caching or tile flush? Regardless, at
* this point there's no clear reason not to set it, aside from
* substantially increased memory requirements (of the misc_0
* buffer) */
.tiler_meta = ((uint64_t) 0xff << 32) | 0x0,
/* It is not yet clear what this means or how it's
* calculated, but we can tell it is a (monotonically
* increasing?) function of tile count and geometry complexity;
* I suspect it defines a memory size of some kind? for the
* tiler. It's really unclear at the moment... but to add to
* the confusion, the hardware is happy enough to accept a zero
* in this field, so we don't even have to worry about it right
* now. */
.tiler_unknown = 0x0,
/* The lower 0xff controls the hierarchy mask. Set more bits
* on for more tile granularity (which can be a performance win
* on some scenes, at memory bandwidth costs). For now, be lazy
* and enable everything. This might be a terrible idea. */
.tiler_flags = 0xff,
.width1 = MALI_POSITIVE(ctx->pipe_framebuffer.width),
.height1 = MALI_POSITIVE(ctx->pipe_framebuffer.height),
......
......@@ -463,10 +463,10 @@ pandecode_replay_sfbd(uint64_t gpu_va, int job_no)
}
MEMORY_PROP(s, unknown_address_0);
MEMORY_PROP(s, unknown_address_1);
MEMORY_PROP(s, unknown_address_2);
MEMORY_PROP(s, tiler_scratch_start);
MEMORY_PROP(s, tiler_scratch_middle);
pandecode_prop("resolution_check = 0x%" PRIx32, s->resolution_check);
pandecode_prop("tiler_resolution_check = 0x%" PRIx32, s->tiler_resolution_check);
pandecode_prop("tiler_flags = 0x%" PRIx32, s->tiler_flags);
MEMORY_PROP(s, tiler_heap_free);
......@@ -640,12 +640,12 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
if (fb->sample_locations)
pandecode_prop("sample_locations = sample_locations_%d", job_no);
/* Assume that unknown1 and tiler_meta were emitted in the last job for
/* Assume that unknown1 was emitted in the last job for
* now */
/*pandecode_prop("unknown1 = unknown1_%d_p", job_no - 1);
pandecode_prop("tiler_meta = tiler_meta_%d_p", job_no - 1);*/
MEMORY_PROP(fb, unknown1);
MEMORY_PROP(fb, tiler_meta);
pandecode_prop("tiler_unknown = 0x%x", fb->tiler_unknown);
pandecode_prop("tiler_flags = 0x%x", fb->tiler_flags);
pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1);
pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1);
......@@ -668,14 +668,26 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
MEMORY_PROP(fb, tiler_heap_start);
MEMORY_PROP(fb, tiler_heap_end);
if (fb->zero3 || fb->zero4 || fb->zero9 || fb->zero10 || fb->zero11 || fb->zero12) {
if (fb->zero3 || fb->zero4) {
pandecode_msg("framebuffer zeros tripped\n");
pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3);
pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4);
pandecode_prop("zero9 = 0x%" PRIx64, fb->zero9);
pandecode_prop("zero10 = 0x%" PRIx64, fb->zero10);
pandecode_prop("zero11 = 0x%" PRIx64, fb->zero11);
pandecode_prop("zero12 = 0x%" PRIx64, fb->zero12);
}
bool nonzero_weights = false;
for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) {
nonzero_weights |= fb->tiler_weights[w] != 0x0;
}
if (nonzero_weights) {
pandecode_log(".tiler_weights = {");
for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) {
pandecode_log("%d, ", fb->tiler_weights[w]);
}
pandecode_log("},");
}
pandecode_indent--;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment