Commit f0766fc7 authored by Alyssa Rosenzweig 💜

panfrost: Implement transient pools

Previously, we allocated a massive amount of transient
(single-frame-only) command stream memory and just chipped into that
with a linear allocator. This broke if we needed more than we allocated,
and it used an absurd amount of memory when we didn't.

We continue to use a linear allocator for transient data, but we
allocate only a small slab on start-up. As more memory is needed, we
gradually increase the size of the pool, ensuring small apps require
minimal memory and large apps have enough for their needs.

This patch, in combination with the pb memory management patch on which
this patch depends, results in es2gears using one-tenth of the memory.
That's a 90% savings!

The other patch must be merged first.
parent 3051d29c
......@@ -26,11 +26,71 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "pan_context.h"
#include "pan_nondrm.h"
/* TODO: What does this actually have to be? */
#define ALIGNMENT 128
/* Transient command stream pooling: command stream uploads try to simply copy
 * into wherever we left off. If there isn't space, we advance to the next
 * entry in the pool, allocating it first if it does not exist yet, and copy
 * there.
 *
 * Reserves sz bytes, padded with ALIGN(sz, ALIGNMENT), from the transient pool
 * selected by ctx->cmdstream_i. Returns the CPU and GPU pointers to the start
 * of the reserved region.
 *
 * The padded size must not exceed pool->entry_size and the pool cannot grow
 * beyond PANFROST_MAX_TRANSIENT_ENTRIES entries; both limits, as well as
 * allocation failure, are caught by assert() only. */

struct panfrost_transfer
panfrost_allocate_transient(struct panfrost_context *ctx, size_t sz)
{
        /* Pad the size */
        sz = ALIGN(sz, ALIGNMENT);

        /* Check if there is room in the current entry */
        struct panfrost_transient_pool *pool = &ctx->transient_pools[ctx->cmdstream_i];

        if ((pool->entry_offset + sz) > pool->entry_size) {
                /* Make sure a fresh entry can hold the request before touching
                 * any pool state. A request that exactly fills an entry is
                 * fine, matching the room check above, hence <= and not < */
                assert(sz <= pool->entry_size);

                /* Don't overflow this entry -- advance to the next */
                pool->entry_offset = 0;
                pool->entry_index++;
                assert(pool->entry_index < PANFROST_MAX_TRANSIENT_ENTRIES);

                /* Check if this entry exists */
                if (pool->entry_index >= pool->entry_count) {
                        /* Don't overflow the pool -- allocate a new one */
                        struct pb_slab_entry *entry = pb_slab_alloc(&ctx->slabs, pool->entry_size, HEAP_TRANSIENT);

                        /* The entry is dereferenced below, so fail loudly here
                         * instead of crashing on a NULL pointer later */
                        assert(entry != NULL);

                        pool->entry_count++;
                        pool->entries[pool->entry_index] = (struct panfrost_memory_entry *) entry;
                }
        }

        /* We have an entry we can write to, so do the upload! */
        struct panfrost_memory_entry *p_entry = pool->entries[pool->entry_index];
        struct panfrost_memory *backing = (struct panfrost_memory *) p_entry->base.slab;

        /* Both pointers refer to the same spot: the entry's offset within its
         * backing memory plus our running offset within the entry */
        struct panfrost_transfer ret = {
                .cpu = backing->cpu + p_entry->offset + pool->entry_offset,
                .gpu = backing->gpu + p_entry->offset + pool->entry_offset
        };

        /* Advance the pointer */
        pool->entry_offset += sz;

        return ret;
}
mali_ptr
panfrost_upload_transient(struct panfrost_context *ctx, const void *data, size_t sz)
{
struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sz);
memcpy(transfer.cpu, data, sz);
return transfer.gpu;
}
// TODO: An actual allocator, perhaps
// TODO: Multiple stacks for multiple bases?
......
This diff is collapsed.
......@@ -75,6 +75,25 @@ struct panfrost_constant_buffer {
void *buffer;
};
/* Upper bound on the number of entries a single transient pool can hold; this
 * sizes the entries[] array below */
#define PANFROST_MAX_TRANSIENT_ENTRIES 64
/* Bookkeeping for one transient pool: a list of equally sized memory entries
 * plus a cursor (entry_index, entry_offset) marking where the next transient
 * allocation will be placed */
struct panfrost_transient_pool {
/* Memory blocks in the pool; slots are filled in on demand by
 * panfrost_allocate_transient */
struct panfrost_memory_entry *entries[PANFROST_MAX_TRANSIENT_ENTRIES];
/* Number of entries we own */
unsigned entry_count;
/* Current entry that we are writing to, zero-indexed, strictly less than entry_count */
unsigned entry_index;
/* Number of bytes into the current entry we are; the next allocation
 * starts at this offset */
off_t entry_offset;
/* Entry size in bytes (all entries must be homogeneous) */
size_t entry_size;
};
struct panfrost_context {
/* Gallium context */
struct pipe_context base;
......@@ -83,11 +102,11 @@ struct panfrost_context {
int fd;
struct pipe_framebuffer_state pipe_framebuffer;
/* The number of concurrent FBOs allowed depends on the number of rings used */
struct panfrost_memory cmdstream_rings[2];
int cmdstream_i;
/* The number of concurrent FBOs allowed depends on the number of pools
* used; pools are ringed for parallelism opportunities */
struct panfrost_memory cmdstream;
struct panfrost_transient_pool transient_pools[2];
int cmdstream_i;
struct panfrost_memory cmdstream_persistent;
struct panfrost_memory shaders;
......@@ -143,6 +162,10 @@ struct panfrost_context {
mali_ptr vertex_jobs[MAX_DRAW_CALLS];
mali_ptr tiler_jobs[MAX_DRAW_CALLS];
struct mali_job_descriptor_header *u_set_value_job;
struct mali_job_descriptor_header *u_vertex_jobs[MAX_DRAW_CALLS];
struct mali_job_descriptor_header *u_tiler_jobs[MAX_DRAW_CALLS];
unsigned vertex_job_count;
unsigned tiler_job_count;
......@@ -342,9 +365,7 @@ panfrost_resource_create_front(struct pipe_screen *screen,
void
panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data);
mali_ptr
struct panfrost_transfer
panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler, bool is_elided_tiler);
#define JOB_DESC(ptr) ((struct mali_job_descriptor_header *) (uintptr_t) (ptr - mem.gpu + (uintptr_t) mem.cpu))
#endif
......@@ -49,6 +49,25 @@ struct panfrost_shader_state;
void
panfrost_shader_compile(struct panfrost_context *ctx, struct mali_shader_meta *meta, const char *src, int type, struct panfrost_shader_state *state);
/* Texture memory */
#define HEAP_TEXTURE 0
/* Single-frame (transient) command stream memory, done at the block scale
* rather than the individual cmdstream allocation scale. We use pb_alloc for
* pooling, but we have to implement our own logic atop the API for performance
* reasons when considering many low-latency tiny heterogeneous allocations */
#define HEAP_TRANSIENT 1
/* Represents a fat pointer for GPU-mapped memory, returned from the transient
 * allocator and not used for much else. Both members are derived from the same
 * offset into the backing memory, so they address the same region */
struct panfrost_transfer {
/* CPU-side pointer to the region, usable for writing its contents */
uint8_t *cpu;
/* GPU-side pointer to the same region */
mali_ptr gpu;
};
struct panfrost_memory {
/* Subclassing slab object */
struct pb_slab slab;
......@@ -79,6 +98,12 @@ mali_ptr pandev_upload_sequential(mali_ptr base, void *base_map, const void *dat
mali_ptr panfrost_upload(struct panfrost_memory *mem, const void *data, size_t sz, bool no_pad);
mali_ptr panfrost_upload_sequential(struct panfrost_memory *mem, const void *data, size_t sz);
/* Reserves sz bytes (padded to the allocator's alignment) of transient memory
 * from the context's current transient pool and returns the CPU and GPU
 * pointers to the reserved region */
struct panfrost_transfer
panfrost_allocate_transient(struct panfrost_context *ctx, size_t sz);
/* Copies sz bytes from data into newly reserved transient memory and returns
 * the GPU pointer of the copy */
mali_ptr
panfrost_upload_transient(struct panfrost_context *ctx, const void *data, size_t sz);
void *
panfrost_allocate_transfer(struct panfrost_memory *mem, size_t sz, mali_ptr *gpu);
......
......@@ -209,7 +209,7 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
2048.0, 1280.0, 0.0, 1.0,
};
ctx->payload_tiler.postfix.position_varying = panfrost_upload(&ctx->cmdstream, implied_position_varying, sizeof(implied_position_varying), true);
ctx->payload_tiler.postfix.position_varying = panfrost_upload_transient(ctx, implied_position_varying, sizeof(implied_position_varying));
/* Similarly, setup the texture coordinate varying, hardcoded to match
* the corners of the screen */
......@@ -223,13 +223,13 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
struct mali_attr varyings[1] = {
{
.elements = panfrost_upload(&ctx->cmdstream, texture_coordinates, sizeof(texture_coordinates), true) | 1,
.elements = panfrost_upload_transient(ctx, texture_coordinates, sizeof(texture_coordinates)) | 1,
.stride = sizeof(float) * 4,
.size = sizeof(texture_coordinates)
}
};
ctx->payload_tiler.postfix.varyings = panfrost_upload(&ctx->cmdstream, varyings, sizeof(varyings), true);
ctx->payload_tiler.postfix.varyings = panfrost_upload_transient(ctx, varyings, sizeof(varyings));
struct mali_attr_meta varying_meta[1] = {
{
......@@ -242,11 +242,13 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
};
mali_ptr saved_varying_meta = ctx->payload_tiler.postfix.varying_meta;
ctx->payload_tiler.postfix.varying_meta = panfrost_upload(&ctx->cmdstream, varying_meta, sizeof(varying_meta), true);
ctx->payload_tiler.postfix.varying_meta = panfrost_upload_transient(ctx, varying_meta, sizeof(varying_meta));
/* Emit the tiler job */
mali_ptr tiler_job = panfrost_vertex_tiler_job(ctx, true, true);
ctx->tiler_jobs[ctx->tiler_job_count++] = tiler_job;
struct panfrost_transfer tiler = panfrost_vertex_tiler_job(ctx, true, true);
struct mali_job_descriptor_header *jd = (struct mali_job_descriptor_header *) tiler.cpu;
ctx->u_tiler_jobs[ctx->tiler_job_count] = jd;
ctx->tiler_jobs[ctx->tiler_job_count++] = tiler.gpu;
ctx->draw_count++;
/* Okay, so we have the tiler job emitted. Since we set elided_tiler
......@@ -258,8 +260,7 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
*/
if (ctx->tiler_job_count > 1) {
struct panfrost_memory mem = ctx->cmdstream;
JOB_DESC(ctx->tiler_jobs[0])->job_dependency_index_2 = JOB_DESC(tiler_job)->job_index;
ctx->u_tiler_jobs[0]->job_dependency_index_2 = jd->job_index;
}
printf("Wallpaper boop\n");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment