Commit ce713cd5 authored by Christoph Bumiller's avatar Christoph Bumiller
Browse files

nvc0: replace VERTEX_DATA push mode with translate to buffer

While pushing vertices through the FIFO is relatively fast on nv50,
it's horribly slow on nvc0.
parent edbfeed5
......@@ -11,9 +11,9 @@ C_SOURCES := \
nvc0_tex.c \
nvc0_transfer.c \
nvc0_vbo.c \
nvc0_vbo_translate.c \
nvc0_program.c \
nvc0_shader_state.c \
nvc0_push.c \
nvc0_query.c
CPP_SOURCES := \
......
......@@ -88,6 +88,7 @@ struct nvc0_context {
uint32_t constant_elts;
int32_t index_bias;
uint16_t scissor;
uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate (forced) */
uint8_t num_vtxbufs;
uint8_t num_vtxelts;
uint8_t num_textures[5];
......@@ -118,7 +119,6 @@ struct nvc0_context {
unsigned num_vtxbufs;
struct pipe_index_buffer idxbuf;
uint32_t constant_vbos;
uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
unsigned vbo_min_index; /* from pipe_draw_info, for vertex upload */
unsigned vbo_max_index;
......
......@@ -35,6 +35,7 @@ struct nvc0_zsa_stateobj {
struct nvc0_vertex_element {
struct pipe_vertex_element pipe;
uint32_t state;
uint32_t state_alt; /* same as state, but using buffer 0 and the packed source offset (for translate mode) */
};
struct nvc0_vertex_stateobj {
......@@ -43,8 +44,7 @@ struct nvc0_vertex_stateobj {
uint32_t instance_elts;
uint32_t instance_bufs;
boolean need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */
unsigned vtx_size;
unsigned vtx_per_packet_max;
unsigned size; /* size of vertex in bytes (when packed) */
struct nvc0_vertex_element element[0];
};
......
......@@ -86,31 +86,41 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
so->element[i].state = nvc0_format_table[fmt].vtx;
so->need_conversion = TRUE;
}
so->element[i].state |= i;
if (unlikely(ve->instance_divisor)) {
so->instance_elts |= 1 << i;
so->instance_bufs |= 1 << vbi;
}
if (1) {
unsigned ca;
unsigned j = transkey.nr_elements++;
ca = util_format_description(fmt)->channel[0].size / 8;
if (ca != 1 && ca != 2)
ca = 4;
transkey.element[j].type = TRANSLATE_ELEMENT_NORMAL;
transkey.element[j].input_format = ve->src_format;
transkey.element[j].input_buffer = vbi;
transkey.element[j].input_offset = ve->src_offset;
transkey.element[j].instance_divisor = ve->instance_divisor;
transkey.output_stride = align(transkey.output_stride, ca);
transkey.element[j].output_format = fmt;
transkey.element[j].output_offset = transkey.output_stride;
transkey.output_stride += (util_format_get_stride(fmt, 1) + 3) & ~3;
transkey.output_stride += util_format_get_blocksize(fmt);
if (unlikely(ve->instance_divisor)) {
so->instance_elts |= 1 << i;
so->instance_bufs |= 1 << vbi;
}
so->element[i].state_alt = so->element[i].state;
so->element[i].state_alt |= transkey.element[j].output_offset << 7;
}
so->element[i].state |= i << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT;
}
transkey.output_stride = align(transkey.output_stride, 4);
so->size = transkey.output_stride;
so->translate = translate_create(&transkey);
so->vtx_size = transkey.output_stride / 4;
so->vtx_per_packet_max = NV04_PFIFO_MAX_PACKET_LEN / MAX2(so->vtx_size, 1);
return so;
}
......@@ -182,7 +192,10 @@ nvc0_vbuf_range(struct nvc0_context *nvc0, int vbi,
}
}
static void
/* Return whether to use alternative vertex submission mode (translate),
* and validate vertex buffers and upload user arrays (if normal mode).
*/
static uint8_t
nvc0_prevalidate_vbufs(struct nvc0_context *nvc0)
{
const uint32_t bo_flags = NOUVEAU_BO_RD | NOUVEAU_BO_GART;
......@@ -192,7 +205,7 @@ nvc0_prevalidate_vbufs(struct nvc0_context *nvc0)
int i;
uint32_t base, size;
nvc0->vbo_fifo = nvc0->vbo_user = 0;
nvc0->vbo_user = 0;
nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
......@@ -203,10 +216,8 @@ nvc0_prevalidate_vbufs(struct nvc0_context *nvc0)
buf = nv04_resource(vb->buffer);
if (!nouveau_resource_mapped_by_gpu(vb->buffer)) {
if (nvc0->vbo_push_hint) {
nvc0->vbo_fifo = ~0;
return;
}
if (nvc0->vbo_push_hint)
return 1;
nvc0->base.vbo_dirty = TRUE;
if (buf->status & NOUVEAU_BUFFER_STATUS_USER_MEMORY) {
......@@ -223,6 +234,7 @@ nvc0_prevalidate_vbufs(struct nvc0_context *nvc0)
}
BCTX_REFN(nvc0->bufctx_3d, VTX, buf, RD);
}
return 0;
}
static void
......@@ -283,55 +295,85 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0)
struct nvc0_vertex_element *ve;
uint32_t const_vbos;
unsigned i;
uint8_t vbo_mode;
boolean update_vertex;
if (unlikely(vertex->need_conversion) ||
unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
nvc0->vbo_user = 0;
nvc0->vbo_fifo = ~nvc0->constant_vbos;
vbo_mode = 3;
} else {
nvc0_prevalidate_vbufs(nvc0);
nvc0->vbo_fifo &= ~nvc0->constant_vbos;
vbo_mode = nvc0_prevalidate_vbufs(nvc0);
}
const_vbos = nvc0->vbo_fifo ? 0 : nvc0->constant_vbos;
const_vbos = vbo_mode ? 0 : nvc0->constant_vbos;
update_vertex = (nvc0->dirty & NVC0_NEW_VERTEX) ||
(const_vbos != nvc0->state.constant_vbos);
(const_vbos != nvc0->state.constant_vbos) ||
(vbo_mode != nvc0->state.vbo_mode);
if (update_vertex) {
uint32_t *restrict data;
const unsigned n = MAX2(vertex->num_elements, nvc0->state.num_vtxelts);
if (unlikely(vertex->instance_elts != nvc0->state.instance_elts)) {
nvc0->state.instance_elts = vertex->instance_elts;
assert(n); /* if (n == 0), both masks should be 0 */
PUSH_SPACE(push, 3);
BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
PUSH_DATA (push, n);
PUSH_DATA (push, vertex->instance_elts);
}
nvc0->state.num_vtxelts = vertex->num_elements;
nvc0->state.constant_vbos = const_vbos;
nvc0->state.constant_elts = 0;
nvc0->state.num_vtxelts = vertex->num_elements;
nvc0->state.vbo_mode = vbo_mode;
if (unlikely(vbo_mode)) {
if (unlikely(nvc0->state.instance_elts & 3)) {
/* translate mode uses only 2 vertex buffers */
nvc0->state.instance_elts &= ~3;
PUSH_SPACE(push, 3);
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_PER_INSTANCE(0)), 2);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
}
PUSH_SPACE(push, n * 2 + 4);
PUSH_SPACE(push, n * 2 + 1);
BEGIN_NVC0(push, NVC0_3D(VERTEX_ATTRIB_FORMAT(0)), n);
data = push->cur;
push->cur += n;
for (i = 0; i < vertex->num_elements; ++data, ++i) {
ve = &vertex->element[i];
*data = ve->state;
if (unlikely(const_vbos & (1 << ve->pipe.vertex_buffer_index))) {
*data |= NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST;
nvc0->state.constant_elts |= 1 << i;
BEGIN_NVC0(push, NVC0_3D(VERTEX_ATTRIB_FORMAT(0)), n);
for (i = 0; i < vertex->num_elements; ++i)
PUSH_DATA(push, vertex->element[i].state_alt);
for (; i < n; ++i)
PUSH_DATA(push, NVC0_3D_VERTEX_ATTRIB_INACTIVE);
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(0)), 1);
PUSH_DATA (push, (1 << 12) | vertex->size);
for (i = 1; i < n; ++i)
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
} else {
uint32_t *restrict data;
if (unlikely(vertex->instance_elts != nvc0->state.instance_elts)) {
nvc0->state.instance_elts = vertex->instance_elts;
assert(n); /* if (n == 0), both masks should be 0 */
PUSH_SPACE(push, 3);
BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
PUSH_DATA (push, n);
PUSH_DATA (push, vertex->instance_elts);
}
PUSH_SPACE(push, n * 2 + 1);
BEGIN_NVC0(push, NVC0_3D(VERTEX_ATTRIB_FORMAT(0)), n);
data = push->cur;
push->cur += n;
for (i = 0; i < vertex->num_elements; ++i) {
ve = &vertex->element[i];
data[i] = ve->state;
if (unlikely(const_vbos & (1 << ve->pipe.vertex_buffer_index))) {
nvc0->state.constant_elts |= 1 << i;
data[i] |= NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST;
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
}
}
for (; i < n; ++i) {
data[i] = NVC0_3D_VERTEX_ATTRIB_INACTIVE;
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
}
}
for (; i < n; ++data, ++i) {
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
*data = NVC0_3D_VERTEX_ATTRIB_INACTIVE;
}
}
if (nvc0->state.vbo_mode) /* using translate, don't set up arrays here */
return;
PUSH_SPACE(push, vertex->num_elements * 8);
for (i = 0; i < vertex->num_elements; ++i) {
......@@ -660,25 +702,35 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
/* For picking only a few vertices from a large user buffer, push is better,
* if index count is larger and we expect repeated vertices, suggest upload.
*/
nvc0->vbo_push_hint = /* the 64 is heuristic */
!(info->indexed &&
((info->max_index - info->min_index + 64) < info->count));
nvc0->vbo_push_hint =
info->indexed &&
(info->max_index - info->min_index) >= (info->count * 2);
nvc0->vbo_min_index = info->min_index;
nvc0->vbo_max_index = info->max_index;
if (nvc0->vbo_push_hint != !!nvc0->vbo_fifo)
nvc0->dirty |= NVC0_NEW_ARRAYS;
if (nvc0->vbo_user && !(nvc0->dirty & (NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS)))
nvc0_update_user_vbufs(nvc0);
/* Check whether we want to switch vertex-submission mode,
* and if not, update user vbufs.
*/
if (!(nvc0->dirty & NVC0_NEW_ARRAYS)) {
if (nvc0->vbo_push_hint) {
if (nvc0->vbo_user)
nvc0->dirty |= NVC0_NEW_ARRAYS; /* switch to translate mode */
} else
if (nvc0->state.vbo_mode == 1) {
nvc0->dirty |= NVC0_NEW_ARRAYS; /* back to normal mode */
}
if (nvc0->vbo_user &&
!(nvc0->dirty & (NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS)))
nvc0_update_user_vbufs(nvc0);
}
/* 8 as minimum to avoid immediate double validation of new buffers */
nvc0_state_validate(nvc0, ~0, 8);
push->kick_notify = nvc0_draw_vbo_kick_notify;
if (nvc0->vbo_fifo) {
if (nvc0->state.vbo_mode) {
nvc0_push_vbo(nvc0, info);
push->kick_notify = nvc0_default_kick_notify;
return;
......
#include <string.h>

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_inlines.h"
#include "util/u_format.h"
#include "translate/translate.h"

#include "nvc0_context.h"
#include "nvc0_resource.h"

#include "nvc0_3d.xml.h"
struct push_context {
struct nouveau_pushbuf *push;
struct translate *translate;
void *dest;
const void *idxbuf;
uint32_t vertex_size;
uint32_t restart_index;
uint32_t instance_id;
boolean prim_restart;
boolean need_vertex_id;
struct {
boolean enabled;
boolean value;
unsigned stride;
const uint8_t *data;
} edgeflag;
};
static void nvc0_push_upload_vertex_ids(struct push_context *,
struct nvc0_context *,
const struct pipe_draw_info *);
static void
nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
{
ctx->push = nvc0->base.pushbuf;
ctx->translate = nvc0->vertex->translate;
ctx->vertex_size = nvc0->vertex->size;
ctx->need_vertex_id =
nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32);
ctx->edgeflag.value = TRUE;
ctx->edgeflag.enabled = nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS;
/* silence warnings */
ctx->edgeflag.data = NULL;
ctx->edgeflag.stride = 0;
}
static INLINE void
nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias)
{
struct translate *translate = nvc0->vertex->translate;
unsigned i;
for (i = 0; i < nvc0->num_vtxbufs; ++i) {
const uint8_t *map;
const struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[i];
map = nouveau_resource_map_offset(&nvc0->base,
nv04_resource(vb->buffer), vb->buffer_offset, NOUVEAU_BO_RD);
if (index_bias && !unlikely(nvc0->vertex->instance_bufs & (1 << i)))
map += (intptr_t)index_bias * vb->stride;
translate->set_buffer(translate, i, map, vb->stride, ~0);
}
}
static INLINE void
nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
{
struct nv04_resource *buf = nv04_resource(nvc0->idxbuf.buffer);
unsigned offset = nvc0->idxbuf.offset;
ctx->idxbuf = nouveau_resource_map_offset(&nvc0->base,
buf, offset, NOUVEAU_BO_RD);
}
static INLINE void
nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
int32_t index_bias)
{
unsigned attr = nvc0->vertprog->vp.edgeflag;
struct pipe_vertex_element *ve = &nvc0->vertex->element[attr].pipe;
struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[ve->vertex_buffer_index];
struct nv04_resource *buf = nv04_resource(vb->buffer);
unsigned offset = vb->buffer_offset + ve->src_offset;
ctx->edgeflag.stride = vb->stride;
ctx->edgeflag.data = nouveau_resource_map_offset(&nvc0->base,
buf, offset, NOUVEAU_BO_RD);
if (index_bias)
ctx->edgeflag.data += (intptr_t)index_bias * vb->stride;
}
static INLINE unsigned
prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
{
unsigned i;
for (i = 0; i < push && elts[i] != index; ++i);
return i;
}
static INLINE unsigned
prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
{
unsigned i;
for (i = 0; i < push && elts[i] != index; ++i);
return i;
}
static INLINE unsigned
prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
{
unsigned i;
for (i = 0; i < push && elts[i] != index; ++i);
return i;
}
static INLINE boolean
ef_value(const struct push_context *ctx, uint32_t index)
{
float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
return *pf ? TRUE : FALSE;
}
static INLINE boolean
ef_toggle(struct push_context *ctx)
{
ctx->edgeflag.value = !ctx->edgeflag.value;
return ctx->edgeflag.value;
}
static INLINE unsigned
ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
{
unsigned i;
for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
return i;
}
static INLINE unsigned
ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
{
unsigned i;
for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
return i;
}
static INLINE unsigned
ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
{
unsigned i;
for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
return i;
}
static INLINE unsigned
ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
{
unsigned i;
for (i = 0; i < n && ef_value(ctx, start++) == ctx->edgeflag.value; ++i);
return i;
}
static INLINE void *
nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nouveau_bo *bo;
uint64_t va;
const unsigned size = count * nvc0->vertex->size;
void *const dest = nouveau_scratch_get(&nvc0->base, size, &va, &bo);
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_START_HIGH(0)), 2);
PUSH_DATAh(push, va);
PUSH_DATA (push, va);
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
PUSH_DATAh(push, va + size - 1);
PUSH_DATA (push, va + size - 1);
BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
bo);
nouveau_pushbuf_validate(push);
return dest;
}
static void
disp_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
{
struct nouveau_pushbuf *push = ctx->push;
struct translate *translate = ctx->translate;
const uint8_t *restrict elts = (uint8_t *)ctx->idxbuf + start;
unsigned pos = 0;
do {
unsigned nR = count;
if (unlikely(ctx->prim_restart))
nR = prim_restart_search_i08(elts, nR, ctx->restart_index);
translate->run_elts8(translate, elts, nR, ctx->instance_id, ctx->dest);
count -= nR;
ctx->dest += nR * ctx->vertex_size;
while (nR) {
unsigned nE = nR;
if (unlikely(ctx->edgeflag.enabled))
nE = ef_toggle_search_i08(ctx, elts, nR);
PUSH_SPACE(push, 4);
if (likely(nE >= 2)) {
BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
PUSH_DATA (push, pos);
PUSH_DATA (push, nE);
} else
if (nE) {
if (pos <= 0xff) {
IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_U32), pos);
} else {
BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
PUSH_DATA (push, pos);
}
}
if (unlikely(nE != nR))
IMMED_NVC0(push, NVC0_3D(EDGEFLAG), ef_toggle(ctx));
pos += nE;
elts += nE;
nR -= nE;
}
if (count) {
BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
PUSH_DATA (push, ctx->restart_index);
++elts;
ctx->dest += ctx->vertex_size;
++pos;
--count;
}
} while (count);
}
static void
disp_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
{
struct nouveau_pushbuf *push = ctx->push;
struct translate *translate = ctx->translate;
const uint16_t *restrict elts = (uint16_t *)ctx->idxbuf + start;
unsigned pos = 0;
do {
unsigned nR = count;
if (unlikely(ctx->prim_restart))
nR = prim_restart_search_i16(elts, nR, ctx->restart_index);
translate->run_elts16(translate, elts, nR, ctx->instance_id, ctx->dest);
count -= nR;
ctx->dest += nR * ctx->vertex_size;
while (nR) {
unsigned nE = nR;
if (unlikely(ctx->edgeflag.enabled))
nE = ef_toggle_search_i16(ctx, elts, nR);
PUSH_SPACE(push, 4);
if (likely(nE >= 2)) {
BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
PUSH_DATA (push, pos);
PUSH_DATA (push, nE);
} else
if (nE) {
if (pos <= 0xff) {
IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_U32), pos);
} else {
BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
PUSH_DATA (push, pos);
}
}
if (unlikely(nE != nR))
IMMED_NVC0(push, NVC0_3D(EDGEFLAG), ef_toggle(ctx));
pos += nE;
elts += nE;
nR -= nE;