Commit 1c2e5c22 authored by Zack Rusin
Browse files

draw/translate: fix instancing



We were incorrectly computing the buffer offset when using the
instances. The buffer offset is always equal to:
start_instance * stride + (instance_num / instance_divisor) *
stride
We were completely ignoring the start instance, quite
often producing instances that were completely wrong, e.g. if
start instance = 5, instance divisor = 2, then on the first
iteration it should be:
5 * stride, not (5/2) * stride as we'd have currently, and if
start instance = 1, instance divisor = 3, then on the first
iteration it should be:
1 * stride, not 0 as we'd have.
This fixes it and adjusts all the code to the changes.
Signed-off-by: Zack Rusin <zackr@vmware.com>
parent df4ab797
......@@ -674,6 +674,7 @@ generate_vs(struct draw_llvm_variant *variant,
static void
generate_fetch(struct gallivm_state *gallivm,
struct draw_context *draw,
LLVMValueRef vbuffers_ptr,
LLVMValueRef *res,
struct pipe_vertex_element *velem,
......@@ -704,10 +705,17 @@ generate_fetch(struct gallivm_state *gallivm,
struct lp_build_if_state if_ctx;
if (velem->instance_divisor) {
/* array index = instance_id / instance_divisor */
index = LLVMBuildUDiv(builder, instance_id,
lp_build_const_int32(gallivm, velem->instance_divisor),
"instance_divisor");
/* Index is equal to the start instance plus the number of current
* instance divided by the divisor. In this case we compute it as:
* index = start_instance + ((instance_id - start_instance) / divisor)
*/
LLVMValueRef current_instance;
index = lp_build_const_int32(gallivm, draw->start_instance);
current_instance = LLVMBuildSub(builder, instance_id, index, "");
current_instance = LLVMBuildUDiv(builder, current_instance,
lp_build_const_int32(gallivm, velem->instance_divisor),
"instance_divisor");
index = LLVMBuildAdd(builder, index, current_instance, "instance");
}
stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
......@@ -1697,7 +1705,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
LLVMValueRef vb_index =
lp_build_const_int32(gallivm, velem->vertex_buffer_index);
LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
generate_fetch(gallivm, vbuffers_ptr,
generate_fetch(gallivm, draw, vbuffers_ptr,
&aos_attribs[j][i], velem, vb, true_index,
system_values.instance_id);
}
......
......@@ -138,7 +138,7 @@ emit_vertex( struct vbuf_stage *vbuf,
/* Note: we really do want data[0] here, not data[pos]:
*/
vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0, ~0);
vbuf->translate->run(vbuf->translate, 0, 1, 0, vbuf->vertex_ptr);
vbuf->translate->run(vbuf->translate, 0, 1, 0, 0, vbuf->vertex_ptr);
if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
......
......@@ -306,6 +306,7 @@ struct draw_context
} extra_shader_outputs;
unsigned instance_id;
unsigned start_instance;
#ifdef HAVE_LLVM
struct draw_llvm *llvm;
......
......@@ -533,6 +533,7 @@ draw_vbo(struct draw_context *draw,
for (instance = 0; instance < info->instance_count; instance++) {
draw->instance_id = instance + info->start_instance;
draw->start_instance = info->start_instance;
/* check for overflow */
if (draw->instance_id < instance ||
draw->instance_id < info->start_instance) {
......
......@@ -171,6 +171,7 @@ draw_pt_emit(struct pt_emit *emit,
translate->run(translate,
0,
vertex_count,
draw->start_instance,
draw->instance_id,
hw_verts );
......@@ -234,6 +235,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
translate->run(translate,
0,
count,
draw->start_instance,
draw->instance_id,
hw_verts);
......
......@@ -168,6 +168,7 @@ draw_pt_fetch_run(struct pt_fetch *fetch,
translate->run_elts( translate,
elts,
count,
draw->start_instance,
draw->instance_id,
verts );
}
......@@ -195,6 +196,7 @@ draw_pt_fetch_run_linear(struct pt_fetch *fetch,
translate->run( translate,
start,
count,
draw->start_instance,
draw->instance_id,
verts );
}
......
......@@ -210,6 +210,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
feme->translate->run_elts( feme->translate,
fetch_elts,
fetch_count,
draw->start_instance,
draw->instance_id,
hw_verts );
......@@ -267,6 +268,7 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
feme->translate->run( feme->translate,
start,
count,
draw->start_instance,
draw->instance_id,
hw_verts );
......@@ -326,6 +328,7 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
feme->translate->run( feme->translate,
start,
count,
draw->start_instance,
draw->instance_id,
hw_verts );
......
......@@ -182,12 +182,29 @@ static void so_emit_prim(struct pt_so_emit *so,
buffer = (float *)((char *)draw->so.targets[ob]->mapping +
draw->so.targets[ob]->target.buffer_offset +
draw->so.targets[ob]->internal_offset) + state->output[slot].dst_offset;
draw->so.targets[ob]->internal_offset) +
state->output[slot].dst_offset;
if (idx == so->pos_idx && pcp_ptr)
memcpy(buffer, &pre_clip_pos[start_comp], num_comps * sizeof(float));
memcpy(buffer, &pre_clip_pos[start_comp],
num_comps * sizeof(float));
else
memcpy(buffer, &input[idx][start_comp], num_comps * sizeof(float));
memcpy(buffer, &input[idx][start_comp],
num_comps * sizeof(float));
#if 0
{
int j;
debug_printf("VERT[%d], offset = %d, slot[%d] sc = %d, num_c = %d, idx = %d = [",
i + draw->so.targets[ob]->emitted_vertices,
draw->so.targets[ob]->internal_offset,
slot, start_comp, num_comps, idx);
for (j = 0; j < num_comps; ++j) {
unsigned *ubuffer = (unsigned*)buffer;
debug_printf("%d (0x%x), ", ubuffer[j], ubuffer[j]);
}
debug_printf("]\n");
}
#endif
}
for (ob = 0; ob < draw->so.num_targets; ++ob) {
struct draw_so_target *target = draw->so.targets[ob];
......
......@@ -168,6 +168,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,
vsvg->fetch->run_elts( vsvg->fetch,
elts,
count,
vsvg->draw->start_instance,
vsvg->draw->instance_id,
temp_buffer );
......@@ -211,6 +212,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,
vsvg->emit->run( vsvg->emit,
0, count,
vsvg->draw->start_instance,
vsvg->draw->instance_id,
output_buffer );
......@@ -234,6 +236,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
vsvg->fetch->run( vsvg->fetch,
start,
count,
vsvg->draw->start_instance,
vsvg->draw->instance_id,
temp_buffer );
......@@ -274,6 +277,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
vsvg->emit->run( vsvg->emit,
0, count,
vsvg->draw->start_instance,
vsvg->draw->instance_id,
output_buffer );
......
......@@ -74,24 +74,28 @@ struct translate;
/**
 * Function-pointer type for a translate entry point that walks an array of
 * unsigned (`const unsigned *`) element indices.
 *
 * \param elts            element indices, one per vertex to process
 * \param count           number of entries in \p elts
 * \param start_instance  first instance of the draw; in the generic
 *                        implementation shown in this change, the index of
 *                        an attribute with a non-zero instance divisor is
 *                        start_instance +
 *                        (instance_id - start_instance) / instance_divisor
 * \param instance_id     current instance; the draw module passes
 *                        instance + start_instance here
 * \param output_buffer   destination for the translated vertices
 *                        (NOTE(review): presumably advanced by the key's
 *                        output_stride per vertex -- confirm in the
 *                        implementation)
 */
typedef void (PIPE_CDECL *run_elts_func)(struct translate *,
const unsigned *elts,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer);
/**
 * Function-pointer type for a translate entry point that walks an array of
 * 16-bit (`uint16_t`) element indices.
 *
 * \param elts            element indices, one per vertex to process
 * \param count           number of entries in \p elts
 * \param start_instance  first instance of the draw; used together with
 *                        \p instance_id to index attributes that have a
 *                        non-zero instance divisor
 * \param instance_id     current instance (callers that do not use
 *                        instancing pass 0 for both instance arguments)
 * \param output_buffer   destination for the translated vertices
 */
typedef void (PIPE_CDECL *run_elts16_func)(struct translate *,
const uint16_t *elts,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer);
/**
 * Function-pointer type for a translate entry point that walks an array of
 * 8-bit (`uint8_t`) element indices.
 *
 * \param elts            element indices, one per vertex to process
 * \param count           number of entries in \p elts
 * \param start_instance  first instance of the draw; used together with
 *                        \p instance_id to index attributes that have a
 *                        non-zero instance divisor
 * \param instance_id     current instance (callers that do not use
 *                        instancing pass 0 for both instance arguments)
 * \param output_buffer   destination for the translated vertices
 */
typedef void (PIPE_CDECL *run_elts8_func)(struct translate *,
const uint8_t *elts,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer);
/**
 * Function-pointer type for a translate entry point that processes a
 * contiguous (non-indexed) range of vertices.
 *
 * \param start           index of the first vertex; the generic
 *                        implementation shown in this change processes
 *                        vertices start, start + 1, ..., start + count - 1
 * \param count           number of vertices to process
 * \param start_instance  first instance of the draw; used together with
 *                        \p instance_id to index attributes that have a
 *                        non-zero instance divisor
 * \param instance_id     current instance (callers that do not use
 *                        instancing pass 0 for both instance arguments)
 * \param output_buffer   destination for the translated vertices
 */
typedef void (PIPE_CDECL *run_func)(struct translate *,
unsigned start,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer);
......
......@@ -607,6 +607,7 @@ static emit_func get_emit_func( enum pipe_format format )
static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg,
unsigned elt,
unsigned start_instance,
unsigned instance_id,
void *vert )
{
......@@ -623,7 +624,9 @@ static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *
int copy_size;
if (tg->attrib[attr].instance_divisor) {
index = instance_id / tg->attrib[attr].instance_divisor;
index = start_instance;
index += (instance_id - start_instance) /
tg->attrib[attr].instance_divisor;
/* XXX we need to clamp the index here too, but to a
* per-array max value, not the draw->pt.max_index value
* that's being given to us via translate->set_buffer().
......@@ -674,6 +677,7 @@ static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *
static void PIPE_CDECL generic_run_elts( struct translate *translate,
const unsigned *elts,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer )
{
......@@ -682,7 +686,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
unsigned i;
for (i = 0; i < count; i++) {
generic_run_one(tg, *elts++, instance_id, vert);
generic_run_one(tg, *elts++, start_instance, instance_id, vert);
vert += tg->translate.key.output_stride;
}
}
......@@ -690,6 +694,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
static void PIPE_CDECL generic_run_elts16( struct translate *translate,
const uint16_t *elts,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer )
{
......@@ -698,7 +703,7 @@ static void PIPE_CDECL generic_run_elts16( struct translate *translate,
unsigned i;
for (i = 0; i < count; i++) {
generic_run_one(tg, *elts++, instance_id, vert);
generic_run_one(tg, *elts++, start_instance, instance_id, vert);
vert += tg->translate.key.output_stride;
}
}
......@@ -706,6 +711,7 @@ static void PIPE_CDECL generic_run_elts16( struct translate *translate,
static void PIPE_CDECL generic_run_elts8( struct translate *translate,
const uint8_t *elts,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer )
{
......@@ -714,7 +720,7 @@ static void PIPE_CDECL generic_run_elts8( struct translate *translate,
unsigned i;
for (i = 0; i < count; i++) {
generic_run_one(tg, *elts++, instance_id, vert);
generic_run_one(tg, *elts++, start_instance, instance_id, vert);
vert += tg->translate.key.output_stride;
}
}
......@@ -722,6 +728,7 @@ static void PIPE_CDECL generic_run_elts8( struct translate *translate,
static void PIPE_CDECL generic_run( struct translate *translate,
unsigned start,
unsigned count,
unsigned start_instance,
unsigned instance_id,
void *output_buffer )
{
......@@ -730,7 +737,7 @@ static void PIPE_CDECL generic_run( struct translate *translate,
unsigned i;
for (i = 0; i < count; i++) {
generic_run_one(tg, start + i, instance_id, vert);
generic_run_one(tg, start + i, start_instance, instance_id, vert);
vert += tg->translate.key.output_stride;
}
}
......
......@@ -112,6 +112,7 @@ struct translate_sse {
boolean use_instancing;
unsigned instance_id;
unsigned start_instance;
/* these are actually known values, but putting them in a struct
* like this is helpful to keep them in sync across the file.
......@@ -1061,6 +1062,8 @@ static boolean init_inputs( struct translate_sse *p,
unsigned i;
struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
get_offset(p, &p->instance_id));
struct x86_reg start_instance = x86_make_disp(p->machine_EDI,
get_offset(p, &p->start_instance));
for (i = 0; i < p->nr_buffer_variants; i++) {
struct translate_buffer_variant *variant = &p->buffer_variant[i];
......@@ -1082,7 +1085,8 @@ static boolean init_inputs( struct translate_sse *p,
* base_ptr + stride * index, where index depends on instance divisor
*/
if (variant->instance_divisor) {
/* Our index is instance ID divided by instance divisor.
/* Start with instance = instance_id
* which is true if divisor is 1.
*/
x86_mov(p->func, tmp_EAX, instance_id);
......@@ -1090,13 +1094,22 @@ static boolean init_inputs( struct translate_sse *p,
struct x86_reg tmp_EDX = p->tmp2_EDX;
struct x86_reg tmp_ECX = p->src_ECX;
/* instance_num = instance_id - start_instance */
x86_mov(p->func, tmp_EDX, start_instance);
x86_sub(p->func, tmp_EAX, tmp_EDX);
/* TODO: Add x86_shr() to rtasm and use it whenever
* instance divisor is power of two.
*/
x86_xor(p->func, tmp_EDX, tmp_EDX);
x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
/* instance = (instance_id - start_instance) / divisor +
* start_instance
*/
x86_mov(p->func, tmp_EDX, start_instance);
x86_add(p->func, tmp_EAX, tmp_EDX);
}
/* XXX we need to clamp the index here too, but to a
......@@ -1312,16 +1325,23 @@ static boolean build_vertex_emit( struct translate_sse *p,
x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
if(x86_target(p->func) != X86_32)
x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
else
x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
/* Load instance ID.
*/
if (p->use_instancing) {
if (p->use_instancing) {
x86_mov(p->func,
p->tmp_EAX,
p->tmp2_EDX,
x86_fn_arg(p->func, 4));
x86_mov(p->func,
x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
p->tmp2_EDX);
x86_mov(p->func,
p->tmp_EAX,
x86_fn_arg(p->func, 5));
x86_mov(p->func,
x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
p->tmp_EAX);
......
......@@ -403,13 +403,13 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
switch (ib->index_size) {
case 4:
tr->run_elts(tr, (unsigned*)map, num_indices, 0, out_map);
tr->run_elts(tr, (unsigned*)map, num_indices, 0, 0, out_map);
break;
case 2:
tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, out_map);
tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, 0, out_map);
break;
case 1:
tr->run_elts8(tr, map, num_indices, 0, out_map);
tr->run_elts8(tr, map, num_indices, 0, 0, out_map);
break;
}
......@@ -428,7 +428,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
out_offset -= key->output_stride * start_vertex;
tr->run(tr, 0, num_vertices, 0, out_map);
tr->run(tr, 0, num_vertices, 0, 0, out_map);
}
/* Unmap all buffers. */
......
......@@ -99,7 +99,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->push->cur);
ctx->translate->run_elts8(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
ctx->push->cur += size;
count -= nr;
......@@ -131,7 +131,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->push->cur);
ctx->translate->run_elts16(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
ctx->push->cur += size;
count -= nr;
......@@ -163,7 +163,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->push->cur);
ctx->translate->run_elts(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
ctx->push->cur += size;
count -= nr;
......@@ -187,7 +187,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
ctx->translate->run(ctx->translate, start, push, 0, ctx->push->cur);
ctx->translate->run(ctx->translate, start, push, 0, 0, ctx->push->cur);
ctx->push->cur += size;
count -= push;
start += push;
......
......@@ -76,7 +76,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run_elts8(ctx->translate, elts, nr, ctx->instance_id,
ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......@@ -109,7 +109,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run_elts16(ctx->translate, elts, nr, ctx->instance_id,
ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......@@ -142,7 +142,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run_elts(ctx->translate, elts, nr, ctx->instance_id,
ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......@@ -167,7 +167,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run(ctx->translate, start, push, ctx->instance_id,
ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
count -= push;
......
......@@ -137,7 +137,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
ctx->translate->run_elts8(ctx->translate, elts, nr, ctx->instance_id,
ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......@@ -178,7 +178,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
ctx->translate->run_elts16(ctx->translate, elts, nr, ctx->instance_id,
ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......@@ -219,7 +219,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
ctx->translate->run_elts(ctx->translate, elts, nr, ctx->instance_id,
ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......@@ -252,7 +252,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
ctx->translate->run(ctx->translate, start, push, ctx->instance_id,
ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
ctx->push->cur);
ctx->push->cur += size;
......
......@@ -215,7 +215,7 @@ disp_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
if (unlikely(ctx->prim_restart))
nR = prim_restart_search_i08(elts, nR, ctx->restart_index);
translate->run_elts8(translate, elts, nR, ctx->instance_id, ctx->dest);
translate->run_elts8(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
count -= nR;
ctx->dest += nR * ctx->vertex_size;
......@@ -271,7 +271,7 @@ disp_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
if (unlikely(ctx->prim_restart))
nR = prim_restart_search_i16(elts, nR, ctx->restart_index);
translate->run_elts16(translate, elts, nR, ctx->instance_id, ctx->dest);
translate->run_elts16(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
count -= nR;
ctx->dest += nR * ctx->vertex_size;
......@@ -327,7 +327,7 @@ disp_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
if (unlikely(ctx->prim_restart))
nR = prim_restart_search_i32(elts, nR, ctx->restart_index);
translate->run_elts(translate, elts, nR, ctx->instance_id, ctx->dest);
translate->run_elts(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
count -= nR;
ctx->dest += nR * ctx->vertex_size;
......@@ -376,7 +376,7 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
struct translate *translate = ctx->translate;
unsigned pos = 0;
translate->run(translate, start, count, ctx->instance_id, ctx->dest);
translate->run(translate, start, count, 0, ctx->instance_id, ctx->dest);
do {
unsigned nr = count;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment