Commit 054eb1ab authored by Tvrtko Ursulin

benchmarks/gem_wsim: Command submission workload simulator

Tool which emits batch buffers to engines with configurable
sequences, durations, contexts, dependencies and userspace waits.

Unfinished but shows promise so sending out for early feedback.

v2:
 * Load workload descriptors from files. (also -w)
 * Help text.
 * Calibration control if needed. (-t)
 * NORELOC | LUT to eb flags.
 * Added sample workload to wsim/workload1.

v3:
 * Multiple parallel different workloads (-w -w ...).
 * Multi-context workloads.
 * Variable (random) batch length.
 * Load balancing (round robin and queue depth estimation).
 * Workloads delays and explicit sync steps.
 * Workload frequency (period) control.

v4:
 * Fixed queue-depth estimation by creating separate batches
   per engine when qd load balancing is on.
 * Dropped separate -s cmd line option. It can turn itself on
   automatically when needed.
 * Keep a single status page and lie about the write hazard
   as suggested by Chris.
 * Use batch_start_offset for controlling the batch duration.
   (Chris)
 * Set status page object cache level. (Chris)
 * Moved workload description to a README.
 * Tidied example workloads.
 * Some other cleanups and refactorings.

v5:
 * Master and background workloads (-W / -w).
 * Single batch per step is enough even when balancing. (Chris)
 * Use hars_petruska_f54_1_random IGT functions and seed to zero
   at start. (Chris)
 * Use WC cache domain when WC mapping. (Chris)
 * Keep seqnos 64-bytes apart in the status page. (Chris)
 * Add workload throttling and queue-depth throttling commands.
   (Chris)

v6:
 * Added two more workloads.
 * Merged RT balancer from Chris.

v7:
 * Merged NO_RELOC patch from Chris.
 * Added missing RT balancer to help text.

TODO list:

 * Fence support.
 * Batch buffer caching (re-use pool).
 * Better error handling.
 * Less 1980's workload parsing.
 * More workloads.
 * Threads?
 * ... ?
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
parent cf6f2c9b
......@@ -14,6 +14,7 @@ benchmarks_prog_list = \
gem_prw \
gem_set_domain \
gem_syslatency \
gem_wsim \
kms_vblank \
prime_lookup \
vgem_mmap \
......
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <inttypes.h>
#include <errno.h>
#include <poll.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <time.h>
#include <assert.h>
#include <limits.h>
#include "intel_chipset.h"
#include "drm.h"
#include "ioctl_wrappers.h"
#include "drmtest.h"
#include "intel_io.h"
#include "igt_rand.h"
/*
 * Engines batches can be targeted at.  VCS is the "virtual" video engine
 * id which the balancers resolve to one of the physical VCS1/VCS2 rings.
 */
enum intel_engine_id {
	RCS,
	BCS,
	VCS,	/* Virtual id, load balanced onto VCS1 or VCS2. */
	VCS1,
	VCS2,
	VECS,
	NUM_ENGINES
};
/* Inclusive batch duration range in microseconds; min == max means fixed. */
struct duration {
	unsigned int min, max;
};
/* Kinds of workload steps parsed from a workload descriptor. */
enum w_type
{
	BATCH,		/* Submit a batch buffer to an engine. */
	SYNC,		/* Wait for an earlier step to complete. */
	DELAY,		/* Sleep for a fixed amount of time. */
	PERIOD,		/* Pace the workload to a fixed period. */
	THROTTLE,	/* Limit outstanding work. */
	QD_THROTTLE	/* Limit per-engine queue depth. */
};
/* One step of a workload - a parsed descriptor entry plus its runtime state. */
struct w_step
{
	/* Workload step metadata */
	enum w_type type;
	unsigned int context;		/* Context index within the workload. */
	unsigned int engine;		/* enum intel_engine_id target. */
	struct duration duration;	/* Batch duration range (usec). */
	int dependency;			/* <= 0, relative index of dependent step. */
	int wait;			/* For BATCH: sync after submit; else step argument. */

	/* Implementation details */
	unsigned int idx;		/* Position of this step in the workload. */
	struct drm_i915_gem_execbuffer2 eb;
	struct drm_i915_gem_exec_object2 obj[4];	/* target, [status page], bb, [dependency] */
	struct drm_i915_gem_relocation_entry reloc[3];	/* seqno + two RT stores */
	unsigned long bb_sz;
	uint32_t bb_handle;
	uint32_t *mapped_batch;		/* WC mapping of the batch tail. */
	uint32_t *seqno_value;		/* Pointers into the mapped batch where the */
	uint32_t *seqno_address;	/* per-submission seqno / timestamp payloads */
	uint32_t *rt0_value;		/* are patched before each execbuf. */
	uint32_t *rt0_address;
	uint32_t *rt1_address;
	unsigned int mapped_len;
};
/* A complete workload: parsed steps plus shared per-client runtime state. */
struct workload
{
	unsigned int nr_steps;
	struct w_step *steps;

	struct timespec repeat_start;	/* Start of the current iteration (PERIOD pacing). */

	int pipe[2];

	unsigned int nr_ctxs;		/* Size of ctx_id[], grown on demand. */
	uint32_t *ctx_id;		/* GEM context ids indexed by step context. */

	uint32_t seqno[NUM_ENGINES];	/* Last seqno submitted per engine. */
	uint32_t status_page_handle;
	uint32_t *status_page;		/* CPU mapping the GPU writes seqnos into. */

	unsigned int vcs_rr;		/* Round-robin state for VCS balancing. */
	unsigned long qd_sum[NUM_ENGINES];	/* Balancer statistics. */
	unsigned long nr_bb[NUM_ENGINES];
};
/* Translate an engine id into execbuf ring selection flags. */
static const unsigned int eb_engine_map[NUM_ENGINES] = {
	[RCS] = I915_EXEC_RENDER,
	[BCS] = I915_EXEC_BLT,
	[VCS] = I915_EXEC_BSD,
	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
	[VECS] = I915_EXEC_VEBOX
};
/* Duration of the nop calibration interval in microseconds. */
static const unsigned int nop_calibration_us = 1000;
/* Nops executed per calibration interval; used to size batch buffers. */
static unsigned long nop_calibration;

static bool quiet;
static int fd;	/* DRM device file descriptor. */

/* Flags controlling how per-step batches are prepared. */
#define SWAPVCS		(1<<0)	/* Swap VCS1 <-> VCS2 for this step. */
#define SEQNO		(1<<1)	/* Emit a seqno write to the status page. */
#define BALANCE		(1<<2)	/* Balance VCS work across rings. */
#define RT		(1<<3)	/* Emit timestamps for the RT balancer (implies SEQNO). */

/* Seqnos are kept 64 bytes (16 dwords) apart in the status page. */
#define VCS_SEQNO_IDX(engine) (((engine) - VCS1) * 16)
#define VCS_SEQNO_OFFSET(engine) (VCS_SEQNO_IDX(engine) * sizeof(uint32_t))

/* RCS timestamp register offset within the mmio bar. */
#define RCS_TIMESTAMP (0x2000 + 0x358)

#define REG(x) (volatile uint32_t *)((volatile char *)igt_global_mmio + x)
/*
* Workload descriptor:
*
* ctx.engine.duration.dependency.wait,...
* <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
*
* Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
*
* "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
*/
/* Engine names accepted (case-insensitively) in workload descriptors. */
static const char *ring_str_map[NUM_ENGINES] = {
	[RCS] = "RCS",
	[BCS] = "BCS",
	[VCS] = "VCS",
	[VCS1] = "VCS1",
	[VCS2] = "VCS2",
	[VECS] = "VECS",
};
/*
 * Parse a comma separated workload description into a struct workload.
 *
 * Each token is either a special step:
 *   d.<usec>  - delay
 *   p.<usec>  - period
 *   s.<neg>   - sync on an earlier step (negative relative index)
 *   t.<n>     - throttle
 *   q.<n>     - queue-depth throttle
 * or a batch step of the form:
 *   <ctx>.<engine>.<duration[-max]>.<dependency <= 0>.<wait 0|1>
 *
 * Returns a heap allocated workload, or NULL on a malformed descriptor.
 * On failure all intermediate allocations are released (the original code
 * leaked token/steps/desc on every error path).
 */
static struct workload *parse_workload(char *_desc)
{
	struct workload *wrk;
	unsigned int nr_steps = 0;
	char *desc = strdup(_desc);
	char *_token, *token = NULL, *tctx = NULL, *tstart = desc;
	char *field, *fctx = NULL, *fstart;
	struct w_step step, *steps = NULL;
	unsigned int valid;
	int tmp;

	while ((_token = strtok_r(tstart, ",", &tctx)) != NULL) {
		tstart = NULL;
		token = strdup(_token);
		fstart = token;
		valid = 0;
		memset(&step, 0, sizeof(step));

		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
			fstart = NULL;

			if (!strcasecmp(field, "d")) {
				if ((field = strtok_r(fstart, ".", &fctx)) !=
				    NULL) {
					tmp = atoi(field);
					if (tmp <= 0) {
						if (!quiet)
							fprintf(stderr,
								"Invalid delay at step %u!\n",
								nr_steps);
						goto err;
					}

					step.type = DELAY;
					step.wait = tmp;
					goto add_step;
				}
			} else if (!strcasecmp(field, "p")) {
				if ((field = strtok_r(fstart, ".", &fctx)) !=
				    NULL) {
					tmp = atoi(field);
					if (tmp <= 0) {
						if (!quiet)
							fprintf(stderr,
								"Invalid period at step %u!\n",
								nr_steps);
						goto err;
					}

					step.type = PERIOD;
					step.wait = tmp;
					goto add_step;
				}
			} else if (!strcasecmp(field, "s")) {
				if ((field = strtok_r(fstart, ".", &fctx)) !=
				    NULL) {
					tmp = atoi(field);
					/* Sync targets are backward references only. */
					if (tmp >= 0) {
						if (!quiet)
							fprintf(stderr,
								"Invalid sync target at step %u!\n",
								nr_steps);
						goto err;
					}

					step.type = SYNC;
					step.wait = tmp;
					goto add_step;
				}
			} else if (!strcasecmp(field, "t")) {
				if ((field = strtok_r(fstart, ".", &fctx)) !=
				    NULL) {
					tmp = atoi(field);
					if (tmp < 0) {
						if (!quiet)
							fprintf(stderr,
								"Invalid throttle at step %u!\n",
								nr_steps);
						goto err;
					}

					step.type = THROTTLE;
					step.wait = tmp;
					goto add_step;
				}
			} else if (!strcasecmp(field, "q")) {
				if ((field = strtok_r(fstart, ".", &fctx)) !=
				    NULL) {
					tmp = atoi(field);
					if (tmp < 0) {
						if (!quiet)
							fprintf(stderr,
								"Invalid qd throttle at step %u!\n",
								nr_steps);
						goto err;
					}

					step.type = QD_THROTTLE;
					step.wait = tmp;
					goto add_step;
				}
			}

			/* Not a special step - first field is the context id. */
			tmp = atoi(field);
			if (tmp < 0) {
				if (!quiet)
					fprintf(stderr,
						"Invalid ctx id at step %u!\n",
						nr_steps);
				goto err;
			}
			step.context = tmp;

			valid++;
		}

		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
			unsigned int i, old_valid = valid;

			fstart = NULL;

			for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
				if (!strcasecmp(field, ring_str_map[i])) {
					step.engine = i;
					valid++;
					break;
				}
			}

			if (old_valid == valid) {
				if (!quiet)
					fprintf(stderr,
						"Invalid engine id at step %u!\n",
						nr_steps);
				goto err;
			}
		}

		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
			char *sep = NULL;
			long int tmpl;

			fstart = NULL;

			/* Duration is either "<min>" or a "<min>-<max>" range. */
			tmpl = strtol(field, &sep, 10);
			if (tmpl == LONG_MIN || tmpl == LONG_MAX) {
				if (!quiet)
					fprintf(stderr,
						"Invalid duration at step %u!\n",
						nr_steps);
				goto err;
			}
			step.duration.min = tmpl;

			if (sep && *sep == '-') {
				tmpl = strtol(sep + 1, NULL, 10);
				if (tmpl == LONG_MIN || tmpl == LONG_MAX) {
					if (!quiet)
						fprintf(stderr,
							"Invalid duration range at step %u!\n",
							nr_steps);
					goto err;
				}
				step.duration.max = tmpl;
			} else {
				step.duration.max = step.duration.min;
			}

			valid++;
		}

		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
			fstart = NULL;

			/* Dependencies are backward references (<= 0). */
			tmp = atoi(field);
			if (tmp > 0) {
				if (!quiet)
					fprintf(stderr,
						"Invalid forward dependency at step %u!\n",
						nr_steps);
				goto err;
			}
			step.dependency = tmp;

			valid++;
		}

		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
			fstart = NULL;

			tmp = atoi(field);
			if (tmp != 0 && tmp != 1) {
				if (!quiet)
					fprintf(stderr,
						"Invalid wait boolean at step %u!\n",
						nr_steps);
				goto err;
			}
			step.wait = tmp;

			valid++;
		}

		/* A batch step needs all five fields to have parsed. */
		if (valid != 5) {
			if (!quiet)
				fprintf(stderr, "Invalid record at step %u!\n",
					nr_steps);
			goto err;
		}

		step.type = BATCH;

add_step:
		step.idx = nr_steps++;
		steps = realloc(steps, sizeof(step) * nr_steps);
		igt_assert(steps);
		memcpy(&steps[nr_steps - 1], &step, sizeof(step));

		free(token);
		token = NULL;
	}

	/*
	 * Zero initialise so runtime fields read later (nr_ctxs, ctx_id,
	 * seqno[], ...) start from a known state.
	 */
	wrk = calloc(1, sizeof(*wrk));
	igt_assert(wrk);

	wrk->nr_steps = nr_steps;
	wrk->steps = steps;

	free(desc);

	return wrk;

err:
	/* Error message was already printed at the failure site. */
	free(token);
	free(steps);
	free(desc);
	return NULL;
}
/*
 * Duplicate a parsed workload so each client runs with private step state.
 * Only the parsed steps are copied; all runtime fields start zeroed.
 */
static struct workload *
clone_workload(struct workload *_wrk)
{
	struct workload *wrk = calloc(1, sizeof(*wrk));

	igt_assert(wrk);

	wrk->nr_steps = _wrk->nr_steps;
	wrk->steps = calloc(wrk->nr_steps, sizeof(struct w_step));
	igt_assert(wrk->steps);
	memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps);

	return wrk;
}
/*
 * Round x down to the nearest multiple of y.  Arguments are fully
 * parenthesised so expressions such as rounddown(a + b, PAGE_SIZE)
 * expand correctly (the unparenthesised version mis-binds % over +).
 * Note both arguments are still evaluated twice - no side effects.
 */
#define rounddown(x, y) ((x) - ((x) % (y)))
#ifndef PAGE_SIZE
#define PAGE_SIZE (4096)
#endif
/*
 * Pick a batch duration: fixed when min == max, otherwise uniformly
 * random in the inclusive [min, max] range.
 */
static unsigned int get_duration(struct duration *dur)
{
	unsigned int span = dur->max - dur->min;

	if (!span)
		return dur->min;

	return dur->min + hars_petruska_f54_1_random_unsafe() % (span + 1);
}
/*
 * Size in bytes of a nop batch lasting approximately the requested
 * duration (usec), derived from the measured nop calibration, rounded
 * up to a whole dword.
 */
static unsigned long get_bb_sz(unsigned int duration)
{
	unsigned long sz = duration * nop_calibration * sizeof(uint32_t) /
			   nop_calibration_us;

	return ALIGN(sz, sizeof(uint32_t));
}
/*
 * Emit the tail of a batch buffer: an optional seqno store, optional
 * RT-balancer timestamp stores, and the batch buffer end command.
 *
 * Only the last page(s) containing the tail are mapped (WC).  Pointers to
 * the dword payloads (seqno value/address, timestamp addresses) are stashed
 * in the step so they can be re-patched before every submission.  The
 * actual execution start is controlled elsewhere via batch_start_offset.
 */
static void
terminate_bb(struct w_step *w, unsigned int flags)
{
	const uint32_t bbe = 0xa << 23;
	unsigned long mmap_start, mmap_len;
	unsigned long batch_start = w->bb_sz;
	uint32_t *ptr, *cs;

	/* RT implies SEQNO - the relocation slots assume that layout. */
	igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));

	/* Work out where the tail commands begin, back from the end. */
	batch_start -= sizeof(uint32_t); /* bbend */
	if (flags & SEQNO)
		batch_start -= 4 * sizeof(uint32_t);
	if (flags & RT)
		batch_start -= 8 * sizeof(uint32_t);

	mmap_start = rounddown(batch_start, PAGE_SIZE);
	mmap_len = w->bb_sz - mmap_start;

	gem_set_domain(fd, w->bb_handle,
		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);

	ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
	cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);

	if (flags & SEQNO) {
		/* MI_STORE_DWORD_IMM: [cmd][addr lo][addr hi][value] */
		w->reloc[0].offset = batch_start + sizeof(uint32_t);
		batch_start += 4 * sizeof(uint32_t);

		*cs++ = MI_STORE_DWORD_IMM;
		w->seqno_address = cs;
		*cs++ = 0;
		*cs++ = 0;
		w->seqno_value = cs;
		*cs++ = 0;
	}

	if (flags & RT) {
		/* Second seqno store used by the RT balancer. */
		w->reloc[1].offset = batch_start + sizeof(uint32_t);
		batch_start += 4 * sizeof(uint32_t);

		*cs++ = MI_STORE_DWORD_IMM;
		w->rt0_address = cs;
		*cs++ = 0;
		*cs++ = 0;
		w->rt0_value = cs;
		*cs++ = 0;

		/* Store the RCS timestamp register next to it. */
		w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
		batch_start += 4 * sizeof(uint32_t);

		*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
		*cs++ = RCS_TIMESTAMP;
		w->rt1_address = cs;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs = bbe;

	w->mapped_batch = ptr;
	w->mapped_len = mmap_len;
}
/*
 * Point the step's execbuf at the given engine.  NO_RELOC and HANDLE_LUT
 * are always used; flags is currently unused here.
 */
static void
eb_update_flags(struct w_step *w, enum intel_engine_id engine,
		unsigned int flags)
{
	w->eb.flags = eb_engine_map[engine] |
		      I915_EXEC_HANDLE_LUT |
		      I915_EXEC_NO_RELOC;
}
/*
 * Allocate and populate the GEM objects and execbuf for one BATCH step.
 *
 * Object list layout (HANDLE_LUT indices):
 *   obj[0] - a dummy write target other steps can depend on,
 *   obj[1] - the shared status page (only with SEQNO),
 *   next   - the nop batch itself,
 *   last   - the dependency step's obj[0], if the step has a dependency
 *            (the batch handle is moved to the last slot so the batch
 *            stays the final object as execbuf requires).
 */
static void
alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
{
	enum intel_engine_id engine = w->engine;
	unsigned int bb_i, j = 0;

	w->obj[j].handle = gem_create(fd, 4096);
	w->obj[j].flags = EXEC_OBJECT_WRITE;
	j++;

	if (flags & SEQNO) {
		w->obj[j].handle = wrk->status_page_handle;
		j++;
	}

	bb_i = j++;
	w->bb_sz = get_bb_sz(w->duration.max);
	w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
	terminate_bb(w, flags);

	igt_assert(w->dependency <= 0);
	if (w->dependency) {
		int dep_idx = w->idx + w->dependency;

		igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
		igt_assert(wrk->steps[dep_idx].type == BATCH);

		/* Shift the batch to the end, dependency takes its slot. */
		w->obj[j].handle = w->obj[bb_i].handle;
		bb_i = j;
		w->obj[j - 1].handle = wrk->steps[dep_idx].obj[0].handle;
		j++;
	}

	if (flags & SEQNO) {
		/* Relocations target the status page (LUT index 1). */
		w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
		if (flags & RT)
			w->obj[bb_i].relocation_count = 3;
		else
			w->obj[bb_i].relocation_count = 1;
		for (int i = 0; i < w->obj[bb_i].relocation_count; i++)
			w->reloc[i].target_handle = 1;
	}

	w->eb.buffers_ptr = to_user_pointer(w->obj);
	w->eb.buffer_count = j;
	w->eb.rsvd1 = wrk->ctx_id[w->context];

	if (flags & SWAPVCS && engine == VCS1)
		engine = VCS2;
	else if (flags & SWAPVCS && engine == VCS2)
		engine = VCS1;
	eb_update_flags(w, engine, flags);
#ifdef DEBUG
	printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
	       w->idx, w->eb.buffer_count, w->obj[0].handle,
	       w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
	       w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
	       w->context, wrk->ctx_id[w->context]);
#endif
}
/*
 * Prepare a workload for execution: create the status page (SEQNO mode),
 * create one GEM context per distinct context id referenced by the steps,
 * then allocate batch buffers for every BATCH step.
 *
 * NOTE(review): relies on wrk->nr_ctxs/ctx_id being zeroed on entry -
 * verify the workload was allocated zero-initialised.
 */
static void
prepare_workload(struct workload *wrk, unsigned int flags)
{
	int max_ctx = -1;
	struct w_step *w;
	int i;

	if (flags & SEQNO) {
		const unsigned int status_sz = sizeof(uint32_t);
		uint32_t handle = gem_create(fd, status_sz);

		gem_set_caching(fd, handle, I915_CACHING_CACHED);
		wrk->status_page_handle = handle;
		wrk->status_page = gem_mmap__cpu(fd, handle, 0, status_sz,
						 PROT_READ);
	}

	/* First pass: grow the ctx_id table and create missing contexts. */
	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
		if ((int)w->context > max_ctx) {
			int delta = w->context + 1 - wrk->nr_ctxs;

			wrk->nr_ctxs += delta;
			wrk->ctx_id = realloc(wrk->ctx_id,
					      wrk->nr_ctxs * sizeof(uint32_t));
			/* New slots start as 0 == "no context yet". */
			memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
			       delta * sizeof(uint32_t));

			max_ctx = w->context;
		}

		if (!wrk->ctx_id[w->context]) {
			struct drm_i915_gem_context_create arg = {};

			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
			igt_assert(arg.ctx_id);

			wrk->ctx_id[w->context] = arg.ctx_id;
		}
	}

	/* Second pass: allocate batches for the BATCH steps. */
	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
		unsigned int _flags = flags;
		enum intel_engine_id engine = w->engine;

		if (w->type != BATCH)
			continue;

		/* Seqno/timestamp writes are only used for VCS balancing. */
		if (engine != VCS && engine != VCS1 && engine != VCS2)
			_flags &= ~(SEQNO | RT);

		if (engine == VCS)
			_flags &= ~SWAPVCS;

		alloc_step_batch(wrk, w, _flags);
	}
}
/* Seconds between two timespec samples, as a double. */
static double elapsed(const struct timespec *start, const struct timespec *end)
{
	double secs = (double)(end->tv_sec - start->tv_sec);
	double nsecs = (double)(end->tv_nsec - start->tv_nsec);

	return secs + nsecs / 1e9;
}
/* Microseconds between two timespec samples, truncated to int. */
static int elapsed_us(const struct timespec *start, const struct timespec *end)
{
	double us = elapsed(start, end) * 1e6;

	return (int)us;
}
/* Map a balancer index (0 or 1) to a physical video engine. */
static enum intel_engine_id get_vcs_engine(unsigned int n)
{
	igt_assert(n < 2);

	return n ? VCS2 : VCS1;
}
/*
 * A VCS load balancing strategy.  balance() picks the engine for a step;
 * get_qd() (optional) reports the estimated queue depth of an engine.
 */
struct workload_balancer {
	unsigned int (*get_qd)(const struct workload_balancer *balancer,
			       struct workload *wrk,
			       enum intel_engine_id engine);
	enum intel_engine_id (*balance)(const struct workload_balancer *balancer,
					struct workload *wrk, struct w_step *w);
};
static enum intel_engine_id
rr_balance(const struct workload_balancer *balancer,
struct workload *wrk, struct w_step *w)
{
unsigned int engine;
engine = get_vcs_engine(wrk->vcs_rr);
wrk->vcs_rr ^= 1;
return engine;
}
/* Simple round-robin balancer; needs no queue-depth feedback. */
static const struct workload_balancer rr_balancer = {
	.balance = rr_balance,
};
/*
 * Estimated queue depth on an engine: the last seqno submitted minus the
 * last seqno the GPU wrote back to the status page.
 */
static unsigned int
get_qd_depth(const struct workload_balancer *balancer,
	     struct workload *wrk, enum intel_engine_id engine)
{
	return wrk->seqno[engine] -
	       wrk->status_page[VCS_SEQNO_IDX(engine)];
}
/*
 * Send the step to whichever VCS engine has the shallower estimated
 * queue, breaking ties round-robin.
 */
static enum intel_engine_id
qd_balance(const struct workload_balancer *balancer,
	   struct workload *wrk, struct w_step *w)
{
	long qd[NUM_ENGINES];
	enum intel_engine_id engine;
	unsigned int next;

	igt_assert(w->engine == VCS);

	qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
	wrk->qd_sum[VCS1] += qd[VCS1];

	qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
	wrk->qd_sum[VCS2] += qd[VCS2];

	if (qd[VCS2] < qd[VCS1])
		next = 1;
	else if (qd[VCS1] < qd[VCS2])
		next = 0;
	else
		next = wrk->vcs_rr;

	engine = get_vcs_engine(next);
	wrk->vcs_rr = next ^ 1;

#ifdef DEBUG
	printf("qd_balance: 1:%ld 2:%ld rr:%u = %u\t(%lu - %u) (%lu - %u)\n",
	       qd[VCS1], qd[VCS2], wrk->vcs_rr, engine,
	       wrk->seqno[VCS1], wrk->status_page[VCS_SEQNO_IDX(VCS1)],
	       wrk->seqno[VCS2], wrk->status_page[VCS_SEQNO_IDX(VCS2)]);
#endif

	return engine;
}
/* Queue-depth balancer driven by status-page seqno feedback. */
static const struct workload_balancer qd_balancer = {
	.get_qd = get_qd_depth,
	.balance = qd_balance,
};
static enum intel_engine_id
rt_balance(const struct workload_balancer *balancer,
struct workload *wrk, struct w_step *w)
{
enum