/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "config.h"

#include <linux/userfaultfd.h>

#include <pthread.h>
#include <sys/poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

#include "i915/gem.h"
#include "i915/gem_ring.h"
#include "igt.h"
#include "igt_rand.h"
#include "igt_rapl.h"
#include "igt_sysfs.h"
#include "igt_vgem.h"
#include "sw_sync.h"

#define LO 0
#define HI 1
#define NOISE 2

#define MAX_PRIO I915_CONTEXT_MAX_USER_PRIORITY
#define MIN_PRIO I915_CONTEXT_MIN_USER_PRIORITY

#define MAX_CONTEXTS 1024
#define MAX_ELSP_QLEN 16
#define MAX_ENGINES (I915_EXEC_RING_MASK + 1)

#define MI_SEMAPHORE_WAIT		(0x1c << 23)
#define   MI_SEMAPHORE_POLL             (1 << 15)
#define   MI_SEMAPHORE_SAD_GT_SDD       (0 << 12)
#define   MI_SEMAPHORE_SAD_GTE_SDD      (1 << 12)
#define   MI_SEMAPHORE_SAD_LT_SDD       (2 << 12)
#define   MI_SEMAPHORE_SAD_LTE_SDD      (3 << 12)
#define   MI_SEMAPHORE_SAD_EQ_SDD       (4 << 12)
#define   MI_SEMAPHORE_SAD_NEQ_SDD      (5 << 12)

IGT_TEST_DESCRIPTION("Check that we can control the order of execution");

static unsigned int offset_in_page(void *addr)
{
	return (uintptr_t)addr & 4095;
}

static inline
uint32_t __sync_read_u32(int fd, uint32_t handle, uint64_t offset)
{
	uint32_t value;

	gem_set_domain(fd, handle, /* No write hazard lies! */
		       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
	gem_read(fd, handle, offset, &value, sizeof(value));

	return value;
}

static inline
void __sync_read_u32_count(int fd, uint32_t handle, uint32_t *dst, uint64_t size)
{
	gem_set_domain(fd, handle, /* No write hazard lies! */
		       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
	gem_read(fd, handle, 0, dst, size);
}

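/*
 * Emit a batch on the given context/engine that stores a single dword
 * (value) into the target object at the given offset. The submission may
 * be held back behind a cork object and/or an input fence, and a write
 * hazard is only declared if write_domain is set. Returns the handle of
 * the batch object so the caller can wait on it or discard it.
 */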
static uint32_t __store_dword(int fd, uint32_t ctx, unsigned ring,
			      uint32_t target, uint32_t offset, uint32_t value,
			      uint32_t cork, int fence, unsigned write_domain)
{
	const int gen = intel_gen(intel_get_drm_devid(fd));
	struct drm_i915_gem_exec_object2 obj[3];
	struct drm_i915_gem_relocation_entry reloc;
	struct drm_i915_gem_execbuffer2 execbuf;
	uint32_t batch[16];
	int i;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(obj + !cork);
	execbuf.buffer_count = 2 + !!cork;
	execbuf.flags = ring;
	if (gen < 6)
		execbuf.flags |= I915_EXEC_SECURE;
	execbuf.rsvd1 = ctx;

	if (fence != -1) {
		execbuf.flags |= I915_EXEC_FENCE_IN;
		execbuf.rsvd2 = fence;
	}

	memset(obj, 0, sizeof(obj));
	obj[0].handle = cork;
	obj[0].offset = cork << 20;
	obj[1].handle = target;
	obj[1].offset = target << 20;
	obj[2].handle = gem_create(fd, 4096);
	obj[2].offset = 256 << 10;
	obj[2].offset += (random() % 128) << 12;

	memset(&reloc, 0, sizeof(reloc));
	reloc.target_handle = obj[1].handle;
	reloc.presumed_offset = obj[1].offset;
	reloc.offset = sizeof(uint32_t);
	reloc.delta = offset;
	reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
	reloc.write_domain = write_domain;
	obj[2].relocs_ptr = to_user_pointer(&reloc);
	obj[2].relocation_count = 1;

	i = 0;
	batch[i] = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
	if (gen >= 8) {
		batch[++i] = reloc.presumed_offset + reloc.delta;
		batch[++i] = 0;
	} else if (gen >= 4) {
		batch[++i] = 0;
		batch[++i] = reloc.presumed_offset + reloc.delta;
		reloc.offset += sizeof(uint32_t);
	} else {
		batch[i]--;
		batch[++i] = reloc.presumed_offset + reloc.delta;
	}
	batch[++i] = value;
	batch[++i] = MI_BATCH_BUFFER_END;
	gem_write(fd, obj[2].handle, 0, batch, sizeof(batch));
	gem_execbuf(fd, &execbuf);

	return obj[2].handle;
}

static void store_dword(int fd, uint32_t ctx, unsigned ring,
			uint32_t target, uint32_t offset, uint32_t value,
			unsigned write_domain)
{
	gem_close(fd, __store_dword(fd, ctx, ring,
				    target, offset, value,
				    0, -1, write_domain));
}

static void store_dword_plug(int fd, uint32_t ctx, unsigned ring,
			     uint32_t target, uint32_t offset, uint32_t value,
			     uint32_t cork, unsigned write_domain)
{
	gem_close(fd, __store_dword(fd, ctx, ring,
				    target, offset, value,
				    cork, -1, write_domain));
}

static void store_dword_fenced(int fd, uint32_t ctx, unsigned ring,
			       uint32_t target, uint32_t offset, uint32_t value,
			       int fence, unsigned write_domain)
{
	gem_close(fd, __store_dword(fd, ctx, ring,
				    target, offset, value,
				    0, fence, write_domain));
}

static uint32_t create_highest_priority(int fd)
{
	uint32_t ctx = gem_context_clone_with_engines(fd, 0);

	/*
	 * If there is no priority support, all contexts will have equal
	 * priority (and therefore the max user priority), so no context
	 * can overtake us, and we effectively can form a plug.
	 */
	__gem_context_set_priority(fd, ctx, MAX_PRIO);

	return ctx;
}

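/*
 * Fill the engine with a queue of maximum-priority spinners before
 * unplugging the cork, so the corked batches are released into an already
 * busy engine, then dump the engine state to the log for debugging.
 */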
static void unplug_show_queue(int fd, struct igt_cork *c, unsigned int engine)
{
	igt_spin_t *spin[MAX_ELSP_QLEN];
	int max = MAX_ELSP_QLEN;

	/* If no scheduler, all batches are emitted in submission order */
	if (!gem_scheduler_enabled(fd))
		max = 1;

	for (int n = 0; n < max; n++) {
		const struct igt_spin_factory opts = {
			.ctx = create_highest_priority(fd),
			.engine = engine,
		};
		spin[n] = __igt_spin_factory(fd, &opts);
		gem_context_destroy(fd, opts.ctx);
	}

	igt_cork_unplug(c); /* batches will now be queued on the engine */
	igt_debugfs_dump(fd, "i915_engine_info");

	for (int n = 0; n < max; n++)
		igt_spin_free(fd, spin[n]);

}

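/*
 * Two stores to the same dword from the same context and engine must
 * retire in submission order, so the second value is the one that lands.
 */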
static void fifo(int fd, unsigned ring)
{
	IGT_CORK_FENCE(cork);
	uint32_t scratch;
	uint32_t result;
	int fence;

	scratch = gem_create(fd, 4096);

	fence = igt_cork_plug(&cork, fd);

	/* Same priority, same timeline, final result will be the second eb */
	store_dword_fenced(fd, 0, ring, scratch, 0, 1, fence, 0);
	store_dword_fenced(fd, 0, ring, scratch, 0, 2, fence, 0);

	unplug_show_queue(fd, &cork, ring);
	close(fence);

	result = __sync_read_u32(fd, scratch, 0);
	gem_close(fd, scratch);

	igt_assert_eq_u32(result, 2);
}

enum implicit_dir {
	READ_WRITE = 0x1,
	WRITE_READ = 0x2,
};

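/*
 * Check the ordering of implicit read/write hazards on a shared scratch:
 * a store that declares a write domain on the target engine must be
 * ordered against stores on all other engines that only declare reads,
 * in whichever direction (read->write, write->read) is requested.
 */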
static void implicit_rw(int i915, unsigned ring, enum implicit_dir dir)
{
	const struct intel_execution_engine2 *e;
	IGT_CORK_FENCE(cork);
	unsigned int count;
	uint32_t scratch;
	uint32_t result;
	int fence;

	count = 0;
	__for_each_physical_engine(i915, e) {
		if (e->flags == ring)
			continue;

		if (!gem_class_can_store_dword(i915, e->class))
			continue;

		count++;
	}
	igt_require(count);

	scratch = gem_create(i915, 4096);
	fence = igt_cork_plug(&cork, i915);

	if (dir & WRITE_READ)
		store_dword_fenced(i915, 0,
				   ring, scratch, 0, ~ring,
				   fence, I915_GEM_DOMAIN_RENDER);

	__for_each_physical_engine(i915, e) {
		if (e->flags == ring)
			continue;

		if (!gem_class_can_store_dword(i915, e->class))
			continue;

		store_dword_fenced(i915, 0,
				   e->flags, scratch, 0, e->flags,
				   fence, 0);
	}

	if (dir & READ_WRITE)
		store_dword_fenced(i915, 0,
				   ring, scratch, 0, ring,
				   fence, I915_GEM_DOMAIN_RENDER);

	unplug_show_queue(i915, &cork, ring);
	close(fence);

	result = __sync_read_u32(i915, scratch, 0);
	gem_close(i915, scratch);

	if (dir & WRITE_READ)
		igt_assert_neq_u32(result, ~ring);
	if (dir & READ_WRITE)
		igt_assert_eq_u32(result, ring);
}

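/*
 * While every other engine is blocked by a spinner, a batch submitted to
 * the target engine must still run to completion independently.
 */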
static void independent(int fd, unsigned int engine)
{
	const struct intel_execution_engine2 *e;
	IGT_CORK_FENCE(cork);
	igt_spin_t *spin = NULL;
	uint32_t scratch, batch;
	uint32_t *ptr;
	int fence;

	scratch = gem_create(fd, 4096);
	ptr = gem_mmap__device_coherent(fd, scratch, 0, 4096, PROT_READ);
	igt_assert_eq(ptr[0], 0);

	fence = igt_cork_plug(&cork, fd);

	/* Check that we can submit to engine while all others are blocked */
	__for_each_physical_engine(fd, e) {
		if (e->flags == engine)
			continue;

		if (!gem_class_can_store_dword(fd, e->class))
			continue;

		if (spin == NULL) {
			spin = __igt_spin_new(fd, .engine = e->flags);
		} else {
			struct drm_i915_gem_execbuffer2 eb = {
				.buffer_count = 1,
				.buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
				.flags = e->flags,
			};
			gem_execbuf(fd, &eb);
		}

		store_dword_fenced(fd, 0, e->flags, scratch, 0, e->flags, fence, 0);
	}
	igt_require(spin);

	/* Same priority, but different timeline (as different engine) */
	batch = __store_dword(fd, 0, engine, scratch, 0, engine, 0, fence, 0);

	unplug_show_queue(fd, &cork, engine);
	close(fence);

	gem_sync(fd, batch);
	igt_assert(!gem_bo_busy(fd, batch));
	igt_assert(gem_bo_busy(fd, spin->handle));
	gem_close(fd, batch);

	/* Only the local engine should be free to complete. */
	igt_assert(gem_bo_busy(fd, scratch));
	igt_assert_eq(ptr[0], engine);

	igt_spin_free(fd, spin);
	gem_quiescent_gpu(fd);

	/* And we expect the others to have overwritten us, order unspecified */
	igt_assert(!gem_bo_busy(fd, scratch));
	igt_assert_neq(ptr[0], engine);

	munmap(ptr, 4096);
	gem_close(fd, scratch);
}

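/*
 * Fork a process per CPU, each submitting stores to random engines with
 * randomised context priorities, then verify each child's sentinel value
 * landed in the scratch buffer.
 */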
static void smoketest(int fd, unsigned ring, unsigned timeout)
{
	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	const struct intel_execution_engine2 *e;
	unsigned engines[MAX_ENGINES];
	unsigned nengine;
	unsigned engine;
	uint32_t scratch;
	uint32_t result[2 * ncpus];

	nengine = 0;
	if (ring == ALL_ENGINES) {
		__for_each_physical_engine(fd, e)
			if (gem_class_can_store_dword(fd, e->class))
				engines[nengine++] = e->flags;
	} else {
		engines[nengine++] = ring;
	}
	igt_require(nengine);

	scratch = gem_create(fd, 4096);
	igt_fork(child, ncpus) {
		unsigned long count = 0;
		uint32_t ctx;

		hars_petruska_f54_1_random_perturb(child);

		ctx = gem_context_clone_with_engines(fd, 0);
		igt_until_timeout(timeout) {
			int prio;

			prio = hars_petruska_f54_1_random_unsafe_max(MAX_PRIO - MIN_PRIO) + MIN_PRIO;
			gem_context_set_priority(fd, ctx, prio);

			engine = engines[hars_petruska_f54_1_random_unsafe_max(nengine)];
			store_dword(fd, ctx, engine, scratch,
				    8*child + 0, ~child,
				    0);
			for (unsigned int step = 0; step < 8; step++)
				store_dword(fd, ctx, engine, scratch,
					    8*child + 4, count++,
					    0);
		}
		gem_context_destroy(fd, ctx);
	}
	igt_waitchildren();

	__sync_read_u32_count(fd, scratch, result, sizeof(result));
	gem_close(fd, scratch);

	for (unsigned n = 0; n < ncpus; n++) {
		igt_assert_eq_u32(result[2 * n], ~n);
		/*
		 * Note this count is approximate due to unconstrained
		 * ordering of the dword writes between engines.
		 *
		 * Take the result with a pinch of salt.
		 */
		igt_info("Child[%d] completed %u cycles\n", n, result[(2 * n) + 1]);
	}
}

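/*
 * Build two batches in one object that ping-pong via MI_SEMAPHORE_WAIT on
 * a shared dword: each batch only advances after the other has written
 * the next value, so both can only complete if the scheduler timeslices
 * between them on the same engine.
 */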
static uint32_t timeslicing_batches(int i915, uint32_t *offset)
{
        uint32_t handle = gem_create(i915, 4096);
        uint32_t cs[256];

	*offset += 4000;
	for (int pair = 0; pair <= 1; pair++) {
		int x = 1;
		int i = 0;

		for (int step = 0; step < 8; step++) {
			if (pair) {
				cs[i++] =
					MI_SEMAPHORE_WAIT |
					MI_SEMAPHORE_POLL |
					MI_SEMAPHORE_SAD_EQ_SDD |
					(4 - 2);
				cs[i++] = x++;
				cs[i++] = *offset;
				cs[i++] = 0;
			}

			cs[i++] = MI_STORE_DWORD_IMM;
			cs[i++] = *offset;
			cs[i++] = 0;
			cs[i++] = x++;

			if (!pair) {
				cs[i++] =
					MI_SEMAPHORE_WAIT |
					MI_SEMAPHORE_POLL |
					MI_SEMAPHORE_SAD_EQ_SDD |
					(4 - 2);
				cs[i++] = x++;
				cs[i++] = *offset;
				cs[i++] = 0;
			}
		}

		cs[i++] = MI_BATCH_BUFFER_END;
		igt_assert(i < ARRAY_SIZE(cs));
		gem_write(i915, handle, pair * sizeof(cs), cs, sizeof(cs));
	}

	*offset = sizeof(cs);
        return handle;
}

static void timeslice(int i915, unsigned int engine)
{
	unsigned int offset = 24 << 20;
	struct drm_i915_gem_exec_object2 obj = {
		.offset = offset,
		.flags = EXEC_OBJECT_PINNED,
	};
	struct drm_i915_gem_execbuffer2 execbuf  = {
		.buffers_ptr = to_user_pointer(&obj),
		.buffer_count = 1,
	};
	uint32_t *result;
	int out;

	/*
	 * Create a pair of interlocking batches that ping-pong
	 * between each other, and only advance one step at a time.
	 * We require the kernel to preempt at each semaphore and
	 * switch to the other batch in order to advance.
	 */

	igt_require(gem_scheduler_has_semaphores(i915));
	igt_require(gem_scheduler_has_preemption(i915));
	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);

	obj.handle = timeslicing_batches(i915, &offset);
	result = gem_mmap__device_coherent(i915, obj.handle, 0, 4096, PROT_READ);

	execbuf.flags = engine | I915_EXEC_FENCE_OUT;
	execbuf.batch_start_offset = 0;
	gem_execbuf_wr(i915, &execbuf);

	/* No coupling between requests; free to timeslice */

	execbuf.rsvd1 = gem_context_clone_with_engines(i915, 0);
	execbuf.rsvd2 >>= 32;
	execbuf.flags = engine | I915_EXEC_FENCE_OUT;
	execbuf.batch_start_offset = offset;
	gem_execbuf_wr(i915, &execbuf);
	gem_context_destroy(i915, execbuf.rsvd1);

	gem_sync(i915, obj.handle);
	gem_close(i915, obj.handle);

	/* no hangs! */
	out = execbuf.rsvd2;
	igt_assert_eq(sync_fence_status(out), 1);
	close(out);

	out = execbuf.rsvd2 >> 32;
	igt_assert_eq(sync_fence_status(out), 1);
	close(out);

	igt_assert_eq(result[1000], 16);
	munmap(result, 4096);
}

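/*
 * Build 'count' batches that take turns advancing a shared counter: each
 * batch waits for the counter to reach its value before writing the next,
 * so every batch must be timesliced in round-robin for any to complete.
 */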
static uint32_t timesliceN_batches(int i915, uint32_t offset, int count)
{
        uint32_t handle = gem_create(i915, (count + 1) * 1024);
        uint32_t cs[256];

	for (int pair = 0; pair < count; pair++) {
		int x = pair;
		int i = 0;

		for (int step = 0; step < 8; step++) {
			cs[i++] =
				MI_SEMAPHORE_WAIT |
				MI_SEMAPHORE_POLL |
				MI_SEMAPHORE_SAD_EQ_SDD |
				(4 - 2);
			cs[i++] = x;
			cs[i++] = offset;
			cs[i++] = 0;

			cs[i++] = MI_STORE_DWORD_IMM;
			cs[i++] = offset;
			cs[i++] = 0;
			cs[i++] = x + 1;

			x += count;
		}

		cs[i++] = MI_BATCH_BUFFER_END;
		igt_assert(i < ARRAY_SIZE(cs));
		gem_write(i915, handle, (pair + 1) * sizeof(cs),
			  cs, sizeof(cs));
	}

        return handle;
}

static void timesliceN(int i915, unsigned int engine, int count)
{
	const unsigned int sz = ALIGN((count + 1) * 1024, 4096);
	unsigned int offset = 24 << 20;
	struct drm_i915_gem_exec_object2 obj = {
		.handle = timesliceN_batches(i915, offset, count),
		.offset = offset,
		.flags = EXEC_OBJECT_PINNED,
	};
	struct drm_i915_gem_execbuffer2 execbuf  = {
		.buffers_ptr = to_user_pointer(&obj),
		.buffer_count = 1,
		.flags = engine | I915_EXEC_FENCE_OUT,
	};
	uint32_t *result =
		gem_mmap__device_coherent(i915, obj.handle, 0, sz, PROT_READ);
	int fence[count];

	/*
	 * Create a chain of 'count' interlocking batches that ping-pong
	 * between each other, and only advance one step at a time.
	 * We require the kernel to preempt at each semaphore and
	 * switch to the other batch in order to advance.
	 */

	igt_require(gem_scheduler_has_semaphores(i915));
	igt_require(gem_scheduler_has_preemption(i915));
	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);

	/* No coupling between requests; free to timeslice */

	for (int i = 0; i < count; i++) {
		execbuf.rsvd1 = gem_context_clone_with_engines(i915, 0);
		execbuf.batch_start_offset = (i + 1) * 1024;
		gem_execbuf_wr(i915, &execbuf);
		gem_context_destroy(i915, execbuf.rsvd1);

		fence[i] = execbuf.rsvd2 >> 32;
	}

	gem_sync(i915, obj.handle);
	gem_close(i915, obj.handle);

	/* no hangs! */
	for (int i = 0; i < count; i++) {
		igt_assert_eq(sync_fence_status(fence[i]), 1);
		close(fence[i]);
	}

	igt_assert_eq(*result, 8 * count);
	munmap(result, sz);
}

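/*
 * Start two strictly ordered spinners (between which timeslicing would be
 * pointless), then add a third independent spinner and check that
 * timeslicing still kicks in, allowing the late arrival to run.
 */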
static void lateslice(int i915, unsigned int engine)
{
	igt_spin_t *spin[3];
	uint32_t ctx;

	igt_require(gem_scheduler_has_semaphores(i915));
	igt_require(gem_scheduler_has_preemption(i915));
	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);

	ctx = gem_context_create(i915);
	spin[0] = igt_spin_new(i915, .ctx = ctx, .engine = engine,
			       .flags = (IGT_SPIN_POLL_RUN |
					 IGT_SPIN_FENCE_OUT));
	gem_context_destroy(i915, ctx);

	igt_spin_busywait_until_started(spin[0]);

	ctx = gem_context_create(i915);
	spin[1] = igt_spin_new(i915, .ctx = ctx, .engine = engine,
			       .fence = spin[0]->out_fence,
			       .flags = (IGT_SPIN_POLL_RUN |
					 IGT_SPIN_FENCE_IN));
	gem_context_destroy(i915, ctx);

	usleep(5000); /* give some time for the new spinner to be scheduled */

	/*
	 * Now that we have two spinners in the HW submission queue [ELSP],
	 * and since they are strictly ordered, the timeslicing timer may
	 * be disabled as no reordering is possible. However, upon adding a
	 * third spinner we then expect timeslicing to be truly enabled.
	 */

	ctx = gem_context_create(i915);
	spin[2] = igt_spin_new(i915, .ctx = ctx, .engine = engine,
			       .flags = IGT_SPIN_POLL_RUN);
	gem_context_destroy(i915, ctx);

	igt_spin_busywait_until_started(spin[2]);

	igt_assert(gem_bo_busy(i915, spin[0]->handle));
	igt_assert(gem_bo_busy(i915, spin[1]->handle));
	igt_assert(gem_bo_busy(i915, spin[2]->handle));

	igt_assert(!igt_spin_has_started(spin[1]));
	igt_spin_free(i915, spin[0]);

	/* Now just spin[1] and spin[2] active */
	igt_spin_busywait_until_started(spin[1]);

	igt_assert(gem_bo_busy(i915, spin[2]->handle));
	igt_spin_free(i915, spin[2]);

	igt_assert(gem_bo_busy(i915, spin[1]->handle));
	igt_spin_free(i915, spin[1]);
}

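/*
 * From a context sharing the same vm, submit a batch coupled to the
 * spinner with a submit fence; it overwrites the spinner's loop with
 * MI_BATCH_BUFFER_END, terminating the spin.
 */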
static void cancel_spinner(int i915,
			   uint32_t ctx, unsigned int engine,
			   igt_spin_t *spin)
{
	struct drm_i915_gem_exec_object2 obj = {
		.handle = gem_create(i915, 4096),
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = to_user_pointer(&obj),
		.buffer_count = 1,
		.flags = engine | I915_EXEC_FENCE_SUBMIT,
		.rsvd1 = ctx, /* same vm */
		.rsvd2 = spin->out_fence,
	};
	uint32_t *map, *cs;

	map = gem_mmap__device_coherent(i915, obj.handle, 0, 4096, PROT_WRITE);
	cs = map;

	*cs++ = MI_STORE_DWORD_IMM;
	*cs++ = spin->obj[IGT_SPIN_BATCH].offset +
		offset_in_page(spin->condition);
	*cs++ = spin->obj[IGT_SPIN_BATCH].offset >> 32;
	*cs++ = MI_BATCH_BUFFER_END;

	*cs++ = MI_BATCH_BUFFER_END;
	munmap(map, 4096);

	gem_execbuf(i915, &execbuf);
	gem_close(i915, obj.handle);
}

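/*
 * For each engine capable of the store, start a spinner on a context
 * restricted to the target engine, then remap that context to the
 * would-be cancelling engine and send the cancelling store from there,
 * coupled by a submit fence (optionally released early or late via
 * sw_sync); the spinner must still be cancelled rather than blocking.
 */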
static void submit_slice(int i915,
			 const struct intel_execution_engine2 *e,
			 unsigned int flags)
#define EARLY_SUBMIT 0x1
#define LATE_SUBMIT 0x2
{
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines , 1) = {};
	const struct intel_execution_engine2 *cancel;
	struct drm_i915_gem_context_param param = {
		.ctx_id = gem_context_create(i915),
		.param = I915_CONTEXT_PARAM_ENGINES,
		.value = to_user_pointer(&engines),
		.size = sizeof(engines),
	};

	/*
	 * When using a submit fence, we do not want to block concurrent work,
	 * especially when that work is cooperating with the spinner.
	 */

	igt_require(gem_scheduler_has_semaphores(i915));
	igt_require(gem_scheduler_has_preemption(i915));
	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);

	__for_each_physical_engine(i915, cancel) {
		igt_spin_t *bg, *spin;
		int timeline = -1;
		int fence = -1;

		if (!gem_class_can_store_dword(i915, cancel->class))
			continue;

		igt_debug("Testing cancellation from %s\n", e->name);

		bg = igt_spin_new(i915, .engine = e->flags);

		if (flags & LATE_SUBMIT) {
			timeline = sw_sync_timeline_create();
			fence = sw_sync_timeline_create_fence(timeline, 1);
		}

		engines.engines[0].engine_class = e->class;
		engines.engines[0].engine_instance = e->instance;
		gem_context_set_param(i915, &param);
		spin = igt_spin_new(i915, .ctx = param.ctx_id,
				    .fence = fence,
				    .flags =
				    IGT_SPIN_POLL_RUN |
				    (flags & LATE_SUBMIT ? IGT_SPIN_FENCE_IN : 0) |
				    IGT_SPIN_FENCE_OUT);
		if (fence != -1)
			close(fence);

		if (flags & EARLY_SUBMIT)
			igt_spin_busywait_until_started(spin);

		engines.engines[0].engine_class = cancel->class;
		engines.engines[0].engine_instance = cancel->instance;
		gem_context_set_param(i915, &param);
		cancel_spinner(i915, param.ctx_id, 0, spin);

		if (timeline != -1)
			close(timeline);

		gem_sync(i915, spin->handle);
		igt_spin_free(i915, spin);
		igt_spin_free(i915, bg);
	}

	gem_context_destroy(i915, param.ctx_id);
}

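/* Create a batch object containing only MI_BATCH_BUFFER_END at the given offset. */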
static uint32_t __batch_create(int i915, uint32_t offset)
{
	const uint32_t bbe = MI_BATCH_BUFFER_END;
	uint32_t handle;

	handle = gem_create(i915, ALIGN(offset + 4, 4096));
	gem_write(i915, handle, offset, &bbe, sizeof(bbe));

	return handle;
}

static uint32_t batch_create(int i915)
{
	return __batch_create(i915, 0);
}

static void semaphore_userlock(int i915)
{
	const struct intel_execution_engine2 *e;
	struct drm_i915_gem_exec_object2 obj = {
		.handle = batch_create(i915),
	};
	igt_spin_t *spin = NULL;
	uint32_t scratch;

	igt_require(gem_scheduler_has_semaphores(i915));

	/*
	 * Given the use of semaphores to govern parallel submission
	 * of nearly-ready work to HW, we still want to run actually
	 * ready work immediately. Without semaphores, the dependent
	 * work wouldn't be submitted so our ready work will run.
	 */

	scratch = gem_create(i915, 4096);
	__for_each_physical_engine(i915, e) {
		if (!spin) {
			spin = igt_spin_new(i915,
					    .dependency = scratch,
					    .engine = e->flags);
		} else {
			uint64_t saved = spin->execbuf.flags;

			spin->execbuf.flags &= ~I915_EXEC_RING_MASK;
			spin->execbuf.flags |= e->flags;

			gem_execbuf(i915, &spin->execbuf);

			spin->execbuf.flags = saved;
		}
	}
	igt_require(spin);
	gem_close(i915, scratch);

	/*
	 * On all dependent engines, the request may be executing (busywaiting
	 * on a HW semaphore) but it should not prevent any real work from
	 * taking precedence.
	 */
	scratch = gem_context_clone_with_engines(i915, 0);
	__for_each_physical_engine(i915, e) {
		struct drm_i915_gem_execbuffer2 execbuf = {
			.buffers_ptr = to_user_pointer(&obj),
			.buffer_count = 1,
			.flags = e->flags,
			.rsvd1 = scratch,
		};

		if (e->flags == (spin->execbuf.flags & I915_EXEC_RING_MASK))
			continue;

		gem_execbuf(i915, &execbuf);
	}
	gem_context_destroy(i915, scratch);
	gem_sync(i915, obj.handle); /* to hang unless we can preempt */
	gem_close(i915, obj.handle);

	igt_spin_free(i915, spin);
}

static void semaphore_codependency(int i915)
{
	const struct intel_execution_engine2 *e;
	struct {
		igt_spin_t *xcs, *rcs;
	} task[2];
	int i;

	/*
	 * Consider two tasks, task A runs on (xcs0, rcs0) and task B
	 * on (xcs1, rcs0). That is they must both run a dependent
	 * batch on rcs0, after first running in parallel on separate
	 * engines. To maximise throughput, we want the shorter xcs task
	 * to start on rcs first. However, if we insert semaphores we may
	 * pick wrongly and end up running the requests in the least
	 * optimal order.
	 */

	i = 0;
	__for_each_physical_engine(i915, e) {
		uint32_t ctx;

		if (!gem_class_can_store_dword(i915, e->class))
			continue;

		if (!e->flags)
			continue;

		ctx = gem_context_clone_with_engines(i915, 0);

		task[i].xcs =
			__igt_spin_new(i915,
				       .ctx = ctx,
				       .engine = e->flags,
				       .flags = IGT_SPIN_POLL_RUN);
		igt_spin_busywait_until_started(task[i].xcs);

		/* Common rcs tasks will be queued in FIFO */
		task[i].rcs =
			__igt_spin_new(i915,
				       .ctx = ctx,
				       .engine = 0,
				       .dependency = task[i].xcs->handle);

		gem_context_destroy(i915, ctx);

		if (++i == ARRAY_SIZE(task))
			break;
	}
	igt_require(i == ARRAY_SIZE(task));

	/* Since task[0] was queued first, it will be first in queue for rcs */
	igt_spin_end(task[1].xcs);
	igt_spin_end(task[1].rcs);
	gem_sync(i915, task[1].rcs->handle); /* to hang if task[0] hogs rcs */

	for (i = 0; i < ARRAY_SIZE(task); i++) {
		igt_spin_free(i915, task[i].xcs);
		igt_spin_free(i915, task[i].rcs);
	}
}

static void semaphore_resolve(int i915)
{
	const struct intel_execution_engine2 *e;
	const uint32_t SEMAPHORE_ADDR = 64 << 10;
	uint32_t semaphore, outer, inner, *sema;

	/*
	 * Userspace may submit batches that wait upon unresolved
	 * semaphores. Ideally, we want to put those blocking batches
	 * to the back of the execution queue if we have something else
	 * that is ready to run right away. This test exploits a failure
	 * to reorder batches around a blocking semaphore by submitting
	 * the release of that semaphore from a later context.
	 */

	igt_require(gem_scheduler_has_preemption(i915));
	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8); /* for MI_SEMAPHORE_WAIT */

	outer = gem_context_clone_with_engines(i915, 0);
	inner = gem_context_clone_with_engines(i915, 0);

	semaphore = gem_create(i915, 4096);
	sema = gem_mmap__wc(i915, semaphore, 0, 4096, PROT_WRITE);

	__for_each_physical_engine(i915, e) {
		struct drm_i915_gem_exec_object2 obj[3];
		struct drm_i915_gem_execbuffer2 eb;
		uint32_t handle, cancel;
		uint32_t *cs, *map;
		igt_spin_t *spin;
		int64_t poke = 1;

		if (!gem_class_can_store_dword(i915, e->class))
			continue;

		spin = __igt_spin_new(i915, .engine = e->flags);
		igt_spin_end(spin); /* we just want its address for later */
		gem_sync(i915, spin->handle);
		igt_spin_reset(spin);

		handle = gem_create(i915, 4096);
		cs = map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_WRITE);

		/* Set semaphore initially to 1 for polling and signaling */
		*cs++ = MI_STORE_DWORD_IMM;
		*cs++ = SEMAPHORE_ADDR;
		*cs++ = 0;
		*cs++ = 1;

		/* Wait until another batch writes to our semaphore */
		*cs++ = MI_SEMAPHORE_WAIT |
			MI_SEMAPHORE_POLL |
			MI_SEMAPHORE_SAD_EQ_SDD |
			(4 - 2);
		*cs++ = 0;
		*cs++ = SEMAPHORE_ADDR;
		*cs++ = 0;

		/* Then cancel the spinner */
		*cs++ = MI_STORE_DWORD_IMM;
		*cs++ = spin->obj[IGT_SPIN_BATCH].offset +
			offset_in_page(spin->condition);
		*cs++ = 0;
		*cs++ = MI_BATCH_BUFFER_END;

		*cs++ = MI_BATCH_BUFFER_END;
		munmap(map, 4096);

		memset(&eb, 0, sizeof(eb));

		/* First up is our spinning semaphore */
		memset(obj, 0, sizeof(obj));
		obj[0] = spin->obj[IGT_SPIN_BATCH];
		obj[1].handle = semaphore;
		obj[1].offset = SEMAPHORE_ADDR;
		obj[1].flags = EXEC_OBJECT_PINNED;
		obj[2].handle = handle;
		eb.buffer_count = 3;
		eb.buffers_ptr = to_user_pointer(obj);
		eb.rsvd1 = outer;
		gem_execbuf(i915, &eb);

		/* Then add the GPU hang intermediary */
		memset(obj, 0, sizeof(obj));
		obj[0].handle = handle;
		obj[0].flags = EXEC_OBJECT_WRITE; /* always after semaphore */
		obj[1] = spin->obj[IGT_SPIN_BATCH];
		eb.buffer_count = 2;
		eb.rsvd1 = 0;
		gem_execbuf(i915, &eb);

		while (READ_ONCE(*sema) == 0)
			;

		/* Now the semaphore is spinning, cancel it */
		cancel = gem_create(i915, 4096);
		cs = map = gem_mmap__cpu(i915, cancel, 0, 4096, PROT_WRITE);
		*cs++ = MI_STORE_DWORD_IMM;
		*cs++ = SEMAPHORE_ADDR;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = MI_BATCH_BUFFER_END;
		munmap(map, 4096);

		memset(obj, 0, sizeof(obj));
		obj[0].handle = semaphore;
		obj[0].offset = SEMAPHORE_ADDR;
		obj[0].flags = EXEC_OBJECT_PINNED;
		obj[1].handle = cancel;
		eb.buffer_count = 2;
		eb.rsvd1 = inner;
		gem_execbuf(i915, &eb);
		gem_wait(i915, cancel, &poke); /* match sync's WAIT_PRIORITY */
		gem_close(i915, cancel);

		gem_sync(i915, handle); /* To hang unless cancel runs! */
		gem_close(i915, handle);
		igt_spin_free(i915, spin);

		igt_assert_eq(*sema, 0);
	}

	munmap(sema, 4096);
	gem_close(i915, semaphore);

	gem_context_destroy(i915, inner);
	gem_context_destroy(i915, outer);
}

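/*
 * For each engine pair, queue a request that must wait for a spinner on
 * the other engine (a wait the driver may implement as a semaphore)
 * before cancelling a second spinner queued directly behind it; once the
 * blocking spinner is timed out, the cancellation must still execute and
 * not be skipped over.
 */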
static void semaphore_noskip(int i915)
{
	const int gen = intel_gen(intel_get_drm_devid(i915));
	const struct intel_execution_engine2 *outer, *inner;
	uint32_t ctx;

	igt_require(gen >= 6); /* MI_STORE_DWORD_IMM convenience */

	ctx = gem_context_clone_with_engines(i915, 0);

	__for_each_physical_engine(i915, outer) {
	__for_each_physical_engine(i915, inner) {
		struct drm_i915_gem_exec_object2 obj[3];
		struct drm_i915_gem_execbuffer2 eb;
		uint32_t handle, *cs, *map;
		igt_spin_t *chain, *spin;

		if (inner->flags == outer->flags ||
		    !gem_class_can_store_dword(i915, inner->class))
			continue;

		chain = __igt_spin_new(i915, .engine = outer->flags);

		spin = __igt_spin_new(i915, .engine = inner->flags);
		igt_spin_end(spin); /* we just want its address for later */
		gem_sync(i915, spin->handle);
		igt_spin_reset(spin);

		handle = gem_create(i915, 4096);
		cs = map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_WRITE);

		/* Cancel the following spinner */
		*cs++ = MI_STORE_DWORD_IMM;
		if (gen >= 8) {
			*cs++ = spin->obj[IGT_SPIN_BATCH].offset +
				offset_in_page(spin->condition);
			*cs++ = 0;
		} else {
			*cs++ = 0;
			*cs++ = spin->obj[IGT_SPIN_BATCH].offset +
				offset_in_page(spin->condition);
		}
		*cs++ = MI_BATCH_BUFFER_END;

		*cs++ = MI_BATCH_BUFFER_END;
		munmap(map, 4096);

		/* port0: implicit semaphore from engine */
		memset(obj, 0, sizeof(obj));
		obj[0] = chain->obj[IGT_SPIN_BATCH];
		obj[0].flags |= EXEC_OBJECT_WRITE;
		obj[1] = spin->obj[IGT_SPIN_BATCH];
		obj[2].handle = handle;
		memset(&eb, 0, sizeof(eb));
		eb.buffer_count = 3;
		eb.buffers_ptr = to_user_pointer(obj);
		eb.rsvd1 = ctx;
		eb.flags = inner->flags;
		gem_execbuf(i915, &eb);

		/* port1: dependency chain from port0 */
		memset(obj, 0, sizeof(obj));
		obj[0].handle = handle;
		obj[0].flags = EXEC_OBJECT_WRITE;
		obj[1] = spin->obj[IGT_SPIN_BATCH];
		memset(&eb, 0, sizeof(eb));
		eb.buffer_count = 2;
		eb.buffers_ptr = to_user_pointer(obj);
		eb.flags = inner->flags;
		gem_execbuf(i915, &eb);

		igt_spin_set_timeout(chain, NSEC_PER_SEC / 100);
		gem_sync(i915, spin->handle); /* To hang unless cancel runs! */

		gem_close(i915, handle);
		igt_spin_free(i915, spin);
		igt_spin_free(i915, chain);
	}
	}

	gem_context_destroy(i915, ctx);
}

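/*
 * Submit the same store first from a low priority context and then from
 * an equal or higher priority context while corked: with differing
 * priorities the high priority store should run first so the low priority
 * value lands last, whereas equal priorities preserve FIFO order.
 */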
static void reorder(int fd, unsigned ring, unsigned flags)
#define EQUAL 1
{
	IGT_CORK_FENCE(cork);
	uint32_t scratch;
	uint32_t result;
	uint32_t ctx[2];
	int fence;

	ctx[LO] = gem_context_clone_with_engines(fd, 0);
	gem_context_set_priority(fd, ctx[LO], MIN_PRIO);

	ctx[HI] = gem_context_clone_with_engines(fd, 0);
	gem_context_set_priority(fd, ctx[HI], flags & EQUAL ? MIN_PRIO : 0);

	scratch = gem_create(fd, 4096);
	fence = igt_cork_plug(&cork, fd);

	/* We expect the high priority context to be executed first, and
	 * so the final result will be the value from the low priority context.
	 */
	store_dword_fenced(fd, ctx[LO], ring, scratch, 0, ctx[LO], fence, 0);
	store_dword_fenced(fd, ctx[HI], ring, scratch, 0, ctx[HI], fence, 0);

	unplug_show_queue(fd, &cork, ring);
	close(fence);

	gem_context_destroy(fd, ctx[LO]);
	gem_context_destroy(fd, ctx[HI]);

	result = __sync_read_u32(fd, scratch, 0);
	gem_close(fd, scratch);

	if (flags & EQUAL) /* equal priority, result will be fifo */
		igt_assert_eq_u32(result, ctx[HI]);
	else
		igt_assert_eq_u32(result, ctx[LO]);
}

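/*
 * Check priority inheritance: a high priority request that depends on
 * work from a low priority context should promote that work ahead of
 * medium priority noise.
 */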
static void promotion(int fd, unsigned ring)
{
	IGT_CORK_FENCE(cork);
	uint32_t result, dep;
	uint32_t result_read, dep_read;
	uint32_t ctx[3];
	int fence;

	ctx[LO] = gem_context_clone_with_engines(fd, 0);
	gem_context_set_priority(fd, ctx[LO], MIN_PRIO);

	ctx[HI] = gem_context_clone_with_engines(fd, 0);
	gem_context_set_priority(fd, ctx[HI], 0);

	ctx[NOISE] = gem_context_clone_with_engines(fd, 0);
	gem_context_set_priority(fd, ctx[NOISE], MIN_PRIO/2);

	result = gem_create(fd, 4096);
	dep = gem_create(fd, 4096);

	fence = igt_cork_plug(&cork, fd);

	/* Expect that HI promotes LO, so the order will be LO, HI, NOISE.
	 *
	 * fifo would be NOISE, LO, HI.
	 * strict priority would be HI, NOISE, LO.
	 */
	store_dword_fenced(fd, ctx[NOISE], ring, result, 0, ctx[NOISE], fence, 0);
	store_dword_fenced(fd, ctx[LO], ring, result, 0, ctx[LO], fence, 0);

	/* link LO <-> HI via a dependency on another buffer */
	store_dword(fd, ctx[LO], ring, dep, 0, ctx[LO], I915_GEM_DOMAIN_INSTRUCTION);
	store_dword(fd, ctx[HI], ring, dep, 0