Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (3589), showing with 2196 additions and 907 deletions.
@@ -164,8 +164,6 @@ build:tests-debian-meson-mips:

 #################### TEST ##########################

 test:ninja-test:
-  tags:
-    - gstreamer
   retry: 2
   dependencies:
     - build:tests-fedora
@@ -173,8 +171,6 @@ test:ninja-test:
   script: meson test -C build --num-processes ${FDO_CI_CONCURRENT:-4}

 test:ninja-test-clang:
-  tags:
-    - gstreamer
   retry: 2
   dependencies:
     - build:tests-fedora-clang
@@ -184,8 +180,6 @@ test:ninja-test-clang:
   script: meson test -C build --num-processes ${FDO_CI_CONCURRENT:-4}

 test:ninja-test-minimal:
-  tags:
-    - gstreamer
   retry: 2
   image: $CI_REGISTRY/$CI_PROJECT_PATH/build-debian-minimal:commit-$CI_COMMIT_SHA
   dependencies:
@@ -195,8 +189,6 @@ test:ninja-test-minimal:

 # arm testing temporarily disabled until converted to run on native arm HW
 # test:ninja-test-arm64:
-#   tags:
-#     - gstreamer
 #   retry: 2
 #   image: $CI_REGISTRY/$CI_PROJECT_PATH/build-debian-arm64:commit-$CI_COMMIT_SHA
 #   dependencies:
@@ -212,8 +204,6 @@ test:ninja-test-minimal:
 #   when: on_failure
 #
 # test:ninja-test-armhf:
-#   tags:
-#     - gstreamer
 #   retry: 2
 #   image: $CI_REGISTRY/$CI_PROJECT_PATH/build-debian-armhf:commit-$CI_COMMIT_SHA
 #   dependencies:
@@ -228,22 +218,21 @@ test:ninja-test-minimal:
 #   - build
 #   when: on_failure

-test:ninja-test-mips:
-  tags:
-    - gstreamer
-  retry: 2
-  image: $CI_REGISTRY/$CI_PROJECT_PATH/build-debian-mips:commit-$CI_COMMIT_SHA
-  dependencies:
-    - build:tests-debian-meson-mips
-  stage: test
-  script:
-    - export PKG_CONFIG_PATH=/usr/lib/mips-linux-gnu/pkgconfig/
-    - env > build/envdump.txt
-    - meson test -C build --num-processes ${FDO_CI_CONCURRENT:-4}
-  artifacts:
-    paths:
-      - build
-    when: on_failure
+# mips testing temporarily disabled
+# test:ninja-test-mips:
+#   retry: 2
+#   image: $CI_REGISTRY/$CI_PROJECT_PATH/build-debian-mips:commit-$CI_COMMIT_SHA
+#   dependencies:
+#     - build:tests-debian-meson-mips
+#   stage: test
+#   script:
+#     - export PKG_CONFIG_PATH=/usr/lib/mips-linux-gnu/pkgconfig/
+#     - env > build/envdump.txt
+#     - meson test -C build --num-processes ${FDO_CI_CONCURRENT:-4}
+#   artifacts:
+#     paths:
+#       - build
+#     when: on_failure

 test:list-undocumented-tests:
   dependencies:
...
@@ -30,6 +30,12 @@ The Code

   provided by the igt library. The semantic patch lib/igt.cocci can help with
   more automatic conversions.

+- Tests that use kernel interfaces (uapi, sysfs, or even debugfs) that
+  become deprecated in favour of new interfaces should fall back to the
+  deprecated interfaces when the new ones are not present in the running
+  kernel. That way the same IGT version can be used to test the tip of
+  development as well as stable kernel releases.

 [igt-describe]: https://drm.pages.freedesktop.org/igt-gpu-tools/igt-gpu-tools-Core.html#igt-describe
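A minimal sketch of the fallback pattern described in the new guideline above (the helper and both sysfs attribute names are hypothetical, purely for illustration):

	#include <fcntl.h>

	/* Prefer the new interface; fall back to the deprecated one on
	 * kernels that do not expose it yet. */
	static int open_status_attr(int sysfs_dir)
	{
		int fd = openat(sysfs_dir, "new_status_attr", O_RDONLY);

		if (fd < 0)
			fd = openat(sysfs_dir, "deprecated_status_attr", O_RDONLY);

		return fd;
	}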
@@ -57,6 +63,17 @@ Sending Patches

   on its first invocation.

+- Use a relevant prefix in the subject; for example, when your change is in
+  one test file, use its name without the '.c' or '.h' suffix, like:
+
+    tests/simple_test: short description
+
+  Consider sending a cover letter with your patch, so that if you later
+  decide to change the subject it can still be linked into the same patch
+  series on patchwork.
+
+- Have a look at these guides from the Linux and Open Source community:
+  https://kernelnewbies.org/PatchPhilosophy
+  https://www.kernel.org/doc/html/latest/process/submitting-patches.html
+  https://www.kernel.org/doc/html/latest/process/submit-checklist.html
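For example, a prefixed series with a cover letter can be produced and sent like this (the revision range and output directory are illustrative; the address is the igt-dev list mentioned below):

	$ git format-patch --cover-letter -o patches/ HEAD~3
	$ git send-email --to=igt-dev@lists.freedesktop.org patches/*.patch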
 - Patches need to be reviewed on the mailing list. Exceptions only apply for
   testcases and tooling for drivers with just a single contributor (e.g. vc4).
   In this case patches must still be submitted to the mailing list first.
@@ -69,8 +86,17 @@ Sending Patches

   contact one of the maintainers (listed in the MAINTAINERS file) and cc the
   igt-dev mailing list.

+- Before sending, check your patchset with the Linux kernel script
+  'checkpatch.pl'. You can ignore some of its complaints, like 'line too
+  long' or 'typedef', but most of the time its log is accurate. Useful
+  options include:
+
+    --emacs --strict --show-types --max-line-length=100 \
+    --ignore=BIT_MACRO,SPLIT_STRING,LONG_LINE_STRING,BOOL_MEMBER
+
 - Changes to the testcases are automatically tested. Take the results into
-  account before merging.
+  account before merging. Please also reply to CI failures if you think they
+  are unrelated, and add the CI e-mail address present in the message to Cc;
+  this can help our bug-filing team. When replying, you can trim the message
+  after 'Known bugs' to keep it a reasonable size.
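Assuming a kernel tree checked out next to IGT, a full checkpatch.pl invocation with the options above could look like:

	$ ../linux/scripts/checkpatch.pl --emacs --strict --show-types \
		--max-line-length=100 \
		--ignore=BIT_MACRO,SPLIT_STRING,LONG_LINE_STRING,BOOL_MEMBER \
		0001-tests-simple_test-short-description.patch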

 Commit Rights
...
-Petri Latvala <petri.latvala@intel.com>
+Petri Latvala <adrinael@adrinael.net>
 Arkadiusz Hiler <arek@hiler.eu>
+Kamil Konieczny <kamil.konieczny@linux.intel.com>
+Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
+Bhanuprakash Modem <bhanuprakash.modem@gmail.com>
+Ashutosh Dixit <ashutosh.dixit@intel.com>
+Karthik B S <karthik.b.s@intel.com>
...

Release 2.00 (2025-03-13)
-------------------------

General changes:
- Added Karthik B S as a new maintainer.

Library changes:
- Added GFX1153 to GFX1150 amdgpu family. (Tim Huang)
- Improved kernel module un/loading. (Lucas De Marchi)
- Added ftrace logs at test boundaries. (Umesh Nerlige Ramappa)
- Improved device scanning used in device filters. (Zbigniew Kempczyński)
- Add support to check joiner mode limit. (Jeevan B)
- Updated PCI ids for DG2/MTL/BMG/PTL. (Matt Atwood)
- Added PTL opencl kernels. (Janga Rahul Kumar and Priyanka Dandamudi)
- Extended GPGPU compute square kernel for PTL. (Sai Gowtham Ch)
- Using Long Running mode for GPGPU compute with Xe. (Francois Dugast)
- Make running under Valgrind quiet. (Tvrtko Ursulin)
Runner changes:
- Added igt_runner cmdline to results. (Lucas De Marchi)
- Added printing GPU related facts. (Peter Senna Tschudin)
- Added kmemleak scans. (Peter Senna Tschudin)
- Parse results more persistently. (Kamil Konieczny)
Test changes:
- Improved Xe EUdebug tests. (Dominik Grzegorzek)
- Improved support for 3d testing on VMs for vmwgfx. (Maaz Mombasawala)
- Improved Xe debugfs testing. (Pravalika Gurram)
- Improved amdgpu PCI unplug test. (Vitaly Prosyak)
- Added support for page queues in amdgpu tests. (Jesse Zhang)
- New short hibernate subtest in Intel KMS ccs. (Juha-Pekka Heikkila)
- Renamed test i915_pipe_stress to kms_pipe_stress and added
  Xe support to it. (Swati Sharma)
- New test for devcoredump for Xe. (Zhanjun Dong)
- New DP link training validation of both UHBR and non-UHBR link
rates over SST and MST configurations. (Kunal Joshi)
- New test for Frame Buffer Compression (FBC) with dirty rectangles which
allows FBC to recompress a subsection of a frame. (Santhosh Reddy Guddati)
- New test for EU stall sampling for Xe. (Harish Chegondi)
- New test for eudebug/SR-IOV exclusion for Xe. (Christoph Manszewski)
- New test for PMU (Performance Monitoring Unit) for Xe. (Vinay Belgaumkar
and Riana Tauro)
- New test for SRIOV auto-provisioning for Xe. (Marcin Bernatowicz)
- New equal-throughput validation for VFs in SRIOV. (Marcin Bernatowicz)
- Improved test documentation. (Katarzyna Piecielska and Swati Sharma)
- Improved Xe OA test. (Umesh Nerlige Ramappa, Sai Teja Pottumuttu and
Ashutosh Dixit)
Tools and scripts changes:
- New tool for measuring display memory bandwidth utilization. (Ville Syrjälä)
- New igt_facts tool for displaying GPU related info. (Peter Senna Tschudin)
- Power tool renamed to igt_power. (Kamil Konieczny)
- New --pci-slot option in intel_reg for multi-GPU configs. (Łukasz Łaguna
  and Kamil Konieczny)
- Added kmemleak option to run-tests.sh script. (Peter Senna Tschudin)
And many other bug fixes, improvements, cleanups and new tests.

Release 1.30 (2024-12-13)
-------------------------

General changes:
- New meson options xe_eudebug and vmtb.
Library changes:
- Added PantherLake (PTL) support, unified PCI IDs into one
common header pciids.h (Ngai-Mint Kwan)
- Added BMG support for OA (Observability Architecture) for Xe driver.
(José Roberto de Souza)
- Added support for Xe3 platforms in GPGPU shader. (Andrzej Hajda)
- Added 6k resolution support for a single CRTC. (Jeevan B)
- Added support for MTL platform in GPGPU compute. (Nishit Sharma)
Runner changes:
- Set option PRUNE_KEEP_ALL as default. (Kamil Konieczny)
- Allow dynamically ignoring dmesg warn messages when generating
  results, useful when the driver is using kernel fault injection.
  (Kamil Konieczny)
Test changes:
- Added sanity checks for KMS properties. (Dmitry Baryshkov, Maxime Ripard)
- Improved GPGPU tests for i915 and Xe. (Zbigniew Kempczyński)
- New SRIOV test for Functional Level Reset (FLR) for Xe. (Marcin Bernatowicz)
- Added test that draws triangle without using 3d commands for vmwgfx.
(Maaz Mombasawala)
- Added subtest for fallback for DP connector. (Kunal Joshi)
- Added async flips suspend resume subtest. (Santhosh Reddy Guddati)
- New test for error handling of Xe at probe time. (Francois Dugast)
- Added testing SIZE_HINTS property in KMS cursor test. (Ville Syrjälä)
- Added KMS testing for ultrajoiner. (Karthik B S)
- New test for TLB invalidation in Xe. (Sai Gowtham Ch)
- New test for timeslice duration in Xe. (Sai Gowtham Ch)
- Display brightness test during DPMS on and off. (Mohammed Thasleem)
- New tests for EU debugging for Xe. (Dominik Grzegorzek, Mika Kuoppala,
Christoph Manszewski, Karolina Stolarek, Maciej Patelczyk, Pawel Sikora,
Andrzej Hajda, Dominik Karol Piątkowski, Jonathan Cavitt et al)
Tools changes:
- New power tool for power/energy measurement. (Ville Syrjälä)
- New VM Testbench (VMTB) - SR-IOV Virtual Machine testing tool.
(Adam Miszczak)
- Fixes in amd_hdmi_compliance. (Stylon Wang and Wayne Lin)
- Fixes in intel_reg. (Lucas De Marchi)
And many other bug fixes, improvements, cleanups and new tests.

Release 1.29 (2024-09-04)
-------------------------

General changes:
- Added gcc warns: dangling-pointer, int-conversion (Bhanuprakash Modem)
- More guidelines in CONTRIBUTING. (Louis Chauvet, Kamil Konieczny)
- Reorganized Tests: Moved all vendor specific tests to their own dir
(Bhanuprakash Modem)
- Fix musl/uclibc build (Bernd Kuhls, Stefano Ragni)
Benchmarks changes:
- New KMS framebuffer stress benchmark. (Arthur Grillo)
- Added basic Xe support in gem_wsim. (Marcin Bernatowicz)
Documentation changes:
- Add documentation about cross-builds. (Mauro Carvalho Chehab)
- Improve tests documentation. (Katarzyna Piecielska)
Library changes:
- Add Battlemage (BMG) support in xe_pciids.h (Andrzej Hajda)
- Add amdgpu GFX1152, GFX1150 and gfx12. (Jesse Zhang and Tim Huang)
- Added inline support for iga64 assembly in GPGPU shader. (Andrzej Hajda)
- Improved KUnit support. (Janusz Krzysztofik)
- Enable igt runs on security-enhanced distros (like Gentoo/Hardened).
  (Matt Turner)
- Use blitter as a default for all KMS tests. (Juha-Pekka Heikkila and
Bhanuprakash Modem)
- Increased MAX supported pipes to 16 (Pipe-A to Pipe-P) (Vignesh Raman)
- Added generic way to reset sysfs/debugfs attrs to default values upon
exit. (Ville Syrjälä)
Runner changes:
- Added hook scripts to igt_runner. (Gustavo Sousa)
Test changes:
- Added support for Xe in KMS tests. (Swati Sharma, Bhanuprakash Modem et al)
- Added new subtests and improvements to VRR. (Bhanuprakash Modem, Jeevan B,
Manasi Navare, Sean Paul et al)
- Added new subtests to force joiner. (Kunal Joshi)
- Added fbdev tests to Xe. (Bhanuprakash Modem)
- Added amdgpu fuzzing tests. (Vitaly Prosyak)
- Added syncobj_eventfd test. (Simon Ser)
- Added basic Single-Root IO Virtualization (SRIOV) test. (Katarzyna Dec et al)
- Added prime test for vmwgfx. (Zack Rusin)
- Improved core_getversion. (Rob Clark, Helen Koike, Kamil Konieczny)
- Improved kms_atomic on non-mutable planes. (Melissa Wen)
- Added and improved Xe tests. (Rodrigo Vivi, Matthew Auld, Zbigniew Kempczyński,
Francois Dugast, Nirmoy Das, Lucas De Marchi, Janga Rahul Kumar et al)
Tools and scripts changes:
- New Xe perf/OA tools. (Ashutosh Dixit)
- New intel_tiling_detect tool. (Zbigniew Kempczyński)
- New option in lsgpu for printing GPUs on the PCI bus, which also works
  when no GPU driver is loaded. (Zbigniew Kempczyński)
- Added sysfs profiling knob to gputop. (Adrián Larumbe)
- Support for Xe in gputop. (Lucas De Marchi)
- Improved generation of test lists at compilation time. (Mauro Carvalho Chehab)
- Improved code coverage. (Mauro Carvalho Chehab)
- Improved intel_vbt_decode and other intel tools. (Ville Syrjälä,
Lucas De Marchi, Jani Nikula, Tvrtko Ursulin, Gustavo Sousa et al)
And many other bug fixes, improvements, cleanups and new tests.

Release 1.28 (2023-09-13)
-------------------------

General changes:
- New meson options testplan, sphinx and xe_driver. (Mauro Carvalho Chehab)
Library changes:
- Add amdgpu GFX1036, GFX1037 chips. (Jesse Zhang)
- Add xe_pciids.h with Lunar Lake (LNL) support. (Lucas De Marchi)
- Use the new procps library libproc2. (Craig Small)
- Add helper for srandom seed initialization. (Łukasz Łaguna)
- Support for vmwgfx driver. (Maaz Mombasawala, Roye Eshed, Zack Rusin)
- i915_pciids.h updated to include Pontevecchio (PVC) platform.
(Niranjana Vishwanathapura)
- Add OAM formats and support for media engines in perf tests.
(Umesh Nerlige Ramappa)
- Support for Xe driver. (Matthew Brost, Mauro Carvalho Chehab, Rodrigo Vivi,
Jason Ekstrand, Francois Dugast, Philippe Lecluse, Zbigniew Kempczyński,
Maarten Lankhorst, Juha-Pekka Heikkila, Bhanuprakash Modem et al)
Runner changes:
- igt_runner can now dump GPU state on timeout. (Chris Wilson)
- igt_runner will now use proper 'abort' as result instead of pseudoresult.
(Petri Latvala)
Tools changes:
- New vendor agnostic gputop tool. (Tvrtko Ursulin)
- New tool to dump Intel GuC/HuC CSS header. (Lucas De Marchi)
- Improve tools intel_watermark, intel_vbt_decode, intel_reg. (Ville Syrjälä)
Documentation changes:
- A new way of documenting tests allows generating documentation and
  test lists during the build; see README.md and test_documentation.md.
  This is mandatory for Intel (both i915 and xe) and KMS tests.
  (Mauro Carvalho Chehab)
Test changes:
- Move intel specific tests to new directory. (Bhanuprakash Modem)
- Ported and refactored drmlib security tests in amdgpu. (Vitaly Prosyak)
- Switch DRM selftests to KUnit. (Isabella Basso, Dominik Karol Piątkowski,
Mauro Carvalho Chehab)
- Enabled MeteorLake aux ccs tests. (Juha-Pekka Heikkila)
- Exercise oversized object detection for Xe. (Priyanka Dandamudi)
- Enable validation for VDSC output formats. (Swati Sharma)
- Add support for Bigjoiner / 8K mode. (Bhanuprakash Modem)
- Use intel_cmds_info library. (Karolina Stolarek)
- Use Intel kernel gpu command definitions. (Zbigniew Kempczyński)
- Add a basic perf_pmu test. (Riana Tauro)
- Add test for V3D's Wait BO IOCTL. (Maíra Canal)
- Add i915_power test for power measurement. (Ashutosh Dixit)
- Remove sysfs_clients. (Lucas De Marchi)
And many other bug fixes, improvements, cleanups and new tests.

Release 1.27.1 (2023-01-18)
---------------------------

- Removed gcc option -fcommon from meson.build and fix broken
build on Linux RedHat platform. (Zbigniew Kempczyński)
- Updated CONTRIBUTING guidelines for interface deprecation. (Petri Latvala)
- Fixed one additional test bug (Zbigniew Kempczyński)

Release 1.27 (2023-01-12)
-------------------------

- Support for Intel discrete graphics and other new platforms (Andrzej
Turko, Matt Roper, Clint Taylor, Tejas Upadhyay, et al)
- Support for MSM driver. (Mark Yacoub, Rob Clark)
- Support for SRIOV device selection. (Łukasz Łaguna)
- Tiled display emulation support with chamelium. (Kunal Joshi)
- Support for Chamelium v3. (Mark Yacoub)
- Initial FreeBSD support. (Jake Freeland)
- Structured communication from tests to igt_runner. (Petri Latvala)
- Removed last remaining uses of libdrm_intel in tests and
tools. (Zbigniew Kempczyński)
- Automatic kernel code coverage collection during testing. (Mauro
Carvalho Chehab)
And many other bug fixes, improvements, cleanups and new tests.

Release 1.26 (2021-04-23)
-------------------------
...
@@ -49,6 +49,11 @@ Documentation is built using

 $ ninja -C build igt-gpu-tools-doc

+Please note that some drivers and test sets may require all tests to be
+properly documented via testplan. By default, the build will fail if one
+forgets to document tests or to update the documentation. This is currently
+enabled for the Xe and i915 drivers and for KMS tests.
+See docs/test_documentation.md for more details.
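As a sketch of enabling this at configure time (the testplan meson option is listed in the release notes above; the enabled/disabled value syntax of standard meson feature options is assumed here):

	$ meson setup build -Dtestplan=enabled
	$ ninja -C build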

 Running Tests
 -------------
@@ -164,7 +169,7 @@ was used to generate them.

 Imported i915_drm.h uapi headers from airlied's drm-next branch.

 In some cases updating a single uapi file is needed as our history
-shows. So in this case, it should be done by:
+shows. In this case, it should be done by:

 # From the kernel dir with a drm/drm-next commit checked out:
 $ make INSTALL_HDR_PATH=<dest-dir> headers_install
...
@@ -216,7 +216,7 @@ brw_program_add_instruction(struct brw_program *p,
 {
 	struct brw_program_instruction *list_entry;

-	list_entry = calloc(sizeof(struct brw_program_instruction), 1);
+	list_entry = calloc(1, sizeof(struct brw_program_instruction));
 	list_entry->type = GEN4ASM_INSTRUCTION_GEN;
 	list_entry->insn.gen = instruction->insn.gen;
 	brw_program_append_entry(p, list_entry);
@@ -228,7 +228,7 @@ brw_program_add_relocatable(struct brw_program *p,
 {
 	struct brw_program_instruction *list_entry;

-	list_entry = calloc(sizeof(struct brw_program_instruction), 1);
+	list_entry = calloc(1, sizeof(struct brw_program_instruction));
 	list_entry->type = GEN4ASM_INSTRUCTION_GEN_RELOCATABLE;
 	list_entry->insn.gen = instruction->insn.gen;
 	list_entry->reloc = instruction->reloc;
@@ -239,7 +239,7 @@ static void brw_program_add_label(struct brw_program *p, const char *label)
 {
 	struct brw_program_instruction *list_entry;

-	list_entry = calloc(sizeof(struct brw_program_instruction), 1);
+	list_entry = calloc(1, sizeof(struct brw_program_instruction));
 	list_entry->type = GEN4ASM_INSTRUCTION_LABEL;
 	list_entry->insn.label.name = strdup(label);
 	brw_program_append_entry(p, list_entry);
...
@@ -397,7 +397,7 @@ int main(int argc, char **argv)
 	if (entry1 && is_label(entry1) && is_entry_point(entry1)) {
 		// insert NOP instructions until (inst_offset+1) % 4 == 0
 		while (((inst_offset+1) % 4) != 0) {
-			tmp_entry = calloc(sizeof(*tmp_entry), 1);
+			tmp_entry = calloc(1, sizeof(*tmp_entry));
 			tmp_entry->insn.gen.header.opcode = BRW_OPCODE_NOP;
 			entry->next = tmp_entry;
 			tmp_entry->next = entry1;
...
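The calloc() hunks above swap the arguments into the declared order, calloc(nmemb, size): element count first, element size second. The byte count allocated is the same either way, but recent compilers warn when a sizeof expression appears as the first argument (e.g. GCC's -Wcalloc-transposed-args). A minimal self-contained illustration:

	#include <stdlib.h>

	struct item { int a, b; };

	int main(void)
	{
		/* declared order: number of elements, then size of each element */
		struct item *arr = calloc(16, sizeof(*arr));

		free(arr);
		return 0;
	}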
@@ -33,8 +33,8 @@
 #include <fcntl.h>
 #include <inttypes.h>
 #include <errno.h>
+#include <poll.h>
 #include <sys/stat.h>
-#include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
 #include <time.h>
...
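This include hunk (and the matching one in the benchmark below) moves from <sys/poll.h> to <poll.h>: POSIX specifies <poll.h>, and some C libraries such as musl warn about the legacy <sys/poll.h> location.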
@@ -80,9 +80,9 @@ static int run(unsigned batch_size,
 	struct drm_i915_gem_relocation_entry *mem_reloc = NULL;
 	int *target;

-	gem_exec = calloc(sizeof(*gem_exec), num_objects + 1);
-	mem_reloc = calloc(sizeof(*mem_reloc), num_relocs);
-	target = calloc(sizeof(*target), num_relocs);
+	gem_exec = calloc(num_objects + 1, sizeof(*gem_exec));
+	mem_reloc = calloc(num_relocs, sizeof(*mem_reloc));
+	target = calloc(num_relocs, sizeof(*target));

 	fd = drm_open_driver(DRIVER_INTEL);
...
@@ -271,7 +271,11 @@ static int is_i915(int fd)
 }

 int
+#ifdef __GLIBC__
 ioctl(int fd, unsigned long request, ...)
+#else
+ioctl(int fd, int request, ...)
+#endif
 {
 	struct trace *t, **p;
 	va_list args;
...
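The #ifdef keeps this interposing wrapper in sync with the C library's own prototype: glibc declares ioctl(int, unsigned long, ...) while musl declares ioctl(int, int, ...), and a mismatched signature would fail to override the libc symbol cleanly on non-glibc systems.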
@@ -36,10 +36,10 @@
 #include <inttypes.h>
 #include <limits.h>
 #include <errno.h>
+#include <poll.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
-#include <sys/poll.h>
 #include <sys/resource.h>

 #include "drm.h"
@@ -193,7 +193,7 @@ static void setup_workload(struct producer *p, int gen,
 	struct drm_i915_gem_relocation_entry *reloc;
 	int offset;

-	reloc = calloc(sizeof(*reloc), 2*factor);
+	reloc = calloc(2*factor, sizeof(*reloc));

 	p->workload_dispatch.exec[0].handle = scratch;
 	p->workload_dispatch.exec[1].relocation_count = 2*factor;
@@ -457,7 +457,7 @@ static int run(int seconds,
 		return IGT_EXIT_SKIP; /* Needs BCS timestamp */

 	intel_register_access_init(&mmio_data,
-				   igt_device_get_pci_device(fd), false, fd);
+				   igt_device_get_pci_device(fd), false);

 	if (gen == 6)
 		timestamp_reg = REG(RCS_TIMESTAMP);
...
+// SPDX-License-Identifier: MIT
 /*
  * Copyright © 2017 Intel Corporation
  *
@@ -42,6 +43,7 @@
 #include <limits.h>
 #include <pthread.h>
 #include <math.h>
+#include <ctype.h>

 #include "drm.h"
 #include "drmtest.h"
@@ -60,23 +62,56 @@
#include "i915/gem_engine_topology.h" #include "i915/gem_engine_topology.h"
#include "i915/gem_mman.h" #include "i915/gem_mman.h"
enum intel_engine_id { #include "igt_syncobj.h"
DEFAULT, #include "intel_allocator.h"
#include "xe_drm.h"
#include "xe/xe_ioctl.h"
#include "xe/xe_spin.h"
enum intel_engine_class {
RCS, RCS,
BCS, BCS,
VCS, VCS,
VCS1,
VCS2,
VECS, VECS,
NUM_ENGINES CCS,
NUM_ENGINE_CLASSES,
}; };
_Static_assert(RCS == DRM_XE_ENGINE_CLASS_RENDER, "mismatch");
_Static_assert(BCS == DRM_XE_ENGINE_CLASS_COPY, "mismatch");
_Static_assert(VCS == DRM_XE_ENGINE_CLASS_VIDEO_DECODE, "mismatch");
_Static_assert(VECS == DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE, "mismatch");
_Static_assert(CCS == DRM_XE_ENGINE_CLASS_COMPUTE, "mismatch");
_Static_assert((int)RCS == (int)I915_ENGINE_CLASS_RENDER, "mismatch");
_Static_assert((int)BCS == (int)I915_ENGINE_CLASS_COPY, "mismatch");
_Static_assert((int)VCS == (int)I915_ENGINE_CLASS_VIDEO, "mismatch");
_Static_assert((int)VECS == (int)I915_ENGINE_CLASS_VIDEO_ENHANCE, "mismatch");
_Static_assert((int)CCS == (int)I915_ENGINE_CLASS_COMPUTE, "mismatch");
static const char *intel_engine_class_string(uint16_t engine_class)
{
switch (engine_class) {
case RCS:
return "RCS";
case BCS:
return "BCS";
case VCS:
return "VCS";
case VECS:
return "VECS";
case CCS:
return "CCS";
default:
igt_assert(0);
}
}
struct duration { struct duration {
unsigned int min, max; unsigned int min, max;
bool unbound;
}; };
enum w_type enum w_type {
{
BATCH, BATCH,
SYNC, SYNC,
DELAY, DELAY,
@@ -101,13 +136,16 @@ struct dep_entry {
 	int working_set; /* -1 = step dependecy, >= 0 working set id */
 };

-struct deps
-{
+struct deps {
 	int nr;
 	bool submit_fence;
 	struct dep_entry *list;
 };

+#define for_each_dep(__dep, __deps) \
+	for (int __i = 0; __i < __deps.nr && \
+	     (__dep = &__deps.list[__i]); ++__i)
+
 struct w_arg {
 	char *filename;
 	char *desc;
@@ -115,9 +153,19 @@ struct w_arg {
 	bool sseu;
 };

+#define INVALID_ID ((uint16_t)-2)
+#define DEFAULT_ID ((uint16_t)-1)
+
+typedef struct drm_xe_engine_class_instance intel_engine_t;
+
+struct intel_engines {
+	unsigned int nr_engines;
+	intel_engine_t *engines;
+};
+
 struct bond {
-	uint64_t mask;
-	enum intel_engine_id master;
+	struct intel_engines mask;
+	intel_engine_t master;
 };

 struct work_buffer_size {
@@ -136,16 +184,15 @@ struct working_set {

 struct workload;

-struct w_step
-{
+struct w_step {
 	struct workload *wrk;

 	/* Workload step metadata */
 	enum w_type type;
 	unsigned int context;
-	unsigned int engine;
+	intel_engine_t engine;
+	unsigned int engine_idx;
 	struct duration duration;
-	bool unbound_duration;
 	struct deps data_deps;
 	struct deps fence_deps;
 	int emit_fence;
@@ -155,17 +202,10 @@ struct w_step
 	int period;
 	int target;
 	int throttle;
-	int fence_signal;
 	int priority;
-	struct {
-		unsigned int engine_map_count;
-		enum intel_engine_id *engine_map;
-	};
+	struct intel_engines engine_map;
 	bool load_balance;
-	struct {
-		uint64_t bond_mask;
-		enum intel_engine_id bond_master;
-	};
+	struct bond bond;
 	int sseu;
 	struct working_set working_set;
 };
@@ -173,29 +213,60 @@ struct w_step

 	/* Implementation details */
 	unsigned int idx;
 	struct igt_list_head rq_link;
-	unsigned int request;
+	unsigned int request_idx;

 	unsigned int preempt_us;

-	struct drm_i915_gem_execbuffer2 eb;
-	struct drm_i915_gem_exec_object2 *obj;
-	struct drm_i915_gem_relocation_entry reloc[3];
+	union {
+		struct {
+			struct drm_i915_gem_execbuffer2 eb;
+			struct drm_i915_gem_exec_object2 *obj;
+			struct drm_i915_gem_relocation_entry reloc[3];
+			uint32_t *bb_duration;
+		} i915;
+		struct {
+			struct drm_xe_exec exec;
+			struct {
+				struct xe_spin spin;
+				uint64_t vm_sync;
+				uint64_t exec_sync;
+			} *data;
+			struct drm_xe_sync *syncs;
+		} xe;
+	};
+	unsigned long bb_size;
 	uint32_t bb_handle;
-	uint32_t *bb_duration;
 };

+struct vm {
+	uint32_t id;
+	bool compute_mode;
+	uint64_t ahnd;
+};
+
+struct xe_exec_queue {
+	uint32_t id;
+	unsigned int nr_hwes;
+	struct drm_xe_engine_class_instance *hwe_list;
+};
 struct ctx {
 	uint32_t id;
 	int priority;
-	unsigned int engine_map_count;
-	enum intel_engine_id *engine_map;
+	struct intel_engines engine_map;
 	unsigned int bond_count;
 	struct bond *bonds;
 	bool load_balance;
 	uint64_t sseu;
+	/* reference to vm */
+	struct vm *vm;
+	struct {
+		/* exec queues */
+		unsigned int nr_queues;
+		struct xe_exec_queue *queue_list;
+	} xe;
 };

-struct workload
-{
+struct workload {
 	unsigned int id;
 	unsigned int nr_steps;
@@ -213,42 +284,54 @@ struct workload
 	uint32_t bb_prng;
 	uint32_t bo_prng;

-	struct timespec repeat_start;
-
 	unsigned int nr_ctxs;
 	struct ctx *ctx_list;

+	unsigned int nr_vms;
+	struct vm *vm_list;
+
 	struct working_set **working_sets; /* array indexed by set id */
 	int max_working_set_id;

 	int sync_timeline;
 	uint32_t sync_seqno;

-	struct igt_list_head requests[NUM_ENGINES];
-	unsigned int nrequest[NUM_ENGINES];
+	struct igt_list_head *requests;
+	unsigned int *nrequest;
 };

+#define __for_each_ctx(__ctx, __wrk, __ctx_idx) \
+	for (typeof((__wrk)->nr_ctxs) __ctx_idx = 0; __ctx_idx < (__wrk)->nr_ctxs && \
+	     (__ctx = &(__wrk)->ctx_list[__ctx_idx]); ++__ctx_idx)
+
+#define for_each_ctx(__ctx, __wrk) \
+	__for_each_ctx(__ctx, __wrk, igt_unique(__ctx_idx))
+
+/* igt_unique(idx) is same on both lines as macro when expanded comes out on one line */
+#define for_each_w_step(__w_step, __wrk) \
+	for (typeof(__wrk->nr_steps) igt_unique(idx) = ({__w_step = __wrk->steps; 0; }); \
+	     igt_unique(idx) < __wrk->nr_steps; igt_unique(idx)++, __w_step++)
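A usage sketch for the iterators defined above (assuming a struct workload *wrk; the loop bodies are illustrative only):

	struct ctx *ctx;
	struct w_step *w;
	unsigned int batches = 0;

	for_each_ctx(ctx, wrk)		/* visit each context of the workload */
		printf("ctx id %u\n", ctx->id);

	for_each_w_step(w, wrk)		/* visit each parsed step in order */
		if (w->type == BATCH)
			batches++;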
 static unsigned int master_prng;
 static int verbose = 1;

 static int fd;
+static bool is_xe;

 static struct drm_i915_gem_context_param_sseu device_sseu = {
 	.slice_mask = -1 /* Force read on first use. */
 };

-#define SYNCEDCLIENTS	(1<<1)
-#define DEPSYNC		(1<<2)
-#define SSEU		(1<<3)
+#define FLAG_SYNCEDCLIENTS	(1<<1)
+#define FLAG_DEPSYNC		(1<<2)
+#define FLAG_SSEU		(1<<3)

-static const char *ring_str_map[NUM_ENGINES] = {
-	[DEFAULT] = "DEFAULT",
-	[RCS] = "RCS",
-	[BCS] = "BCS",
-	[VCS] = "VCS",
-	[VCS1] = "VCS1",
-	[VCS2] = "VCS2",
-	[VECS] = "VECS",
-};
+static void w_step_sync(struct w_step *w)
+{
+	if (is_xe)
+		igt_assert(syncobj_wait(fd, &w->xe.syncs[0].handle, 1, INT64_MAX, 0, NULL));
+	else
+		gem_sync(fd, w->i915.obj[0].handle);
+}

 static int read_timestamp_frequency(int i915)
 {
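Note that w_step_sync() above gives both backends a single wait primitive: on Xe it waits on the step's first DRM syncobj, on i915 it calls gem_sync() on the step's batch object.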
@@ -335,6 +418,19 @@ parse_working_set_deps(struct workload *wrk,
 	return 0;
 }

+static void __attribute__((format(printf, 1, 2)))
+wsim_err(const char *fmt, ...)
+{
+	va_list ap;
+
+	if (!verbose)
+		return;
+
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+}
+
 static int
 parse_dependency(unsigned int nr_steps, struct w_step *w, char *str)
 {
@@ -355,11 +451,18 @@ parse_dependency(unsigned int nr_steps, struct w_step *w, char *str)
 		break;
 	case 's':
+		/* no submit fence in xe */
+		if (is_xe) {
+			wsim_err("Submit fences are not supported with xe\n");
+			return -1;
+		}
 		submit_fence = true;
 		/* Fall-through. */
 	case 'f':
-		/* Multiple fences not yet supported. */
-		igt_assert_eq(w->fence_deps.nr, 0);
+		/* xe supports multiple fences */
+		if (!is_xe)
+			/* Multiple fences not yet supported. */
+			igt_assert_eq(w->fence_deps.nr, 0);

 		entry.target = atoi(++str);
 		if (entry.target > 0 || ((int)nr_steps + entry.target) < 0)
@@ -429,19 +532,6 @@ out:
 	return ret;
 }
-static void __attribute__((format(printf, 1, 2)))
-wsim_err(const char *fmt, ...)
-{
-	va_list ap;
-
-	if (!verbose)
-		return;
-
-	va_start(ap, fmt);
-	vfprintf(stderr, fmt, ap);
-	va_end(ap);
-}
-
 #define check_arg(cond, fmt, ...) \
 { \
 	if (cond) { \
@@ -450,268 +540,263 @@ wsim_err(const char *fmt, ...)
 	} \
 }
-static int str_to_engine(const char *str)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
-		if (!strcasecmp(str, ring_str_map[i]))
-			return i;
-	}
-
-	return -1;
-}
+/* engine_class[<engine_instance>-<gt_id>] */
+static intel_engine_t str_to_engine(const char *str)
+{
+	intel_engine_t e = {INVALID_ID, DEFAULT_ID, DEFAULT_ID};
+	size_t pos;
+
+	if (!strcasecmp("DEFAULT", str)) {
+		e.engine_class = DEFAULT_ID;
+		return e;
+	} else if (!strncasecmp("RCS", str, 3)) {
+		e.engine_class = RCS;
+		pos = 3;
+	} else if (!strncasecmp("BCS", str, 3)) {
+		e.engine_class = BCS;
+		pos = 3;
+	} else if (!strncasecmp("VCS", str, 3)) {
+		e.engine_class = VCS;
+		pos = 3;
+	} else if (!strncasecmp("VECS", str, 4)) {
+		e.engine_class = VECS;
+		pos = 4;
+	} else if (!strncasecmp("CCS", str, 3)) {
+		e.engine_class = CCS;
+		pos = 3;
+	} else {
+		return (intel_engine_t){INVALID_ID};
+	}
+
+	if (str[pos]) {
+		char *s = strchr(&str[pos], '-');
+		char *endptr = NULL;
+		long id;
+
+		if (!s || (s && *s != str[pos])) {
+			id = strtol(&str[pos], &endptr, 10);
+			if (endptr == &str[pos] || id < 1 || id >= INVALID_ID)
+				return (intel_engine_t){INVALID_ID};
+			e.engine_instance = id - 1;
+		}
+
+		if (s && *(++s)) {
+			id = strtol(s, &endptr, 10);
+			if (endptr == s || id < 0 || id >= INVALID_ID)
+				return (intel_engine_t){INVALID_ID};
+			e.gt_id = id;
+		}
+
+		if (endptr && endptr != (str + strlen(str)))
+			return (intel_engine_t){INVALID_ID};
+	}
+
+	return e;
+}
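Judging by the parser above, engine strings now take the form CLASS[INSTANCE][-GT]: for example "VCS" matches any video engine, "VCS2" the second instance (stored as instance 1, since instances are 1-based in the string), and "VCS1-1" the first video engine on GT 1; "DEFAULT" leaves class, instance and GT unresolved.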
-static bool __engines_queried;
-static unsigned int __num_engines;
-static struct i915_engine_class_instance *__engines;
-
-static int
-__i915_query(int i915, struct drm_i915_query *q)
-{
-	if (igt_ioctl(i915, DRM_IOCTL_I915_QUERY, q))
-		return -errno;
-	return 0;
-}
-
-static int
-__i915_query_items(int i915, struct drm_i915_query_item *items, uint32_t n_items)
-{
-	struct drm_i915_query q = {
-		.num_items = n_items,
-		.items_ptr = to_user_pointer(items),
-	};
-
-	return __i915_query(i915, &q);
-}
-
-static void
-i915_query_items(int i915, struct drm_i915_query_item *items, uint32_t n_items)
-{
-	igt_assert_eq(__i915_query_items(i915, items, n_items), 0);
-}
-
-static bool has_engine_query(int i915)
-{
-	struct drm_i915_query_item item = {
-		.query_id = DRM_I915_QUERY_ENGINE_INFO,
-	};
-
-	return __i915_query_items(i915, &item, 1) == 0 && item.length > 0;
-}
+static struct i915_engine_class_instance
+engine_to_i915_engine_class(const intel_engine_t *engine)
+{
+	return (struct i915_engine_class_instance){ engine->engine_class,
+						    engine->engine_instance };
+}
+
+static unsigned int
+engine_to_i915_legacy_ring(const intel_engine_t *engine)
+{
+	switch (engine->engine_class) {
+	case DEFAULT_ID:
+		return I915_EXEC_DEFAULT;
+	case RCS:
+		return I915_EXEC_RENDER;
+	case BCS:
+		return I915_EXEC_BLT;
+	case VCS:
+		if (engine->engine_instance == DEFAULT_ID)
+			return I915_EXEC_BSD;
+		else if (engine->engine_instance == 0)
+			return I915_EXEC_BSD | I915_EXEC_BSD_RING1;
+		else if (engine->engine_instance == 1)
+			return I915_EXEC_BSD | I915_EXEC_BSD_RING2;
+		break;
+	case VECS:
+		return I915_EXEC_VEBOX;
+	};
+
+	igt_assert(0);
+}
+
+static bool are_equal_engines(const intel_engine_t *e1,
+			      const intel_engine_t *e2)
+{
+	return e1->engine_class == e2->engine_class &&
+	       e1->engine_instance == e2->engine_instance &&
+	       e1->gt_id == e2->gt_id;
+}
+
+static bool find_engine_in_map(const intel_engine_t *engine,
+			       struct intel_engines *engines, unsigned int *idx)
+{
+	igt_assert(idx);
+
+	for (unsigned int i = 0; i < engines->nr_engines; ++i)
+		if (are_equal_engines(engine, &engines->engines[i])) {
+			*idx = i;
+			return true;
+		}
+
+	return false;
+}
-static void query_engines(void)
-{
-	struct i915_engine_class_instance *engines;
-	unsigned int num;
-
-	if (__engines_queried)
-		return;
-
-	__engines_queried = true;
-
-	if (!has_engine_query(fd)) {
-		unsigned int num_bsd = gem_has_bsd(fd) + gem_has_bsd2(fd);
-		unsigned int i = 0;
-
-		igt_assert(num_bsd);
-
-		num = 1 + num_bsd;
-
-		if (gem_has_blt(fd))
-			num++;
-
-		if (gem_has_vebox(fd))
-			num++;
-
-		engines = calloc(num,
-				 sizeof(struct i915_engine_class_instance));
-		igt_assert(engines);
-
-		engines[i].engine_class = I915_ENGINE_CLASS_RENDER;
-		engines[i].engine_instance = 0;
-		i++;
-
-		if (gem_has_blt(fd)) {
-			engines[i].engine_class = I915_ENGINE_CLASS_COPY;
-			engines[i].engine_instance = 0;
-			i++;
-		}
-
-		if (gem_has_bsd(fd)) {
-			engines[i].engine_class = I915_ENGINE_CLASS_VIDEO;
-			engines[i].engine_instance = 0;
-			i++;
-		}
-
-		if (gem_has_bsd2(fd)) {
-			engines[i].engine_class = I915_ENGINE_CLASS_VIDEO;
-			engines[i].engine_instance = 1;
-			i++;
-		}
-
-		if (gem_has_vebox(fd)) {
-			engines[i].engine_class =
-				I915_ENGINE_CLASS_VIDEO_ENHANCE;
-			engines[i].engine_instance = 0;
-			i++;
-		}
-	} else {
-		struct drm_i915_query_engine_info *engine_info;
-		struct drm_i915_query_item item = {
-			.query_id = DRM_I915_QUERY_ENGINE_INFO,
-		};
-		const unsigned int sz = 4096;
-		unsigned int i;
-
-		engine_info = malloc(sz);
-		igt_assert(engine_info);
-		memset(engine_info, 0, sz);
-
-		item.data_ptr = to_user_pointer(engine_info);
-		item.length = sz;
-		i915_query_items(fd, &item, 1);
-		igt_assert(item.length > 0);
-		igt_assert(item.length <= sz);
-		num = engine_info->num_engines;
-
-		engines = calloc(num,
-				 sizeof(struct i915_engine_class_instance));
-		igt_assert(engines);
-
-		for (i = 0; i < num; i++) {
-			struct drm_i915_engine_info *engine =
-				(struct drm_i915_engine_info *)&engine_info->engines[i];
-
-			engines[i] = engine->engine;
-		}
-	}
-
-	__engines = engines;
-	__num_engines = num;
-}
+static struct intel_engines *query_engines(void)
+{
+	static struct intel_engines engines = {};

+	if (engines.nr_engines)
+		return &engines;
+
+	if (is_xe) {
+		struct drm_xe_engine_class_instance *hwe;
+
+		engines.engines = calloc(xe_number_engines(fd), sizeof(intel_engine_t));
+		igt_assert(engines.engines);
+		engines.nr_engines = 0;
+		xe_for_each_engine(fd, hwe)
+			engines.engines[engines.nr_engines++] = *hwe;
+		igt_assert(engines.nr_engines);
+	} else {
+		struct intel_engine_data ed = {};
+
+		ed = intel_engine_list_of_physical(fd);
+		igt_assert(ed.nengines);
+		engines.nr_engines = ed.nengines;
+		engines.engines = calloc(engines.nr_engines, sizeof(intel_engine_t));
+		igt_assert(engines.engines);
+		for (int i = 0; i < ed.nengines; ++i) {
+			engines.engines[i].engine_class = ed.engines[i].class;
+			engines.engines[i].engine_instance = ed.engines[i].instance;
+			engines.engines[i].gt_id = DEFAULT_ID;
+		}
+	}
+
+	return &engines;
+}
-static unsigned int num_engines_in_class(enum intel_engine_id class)
-{
-	unsigned int i, count = 0;
-
-	igt_assert(class == VCS);
-
-	query_engines();
-
-	for (i = 0; i < __num_engines; i++) {
-		if (__engines[i].engine_class == I915_ENGINE_CLASS_VIDEO)
-			count++;
-	}
-
-	igt_assert(count);
-	return count;
-}
-
-static void
-fill_engines_id_class(enum intel_engine_id *list,
-		      enum intel_engine_id class)
-{
-	enum intel_engine_id engine = VCS1;
-	unsigned int i, j = 0;
-
-	igt_assert(class == VCS);
-	igt_assert(num_engines_in_class(VCS) <= 2);
-
-	query_engines();
-
-	for (i = 0; i < __num_engines; i++) {
-		if (__engines[i].engine_class != I915_ENGINE_CLASS_VIDEO)
-			continue;
-
-		list[j++] = engine++;
-	}
-}
+static bool is_valid_engine(const intel_engine_t *engine)
+{
+	return engine->engine_class != INVALID_ID;
+}
+
+static bool is_default_engine(const intel_engine_t *engine)
+{
+	return engine->engine_class == DEFAULT_ID &&
+	       engine->engine_instance == DEFAULT_ID &&
+	       engine->gt_id == DEFAULT_ID;
+}
+
+static bool engine_matches_filter(const intel_engine_t *engine, const intel_engine_t *filter)
+{
+	return (filter->engine_class == DEFAULT_ID ||
+		filter->engine_class == engine->engine_class) &&
+	       (filter->engine_instance == DEFAULT_ID ||
+		filter->engine_instance == engine->engine_instance) &&
+	       (filter->gt_id == DEFAULT_ID ||
+		filter->gt_id == engine->gt_id);
+}
+
+#define for_each_matching_engine(__engine, __filter, __engines) \
+	for (unsigned int __i = 0; __i < (__engines)->nr_engines && \
+	     ((__engine) = &(__engines)->engines[__i]); __i++) \
+		for_if(engine_matches_filter((__engine), (__filter)))
+
+static unsigned int
+append_matching_engines(const intel_engine_t *filter, struct intel_engines *engines)
+{
+	unsigned int prev_nr_engines;
+	struct intel_engines *all = query_engines();
+	intel_engine_t *engine;
+
+	igt_assert(engines);
+	prev_nr_engines = engines->nr_engines;
+
+	for_each_matching_engine(engine, filter, all) {
+		engines->nr_engines++;
+		engines->engines = realloc(engines->engines,
+					   engines->nr_engines * sizeof(intel_engine_t));
+		igt_assert(engines->engines);
+		engines->engines[engines->nr_engines - 1] = *engine;
+	}
+
+	return engines->nr_engines - prev_nr_engines;
+}
-static unsigned int
-find_physical_instance(enum intel_engine_id class, unsigned int logical)
-{
-	unsigned int i, j = 0;
-
-	igt_assert(class == VCS);
-
-	for (i = 0; i < __num_engines; i++) {
-		if (__engines[i].engine_class != I915_ENGINE_CLASS_VIDEO)
-			continue;
-
-		/* Map logical to physical instances. */
-		if (logical == j++)
-			return __engines[i].engine_instance;
-	}
-
-	igt_assert(0);
-	return 0;
-}
-
-static struct i915_engine_class_instance
-get_engine(enum intel_engine_id engine)
-{
-	struct i915_engine_class_instance ci;
-
-	query_engines();
-
-	switch (engine) {
-	case RCS:
-		ci.engine_class = I915_ENGINE_CLASS_RENDER;
-		ci.engine_instance = 0;
-		break;
-	case BCS:
-		ci.engine_class = I915_ENGINE_CLASS_COPY;
-		ci.engine_instance = 0;
-		break;
-	case VCS1:
-	case VCS2:
-		ci.engine_class = I915_ENGINE_CLASS_VIDEO;
-		ci.engine_instance = find_physical_instance(VCS, engine - VCS1);
-		break;
-	case VECS:
-		ci.engine_class = I915_ENGINE_CLASS_VIDEO_ENHANCE;
-		ci.engine_instance = 0;
-		break;
-	default:
-		igt_assert(0);
-	};
-
-	return ci;
-}
+static intel_engine_t get_default_engine(void)
+{
+	struct intel_engines *all_engines = query_engines();
+	const intel_engine_t filters[] = {
+		{RCS, DEFAULT_ID, DEFAULT_ID},
+		{CCS, DEFAULT_ID, DEFAULT_ID},
+		{DEFAULT_ID, DEFAULT_ID, DEFAULT_ID},
+		{INVALID_ID}
+	}, *filter, *default_engine;
+
+	for (filter = filters; is_valid_engine(filter); filter++)
+		for_each_matching_engine(default_engine, filter, all_engines)
+			return *default_engine;
+
+	igt_assert(0);
+}
+
+static intel_engine_t resolve_to_physical_engine_(const intel_engine_t *engine)
+{
+	struct intel_engines *all_engines = query_engines();
+	intel_engine_t *resolved;
+
+	igt_assert(engine);
+
+	if (is_default_engine(engine))
+		return get_default_engine();
+
+	for_each_matching_engine(resolved, engine, all_engines)
+		return *resolved;
+
+	return (intel_engine_t){INVALID_ID};
+}
+
+static void resolve_to_physical_engine(intel_engine_t *engine)
+{
+	*engine = resolve_to_physical_engine_(engine);
+	igt_assert(is_valid_engine(engine));
+}
 static int parse_engine_map(struct w_step *step, const char *_str)
 {
 	char *token, *tctx = NULL, *tstart = (char *)_str;
+	intel_engine_t engine;

 	while ((token = strtok_r(tstart, "|", &tctx))) {
-		enum intel_engine_id engine;
-		unsigned int add;
-
 		tstart = NULL;

-		if (!strcmp(token, "DEFAULT"))
-			return -1;
-
 		engine = str_to_engine(token);
-		if ((int)engine < 0)
+		if (!is_valid_engine(&engine) || is_default_engine(&engine))
 			return -1;
-
-		if (engine != VCS && engine != VCS1 && engine != VCS2 &&
-		    engine != RCS)
-			return -1; /* TODO */
-
-		add = engine == VCS ? num_engines_in_class(VCS) : 1;
-		step->engine_map_count += add;
-		step->engine_map = realloc(step->engine_map,
-					   step->engine_map_count *
-					   sizeof(step->engine_map[0]));
-
-		if (engine != VCS)
-			step->engine_map[step->engine_map_count - add] = engine;
-		else
-			fill_engines_id_class(&step->engine_map[step->engine_map_count - add], VCS);
+		if (!append_matching_engines(&engine, &step->engine_map))
+			return -1;
 	}

 	return 0;
+}
+
+static int parse_bond_engines(struct w_step *step, const char *_str)
+{
+	char *token, *tctx = NULL, *tstart = (char *)_str;
+	intel_engine_t engine;
+
+	while ((token = strtok_r(tstart, "|", &tctx))) {
+		tstart = NULL;
+
+		engine = str_to_engine(token);
+		if (append_matching_engines(&engine, &step->bond.mask) != 1)
+			return -1;
+	}
+
+	return 0;
@@ -807,6 +892,7 @@ static int add_buffers(struct working_set *set, char *str)
 	for (i = 0; i < add; i++) {
 		struct work_buffer_size *sz = &sizes[set->nr + i];
+
 		sz->min = min_sz;
 		sz->max = max_sz;
 		sz->size = 0;
@@ -832,42 +918,63 @@ static int parse_working_set(struct working_set *set, char *str)
 	return 0;
 }

-static uint64_t engine_list_mask(const char *_str)
-{
-	uint64_t mask = 0;
-
-	char *token, *tctx = NULL, *tstart = (char *)_str;
-
-	while ((token = strtok_r(tstart, "|", &tctx))) {
-		enum intel_engine_id engine = str_to_engine(token);
-
-		if ((int)engine < 0 || engine == DEFAULT || engine == VCS)
-			return 0;
-
-		mask |= 1 << engine;
-
-		tstart = NULL;
-	}
-
-	return mask;
-}
-
 static unsigned long
 allocate_working_set(struct workload *wrk, struct working_set *set);

 static long __duration(long dur, double scale)
 {
 	return round(scale * dur);
 }

+static int
+parse_duration(unsigned int nr_steps, struct duration *dur, double scale_dur, char *field)
+{
+	char *sep = NULL;
+	long tmpl;
+
+	if (field[0] == '*') {
+		if (intel_gen(intel_get_drm_devid(fd)) < 8) {
+			wsim_err("Infinite batch at step %u needs Gen8+!\n", nr_steps);
+			return -1;
+		}
+		dur->unbound = true;
+	} else {
+		tmpl = strtol(field, &sep, 10);
+		if (tmpl <= 0 || tmpl == LONG_MIN || tmpl == LONG_MAX) {
+			wsim_err("Invalid duration at step %u!\n", nr_steps);
+			return -1;
+		}
+		dur->min = __duration(tmpl, scale_dur);
+
+		if (sep && *sep == '-') {
+			tmpl = strtol(sep + 1, NULL, 10);
+			if (tmpl <= 0 || __duration(tmpl, scale_dur) <= dur->min ||
+			    tmpl == LONG_MIN || tmpl == LONG_MAX) {
+				wsim_err("Invalid maximum duration at step %u!\n", nr_steps);
+				return -1;
+			}
+			dur->max = __duration(tmpl, scale_dur);
+		} else {
+			dur->max = dur->min;
+		}
+	}
+
+	return 0;
+}
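Per parse_duration() above, a step's duration field is either a fixed value ("1000"), a min-max range from which a value is drawn ("500-1500"), or "*" for an unbound batch, which requires Gen8 or newer.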
 #define int_field(_STEP_, _FIELD_, _COND_, _ERR_) \
-	if ((field = strtok_r(fstart, ".", &fctx))) { \
-		tmp = atoi(field); \
-		check_arg(_COND_, _ERR_, nr_steps); \
-		step.type = _STEP_; \
-		step._FIELD_ = tmp; \
-		goto add_step; \
-	} \
+	do { \
+		field = strtok_r(fstart, ".", &fctx); \
+		if (field) { \
+			tmp = atoi(field); \
+			check_arg(_COND_, _ERR_, nr_steps); \
+			step.type = _STEP_; \
+			step._FIELD_ = tmp; \
+			goto add_step; \
+		} \
+	} while (0)
 static struct workload *
 parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
@@ -892,9 +999,18 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 		valid = 0;
 		memset(&step, 0, sizeof(step));

-		if ((field = strtok_r(fstart, ".", &fctx))) {
+		field = strtok_r(fstart, ".", &fctx);
+		if (field) {
 			fstart = NULL;

+			/* line starting with # is a comment */
+			if (field[0] == '#') {
+				if (verbose > 3)
+					printf("skipped line: %s\n", _token);
+				free(token);
+				continue;
+			}
+
 			if (!strcmp(field, "d")) {
 				int_field(DELAY, delay, tmp <= 0,
 					  "Invalid delay at step %u!\n");
@@ -903,6 +1019,13 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 					  "Invalid period at step %u!\n");
 			} else if (!strcmp(field, "P")) {
 				unsigned int nr = 0;
+
+				if (is_xe) {
+					wsim_err("Priority step is not implemented with xe yet.\n");
+					free(token);
+					return NULL;
+				}
+
 				while ((field = strtok_r(fstart, ".", &fctx))) {
 					tmp = atoi(field);
 					check_arg(nr == 0 && tmp <= 0,
@@ -928,6 +1051,13 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 					  "Invalid sync target at step %u!\n");
 			} else if (!strcmp(field, "S")) {
 				unsigned int nr = 0;
+
+				if (is_xe) {
+					wsim_err("SSEU step is not implemented with xe yet.\n");
+					free(token);
+					return NULL;
+				}
+
 				while ((field = strtok_r(fstart, ".", &fctx))) {
 					tmp = atoi(field);
 					check_arg(tmp <= 0 && nr == 0,
@@ -964,6 +1094,7 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 				goto add_step;
 			} else if (!strcmp(field, "M")) {
 				unsigned int nr = 0;
+
 				while ((field = strtok_r(fstart, ".", &fctx))) {
 					tmp = atoi(field);
 					check_arg(nr == 0 && tmp <= 0,
@@ -994,6 +1125,7 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 					  "Invalid terminate target at step %u!\n");
 			} else if (!strcmp(field, "X")) {
 				unsigned int nr = 0;
+
 				while ((field = strtok_r(fstart, ".", &fctx))) {
 					tmp = atoi(field);
 					check_arg(nr == 0 && tmp <= 0,
@@ -1018,6 +1150,7 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 				goto add_step;
 			} else if (!strcmp(field, "B")) {
 				unsigned int nr = 0;
+
 				while ((field = strtok_r(fstart, ".", &fctx))) {
 					tmp = atoi(field);
 					check_arg(nr == 0 && tmp <= 0,
@@ -1037,6 +1170,13 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 				goto add_step;
 			} else if (!strcmp(field, "b")) {
 				unsigned int nr = 0;
+
+				if (is_xe) {
+					wsim_err("Bonding is not implemented with xe yet.\n");
+					free(token);
+					return NULL;
+				}
+
 				while ((field = strtok_r(fstart, ".", &fctx))) {
 					check_arg(nr > 2,
 						  "Invalid bond format at step %u!\n",
@@ -1049,18 +1189,19 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 						  "Invalid context at step %u!\n",
 						  nr_steps);
 					} else if (nr == 1) {
-						step.bond_mask = engine_list_mask(field);
-						check_arg(step.bond_mask == 0,
+						tmp = parse_bond_engines(&step, field);
+						check_arg(tmp < 0,
 							  "Invalid siblings list at step %u!\n",
 							  nr_steps);
 					} else if (nr == 2) {
-						tmp = str_to_engine(field);
-						check_arg(tmp <= 0 ||
-							  tmp == VCS ||
-							  tmp == DEFAULT,
+						struct intel_engines engines = {};
+
+						step.bond.master = str_to_engine(field);
+						check_arg(append_matching_engines(&step.bond.master,
+										  &engines) != 1,
 							  "Invalid master engine at step %u!\n",
 							  nr_steps);
-						step.bond_master = tmp;
+						free(engines.engines);
 					}

 					nr++;
@@ -1071,6 +1212,12 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 			} else if (!strcmp(field, "w") || !strcmp(field, "W")) {
 				unsigned int nr = 0;
+
+				if (is_xe) {
+					wsim_err("Working sets are not implemented with xe yet.\n");
+					free(token);
+					return NULL;
+				}

 				step.working_set.shared = field[0] == 'W';

 				while ((field = strtok_r(fstart, ".", &fctx))) {
@@ -1108,56 +1255,29 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
 			valid++;
 		}
if ((field = strtok_r(fstart, ".", &fctx))) { field = strtok_r(fstart, ".", &fctx);
if (field) {
fstart = NULL; fstart = NULL;
i = str_to_engine(field); step.engine = str_to_engine(field);
check_arg(i < 0, check_arg(!is_valid_engine(&step.engine),
"Invalid engine id at step %u!\n", nr_steps); "Invalid engine id at step %u!\n", nr_steps);
valid++; valid++;
step.engine = i;
} }
if ((field = strtok_r(fstart, ".", &fctx))) { field = strtok_r(fstart, ".", &fctx);
char *sep = NULL; if (field) {
long int tmpl;
fstart = NULL; fstart = NULL;
if (field[0] == '*') { if (parse_duration(nr_steps, &step.duration, scale_dur, field))
check_arg(intel_gen(intel_get_drm_devid(fd)) < 8, return NULL;
"Infinite batch at step %u needs Gen8+!\n",
nr_steps);
step.unbound_duration = true;
} else {
tmpl = strtol(field, &sep, 10);
check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
tmpl == LONG_MAX,
"Invalid duration at step %u!\n",
nr_steps);
step.duration.min = __duration(tmpl, scale_dur);
if (sep && *sep == '-') {
tmpl = strtol(sep + 1, NULL, 10);
check_arg(tmpl <= 0 ||
tmpl <= step.duration.min ||
tmpl == LONG_MIN ||
tmpl == LONG_MAX,
"Invalid duration range at step %u!\n",
nr_steps);
step.duration.max = __duration(tmpl,
scale_dur);
} else {
step.duration.max = step.duration.min;
}
}
valid++; valid++;
} }
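Extracting the duration parsing into parse_duration() also centralises the error handling. Judging by the inline parser it replaces, the helper has to accept "<min>", "<min>-<max>" and "*" for an unbound batch, roughly as in this sketch (the real helper presumably reports errors via nr_steps and check_arg):

static int parse_duration(unsigned int nr_steps, struct duration *dur,
			  double scale_dur, char *field)
{
	char *sep = NULL;
	long int tmpl;

	if (field[0] == '*') {
		dur->unbound = true;
		return 0;
	}

	tmpl = strtol(field, &sep, 10);
	if (tmpl <= 0 || tmpl == LONG_MIN || tmpl == LONG_MAX)
		return 1;	/* invalid duration at step nr_steps */
	dur->min = __duration(tmpl, scale_dur);

	if (sep && *sep == '-') {
		tmpl = strtol(sep + 1, NULL, 10);
		if (tmpl <= dur->min || tmpl == LONG_MIN || tmpl == LONG_MAX)
			return 1;	/* invalid duration range */
		dur->max = __duration(tmpl, scale_dur);
	} else {
		dur->max = dur->min;
	}

	return 0;
}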
field = strtok_r(fstart, ".", &fctx);
if (field) {
fstart = NULL;
tmp = parse_dependencies(nr_steps, &step, field);
@@ -1167,7 +1287,8 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
valid++;
}
field = strtok_r(fstart, ".", &fctx);
if (field) {
fstart = NULL;
check_arg(strlen(field) != 1 ||
@@ -1188,7 +1309,8 @@ add_step:
step.delay = __duration(step.delay, scale_time);
step.idx = nr_steps++;
step.rq_link.next = NULL;
step.rq_link.prev = NULL;
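/*
 * Note: a null rq_link.next doubles as "not on any request list";
 * run_workload() tests it before unlinking, replacing the old
 * request == -1 sentinel.
 */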
steps = realloc(steps, sizeof(step) * nr_steps);
igt_assert(steps);
@@ -1220,7 +1342,7 @@ add_step:
wrk->sseu = arg->sseu;
wrk->max_working_set_id = -1;
wrk->working_sets = NULL;
wrk->bo_prng = (flags & FLAG_SYNCEDCLIENTS) ? master_prng : rand();
free(desc);
@@ -1229,8 +1351,10 @@ add_step:
* referencing them as a sync fence dependency.
*/
for (i = 0; i < nr_steps; i++) {
struct dep_entry *dep;
for_each_dep(dep, steps[i].fence_deps) {
tmp = steps[i].idx + dep->target;
check_arg(tmp < 0 || tmp >= i ||
(steps[tmp].type != BATCH &&
steps[tmp].type != SW_FENCE),
@@ -1252,14 +1376,14 @@ add_step:
/*
* Check no duplicate working set ids.
*/
for_each_w_step(w, wrk) {
struct w_step *w2;
if (w->type != WORKINGSET)
continue;
for_each_w_step(w2, wrk) {
if (w->idx == w2->idx)
continue;
if (w2->type != WORKINGSET)
continue;
@@ -1272,7 +1396,7 @@ add_step:
/*
* Allocate shared working sets.
*/
for_each_w_step(w, wrk) {
if (w->type == WORKINGSET && w->working_set.shared) {
unsigned long total =
allocate_working_set(wrk, &w->working_set);
@@ -1284,7 +1408,7 @@ add_step:
}
wrk->max_working_set_id = -1;
for_each_w_step(w, wrk) {
if (w->type == WORKINGSET &&
w->working_set.shared &&
w->working_set.id > wrk->max_working_set_id)
@@ -1295,7 +1419,7 @@ add_step:
sizeof(*wrk->working_sets));
igt_assert(wrk->working_sets);
for_each_w_step(w, wrk) {
if (w->type == WORKINGSET && w->working_set.shared)
wrk->working_sets[w->working_set.id] = &w->working_set;
}
@@ -1306,8 +1430,9 @@ add_step:
static struct workload *
clone_workload(struct workload *_wrk)
{
int nr_engines = query_engines()->nr_engines;
struct workload *wrk;
struct w_step *w;
wrk = malloc(sizeof(*wrk));
igt_assert(wrk);
@@ -1334,16 +1459,20 @@ clone_workload(struct workload *_wrk)
}
/* Check if we need a sw sync timeline. */
for_each_w_step(w, wrk) {
if (w->type == SW_FENCE) {
wrk->sync_timeline = sw_sync_timeline_create();
igt_assert(wrk->sync_timeline >= 0);
break;
}
}
wrk->requests = calloc(nr_engines, sizeof(*wrk->requests));
igt_assert(wrk->requests);
wrk->nrequest = calloc(nr_engines, sizeof(*wrk->nrequest));
igt_assert(wrk->nrequest);
while (--nr_engines >= 0)
IGT_INIT_LIST_HEAD(&wrk->requests[nr_engines]);
return wrk;
}
@@ -1370,37 +1499,32 @@ __get_ctx(struct workload *wrk, const struct w_step *w)
return &wrk->ctx_list[w->context];
}
static uint32_t mmio_base(int i915, const intel_engine_t *engine, int gen)
{
char name[16];
if (gen >= 11)
return 0;
switch (engine->engine_class) {
default:
return 0;
case DEFAULT_ID:
case RCS:
snprintf(name, sizeof(name), "rcs%u", engine->engine_instance);
break;
case BCS:
snprintf(name, sizeof(name), "bcs%u", engine->engine_instance);
break;
case VCS:
snprintf(name, sizeof(name), "vcs%u", engine->engine_instance);
break;
case VECS:
snprintf(name, sizeof(name), "vecs%u", engine->engine_instance);
break;
case CCS:
snprintf(name, sizeof(name), "ccs%u", engine->engine_instance);
break;
}
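For orientation, the values this produces for the macros defined in create_bb() below, assuming the historical 0x2000 rcs0 MMIO base on pre-Gen11 parts:

/* rcs0, gen < 11 (illustrative):
 *   base      = 0x2000
 *   CS_GPR(0) = base + 0x600 = 0x2600
 *   TIMESTAMP = base + 0x3a8 = 0x23a8  (per-engine CTX_TIMESTAMP)
 */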
@@ -1410,12 +1534,12 @@ static uint32_t mmio_base(int i915, enum intel_engine_id engine, int gen)
static unsigned int create_bb(struct w_step *w, int self)
{
const int gen = intel_gen(intel_get_drm_devid(fd));
const uint32_t base = mmio_base(fd, &w->engine, gen);
#define CS_GPR(x) (base + 0x600 + 8 * (x))
#define TIMESTAMP (base + 0x3a8)
const int use_64b = gen >= 8;
enum { START_TS, NOW_TS };
uint32_t *cs, *jmp;
unsigned int r = 0;
/* Loop until CTX_TIMESTAMP - initial > target ns */
@@ -1423,10 +1547,19 @@ static unsigned int create_bb(struct w_step *w, int self)
gem_set_domain(fd, w->bb_handle,
I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
if (__gem_set_caching(fd, w->bb_handle, I915_CACHING_CACHED) == 0) {
cs = gem_mmap__cpu(fd, w->bb_handle,
0, w->bb_size,
PROT_READ | PROT_WRITE);
} else {
cs = gem_mmap__device_coherent(fd,
w->bb_handle,
0, w->bb_size,
PROT_READ | PROT_WRITE);
}
/* Store initial 64b timestamp: start */
*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_CS_MMIO_DST;
*cs++ = CS_GPR(START_TS) + 4;
*cs++ = 0;
*cs++ = MI_LOAD_REGISTER_REG | MI_CS_MMIO_DST | MI_CS_MMIO_SRC;
@@ -1441,7 +1574,7 @@ static unsigned int create_bb(struct w_step *w, int self)
*cs++ = MI_ARB_CHECK;
/* Store this 64b timestamp: now */
*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_CS_MMIO_DST;
*cs++ = CS_GPR(NOW_TS) + 4;
*cs++ = 0;
*cs++ = MI_LOAD_REGISTER_REG | MI_CS_MMIO_DST | MI_CS_MMIO_SRC;
@@ -1456,11 +1589,13 @@ static unsigned int create_bb(struct w_step *w, int self)
*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
/* Save delta for indirect read by COND_BBE */
*cs++ = MI_STORE_REGISTER_MEM_CMD | (1 + use_64b) | MI_CS_MMIO_DST;
*cs++ = CS_GPR(NOW_TS);
w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset;
w->i915.reloc[r].target_handle = self;
w->i915.reloc[r].offset = offset_in_page(cs);
w->i915.reloc[r].delta = 4000;
*cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta;
*cs++ = 0;
r++;
@@ -1473,19 +1608,23 @@ static unsigned int create_bb(struct w_step *w, int self)
/* Break if delta [time elapsed] > target ns (target filled in later) */
*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
w->i915.bb_duration = cs;
*cs++ = 0;
w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset;
w->i915.reloc[r].target_handle = self;
w->i915.reloc[r].offset = offset_in_page(cs);
w->i915.reloc[r].delta = 4000;
*cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta;
*cs++ = 0;
r++;
/* Otherwise back to recalculating delta */
*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset;
w->i915.reloc[r].target_handle = self;
w->i915.reloc[r].offset = offset_in_page(cs);
w->i915.reloc[r].delta = offset_in_page(jmp);
*cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta;
*cs++ = 0;
r++;
@@ -1493,135 +1632,220 @@ static unsigned int create_bb(struct w_step *w, int self)
return r;
}
static void
eb_update_flags(struct workload *wrk, struct w_step *w)
{
w->i915.eb.flags = w->engine_idx;
w->i915.eb.flags |= I915_EXEC_HANDLE_LUT;
w->i915.eb.flags |= I915_EXEC_NO_RELOC;
igt_assert(w->emit_fence <= 0);
if (w->emit_fence)
w->i915.eb.flags |= I915_EXEC_FENCE_OUT;
}
static uint32_t
get_ctxid(struct workload *wrk, struct w_step *w)
{
return wrk->ctx_list[w->context].id;
}
static struct xe_exec_queue *
xe_get_eq(struct workload *wrk, const struct w_step *w)
{
struct ctx *ctx = __get_ctx(wrk, w);
igt_assert_lt(w->engine_idx, ctx->xe.nr_queues);
return &ctx->xe.queue_list[w->engine_idx];
}
static struct vm *
get_vm(struct workload *wrk, const struct w_step *w)
{
return wrk->vm_list;
}
static uint32_t alloc_bo(int i915, unsigned long *size)
{
uint32_t handle;
uint64_t sz = *size;
igt_assert_eq(__gem_create(i915, &sz, &handle), 0);
igt_assert(sz <= ULONG_MAX);
*size = sz;
return handle;
}
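Because __gem_create() hands back the size the kernel actually allocated, callers now pass the size by pointer and read it back, for example:

unsigned long size = 4096;
uint32_t handle = alloc_bo(fd, &size);
/* size may now be larger than requested, e.g. rounded up on
 * platforms with a bigger minimum page size */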
static void
alloc_step_batch(struct workload *wrk, struct w_step *w)
{
struct dep_entry *dep;
unsigned int j = 0;
unsigned int nr_obj = 2 + w->data_deps.nr;
unsigned int objflags = 0;
uint64_t addr;
struct vm *vm = get_vm(wrk, w);
addr = gem_aperture_size(fd) / 2;
if (addr >> 32)
objflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
if (vm->ahnd)
objflags |= EXEC_OBJECT_PINNED;
w->i915.obj = calloc(nr_obj, sizeof(*w->i915.obj));
igt_assert(w->i915.obj);
w->bb_size = PAGE_SIZE;
w->i915.obj[j].handle = alloc_bo(fd, &w->bb_size);
w->i915.obj[j].flags = EXEC_OBJECT_WRITE;
if (vm->ahnd) {
addr = get_offset(vm->ahnd, w->i915.obj[j].handle, w->bb_size, 0);
w->i915.obj[j].offset = CANONICAL(addr);
w->i915.obj[j].flags |= objflags;
}
j++;
igt_assert(j < nr_obj);
for_each_dep(dep, w->data_deps) {
uint32_t dep_handle;
uint64_t dep_size;
if (dep->working_set == -1) {
int dep_idx = w->idx + dep->target;
igt_assert(dep->target <= 0);
igt_assert(dep_idx >= 0 && dep_idx < w->idx);
igt_assert(wrk->steps[dep_idx].type == BATCH);
dep_handle = wrk->steps[dep_idx].i915.obj[0].handle;
dep_size = w->bb_size;
} else {
struct working_set *set;
igt_assert(dep->working_set <=
wrk->max_working_set_id);
set = wrk->working_sets[dep->working_set];
igt_assert(set->nr);
igt_assert(dep->target < set->nr);
igt_assert(set->sizes[dep->target].size);
dep_handle = set->handles[dep->target];
dep_size = set->sizes[dep->target].size;
}
w->i915.obj[j].flags = dep->write ? EXEC_OBJECT_WRITE : 0;
w->i915.obj[j].handle = dep_handle;
if (vm->ahnd) {
addr = get_offset(vm->ahnd, w->i915.obj[j].handle, dep_size, 0);
w->i915.obj[j].offset = CANONICAL(addr);
w->i915.obj[j].flags |= objflags;
}
j++;
igt_assert(j < nr_obj);
}
w->bb_handle = w->i915.obj[j].handle = alloc_bo(fd, &w->bb_size);
if (vm->ahnd) {
addr = get_offset(vm->ahnd, w->i915.obj[j].handle, w->bb_size, 0);
w->i915.obj[j].offset = CANONICAL(addr);
w->i915.obj[j].flags |= objflags;
}
w->i915.obj[j].relocation_count = create_bb(w, j);
if (vm->ahnd) {
w->i915.obj[j].relocation_count = 0;
} else {
igt_assert(w->i915.obj[j].relocation_count <= ARRAY_SIZE(w->i915.reloc));
w->i915.obj[j].relocs_ptr = to_user_pointer(&w->i915.reloc);
}
w->i915.eb.buffers_ptr = to_user_pointer(w->i915.obj);
w->i915.eb.buffer_count = j + 1;
w->i915.eb.rsvd1 = get_ctxid(wrk, w);
eb_update_flags(wrk, w);
#ifdef DEBUG
printf("%u: %u:|", w->idx, w->i915.eb.buffer_count);
for (i = 0; i <= j; i++)
printf("%x|", w->i915.obj[i].handle);
printf(" flags=%llx bb=%x[%u] ctx[%u]=%u\n",
w->i915.eb.flags, w->bb_handle, j, w->context,
get_ctxid(wrk, w));
#endif
}
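One design point worth a note: the function now supports two addressing modes, picked by whether the workload's VM carries an allocator handle. A condensed sketch of the pattern, using the same calls as above (obj standing for one of the w->i915.obj slots, obj_size for its size):

if (vm->ahnd) {
	/* softpin: fixed, canonical address from the allocator */
	obj->offset = CANONICAL(get_offset(vm->ahnd, obj->handle, obj_size, 0));
	obj->flags |= EXEC_OBJECT_PINNED;
	obj->relocation_count = 0;	/* kernel never patches the batch */
} else {
	/* relocations, with presumed_offset primed for I915_EXEC_NO_RELOC */
	obj->relocs_ptr = to_user_pointer(w->i915.reloc);
}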
static void
xe_alloc_step_batch(struct workload *wrk, struct w_step *w)
{
struct vm *vm = get_vm(wrk, w);
struct xe_exec_queue *eq = xe_get_eq(wrk, w);
struct dep_entry *dep;
int i;
w->bb_size = xe_bb_size(fd, PAGE_SIZE);
w->bb_handle = xe_bo_create(fd, vm->id, w->bb_size,
vram_if_possible(fd, eq->hwe_list[0].gt_id),
DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
w->xe.data = xe_bo_map(fd, w->bb_handle, w->bb_size);
w->xe.exec.address =
intel_allocator_alloc_with_strategy(vm->ahnd, w->bb_handle, w->bb_size,
0, ALLOC_STRATEGY_LOW_TO_HIGH);
xe_vm_bind_sync(fd, vm->id, w->bb_handle, 0, w->xe.exec.address, w->bb_size);
xe_spin_init_opts(&w->xe.data->spin, .addr = w->xe.exec.address,
.preempt = (w->preempt_us > 0),
.ctx_ticks = xe_spin_nsec_to_ticks(fd, eq->hwe_list[0].gt_id,
1000LL * get_duration(wrk, w)));
w->xe.exec.exec_queue_id = eq->id;
w->xe.exec.num_batch_buffer = 1;
/* always at least one out fence */
w->xe.exec.num_syncs = 1;
/* count syncs */
for_each_dep(dep, w->data_deps) {
int dep_idx = w->idx + dep->target;
igt_assert(dep_idx >= 0 && dep_idx < w->idx);
igt_assert(wrk->steps[dep_idx].type == BATCH);
w->xe.exec.num_syncs++;
}
for_each_dep(dep, w->fence_deps) {
int dep_idx = w->idx + dep->target;
igt_assert(dep_idx >= 0 && dep_idx < w->idx);
igt_assert(wrk->steps[dep_idx].type == SW_FENCE ||
wrk->steps[dep_idx].type == BATCH);
w->xe.exec.num_syncs++;
}
w->xe.syncs = calloc(w->xe.exec.num_syncs, sizeof(*w->xe.syncs));
/* fill syncs */
i = 0;
/* out fence */
w->xe.syncs[i].handle = syncobj_create(fd, 0);
w->xe.syncs[i].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
w->xe.syncs[i++].flags = DRM_XE_SYNC_FLAG_SIGNAL;
/* in fence(s) */
for_each_dep(dep, w->data_deps) {
int dep_idx = w->idx + dep->target;
igt_assert(wrk->steps[dep_idx].xe.syncs && wrk->steps[dep_idx].xe.syncs[0].handle);
w->xe.syncs[i].handle = wrk->steps[dep_idx].xe.syncs[0].handle;
w->xe.syncs[i++].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
}
for_each_dep(dep, w->fence_deps) {
int dep_idx = w->idx + dep->target;
igt_assert(wrk->steps[dep_idx].xe.syncs && wrk->steps[dep_idx].xe.syncs[0].handle);
w->xe.syncs[i].handle = wrk->steps[dep_idx].xe.syncs[0].handle;
w->xe.syncs[i++].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
}
w->xe.exec.syncs = to_user_pointer(w->xe.syncs);
}
static bool set_priority(uint32_t ctx_id, int prio)
{
struct drm_i915_gem_context_param param = {
@@ -1671,22 +1895,6 @@ static void vm_destroy(int i915, uint32_t vm_id)
igt_assert_eq(__vm_destroy(i915, vm_id), 0);
}
static struct drm_i915_gem_context_param_sseu get_device_sseu(void)
{
struct drm_i915_gem_context_param param = { };
@@ -1710,7 +1918,7 @@ set_ctx_sseu(struct ctx *ctx, uint64_t slice_mask)
if (slice_mask == -1)
slice_mask = device_sseu.slice_mask;
if (ctx->engine_map.nr_engines && ctx->load_balance) {
sseu.flags = I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX;
sseu.engine.engine_class = I915_ENGINE_CLASS_INVALID;
sseu.engine.engine_instance = 0;
@@ -1767,7 +1975,7 @@ allocate_working_set(struct workload *wrk, struct working_set *set)
for (i = 0; i < set->nr; i++) {
set->sizes[i].size = get_buffer_size(wrk, &set->sizes[i]);
set->handles[i] = alloc_bo(fd, &set->sizes[i].size);
total += set->sizes[i].size;
}
@@ -1791,21 +1999,23 @@ find_dep(struct dep_entry *deps, unsigned int nr, struct dep_entry dep)
static void measure_active_set(struct workload *wrk)
{
unsigned long total = 0, batch_sizes = 0;
struct dep_entry *dep, *deps = NULL;
unsigned int nr = 0;
struct w_step *w;
if (verbose < 3)
return;
for_each_w_step(w, wrk) {
if (w->type != BATCH)
continue;
batch_sizes += w->bb_size;
if (is_xe)
continue;
for_each_dep(dep, w->data_deps) {
struct dep_entry _dep = *dep;
if (dep->working_set == -1 && dep->target < 0) {
@@ -1814,12 +2024,12 @@ static void measure_active_set(struct workload *wrk)
igt_assert(idx >= 0 && idx < w->idx);
igt_assert(wrk->steps[idx].type == BATCH);
_dep.target = wrk->steps[idx].i915.obj[0].handle;
}
if (!find_dep(deps, nr, _dep)) {
if (dep->working_set == -1) {
total += w->bb_size;
} else {
struct working_set *set;
@@ -1848,24 +2058,39 @@ static void measure_active_set(struct workload *wrk)
#define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
static void xe_vm_create_(struct vm *vm)
{
uint32_t flags = 0;
if (vm->compute_mode)
flags |= DRM_XE_VM_CREATE_FLAG_LR_MODE;
vm->id = xe_vm_create(fd, flags, 0);
}
static void xe_exec_queue_create_(struct ctx *ctx, struct xe_exec_queue *eq)
{
struct drm_xe_exec_queue_create create = {
.vm_id = ctx->vm->id,
.width = 1,
.num_placements = eq->nr_hwes,
.instances = to_user_pointer(eq->hwe_list),
};
igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &create), 0);
eq->id = create.exec_queue_id;
}
static void allocate_contexts(unsigned int id, struct workload *wrk)
{
int max_ctx = -1;
struct w_step *w;
/*
* Pre-scan workload steps to allocate context list storage.
*/
for_each_w_step(w, wrk) {
int ctx = w->context + 1;
int delta;
@@ -1884,22 +2109,27 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
max_ctx = ctx;
}
}
static int prepare_contexts(unsigned int id, struct workload *wrk)
{
uint32_t share_vm = 0;
struct w_step *w;
struct ctx *ctx, *ctx2;
unsigned int j;
/*
* Transfer over engine map configuration from the workload step.
*/
__for_each_ctx(ctx, wrk, ctx_idx) {
for_each_w_step(w, wrk) {
if (w->context != ctx_idx)
continue;
if (w->type == ENGINE_MAP) {
ctx->engine_map = w->engine_map;
} else if (w->type == LOAD_BALANCE) {
if (!ctx->engine_map.nr_engines) {
wsim_err("Load balancing needs an engine map!\n");
return 1;
}
@@ -1918,10 +2148,7 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
ctx->bond_count *
sizeof(struct bond));
igt_assert(ctx->bonds);
ctx->bonds[ctx->bond_count - 1] = w->bond;
}
}
}
@@ -1929,32 +2156,39 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
/*
* Create and configure contexts.
*/
__for_each_ctx(ctx, wrk, ctx_idx) {
struct drm_i915_gem_context_create_ext_setparam ext = {
.base.name = I915_CONTEXT_CREATE_EXT_SETPARAM,
.param.param = I915_CONTEXT_PARAM_VM,
};
struct drm_i915_gem_context_create_ext args = { };
uint32_t ctx_id;
igt_assert(!ctx->id);
/* Find existing context to share ppgtt with. */
if (!share_vm)
for_each_ctx(ctx2, wrk) {
struct drm_i915_gem_context_param param = {
.param = I915_CONTEXT_PARAM_VM,
.ctx_id = ctx2->id,
};
if (!param.ctx_id)
continue;
gem_context_get_param(fd, &param);
igt_assert(param.value);
share_vm = param.value;
wrk->nr_vms = 1;
wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm));
igt_assert(wrk->vm_list);
wrk->vm_list->id = share_vm;
wrk->vm_list->ahnd = intel_allocator_open(fd, share_vm,
INTEL_ALLOCATOR_RELOC);
ctx2->vm = wrk->vm_list;
break;
}
if (share_vm) {
ext.param.value = share_vm;
@@ -1968,22 +2202,44 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
ctx_id = args.ctx_id;
ctx->id = ctx_id;
ctx->sseu = device_sseu.slice_mask;
ctx->vm = wrk->vm_list;
__configure_context(ctx_id, wrk->prio);
if (ctx->engine_map.nr_engines) {
struct i915_context_param_engines *set_engines =
alloca0(sizeof_param_engines(ctx->engine_map.nr_engines + 1));
struct i915_context_engines_load_balance *load_balance =
alloca0(sizeof_load_balance(ctx->engine_map.nr_engines));
struct drm_i915_gem_context_param param = {
.ctx_id = ctx_id,
.param = I915_CONTEXT_PARAM_ENGINES,
.size = sizeof_param_engines(ctx->engine_map.nr_engines + 1),
.value = to_user_pointer(set_engines),
};
struct i915_context_engines_bond *last = NULL;
/* update engine_idx and request_idx */
for_each_w_step(w, wrk) {
if (w->context != ctx_idx)
continue;
if (w->type == BATCH) {
unsigned int map_idx = 0;
if (find_engine_in_map(&w->engine, &ctx->engine_map,
&map_idx))
/* 0 is virtual, map indexes are shifted by one */
w->engine_idx = map_idx + 1;
else
igt_assert(ctx->load_balance);
igt_assert(find_engine_in_map(&ctx->engine_map
.engines[map_idx],
query_engines(),
&w->request_idx));
}
}
if (ctx->load_balance) {
set_engines->extensions =
to_user_pointer(load_balance);
@@ -1991,11 +2247,12 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
load_balance->base.name =
I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
load_balance->num_siblings =
ctx->engine_map.nr_engines;
for (j = 0; j < ctx->engine_map.nr_engines; j++)
load_balance->engines[j] =
engine_to_i915_engine_class(&ctx->engine_map
.engines[j]);
}
/* Reserve slot for virtual engine. */
@@ -2004,34 +2261,32 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
set_engines->engines[0].engine_instance =
I915_ENGINE_CLASS_INVALID_NONE;
for (j = 1; j <= ctx->engine_map.nr_engines; j++)
set_engines->engines[j] =
engine_to_i915_engine_class(&ctx->engine_map
.engines[j - 1]);
last = NULL;
for (j = 0; j < ctx->bond_count; j++) {
struct intel_engines *mask = &ctx->bonds[j].mask;
struct i915_context_engines_bond *bond =
alloca0(sizeof_engines_bond(mask->nr_engines));
unsigned int b, e;
bond->base.next_extension = to_user_pointer(last);
bond->base.name = I915_CONTEXT_ENGINES_EXT_BOND;
bond->virtual_index = 0;
bond->master = engine_to_i915_engine_class(&ctx->bonds[j].master);
for (b = 0, e = 0; e < mask->nr_engines; e++) {
unsigned int idx;
igt_assert(find_engine_in_map(&mask->engines[e],
&ctx->engine_map,
&idx));
bond->engines[b++] = set_engines->engines[1 + idx];
}
last = bond;
@@ -2039,6 +2294,19 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
load_balance->base.next_extension = to_user_pointer(last);
gem_context_set_param(fd, &param);
} else {
/* update engine_idx and request_idx */
for_each_w_step(w, wrk) {
if (w->context != ctx_idx)
continue;
if (w->type == BATCH) {
w->engine_idx = engine_to_i915_legacy_ring(&w->engine);
resolve_to_physical_engine(&w->engine);
igt_assert(find_engine_in_map(&w->engine,
query_engines(),
&w->request_idx));
}
}
}
if (wrk->sseu) {
@@ -2050,51 +2318,163 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
if (share_vm)
vm_destroy(fd, share_vm);
return 0;
}
static int xe_prepare_contexts(unsigned int id, struct workload *wrk)
{
struct xe_exec_queue *eq;
struct w_step *w;
struct ctx *ctx;
unsigned int i;
/* shortcut, create one vm */
wrk->nr_vms = 1;
wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm));
igt_assert(wrk->vm_list);
wrk->vm_list->compute_mode = false;
xe_vm_create_(wrk->vm_list);
wrk->vm_list->ahnd = intel_allocator_open(fd, wrk->vm_list->id,
INTEL_ALLOCATOR_RELOC);
__for_each_ctx(ctx, wrk, ctx_idx) {
/* link with vm */
ctx->vm = wrk->vm_list;
for_each_w_step(w, wrk) {
if (w->context != ctx_idx)
continue;
if (w->type == ENGINE_MAP) {
ctx->engine_map = w->engine_map;
} else if (w->type == LOAD_BALANCE) {
if (!ctx->engine_map.nr_engines) {
wsim_err("Load balancing needs an engine map!\n");
return 1;
}
ctx->load_balance = w->load_balance;
}
}
/* create exec queue for each referenced engine */
if (ctx->engine_map.nr_engines) {
ctx->xe.nr_queues = 1;
ctx->xe.queue_list = calloc(ctx->xe.nr_queues, sizeof(*ctx->xe.queue_list));
igt_assert(ctx->xe.queue_list);
eq = &ctx->xe.queue_list[ctx->xe.nr_queues - 1];
eq->nr_hwes = ctx->engine_map.nr_engines;
eq->hwe_list = calloc(eq->nr_hwes, sizeof(*eq->hwe_list));
for (i = 0; i < eq->nr_hwes; ++i) {
eq->hwe_list[i] = ctx->engine_map.engines[i];
/* check no mixing classes and no duplicates */
for (int j = 0; j < i; ++j) {
if (eq->hwe_list[j].engine_class !=
eq->hwe_list[i].engine_class) {
free(eq->hwe_list);
eq->nr_hwes = 0;
wsim_err("Mixing of engine class not supported!\n");
return 1;
}
if (eq->hwe_list[j].engine_instance ==
eq->hwe_list[i].engine_instance) {
free(eq->hwe_list);
eq->nr_hwes = 0;
wsim_err("Duplicate engine entry!\n");
return 1;
}
}
if (verbose > 3)
printf("%u ctx[%d] %s [%u:%u:%u]\n", id,
ctx_idx,
intel_engine_class_string(ctx->engine_map
.engines[i]
.engine_class),
eq->hwe_list[i].engine_class,
eq->hwe_list[i].engine_instance,
eq->hwe_list[i].gt_id);
}
xe_exec_queue_create_(ctx, eq);
} else {
/* create engine_map, update engine_idx */
for_each_w_step(w, wrk) {
if (w->context != ctx_idx)
continue;
if (w->type == BATCH) {
resolve_to_physical_engine(&w->engine);
if (!find_engine_in_map(&w->engine, &ctx->engine_map,
&w->engine_idx)) {
igt_assert(1 ==
append_matching_engines(&w->engine,
&ctx->engine_map)
);
w->engine_idx = ctx->engine_map.nr_engines - 1;
}
}
}
/* skip not referenced context */
if (!ctx->engine_map.nr_engines)
continue;
ctx->xe.nr_queues = ctx->engine_map.nr_engines;
ctx->xe.queue_list = calloc(ctx->xe.nr_queues, sizeof(*ctx->xe.queue_list));
for (i = 0; i < ctx->xe.nr_queues; i++) {
eq = &ctx->xe.queue_list[i];
eq->nr_hwes = 1;
eq->hwe_list = calloc(1, sizeof(*eq->hwe_list));
eq->hwe_list[0] = ctx->engine_map.engines[i];
if (verbose > 3)
printf("%u ctx[%d] %s [%d:%d:%d]\n",
id, ctx_idx,
intel_engine_class_string(ctx->engine_map
.engines[i]
.engine_class),
eq->hwe_list[0].engine_class,
eq->hwe_list[0].engine_instance,
eq->hwe_list[0].gt_id);
xe_exec_queue_create_(ctx, eq);
}
} }
}
/* /* update request_idx */
* Scan for SSEU control steps. for_each_w_step(w, wrk) {
*/ if (w->context != ctx_idx)
for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { continue;
if (w->type == SSEU) { if (w->type == BATCH) {
get_device_sseu(); igt_assert(find_engine_in_map(&ctx->engine_map
break; .engines[w->engine_idx],
query_engines(),
&w->request_idx));
}
} }
} }
/* create syncobjs for SW_FENCE */
for_each_w_step(w, wrk)
if (w->type == SW_FENCE) {
w->xe.syncs = calloc(1, sizeof(struct drm_xe_sync));
w->xe.syncs[0].handle = syncobj_create(fd, 0);
w->xe.syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
}
return 0;
}
static void prepare_working_sets(unsigned int id, struct workload *wrk)
{
struct working_set **sets;
unsigned long total = 0;
struct w_step *w;
/*
* Allocate working sets.
*/
for_each_w_step(w, wrk) {
if (w->type == WORKINGSET && !w->working_set.shared)
total += allocate_working_set(wrk, &w->working_set);
}
@@ -2106,7 +2486,7 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
* Map of working set ids.
*/
wrk->max_working_set_id = -1;
for_each_w_step(w, wrk) {
if (w->type == WORKINGSET &&
w->working_set.id > wrk->max_working_set_id)
wrk->max_working_set_id = w->working_set.id;
@@ -2117,7 +2497,7 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
sizeof(*wrk->working_sets));
igt_assert(wrk->working_sets);
for_each_w_step(w, wrk) {
struct working_set *set;
if (w->type != WORKINGSET)
@@ -2138,20 +2518,86 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
if (sets)
free(sets);
}
static int prepare_workload(unsigned int id, struct workload *wrk)
{
struct w_step *w;
int ret = 0;
wrk->id = id;
wrk->bb_prng = (wrk->flags & FLAG_SYNCEDCLIENTS) ? master_prng : rand();
wrk->bo_prng = (wrk->flags & FLAG_SYNCEDCLIENTS) ? master_prng : rand();
wrk->run = true;
allocate_contexts(id, wrk);
if (is_xe)
ret = xe_prepare_contexts(id, wrk);
else
ret = prepare_contexts(id, wrk);
if (ret)
return ret;
/* Record default preemption. */
for_each_w_step(w, wrk)
if (w->type == BATCH)
w->preempt_us = 100;
/*
* Scan for contexts with modified preemption config and record their
* preemption period for the following steps belonging to the same
* context.
*/
for_each_w_step(w, wrk) {
struct w_step *w2;
if (w->type != PREEMPTION)
continue;
for (int j = w->idx + 1; j < wrk->nr_steps; j++) {
w2 = &wrk->steps[j];
if (w2->context != w->context)
continue;
else if (w2->type == PREEMPTION)
break;
else if (w2->type != BATCH)
continue;
w2->preempt_us = w->period;
}
}
/*
* Scan for SSEU control steps.
*/
for_each_w_step(w, wrk) {
if (w->type == SSEU) {
get_device_sseu();
break;
}
}
prepare_working_sets(id, wrk);
/*
* Allocate batch buffers.
*/
for_each_w_step(w, wrk) {
if (w->type != BATCH)
continue;
if (is_xe)
xe_alloc_step_batch(wrk, w);
else
alloc_step_batch(wrk, w);
}
measure_active_set(wrk);
return ret;
}
static double elapsed(const struct timespec *start, const struct timespec *end)
@@ -2172,10 +2618,10 @@ update_bb_start(struct workload *wrk, struct w_step *w)
/* ticks is inverted for MI_DO_COMPARE (less-than comparison) */
ticks = 0;
if (!w->duration.unbound)
ticks = ~ns_to_ctx_ticks(1000LL * get_duration(wrk, w));
*w->i915.bb_duration = ticks;
}
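Worked through with a hypothetical 19.2 MHz context timer, for a 1000us target:

/* target 1000us -> ns_to_ctx_ticks(1000 * 1000) == 19200
 * stored value  -> ~19200
 * Both the running delta (MI_MATH_STOREINV above) and the target are
 * bit-inverted, which turns the desired "elapsed > target" test into
 * the less-than compare that MI_COND_BATCH_BUFFER_END performs.
 */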
static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
@@ -2193,19 +2639,38 @@ static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
igt_assert(target < wrk->nr_steps);
igt_assert(wrk->steps[target].type == BATCH);
w_step_sync(&wrk->steps[target]);
}
static void do_xe_exec(struct workload *wrk, struct w_step *w)
{
struct xe_exec_queue *eq = xe_get_eq(wrk, w);
igt_assert(w->emit_fence <= 0);
if (w->emit_fence == -1)
syncobj_reset(fd, &w->xe.syncs[0].handle, 1);
/* update duration if random */
if (w->duration.max != w->duration.min)
xe_spin_init_opts(&w->xe.data->spin,
.addr = w->xe.exec.address,
.preempt = (w->preempt_us > 0),
.ctx_ticks = xe_spin_nsec_to_ticks(fd, eq->hwe_list[0].gt_id,
1000LL * get_duration(wrk, w)));
xe_exec(fd, &w->xe.exec);
} }
static void
do_eb(struct workload *wrk, struct w_step *w)
{
struct dep_entry *dep;
unsigned int i;
eb_update_flags(wrk, w);
update_bb_start(wrk, w);
for_each_dep(dep, w->fence_deps) {
int tgt = w->idx + dep->target;
/* TODO: fence merging needed to support multiple inputs */
igt_assert(i == 0);
@@ -2213,20 +2678,20 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine)
igt_assert(wrk->steps[tgt].emit_fence > 0);
if (w->fence_deps.submit_fence)
w->i915.eb.flags |= I915_EXEC_FENCE_SUBMIT;
else
w->i915.eb.flags |= I915_EXEC_FENCE_IN;
w->i915.eb.rsvd2 = wrk->steps[tgt].emit_fence;
}
if (w->i915.eb.flags & I915_EXEC_FENCE_OUT)
gem_execbuf_wr(fd, &w->i915.eb);
else
gem_execbuf(fd, &w->i915.eb);
if (w->i915.eb.flags & I915_EXEC_FENCE_OUT) {
w->emit_fence = w->i915.eb.rsvd2 >> 32;
igt_assert(w->emit_fence > 0);
}
}
@@ -2252,20 +2717,19 @@ static void sync_deps(struct workload *wrk, struct w_step *w)
igt_assert(dep_idx >= 0 && dep_idx < w->idx);
igt_assert(wrk->steps[dep_idx].type == BATCH);
w_step_sync(&wrk->steps[dep_idx]);
}
}
static void *run_workload(void *data)
{
struct workload *wrk = (struct workload *)data;
struct timespec t_start, t_end, repeat_start;
struct w_step *w;
int throttle = -1;
int qd_throttle = -1;
int count, missed = 0;
unsigned long time_tot = 0, time_min = ULONG_MAX, time_max = 0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
@@ -2273,13 +2737,14 @@ static void *run_workload(void *data)
count++) {
unsigned int cur_seqno = wrk->sync_seqno;
clock_gettime(CLOCK_MONOTONIC, &repeat_start);
for_each_w_step(w, wrk) {
int do_sleep = 0;
if (!wrk->run)
break;
if (w->type == DELAY) {
do_sleep = w->delay;
} else if (w->type == PERIOD) {
@@ -2287,7 +2752,7 @@ static void *run_workload(void *data)
int elapsed;
clock_gettime(CLOCK_MONOTONIC, &now);
elapsed = elapsed_us(&repeat_start, &now);
do_sleep = w->period - elapsed;
time_tot += elapsed;
if (elapsed < time_min)
@@ -2298,15 +2763,15 @@ static void *run_workload(void *data)
missed++;
if (verbose > 2)
printf("%u: Dropped period @ %u/%u (%dus late)!\n",
wrk->id, count, w->idx, do_sleep);
continue;
}
} else if (w->type == SYNC) {
unsigned int s_idx = w->idx + w->target;
igt_assert(s_idx >= 0 && s_idx < w->idx);
igt_assert(wrk->steps[s_idx].type == BATCH);
w_step_sync(&wrk->steps[s_idx]);
continue;
} else if (w->type == THROTTLE) {
throttle = w->throttle;
@@ -2320,12 +2785,16 @@ static void *run_workload(void *data)
sw_sync_timeline_create_fence(wrk->sync_timeline,
cur_seqno + w->idx);
igt_assert(w->emit_fence > 0);
if (is_xe)
/* Convert sync file to syncobj */
syncobj_import_sync_file(fd, w->xe.syncs[0].handle,
w->emit_fence);
continue;
} else if (w->type == SW_FENCE_SIGNAL) {
int tgt = w->idx + w->target;
int inc;
igt_assert(tgt >= 0 && tgt < w->idx);
igt_assert(wrk->steps[tgt].type == SW_FENCE);
cur_seqno += wrk->steps[tgt].idx;
inc = cur_seqno - wrk->sync_seqno;
@@ -2345,13 +2814,16 @@ static void *run_workload(void *data)
}
continue;
} else if (w->type == TERMINATE) {
unsigned int t_idx = w->idx + w->target;
igt_assert(t_idx >= 0 && t_idx < w->idx);
igt_assert(wrk->steps[t_idx].type == BATCH);
igt_assert(wrk->steps[t_idx].duration.unbound);
if (is_xe)
xe_spin_end(&wrk->steps[t_idx].xe.data->spin);
else
*wrk->steps[t_idx].i915.bb_duration = 0xffffffff;
__sync_synchronize();
continue;
} else if (w->type == SSEU) {
@@ -2377,40 +2849,41 @@ static void *run_workload(void *data)
igt_assert(w->type == BATCH);
if (wrk->flags & FLAG_DEPSYNC)
sync_deps(wrk, w);
if (throttle > 0)
w_sync_to(wrk, w, w->idx - throttle);
if (is_xe)
do_xe_exec(wrk, w);
else
do_eb(wrk, w);
if (w->rq_link.next) {
igt_list_del(&w->rq_link);
wrk->nrequest[w->request_idx]--;
}
igt_list_add_tail(&w->rq_link, &wrk->requests[w->request_idx]);
wrk->nrequest[w->request_idx]++;
if (!wrk->run)
break;
if (w->sync)
w_step_sync(w);
if (qd_throttle > 0) {
while (wrk->nrequest[w->request_idx] > qd_throttle) {
struct w_step *s;
s = igt_list_first_entry(&wrk->requests[w->request_idx],
s, rq_link);
w_step_sync(s);
igt_list_del(&s->rq_link);
wrk->nrequest[w->request_idx]--;
}
}
}
@@ -2424,21 +2897,44 @@ static void *run_workload(void *data)
}
/* Cleanup all fences instantiated in this iteration. */
for_each_w_step(w, wrk) {
if (!wrk->run)
break;
if (w->emit_fence > 0) {
if (is_xe) {
igt_assert(w->type == SW_FENCE);
syncobj_reset(fd, &w->xe.syncs[0].handle, 1);
}
close(w->emit_fence);
w->emit_fence = -1;
}
}
}
for (int i = query_engines()->nr_engines; --i >= 0;) {
if (!wrk->nrequest[i])
continue;
w = igt_list_last_entry(&wrk->requests[i], w, rq_link);
w_step_sync(w);
}
if (is_xe) {
for_each_w_step(w, wrk) {
if (w->type == BATCH) {
w_step_sync(w);
syncobj_destroy(fd, w->xe.syncs[0].handle);
free(w->xe.syncs);
xe_vm_unbind_sync(fd, get_vm(wrk, w)->id, 0, w->xe.exec.address,
w->bb_size);
gem_munmap(w->xe.data, w->bb_size);
gem_close(fd, w->bb_handle);
} else if (w->type == SW_FENCE) {
syncobj_destroy(fd, w->xe.syncs[0].handle);
free(w->xe.syncs);
}
}
} }
clock_gettime(CLOCK_MONOTONIC, &t_end);
@@ -2493,6 +2989,7 @@ static void print_help(void)
" -f <scale> Scale factor for batch durations.\n"
" -F <scale> Scale factor for delays.\n"
" -L List GPUs.\n"
" -l List physical engines.\n"
" -D <gpu> One of the GPUs from -L.\n" " -D <gpu> One of the GPUs from -L.\n"
); );
} }
...@@ -2503,6 +3000,7 @@ static char *load_workload_descriptor(char *filename) ...@@ -2503,6 +3000,7 @@ static char *load_workload_descriptor(char *filename)
char *buf; char *buf;
int infd, ret, i; int infd, ret, i;
ssize_t len; ssize_t len;
bool in_comment = false;
ret = stat(filename, &sbuf); ret = stat(filename, &sbuf);
if (ret || !S_ISREG(sbuf.st_mode)) if (ret || !S_ISREG(sbuf.st_mode))
...@@ -2519,8 +3017,18 @@ static char *load_workload_descriptor(char *filename) ...@@ -2519,8 +3017,18 @@ static char *load_workload_descriptor(char *filename)
close(infd);
for (i = 0; i < len; i++) {
/*
* Lines starting with '#' are skipped.
* If a command line step separator (',') is encountered after '#',
* it is replaced with ';' so that it does not break parsing.
*/
if (buf[i] == '#')
in_comment = true;
else if (buf[i] == '\n') {
buf[i] = ',';
in_comment = false;
} else if (in_comment && buf[i] == ',')
buf[i] = ';';
}
len--;
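A workload file may therefore carry comments; for instance, something like the following parses as two steps, with the '#' line (commas included) ignored:

# preamble comment, with a comma, is skipped entirely
1.VCS1.3000.0.1
1.RCS.500-1500.-1.0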
@@ -2541,10 +3049,42 @@ add_workload_arg(struct w_arg *w_args, unsigned int nr_args, char *w_arg,
return w_args;
}
static void list_engines(void)
{
struct intel_engines *engines = query_engines();
int engine_class_count[NUM_ENGINE_CLASSES] = {};
unsigned int i;
for (i = 0; i < engines->nr_engines; ++i) {
igt_assert_lt(engines->engines[i].engine_class, NUM_ENGINE_CLASSES);
engine_class_count[engines->engines[i].engine_class]++;
}
for (i = 0; i < engines->nr_engines; ++i) {
if (engine_class_count[engines->engines[i].engine_class] > 1)
printf("%s%u",
intel_engine_class_string(engines->engines[i].engine_class),
engines->engines[i].engine_instance + 1);
else
printf("%s",
intel_engine_class_string(engines->engines[i].engine_class));
if (is_xe && engines->engines[i].gt_id)
printf("-%u", engines->engines[i].gt_id);
if (verbose > 3)
printf(" [%d:%d:%d]", engines->engines[i].engine_class,
engines->engines[i].engine_instance,
engines->engines[i].gt_id);
printf("\n");
}
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
struct igt_device_card card = { }; struct igt_device_card card = { };
bool list_devices_arg = false; bool list_devices_arg = false;
bool list_engines_arg = false;
unsigned int repeat = 1; unsigned int repeat = 1;
unsigned int clients = 1; unsigned int clients = 1;
unsigned int flags = 0; unsigned int flags = 0;
...@@ -2567,11 +3107,14 @@ int main(int argc, char **argv) ...@@ -2567,11 +3107,14 @@ int main(int argc, char **argv)
master_prng = time(NULL); master_prng = time(NULL);
while ((c = getopt(argc, argv, while ((c = getopt(argc, argv,
"LhqvsSdc:r:w:W:a:p:I:f:F:D:")) != -1) { "LlhqvsSdc:r:w:W:a:p:I:f:F:D:")) != -1) {
switch (c) { switch (c) {
case 'L': case 'L':
list_devices_arg = true; list_devices_arg = true;
break; break;
case 'l':
list_engines_arg = true;
break;
case 'D': case 'D':
device_arg = strdup(optarg); device_arg = strdup(optarg);
break; break;
...@@ -2584,7 +3127,7 @@ int main(int argc, char **argv) ...@@ -2584,7 +3127,7 @@ int main(int argc, char **argv)
/* Fall through */ /* Fall through */
case 'w': case 'w':
w_args = add_workload_arg(w_args, ++nr_w_args, optarg, w_args = add_workload_arg(w_args, ++nr_w_args, optarg,
prio, flags & SSEU); prio, flags & FLAG_SSEU);
break; break;
case 'p': case 'p':
prio = atoi(optarg); prio = atoi(optarg);
...@@ -2610,13 +3153,13 @@ int main(int argc, char **argv) ...@@ -2610,13 +3153,13 @@ int main(int argc, char **argv)
verbose++; verbose++;
break; break;
case 'S': case 'S':
flags |= SYNCEDCLIENTS; flags |= FLAG_SYNCEDCLIENTS;
break; break;
case 's': case 's':
flags ^= SSEU; flags ^= FLAG_SSEU;
break; break;
case 'd': case 'd':
flags |= DEPSYNC; flags |= FLAG_DEPSYNC;
break; break;
case 'I': case 'I':
master_prng = strtol(optarg, NULL, 0); master_prng = strtol(optarg, NULL, 0);
...@@ -2635,7 +3178,7 @@ int main(int argc, char **argv) ...@@ -2635,7 +3178,7 @@ int main(int argc, char **argv)
} }
} }
igt_devices_scan(false); igt_devices_scan();
if (list_devices_arg) { if (list_devices_arg) {
struct igt_devices_print_format fmt = { struct igt_devices_print_format fmt = {
...@@ -2660,8 +3203,12 @@ int main(int argc, char **argv) ...@@ -2660,8 +3203,12 @@ int main(int argc, char **argv)
ret = igt_device_find_first_i915_discrete_card(&card); ret = igt_device_find_first_i915_discrete_card(&card);
if (!ret) if (!ret)
ret = igt_device_find_integrated_card(&card); ret = igt_device_find_integrated_card(&card);
if (!ret)
ret = igt_device_find_first_xe_discrete_card(&card);
if (!ret)
ret = igt_device_find_xe_integrated_card(&card);
if (!ret) { if (!ret) {
wsim_err("No device filter specified and no i915 devices found!\n"); wsim_err("No device filter specified and no intel devices found!\n");
return EXIT_FAILURE; return EXIT_FAILURE;
} }
} }
...@@ -2684,6 +3231,15 @@ int main(int argc, char **argv) ...@@ -2684,6 +3231,15 @@ int main(int argc, char **argv)
if (verbose > 1) if (verbose > 1)
printf("Using device %s\n", drm_dev); printf("Using device %s\n", drm_dev);
is_xe = is_xe_device(fd);
if (is_xe)
xe_device_get(fd);
if (list_engines_arg) {
list_engines();
goto out;
}
if (!nr_w_args) { if (!nr_w_args) {
wsim_err("No workload descriptor(s)!\n"); wsim_err("No workload descriptor(s)!\n");
goto err; goto err;
...@@ -2704,6 +3260,7 @@ int main(int argc, char **argv) ...@@ -2704,6 +3260,7 @@ int main(int argc, char **argv)
if (append_workload_arg) { if (append_workload_arg) {
struct w_arg arg = { NULL, append_workload_arg, 0 }; struct w_arg arg = { NULL, append_workload_arg, 0 };
app_w = parse_workload(&arg, flags, scale_dur, scale_time, app_w = parse_workload(&arg, flags, scale_dur, scale_time,
NULL); NULL);
if (!app_w) { if (!app_w) {
...@@ -2802,5 +3359,8 @@ int main(int argc, char **argv) ...@@ -2802,5 +3359,8 @@ int main(int argc, char **argv)
out: out:
exitcode = EXIT_SUCCESS; exitcode = EXIT_SUCCESS;
err: err:
if (is_xe)
xe_device_put(fd);
return exitcode; return exitcode;
} }
...@@ -45,6 +45,7 @@ ...@@ -45,6 +45,7 @@
*/ */
#include "igt.h" #include "igt.h"
#include "i915/gem_create.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
...@@ -72,74 +73,99 @@ get_time_in_secs(void) ...@@ -72,74 +73,99 @@ get_time_in_secs(void)
} }
static void static void
do_render(drm_intel_bufmgr *bufmgr, struct intel_batchbuffer *batch, do_render(int i915, uint32_t dst_handle)
drm_intel_bo *dst_bo, int width, int height)
{ {
uint32_t data[width * height]; struct drm_i915_gem_execbuffer2 exec = {};
drm_intel_bo *src_bo; struct drm_i915_gem_exec_object2 obj[3] = {};
int i; struct drm_i915_gem_relocation_entry reloc[2] = {};
static uint32_t seed = 1; static uint32_t seed = 1;
uint32_t data[OBJECT_WIDTH * OBJECT_HEIGHT];
uint64_t size = OBJECT_WIDTH * OBJECT_HEIGHT * 4, bb_size = 4096;
uint32_t src_handle, bb_handle, *bb;
uint32_t gen = intel_gen(intel_get_drm_devid(i915));
const bool has_64b_reloc = gen >= 8;
int i;
bb_handle = gem_create_from_pool(i915, &bb_size, REGION_SMEM);
src_handle = gem_create_from_pool(i915, &size, REGION_SMEM);
/* Generate some junk. Real workloads would be doing a lot more for (i = 0; i < OBJECT_WIDTH * OBJECT_HEIGHT; i++)
* work to generate the junk.
*/
for (i = 0; i < width * height; i++) {
data[i] = seed++; data[i] = seed++;
}
/* Upload the junk. */ gem_write(i915, src_handle, 0, data, sizeof(data));
src_bo = drm_intel_bo_alloc(bufmgr, "src", sizeof(data), 4096);
drm_intel_bo_subdata(src_bo, 0, sizeof(data), data);
/* Render the junk to the dst. */ /* Render the junk to the dst. */
BLIT_COPY_BATCH_START(0); bb = gem_mmap__device_coherent(i915, bb_handle, 0, bb_size, PROT_WRITE);
OUT_BATCH((3 << 24) | /* 32 bits */ i = 0;
bb[i++] = XY_SRC_COPY_BLT_CMD |
XY_SRC_COPY_BLT_WRITE_ALPHA |
XY_SRC_COPY_BLT_WRITE_RGB |
(6 + 2*(gen >= 8));
bb[i++] = (3 << 24) | /* 32 bits */
(0xcc << 16) | /* copy ROP */ (0xcc << 16) | /* copy ROP */
(width * 4) /* dst pitch */); (OBJECT_WIDTH * 4) /* dst pitch */;
OUT_BATCH(0); /* dst x1,y1 */ bb[i++] = 0; /* dst x1,y1 */
OUT_BATCH((height << 16) | width); /* dst x2,y2 */ bb[i++] = (OBJECT_HEIGHT << 16) | OBJECT_WIDTH; /* dst x2,y2 */
OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
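/*
 * Hand-rolled relocations: each object is given a hypothetical presumed
 * offset (handle * size), the same value is written into the batch, and
 * the reloc entries attached to the batch object let the kernel patch
 * the addresses if the buffers end up elsewhere.
 */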
OUT_BATCH(0); /* src x1,y1 */ obj[0].handle = dst_handle;
OUT_BATCH(width * 4); /* src pitch */ obj[0].offset = dst_handle * size;
OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); reloc[0].target_handle = dst_handle;
ADVANCE_BATCH(); reloc[0].presumed_offset = obj[0].offset;
reloc[0].offset = sizeof(uint32_t) * i;
intel_batchbuffer_flush(batch); reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
drm_intel_bo_unreference(src_bo); bb[i++] = obj[0].offset;
if (has_64b_reloc)
bb[i++] = obj[0].offset >> 32;
bb[i++] = 0; /* src x1,y1 */
bb[i++] = OBJECT_WIDTH * 4; /* src pitch */
obj[1].handle = src_handle;
obj[1].offset = src_handle * size;
reloc[1].target_handle = src_handle;
reloc[1].presumed_offset = obj[1].offset;
reloc[1].offset = sizeof(uint32_t) * i;
reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[1].write_domain = 0;
bb[i++] = obj[1].offset;
if (has_64b_reloc)
bb[i++] = obj[1].offset >> 32;
obj[2].handle = bb_handle;
obj[2].relocs_ptr = to_user_pointer(reloc);
obj[2].relocation_count = 2;
bb[i++] = MI_BATCH_BUFFER_END;
gem_munmap(bb, bb_size);
exec.buffers_ptr = to_user_pointer(obj);
exec.buffer_count = 3;
exec.flags = (gen >= 6 ? I915_EXEC_BLT : 0) | I915_EXEC_NO_RELOC;
gem_execbuf(i915, &exec);
} }
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int fd;
int object_size = OBJECT_WIDTH * OBJECT_HEIGHT * 4;
double start_time, end_time; double start_time, end_time;
drm_intel_bo *dst_bo; uint32_t dst_handle;
drm_intel_bufmgr *bufmgr; int i915, i;
struct intel_batchbuffer *batch;
int i;
fd = drm_open_driver(DRIVER_INTEL);
bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); i915 = drm_open_driver(DRIVER_INTEL);
drm_intel_bufmgr_gem_enable_reuse(bufmgr); dst_handle = gem_create(i915, OBJECT_WIDTH * OBJECT_HEIGHT * 4);
batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
dst_bo = drm_intel_bo_alloc(bufmgr, "dst", object_size, 4096);
/* Prep loop to get us warmed up. */ /* Prep loop to get us warmed up. */
for (i = 0; i < 60; i++) { for (i = 0; i < 60; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
/* Do the actual timing. */ /* Do the actual timing. */
start_time = get_time_in_secs(); start_time = get_time_in_secs();
for (i = 0; i < 200; i++) { for (i = 0; i < 200; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
end_time = get_time_in_secs(); end_time = get_time_in_secs();
printf("%d iterations in %.03f secs: %.01f MB/sec\n", i, printf("%d iterations in %.03f secs: %.01f MB/sec\n", i,
...@@ -147,10 +173,5 @@ int main(int argc, char **argv) ...@@ -147,10 +173,5 @@ int main(int argc, char **argv)
(double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 / (double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 /
(end_time - start_time)); (end_time - start_time));
intel_batchbuffer_free(batch); close(i915);
drm_intel_bufmgr_destroy(bufmgr);
close(fd);
return 0;
} }
...@@ -45,6 +45,7 @@ ...@@ -45,6 +45,7 @@
*/ */
#include "igt.h" #include "igt.h"
#include "i915/gem_create.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
...@@ -69,74 +70,98 @@ get_time_in_secs(void) ...@@ -69,74 +70,98 @@ get_time_in_secs(void)
} }
static void static void
do_render(drm_intel_bufmgr *bufmgr, struct intel_batchbuffer *batch, do_render(int i915, uint32_t dst_handle)
drm_intel_bo *dst_bo, int width, int height)
{ {
uint32_t *data; struct drm_i915_gem_execbuffer2 exec = {};
drm_intel_bo *src_bo; struct drm_i915_gem_exec_object2 obj[3] = {};
int i; struct drm_i915_gem_relocation_entry reloc[2] = {};
static uint32_t seed = 1; static uint32_t seed = 1;
uint64_t size = OBJECT_WIDTH * OBJECT_HEIGHT * 4, bb_size = 4096;
uint32_t *data, src_handle, bb_handle, *bb;
uint32_t gen = intel_gen(intel_get_drm_devid(i915));
const bool has_64b_reloc = gen >= 8;
int i;
src_bo = drm_intel_bo_alloc(bufmgr, "src", width * height * 4, 4096); bb_handle = gem_create_from_pool(i915, &bb_size, REGION_SMEM);
src_handle = gem_create_from_pool(i915, &size, REGION_SMEM);
drm_intel_gem_bo_map_gtt(src_bo);
data = src_bo->virtual; data = gem_mmap__gtt(i915, src_handle, size, PROT_WRITE);
for (i = 0; i < width * height; i++) { for (i = 0; i < OBJECT_WIDTH * OBJECT_HEIGHT; i++)
data[i] = seed++; data[i] = seed++;
} gem_munmap(data, size);
drm_intel_gem_bo_unmap_gtt(src_bo);
/* Render the junk to the dst. */ /* Render the junk to the dst. */
BLIT_COPY_BATCH_START(0); bb = gem_mmap__device_coherent(i915, bb_handle, 0, bb_size, PROT_WRITE);
OUT_BATCH((3 << 24) | /* 32 bits */ i = 0;
bb[i++] = XY_SRC_COPY_BLT_CMD |
XY_SRC_COPY_BLT_WRITE_ALPHA |
XY_SRC_COPY_BLT_WRITE_RGB |
(6 + 2*(gen >= 8));
bb[i++] = (3 << 24) | /* 32 bits */
(0xcc << 16) | /* copy ROP */ (0xcc << 16) | /* copy ROP */
(width * 4) /* dst pitch */); (OBJECT_WIDTH * 4) /* dst pitch */;
OUT_BATCH(0); /* dst x1,y1 */ bb[i++] = 0; /* dst x1,y1 */
OUT_BATCH((height << 16) | width); /* dst x2,y2 */ bb[i++] = (OBJECT_HEIGHT << 16) | OBJECT_WIDTH; /* dst x2,y2 */
OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
OUT_BATCH(0); /* src x1,y1 */ obj[0].handle = dst_handle;
OUT_BATCH(width * 4); /* src pitch */ obj[0].offset = dst_handle * size;
OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); reloc[0].target_handle = dst_handle;
ADVANCE_BATCH(); reloc[0].presumed_offset = obj[0].offset;
reloc[0].offset = sizeof(uint32_t) * i;
intel_batchbuffer_flush(batch); reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
drm_intel_bo_unreference(src_bo); bb[i++] = obj[0].offset;
if (has_64b_reloc)
bb[i++] = obj[0].offset >> 32;
bb[i++] = 0; /* src x1,y1 */
bb[i++] = OBJECT_WIDTH * 4; /* src pitch */
obj[1].handle = src_handle;
obj[1].offset = src_handle * size;
reloc[1].target_handle = src_handle;
reloc[1].presumed_offset = obj[1].offset;
reloc[1].offset = sizeof(uint32_t) * i;
reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[1].write_domain = 0;
bb[i++] = obj[1].offset;
if (has_64b_reloc)
bb[i++] = obj[1].offset >> 32;
obj[2].handle = bb_handle;
obj[2].relocs_ptr = to_user_pointer(reloc);
obj[2].relocation_count = 2;
bb[i++] = MI_BATCH_BUFFER_END;
gem_munmap(bb, bb_size);
exec.buffers_ptr = to_user_pointer(obj);
exec.buffer_count = 3;
exec.flags = (gen >= 6 ? I915_EXEC_BLT : 0) | I915_EXEC_NO_RELOC;
gem_execbuf(i915, &exec);
} }
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int fd;
int object_size = OBJECT_WIDTH * OBJECT_HEIGHT * 4;
double start_time, end_time; double start_time, end_time;
drm_intel_bo *dst_bo; uint32_t dst_handle;
drm_intel_bufmgr *bufmgr; int i915, i;
struct intel_batchbuffer *batch;
int i;
fd = drm_open_driver(DRIVER_INTEL);
bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); i915 = drm_open_driver(DRIVER_INTEL);
drm_intel_bufmgr_gem_enable_reuse(bufmgr); dst_handle = gem_create(i915, OBJECT_WIDTH * OBJECT_HEIGHT * 4);
batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
dst_bo = drm_intel_bo_alloc(bufmgr, "dst", object_size, 4096);
/* Prep loop to get us warmed up. */ /* Prep loop to get us warmed up. */
for (i = 0; i < 60; i++) { for (i = 0; i < 60; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
/* Do the actual timing. */ /* Do the actual timing. */
start_time = get_time_in_secs(); start_time = get_time_in_secs();
for (i = 0; i < 200; i++) { for (i = 0; i < 200; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
end_time = get_time_in_secs(); end_time = get_time_in_secs();
printf("%d iterations in %.03f secs: %.01f MB/sec\n", i, printf("%d iterations in %.03f secs: %.01f MB/sec\n", i,
...@@ -144,10 +169,5 @@ int main(int argc, char **argv) ...@@ -144,10 +169,5 @@ int main(int argc, char **argv)
(double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 / (double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 /
(end_time - start_time)); (end_time - start_time));
intel_batchbuffer_free(batch); close(i915);
drm_intel_bufmgr_destroy(bufmgr);
close(fd);
return 0;
} }
...@@ -48,6 +48,7 @@ ...@@ -48,6 +48,7 @@
*/ */
#include "igt.h" #include "igt.h"
#include "i915/gem_create.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
...@@ -72,74 +73,99 @@ get_time_in_secs(void) ...@@ -72,74 +73,99 @@ get_time_in_secs(void)
} }
static void static void
do_render(drm_intel_bufmgr *bufmgr, struct intel_batchbuffer *batch, do_render(int i915, uint32_t dst_handle)
drm_intel_bo *dst_bo, int width, int height)
{ {
uint32_t *data; struct drm_i915_gem_execbuffer2 exec = {};
drm_intel_bo *src_bo; struct drm_i915_gem_exec_object2 obj[3] = {};
int i; struct drm_i915_gem_relocation_entry reloc[2] = {};
static uint32_t seed = 1; static uint32_t seed = 1;
uint64_t size = OBJECT_WIDTH * OBJECT_HEIGHT * 4, bb_size = 4096;
uint32_t *data, src_handle, bb_handle, *bb;
uint32_t gen = intel_gen(intel_get_drm_devid(i915));
const bool has_64b_reloc = gen >= 8;
int i;
src_bo = drm_intel_bo_alloc(bufmgr, "src", width * height * 4, 4096); bb_handle = gem_create_from_pool(i915, &bb_size, REGION_SMEM);
src_handle = gem_create_from_pool(i915, &size, REGION_SMEM);
drm_intel_bo_map(src_bo, 1);
data = src_bo->virtual; data = gem_mmap__cpu(i915, src_handle, 0, size, PROT_WRITE);
for (i = 0; i < width * height; i++) { for (i = 0; i < OBJECT_WIDTH * OBJECT_HEIGHT; i++)
data[i] = seed++; data[i] = seed++;
} gem_set_domain(i915, src_handle, I915_GEM_DOMAIN_CPU, 0);
gem_munmap(data, size);
drm_intel_bo_unmap(src_bo);
/* Render the junk to the dst. */ /* Render the junk to the dst. */
BLIT_COPY_BATCH_START(0); bb = gem_mmap__device_coherent(i915, bb_handle, 0, bb_size, PROT_WRITE);
OUT_BATCH((3 << 24) | /* 32 bits */ i = 0;
bb[i++] = XY_SRC_COPY_BLT_CMD |
XY_SRC_COPY_BLT_WRITE_ALPHA |
XY_SRC_COPY_BLT_WRITE_RGB |
(6 + 2*(gen >= 8));
bb[i++] = (3 << 24) | /* 32 bits */
(0xcc << 16) | /* copy ROP */ (0xcc << 16) | /* copy ROP */
(width * 4) /* dst pitch */); (OBJECT_WIDTH * 4) /* dst pitch */;
OUT_BATCH(0); /* dst x1,y1 */ bb[i++] = 0; /* dst x1,y1 */
OUT_BATCH((height << 16) | width); /* dst x2,y2 */ bb[i++] = (OBJECT_HEIGHT << 16) | OBJECT_WIDTH; /* dst x2,y2 */
OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
OUT_BATCH(0); /* src x1,y1 */ obj[0].handle = dst_handle;
OUT_BATCH(width * 4); /* src pitch */ obj[0].offset = dst_handle * size;
OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); reloc[0].target_handle = dst_handle;
ADVANCE_BATCH(); reloc[0].presumed_offset = obj[0].offset;
reloc[0].offset = sizeof(uint32_t) * i;
intel_batchbuffer_flush(batch); reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
drm_intel_bo_unreference(src_bo); bb[i++] = obj[0].offset;
if (has_64b_reloc)
bb[i++] = obj[0].offset >> 32;
bb[i++] = 0; /* src x1,y1 */
bb[i++] = OBJECT_WIDTH * 4; /* src pitch */
obj[1].handle = src_handle;
obj[1].offset = src_handle * size;
reloc[1].target_handle = src_handle;
reloc[1].presumed_offset = obj[1].offset;
reloc[1].offset = sizeof(uint32_t) * i;
reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[1].write_domain = 0;
bb[i++] = obj[1].offset;
if (has_64b_reloc)
bb[i++] = obj[1].offset >> 32;
obj[2].handle = bb_handle;
obj[2].relocs_ptr = to_user_pointer(reloc);
obj[2].relocation_count = 2;
bb[i++] = MI_BATCH_BUFFER_END;
gem_munmap(bb, bb_size);
exec.buffers_ptr = to_user_pointer(obj);
exec.buffer_count = 3;
exec.flags = (gen >= 6 ? I915_EXEC_BLT : 0) | I915_EXEC_NO_RELOC;
gem_execbuf(i915, &exec);
} }
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int fd;
int object_size = OBJECT_WIDTH * OBJECT_HEIGHT * 4;
double start_time, end_time; double start_time, end_time;
drm_intel_bo *dst_bo; uint32_t dst_handle;
drm_intel_bufmgr *bufmgr; int i915, i;
struct intel_batchbuffer *batch;
int i;
fd = drm_open_driver(DRIVER_INTEL);
bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); i915 = drm_open_driver(DRIVER_INTEL);
drm_intel_bufmgr_gem_enable_reuse(bufmgr); dst_handle = gem_create(i915, OBJECT_WIDTH * OBJECT_HEIGHT * 4);
batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
dst_bo = drm_intel_bo_alloc(bufmgr, "dst", object_size, 4096);
/* Prep loop to get us warmed up. */ /* Prep loop to get us warmed up. */
for (i = 0; i < 60; i++) { for (i = 0; i < 60; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
/* Do the actual timing. */ /* Do the actual timing. */
start_time = get_time_in_secs(); start_time = get_time_in_secs();
for (i = 0; i < 200; i++) { for (i = 0; i < 200; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
end_time = get_time_in_secs(); end_time = get_time_in_secs();
printf("%d iterations in %.03f secs: %.01f MB/sec\n", i, printf("%d iterations in %.03f secs: %.01f MB/sec\n", i,
...@@ -147,10 +173,6 @@ int main(int argc, char **argv) ...@@ -147,10 +173,6 @@ int main(int argc, char **argv)
(double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 / (double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 /
(end_time - start_time)); (end_time - start_time));
intel_batchbuffer_free(batch); close(i915);
drm_intel_bufmgr_destroy(bufmgr);
close(fd);
return 0;
} }
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
*/ */
#include "igt.h" #include "igt.h"
#include "i915/gem_create.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
...@@ -66,21 +67,27 @@ get_time_in_secs(void) ...@@ -66,21 +67,27 @@ get_time_in_secs(void)
} }
static void static void
do_render(drm_intel_bufmgr *bufmgr, struct intel_batchbuffer *batch, do_render(int i915, uint32_t dst_handle)
drm_intel_bo *dst_bo, int width, int height)
{ {
uint32_t data[64]; struct drm_i915_gem_execbuffer2 exec = {};
drm_intel_bo *src_bo; struct drm_i915_gem_exec_object2 obj[3] = {};
int i; struct drm_i915_gem_relocation_entry reloc[2] = {};
static uint32_t seed = 1; static uint32_t seed = 1;
uint32_t data[OBJECT_WIDTH * OBJECT_HEIGHT];
uint64_t size = OBJECT_WIDTH * OBJECT_HEIGHT * 4, bb_size = 4096;
uint32_t src_handle, bb_handle, *bb;
uint32_t gen = intel_gen(intel_get_drm_devid(i915));
const bool has_64b_reloc = gen >= 8;
int i;
src_bo = drm_intel_bo_alloc(bufmgr, "src", width * height * 4, 4096); bb_handle = gem_create_from_pool(i915, &bb_size, REGION_SMEM);
src_handle = gem_create_from_pool(i915, &size, REGION_SMEM);
/* Upload some junk. Real workloads would be doing a lot more /* Upload some junk. Real workloads would be doing a lot more
* work to generate the junk. * work to generate the junk.
*/ */
for (i = 0; i < width * height;) { for (i = 0; i < OBJECT_WIDTH * OBJECT_HEIGHT;) {
int size, j; int subsize, j;
/* Choose a size from 1 to 64 dwords to upload. /* Choose a size from 1 to 64 dwords to upload.
* Normal workloads have a distribution of sizes with a * Normal workloads have a distribution of sizes with a
...@@ -88,68 +95,92 @@ do_render(drm_intel_bufmgr *bufmgr, struct intel_batchbuffer *batch, ...@@ -88,68 +95,92 @@ do_render(drm_intel_bufmgr *bufmgr, struct intel_batchbuffer *batch,
* pile of vertices, most likely), but I'm trying to get at * pile of vertices, most likely), but I'm trying to get at
* the cost of the small uploads here. * the cost of the small uploads here.
*/ */
size = random() % 64 + 1; subsize = random() % 64 + 1;
if (i + size > width * height) if (i + subsize > OBJECT_WIDTH * OBJECT_HEIGHT)
size = width * height - i; subsize = OBJECT_WIDTH * OBJECT_HEIGHT - i;
for (j = 0; j < size; j++) for (j = 0; j < subsize; j++)
data[j] = seed++; data[j] = seed++;
/* Upload the junk. */ /* Upload the junk. */
drm_intel_bo_subdata(src_bo, i * 4, size * 4, data); gem_write(i915, src_handle, i * 4, data, subsize * 4);
i += size; i += subsize;
} }
/* Render the junk to the dst. */ /* Render the junk to the dst. */
BLIT_COPY_BATCH_START(0); bb = gem_mmap__device_coherent(i915, bb_handle, 0, bb_size, PROT_WRITE);
OUT_BATCH((3 << 24) | /* 32 bits */ i = 0;
bb[i++] = XY_SRC_COPY_BLT_CMD |
XY_SRC_COPY_BLT_WRITE_ALPHA |
XY_SRC_COPY_BLT_WRITE_RGB |
(6 + 2*(gen >= 8));
bb[i++] = (3 << 24) | /* 32 bits */
(0xcc << 16) | /* copy ROP */ (0xcc << 16) | /* copy ROP */
(width * 4) /* dst pitch */); (OBJECT_WIDTH * 4) /* dst pitch */;
OUT_BATCH(0); /* dst x1,y1 */ bb[i++] = 0; /* dst x1,y1 */
OUT_BATCH((height << 16) | width); /* dst x2,y2 */ bb[i++] = (OBJECT_HEIGHT << 16) | OBJECT_WIDTH; /* dst x2,y2 */
OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
OUT_BATCH(0); /* src x1,y1 */ obj[0].handle = dst_handle;
OUT_BATCH(width * 4); /* src pitch */ obj[0].offset = dst_handle * size;
OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); reloc[0].target_handle = dst_handle;
ADVANCE_BATCH(); reloc[0].presumed_offset = obj[0].offset;
reloc[0].offset = sizeof(uint32_t) * i;
intel_batchbuffer_flush(batch); reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
drm_intel_bo_unreference(src_bo); bb[i++] = obj[0].offset;
if (has_64b_reloc)
bb[i++] = obj[0].offset >> 32;
bb[i++] = 0; /* src x1,y1 */
bb[i++] = OBJECT_WIDTH * 4; /* src pitch */
obj[1].handle = src_handle;
obj[1].offset = src_handle * size;
reloc[1].target_handle = src_handle;
reloc[1].presumed_offset = obj[1].offset;
reloc[1].offset = sizeof(uint32_t) * i;
reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
reloc[1].write_domain = 0;
bb[i++] = obj[1].offset;
if (has_64b_reloc)
bb[i++] = obj[1].offset >> 32;
obj[2].handle = bb_handle;
obj[2].relocs_ptr = to_user_pointer(reloc);
obj[2].relocation_count = 2;
bb[i++] = MI_BATCH_BUFFER_END;
gem_munmap(bb, bb_size);
exec.buffers_ptr = to_user_pointer(obj);
exec.buffer_count = 3;
exec.flags = (gen >= 6 ? I915_EXEC_BLT : 0) | I915_EXEC_NO_RELOC;
gem_execbuf(i915, &exec);
} }
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int fd;
int object_size = OBJECT_WIDTH * OBJECT_HEIGHT * 4;
double start_time, end_time; double start_time, end_time;
drm_intel_bo *dst_bo; uint32_t dst_handle;
drm_intel_bufmgr *bufmgr; int i915, i;
struct intel_batchbuffer *batch;
int i;
fd = drm_open_driver(DRIVER_INTEL);
bufmgr = drm_intel_bufmgr_gem_init(fd, 4096);
drm_intel_bufmgr_gem_enable_reuse(bufmgr);
batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
dst_bo = drm_intel_bo_alloc(bufmgr, "dst", object_size, 4096); i915 = drm_open_driver(DRIVER_INTEL);
dst_handle = gem_create(i915, OBJECT_WIDTH * OBJECT_HEIGHT * 4);
/* Prep loop to get us warmed up. */ /* Prep loop to get us warmed up. */
for (i = 0; i < 20; i++) { for (i = 0; i < 60; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
/* Do the actual timing. */ /* Do the actual timing. */
start_time = get_time_in_secs(); start_time = get_time_in_secs();
for (i = 0; i < 1000; i++) { for (i = 0; i < 1000; i++)
do_render(bufmgr, batch, dst_bo, OBJECT_WIDTH, OBJECT_HEIGHT); do_render(i915, dst_handle);
} gem_sync(i915, dst_handle);
drm_intel_bo_wait_rendering(dst_bo);
end_time = get_time_in_secs(); end_time = get_time_in_secs();
printf("%d iterations in %.03f secs: %.01f MB/sec\n", i, printf("%d iterations in %.03f secs: %.01f MB/sec\n", i,
...@@ -157,10 +188,6 @@ int main(int argc, char **argv) ...@@ -157,10 +188,6 @@ int main(int argc, char **argv)
(double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 / (double)i * OBJECT_WIDTH * OBJECT_HEIGHT * 4 / 1024.0 / 1024.0 /
(end_time - start_time)); (end_time - start_time));
intel_batchbuffer_free(batch); close(i915);
drm_intel_bufmgr_destroy(bufmgr);
close(fd);
return 0;
} }
// SPDX-License-Identifier: MIT
/*
* Copyright © 2024 Arthur Grillo
*/
#include "igt.h"
#define FRAME_COUNT 100
#define NUM_FBS 2
struct rect_t {
int x, y;
int width, height;
};
struct plane_t {
igt_plane_t *base;
struct rect_t rect;
uint32_t format;
struct igt_fb fbs[NUM_FBS];
};
struct kms_t {
struct rect_t crtc;
struct plane_t primary;
struct plane_t overlay_a;
struct plane_t overlay_b;
struct plane_t writeback;
};
struct data_t {
int fd;
igt_display_t display;
igt_output_t *wb_output;
drmModeModeInfo *mode;
struct kms_t kms;
};
static void plane_setup(struct plane_t *plane, int index)
{
igt_plane_set_size(plane->base, plane->rect.width, plane->rect.height);
igt_plane_set_position(plane->base, plane->rect.x, plane->rect.y);
igt_plane_set_fb(plane->base, &plane->fbs[index]);
}
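/*
 * Create two framebuffers per plane, filled with complementary solid
 * colours, plus two writeback buffers sized to the selected mode, so the
 * commit loop can alternate every plane between two distinct buffers.
 */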
static void gen_fbs(struct data_t *data)
{
struct kms_t *kms = &data->kms;
drmModeModeInfo *mode = igt_output_get_mode(data->wb_output);
for (int i = 0; i < NUM_FBS; i++) {
igt_create_color_fb(data->fd, kms->primary.rect.width, kms->primary.rect.height,
kms->primary.format, DRM_FORMAT_MOD_LINEAR,
!i, i, i,
&kms->primary.fbs[i]);
igt_create_color_fb(data->fd, kms->overlay_a.rect.width, kms->overlay_a.rect.height,
kms->overlay_a.format, DRM_FORMAT_MOD_LINEAR,
i, !i, i,
&kms->overlay_a.fbs[i]);
igt_create_color_fb(data->fd, kms->overlay_b.rect.width, kms->overlay_b.rect.height,
kms->overlay_b.format, DRM_FORMAT_MOD_LINEAR,
i, i, !i,
&kms->overlay_b.fbs[i]);
kms->writeback.rect.width = mode->hdisplay;
kms->writeback.rect.height = mode->vdisplay;
igt_create_fb(data->fd, kms->writeback.rect.width, kms->writeback.rect.height,
kms->writeback.format, DRM_FORMAT_MOD_LINEAR,
&kms->writeback.fbs[i]);
}
}
static igt_output_t *find_wb_output(struct data_t *data)
{
for (int i = 0; i < data->display.n_outputs; i++) {
igt_output_t *output = &data->display.outputs[i];
if (output->config.connector->connector_type != DRM_MODE_CONNECTOR_WRITEBACK)
continue;
return output;
}
return NULL;
}
static void set_crtc_size(struct data_t *data)
{
drmModeModeInfo *mode;
struct rect_t *crtc = &data->kms.crtc;
for_each_connector_mode(data->wb_output) {
mode = &data->wb_output->config.connector->modes[j__];
if (mode->hdisplay == crtc->width && mode->vdisplay == crtc->height) {
igt_output_override_mode(data->wb_output, mode);
return;
}
}
igt_assert_f(0, "CRTC size %dx%d not supported\n", crtc->width, crtc->height);
}
static struct kms_t default_kms = {
.crtc = {
.width = 4096, .height = 2160,
},
.primary = {
.rect = {
.x = 101, .y = 0,
.width = 3639, .height = 2160,
},
.format = DRM_FORMAT_XRGB8888,
},
.overlay_a = {
.rect = {
.x = 201, .y = 199,
.width = 3033, .height = 1777,
},
.format = DRM_FORMAT_XRGB16161616,
},
.overlay_b = {
.rect = {
.x = 1800, .y = 250,
.width = 1507, .height = 1400,
},
.format = DRM_FORMAT_ARGB8888,
},
.writeback = {
.rect = {
.x = 0, .y = 0,
// Size is to be determined at runtime
},
.format = DRM_FORMAT_XRGB8888,
},
};
igt_simple_main
{
struct data_t data = {0};
enum pipe pipe = PIPE_NONE;
struct timespec then, now;
double elapsed;
data.kms = default_kms;
data.fd = drm_open_driver_master(DRIVER_ANY);
kmstest_set_vt_graphics_mode();
igt_display_require(&data.display, data.fd);
igt_require(data.display.is_atomic);
igt_display_require_output(&data.display);
igt_display_reset(&data.display);
data.wb_output = find_wb_output(&data);
igt_require(data.wb_output);
for_each_pipe(&data.display, pipe) {
igt_debug("Selecting pipe %s to %s\n",
kmstest_pipe_name(pipe),
igt_output_name(data.wb_output));
igt_output_set_pipe(data.wb_output, pipe);
break;
}
set_crtc_size(&data);
gen_fbs(&data);
data.kms.primary.base = igt_output_get_plane_type(data.wb_output, DRM_PLANE_TYPE_PRIMARY);
data.kms.overlay_a.base = igt_output_get_plane_type_index(data.wb_output,
DRM_PLANE_TYPE_OVERLAY, 0);
data.kms.overlay_b.base = igt_output_get_plane_type_index(data.wb_output,
DRM_PLANE_TYPE_OVERLAY, 1);
igt_assert_eq(igt_gettime(&then), 0);
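/*
 * Stress loop: each frame flips the primary, both overlays and the
 * writeback buffer to the other FB of their pair and commits the whole
 * state atomically; the elapsed time over FRAME_COUNT frames is reported
 * below.
 */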
for (int i = 0; i < FRAME_COUNT; i++) {
int fb_index = i % NUM_FBS;
plane_setup(&data.kms.primary, fb_index);
plane_setup(&data.kms.overlay_a, fb_index);
plane_setup(&data.kms.overlay_b, fb_index);
igt_output_set_writeback_fb(data.wb_output, &data.kms.writeback.fbs[fb_index]);
igt_display_commit2(&data.display, COMMIT_ATOMIC);
}
igt_assert_eq(igt_gettime(&now), 0);
elapsed = igt_time_elapsed(&then, &now);
igt_info("Time spent in the loop with %d frames: %lfs.\n", FRAME_COUNT, elapsed);
igt_display_fini(&data.display);
drm_close_driver(data.fd);
}
...@@ -161,7 +161,7 @@ int main(int argc, char **argv) ...@@ -161,7 +161,7 @@ int main(int argc, char **argv)
} }
} }
fd = drm_open_driver(DRIVER_INTEL); fd = drm_open_driver(DRIVER_INTEL | DRIVER_XE);
if (!crtc0_active(fd)) { if (!crtc0_active(fd)) {
fprintf(stderr, "CRTC/pipe 0 not active\n"); fprintf(stderr, "CRTC/pipe 0 not active\n");
return 77; return 77;
......
...@@ -13,20 +13,16 @@ benchmark_progs = [ ...@@ -13,20 +13,16 @@ benchmark_progs = [
'gem_syslatency', 'gem_syslatency',
'gem_userptr_benchmark', 'gem_userptr_benchmark',
'gem_wsim', 'gem_wsim',
'intel_upload_blit_large',
'intel_upload_blit_large_gtt',
'intel_upload_blit_large_map',
'intel_upload_blit_small',
'kms_fb_stress',
'kms_vblank', 'kms_vblank',
'prime_lookup', 'prime_lookup',
'vgem_mmap', 'vgem_mmap',
] ]
if libdrm_intel.found()
benchmark_progs += [
'intel_upload_blit_large',
'intel_upload_blit_large_gtt',
'intel_upload_blit_large_map',
'intel_upload_blit_small',
]
endif
benchmarksdir = join_paths(libexecdir, 'benchmarks') benchmarksdir = join_paths(libexecdir, 'benchmarks')
foreach prog : benchmark_progs foreach prog : benchmark_progs
......
Workload descriptor format Workload descriptor format
========================== ==========================
Lines starting with '#' are treated as comments and will not create a work step.
ctx.engine.duration_us.dependency.wait,... ctx.engine.duration_us.dependency.wait,...
<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,... <uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
B.<uint> B.<uint>
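For example, a minimal descriptor using comment lines could look like this
(engine names and durations are purely illustrative):

  # warm up the render engine
  1.RCS.1000.0.0
  # second context waits on the previous step
  2.RCS.1000.-1.1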
...@@ -86,6 +88,19 @@ Batch durations can also be specified as infinite by using the '*' in the ...@@ -86,6 +88,19 @@ Batch durations can also be specified as infinite by using the '*' in the
duration field. Such batches must be ended by the terminate command ('T') duration field. Such batches must be ended by the terminate command ('T')
otherwise they will cause a GPU hang to be reported. otherwise they will cause a GPU hang to be reported.
Xe and i915 differences
-----------------------
Xe differs from i915 in several ways, for example it does not allow a BO list
to be passed to an exec (which would otherwise create implicit syncs). For more
details see:
https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_exec.c
Currently the following batch steps are equivalent on Xe:
1.1000.-2.0 <==> 1.1000.f-2.0
and will both create an explicit sync fence dependency (via syncobjects).
Data dependencies will have to wait for the working sets implementation.
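As an illustrative example (using the DEFAULT engine and arbitrary durations),
the following two workloads therefore behave the same on Xe, with the second
step waiting on the first via a syncobj:

  1.DEFAULT.1000.0.0
  2.DEFAULT.1000.-1.0

  1.DEFAULT.1000.0.0
  2.DEFAULT.1000.f-1.0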
Sync (fd) fences Sync (fd) fences
---------------- ----------------
...@@ -129,7 +144,7 @@ runnable. When the second RCS batch completes the standalone fence is signaled ...@@ -129,7 +144,7 @@ runnable. When the second RCS batch completes the standalone fence is signaled
which allows the two VCS batches to be executed. Finally we wait until both which allows the two VCS batches to be executed. Finally we wait until both
VCS batches have completed before starting the (optional) next iteration. VCS batches have completed before starting the (optional) next iteration.
Submit fences Submit fences (i915 only)
------------- -------------------------
Submit fences are a type of input fence which are signalled when the originating Submit fences are a type of input fence which are signalled when the originating
...@@ -146,7 +161,7 @@ Submit fences have the identical syntax as the sync fences with the lower-case ...@@ -146,7 +161,7 @@ Submit fences have the identical syntax as the sync fences with the lower-case
Here VCS1 and VCS2 batches will only be submitted for executing once the RCS Here VCS1 and VCS2 batches will only be submitted for executing once the RCS
batch enters the GPU. batch enters the GPU.
Context priority Context priority (i915 only)
---------------- ----------------------------
P.1.-1 P.1.-1
...@@ -211,7 +226,7 @@ Example: ...@@ -211,7 +226,7 @@ Example:
This enables load balancing for context number one. This enables load balancing for context number one.
Engine bonds Engine bonds (i915 only)
------------ ------------------------
Engine bonds are extensions on load balanced contexts. They allow expressing Engine bonds are extensions on load balanced contexts. They allow expressing
...@@ -259,7 +274,7 @@ then look like: ...@@ -259,7 +274,7 @@ then look like:
2.DEFAULT.1000.s-1.0 2.DEFAULT.1000.s-1.0
a.-3 a.-3
Context SSEU configuration Context SSEU configuration (i915 only)
-------------------------- --------------------------------------
S.1.1 S.1.1
...@@ -279,7 +294,7 @@ Slice mask of -1 has a special meaning of "all slices". Otherwise any integer ...@@ -279,7 +294,7 @@ Slice mask of -1 has a special meaning of "all slices". Otherwise any integer
can be specified as the slice mask, but beware that values other than 1 and -1 can make can be specified as the slice mask, but beware that values other than 1 and -1 can make
the workload not portable between different GPUs. the workload not portable between different GPUs.
Working sets Working sets (i915 only)
------------ ------------------------
When used plainly workload steps can create implicit data dependencies by When used plainly workload steps can create implicit data dependencies by
......