Commit e9fd233a authored by Matthew Brost

FIXME: drm/xe: Use per device page fault queues


Having a per-GT page fault queue doesn't provide any real benefit, as the
per-VM or per-MM locks need to be held in an exclusive way. Move the page
fault queue to the device. This can be revisited if the VM or MM locks can
be held in a shared way.

Also use an array of ordered workqueues rather than a single unbound
workqueue with multiple threads. This will help with scheduling other work
items on the page fault workqueues.
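
As illustration of that dispatch scheme (a sketch only, not part of the patch: the pf_wq_init()/pf_dispatch() helpers and the workqueue name are made up here, while the real allocation happens in xe_device_create() in the diff below). Work queued on the same ordered workqueue runs strictly in submission order, so keying the queue choice on the faulting VM's ASID keeps per-VM ordering while letting faults from different VMs proceed in parallel:

/* Sketch only, assuming the NUM_PF_QUEUE value added by this patch. */
#include <linux/workqueue.h>

#define NUM_PF_QUEUE 4

static struct workqueue_struct *pf_wq[NUM_PF_QUEUE];

/* Allocate one ordered, high priority workqueue per page fault queue. */
static int pf_wq_init(void)
{
	int i;

	for (i = 0; i < NUM_PF_QUEUE; ++i) {
		pf_wq[i] = alloc_ordered_workqueue("xe_pf_wq_%d", WQ_HIGHPRI, i);
		if (!pf_wq[i])
			return -ENOMEM; /* error unwinding omitted for brevity */
	}
	return 0;
}

/* Same ASID always maps to the same ordered queue -> per-VM ordering. */
static void pf_dispatch(u32 asid, struct work_struct *worker)
{
	queue_work(pf_wq[asid % NUM_PF_QUEUE], worker);
}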

FIXME: Layering for init / reset now wrong, clean up before merging
FIXME: GT resets may be broken with this patch

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
parent 43e63426
@@ -228,6 +228,15 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
static void xe_device_destroy(struct drm_device *dev, void *dummy)
{
struct xe_device *xe = to_xe_device(dev);
u8 i;
for (i = 0; i < NUM_PF_QUEUE; ++i) {
if (xe->usm.pf_wq[i])
destroy_workqueue(xe->usm.pf_wq[i]);
}
if (xe->usm.acc_wq)
destroy_workqueue(xe->usm.acc_wq);
if (xe->preempt_fence_wq)
destroy_workqueue(xe->preempt_fence_wq);
@@ -246,6 +255,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
{
struct xe_device *xe;
int err;
u8 i;
xe_display_driver_set_hooks(&driver);
@@ -295,6 +305,26 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
xa_erase(&xe->usm.asid_to_vm, asid);
}
for (i = 0; i < NUM_PF_QUEUE; ++i) {
xe->usm.pf_queue[i].xe = xe;
xe->usm.pf_queue[i].id = i;
spin_lock_init(&xe->usm.pf_queue[i].lock);
xe->usm.pf_wq[i] = alloc_ordered_workqueue("xe_gt_page_fault_work_queue",
WQ_HIGHPRI);
if (!xe->usm.pf_wq[i]) {
err = -ENOMEM;
goto err;
}
}
xe->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue",
WQ_UNBOUND | WQ_HIGHPRI,
NUM_ACC_QUEUE);
if (!xe->usm.acc_wq) {
err = -ENOMEM;
goto err;
}
spin_lock_init(&xe->pinned.lock);
INIT_LIST_HEAD(&xe->pinned.kernel_bo_present);
INIT_LIST_HEAD(&xe->pinned.external_vram);
@@ -28,6 +28,7 @@
#include "intel_display_device.h"
#endif
struct xe_device;
struct xe_ggtt;
struct xe_pat_ops;
@@ -354,6 +355,68 @@ struct xe_device {
u32 num_vm_in_non_fault_mode;
/** @usm.lock: protects UM state */
struct mutex lock;
#define NUM_PF_QUEUE 4
/** @usm.pf_wq: page fault work queue, ordered, high priority */
struct workqueue_struct *pf_wq[NUM_PF_QUEUE];
/** @usm.acc_wq: access counter work queue, unbound, high priority */
struct workqueue_struct *acc_wq;
/**
 * @usm.pf_queue: Page fault queue used to sink faults so faults can
 * be processed not under the GuC CT lock. The queue is sized so
 * it can hold all possible faults (1 per physical engine).
 * Multiple queues exist so page faults from different VMs can be
 * processed in parallel.
 */
struct pf_queue {
/** @usm.pf_queue.xe: back pointer to Xe device */
struct xe_device *xe;
#define PF_QUEUE_NUM_DW 128
/** @usm.pf_queue.data: data in the page fault queue */
u32 data[PF_QUEUE_NUM_DW];
/**
* @usm.pf_queue.tail: tail pointer in DWs for page fault queue,
* moved by worker which processes faults (consumer).
*/
u16 tail;
/**
* @usm.pf_queue.head: head pointer in DWs for page fault queue,
* moved by G2H handler (producer).
*/
u16 head;
/** @usm.pf_queue.id: id of pf queue */
u8 id;
/** @usm.pf_queue.lock: protects page fault queue */
spinlock_t lock;
/** @usm.pf_queue.worker: to process page faults */
struct work_struct worker;
} pf_queue[NUM_PF_QUEUE];
/**
 * @usm.acc_queue: Same as the page fault queue; access counters
 * cannot be processed under the CT lock either.
 */
struct acc_queue {
/** @usm.acc_queue.xe: back pointer to Xe device */
struct xe_device *xe;
#define ACC_QUEUE_NUM_DW 128
/** @usm.acc_queue.data: data in the access counter queue */
u32 data[ACC_QUEUE_NUM_DW];
/**
* @usm.acc_queue.tail: tail pointer in DWs for access counter queue,
* moved by worker which processes counters
* (consumer).
*/
u16 tail;
/**
* @usm.acc_queue.head: head pointer in DWs for access counter queue,
* moved by G2H handler (producer).
*/
u16 head;
/** @usm.acc_queue.lock: protects access counter queue */
spinlock_t lock;
/** @usm.acc_queue.worker: to process access counters */
struct work_struct worker;
#define NUM_ACC_QUEUE 4
} acc_queue[NUM_ACC_QUEUE];
} usm;
/** @pinned: pinned BO state */
@@ -266,9 +266,23 @@ static void print_pagefault(struct xe_device *xe, struct pagefault *pf)
pf->engine_class, pf->engine_instance);
}
static struct xe_gt *get_gt(struct xe_device *xe, u32 gt_id)
{
struct xe_gt *gt;
u32 id;
for_each_gt(gt, xe, id)
if (gt->info.id == gt_id)
return gt;
XE_WARN_ON("NOT POSSIBLE");
return NULL;
}
#define PF_MSG_LEN_DW 4
static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf,
struct xe_device *xe, struct xe_gt **gt)
{
const struct xe_guc_pagefault_desc *desc;
bool ret = false;
@@ -278,6 +292,8 @@ static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
desc = (const struct xe_guc_pagefault_desc *)
(pf_queue->data + pf_queue->tail);
*gt = get_gt(xe, FIELD_GET(PFD_RSVD_0, desc->dw0));
pf->fault_level = FIELD_GET(PFD_FAULT_LEVEL, desc->dw0);
pf->trva_fault = FIELD_GET(XE2_PFD_TRVA_FAULT, desc->dw0);
pf->engine_class = FIELD_GET(PFD_ENG_CLASS, desc->dw0);
@@ -318,6 +334,7 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
struct pf_queue *pf_queue;
unsigned long flags;
u32 asid;
int pf_idx;
bool full;
/*
@@ -329,14 +346,17 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
return -EPROTO;
asid = FIELD_GET(PFD_ASID, msg[1]);
pf_queue = gt->usm.pf_queue + (asid % NUM_PF_QUEUE);
pf_idx = (asid % NUM_PF_QUEUE);
pf_queue = xe->usm.pf_queue + pf_idx;
spin_lock_irqsave(&pf_queue->lock, flags);
full = pf_queue_full(pf_queue);
if (!full) {
memcpy(pf_queue->data + pf_queue->head, msg, len * sizeof(u32));
pf_queue->data[pf_queue->head] |=
FIELD_PREP(PFD_RSVD_0, (u32)gt->info.id);
pf_queue->head = (pf_queue->head + len) % PF_QUEUE_NUM_DW;
queue_work(gt->usm.pf_wq, &pf_queue->worker);
queue_work(xe->usm.pf_wq[pf_idx], &pf_queue->worker);
} else {
drm_warn(&xe->drm, "PF Queue full, shouldn't be possible");
}
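
pf_queue_full() is not shown in this excerpt. Given the semantics documented in the new struct (head advanced in DWs by the G2H producer, tail by the worker consumer, over a ring of PF_QUEUE_NUM_DW entries, a power of two), a full check could look roughly like the sketch below; this is an assumption for illustration, not necessarily the driver's implementation.

#include <linux/circ_buf.h>

/* Sketch: must be called with pf_queue->lock held; treats the ring as
 * full unless more than one PF_MSG_LEN_DW message worth of space is left. */
static bool pf_queue_full(struct pf_queue *pf_queue)
{
	lockdep_assert_held(&pf_queue->lock);

	return CIRC_SPACE(pf_queue->head, pf_queue->tail, PF_QUEUE_NUM_DW) <=
	       PF_MSG_LEN_DW;
}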
@@ -350,8 +370,8 @@ static void pf_queue_work_func(struct work_struct *w)
static void pf_queue_work_func(struct work_struct *w)
{
struct pf_queue *pf_queue = container_of(w, struct pf_queue, worker);
struct xe_gt *gt = pf_queue->gt;
struct xe_device *xe = gt_to_xe(gt);
struct xe_device *xe = pf_queue->xe;
struct xe_gt *gt;
struct xe_guc_pagefault_reply reply = {};
struct pagefault pf = {};
unsigned long threshold;
@@ -359,7 +379,7 @@ static void pf_queue_work_func(struct work_struct *w)
threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
while (get_pagefault(pf_queue, &pf)) {
while (get_pagefault(pf_queue, &pf, xe, &gt)) {
ret = handle_pagefault(gt, &pf);
if (unlikely(ret)) {
print_pagefault(xe, &pf);
@@ -382,7 +402,7 @@ static void pf_queue_work_func(struct work_struct *w)
if (time_after(jiffies, threshold) &&
pf_queue->tail != pf_queue->head) {
queue_work(gt->usm.pf_wq, w);
queue_work(xe->usm.pf_wq[pf_queue->id], w);
break;
}
}
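
Because the queue is now per device, the worker can no longer get the GT from a back pointer; instead the G2H handler stashes gt->info.id in a reserved descriptor field with FIELD_PREP(PFD_RSVD_0, ...) and get_pagefault()/get_gt() recover it on the consumer side. A sketch of that round trip with <linux/bitfield.h>, using a made-up mask because the real PFD_RSVD_0 definition is not part of this excerpt:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

/* Hypothetical mask for illustration only; the real layout is defined by
 * the GuC pagefault descriptor, not here. */
#define EXAMPLE_RSVD_GT_ID	GENMASK(1, 0)

/* Producer (G2H handler): fold the originating GT id into reserved bits. */
static u32 stash_gt_id(u32 dw0, u8 gt_id)
{
	return dw0 | FIELD_PREP(EXAMPLE_RSVD_GT_ID, gt_id);
}

/* Consumer (worker): recover the GT id so the fault is handled on the
 * right GT even though the queue itself is device wide. */
static u8 recover_gt_id(u32 dw0)
{
	return FIELD_GET(EXAMPLE_RSVD_GT_ID, dw0);
}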
@@ -393,32 +413,15 @@ static void acc_queue_work_func(struct work_struct *w);
int xe_gt_pagefault_init(struct xe_gt *gt)
{
struct xe_device *xe = gt_to_xe(gt);
int i;
u8 i;
if (!xe->info.has_usm)
if (!xe->info.has_usm || gt->info.id)
return 0;
for (i = 0; i < NUM_PF_QUEUE; ++i) {
gt->usm.pf_queue[i].gt = gt;
spin_lock_init(&gt->usm.pf_queue[i].lock);
INIT_WORK(&gt->usm.pf_queue[i].worker, pf_queue_work_func);
}
for (i = 0; i < NUM_ACC_QUEUE; ++i) {
gt->usm.acc_queue[i].gt = gt;
spin_lock_init(&gt->usm.acc_queue[i].lock);
INIT_WORK(&gt->usm.acc_queue[i].worker, acc_queue_work_func);
}
gt->usm.pf_wq = alloc_workqueue("xe_gt_page_fault_work_queue",
WQ_UNBOUND | WQ_HIGHPRI, NUM_PF_QUEUE);
if (!gt->usm.pf_wq)
return -ENOMEM;
gt->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue",
WQ_UNBOUND | WQ_HIGHPRI,
NUM_ACC_QUEUE);
if (!gt->usm.acc_wq)
return -ENOMEM;
for (i = 0; i < NUM_PF_QUEUE; ++i)
INIT_WORK(&xe->usm.pf_queue[i].worker, pf_queue_work_func);
for (i = 0; i < NUM_ACC_QUEUE; ++i)
INIT_WORK(&xe->usm.acc_queue[i].worker, acc_queue_work_func);
return 0;
}
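
On the first FIXME in the commit message (init/reset layering): since the queues and workqueues are now device state, one possible cleanup, sketched here with a hypothetical xe_device_pagefault_init() name, would be to move the remaining worker setup out of the per-GT init (dropping the gt->info.id special case) and call it once from xe_device_create() after the workqueues are allocated.

/* Hypothetical device-level init; assumes it lives in xe_gt_pagefault.c so
 * the static pf_queue_work_func()/acc_queue_work_func() are visible. */
static int xe_device_pagefault_init(struct xe_device *xe)
{
	u8 i;

	if (!xe->info.has_usm)
		return 0;

	for (i = 0; i < NUM_PF_QUEUE; ++i)
		INIT_WORK(&xe->usm.pf_queue[i].worker, pf_queue_work_func);
	for (i = 0; i < NUM_ACC_QUEUE; ++i)
		INIT_WORK(&xe->usm.acc_queue[i].worker, acc_queue_work_func);

	return 0;
}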
@@ -428,21 +431,21 @@ void xe_gt_pagefault_reset(struct xe_gt *gt)
struct xe_device *xe = gt_to_xe(gt);
int i;
if (!xe->info.has_usm)
if (!xe->info.has_usm || gt->info.id)
return;
for (i = 0; i < NUM_PF_QUEUE; ++i) {
spin_lock_irq(&gt->usm.pf_queue[i].lock);
gt->usm.pf_queue[i].head = 0;
gt->usm.pf_queue[i].tail = 0;
spin_unlock_irq(&gt->usm.pf_queue[i].lock);
spin_lock_irq(&xe->usm.pf_queue[i].lock);
xe->usm.pf_queue[i].head = 0;
xe->usm.pf_queue[i].tail = 0;
spin_unlock_irq(&xe->usm.pf_queue[i].lock);
}
for (i = 0; i < NUM_ACC_QUEUE; ++i) {
spin_lock(&gt->usm.acc_queue[i].lock);
gt->usm.acc_queue[i].head = 0;
gt->usm.acc_queue[i].tail = 0;
spin_unlock(&gt->usm.acc_queue[i].lock);
spin_lock(&xe->usm.acc_queue[i].lock);
xe->usm.acc_queue[i].head = 0;
xe->usm.acc_queue[i].tail = 0;
spin_unlock(&xe->usm.acc_queue[i].lock);
}
}
@@ -550,7 +553,8 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
#define ACC_MSG_LEN_DW 4
static bool get_acc(struct acc_queue *acc_queue, struct acc *acc)
static bool get_acc(struct acc_queue *acc_queue, struct acc *acc,
struct xe_device *xe, struct xe_gt **gt)
{
const struct xe_guc_acc_desc *desc;
bool ret = false;
@@ -560,6 +564,8 @@ static bool get_acc(struct acc_queue *acc_queue, struct acc *acc)
desc = (const struct xe_guc_acc_desc *)
(acc_queue->data + acc_queue->tail);
*gt = get_gt(xe, FIELD_GET(ACC_RSVD2, desc->dw2));
acc->granularity = FIELD_GET(ACC_GRANULARITY, desc->dw2);
acc->sub_granularity = FIELD_GET(ACC_SUBG_HI, desc->dw1) << 31 |
FIELD_GET(ACC_SUBG_LO, desc->dw0);
@@ -583,15 +589,15 @@ static bool get_acc(struct acc_queue *acc_queue, struct acc *acc)
static void acc_queue_work_func(struct work_struct *w)
{
struct acc_queue *acc_queue = container_of(w, struct acc_queue, worker);
struct xe_gt *gt = acc_queue->gt;
struct xe_device *xe = gt_to_xe(gt);
struct xe_device *xe = acc_queue->xe;
struct xe_gt *gt;
struct acc acc = {};
unsigned long threshold;
int ret;
threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
while (get_acc(acc_queue, &acc)) {
while (get_acc(acc_queue, &acc, xe, &gt)) {
ret = handle_acc(gt, &acc);
if (unlikely(ret)) {
print_acc(xe, &acc);
@@ -600,7 +606,7 @@ static void acc_queue_work_func(struct work_struct *w)
if (time_after(jiffies, threshold) &&
acc_queue->tail != acc_queue->head) {
queue_work(gt->usm.acc_wq, w);
queue_work(xe->usm.acc_wq, w);
break;
}
}
@@ -617,6 +623,7 @@ static bool acc_queue_full(struct acc_queue *acc_queue)
int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len)
{
struct xe_gt *gt = guc_to_gt(guc);
struct xe_device *xe = gt_to_xe(gt);
struct acc_queue *acc_queue;
u32 asid;
bool full;
@@ -630,15 +637,17 @@ int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len)
return -EPROTO;
asid = FIELD_GET(ACC_ASID, msg[1]);
acc_queue = &gt->usm.acc_queue[asid % NUM_ACC_QUEUE];
acc_queue = &xe->usm.acc_queue[asid % NUM_ACC_QUEUE];
spin_lock(&acc_queue->lock);
full = acc_queue_full(acc_queue);
if (!full) {
memcpy(acc_queue->data + acc_queue->head, msg,
len * sizeof(u32));
acc_queue->data[acc_queue->head + 2] |=
FIELD_PREP(ACC_RSVD2, (u32)gt->info.id);
acc_queue->head = (acc_queue->head + len) % ACC_QUEUE_NUM_DW;
queue_work(gt->usm.acc_wq, &acc_queue->worker);
queue_work(xe->usm.acc_wq, &acc_queue->worker);
} else {
drm_warn(&gt_to_xe(gt)->drm, "ACC Queue full, dropping ACC");
}
@@ -211,66 +211,6 @@ struct xe_gt {
* operations (e.g. mmigrations, fixing page tables)
*/
u16 reserved_bcs_instance;
/** @usm.pf_wq: page fault work queue, unbound, high priority */
struct workqueue_struct *pf_wq;
/** @usm.acc_wq: access counter work queue, unbound, high priority */
struct workqueue_struct *acc_wq;
/**
* @usm.pf_queue: Page fault queue used to sync faults so faults can
* be processed not under the GuC CT lock. The queue is sized so
* it can sync all possible faults (1 per physical engine).
* Multiple queues exists for page faults from different VMs are
* be processed in parallel.
*/
struct pf_queue {
/** @usm.pf_queue.gt: back pointer to GT */
struct xe_gt *gt;
#define PF_QUEUE_NUM_DW 128
/** @usm.pf_queue.data: data in the page fault queue */
u32 data[PF_QUEUE_NUM_DW];
/**
* @usm.pf_queue.tail: tail pointer in DWs for page fault queue,
* moved by worker which processes faults (consumer).
*/
u16 tail;
/**
* @usm.pf_queue.head: head pointer in DWs for page fault queue,
* moved by G2H handler (producer).
*/
u16 head;
/** @usm.pf_queue.lock: protects page fault queue */
spinlock_t lock;
/** @usm.pf_queue.worker: to process page faults */
struct work_struct worker;
#define NUM_PF_QUEUE 4
} pf_queue[NUM_PF_QUEUE];
/**
* @usm.acc_queue: Same as page fault queue, cannot process access
* counters under CT lock.
*/
struct acc_queue {
/** @usm.acc_queue.gt: back pointer to GT */
struct xe_gt *gt;
#define ACC_QUEUE_NUM_DW 128
/** @usm.acc_queue.data: data in the page fault queue */
u32 data[ACC_QUEUE_NUM_DW];
/**
* @usm.acc_queue.tail: tail pointer in DWs for access counter queue,
* moved by worker which processes counters
* (consumer).
*/
u16 tail;
/**
* @usm.acc_queue.head: head pointer in DWs for access counter queue,
* moved by G2H handler (producer).
*/
u16 head;
/** @usm.acc_queue.lock: protects page fault queue */
spinlock_t lock;
/** @usm.acc_queue.worker: to process access counters */
struct work_struct worker;
#define NUM_ACC_QUEUE 4
} acc_queue[NUM_ACC_QUEUE];
} usm;
/** @ordered_wq: used to serialize GT resets and TDRs */