drm/i915: Remove i915_request.lock requirement for execution callbacks

To implement preempt-to-busy (and so efficient timeslicing and best utilization of the hardware submission ports) we let the GPU run asynchronously in respect to the ELSP submission queue. This created challenges in keeping and accessing the driver state mirroring the asynchronous GPU execution. Previous fix 1d9221e9d395 ("drm/i915: Skip signaling a signaled request") however did not correctly serialize request retirement with the execution callbacks. We were using the i915_request.lock to serialise adding an execution callback with __i915_request_submit. However, if we use an atomic llist_add to serialise multiple waiters and then check to see if the request is already executing, we can remove the irq-spinlock and fix serialization between retirement and execution callbacks in one go. v2: Avoid using the irq_work when outside of the irq-spinlocks, where we can execute the callbacks immediately. v3: Pay close attention to the order of setting ACTIVE on retirement, we need to ensure the request is signaled and breadcrumbs detached before we finish removing the request from the engine. v4: Expanded commit message. Fixes: 1d9221e9d395 ("drm/i915: Skip signaling a signaled request") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200716142207.13003-2-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com> [Joonas: Rebased and reordered into drm-intel-gt-next branch] [Joonas: Added expanded commit message from Tvrtko and Chris] Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
author: Chris Wilson <chris@chris-wilson.co.uk> 2020-07-16 15:22:07 +0100
committer: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> 2020-09-07 15:08:06 +0300
commit: 2e4c6c1a9db5e12556a12ea722df71096247e178 (patch)
tree: 4f7388c4575cca507b680f2d5188f3118a9e59ec /drivers/gpu/drm/i915
parent: b4d9145b0154f8c71dafc2db5fd445f1f3db9426 (diff)
1 files changed, 58 insertions, 47 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 4f7f67a785ac..11e272422fb7 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -187,30 +187,34 @@ static void irq_execute_cb_hook(struct irq_work *wrk)
 	irq_execute_cb(wrk);
 }
 
-static void __notify_execute_cb(struct i915_request *rq)
+static __always_inline void
+__notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))
 {
 	struct execute_cb *cb, *cn;
 
-	lockdep_assert_held(&rq->lock);
-
-	GEM_BUG_ON(!i915_request_is_active(rq));
 	if (llist_empty(&rq->execute_cb))
 		return;
 
-	llist_for_each_entry_safe(cb, cn, rq->execute_cb.first, work.llnode)
-		irq_work_queue(&cb->work);
+	llist_for_each_entry_safe(cb, cn,
+				  llist_del_all(&rq->execute_cb),
+				  work.llnode)
+		fn(&cb->work);
+}
 
-	/*
-	 * XXX Rollback on __i915_request_unsubmit()
-	 *
-	 * In the future, perhaps when we have an active time-slicing scheduler,
-	 * it will be interesting to unsubmit parallel execution and remove
-	 * busywaits from the GPU until their master is restarted. This is
-	 * quite hairy, we have to carefully rollback the fence and do a
-	 * preempt-to-idle cycle on the target engine, all the while the
-	 * master execute_cb may refire.
-	 */
-	init_llist_head(&rq->execute_cb);
+static void __notify_execute_cb_irq(struct i915_request *rq)
+{
+	__notify_execute_cb(rq, irq_work_queue);
+}
+
+static bool irq_work_imm(struct irq_work *wrk)
+{
+	wrk->func(wrk);
+	return false;
+}
+
+static void __notify_execute_cb_imm(struct i915_request *rq)
+{
+	__notify_execute_cb(rq, irq_work_imm);
 }
 
 static void free_capture_list(struct i915_request *request)
@@ -257,9 +261,16 @@ static void remove_from_engine(struct i915_request *rq)
 		locked = engine;
 	}
 	list_del_init(&rq->sched.link);
+
 	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
 	clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
+
+	/* Prevent further __await_execution() registering a cb, then flush */
+	set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
+
 	spin_unlock_irq(&locked->active.lock);
+
+	__notify_execute_cb_imm(rq);
 }
 
 bool i915_request_retire(struct i915_request *rq)
@@ -271,6 +282,7 @@ bool i915_request_retire(struct i915_request *rq)
 
 	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
 	trace_i915_request_retire(rq);
+	i915_request_mark_complete(rq);
 
 	/*
 	 * We know the GPU must have read the request to have
@@ -288,15 +300,6 @@ bool i915_request_retire(struct i915_request *rq)
 		__i915_request_fill(rq, POISON_FREE);
 	rq->ring->head = rq->postfix;
 
-	/*
-	 * We only loosely track inflight requests across preemption,
-	 * and so we may find ourselves attempting to retire a _completed_
-	 * request that we have removed from the HW and put back on a run
-	 * queue.
-	 */
-	remove_from_engine(rq);
-
-	i915_request_mark_complete(rq);
 	if (!i915_request_signaled(rq)) {
 		spin_lock_irq(&rq->lock);
 		dma_fence_signal_locked(&rq->fence);
@@ -320,7 +323,6 @@ bool i915_request_retire(struct i915_request *rq)
 	 */
 	remove_from_engine(rq);
 	GEM_BUG_ON(!llist_empty(&rq->execute_cb));
-	spin_unlock_irq(&rq->lock);
 
 	__list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */
 
@@ -348,12 +350,6 @@ void i915_request_retire_upto(struct i915_request *rq)
 	} while (i915_request_retire(tmp) && tmp != rq);
 }
 
-static void __llist_add(struct llist_node *node, struct llist_head *head)
-{
-	node->next = head->first;
-	head->first = node;
-}
-
 static struct i915_request * const *
 __engine_active(struct intel_engine_cs *engine)
 {
@@ -451,18 +447,24 @@ __await_execution(struct i915_request *rq,
 		cb->work.func = irq_execute_cb_hook;
 	}
 
-	spin_lock_irq(&signal->lock);
-	if (i915_request_is_active(signal) || __request_in_flight(signal)) {
-		if (hook) {
-			hook(rq, &signal->fence);
-			i915_request_put(signal);
-		}
-		i915_sw_fence_complete(cb->fence);
-		kmem_cache_free(global.slab_execute_cbs, cb);
-	} else {
-		__llist_add(&cb->work.llnode, &signal->execute_cb);
+	/*
+	 * Register the callback first, then see if the signaler is already
+	 * active. This ensures that if we race with the
+	 * __notify_execute_cb from i915_request_submit() and we are not
+	 * included in that list, we get a second bite of the cherry and
+	 * execute it ourselves. After this point, a future
+	 * i915_request_submit() will notify us.
+	 *
+	 * In i915_request_retire() we set the ACTIVE bit on a completed
+	 * request (then flush the execute_cb). So by registering the
+	 * callback first, then checking the ACTIVE bit, we serialise with
+	 * the completed/retired request.
+	 */
+	if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
+		if (i915_request_is_active(signal) ||
+		    __request_in_flight(signal))
+			__notify_execute_cb_imm(signal);
 	}
-	spin_unlock_irq(&signal->lock);
 
 	return 0;
 }
@@ -588,10 +590,19 @@ xfer:
 	 * preempt-to-idle cycle on the target engine, all the while the
 	 * master execute_cb may refire.
 	 */
-	__notify_execute_cb(request);
+	__notify_execute_cb_irq(request);
 
-	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-		i915_request_enable_breadcrumb(request);
+	/* We may be recursing from the signal callback of another i915 fence */
+	if (!i915_request_signaled(request)) {
+		spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
+		if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &request->fence.flags) &&
+		    !i915_request_enable_breadcrumb(request))
+			intel_engine_signal_breadcrumbs(engine);
+
+		spin_unlock(&request->lock);
+	}
 
 	return result;
 }
author	Chris Wilson <chris@chris-wilson.co.uk>	2020-07-16 15:22:07 +0100
committer	Joonas Lahtinen <joonas.lahtinen@linux.intel.com>	2020-09-07 15:08:06 +0300
commit	2e4c6c1a9db5e12556a12ea722df71096247e178 (patch)
tree	4f7388c4575cca507b680f2d5188f3118a9e59ec /drivers/gpu/drm/i915
parent	b4d9145b0154f8c71dafc2db5fd445f1f3db9426 (diff)