From 8687b3ec852e89630bac650f15136811c7b4c1dc Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 7 Oct 2016 07:53:24 +0100
Subject: drm/i915: Distinguish last emitted request from last submitted
 request

In order not to trigger hangcheck on an idle-but-waiting engine, we need
to distinguish between the pending request queue and the actual
execution queue. This is done later in "drm/i915: Enable multiple
timelines" but for now we need a temporary fix to prevent blaming the
wrong engine for a GPU hang.

(Note that this causes a temporary subtle change in how we decide when
to allow a waitboost to be re-awarded back to the waiter. The temporary
effect is that if the wait is upon the most current execution, the wait
is given for free, instead of checking to see if the client stalled
itself. This will be repaired in "drm/i915: Enable multiple timelines".)

Fixes: 0a046a0e93d2 ("drm/i915: Nonblocking request submission")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98104
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161007065327.24515-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 40978bc12ceb..8832f8ec1583 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -328,6 +328,7 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 
 	switch (state) {
 	case FENCE_COMPLETE:
+		request->engine->last_submitted_seqno = request->fence.seqno;
 		request->engine->submit_request(request);
 		break;
 
@@ -641,8 +642,8 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 					     &request->submitq);
 
 	request->emitted_jiffies = jiffies;
-	request->previous_seqno = engine->last_submitted_seqno;
-	engine->last_submitted_seqno = request->fence.seqno;
+	request->previous_seqno = engine->last_pending_seqno;
+	engine->last_pending_seqno = request->fence.seqno;
 	i915_gem_active_set(&engine->last_request, request);
 	list_add_tail(&request->link, &engine->request_list);
 	list_add_tail(&request->ring_link, &ring->request_list);
-- 
cgit v1.2.1


From 3b3f1650b1ca46a4225e0bf72804779b161e27b6 Mon Sep 17 00:00:00 2001
From: Akash Goel <akash.goel@intel.com>
Date: Thu, 13 Oct 2016 22:44:48 +0530
Subject: drm/i915: Allocate intel_engine_cs structure only for the enabled
 engines

With the possible addition of many more rings in the future, the
drm_i915_private structure could bloat, as an array of type intel_engine_cs
is embedded inside it.
	struct intel_engine_cs engine[I915_NUM_ENGINES];
This is still fine, as generally only a single instance of the
drm_i915_private structure is used, but not all of the possible rings would
be enabled or active on most platforms. Some memory can be saved by
allocating the intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
	struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.

There is a text size reduction of 928 bytes, from 1028200 to 1027272, for the
i915.o file (but for the i915.ko file the text size remains 1193131 bytes).

v2:
- Remove the engine iterator field added in drm_i915_private structure,
  instead pass a local iterator variable to the for_each_engine**
  macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
  NULL pointer check on engine pointer. (Chris)

v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
  can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
  engine specific init is done later in Driver load sequence.

v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().

v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
  allocation of intel_engine_cs structure. (Chris)

v6:
- Rebase.

v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.

v8: Rebase.

v9: Rebase.

v10:
- For index calculation use engine ID instead of pointer based arithmetic in
  intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
  check for NULL engine pointer in cleanup() routines. (Joonas)

v11: Rebase.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
---
 drivers/gpu/drm/i915/i915_gem_request.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 8832f8ec1583..74ede1f53372 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -256,10 +256,11 @@ static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
 static int i915_gem_init_seqno(struct drm_i915_private *dev_priv, u32 seqno)
 {
 	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 	int ret;
 
 	/* Carefully retire all requests without writing to the rings */
-	for_each_engine(engine, dev_priv) {
+	for_each_engine(engine, dev_priv, id) {
 		ret = intel_engine_idle(engine,
 					I915_WAIT_INTERRUPTIBLE |
 					I915_WAIT_LOCKED);
@@ -276,7 +277,7 @@ static int i915_gem_init_seqno(struct drm_i915_private *dev_priv, u32 seqno)
 	}
 
 	/* Finally reset hw state */
-	for_each_engine(engine, dev_priv)
+	for_each_engine(engine, dev_priv, id)
 		intel_engine_init_seqno(engine, seqno);
 
 	return 0;
-- 
cgit v1.2.1


From f54d1867005c3323f5d8ad83eed823e84226c429 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 25 Oct 2016 13:00:45 +0100
Subject: dma-buf: Rename struct fence to dma_fence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I plan to usurp the short name of struct fence for a core kernel struct,
and so I need to rename the specialised fence/timeline for DMA
operations to make room.

A consensus was reached in
https://lists.freedesktop.org/archives/dri-devel/2016-July/113083.html
that making clear this fence applies to DMA operations was a good thing.
Since then the patch has grown a bit as usage increases, so hopefully it
remains a good thing!

(v2...: rebase, rerun spatch)
v3: Compile on msm, spotted a manual fixup that I broke.
v4: Try again for msm, sorry Daniel

coccinelle script:
@@

@@
- struct fence
+ struct dma_fence
@@

@@
- struct fence_ops
+ struct dma_fence_ops
@@

@@
- struct fence_cb
+ struct dma_fence_cb
@@

@@
- struct fence_array
+ struct dma_fence_array
@@

@@
- enum fence_flag_bits
+ enum dma_fence_flag_bits
@@

@@
(
- fence_init
+ dma_fence_init
|
- fence_release
+ dma_fence_release
|
- fence_free
+ dma_fence_free
|
- fence_get
+ dma_fence_get
|
- fence_get_rcu
+ dma_fence_get_rcu
|
- fence_put
+ dma_fence_put
|
- fence_signal
+ dma_fence_signal
|
- fence_signal_locked
+ dma_fence_signal_locked
|
- fence_default_wait
+ dma_fence_default_wait
|
- fence_add_callback
+ dma_fence_add_callback
|
- fence_remove_callback
+ dma_fence_remove_callback
|
- fence_enable_sw_signaling
+ dma_fence_enable_sw_signaling
|
- fence_is_signaled_locked
+ dma_fence_is_signaled_locked
|
- fence_is_signaled
+ dma_fence_is_signaled
|
- fence_is_later
+ dma_fence_is_later
|
- fence_later
+ dma_fence_later
|
- fence_wait_timeout
+ dma_fence_wait_timeout
|
- fence_wait_any_timeout
+ dma_fence_wait_any_timeout
|
- fence_wait
+ dma_fence_wait
|
- fence_context_alloc
+ dma_fence_context_alloc
|
- fence_array_create
+ dma_fence_array_create
|
- to_fence_array
+ to_dma_fence_array
|
- fence_is_array
+ dma_fence_is_array
|
- trace_fence_emit
+ trace_dma_fence_emit
|
- FENCE_TRACE
+ DMA_FENCE_TRACE
|
- FENCE_WARN
+ DMA_FENCE_WARN
|
- FENCE_ERR
+ DMA_FENCE_ERR
)
 (
 ...
 )

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Acked-by: Sumit Semwal <sumit.semwal@linaro.org>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20161025120045.28839-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 74ede1f53372..f9af2a00625e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -26,12 +26,12 @@
 
 #include "i915_drv.h"
 
-static const char *i915_fence_get_driver_name(struct fence *fence)
+static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
 	return "i915";
 }
 
-static const char *i915_fence_get_timeline_name(struct fence *fence)
+static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
 {
 	/* Timelines are bound by eviction to a VM. However, since
 	 * we only have a global seqno at the moment, we only have
@@ -42,12 +42,12 @@ static const char *i915_fence_get_timeline_name(struct fence *fence)
 	return "global";
 }
 
-static bool i915_fence_signaled(struct fence *fence)
+static bool i915_fence_signaled(struct dma_fence *fence)
 {
 	return i915_gem_request_completed(to_request(fence));
 }
 
-static bool i915_fence_enable_signaling(struct fence *fence)
+static bool i915_fence_enable_signaling(struct dma_fence *fence)
 {
 	if (i915_fence_signaled(fence))
 		return false;
@@ -56,7 +56,7 @@ static bool i915_fence_enable_signaling(struct fence *fence)
 	return true;
 }
 
-static signed long i915_fence_wait(struct fence *fence,
+static signed long i915_fence_wait(struct dma_fence *fence,
 				   bool interruptible,
 				   signed long timeout_jiffies)
 {
@@ -85,26 +85,26 @@ static signed long i915_fence_wait(struct fence *fence,
 	return timeout_jiffies;
 }
 
-static void i915_fence_value_str(struct fence *fence, char *str, int size)
+static void i915_fence_value_str(struct dma_fence *fence, char *str, int size)
 {
 	snprintf(str, size, "%u", fence->seqno);
 }
 
-static void i915_fence_timeline_value_str(struct fence *fence, char *str,
+static void i915_fence_timeline_value_str(struct dma_fence *fence, char *str,
 					  int size)
 {
 	snprintf(str, size, "%u",
 		 intel_engine_get_seqno(to_request(fence)->engine));
 }
 
-static void i915_fence_release(struct fence *fence)
+static void i915_fence_release(struct dma_fence *fence)
 {
 	struct drm_i915_gem_request *req = to_request(fence);
 
 	kmem_cache_free(req->i915->requests, req);
 }
 
-const struct fence_ops i915_fence_ops = {
+const struct dma_fence_ops i915_fence_ops = {
 	.get_driver_name = i915_fence_get_driver_name,
 	.get_timeline_name = i915_fence_get_timeline_name,
 	.enable_signaling = i915_fence_enable_signaling,
@@ -388,8 +388,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 * The reference count is incremented atomically. If it is zero,
 	 * the lookup knows the request is unallocated and complete. Otherwise,
 	 * it is either still in use, or has been reallocated and reset
-	 * with fence_init(). This increment is safe for release as we check
-	 * that the request we have a reference to and matches the active
+	 * with dma_fence_init(). This increment is safe for release as we
+	 * check that the request we have a reference to and matches the active
 	 * request.
 	 *
 	 * Before we increment the refcount, we chase the request->engine
@@ -412,11 +412,11 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		goto err;
 
 	spin_lock_init(&req->lock);
-	fence_init(&req->fence,
-		   &i915_fence_ops,
-		   &req->lock,
-		   engine->fence_context,
-		   seqno);
+	dma_fence_init(&req->fence,
+		       &i915_fence_ops,
+		       &req->lock,
+		       engine->fence_context,
+		       seqno);
 
 	i915_sw_fence_init(&req->submit, submit_notify);
 
-- 
cgit v1.2.1


From b52992c06c9020cecb1b9807855301e5f62ec968 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:24 +0100
Subject: drm/i915: Support asynchronous waits on struct fence from
 i915_gem_request

We will need to wait on DMA completion (as signaled via struct fence)
before executing our i915_gem_request. Therefore we want to expose a
method for adding the await on the fence itself to the request.

v2: Add a comment detailing a failure to handle a signal-on-any
fence-array.
v3: Pretend that magic numbers don't exist.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 48 +++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index f9af2a00625e..5e38bc04a4f0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/prefetch.h>
+#include <linux/dma-fence-array.h>
 
 #include "i915_drv.h"
 
@@ -496,6 +497,53 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 	return 0;
 }
 
+int
+i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
+				 struct dma_fence *fence)
+{
+	struct dma_fence_array *array;
+	int ret;
+	int i;
+
+	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
+		return 0;
+
+	if (dma_fence_is_i915(fence))
+		return i915_gem_request_await_request(req, to_request(fence));
+
+	if (!dma_fence_is_array(fence)) {
+		ret = i915_sw_fence_await_dma_fence(&req->submit,
+						    fence, I915_FENCE_TIMEOUT,
+						    GFP_KERNEL);
+		return ret < 0 ? ret : 0;
+	}
+
+	/* Note that if the fence-array was created in signal-on-any mode,
+	 * we should *not* decompose it into its individual fences. However,
+	 * we don't currently store which mode the fence-array is operating
+	 * in. Fortunately, the only user of signal-on-any is private to
+	 * amdgpu and we should not see any incoming fence-array from
+	 * sync-file being in signal-on-any mode.
+	 */
+
+	array = to_dma_fence_array(fence);
+	for (i = 0; i < array->num_fences; i++) {
+		struct dma_fence *child = array->fences[i];
+
+		if (dma_fence_is_i915(child))
+			ret = i915_gem_request_await_request(req,
+							     to_request(child));
+		else
+			ret = i915_sw_fence_await_dma_fence(&req->submit,
+							    child, I915_FENCE_TIMEOUT,
+							    GFP_KERNEL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 /**
  * i915_gem_request_await_object - set this request to (async) wait upon a bo
  *
-- 
cgit v1.2.1


From e95433c73a11759203af1cae5958f998c9673370 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:27 +0100
Subject: drm/i915: Rearrange i915_wait_request() accounting with callers

Our low-level wait routine has evolved from our generic wait interface
that handled unlocked, RPS boosting, waits with time tracking. If we
push our GEM fence tracking to use reservation_objects (required for
handling multiple timelines), we lose the ability to pass the required
information down to i915_wait_request(). However, if we push the extra
functionality from i915_wait_request() to the individual callsites
(i915_gem_object_wait_rendering and i915_gem_wait_ioctl) that make use
of those extras, we can both simplify our low level wait and prepare for
extending the GEM interface for use of reservation_objects.

v2: Rewrite i915_wait_request() kerneldocs

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-4-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 146 ++++++++------------------------
 1 file changed, 33 insertions(+), 113 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 5e38bc04a4f0..fbe0923fe0bc 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -59,31 +59,9 @@ static bool i915_fence_enable_signaling(struct dma_fence *fence)
 
 static signed long i915_fence_wait(struct dma_fence *fence,
 				   bool interruptible,
-				   signed long timeout_jiffies)
+				   signed long timeout)
 {
-	s64 timeout_ns, *timeout;
-	int ret;
-
-	if (timeout_jiffies != MAX_SCHEDULE_TIMEOUT) {
-		timeout_ns = jiffies_to_nsecs(timeout_jiffies);
-		timeout = &timeout_ns;
-	} else {
-		timeout = NULL;
-	}
-
-	ret = i915_wait_request(to_request(fence),
-				interruptible, timeout,
-				NO_WAITBOOST);
-	if (ret == -ETIME)
-		return 0;
-
-	if (ret < 0)
-		return ret;
-
-	if (timeout_jiffies != MAX_SCHEDULE_TIMEOUT)
-		timeout_jiffies = nsecs_to_jiffies(timeout_ns);
-
-	return timeout_jiffies;
+	return i915_wait_request(to_request(fence), interruptible, timeout);
 }
 
 static void i915_fence_value_str(struct dma_fence *fence, char *str, int size)
@@ -166,7 +144,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	struct i915_gem_active *active, *next;
 
 	trace_i915_gem_request_retire(request);
-	list_del(&request->link);
+	list_del_init(&request->link);
 
 	/* We know the GPU must have read the request to have
 	 * sent us the seqno + interrupt, so use the position
@@ -224,7 +202,8 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req)
 	struct drm_i915_gem_request *tmp;
 
 	lockdep_assert_held(&req->i915->drm.struct_mutex);
-	GEM_BUG_ON(list_empty(&req->link));
+	if (list_empty(&req->link))
+		return;
 
 	do {
 		tmp = list_first_entry(&engine->request_list,
@@ -780,75 +759,48 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 
 /**
  * i915_wait_request - wait until execution of request has finished
- * @req: duh!
+ * @req: the request to wait upon
  * @flags: how to wait
- * @timeout: in - how long to wait (NULL forever); out - how much time remaining
- * @rps: client to charge for RPS boosting
+ * @timeout: how long to wait in jiffies
+ *
+ * i915_wait_request() waits for the request to be completed, for a
+ * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an
+ * unbounded wait).
  *
- * Note: It is of utmost importance that the passed in seqno and reset_counter
- * values have been read by the caller in an smp safe manner. Where read-side
- * locks are involved, it is sufficient to read the reset_counter before
- * unlocking the lock that protects the seqno. For lockless tricks, the
- * reset_counter _must_ be read before, and an appropriate smp_rmb must be
- * inserted.
+ * If the caller holds the struct_mutex, the caller must pass I915_WAIT_LOCKED
+ * in via the flags, and vice versa if the struct_mutex is not held, the caller
+ * must not specify that the wait is locked.
  *
- * Returns 0 if the request was found within the alloted time. Else returns the
- * errno with remaining time filled in timeout argument.
+ * Returns the remaining time (in jiffies) if the request completed, which may
+ * be zero or -ETIME if the request is unfinished after the timeout expires.
+ * May return -EINTR is called with I915_WAIT_INTERRUPTIBLE and a signal is
+ * pending before the request completes.
  */
-int i915_wait_request(struct drm_i915_gem_request *req,
-		      unsigned int flags,
-		      s64 *timeout,
-		      struct intel_rps_client *rps)
+long i915_wait_request(struct drm_i915_gem_request *req,
+		       unsigned int flags,
+		       long timeout)
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
 	DEFINE_WAIT(reset);
 	struct intel_wait wait;
-	unsigned long timeout_remain;
-	int ret = 0;
 
 	might_sleep();
 #if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(!!lockdep_is_held(&req->i915->drm.struct_mutex) !=
+	GEM_BUG_ON(debug_locks &&
+		   !!lockdep_is_held(&req->i915->drm.struct_mutex) !=
 		   !!(flags & I915_WAIT_LOCKED));
 #endif
+	GEM_BUG_ON(timeout < 0);
 
 	if (i915_gem_request_completed(req))
-		return 0;
+		return timeout;
 
-	timeout_remain = MAX_SCHEDULE_TIMEOUT;
-	if (timeout) {
-		if (WARN_ON(*timeout < 0))
-			return -EINVAL;
-
-		if (*timeout == 0)
-			return -ETIME;
-
-		/* Record current time in case interrupted, or wedged */
-		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
-		*timeout += ktime_get_raw_ns();
-	}
+	if (!timeout)
+		return -ETIME;
 
 	trace_i915_gem_request_wait_begin(req);
 
-	/* This client is about to stall waiting for the GPU. In many cases
-	 * this is undesirable and limits the throughput of the system, as
-	 * many clients cannot continue processing user input/output whilst
-	 * blocked. RPS autotuning may take tens of milliseconds to respond
-	 * to the GPU load and thus incurs additional latency for the client.
-	 * We can circumvent that by promoting the GPU frequency to maximum
-	 * before we wait. This makes the GPU throttle up much more quickly
-	 * (good for benchmarks and user experience, e.g. window animations),
-	 * but at a cost of spending more power processing the workload
-	 * (bad for battery). Not all clients even want their results
-	 * immediately and for them we should just let the GPU select its own
-	 * frequency to maximise efficiency. To prevent a single client from
-	 * forcing the clocks too high for the whole system, we only allow
-	 * each client to waitboost once in a busy period.
-	 */
-	if (IS_RPS_CLIENT(rps) && INTEL_GEN(req->i915) >= 6)
-		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
-
 	/* Optimistic short spin before touching IRQs */
 	if (i915_spin_request(req, state, 5))
 		goto complete;
@@ -867,16 +819,17 @@ int i915_wait_request(struct drm_i915_gem_request *req,
 
 	for (;;) {
 		if (signal_pending_state(state, current)) {
-			ret = -ERESTARTSYS;
+			timeout = -ERESTARTSYS;
 			break;
 		}
 
-		timeout_remain = io_schedule_timeout(timeout_remain);
-		if (timeout_remain == 0) {
-			ret = -ETIME;
+		if (!timeout) {
+			timeout = -ETIME;
 			break;
 		}
 
+		timeout = io_schedule_timeout(timeout);
+
 		if (intel_wait_complete(&wait))
 			break;
 
@@ -923,40 +876,7 @@ wakeup:
 complete:
 	trace_i915_gem_request_wait_end(req);
 
-	if (timeout) {
-		*timeout -= ktime_get_raw_ns();
-		if (*timeout < 0)
-			*timeout = 0;
-
-		/*
-		 * Apparently ktime isn't accurate enough and occasionally has a
-		 * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
-		 * things up to make the test happy. We allow up to 1 jiffy.
-		 *
-		 * This is a regrssion from the timespec->ktime conversion.
-		 */
-		if (ret == -ETIME && *timeout < jiffies_to_usecs(1)*1000)
-			*timeout = 0;
-	}
-
-	if (IS_RPS_USER(rps) &&
-	    req->fence.seqno == req->engine->last_submitted_seqno) {
-		/* The GPU is now idle and this client has stalled.
-		 * Since no other client has submitted a request in the
-		 * meantime, assume that this client is the only one
-		 * supplying work to the GPU but is unable to keep that
-		 * work supplied because it is waiting. Since the GPU is
-		 * then never kept fully busy, RPS autoclocking will
-		 * keep the clocks relatively low, causing further delays.
-		 * Compensate by giving the synchronous client credit for
-		 * a waitboost next time.
-		 */
-		spin_lock(&req->i915->rps.client_lock);
-		list_del_init(&rps->link);
-		spin_unlock(&req->i915->rps.client_lock);
-	}
-
-	return ret;
+	return timeout;
 }
 
 static bool engine_retire_requests(struct intel_engine_cs *engine)
-- 
cgit v1.2.1


From 4c7d62c6b8a2b4e2300d977644e78b25a2d5f4d0 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:32 +0100
Subject: drm/i915: Markup GEM API with lockdep asserts

Add lockdep_assert_held(struct_mutex) to the API preamble of the
internal GEM interfaces.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-9-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index fbe0923fe0bc..d234c28cbb9f 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -143,6 +143,9 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 {
 	struct i915_gem_active *active, *next;
 
+	lockdep_assert_held(&request->i915->drm.struct_mutex);
+	GEM_BUG_ON(!i915_gem_request_completed(request));
+
 	trace_i915_gem_request_retire(request);
 	list_del_init(&request->link);
 
@@ -268,6 +271,8 @@ int i915_gem_set_seqno(struct drm_device *dev, u32 seqno)
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	int ret;
 
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
 	if (seqno == 0)
 		return -EINVAL;
 
@@ -612,6 +617,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	u32 reserved_tail;
 	int ret;
 
+	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	trace_i915_gem_request_add(request);
 
 	/*
-- 
cgit v1.2.1


From d07f0e59b2c762584478920cd2d11fba2980a94a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:44 +0100
Subject: drm/i915: Move GEM activity tracking into a common struct
 reservation_object

In preparation to support many distinct timelines, we need to expand the
activity tracking on the GEM object to handle more than just a request
per engine. We already use the struct reservation_object on the dma-buf
to handle many fence contexts, so integrating that into the GEM object
itself is the preferred solution. (For example, we can now share the same
reservation_object between every consumer/producer using this buffer and
skip the manual import/export via dma-buf.)

v2: Reimplement busy-ioctl (by walking the reservation object), postpone
the ABI change for another day. Similarly use the reservation object to
find the last_write request (if active and from i915) for choosing
display CS flips.

Caveats:

 * busy-ioctl: busy-ioctl only reports on the native fences, it will not
warn of stalls (in set-domain-ioctl, pread/pwrite etc) if the object is
being rendered to by external fences. It also will not report the same
busy state as wait-ioctl (or polling on the dma-buf) in the same
circumstances. On the plus side, it does retain reporting of which
*i915* engines are engaged with this object.

 * non-blocking atomic modesets take a step backwards as the wait for
render completion blocks the ioctl. This is fixed in a subsequent
patch to use a fence instead for awaiting on the rendering, see
"drm/i915: Restore nonblocking awaits for modesetting"

 * dynamic array manipulation for shared-fences in reservation is slower
than the previous lockless static assignment (e.g. gem_exec_lut_handle
runtime on ivb goes from 42s to 66s), mainly due to atomic operations
(maintaining the fence refcounts).

 * loss of object-level retirement callbacks, emulated by VMA retirement
tracking.

 * minor loss of object-level last activity information from debugfs,
could be replaced with per-vma information if desired

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-21-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 48 ++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 19 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index d234c28cbb9f..01a7fa513b4a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -196,6 +196,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	}
 
 	i915_gem_context_put(request->ctx);
+
+	dma_fence_signal(&request->fence);
 	i915_gem_request_put(request);
 }
 
@@ -553,33 +555,41 @@ i915_gem_request_await_object(struct drm_i915_gem_request *to,
 			      struct drm_i915_gem_object *obj,
 			      bool write)
 {
-	struct i915_gem_active *active;
-	unsigned long active_mask;
-	int idx;
+	struct dma_fence *excl;
+	int ret = 0;
 
 	if (write) {
-		active_mask = i915_gem_object_get_active(obj);
-		active = obj->last_read;
+		struct dma_fence **shared;
+		unsigned int count, i;
+
+		ret = reservation_object_get_fences_rcu(obj->resv,
+							&excl, &count, &shared);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < count; i++) {
+			ret = i915_gem_request_await_dma_fence(to, shared[i]);
+			if (ret)
+				break;
+
+			dma_fence_put(shared[i]);
+		}
+
+		for (; i < count; i++)
+			dma_fence_put(shared[i]);
+		kfree(shared);
 	} else {
-		active_mask = 1;
-		active = &obj->last_write;
+		excl = reservation_object_get_excl_rcu(obj->resv);
 	}
 
-	for_each_active(active_mask, idx) {
-		struct drm_i915_gem_request *request;
-		int ret;
-
-		request = i915_gem_active_peek(&active[idx],
-					       &obj->base.dev->struct_mutex);
-		if (!request)
-			continue;
+	if (excl) {
+		if (ret == 0)
+			ret = i915_gem_request_await_dma_fence(to, excl);
 
-		ret = i915_gem_request_await_request(to, request);
-		if (ret)
-			return ret;
+		dma_fence_put(excl);
 	}
 
-	return 0;
+	return ret;
 }
 
 static void i915_gem_mark_busy(const struct intel_engine_cs *engine)
-- 
cgit v1.2.1


From 73cb97010d4fdd2a29f00cac14d206c7641c23d2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:46 +0100
Subject: drm/i915: Combine seqno + tracking into a global timeline struct

Our timelines are more than just a seqno. They also provide an ordered
list of requests to be executed. Due to the restriction of handling
individual address spaces, we are limited to a timeline per address
space but we use a fence context per engine within.

Our first step to introducing independent timelines per context (i.e. to
allow each context to have a queue of requests to execute that have a
defined set of dependencies on other requests) is to provide a timeline
abstraction for the global execution queue.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-23-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 81 ++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 32 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 01a7fa513b4a..16d38f87f0a7 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -40,7 +40,7 @@ static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
 	 * multiple execution contexts (fence contexts) as we allow
 	 * engines within a single timeline to execute in parallel.
 	 */
-	return "global";
+	return to_request(fence)->timeline->common->name;
 }
 
 static bool i915_fence_signaled(struct dma_fence *fence)
@@ -211,7 +211,7 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req)
 		return;
 
 	do {
-		tmp = list_first_entry(&engine->request_list,
+		tmp = list_first_entry(&engine->timeline->requests,
 				       typeof(*tmp), link);
 
 		i915_gem_request_retire(tmp);
@@ -238,37 +238,39 @@ static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static int i915_gem_init_seqno(struct drm_i915_private *dev_priv, u32 seqno)
+static int i915_gem_init_global_seqno(struct drm_i915_private *dev_priv,
+				      u32 seqno)
 {
+	struct i915_gem_timeline *timeline = &dev_priv->gt.global_timeline;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 	int ret;
 
 	/* Carefully retire all requests without writing to the rings */
-	for_each_engine(engine, dev_priv, id) {
-		ret = intel_engine_idle(engine,
-					I915_WAIT_INTERRUPTIBLE |
-					I915_WAIT_LOCKED);
-		if (ret)
-			return ret;
-	}
+	ret = i915_gem_wait_for_idle(dev_priv,
+				     I915_WAIT_INTERRUPTIBLE |
+				     I915_WAIT_LOCKED);
+	if (ret)
+		return ret;
+
 	i915_gem_retire_requests(dev_priv);
 
 	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
-	if (!i915_seqno_passed(seqno, dev_priv->next_seqno)) {
+	if (!i915_seqno_passed(seqno, timeline->next_seqno)) {
 		while (intel_kick_waiters(dev_priv) ||
 		       intel_kick_signalers(dev_priv))
 			yield();
+		yield();
 	}
 
 	/* Finally reset hw state */
 	for_each_engine(engine, dev_priv, id)
-		intel_engine_init_seqno(engine, seqno);
+		intel_engine_init_global_seqno(engine, seqno);
 
 	return 0;
 }
 
-int i915_gem_set_seqno(struct drm_device *dev, u32 seqno)
+int i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	int ret;
@@ -281,28 +283,31 @@ int i915_gem_set_seqno(struct drm_device *dev, u32 seqno)
 	/* HWS page needs to be set less than what we
 	 * will inject to ring
 	 */
-	ret = i915_gem_init_seqno(dev_priv, seqno - 1);
+	ret = i915_gem_init_global_seqno(dev_priv, seqno - 1);
 	if (ret)
 		return ret;
 
-	dev_priv->next_seqno = seqno;
+	dev_priv->gt.global_timeline.next_seqno = seqno;
 	return 0;
 }
 
-static int i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno)
+static int i915_gem_get_global_seqno(struct drm_i915_private *dev_priv,
+				     u32 *seqno)
 {
+	struct i915_gem_timeline *tl = &dev_priv->gt.global_timeline;
+
 	/* reserve 0 for non-seqno */
-	if (unlikely(dev_priv->next_seqno == 0)) {
+	if (unlikely(tl->next_seqno == 0)) {
 		int ret;
 
-		ret = i915_gem_init_seqno(dev_priv, 0);
+		ret = i915_gem_init_global_seqno(dev_priv, 0);
 		if (ret)
 			return ret;
 
-		dev_priv->next_seqno = 1;
+		tl->next_seqno = 1;
 	}
 
-	*seqno = dev_priv->next_seqno++;
+	*seqno = tl->next_seqno++;
 	return 0;
 }
 
@@ -311,13 +316,14 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 {
 	struct drm_i915_gem_request *request =
 		container_of(fence, typeof(*request), submit);
+	struct intel_engine_cs *engine = request->engine;
 
 	/* Will be called from irq-context when using foreign DMA fences */
 
 	switch (state) {
 	case FENCE_COMPLETE:
-		request->engine->last_submitted_seqno = request->fence.seqno;
-		request->engine->submit_request(request);
+		engine->timeline->last_submitted_seqno = request->fence.seqno;
+		engine->submit_request(request);
 		break;
 
 	case FENCE_FREE:
@@ -357,7 +363,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		return ERR_PTR(ret);
 
 	/* Move the oldest request to the slab-cache (if not in use!) */
-	req = list_first_entry_or_null(&engine->request_list,
+	req = list_first_entry_or_null(&engine->timeline->requests,
 				       typeof(*req), link);
 	if (req && i915_gem_request_completed(req))
 		i915_gem_request_retire(req);
@@ -394,15 +400,17 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 
-	ret = i915_gem_get_seqno(dev_priv, &seqno);
+	ret = i915_gem_get_global_seqno(dev_priv, &seqno);
 	if (ret)
 		goto err;
 
+	req->timeline = engine->timeline;
+
 	spin_lock_init(&req->lock);
 	dma_fence_init(&req->fence,
 		       &i915_fence_ops,
 		       &req->lock,
-		       engine->fence_context,
+		       req->timeline->fence_context,
 		       seqno);
 
 	i915_sw_fence_init(&req->submit, submit_notify);
@@ -457,9 +465,16 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 
 	GEM_BUG_ON(to == from);
 
-	if (to->engine == from->engine)
+	if (to->timeline == from->timeline)
 		return 0;
 
+	if (to->engine == from->engine) {
+		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
+						       &from->submit,
+						       GFP_KERNEL);
+		return ret < 0 ? ret : 0;
+	}
+
 	idx = intel_engine_sync_index(from->engine, to->engine);
 	if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
 		return 0;
@@ -622,6 +637,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 {
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_ring *ring = request->ring;
+	struct intel_timeline *timeline = request->timeline;
 	struct drm_i915_gem_request *prev;
 	u32 request_start;
 	u32 reserved_tail;
@@ -679,17 +695,17 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * see a more recent value in the hws than we are tracking.
 	 */
 
-	prev = i915_gem_active_raw(&engine->last_request,
+	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
 	if (prev)
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
 
 	request->emitted_jiffies = jiffies;
-	request->previous_seqno = engine->last_pending_seqno;
-	engine->last_pending_seqno = request->fence.seqno;
-	i915_gem_active_set(&engine->last_request, request);
-	list_add_tail(&request->link, &engine->request_list);
+	request->previous_seqno = timeline->last_pending_seqno;
+	timeline->last_pending_seqno = request->fence.seqno;
+	i915_gem_active_set(&timeline->last_request, request);
+	list_add_tail(&request->link, &timeline->requests);
 	list_add_tail(&request->ring_link, &ring->request_list);
 
 	i915_gem_mark_busy(engine);
@@ -899,7 +915,8 @@ static bool engine_retire_requests(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *request, *next;
 
-	list_for_each_entry_safe(request, next, &engine->request_list, link) {
+	list_for_each_entry_safe(request, next,
+				 &engine->timeline->requests, link) {
 		if (!i915_gem_request_completed(request))
 			return false;
 
-- 
cgit v1.2.1


From 4680816be3362bdf6ac712cbdc6098c76febe78f Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:48 +0100
Subject: drm/i915: Wait first for submission, before waiting for request
 completion

In future patches, we will no longer be able to wait on a static global
seqno and instead have to break our wait up into phases. First we wait
for the global seqno assignment (upon submission to hardware), and once
submitted we wait for the hardware to complete.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-25-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 51 +++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 16d38f87f0a7..03ae85a1eefb 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -789,6 +789,49 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 	return false;
 }
 
+static long
+__i915_request_wait_for_submit(struct drm_i915_gem_request *request,
+			       unsigned int flags,
+			       long timeout)
+{
+	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
+		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
+	wait_queue_head_t *q = &request->i915->gpu_error.wait_queue;
+	DEFINE_WAIT(reset);
+	DEFINE_WAIT(wait);
+
+	if (flags & I915_WAIT_LOCKED)
+		add_wait_queue(q, &reset);
+
+	do {
+		prepare_to_wait(&request->submit.wait, &wait, state);
+
+		if (i915_sw_fence_done(&request->submit))
+			break;
+
+		if (flags & I915_WAIT_LOCKED &&
+		    i915_reset_in_progress(&request->i915->gpu_error)) {
+			__set_current_state(TASK_RUNNING);
+			i915_reset(request->i915);
+			reset_wait_queue(q, &reset);
+			continue;
+		}
+
+		if (signal_pending_state(state, current)) {
+			timeout = -ERESTARTSYS;
+			break;
+		}
+
+		timeout = io_schedule_timeout(timeout);
+	} while (timeout);
+	finish_wait(&request->submit.wait, &wait);
+
+	if (flags & I915_WAIT_LOCKED)
+		remove_wait_queue(q, &reset);
+
+	return timeout;
+}
+
 /**
  * i915_wait_request - wait until execution of request has finished
  * @req: the request to wait upon
@@ -833,6 +876,14 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 
 	trace_i915_gem_request_wait_begin(req);
 
+	if (!i915_sw_fence_done(&req->submit)) {
+		timeout = __i915_request_wait_for_submit(req, flags, timeout);
+		if (timeout < 0)
+			goto complete;
+
+		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
+	}
+
 	/* Optimistic short spin before touching IRQs */
 	if (i915_spin_request(req, state, 5))
 		goto complete;
-- 
cgit v1.2.1


From 65e4760e3920c21073a9d737929dc36df561380f Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:49 +0100
Subject: drm/i915: Introduce a global_seqno for each request

Though we will have multiple timelines, we still have a single timeline
of execution. This we can use to provide an execution and retirement order
of requests. This keeps tracking execution of requests simple, and is
vital for preserving a single waiter (i.e. so that we can order the
waiters so that only the earliest to wake up need be woken). To accomplish
this we distinguish the seqno used to order requests per-context
(external) from that used internally for execution.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-26-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 03ae85a1eefb..311cf3fac2e0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -376,7 +376,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 * of being read by __i915_gem_active_get_rcu(). As such,
 	 * we have to be very careful when overwriting the contents. During
 	 * the RCU lookup, we change chase the request->engine pointer,
-	 * read the request->fence.seqno and increment the reference count.
+	 * read the request->global_seqno and increment the reference count.
 	 *
 	 * The reference count is incremented atomically. If it is zero,
 	 * the lookup knows the request is unallocated and complete. Otherwise,
@@ -418,6 +418,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
+	req->global_seqno = seqno;
 	req->ctx = i915_gem_context_get(ctx);
 
 	/* No zalloc, must clear what we need by hand */
@@ -475,8 +476,15 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 		return ret < 0 ? ret : 0;
 	}
 
+	if (!from->global_seqno) {
+		ret = i915_sw_fence_await_dma_fence(&to->submit,
+						    &from->fence, 0,
+						    GFP_KERNEL);
+		return ret < 0 ? ret : 0;
+	}
+
 	idx = intel_engine_sync_index(from->engine, to->engine);
-	if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
+	if (from->global_seqno <= from->engine->semaphore.sync_seqno[idx])
 		return 0;
 
 	trace_i915_gem_ring_sync_to(to, from);
@@ -494,7 +502,7 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 			return ret;
 	}
 
-	from->engine->semaphore.sync_seqno[idx] = from->fence.seqno;
+	from->engine->semaphore.sync_seqno[idx] = from->global_seqno;
 	return 0;
 }
 
@@ -774,7 +782,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 
 	timeout_us += local_clock_us(&cpu);
 	do {
-		if (i915_gem_request_completed(req))
+		if (__i915_gem_request_completed(req))
 			return true;
 
 		if (signal_pending_state(state, current))
@@ -883,6 +891,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 
 		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
 	}
+	GEM_BUG_ON(!req->global_seqno);
 
 	/* Optimistic short spin before touching IRQs */
 	if (i915_spin_request(req, state, 5))
@@ -892,7 +901,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 	if (flags & I915_WAIT_LOCKED)
 		add_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
 
-	intel_wait_init(&wait, req->fence.seqno);
+	intel_wait_init(&wait, req->global_seqno);
 	if (intel_engine_add_wait(req->engine, &wait))
 		/* In order to check that we haven't missed the interrupt
 		 * as we enabled it, we need to kick ourselves to do a
-- 
cgit v1.2.1


From 9b81d556b11fe58827dcd87bc5deaf8da2f716ae Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:50 +0100
Subject: drm/i915: Rename ->emit_request to ->emit_breadcrumb

Now that the emission of the request tail and its submission to hardware
are two separate steps, engine->emit_request() is confusing.
engine->emit_request() is called to emit the breadcrumb commands for the
request into the ring, so name it as such (engine->emit_breadcrumb).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-27-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 311cf3fac2e0..a626b2638722 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -685,8 +685,8 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	request->postfix = ring->tail;
 
 	/* Not allowed to fail! */
-	ret = engine->emit_request(request);
-	WARN(ret, "(%s)->emit_request failed: %d!\n", engine->name, ret);
+	ret = engine->emit_breadcrumb(request);
+	WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret);
 
 	/* Sanity check that the reserved size was large enough. */
 	ret = ring->tail - request_start;
-- 
cgit v1.2.1


From 98f29e8d908f2b9e3d966f6f7d63cd69b4aaf0a2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:51 +0100
Subject: drm/i915: Record space required for breadcrumb emission

In the next patch, we will use deferred breadcrumb emission. That requires
reserving sufficient space in the ringbuffer to emit the breadcrumb, which
first requires us to know how large the breadcrumb is.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-28-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index a626b2638722..be9e23b32e4a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -434,6 +434,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 * away, e.g. because a GPU scheduler has deferred it.
 	 */
 	req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST;
+	GEM_BUG_ON(req->reserved_space < engine->emit_breadcrumb_sz);
 
 	if (i915.enable_execlists)
 		ret = intel_logical_ring_alloc_request_extras(req);
-- 
cgit v1.2.1


From caddfe7192f5e74d65ebcfdae614f99e8fd87222 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:52 +0100
Subject: drm/i915: Defer breadcrumb emission

Move the actual emission of the breadcrumb for closing the request from
i915_add_request() to the submit callback. (It can be moved later when
required.) This allows us to defer the allocation of the global_seqno
from request construction to actual submission, allowing us to emit the
requests out of order (with respect to the order of their construction;
they will still only be executed once all of their dependencies are
resolved, including that all earlier requests on their timeline have been
submitted). We have to specialise how we then emit the request in order
to write into the preallocated space, rather than at the tail of the
ringbuffer (which will have been advanced by the addition of new
requests).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-29-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 41 +++++++++++----------------------
 1 file changed, 13 insertions(+), 28 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index be9e23b32e4a..06daa4d203a7 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -318,17 +318,16 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 
+	if (state != FENCE_COMPLETE)
+		return NOTIFY_DONE;
+
 	/* Will be called from irq-context when using foreign DMA fences */
 
-	switch (state) {
-	case FENCE_COMPLETE:
-		engine->timeline->last_submitted_seqno = request->fence.seqno;
-		engine->submit_request(request);
-		break;
+	engine->timeline->last_submitted_seqno = request->fence.seqno;
 
-	case FENCE_FREE:
-		break;
-	}
+	engine->emit_breadcrumb(request,
+				request->ring->vaddr + request->postfix);
+	engine->submit_request(request);
 
 	return NOTIFY_DONE;
 }
@@ -648,9 +647,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	struct intel_ring *ring = request->ring;
 	struct intel_timeline *timeline = request->timeline;
 	struct drm_i915_gem_request *prev;
-	u32 request_start;
-	u32 reserved_tail;
-	int ret;
+	int err;
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	trace_i915_gem_request_add(request);
@@ -660,8 +657,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * should already have been reserved in the ring buffer. Let the ring
 	 * know that it is time to use that space up.
 	 */
-	request_start = ring->tail;
-	reserved_tail = request->reserved_space;
 	request->reserved_space = 0;
 
 	/*
@@ -672,10 +667,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * what.
 	 */
 	if (flush_caches) {
-		ret = engine->emit_flush(request, EMIT_FLUSH);
+		err = engine->emit_flush(request, EMIT_FLUSH);
 
 		/* Not allowed to fail! */
-		WARN(ret, "engine->emit_flush() failed: %d!\n", ret);
+		WARN(err, "engine->emit_flush() failed: %d!\n", err);
 	}
 
 	/* Record the position of the start of the breadcrumb so that
@@ -683,20 +678,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * GPU processing the request, we never over-estimate the
 	 * position of the ring's HEAD.
 	 */
+	err = intel_ring_begin(request, engine->emit_breadcrumb_sz);
+	GEM_BUG_ON(err);
 	request->postfix = ring->tail;
-
-	/* Not allowed to fail! */
-	ret = engine->emit_breadcrumb(request);
-	WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret);
-
-	/* Sanity check that the reserved size was large enough. */
-	ret = ring->tail - request_start;
-	if (ret < 0)
-		ret += ring->size;
-	WARN_ONCE(ret > reserved_tail,
-		  "Not enough space reserved (%d bytes) "
-		  "for adding the request (%d bytes)\n",
-		  reserved_tail, ret);
+	ring->tail += engine->emit_breadcrumb_sz * sizeof(u32);
 
 	/* Seal the request and mark it as pending execution. Note that
 	 * we may inspect this state, without holding any locks, during
-- 
cgit v1.2.1


From 85e17f5974b357bc4a127be09de71b430be265e0 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:53 +0100
Subject: drm/i915: Move the global sync optimisation to the timeline

Currently we try to reduce the number of synchronisations (now the
number of requests we need to wait upon) by noting that if we have
earlier waited upon a request, all subsequent requests in the timeline
will be after the wait. This only applies to requests in this timeline,
as other timelines will not be ordered by that waiter.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-30-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 06daa4d203a7..9c34a4c540b5 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -238,35 +238,41 @@ static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static int i915_gem_init_global_seqno(struct drm_i915_private *dev_priv,
-				      u32 seqno)
+static int i915_gem_init_global_seqno(struct drm_i915_private *i915, u32 seqno)
 {
-	struct i915_gem_timeline *timeline = &dev_priv->gt.global_timeline;
+	struct i915_gem_timeline *timeline = &i915->gt.global_timeline;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 	int ret;
 
 	/* Carefully retire all requests without writing to the rings */
-	ret = i915_gem_wait_for_idle(dev_priv,
+	ret = i915_gem_wait_for_idle(i915,
 				     I915_WAIT_INTERRUPTIBLE |
 				     I915_WAIT_LOCKED);
 	if (ret)
 		return ret;
 
-	i915_gem_retire_requests(dev_priv);
+	i915_gem_retire_requests(i915);
 
 	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
 	if (!i915_seqno_passed(seqno, timeline->next_seqno)) {
-		while (intel_kick_waiters(dev_priv) ||
-		       intel_kick_signalers(dev_priv))
+		while (intel_kick_waiters(i915) || intel_kick_signalers(i915))
 			yield();
 		yield();
 	}
 
 	/* Finally reset hw state */
-	for_each_engine(engine, dev_priv, id)
+	for_each_engine(engine, i915, id)
 		intel_engine_init_global_seqno(engine, seqno);
 
+	list_for_each_entry(timeline, &i915->gt.timelines, link) {
+		for_each_engine(engine, i915, id) {
+			struct intel_timeline *tl = &timeline->engine[id];
+
+			memset(tl->sync_seqno, 0, sizeof(tl->sync_seqno));
+		}
+	}
+
 	return 0;
 }
 
@@ -462,7 +468,7 @@ static int
 i915_gem_request_await_request(struct drm_i915_gem_request *to,
 			       struct drm_i915_gem_request *from)
 {
-	int idx, ret;
+	int ret;
 
 	GEM_BUG_ON(to == from);
 
@@ -483,8 +489,7 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 		return ret < 0 ? ret : 0;
 	}
 
-	idx = intel_engine_sync_index(from->engine, to->engine);
-	if (from->global_seqno <= from->engine->semaphore.sync_seqno[idx])
+	if (from->global_seqno <= to->timeline->sync_seqno[from->engine->id])
 		return 0;
 
 	trace_i915_gem_ring_sync_to(to, from);
@@ -502,7 +507,7 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 			return ret;
 	}
 
-	from->engine->semaphore.sync_seqno[idx] = from->global_seqno;
+	to->timeline->sync_seqno[from->engine->id] = from->global_seqno;
 	return 0;
 }
 
-- 
cgit v1.2.1


From 28176ef4cfa510e5f1498bbf39ff1e4afd0b085d Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:56 +0100
Subject: drm/i915: Reserve space in the global seqno during request allocation

A restriction on our global seqnos is that they cannot wrap, and that we
cannot use the value 0. This allows us to detect when a request has not
yet been submitted (its global seqno is still 0), and ensures that
hardware semaphores are monotonic as required by older hardware. To
meet these restrictions when we defer the assignment of the global
seqno, we must check that we have an available slot in the global seqno
space during request construction. If that test fails, we wait for all
requests to be completed and reset the hardware back to 0.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-33-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 86 +++++++++++++++++----------------
 1 file changed, 45 insertions(+), 41 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 9c34a4c540b5..9b22f66464f0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -159,6 +159,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	 */
 	list_del(&request->ring_link);
 	request->ring->last_retired_head = request->postfix;
+	request->i915->gt.active_requests--;
 
 	/* Walk through the active list, calling retire on each. This allows
 	 * objects to track their GPU activity and mark themselves as idle
@@ -253,13 +254,15 @@ static int i915_gem_init_global_seqno(struct drm_i915_private *i915, u32 seqno)
 		return ret;
 
 	i915_gem_retire_requests(i915);
+	GEM_BUG_ON(i915->gt.active_requests > 1);
 
 	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
-	if (!i915_seqno_passed(seqno, timeline->next_seqno)) {
+	if (!i915_seqno_passed(seqno, atomic_read(&timeline->next_seqno))) {
 		while (intel_kick_waiters(i915) || intel_kick_signalers(i915))
 			yield();
 		yield();
 	}
+	atomic_set(&timeline->next_seqno, seqno);
 
 	/* Finally reset hw state */
 	for_each_engine(engine, i915, id)
@@ -279,7 +282,6 @@ static int i915_gem_init_global_seqno(struct drm_i915_private *i915, u32 seqno)
 int i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
@@ -289,34 +291,33 @@ int i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno)
 	/* HWS page needs to be set less than what we
 	 * will inject to ring
 	 */
-	ret = i915_gem_init_global_seqno(dev_priv, seqno - 1);
-	if (ret)
-		return ret;
-
-	dev_priv->gt.global_timeline.next_seqno = seqno;
-	return 0;
+	return i915_gem_init_global_seqno(dev_priv, seqno - 1);
 }
 
-static int i915_gem_get_global_seqno(struct drm_i915_private *dev_priv,
-				     u32 *seqno)
+static int reserve_global_seqno(struct drm_i915_private *i915)
 {
-	struct i915_gem_timeline *tl = &dev_priv->gt.global_timeline;
-
-	/* reserve 0 for non-seqno */
-	if (unlikely(tl->next_seqno == 0)) {
-		int ret;
+	u32 active_requests = ++i915->gt.active_requests;
+	u32 next_seqno = atomic_read(&i915->gt.global_timeline.next_seqno);
+	int ret;
 
-		ret = i915_gem_init_global_seqno(dev_priv, 0);
-		if (ret)
-			return ret;
+	/* Reservation is fine until we need to wrap around */
+	if (likely(next_seqno + active_requests > next_seqno))
+		return 0;
 
-		tl->next_seqno = 1;
+	ret = i915_gem_init_global_seqno(i915, 0);
+	if (ret) {
+		i915->gt.active_requests--;
+		return ret;
 	}
 
-	*seqno = tl->next_seqno++;
 	return 0;
 }
 
+static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
+{
+	return atomic_inc_return(&tl->next_seqno);
+}
+
 static int __i915_sw_fence_call
 submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 {
@@ -356,9 +357,10 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 	struct drm_i915_gem_request *req;
-	u32 seqno;
 	int ret;
 
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
 	/* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
 	 * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
 	 * and restart.
@@ -367,6 +369,10 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	if (ret)
 		return ERR_PTR(ret);
 
+	ret = reserve_global_seqno(dev_priv);
+	if (ret)
+		return ERR_PTR(ret);
+
 	/* Move the oldest request to the slab-cache (if not in use!) */
 	req = list_first_entry_or_null(&engine->timeline->requests,
 				       typeof(*req), link);
@@ -402,12 +408,10 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 * Do not use kmem_cache_zalloc() here!
 	 */
 	req = kmem_cache_alloc(dev_priv->requests, GFP_KERNEL);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
-
-	ret = i915_gem_get_global_seqno(dev_priv, &seqno);
-	if (ret)
-		goto err;
+	if (!req) {
+		ret = -ENOMEM;
+		goto err_unreserve;
+	}
 
 	req->timeline = engine->timeline;
 
@@ -416,14 +420,14 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       &i915_fence_ops,
 		       &req->lock,
 		       req->timeline->fence_context,
-		       seqno);
+		       timeline_get_seqno(req->timeline->common));
 
 	i915_sw_fence_init(&req->submit, submit_notify);
 
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
-	req->global_seqno = seqno;
+	req->global_seqno = req->fence.seqno;
 	req->ctx = i915_gem_context_get(ctx);
 
 	/* No zalloc, must clear what we need by hand */
@@ -459,8 +463,9 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 
 err_ctx:
 	i915_gem_context_put(ctx);
-err:
 	kmem_cache_free(dev_priv->requests, req);
+err_unreserve:
+	dev_priv->gt.active_requests--;
 	return ERR_PTR(ret);
 }
 
@@ -624,7 +629,6 @@ static void i915_gem_mark_busy(const struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
-	dev_priv->gt.active_engines |= intel_engine_flag(engine);
 	if (dev_priv->gt.awake)
 		return;
 
@@ -700,6 +704,9 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
 
+	GEM_BUG_ON(i915_seqno_passed(timeline->last_submitted_seqno,
+				     request->fence.seqno));
+
 	request->emitted_jiffies = jiffies;
 	request->previous_seqno = timeline->last_pending_seqno;
 	timeline->last_pending_seqno = request->fence.seqno;
@@ -962,38 +969,35 @@ complete:
 	return timeout;
 }
 
-static bool engine_retire_requests(struct intel_engine_cs *engine)
+static void engine_retire_requests(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *request, *next;
 
 	list_for_each_entry_safe(request, next,
 				 &engine->timeline->requests, link) {
 		if (!i915_gem_request_completed(request))
-			return false;
+			return;
 
 		i915_gem_request_retire(request);
 	}
-
-	return true;
 }
 
 void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
 {
 	struct intel_engine_cs *engine;
-	unsigned int tmp;
+	enum intel_engine_id id;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
-	if (dev_priv->gt.active_engines == 0)
+	if (!dev_priv->gt.active_requests)
 		return;
 
 	GEM_BUG_ON(!dev_priv->gt.awake);
 
-	for_each_engine_masked(engine, dev_priv, dev_priv->gt.active_engines, tmp)
-		if (engine_retire_requests(engine))
-			dev_priv->gt.active_engines &= ~intel_engine_flag(engine);
+	for_each_engine(engine, dev_priv, id)
+		engine_retire_requests(engine);
 
-	if (dev_priv->gt.active_engines == 0)
+	if (!dev_priv->gt.active_requests)
 		queue_delayed_work(dev_priv->wq,
 				   &dev_priv->gt.idle_work,
 				   msecs_to_jiffies(100));
-- 
cgit v1.2.1


From f2d13290e3275df34c0cd625fbc665965af08c67 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:57 +0100
Subject: drm/i915: Defer setting of global seqno on request to submission

Defer the assignment of the global seqno on a request to its submission.
In the next patch, we will only allocate the global seqno at that time;
here we are just enabling the wait-for-submission before wait-for-seqno
paths.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-34-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 9b22f66464f0..7499e3b205c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -324,14 +324,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	struct drm_i915_gem_request *request =
 		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
+	struct intel_timeline *timeline;
+	u32 seqno;
 
 	if (state != FENCE_COMPLETE)
 		return NOTIFY_DONE;
 
 	/* Will be called from irq-context when using foreign DMA fences */
 
-	engine->timeline->last_submitted_seqno = request->fence.seqno;
+	timeline = request->timeline;
 
+	seqno = request->fence.seqno;
+	GEM_BUG_ON(!seqno);
+	GEM_BUG_ON(i915_seqno_passed(intel_engine_get_seqno(engine), seqno));
+
+	GEM_BUG_ON(i915_seqno_passed(timeline->last_submitted_seqno, seqno));
+	request->previous_seqno = timeline->last_submitted_seqno;
+	timeline->last_submitted_seqno = seqno;
+
+	/* We may be recursing from the signal callback of another i915 fence */
+	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+	request->global_seqno = seqno;
+	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
+		intel_engine_enable_signaling(request);
+	spin_unlock(&request->lock);
+
+	GEM_BUG_ON(!request->global_seqno);
 	engine->emit_breadcrumb(request,
 				request->ring->vaddr + request->postfix);
 	engine->submit_request(request);
@@ -427,10 +445,10 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
-	req->global_seqno = req->fence.seqno;
 	req->ctx = i915_gem_context_get(ctx);
 
 	/* No zalloc, must clear what we need by hand */
+	req->global_seqno = 0;
 	req->previous_context = NULL;
 	req->file_priv = NULL;
 	req->batch = NULL;
@@ -704,15 +722,13 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
 
-	GEM_BUG_ON(i915_seqno_passed(timeline->last_submitted_seqno,
-				     request->fence.seqno));
+	list_add_tail(&request->link, &timeline->requests);
 
-	request->emitted_jiffies = jiffies;
-	request->previous_seqno = timeline->last_pending_seqno;
 	timeline->last_pending_seqno = request->fence.seqno;
 	i915_gem_active_set(&timeline->last_request, request);
-	list_add_tail(&request->link, &timeline->requests);
+
 	list_add_tail(&request->ring_link, &ring->request_list);
+	request->emitted_jiffies = jiffies;
 
 	i915_gem_mark_busy(engine);
 
-- 
cgit v1.2.1


From 80b204bce8f27b52cd65839e0e6144b4452ae3de Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 28 Oct 2016 13:58:58 +0100
Subject: drm/i915: Enable multiple timelines

With the infrastructure converted over to tracking multiple timelines in
the GEM API whilst preserving the efficiency of using a single execution
timeline internally, we can now assign a separate timeline to every
context with full-ppgtt.

v2: Add a comment to indicate the xfer between timelines upon submission.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-35-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 61 ++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 28 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 7499e3b205c6..79b0046d9a57 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -34,12 +34,6 @@ static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 
 static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
 {
-	/* Timelines are bound by eviction to a VM. However, since
-	 * we only have a global seqno at the moment, we only have
-	 * a single timeline. Note that each timeline will have
-	 * multiple execution contexts (fence contexts) as we allow
-	 * engines within a single timeline to execute in parallel.
-	 */
 	return to_request(fence)->timeline->common->name;
 }
 
@@ -64,18 +58,6 @@ static signed long i915_fence_wait(struct dma_fence *fence,
 	return i915_wait_request(to_request(fence), interruptible, timeout);
 }
 
-static void i915_fence_value_str(struct dma_fence *fence, char *str, int size)
-{
-	snprintf(str, size, "%u", fence->seqno);
-}
-
-static void i915_fence_timeline_value_str(struct dma_fence *fence, char *str,
-					  int size)
-{
-	snprintf(str, size, "%u",
-		 intel_engine_get_seqno(to_request(fence)->engine));
-}
-
 static void i915_fence_release(struct dma_fence *fence)
 {
 	struct drm_i915_gem_request *req = to_request(fence);
@@ -90,8 +72,6 @@ const struct dma_fence_ops i915_fence_ops = {
 	.signaled = i915_fence_signaled,
 	.wait = i915_fence_wait,
 	.release = i915_fence_release,
-	.fence_value_str = i915_fence_value_str,
-	.timeline_value_str = i915_fence_timeline_value_str,
 };
 
 int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
@@ -147,7 +127,10 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	GEM_BUG_ON(!i915_gem_request_completed(request));
 
 	trace_i915_gem_request_retire(request);
+
+	spin_lock_irq(&request->engine->timeline->lock);
 	list_del_init(&request->link);
+	spin_unlock_irq(&request->engine->timeline->lock);
 
 	/* We know the GPU must have read the request to have
 	 * sent us the seqno + interrupt, so use the position
@@ -313,6 +296,12 @@ static int reserve_global_seqno(struct drm_i915_private *i915)
 	return 0;
 }
 
+static u32 __timeline_get_seqno(struct i915_gem_timeline *tl)
+{
+	/* next_seqno only incremented under a mutex */
+	return ++tl->next_seqno.counter;
+}
+
 static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
 {
 	return atomic_inc_return(&tl->next_seqno);
@@ -325,16 +314,20 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
+	unsigned long flags;
 	u32 seqno;
 
 	if (state != FENCE_COMPLETE)
 		return NOTIFY_DONE;
 
-	/* Will be called from irq-context when using foreign DMA fences */
+	/* Transfer from per-context onto the global per-engine timeline */
+	timeline = engine->timeline;
+	GEM_BUG_ON(timeline == request->timeline);
 
-	timeline = request->timeline;
+	/* Will be called from irq-context when using foreign DMA fences */
+	spin_lock_irqsave(&timeline->lock, flags);
 
-	seqno = request->fence.seqno;
+	seqno = timeline_get_seqno(timeline->common);
 	GEM_BUG_ON(!seqno);
 	GEM_BUG_ON(i915_seqno_passed(intel_engine_get_seqno(engine), seqno));
 
@@ -354,6 +347,12 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 				request->ring->vaddr + request->postfix);
 	engine->submit_request(request);
 
+	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+	list_move_tail(&request->link, &timeline->requests);
+	spin_unlock(&request->timeline->lock);
+
+	spin_unlock_irqrestore(&timeline->lock, flags);
+
 	return NOTIFY_DONE;
 }
 
@@ -394,7 +393,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	/* Move the oldest request to the slab-cache (if not in use!) */
 	req = list_first_entry_or_null(&engine->timeline->requests,
 				       typeof(*req), link);
-	if (req && i915_gem_request_completed(req))
+	if (req && __i915_gem_request_completed(req))
 		i915_gem_request_retire(req);
 
 	/* Beware: Dragons be flying overhead.
@@ -431,14 +430,15 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		goto err_unreserve;
 	}
 
-	req->timeline = engine->timeline;
+	req->timeline = i915_gem_context_lookup_timeline(ctx, engine);
+	GEM_BUG_ON(req->timeline == engine->timeline);
 
 	spin_lock_init(&req->lock);
 	dma_fence_init(&req->fence,
 		       &i915_fence_ops,
 		       &req->lock,
 		       req->timeline->fence_context,
-		       timeline_get_seqno(req->timeline->common));
+		       __timeline_get_seqno(req->timeline->common));
 
 	i915_sw_fence_init(&req->submit, submit_notify);
 
@@ -722,9 +722,14 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
 
+	spin_lock_irq(&timeline->lock);
 	list_add_tail(&request->link, &timeline->requests);
+	spin_unlock_irq(&timeline->lock);
+
+	GEM_BUG_ON(i915_seqno_passed(timeline->last_submitted_seqno,
+				     request->fence.seqno));
 
-	timeline->last_pending_seqno = request->fence.seqno;
+	timeline->last_submitted_seqno = request->fence.seqno;
 	i915_gem_active_set(&timeline->last_request, request);
 
 	list_add_tail(&request->ring_link, &ring->request_list);
@@ -991,7 +996,7 @@ static void engine_retire_requests(struct intel_engine_cs *engine)
 
 	list_for_each_entry_safe(request, next,
 				 &engine->timeline->requests, link) {
-		if (!i915_gem_request_completed(request))
+		if (!__i915_gem_request_completed(request))
 			return;
 
 		i915_gem_request_retire(request);
-- 
cgit v1.2.1


From 5bd11a34e46afa1048bd5330673fb1508183f6a5 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Mon, 7 Nov 2016 11:20:02 +0200
Subject: drm/i915: Avoid early GPU idling due to already pending idle work

At the moment, if an idle work handler is already pending but hasn't yet
started to run, retiring a new request will not extend the active period
as required; it simply leaves the pending idle work to be scheduled
at the original expiration time. This may lead to idling the GPU too
early. Fix this by using the delayed-work scheduler alternative which
makes sure the handler's expiration time is extended in this case.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Requested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1478510405-11799-1-git-send-email-imre.deak@intel.com
---
 drivers/gpu/drm/i915/i915_gem_request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 79b0046d9a57..0b3b051a5683 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -1019,7 +1019,7 @@ void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
 		engine_retire_requests(engine);
 
 	if (!dev_priv->gt.active_requests)
-		queue_delayed_work(dev_priv->wq,
-				   &dev_priv->gt.idle_work,
-				   msecs_to_jiffies(100));
+		mod_delayed_work(dev_priv->wq,
+				 &dev_priv->gt.idle_work,
+				 msecs_to_jiffies(100));
 }
-- 
cgit v1.2.1


From 6a5d1db98ef1da5c632640133898aa42d6916c6c Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 8 Nov 2016 14:37:19 +0000
Subject: drm/i915: Spin until breadcrumb threads are complete

When we need to reset the global seqno on wraparound, we have to wait
until the current rbtrees are drained (or otherwise the next waiter will
be out of sequence). The current mechanism to kick and spin until
complete may exit too early, as it would break if the target thread was
currently running. Instead, we must wake up the threads, but keep
spinning until the trees have been deleted.

In order to appease Tvrtko, busy spin rather than yield().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161108143719.32215-1-chris@chris-wilson.co.uk
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 0b3b051a5683..5050464c5401 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -241,9 +241,8 @@ static int i915_gem_init_global_seqno(struct drm_i915_private *i915, u32 seqno)
 
 	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
 	if (!i915_seqno_passed(seqno, atomic_read(&timeline->next_seqno))) {
-		while (intel_kick_waiters(i915) || intel_kick_signalers(i915))
-			yield();
-		yield();
+		while (intel_breadcrumbs_busy(i915))
+			cond_resched(); /* spin until threads are complete */
 	}
 	atomic_set(&timeline->next_seqno, seqno);
 
-- 
cgit v1.2.1


From bb89485e999181a329cafa8e798b6bbf10c1a52a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:40:57 +0000
Subject: drm/i915: Create distinct lockclasses for execution vs user timelines

In order to simplify the lockdep annotations, as they become more complex
in the future with deferred execution and multiple paths through the
same functions, create a separate lockclass for the user timeline and
the hardware execution timeline.

We should only ever be locking the user timeline and the execution
timeline in parallel so we only need to create two lock classes, rather
than a separate class for every timeline.

v2: Rename the lock classes to be more consistent with other lockdep.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-2-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 5050464c5401..f25b537d6e64 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -346,7 +346,7 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 				request->ring->vaddr + request->postfix);
 	engine->submit_request(request);
 
-	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
-- 
cgit v1.2.1


From 23902e49c999353f75f15b7d8483bff70746b97d Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:40:58 +0000
Subject: drm/i915: Split request submit/execute phase into two

In order to support deferred scheduling, we need to differentiate
between when the request is ready to run (i.e. the submit fence is
signaled) and when the request is actually run (a new execute fence).
This is typically split between the request itself wanting to wait upon
others (for which we use the submit fence) and the CPU wanting to wait
upon the request, for which we use the execute fence to be sure the
hardware is ready to signal completion.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-3-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index f25b537d6e64..d0f6b9f82636 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -350,11 +350,19 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
+	i915_sw_fence_commit(&request->execute);
+
 	spin_unlock_irqrestore(&timeline->lock, flags);
 
 	return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+execute_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	return NOTIFY_DONE;
+}
+
 /**
  * i915_gem_request_alloc - allocate a request structure
  *
@@ -440,6 +448,12 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       __timeline_get_seqno(req->timeline->common));
 
 	i915_sw_fence_init(&req->submit, submit_notify);
+	i915_sw_fence_init(&req->execute, execute_notify);
+	/* Ensure that the execute fence completes after the submit fence -
+	 * as we complete the execute fence from within the submit fence
+	 * callback, its completion would otherwise be visible first.
+	 */
+	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
@@ -816,9 +830,9 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 }
 
 static long
-__i915_request_wait_for_submit(struct drm_i915_gem_request *request,
-			       unsigned int flags,
-			       long timeout)
+__i915_request_wait_for_execute(struct drm_i915_gem_request *request,
+				unsigned int flags,
+				long timeout)
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
@@ -830,9 +844,9 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 		add_wait_queue(q, &reset);
 
 	do {
-		prepare_to_wait(&request->submit.wait, &wait, state);
+		prepare_to_wait(&request->execute.wait, &wait, state);
 
-		if (i915_sw_fence_done(&request->submit))
+		if (i915_sw_fence_done(&request->execute))
 			break;
 
 		if (flags & I915_WAIT_LOCKED &&
@@ -850,7 +864,7 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 
 		timeout = io_schedule_timeout(timeout);
 	} while (timeout);
-	finish_wait(&request->submit.wait, &wait);
+	finish_wait(&request->execute.wait, &wait);
 
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(q, &reset);
@@ -902,13 +916,14 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 
 	trace_i915_gem_request_wait_begin(req);
 
-	if (!i915_sw_fence_done(&req->submit)) {
-		timeout = __i915_request_wait_for_submit(req, flags, timeout);
+	if (!i915_sw_fence_done(&req->execute)) {
+		timeout = __i915_request_wait_for_execute(req, flags, timeout);
 		if (timeout < 0)
 			goto complete;
 
-		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
+		GEM_BUG_ON(!i915_sw_fence_done(&req->execute));
 	}
+	GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
 	GEM_BUG_ON(!req->global_seqno);
 
 	/* Optimistic short spin before touching IRQs */
-- 
cgit v1.2.1


From d55ac5bf97c6b00539526e2aad8c938376681786 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:40:59 +0000
Subject: drm/i915: Defer transfer onto execution timeline to actual hw
 submission

Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is finally submitted to hardware
when the hardware is ready, and only put onto the hardware queue when
the request is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order if we decide to queue the
requests onto the hardware in a different order than FIFO.

v2: Rebased onto distinct global/user timeline lock classes.
v3: Play with the position of the spin_lock().
v4: Nesting finally resolved with distinct sw_fence lock classes.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-4-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 38 ++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index d0f6b9f82636..952d2aec5244 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -306,25 +306,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
 	return atomic_inc_return(&tl->next_seqno);
 }
 
-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request =
-		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
-	unsigned long flags;
 	u32 seqno;
 
-	if (state != FENCE_COMPLETE)
-		return NOTIFY_DONE;
-
 	/* Transfer from per-context onto the global per-engine timeline */
 	timeline = engine->timeline;
 	GEM_BUG_ON(timeline == request->timeline);
-
-	/* Will be called from irq-context when using foreign DMA fences */
-	spin_lock_irqsave(&timeline->lock, flags);
+	assert_spin_locked(&timeline->lock);
 
 	seqno = timeline_get_seqno(timeline->common);
 	GEM_BUG_ON(!seqno);
@@ -344,15 +335,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	GEM_BUG_ON(!request->global_seqno);
 	engine->emit_breadcrumb(request,
 				request->ring->vaddr + request->postfix);
-	engine->submit_request(request);
 
 	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
 	i915_sw_fence_commit(&request->execute);
+}
+
+void i915_gem_request_submit(struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *engine = request->engine;
+	unsigned long flags;
 
-	spin_unlock_irqrestore(&timeline->lock, flags);
+	/* Will be called from irq-context when using foreign fences. */
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+
+	__i915_gem_request_submit(request);
+
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+}
+
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	if (state == FENCE_COMPLETE) {
+		struct drm_i915_gem_request *request =
+			container_of(fence, typeof(*request), submit);
+
+		request->engine->submit_request(request);
+	}
 
 	return NOTIFY_DONE;
 }
-- 
cgit v1.2.1


From 0de9136dbbc9f6882bb375270eaddf1b999081bf Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:41:01 +0000
Subject: drm/i915/scheduler: Signal the arrival of a new request

As the start of the scheduler, add a hook into request submission for the
scheduler to see the arrival of new requests and prepare its runqueues.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-6-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 952d2aec5244..1118cf48d6f0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -762,6 +762,19 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	i915_gem_mark_busy(engine);
 
+	/* Let the backend know a new request has arrived that may need
+	 * to adjust the existing execution schedule due to a high priority
+	 * request - i.e. we may want to preempt the current request in order
+	 * to run a high priority dependency chain *before* we can execute this
+	 * request.
+	 *
+	 * This is called before the request is ready to run so that we can
+	 * decide whether to preempt the entire chain so that it is ready to
+	 * run at the earliest possible convenience.
+	 */
+	if (engine->schedule)
+		engine->schedule(request, 0);
+
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
-- 
cgit v1.2.1


From 52e542090701ab983a695cc33ecba19e6a0335a2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:41:02 +0000
Subject: drm/i915/scheduler: Record all dependencies upon request construction

The scheduler needs to know the dependencies of each request for the
lifetime of the request, as it may choose to reschedule the requests at
any time and must ensure the dependency tree is not broken. This is in
addition to using the fence to only allow execution after all
dependencies have been completed.

One option was to extend the fence to support the bidirectional
dependency tracking required by the scheduler. However the mismatch in
lifetimes between the submit fence and the request essentially meant
that we had to build a completely separate struct (and we could not
simply reuse the existing waitqueue in the fence for one half of the
dependency tracking). The extra dependency tracking simply did not mesh
well with the fence, and keeping it separate both keeps the fence
implementation simpler and allows us to extend the dependency tracking
into a priority tree (whilst maintaining support for reordering the
tree).

To avoid the additional allocations and list manipulations, the use of
the priotree is disabled when there are no schedulers to use it.

v2: Create a dedicated slab for i915_dependency.
    Rename the lists.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-7-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 91 ++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1118cf48d6f0..78c87d94d205 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -113,6 +113,77 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static struct i915_dependency *
+i915_dependency_alloc(struct drm_i915_private *i915)
+{
+	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+}
+
+static void
+i915_dependency_free(struct drm_i915_private *i915,
+		     struct i915_dependency *dep)
+{
+	kmem_cache_free(i915->dependencies, dep);
+}
+
+static void
+__i915_priotree_add_dependency(struct i915_priotree *pt,
+			       struct i915_priotree *signal,
+			       struct i915_dependency *dep,
+			       unsigned long flags)
+{
+	list_add(&dep->wait_link, &signal->waiters_list);
+	list_add(&dep->signal_link, &pt->signalers_list);
+	dep->signaler = signal;
+	dep->flags = flags;
+}
+
+static int
+i915_priotree_add_dependency(struct drm_i915_private *i915,
+			     struct i915_priotree *pt,
+			     struct i915_priotree *signal)
+{
+	struct i915_dependency *dep;
+
+	dep = i915_dependency_alloc(i915);
+	if (!dep)
+		return -ENOMEM;
+
+	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
+	return 0;
+}
+
+static void
+i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
+{
+	struct i915_dependency *dep, *next;
+
+	/* Everyone we depended upon (the fences we wait to be signaled)
+	 * should retire before us and remove themselves from our list.
+	 * However, retirement is run independently on each timeline and
+	 * so we may be called out-of-order.
+	 */
+	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
+		list_del(&dep->wait_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+
+	/* Remove ourselves from everyone who depends upon us */
+	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
+		list_del(&dep->signal_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+}
+
+static void
+i915_priotree_init(struct i915_priotree *pt)
+{
+	INIT_LIST_HEAD(&pt->signalers_list);
+	INIT_LIST_HEAD(&pt->waiters_list);
+}
+
 void i915_gem_retire_noop(struct i915_gem_active *active,
 			  struct drm_i915_gem_request *request)
 {
@@ -182,6 +253,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	i915_gem_context_put(request->ctx);
 
 	dma_fence_signal(&request->fence);
+
+	i915_priotree_fini(request->i915, &request->priotree);
 	i915_gem_request_put(request);
 }
 
@@ -467,6 +540,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 */
 	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
+	i915_priotree_init(&req->priotree);
+
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
@@ -520,6 +595,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 
 	GEM_BUG_ON(to == from);
 
+	if (to->engine->schedule) {
+		ret = i915_priotree_add_dependency(to->i915,
+						   &to->priotree,
+						   &from->priotree);
+		if (ret < 0)
+			return ret;
+	}
+
 	if (to->timeline == from->timeline)
 		return 0;
 
@@ -743,9 +826,15 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
-	if (prev)
+	if (prev) {
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
+		if (engine->schedule)
+			__i915_priotree_add_dependency(&request->priotree,
+						       &prev->priotree,
+						       &request->dep,
+						       0);
+	}
 
 	spin_lock_irq(&timeline->lock);
 	list_add_tail(&request->link, &timeline->requests);
-- 
cgit v1.2.1


From 20311bd35060435badba8a0d46b06d5d184abaf7 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:41:03 +0000
Subject: drm/i915/scheduler: Execute requests in order of priorities

Track the priority of each request and use it to determine the order in
which we submit requests to the hardware via execlists.

The priority of the request is determined by the user (eventually via
the context) but may be overridden at any time by the driver. When we set
the priority of the request, we bump the priority of all of its
dependencies to match - so that a high priority drawing operation is not
stuck behind a background task.

When the request is ready to execute (i.e. we have signaled the submit
fence following completion of all its dependencies, including third
party fences), we put the request into a priority sorted rbtree to be
submitted to the hardware. If the request is higher priority than all
pending requests, it will be submitted on the next context-switch
interrupt as soon as the hardware has completed the current request. We
do not currently preempt any current execution to immediately run a very
high priority request, at least not yet.

One more limitation is that this first implementation is for
execlists only, so it is currently limited to gen8/gen9.

v2: Replace recursive priority inheritance bumping with an iterative
depth-first search list.
v3: list_next_entry() for walking lists
v4: Explain how the dfs solves the recursion problem with PI.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-8-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 78c87d94d205..13574a1e29b1 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -132,6 +132,7 @@ __i915_priotree_add_dependency(struct i915_priotree *pt,
 			       struct i915_dependency *dep,
 			       unsigned long flags)
 {
+	INIT_LIST_HEAD(&dep->dfs_link);
 	list_add(&dep->wait_link, &signal->waiters_list);
 	list_add(&dep->signal_link, &pt->signalers_list);
 	dep->signaler = signal;
@@ -158,6 +159,8 @@ i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
 {
 	struct i915_dependency *dep, *next;
 
+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
 	/* Everyone we depended upon (the fences we wait to be signaled)
 	 * should retire before us and remove themselves from our list.
 	 * However, retirement is run independently on each timeline and
@@ -182,6 +185,8 @@ i915_priotree_init(struct i915_priotree *pt)
 {
 	INIT_LIST_HEAD(&pt->signalers_list);
 	INIT_LIST_HEAD(&pt->waiters_list);
+	RB_CLEAR_NODE(&pt->node);
+	pt->priority = INT_MIN;
 }
 
 void i915_gem_retire_noop(struct i915_gem_active *active,
-- 
cgit v1.2.1


From 9f792ebafe0079e6cedde726ea943bc0f412ff6b Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 14 Nov 2016 20:41:04 +0000
Subject: drm/i915: Store the execution priority on the context

In order to support userspace defining different levels of importance to
different contexts, and in particular the preferred order of execution,
store a priority value on each context. By default, the kernel's
context, which is used for idling and other background tasks, is given
minimum priority (i.e. all user contexts will execute before the kernel).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161114204105.29171-9-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 13574a1e29b1..b9b5253cf3cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -867,7 +867,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * run at the earliest possible convenience.
 	 */
 	if (engine->schedule)
-		engine->schedule(request, 0);
+		engine->schedule(request, request->ctx->priority);
 
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
-- 
cgit v1.2.1


From 4302055b29cbc8566aaa5eb7f594ea9cc78ebf41 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 15 Nov 2016 16:46:20 +0000
Subject: drm/i915: Be more careful to drop the GT wakeref

Since we can retire requests from multiple paths, we cannot assume that
i915_gem_retire_requests() is the sole path on which we can transition
to gt.active_requests == 0. A consequence of this is that we would skip
the function if we had already retired all the requests, and so never
schedule the idle worker.

This is fallout from changing the routine from considering active_engines
(for which it was the only consumer) to active_requests.

v2: Move kicking the idle worker to i915_gem_request_retire(), otherwise
we could postpone the idle callback every time we called retire_requests
even though we did no work.
v3: We only need to move the idle work kicking!
v4: Drop the BUG_ON(!awake) as we may be called from the shrinker in the
middle of constructing a request before we have marked the device awake.
v5: Add a BUG_ON() for active_requests underflow upon retirement (Joonas)

Fixes: 28176ef4cfa5 ("drm/i915: Reserve space in the global seqno during request allocation")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161115164620.17185-1-chris@chris-wilson.co.uk
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index b9b5253cf3cd..db2cac7f5d43 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -201,6 +201,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_gem_request_completed(request));
+	GEM_BUG_ON(!request->i915->gt.active_requests);
 
 	trace_i915_gem_request_retire(request);
 
@@ -218,7 +219,12 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	 */
 	list_del(&request->ring_link);
 	request->ring->last_retired_head = request->postfix;
-	request->i915->gt.active_requests--;
+	if (!--request->i915->gt.active_requests) {
+		GEM_BUG_ON(!request->i915->gt.awake);
+		mod_delayed_work(request->i915->wq,
+				 &request->i915->gt.idle_work,
+				 msecs_to_jiffies(100));
+	}
 
 	/* Walk through the active list, calling retire on each. This allows
 	 * objects to track their GPU activity and mark themselves as idle
@@ -763,6 +769,8 @@ static void i915_gem_mark_busy(const struct intel_engine_cs *engine)
 	if (dev_priv->gt.awake)
 		return;
 
+	GEM_BUG_ON(!dev_priv->gt.active_requests);
+
 	intel_runtime_pm_get_noresume(dev_priv);
 	dev_priv->gt.awake = true;
 
@@ -1146,13 +1154,6 @@ void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
 	if (!dev_priv->gt.active_requests)
 		return;
 
-	GEM_BUG_ON(!dev_priv->gt.awake);
-
 	for_each_engine(engine, dev_priv, id)
 		engine_retire_requests(engine);
-
-	if (!dev_priv->gt.active_requests)
-		mod_delayed_work(dev_priv->wq,
-				 &dev_priv->gt.idle_work,
-				 msecs_to_jiffies(100));
 }
-- 
cgit v1.2.1


From 786d290cae849a8fc1145e969bfb0953072e3fc7 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 18 Nov 2016 14:34:12 +0000
Subject: drm/i915: Check that each request phase is completed before retiring

Trying to chase an impossible bug (ivb):

[  207.765411] [drm:i915_reset_and_wakeup [i915]] resetting chip
[  207.765734] [drm:i915_gem_reset [i915]] resetting render ring to restart from tail of request 0x4ee834
[  207.765791] [drm:intel_print_rc6_info [i915]] Enabling RC6 states: RC6 on RC6p on RC6pp off
[  207.767213] [drm:intel_guc_setup [i915]] GuC fw status: path (null), fetch NONE, load NONE
[  207.767515] kernel BUG at drivers/gpu/drm/i915/i915_gem_request.c:203!
[  207.767551] invalid opcode: 0000 [#1] PREEMPT SMP
[  207.767576] Modules linked in: snd_hda_intel i915 cdc_ncm usbnet mii x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel lpc_ich snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec snd_hwdep snd_hda_core mei_me mei snd_pcm sdhci_pci sdhci mmc_core e1000e ptp pps_core [last unloaded: i915]
[  207.767808] CPU: 3 PID: 8855 Comm: gem_ringfill Tainted: G     U          4.9.0-rc5-CI-Patchwork_3052+ #1
[  207.767854] Hardware name: LENOVO 2356GCG/2356GCG, BIOS G7ET31WW (1.13 ) 07/02/2012
[  207.767894] task: ffff88012c82a740 task.stack: ffffc9000383c000
[  207.767927] RIP: 0010:[<ffffffffa00a0a3a>]  [<ffffffffa00a0a3a>] i915_gem_request_retire+0x2a/0x4b0 [i915]
[  207.767999] RSP: 0018:ffffc9000383fb20  EFLAGS: 00010293
[  207.768027] RAX: 00000000004ee83c RBX: ffff880135dcb480 RCX: 00000000004ee83a
[  207.768062] RDX: ffff88012fea42a8 RSI: 0000000000000001 RDI: ffff88012c82af68
[  207.768095] RBP: ffffc9000383fb48 R08: 0000000000000000 R09: 0000000000000000
[  207.768129] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880135dcb480
[  207.768163] R13: ffff88012fea42a8 R14: 0000000000000000 R15: 00000000000001d8
[  207.768200] FS:  00007f955f658740(0000) GS:ffff88013e2c0000(0000) knlGS:0000000000000000
[  207.768239] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  207.768258] CR2: 0000555899725930 CR3: 00000001316f6000 CR4: 00000000001406e0
[  207.768286] Stack:
[  207.768299]  ffff880135dcb480 ffff880135dcbe00 ffff88012fea42a8 0000000000000000
[  207.768350]  00000000000001d8 ffffc9000383fb70 ffffffffa00a1339 0000000000000000
[  207.768402]  ffff88012f296c88 00000000000003f0 ffffc9000383fbb0 ffffffffa00b582d
[  207.768453] Call Trace:
[  207.768493]  [<ffffffffa00a1339>] i915_gem_request_retire_upto+0x49/0x90 [i915]
[  207.768553]  [<ffffffffa00b582d>] intel_ring_begin+0x15d/0x2d0 [i915]
[  207.768608]  [<ffffffffa00b59cb>] intel_ring_alloc_request_extras+0x2b/0x40 [i915]
[  207.768667]  [<ffffffffa00a2fd9>] i915_gem_request_alloc+0x359/0x440 [i915]
[  207.768723]  [<ffffffffa008bd03>] i915_gem_do_execbuffer.isra.15+0x783/0x1a10 [i915]
[  207.768766]  [<ffffffff811a6a2e>] ? __might_fault+0x3e/0x90
[  207.768816]  [<ffffffffa008d380>] i915_gem_execbuffer2+0xc0/0x250 [i915]
[  207.768854]  [<ffffffff815532a6>] drm_ioctl+0x1f6/0x480
[  207.768900]  [<ffffffffa008d2c0>] ? i915_gem_execbuffer+0x330/0x330 [i915]
[  207.768939]  [<ffffffff81202f6e>] do_vfs_ioctl+0x8e/0x690
[  207.768972]  [<ffffffff818193ac>] ? retint_kernel+0x2d/0x2d
[  207.769004]  [<ffffffff810d6ef2>] ? trace_hardirqs_on_caller+0x122/0x1b0
[  207.769039]  [<ffffffff812035ac>] SyS_ioctl+0x3c/0x70
[  207.769068]  [<ffffffff818189ae>] entry_SYSCALL_64_fastpath+0x1c/0xb1
[  207.769103] Code: 90 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 8b 35 fa 7b e1 e1 85 f6 0f 85 55 03 00 00 41 8b 84 24 80 02 00 00 85 c0 75 02 <0f> 0b 49 8b 94 24 a8 00 00 00 48 8b 8a e0 01 00 00 8b 89 c0 00
[  207.769400] RIP  [<ffffffffa00a0a3a>] i915_gem_request_retire+0x2a/0x4b0 [i915]
[  207.769463]  RSP <ffffc9000383fb20>

Let's add a couple more BUG_ONs before this to ascertain that the request
did make it to hardware. The impossible part of this stacktrace is that
the request must have been considered completed by i915_request_wait()
before we tried to retire it.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161118143412.26508-1-chris@chris-wilson.co.uk
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index db2cac7f5d43..27e8f257fb39 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -200,6 +200,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	struct i915_gem_active *active, *next;
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
+	GEM_BUG_ON(!i915_sw_fence_done(&request->submit));
+	GEM_BUG_ON(!i915_sw_fence_done(&request->execute));
 	GEM_BUG_ON(!i915_gem_request_completed(request));
 	GEM_BUG_ON(!request->i915->gt.active_requests);
 
-- 
cgit v1.2.1


From 0e932c080cdee38e873476df92d7dc02bdb023bc Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 25 Nov 2016 13:17:17 +0000
Subject: drm/i915: Hold a reference on the request for its fence chain

Currently, we have an active reference for the request until it is
retired. Though it cannot be retired before it has been executed by
hardware, the request may be completed before we have finished
processing the execute fence, i.e. we may continue to process that fence
as we free the request.

Fixes: 5590af3e115a ("drm/i915: Drive request submission through fence callbacks")
Fixes: 23902e49c999 ("drm/i915: Split request submit/execute phase into two")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161125131718.20978-3-chris@chris-wilson.co.uk
(cherry picked from commit 48bc2a4a427ad81578f887d71d45794619a77211)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 34 ++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_request.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 27e8f257fb39..57194471f8cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -200,8 +200,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	struct i915_gem_active *active, *next;
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
-	GEM_BUG_ON(!i915_sw_fence_done(&request->submit));
-	GEM_BUG_ON(!i915_sw_fence_done(&request->execute));
+	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
+	GEM_BUG_ON(!i915_sw_fence_signaled(&request->execute));
 	GEM_BUG_ON(!i915_gem_request_completed(request));
 	GEM_BUG_ON(!request->i915->gt.active_requests);
 
@@ -445,11 +445,17 @@ void i915_gem_request_submit(struct drm_i915_gem_request *request)
 static int __i915_sw_fence_call
 submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 {
-	if (state == FENCE_COMPLETE) {
-		struct drm_i915_gem_request *request =
-			container_of(fence, typeof(*request), submit);
+	struct drm_i915_gem_request *request =
+		container_of(fence, typeof(*request), submit);
 
+	switch (state) {
+	case FENCE_COMPLETE:
 		request->engine->submit_request(request);
+		break;
+
+	case FENCE_FREE:
+		i915_gem_request_put(request);
+		break;
 	}
 
 	return NOTIFY_DONE;
@@ -458,6 +464,18 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 static int __i915_sw_fence_call
 execute_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 {
+	struct drm_i915_gem_request *request =
+		container_of(fence, typeof(*request), execute);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		break;
+
+	case FENCE_FREE:
+		i915_gem_request_put(request);
+		break;
+	}
+
 	return NOTIFY_DONE;
 }
 
@@ -545,8 +563,10 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       req->timeline->fence_context,
 		       __timeline_get_seqno(req->timeline->common));
 
-	i915_sw_fence_init(&req->submit, submit_notify);
-	i915_sw_fence_init(&req->execute, execute_notify);
+	/* We bump the ref for the fence chain */
+	i915_sw_fence_init(&i915_gem_request_get(req)->submit, submit_notify);
+	i915_sw_fence_init(&i915_gem_request_get(req)->execute, execute_notify);
+
 	/* Ensure that the execute fence completes after the submit fence -
 	 * as we complete the execute fence from within the submit fence
 	 * callback, its completion would otherwise be visible first.
-- 
cgit v1.2.1