summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/i915_gem_context.h
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2019-02-19 12:21:52 +0000
committerChris Wilson <chris@chris-wilson.co.uk>2019-02-19 14:46:21 +0000
commit7f4127c4839b1801087e08b1797e830a766391c1 (patch)
tree9c96e1995f79b02ca35d2e45954727e48b47d612 /drivers/gpu/drm/i915/i915_gem_context.h
parent8f54b3c6c921275d10e33746553c40294ffa0d58 (diff)
downloadlinux-next-7f4127c4839b1801087e08b1797e830a766391c1.tar.gz
drm/i915: Use time based guilty context banning
Currently, we accumulate each time a context hangs the GPU, offset against the number of requests it submits, and if that score exceeds a certain threshold, we ban that context from submitting any more requests (cancelling any work in flight). In contrast, we use a simple timer on the file, that if we see more than a 9 hangs faster than 60s apart in total across all of its contexts, we will ban the client from creating any more contexts. This leads to a confusing situation where the file may be banned before the context, so lets use a simple timer scheme for each. If the context submits 3 hanging requests within a 120s period, declare it forbidden to ever send more requests. This has the advantage of not being easy to repair by simply sending empty requests, but has the disadvantage that if the context is idle then it is forgiven. However, if the context is idle, it is not disrupting the system, but a hog can evade the request counting and cause much more severe disruption to the system. Updating ban_score from request retirement is dubious as the retirement is purposely not in sync with request submission (i.e. we try and batch retirement to reduce overhead and avoid latency on submission), which leads to surprising situations where we can forgive a hang immediately due to a backlog of requests from before the hang being retired afterwards. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190219122215.8941-2-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gem_context.h')
-rw-r--r--drivers/gpu/drm/i915/i915_gem_context.h9
1 files changed, 5 insertions, 4 deletions
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 071108d34ae0..dc6c58f38cfa 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -209,10 +209,11 @@ struct i915_gem_context {
*/
atomic_t active_count;
-#define CONTEXT_SCORE_GUILTY 10
-#define CONTEXT_SCORE_BAN_THRESHOLD 40
- /** ban_score: Accumulated score of all hangs caused by this context. */
- atomic_t ban_score;
+ /**
+ * @hang_timestamp: The last time(s) this context caused a GPU hang
+ */
+ unsigned long hang_timestamp[2];
+#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! */
/** remap_slice: Bitmask of cache lines that need remapping */
u8 remap_slice;