Verify that if a context is active at the time it is closed, that it is either persistent and preemptible (with hangcheck running) or it shall be removed from execution.
Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-close Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: stable@vger.kernel.org # v5.7+ --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +++++---------------- 1 file changed, 10 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a548626fa8bc..4fd38101bb56 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx) return rcu_dereference_protected(ctx->engines, true); }
-static bool __reset_engine(struct intel_engine_cs *engine) -{ - struct intel_gt *gt = engine->gt; - bool success = false; - - if (!intel_has_reset_engine(gt)) - return false; - - if (!test_and_set_bit(I915_RESET_ENGINE + engine->id, - >->reset.flags)) { - success = intel_engine_reset(engine, NULL) == 0; - clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, - >->reset.flags); - } - - return success; -} - static void __reset_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine) { @@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine) * kill the banned context, we fallback to doing a local reset * instead. */ - if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) && - !intel_engine_pulse(engine)) - return true; - - /* If we are unable to send a pulse, try resetting this engine. */ - return __reset_engine(engine); + return intel_engine_pulse(engine) == 0; }
static bool @@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) return engine; }
-static void kill_engines(struct i915_gem_engines *engines) +static void kill_engines(struct i915_gem_engines *engines, bool ban) { struct i915_gem_engines_iter it; struct intel_context *ce; @@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines) for_each_gem_engine(ce, engines, it) { struct intel_engine_cs *engine;
- if (intel_context_set_banned(ce)) + if (ban && intel_context_set_banned(ce)) continue;
/* @@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines) engine = active_engine(ce);
/* First attempt to gracefully cancel the context */ - if (engine && !__cancel_engine(engine)) + if (engine && !__cancel_engine(engine) && ban) /* * If we are unable to send a preemptive pulse to bump * the context from the GPU, we have to resort to a full @@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines) } }
-static void kill_stale_engines(struct i915_gem_context *ctx) +static void kill_context(struct i915_gem_context *ctx) { + bool ban = (!i915_gem_context_is_persistent(ctx) || + !ctx->i915->params.enable_hangcheck); struct i915_gem_engines *pos, *next;
spin_lock_irq(&ctx->stale.lock); @@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
spin_unlock_irq(&ctx->stale.lock);
- kill_engines(pos); + kill_engines(pos, ban);
spin_lock_irq(&ctx->stale.lock); GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence)); @@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock); }
-static void kill_context(struct i915_gem_context *ctx) -{ - kill_stale_engines(ctx); -} - static void engines_idle_release(struct i915_gem_context *ctx, struct i915_gem_engines *engines) { @@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx,
kill: if (list_empty(&engines->link)) /* raced, already closed */ - kill_engines(engines); + kill_engines(engines, true);
i915_sw_fence_commit(&engines->fence); } @@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx) * case we opt to forcibly kill off all remaining requests on * context close. */ - if (!i915_gem_context_is_persistent(ctx) || - !ctx->i915->params.enable_hangcheck) - kill_context(ctx); + kill_context(ctx);
i915_gem_context_put(ctx); }
On 16/09/2020 10:42, Chris Wilson wrote:
Verify that if a context is active at the time it is closed, that it is either persistent and preemptible (with hangcheck running) or it shall be removed from execution.
Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-close Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: stable@vger.kernel.org # v5.7+
drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +++++---------------- 1 file changed, 10 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a548626fa8bc..4fd38101bb56 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx) return rcu_dereference_protected(ctx->engines, true); } -static bool __reset_engine(struct intel_engine_cs *engine) -{
- struct intel_gt *gt = engine->gt;
- bool success = false;
- if (!intel_has_reset_engine(gt))
return false;
- if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
>->reset.flags)) {
success = intel_engine_reset(engine, NULL) == 0;
clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
>->reset.flags);
- }
- return success;
-}
- static void __reset_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine) {
@@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine) * kill the banned context, we fallback to doing a local reset * instead. */
- if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
!intel_engine_pulse(engine))
return true;
- /* If we are unable to send a pulse, try resetting this engine. */
- return __reset_engine(engine);
- return intel_engine_pulse(engine) == 0;
Where is the path now which actually resets the currently running workload (engine) of a non-persistent context? Pulse will be sent and then rely on hangcheck but otherwise let it run?
Regards,
Tvrtko
} static bool @@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) return engine; } -static void kill_engines(struct i915_gem_engines *engines) +static void kill_engines(struct i915_gem_engines *engines, bool ban) { struct i915_gem_engines_iter it; struct intel_context *ce; @@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines) for_each_gem_engine(ce, engines, it) { struct intel_engine_cs *engine;
if (intel_context_set_banned(ce))
if (ban && intel_context_set_banned(ce)) continue;
/* @@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines) engine = active_engine(ce); /* First attempt to gracefully cancel the context */
if (engine && !__cancel_engine(engine))
if (engine && !__cancel_engine(engine) && ban) /* * If we are unable to send a preemptive pulse to bump * the context from the GPU, we have to resort to a full
@@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines) } } -static void kill_stale_engines(struct i915_gem_context *ctx) +static void kill_context(struct i915_gem_context *ctx) {
- bool ban = (!i915_gem_context_is_persistent(ctx) ||
struct i915_gem_engines *pos, *next;!ctx->i915->params.enable_hangcheck);
spin_lock_irq(&ctx->stale.lock); @@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock);
kill_engines(pos);
kill_engines(pos, ban);
spin_lock_irq(&ctx->stale.lock); GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence)); @@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock); } -static void kill_context(struct i915_gem_context *ctx) -{
- kill_stale_engines(ctx);
-}
- static void engines_idle_release(struct i915_gem_context *ctx, struct i915_gem_engines *engines) {
@@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx, kill: if (list_empty(&engines->link)) /* raced, already closed */
kill_engines(engines);
kill_engines(engines, true);
i915_sw_fence_commit(&engines->fence); } @@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx) * case we opt to forcibly kill off all remaining requests on * context close. */
- if (!i915_gem_context_is_persistent(ctx) ||
!ctx->i915->params.enable_hangcheck)
kill_context(ctx);
- kill_context(ctx);
i915_gem_context_put(ctx); }
Quoting Tvrtko Ursulin (2020-09-24 15:26:56)
On 16/09/2020 10:42, Chris Wilson wrote:
Verify that if a context is active at the time it is closed, that it is either persistent and preemptible (with hangcheck running) or it shall be removed from execution.
Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-close Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: stable@vger.kernel.org # v5.7+
drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +++++---------------- 1 file changed, 10 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a548626fa8bc..4fd38101bb56 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx) return rcu_dereference_protected(ctx->engines, true); } -static bool __reset_engine(struct intel_engine_cs *engine) -{
struct intel_gt *gt = engine->gt;
bool success = false;
if (!intel_has_reset_engine(gt))
return false;
if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
>->reset.flags)) {
success = intel_engine_reset(engine, NULL) == 0;
clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
>->reset.flags);
}
return success;
-}
- static void __reset_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine) {
@@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine) * kill the banned context, we fallback to doing a local reset * instead. */
if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
!intel_engine_pulse(engine))
return true;
/* If we are unable to send a pulse, try resetting this engine. */
return __reset_engine(engine);
return intel_engine_pulse(engine) == 0;
Where is the path now which actually resets the currently running workload (engine) of a non-persistent context? Pulse will be sent and then rely on hangcheck but otherwise let it run?
If the pulse fails, we just call intel_handle_error() on the engine. On looking at this code again, I could not justify the open-coding of the engine reset fragment of the general error handler, especially as we end up taking that route anyway for device resets. (Unlike inside the tasklet, there's no atomicity concerns on this engine-reset path.) -Chris
On 25/09/2020 11:05, Chris Wilson wrote:
Quoting Tvrtko Ursulin (2020-09-24 15:26:56)
On 16/09/2020 10:42, Chris Wilson wrote:
Verify that if a context is active at the time it is closed, that it is either persistent and preemptible (with hangcheck running) or it shall be removed from execution.
Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-close Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: stable@vger.kernel.org # v5.7+
drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +++++---------------- 1 file changed, 10 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a548626fa8bc..4fd38101bb56 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx) return rcu_dereference_protected(ctx->engines, true); } -static bool __reset_engine(struct intel_engine_cs *engine) -{
struct intel_gt *gt = engine->gt;
bool success = false;
if (!intel_has_reset_engine(gt))
return false;
if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
>->reset.flags)) {
success = intel_engine_reset(engine, NULL) == 0;
clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
>->reset.flags);
}
return success;
-}
- static void __reset_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine) {
@@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine) * kill the banned context, we fallback to doing a local reset * instead. */
if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
!intel_engine_pulse(engine))
return true;
/* If we are unable to send a pulse, try resetting this engine. */
return __reset_engine(engine);
return intel_engine_pulse(engine) == 0;
Where is the path now which actually resets the currently running workload (engine) of a non-persistent context? Pulse will be sent and then rely on hangcheck but otherwise let it run?
If the pulse fails, we just call intel_handle_error() on the engine. On looking at this code again, I could not justify the open-coding of the engine reset fragment of the general error handler, especially as we end up taking that route anyway for device resets. (Unlike inside the tasklet, there's no atomicity concerns on this engine-reset path.)
I think yesterday I got lost in boolean logic and flows here. Today it looks fine. Bool ban will be true and code will indeed enter the __kill_context path.
Reviewed-by: Tvrtko Ursulin tvrtko.ursulin@intel.com
Regards,
Tvrtko
linux-stable-mirror@lists.linaro.org