These patchset adds support for VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW message added in recent VPU firmware. Without it the driver will not be able to process any jobs after this message is received and would need to be reloaded.
The last patch in this series is as-is from upstream, but other two patches had to be rebased because of missing new CMDQ UAPI changes that should not be backported to stable.
Karol Wachowski (3): accel/ivpu: Abort all jobs after command queue unregister accel/ivpu: Fix locking order in ivpu_job_submit accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
drivers/accel/ivpu/ivpu_drv.c | 32 ++------- drivers/accel/ivpu/ivpu_drv.h | 2 + drivers/accel/ivpu/ivpu_job.c | 111 ++++++++++++++++++++++++++------ drivers/accel/ivpu/ivpu_job.h | 1 + drivers/accel/ivpu/ivpu_mmu.c | 3 +- drivers/accel/ivpu/ivpu_sysfs.c | 5 +- 6 files changed, 103 insertions(+), 51 deletions(-)
From: Karol Wachowski karol.wachowski@intel.com
commit 5bbccadaf33eea2b879d8326ad59ae0663be47d1 upstream.
With hardware scheduler it is not expected to receive JOB_DONE notifications from NPU FW for the jobs aborted due to command queue destroy JSM command.
Remove jobs submitted to unregistered command queue from submitted_jobs_xa to avoid triggering a TDR in such case.
Add explicit submitted_jobs_lock that protects access to list of submitted jobs which is now used to find jobs to abort.
Move context abort procedure to separate work queue not to slow down handling of IPCs or DCT requests in case where job abort takes longer, especially when destruction of the last job of a specific context results in context release.
Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Signed-off-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-4-maciej... --- drivers/accel/ivpu/ivpu_drv.c | 32 +++---------- drivers/accel/ivpu/ivpu_drv.h | 2 + drivers/accel/ivpu/ivpu_job.c | 85 +++++++++++++++++++++++++-------- drivers/accel/ivpu/ivpu_job.h | 1 + drivers/accel/ivpu/ivpu_mmu.c | 3 +- drivers/accel/ivpu/ivpu_sysfs.c | 5 +- 6 files changed, 79 insertions(+), 49 deletions(-)
diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 3e56ce8bc2c1d..93c3687d30b79 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -36,8 +36,6 @@ #define DRIVER_VERSION_STR "1.0.0 " UTS_RELEASE #endif
-static struct lock_class_key submitted_jobs_xa_lock_class_key; - int ivpu_dbg_mask; module_param_named(dbg_mask, ivpu_dbg_mask, int, 0644); MODULE_PARM_DESC(dbg_mask, "Driver debug mask. See IVPU_DBG_* macros."); @@ -465,26 +463,6 @@ static const struct drm_driver driver = { .major = 1, };
-static void ivpu_context_abort_invalid(struct ivpu_device *vdev) -{ - struct ivpu_file_priv *file_priv; - unsigned long ctx_id; - - mutex_lock(&vdev->context_list_lock); - - xa_for_each(&vdev->context_xa, ctx_id, file_priv) { - if (!file_priv->has_mmu_faults || file_priv->aborted) - continue; - - mutex_lock(&file_priv->lock); - ivpu_context_abort_locked(file_priv); - file_priv->aborted = true; - mutex_unlock(&file_priv->lock); - } - - mutex_unlock(&vdev->context_list_lock); -} - static irqreturn_t ivpu_irq_thread_handler(int irq, void *arg) { struct ivpu_device *vdev = arg; @@ -498,9 +476,6 @@ static irqreturn_t ivpu_irq_thread_handler(int irq, void *arg) case IVPU_HW_IRQ_SRC_IPC: ivpu_ipc_irq_thread_handler(vdev); break; - case IVPU_HW_IRQ_SRC_MMU_EVTQ: - ivpu_context_abort_invalid(vdev); - break; case IVPU_HW_IRQ_SRC_DCT: ivpu_pm_dct_irq_thread_handler(vdev); break; @@ -617,16 +592,21 @@ static int ivpu_dev_init(struct ivpu_device *vdev) xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1); xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1); - lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key); INIT_LIST_HEAD(&vdev->bo_list);
vdev->db_limit.min = IVPU_MIN_DB; vdev->db_limit.max = IVPU_MAX_DB;
+ INIT_WORK(&vdev->context_abort_work, ivpu_context_abort_thread_handler); + ret = drmm_mutex_init(&vdev->drm, &vdev->context_list_lock); if (ret) goto err_xa_destroy;
+ ret = drmm_mutex_init(&vdev->drm, &vdev->submitted_jobs_lock); + if (ret) + goto err_xa_destroy; + ret = drmm_mutex_init(&vdev->drm, &vdev->bo_list_lock); if (ret) goto err_xa_destroy; diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 3fdff3f6cffd8..ebfcf3e42a3d9 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -137,6 +137,7 @@ struct ivpu_device { struct mutex context_list_lock; /* Protects user context addition/removal */ struct xarray context_xa; struct xa_limit context_xa_limit; + struct work_struct context_abort_work;
struct xarray db_xa; struct xa_limit db_limit; @@ -145,6 +146,7 @@ struct ivpu_device { struct mutex bo_list_lock; /* Protects bo_list */ struct list_head bo_list;
+ struct mutex submitted_jobs_lock; /* Protects submitted_jobs */ struct xarray submitted_jobs_xa; struct ivpu_ipc_consumer job_done_consumer;
diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index 7149312f16e19..fc91681469e33 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -223,7 +223,8 @@ static int ivpu_cmdq_fini(struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cm if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { ret = ivpu_jsm_hws_destroy_cmdq(vdev, file_priv->ctx.id, cmdq->id); if (!ret) - ivpu_dbg(vdev, JOB, "Command queue %d destroyed\n", cmdq->id); + ivpu_dbg(vdev, JOB, "Command queue %d destroyed, ctx %d\n", + cmdq->id, file_priv->ctx.id); }
ret = ivpu_jsm_unregister_db(vdev, cmdq->db_id); @@ -324,6 +325,8 @@ void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv)
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) ivpu_jsm_context_release(vdev, file_priv->ctx.id); + + file_priv->aborted = true; }
static int ivpu_cmdq_push_job(struct ivpu_cmdq *cmdq, struct ivpu_job *job) @@ -462,16 +465,14 @@ static struct ivpu_job *ivpu_job_remove_from_submitted_jobs(struct ivpu_device * { struct ivpu_job *job;
- xa_lock(&vdev->submitted_jobs_xa); - job = __xa_erase(&vdev->submitted_jobs_xa, job_id); + lockdep_assert_held(&vdev->submitted_jobs_lock);
+ job = xa_erase(&vdev->submitted_jobs_xa, job_id); if (xa_empty(&vdev->submitted_jobs_xa) && job) { vdev->busy_time = ktime_add(ktime_sub(ktime_get(), vdev->busy_start_ts), vdev->busy_time); }
- xa_unlock(&vdev->submitted_jobs_xa); - return job; }
@@ -479,6 +480,8 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 { struct ivpu_job *job;
+ lockdep_assert_held(&vdev->submitted_jobs_lock); + job = ivpu_job_remove_from_submitted_jobs(vdev, job_id); if (!job) return -ENOENT; @@ -497,6 +500,10 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 ivpu_stop_job_timeout_detection(vdev);
ivpu_rpm_put(vdev); + + if (!xa_empty(&vdev->submitted_jobs_xa)) + ivpu_start_job_timeout_detection(vdev); + return 0; }
@@ -505,8 +512,12 @@ void ivpu_jobs_abort_all(struct ivpu_device *vdev) struct ivpu_job *job; unsigned long id;
+ mutex_lock(&vdev->submitted_jobs_lock); + xa_for_each(&vdev->submitted_jobs_xa, id, job) ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED); + + mutex_unlock(&vdev->submitted_jobs_lock); }
static int ivpu_job_submit(struct ivpu_job *job, u8 priority) @@ -531,15 +542,16 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority) goto err_unlock_file_priv; }
- xa_lock(&vdev->submitted_jobs_xa); + mutex_lock(&vdev->submitted_jobs_lock); + is_first_job = xa_empty(&vdev->submitted_jobs_xa); - ret = __xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit, - &file_priv->job_id_next, GFP_KERNEL); + ret = xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit, + &file_priv->job_id_next, GFP_KERNEL); if (ret < 0) { ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n", file_priv->ctx.id); ret = -EBUSY; - goto err_unlock_submitted_jobs_xa; + goto err_unlock_submitted_jobs; }
ret = ivpu_cmdq_push_job(cmdq, job); @@ -562,19 +574,21 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority) job->job_id, file_priv->ctx.id, job->engine_idx, priority, job->cmd_buf_vpu_addr, cmdq->jobq->header.tail);
- xa_unlock(&vdev->submitted_jobs_xa); - + mutex_unlock(&vdev->submitted_jobs_lock); mutex_unlock(&file_priv->lock);
- if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW)) + if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW)) { + mutex_lock(&vdev->submitted_jobs_lock); ivpu_job_signal_and_destroy(vdev, job->job_id, VPU_JSM_STATUS_SUCCESS); + mutex_unlock(&vdev->submitted_jobs_lock); + }
return 0;
err_erase_xa: - __xa_erase(&vdev->submitted_jobs_xa, job->job_id); -err_unlock_submitted_jobs_xa: - xa_unlock(&vdev->submitted_jobs_xa); + xa_erase(&vdev->submitted_jobs_xa, job->job_id); +err_unlock_submitted_jobs: + mutex_unlock(&vdev->submitted_jobs_lock); err_unlock_file_priv: mutex_unlock(&file_priv->lock); ivpu_rpm_put(vdev); @@ -745,7 +759,6 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr, struct vpu_jsm_msg *jsm_msg) { struct vpu_ipc_msg_payload_job_done *payload; - int ret;
if (!jsm_msg) { ivpu_err(vdev, "IPC message has no JSM payload\n"); @@ -758,9 +771,10 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr, }
payload = (struct vpu_ipc_msg_payload_job_done *)&jsm_msg->payload; - ret = ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status); - if (!ret && !xa_empty(&vdev->submitted_jobs_xa)) - ivpu_start_job_timeout_detection(vdev); + + mutex_lock(&vdev->submitted_jobs_lock); + ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status); + mutex_unlock(&vdev->submitted_jobs_lock); }
void ivpu_job_done_consumer_init(struct ivpu_device *vdev) @@ -773,3 +787,36 @@ void ivpu_job_done_consumer_fini(struct ivpu_device *vdev) { ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer); } + +void ivpu_context_abort_thread_handler(struct work_struct *work) +{ + struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work); + struct ivpu_file_priv *file_priv; + unsigned long ctx_id; + struct ivpu_job *job; + unsigned long id; + + mutex_lock(&vdev->context_list_lock); + xa_for_each(&vdev->context_xa, ctx_id, file_priv) { + if (!file_priv->has_mmu_faults || file_priv->aborted) + continue; + + mutex_lock(&file_priv->lock); + ivpu_context_abort_locked(file_priv); + mutex_unlock(&file_priv->lock); + } + mutex_unlock(&vdev->context_list_lock); + + if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) + return; + /* + * In hardware scheduling mode NPU already has stopped processing jobs + * and won't send us any further notifications, thus we have to free job related resources + * and notify userspace + */ + mutex_lock(&vdev->submitted_jobs_lock); + xa_for_each(&vdev->submitted_jobs_xa, id, job) + if (job->file_priv->aborted) + ivpu_job_signal_and_destroy(vdev, job->job_id, DRM_IVPU_JOB_STATUS_ABORTED); + mutex_unlock(&vdev->submitted_jobs_lock); +} diff --git a/drivers/accel/ivpu/ivpu_job.h b/drivers/accel/ivpu/ivpu_job.h index 8b19e3f8b4cfb..af1ed039569cd 100644 --- a/drivers/accel/ivpu/ivpu_job.h +++ b/drivers/accel/ivpu/ivpu_job.h @@ -66,6 +66,7 @@ void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev);
void ivpu_job_done_consumer_init(struct ivpu_device *vdev); void ivpu_job_done_consumer_fini(struct ivpu_device *vdev); +void ivpu_context_abort_thread_handler(struct work_struct *work);
void ivpu_jobs_abort_all(struct ivpu_device *vdev);
diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 26ef52fbb93e5..21f820dd0c658 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -890,8 +890,7 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, vdev->mmu->evtq.cons); }
- if (!kfifo_put(&vdev->hw->irq.fifo, IVPU_HW_IRQ_SRC_MMU_EVTQ)) - ivpu_err_ratelimited(vdev, "IRQ FIFO full\n"); + queue_work(system_wq, &vdev->context_abort_work); }
void ivpu_mmu_evtq_dump(struct ivpu_device *vdev) diff --git a/drivers/accel/ivpu/ivpu_sysfs.c b/drivers/accel/ivpu/ivpu_sysfs.c index 616477fc17fa0..8a616791c32f5 100644 --- a/drivers/accel/ivpu/ivpu_sysfs.c +++ b/drivers/accel/ivpu/ivpu_sysfs.c @@ -30,11 +30,12 @@ npu_busy_time_us_show(struct device *dev, struct device_attribute *attr, char *b struct ivpu_device *vdev = to_ivpu_device(drm); ktime_t total, now = 0;
- xa_lock(&vdev->submitted_jobs_xa); + mutex_lock(&vdev->submitted_jobs_lock); + total = vdev->busy_time; if (!xa_empty(&vdev->submitted_jobs_xa)) now = ktime_sub(ktime_get(), vdev->busy_start_ts); - xa_unlock(&vdev->submitted_jobs_xa); + mutex_unlock(&vdev->submitted_jobs_lock);
return sysfs_emit(buf, "%lld\n", ktime_to_us(ktime_add(total, now))); }
[ Sasha's backport helper bot ]
Hi,
✅ All tests passed successfully. No issues detected. No action required from the submitter.
The upstream commit SHA1 provided is correct: 5bbccadaf33eea2b879d8326ad59ae0663be47d1
WARNING: Author mismatch between patch and upstream commit: Backport author: Jacek Lawrynowiczjacek.lawrynowicz@linux.intel.com Commit author: Karol Wachowskikarol.wachowski@intel.com
Note: The patch differs from the upstream commit: --- 1: 5bbccadaf33ee ! 1: ac0a38707ed72 accel/ivpu: Abort all jobs after command queue unregister @@ Metadata ## Commit message ## accel/ivpu: Abort all jobs after command queue unregister
+ commit 5bbccadaf33eea2b879d8326ad59ae0663be47d1 upstream. + With hardware scheduler it is not expected to receive JOB_DONE notifications from NPU FW for the jobs aborted due to command queue destroy JSM command. @@ Commit message especially when destruction of the last job of a specific context results in context release.
+ Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com @@ drivers/accel/ivpu/ivpu_drv.h: struct ivpu_device {
## drivers/accel/ivpu/ivpu_job.c ## -@@ drivers/accel/ivpu/ivpu_job.c: static struct ivpu_cmdq *ivpu_cmdq_create(struct ivpu_file_priv *file_priv, u8 p - - cmdq->priority = priority; - cmdq->is_legacy = is_legacy; -+ cmdq->is_valid = true; - - ret = xa_alloc_cyclic(&file_priv->cmdq_xa, &cmdq->id, cmdq, file_priv->cmdq_limit, - &file_priv->cmdq_id_next, GFP_KERNEL); -@@ drivers/accel/ivpu/ivpu_job.c: static struct ivpu_cmdq *ivpu_cmdq_create(struct ivpu_file_priv *file_priv, u8 p - goto err_free_cmdq; - } - -+ ivpu_dbg(vdev, JOB, "Command queue %d created, ctx %d\n", cmdq->id, file_priv->ctx.id); - return cmdq; - - err_free_cmdq: -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_cmdq_unregister(struct ivpu_file_priv *file_priv, struct ivpu_cm +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_cmdq_fini(struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cm if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { ret = ivpu_jsm_hws_destroy_cmdq(vdev, file_priv->ctx.id, cmdq->id); if (!ret) @@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_cmdq_unregister(struct ivpu_file_ }
ret = ivpu_jsm_unregister_db(vdev, cmdq->db_id); -@@ drivers/accel/ivpu/ivpu_job.c: static struct ivpu_cmdq *ivpu_cmdq_acquire(struct ivpu_file_priv *file_priv, u32 - lockdep_assert_held(&file_priv->lock); - - cmdq = xa_load(&file_priv->cmdq_xa, cmdq_id); -- if (!cmdq) { -- ivpu_err(vdev, "Failed to find command queue with ID: %u\n", cmdq_id); -+ if (!cmdq || !cmdq->is_valid) { -+ ivpu_warn_ratelimited(vdev, "Failed to find command queue with ID: %u\n", cmdq_id); - return NULL; - } - -@@ drivers/accel/ivpu/ivpu_job.c: void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev) - mutex_unlock(&vdev->context_list_lock); - } - --static void ivpu_cmdq_unregister_all(struct ivpu_file_priv *file_priv) --{ -- struct ivpu_cmdq *cmdq; -- unsigned long cmdq_id; -- -- xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq) -- ivpu_cmdq_unregister(file_priv, cmdq); --} -- - void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv) - { - struct ivpu_device *vdev = file_priv->vdev; -+ struct ivpu_cmdq *cmdq; -+ unsigned long cmdq_id; - - lockdep_assert_held(&file_priv->lock); - -- ivpu_cmdq_unregister_all(file_priv); -+ xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq) -+ ivpu_cmdq_unregister(file_priv, cmdq); +@@ drivers/accel/ivpu/ivpu_job.c: void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv)
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) ivpu_jsm_context_release(vdev, file_priv->ctx.id); @@ drivers/accel/ivpu/ivpu_job.c: void ivpu_jobs_abort_all(struct ivpu_device *vdev xa_for_each(&vdev->submitted_jobs_xa, id, job) ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED); + -+ mutex_unlock(&vdev->submitted_jobs_lock); -+} -+ -+void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id) -+{ -+ struct ivpu_job *job; -+ unsigned long id; -+ -+ mutex_lock(&vdev->submitted_jobs_lock); -+ -+ xa_for_each(&vdev->submitted_jobs_xa, id, job) -+ if (job->file_priv->ctx.id == ctx_id && job->cmdq_id == cmdq_id) -+ ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED); -+ + mutex_unlock(&vdev->submitted_jobs_lock); }
- static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) + static int ivpu_job_submit(struct ivpu_job *job, u8 priority) +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority) goto err_unlock_file_priv; }
- xa_lock(&vdev->submitted_jobs_xa); -+ job->cmdq_id = cmdq->id; -+ + mutex_lock(&vdev->submitted_jobs_lock); + is_first_job = xa_empty(&vdev->submitted_jobs_xa); @@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, }
ret = ivpu_cmdq_push_job(cmdq, job); -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) - job->job_id, file_priv->ctx.id, job->engine_idx, cmdq->priority, +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority) + job->job_id, file_priv->ctx.id, job->engine_idx, priority, job->cmd_buf_vpu_addr, cmdq->jobq->header.tail);
- xa_unlock(&vdev->submitted_jobs_xa); @@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, err_unlock_file_priv: mutex_unlock(&file_priv->lock); ivpu_rpm_put(vdev); -@@ drivers/accel/ivpu/ivpu_job.c: int ivpu_cmdq_create_ioctl(struct drm_device *dev, void *data, struct drm_file * - int ivpu_cmdq_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file *file) - { - struct ivpu_file_priv *file_priv = file->driver_priv; -+ struct ivpu_device *vdev = file_priv->vdev; - struct drm_ivpu_cmdq_destroy *args = data; - struct ivpu_cmdq *cmdq; -+ u32 cmdq_id; - int ret = 0; - - mutex_lock(&file_priv->lock); - - cmdq = xa_load(&file_priv->cmdq_xa, args->cmdq_id); -- if (!cmdq || cmdq->is_legacy) { -+ if (!cmdq || !cmdq->is_valid || cmdq->is_legacy) { - ret = -ENOENT; - goto unlock; - } - -+ /* -+ * There is no way to stop executing jobs per command queue -+ * in OS scheduling mode, mark command queue as invalid instead -+ * and it will be freed together with context release. -+ */ -+ if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) { -+ cmdq->is_valid = false; -+ goto unlock; -+ } -+ -+ cmdq_id = cmdq->id; - ivpu_cmdq_destroy(file_priv, cmdq); -+ ivpu_cmdq_abort_all_jobs(vdev, file_priv->ctx.id, cmdq_id); - unlock: - mutex_unlock(&file_priv->lock); - return ret; @@ drivers/accel/ivpu/ivpu_job.c: ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr, struct vpu_jsm_msg *jsm_msg) { @@ drivers/accel/ivpu/ivpu_job.c: void ivpu_job_done_consumer_fini(struct ivpu_devi +}
## drivers/accel/ivpu/ivpu_job.h ## -@@ drivers/accel/ivpu/ivpu_job.h: struct ivpu_cmdq { - u32 id; - u32 db_id; - u8 priority; -+ bool is_valid; - bool is_legacy; - }; - -@@ drivers/accel/ivpu/ivpu_job.h: struct ivpu_job { - struct ivpu_file_priv *file_priv; - struct dma_fence *done_fence; - u64 cmd_buf_vpu_addr; -+ u32 cmdq_id; - u32 job_id; - u32 engine_idx; - size_t bo_count; -@@ drivers/accel/ivpu/ivpu_job.h: void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv); - - void ivpu_cmdq_release_all_locked(struct ivpu_file_priv *file_priv); - void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev); -+void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id); +@@ drivers/accel/ivpu/ivpu_job.h: void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev);
void ivpu_job_done_consumer_init(struct ivpu_device *vdev); void ivpu_job_done_consumer_fini(struct ivpu_device *vdev); ---
Results of testing on various branches:
| Branch | Patch Apply | Build Test | |---------------------------|-------------|------------| | stable/linux-5.4.y | Success | Success |
On Wed, Apr 30, 2025 at 02:36:51PM +0200, Jacek Lawrynowicz wrote:
From: Karol Wachowski karol.wachowski@intel.com
commit 5bbccadaf33eea2b879d8326ad59ae0663be47d1 upstream.
With hardware scheduler it is not expected to receive JOB_DONE notifications from NPU FW for the jobs aborted due to command queue destroy JSM command.
Remove jobs submitted to unregistered command queue from submitted_jobs_xa to avoid triggering a TDR in such case.
Add explicit submitted_jobs_lock that protects access to list of submitted jobs which is now used to find jobs to abort.
Move context abort procedure to separate work queue not to slow down handling of IPCs or DCT requests in case where job abort takes longer, especially when destruction of the last job of a specific context results in context release.
Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Signed-off-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-4-maciej...
drivers/accel/ivpu/ivpu_drv.c | 32 +++---------- drivers/accel/ivpu/ivpu_drv.h | 2 + drivers/accel/ivpu/ivpu_job.c | 85 +++++++++++++++++++++++++-------- drivers/accel/ivpu/ivpu_job.h | 1 + drivers/accel/ivpu/ivpu_mmu.c | 3 +- drivers/accel/ivpu/ivpu_sysfs.c | 5 +- 6 files changed, 79 insertions(+), 49 deletions(-)
This backport is quite different from the original commit, so please document what you did differently here in the backport down in the signed-off-by area. Same for the other patches in this series.
thanks,
greg k-h
From: Karol Wachowski karol.wachowski@intel.com
commit ab680dc6c78aa035e944ecc8c48a1caab9f39924 upstream.
Fix deadlock in job submission and abort handling. When a thread aborts currently executing jobs due to a fault, it first locks the global lock protecting submitted_jobs (#1).
After the last job is destroyed, it proceeds to release the related context and locks file_priv (#2). Meanwhile, in the job submission thread, the file_priv lock (#2) is taken first, and then the submitted_jobs lock (#1) is obtained when a job is added to the submitted jobs list.
CPU0 CPU1 ---- ---- (for example due to a fault) (jobs submissions keep coming)
lock(&vdev->submitted_jobs_lock) #1 ivpu_jobs_abort_all() job_destroy() lock(&file_priv->lock) #2 lock(&vdev->submitted_jobs_lock) #1 file_priv_release() lock(&vdev->context_list_lock) lock(&file_priv->lock) #2
This order of locking causes a deadlock. To resolve this issue, change the order of locking in ivpu_job_submit().
Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Signed-off-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-12-macie... --- drivers/accel/ivpu/ivpu_job.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index fc91681469e33..5b6d93c20b2da 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -532,6 +532,7 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority) if (ret < 0) return ret;
+ mutex_lock(&vdev->submitted_jobs_lock); mutex_lock(&file_priv->lock);
cmdq = ivpu_cmdq_acquire(file_priv, priority); @@ -539,11 +540,9 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority) ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d engine %d prio %d\n", file_priv->ctx.id, job->engine_idx, priority); ret = -EINVAL; - goto err_unlock_file_priv; + goto err_unlock; }
- mutex_lock(&vdev->submitted_jobs_lock); - is_first_job = xa_empty(&vdev->submitted_jobs_xa); ret = xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit, &file_priv->job_id_next, GFP_KERNEL); @@ -551,7 +550,7 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority) ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n", file_priv->ctx.id); ret = -EBUSY; - goto err_unlock_submitted_jobs; + goto err_unlock; }
ret = ivpu_cmdq_push_job(cmdq, job); @@ -574,22 +573,20 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority) job->job_id, file_priv->ctx.id, job->engine_idx, priority, job->cmd_buf_vpu_addr, cmdq->jobq->header.tail);
- mutex_unlock(&vdev->submitted_jobs_lock); mutex_unlock(&file_priv->lock);
if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW)) { - mutex_lock(&vdev->submitted_jobs_lock); ivpu_job_signal_and_destroy(vdev, job->job_id, VPU_JSM_STATUS_SUCCESS); - mutex_unlock(&vdev->submitted_jobs_lock); }
+ mutex_unlock(&vdev->submitted_jobs_lock); + return 0;
err_erase_xa: xa_erase(&vdev->submitted_jobs_xa, job->job_id); -err_unlock_submitted_jobs: +err_unlock: mutex_unlock(&vdev->submitted_jobs_lock); -err_unlock_file_priv: mutex_unlock(&file_priv->lock); ivpu_rpm_put(vdev); return ret;
[ Sasha's backport helper bot ]
Hi,
✅ All tests passed successfully. No issues detected. No action required from the submitter.
The upstream commit SHA1 provided is correct: ab680dc6c78aa035e944ecc8c48a1caab9f39924
WARNING: Author mismatch between patch and upstream commit: Backport author: Jacek Lawrynowiczjacek.lawrynowicz@linux.intel.com Commit author: Karol Wachowskikarol.wachowski@intel.com
Note: The patch differs from the upstream commit: --- 1: ab680dc6c78aa ! 1: b7d18d7cd5b08 accel/ivpu: Fix locking order in ivpu_job_submit @@ Metadata ## Commit message ## accel/ivpu: Fix locking order in ivpu_job_submit
+ commit ab680dc6c78aa035e944ecc8c48a1caab9f39924 upstream. + Fix deadlock in job submission and abort handling. When a thread aborts currently executing jobs due to a fault, it first locks the global lock protecting submitted_jobs (#1). @@ Commit message This order of locking causes a deadlock. To resolve this issue, change the order of locking in ivpu_job_submit().
+ Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com @@ Commit message Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-12-macie...
## drivers/accel/ivpu/ivpu_job.c ## -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority) if (ret < 0) return ret;
+ mutex_lock(&vdev->submitted_jobs_lock); mutex_lock(&file_priv->lock);
- if (cmdq_id == 0) -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) - if (!cmdq) { - ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d\n", file_priv->ctx.id); + cmdq = ivpu_cmdq_acquire(file_priv, priority); +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority) + ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d engine %d prio %d\n", + file_priv->ctx.id, job->engine_idx, priority); ret = -EINVAL; - goto err_unlock_file_priv; + goto err_unlock; }
- ret = ivpu_cmdq_register(file_priv, cmdq); - if (ret) { - ivpu_err(vdev, "Failed to register command queue: %d\n", ret); -- goto err_unlock_file_priv; -+ goto err_unlock; - } - - job->cmdq_id = cmdq->id; - - mutex_lock(&vdev->submitted_jobs_lock); - is_first_job = xa_empty(&vdev->submitted_jobs_xa); ret = xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit, &file_priv->job_id_next, GFP_KERNEL); -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority) ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n", file_priv->ctx.id); ret = -EBUSY; @@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, }
ret = ivpu_cmdq_push_job(cmdq, job); -@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) - job->job_id, file_priv->ctx.id, job->engine_idx, cmdq->priority, +@@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_submit(struct ivpu_job *job, u8 priority) + job->job_id, file_priv->ctx.id, job->engine_idx, priority, job->cmd_buf_vpu_addr, cmdq->jobq->header.tail);
- mutex_unlock(&vdev->submitted_jobs_lock); ---
Results of testing on various branches:
| Branch | Patch Apply | Build Test | |---------------------------|-------------|------------| | stable/linux-5.4.y | Success | Success |
From: Karol Wachowski karol.wachowski@intel.com
commit dad945c27a42dfadddff1049cf5ae417209a8996 upstream.
Mark as invalid context of a job that returned HW context violation error and queue work that aborts jobs from faulty context. Add engine reset to the context abort thread handler to not only abort currently executing jobs but also to ensure NPU invalid state recovery.
Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Signed-off-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-13-macie... --- drivers/accel/ivpu/ivpu_job.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+)
diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index 5b6d93c20b2da..673801889c7b2 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -482,6 +482,26 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32
lockdep_assert_held(&vdev->submitted_jobs_lock);
+ job = xa_load(&vdev->submitted_jobs_xa, job_id); + if (!job) + return -ENOENT; + + if (job_status == VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW) { + guard(mutex)(&job->file_priv->lock); + + if (job->file_priv->has_mmu_faults) + return 0; + + /* + * Mark context as faulty and defer destruction of the job to jobs abort thread + * handler to synchronize between both faults and jobs returning context violation + * status and ensure both are handled in the same way + */ + job->file_priv->has_mmu_faults = true; + queue_work(system_wq, &vdev->context_abort_work); + return 0; + } + job = ivpu_job_remove_from_submitted_jobs(vdev, job_id); if (!job) return -ENOENT; @@ -793,6 +813,9 @@ void ivpu_context_abort_thread_handler(struct work_struct *work) struct ivpu_job *job; unsigned long id;
+ if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) + ivpu_jsm_reset_engine(vdev, 0); + mutex_lock(&vdev->context_list_lock); xa_for_each(&vdev->context_xa, ctx_id, file_priv) { if (!file_priv->has_mmu_faults || file_priv->aborted) @@ -806,6 +829,8 @@ void ivpu_context_abort_thread_handler(struct work_struct *work)
if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) return; + + ivpu_jsm_hws_resume_engine(vdev, 0); /* * In hardware scheduling mode NPU already has stopped processing jobs * and won't send us any further notifications, thus we have to free job related resources
[ Sasha's backport helper bot ]
Hi,
✅ All tests passed successfully. No issues detected. No action required from the submitter.
The upstream commit SHA1 provided is correct: dad945c27a42dfadddff1049cf5ae417209a8996
WARNING: Author mismatch between patch and upstream commit: Backport author: Jacek Lawrynowiczjacek.lawrynowicz@linux.intel.com Commit author: Karol Wachowskikarol.wachowski@intel.com
Note: The patch differs from the upstream commit: --- 1: dad945c27a42d ! 1: 2c62f69a5a389 accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW @@ Metadata ## Commit message ## accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
+ commit dad945c27a42dfadddff1049cf5ae417209a8996 upstream. + Mark as invalid context of a job that returned HW context violation error and queue work that aborts jobs from faulty context. Add engine reset to the context abort thread handler to not only abort currently executing jobs but also to ensure NPU invalid state recovery.
+ Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Karol Wachowski karol.wachowski@intel.com Signed-off-by: Maciej Falkowski maciej.falkowski@linux.intel.com Reviewed-by: Jacek Lawrynowicz jacek.lawrynowicz@linux.intel.com @@ drivers/accel/ivpu/ivpu_job.c: static int ivpu_job_signal_and_destroy(struct ivp job = ivpu_job_remove_from_submitted_jobs(vdev, job_id); if (!job) return -ENOENT; -@@ drivers/accel/ivpu/ivpu_job.c: void ivpu_context_abort_work_fn(struct work_struct *work) - unsigned long ctx_id; +@@ drivers/accel/ivpu/ivpu_job.c: void ivpu_context_abort_thread_handler(struct work_struct *work) + struct ivpu_job *job; unsigned long id;
+ if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) @@ drivers/accel/ivpu/ivpu_job.c: void ivpu_context_abort_work_fn(struct work_struc mutex_lock(&vdev->context_list_lock); xa_for_each(&vdev->context_xa, ctx_id, file_priv) { if (!file_priv->has_mmu_faults || file_priv->aborted) -@@ drivers/accel/ivpu/ivpu_job.c: void ivpu_context_abort_work_fn(struct work_struct *work) +@@ drivers/accel/ivpu/ivpu_job.c: void ivpu_context_abort_thread_handler(struct work_struct *work)
if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) return; ---
Results of testing on various branches:
| Branch | Patch Apply | Build Test | |---------------------------|-------------|------------| | stable/linux-5.4.y | Success | Success |
linux-stable-mirror@lists.linaro.org