-This is purely a timing issue: sometimes the job is freed before the job is done. To fix this, move the 'dma_fence_cb' callback from the job (struct drm_sched_job) to the scheduler fence (struct drm_sched_fence).
-Add drm_sched_fence_set_parent() and drm_sched_fence_clear_parent() to move the parent fence handling into sched_fence.c; this part is just cleanup.
BUG: kernel NULL pointer dereference, address: 0000000000000088 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched] Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708 FS: 0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0 Call Trace: <IRQ> drm_sched_job_done_cb+0x12/0x20 [gpu_sched] dma_fence_signal_timestamp_locked+0x7e/0x110 dma_fence_signal+0x31/0x60 amdgpu_fence_process+0xc4/0x140 [amdgpu] gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu] amdgpu_irq_dispatch+0xb7/0x210 [amdgpu] amdgpu_ih_process+0x86/0x100 [amdgpu] amdgpu_irq_handler+0x24/0x60 [amdgpu] __handle_irq_event_percpu+0x4b/0x190 handle_irq_event_percpu+0x15/0x50 handle_irq_event+0x39/0x60 handle_edge_irq+0xaf/0x210 __common_interrupt+0x6e/0x110 common_interrupt+0xc1/0xe0 </IRQ> <TASK>
Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com> ---
Changes in v2: Move the 'dma_fence_cb' callback from the job (struct drm_sched_job) to the scheduler fence (struct drm_sched_fence) instead of adding a NULL check for s_fence.
Changes in v3: Add the drm_sched_fence_set_parent() function (and the other *_parent_cb helpers) in sched_fence.c. Move the parent fence initialization and callback installation into it (this is just cleanup).
Changes in v4: Add the drm_sched_fence_clear_parent() function in sched_fence.c and address the review comments. --- drivers/gpu/drm/scheduler/sched_fence.c | 64 +++++++++++++++++++++++++ drivers/gpu/drm/scheduler/sched_main.c | 53 ++++---------------- include/drm/gpu_scheduler.h | 10 +++- 3 files changed, 81 insertions(+), 46 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 7fd869520ef2..68343614f9ed 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -78,6 +78,70 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu) kmem_cache_free(sched_fence_slab, fence); }
+/** + * drm_sched_fence_parent_cb - the callback for a done job + * @f: fence + * @cb: fence callbacks + */ +static void drm_sched_fence_parent_cb(struct dma_fence *f, struct dma_fence_cb *cb) +{ + struct drm_sched_fence *s_fence = container_of(cb, struct drm_sched_fence, + cb); + struct drm_gpu_scheduler *sched = s_fence->sched; + + atomic_dec(&sched->hw_rq_count); + atomic_dec(sched->score); + + dma_fence_get(&s_fence->finished); + drm_sched_fence_finished(s_fence); + dma_fence_put(&s_fence->finished); + wake_up_interruptible(&sched->wake_up_worker); +} + +/** + * drm_sched_fence_clear_parent - Remove callbacks from pending list + * @s_fence: pointer to the fence + * + * Remove callbacks from pending list and clear the parent fence. + */ +bool drm_sched_fence_clear_parent(struct drm_sched_fence *s_fence) +{ + if (s_fence->parent && + dma_fence_remove_callback(s_fence->parent, &s_fence->cb)) { + dma_fence_put(s_fence->parent); + s_fence->parent = NULL; + return true; + } + + return false; +} + +/** + * drm_sched_fence_set_parent - set the parent fence and add the callback + * @s_fence: pointer to the fence + * fence: pointer to the hw fence + * + * Set the parent fence and install the callback for a done job. + */ +void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence, + struct dma_fence *fence) +{ + int r; + + if (s_fence->parent && + dma_fence_remove_callback(s_fence->parent, &s_fence->cb)) + dma_fence_put(s_fence->parent); + + /* We keep the reference of the parent fence here. 
*/ + swap(s_fence->parent, fence); + dma_fence_put(fence); + + r = dma_fence_add_callback(s_fence->parent, &s_fence->cb, + drm_sched_fence_parent_cb); + if (r == -ENOENT) + drm_sched_fence_parent_cb(NULL, &s_fence->cb); +} + /** * drm_sched_fence_free - free up an uninitialized fence * diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 4cc59bae38dd..30597d9a949f 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -253,13 +253,12 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
/** * drm_sched_job_done - complete a job - * @s_job: pointer to the job which is done + * @s_fence: pointer to the fence of a done job * * Finish the job's fence and wake up the worker thread. */ -static void drm_sched_job_done(struct drm_sched_job *s_job) +static void drm_sched_job_done(struct drm_sched_fence *s_fence) { - struct drm_sched_fence *s_fence = s_job->s_fence; struct drm_gpu_scheduler *sched = s_fence->sched;
atomic_dec(&sched->hw_rq_count); @@ -273,18 +272,6 @@ static void drm_sched_job_done(struct drm_sched_job *s_job) wake_up_interruptible(&sched->wake_up_worker); }
-/** - * drm_sched_job_done_cb - the callback for a done job - * @f: fence - * @cb: fence callbacks - */ -static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) -{ - struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb); - - drm_sched_job_done(s_job); -} - /** * drm_sched_dependency_optimized - test if the dependency can be optimized * @@ -504,11 +491,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) */ list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { - if (s_job->s_fence->parent && - dma_fence_remove_callback(s_job->s_fence->parent, - &s_job->cb)) { - dma_fence_put(s_job->s_fence->parent); - s_job->s_fence->parent = NULL; + if (drm_sched_fence_clear_parent(s_job->s_fence)) { atomic_dec(&sched->hw_rq_count); } else { /* @@ -560,7 +543,6 @@ EXPORT_SYMBOL(drm_sched_stop); void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) { struct drm_sched_job *s_job, *tmp; - int r;
/* * Locking the list is not required here as the sched thread is parked @@ -575,16 +557,10 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) if (!full_recovery) continue;
- if (fence) { - r = dma_fence_add_callback(fence, &s_job->cb, - drm_sched_job_done_cb); - if (r == -ENOENT) - drm_sched_job_done(s_job); - else if (r) - DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", - r); - } else - drm_sched_job_done(s_job); + if (fence) + drm_sched_fence_set_parent(s_job->s_fence, fence); + else + drm_sched_job_done(s_job->s_fence); }
if (full_recovery) { @@ -1008,7 +984,6 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched) static int drm_sched_main(void *param) { struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param; - int r;
sched_set_fifo_low(current);
@@ -1049,22 +1024,12 @@ static int drm_sched_main(void *param) drm_sched_fence_scheduled(s_fence);
if (!IS_ERR_OR_NULL(fence)) { - s_fence->parent = dma_fence_get(fence); - /* Drop for original kref_init of the fence */ - dma_fence_put(fence); - - r = dma_fence_add_callback(fence, &sched_job->cb, - drm_sched_job_done_cb); - if (r == -ENOENT) - drm_sched_job_done(sched_job); - else if (r) - DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", - r); + drm_sched_fence_set_parent(s_fence, fence); } else { if (IS_ERR(fence)) dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
- drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); }
wake_up(&sched->job_scheduled); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 1f7d9dd1a444..5066729c15ce 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -281,6 +281,10 @@ struct drm_sched_fence { * @owner: job owner for debugging */ void *owner; + /** + * @cb: callback + */ + struct dma_fence_cb cb; };
struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * be scheduled further. * @s_priority: the priority of the job. * @entity: the entity to which this job belongs. - * @cb: the callback for the parent fence in s_fence. * * A job is created by the driver using drm_sched_job_init(), and * should call drm_sched_entity_push_job() once it wants the scheduler @@ -325,7 +328,6 @@ struct drm_sched_job { atomic_t karma; enum drm_sched_priority s_priority; struct drm_sched_entity *entity; - struct dma_fence_cb cb; /** * @dependencies: * @@ -559,6 +561,10 @@ void drm_sched_fence_free(struct drm_sched_fence *fence); void drm_sched_fence_scheduled(struct drm_sched_fence *fence); void drm_sched_fence_finished(struct drm_sched_fence *fence);
+bool drm_sched_fence_clear_parent(struct drm_sched_fence *s_fence); +void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence, + struct dma_fence *fence); + unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining);