On 12/11/25 16:53, Tvrtko Ursulin wrote:
On 11/12/2025 13:16, Christian König wrote:
This allows amdgpu_fences to outlive the amdgpu module.
v2: use dma_fence_get_rcu_safe to be NULL safe here.
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 63 +++++++---------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 - 2 files changed, 20 insertions(+), 44 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index c7843e336310..c636347801c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -112,8 +112,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct amdgpu_fence *af, af->ring = ring; seq = ++ring->fence_drv.sync_seq; - dma_fence_init(fence, &amdgpu_fence_ops, - &ring->fence_drv.lock, + dma_fence_init(fence, &amdgpu_fence_ops, NULL, adev->fence_context + ring->idx, seq); amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, @@ -467,7 +466,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring) timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0); ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1; - spin_lock_init(&ring->fence_drv.lock); ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2, sizeof(void *), GFP_KERNEL); @@ -654,16 +652,20 @@ void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error) struct amdgpu_fence_driver *drv = &ring->fence_drv; unsigned long flags; - spin_lock_irqsave(&drv->lock, flags); + rcu_read_lock(); for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) { struct dma_fence *fence; - fence = rcu_dereference_protected(drv->fences[i], - lockdep_is_held(&drv->lock)); - if (fence && !dma_fence_is_signaled_locked(fence)) + fence = dma_fence_get_rcu(drv->fences[i]);
dma_fence_get_rcu is not safe against passing a NULL fence in, while the existing code makes it look like drv->fence[] slot can contain NULL at this point?
amdgpu_fence_process() is the place which can NULL the slots? Irq context? Why is that safe with no reference held from clearing the slot to operating on the fence?
The slots are never cleared. It can only be that they are NULL when the driver is loaded.
I've switched over to dma_fence_get_rcu_safe() where appropriated.
Regards, Christian.
+ if (!fence) + continue;
+ dma_fence_lock_irqsave(fence, flags); + if (!dma_fence_is_signaled_locked(fence)) dma_fence_set_error(fence, error); + dma_fence_unlock_irqrestore(fence, flags); } - spin_unlock_irqrestore(&drv->lock, flags); + rcu_read_unlock(); } /** @@ -714,16 +716,19 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af) seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask; /* mark all fences from the guilty context with an error */ - spin_lock_irqsave(&ring->fence_drv.lock, flags); + rcu_read_lock(); do { last_seq++; last_seq &= ring->fence_drv.num_fences_mask; ptr = &ring->fence_drv.fences[last_seq]; - rcu_read_lock(); - unprocessed = rcu_dereference(*ptr); + unprocessed = dma_fence_get_rcu_safe(ptr);
Similar concern like the above.
Regards,
Tvrtko
+ if (!unprocessed) + continue; - if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) { + dma_fence_lock_irqsave(unprocessed, flags); + if (dma_fence_is_signaled_locked(unprocessed)) { fence = container_of(unprocessed, struct amdgpu_fence, base); if (fence == af) @@ -731,9 +736,10 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af) else if (fence->context == af->context) dma_fence_set_error(&fence->base, -ECANCELED); } - rcu_read_unlock(); + dma_fence_unlock_irqrestore(unprocessed, flags); + dma_fence_put(unprocessed); } while (last_seq != seq); - spin_unlock_irqrestore(&ring->fence_drv.lock, flags); + rcu_read_unlock(); /* signal the guilty fence */ amdgpu_fence_write(ring, (u32)af->base.seqno); amdgpu_fence_process(ring); @@ -823,39 +829,10 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f) return true; } -/**
- amdgpu_fence_free - free up the fence memory
- @rcu: RCU callback head
- Free up the fence memory after the RCU grace period.
- */
-static void amdgpu_fence_free(struct rcu_head *rcu) -{ - struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
- /* free fence_slab if it's separated fence*/ - kfree(to_amdgpu_fence(f)); -}
-/**
- amdgpu_fence_release - callback that fence can be freed
- @f: fence
- This function is called when the reference count becomes zero.
- It just RCU schedules freeing up the fence.
- */
-static void amdgpu_fence_release(struct dma_fence *f) -{ - call_rcu(&f->rcu, amdgpu_fence_free); -}
static const struct dma_fence_ops amdgpu_fence_ops = { .get_driver_name = amdgpu_fence_get_driver_name, .get_timeline_name = amdgpu_fence_get_timeline_name, .enable_signaling = amdgpu_fence_enable_signaling, - .release = amdgpu_fence_release, }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 7a27c6c4bb44..9cbf63454004 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -125,7 +125,6 @@ struct amdgpu_fence_driver { unsigned irq_type; struct timer_list fallback_timer; unsigned num_fences_mask; - spinlock_t lock; struct dma_fence **fences; };
linaro-mm-sig@lists.linaro.org