When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquires the VM root PD's reservation lock via amdgpu_vm_lock_by_pasid() and then, for each IB referenced by the job, calls amdgpu_bo_reserve() on the BO that backs the IB. Both reservations are taken on reservation_ww_class_mutex objects but neither uses a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected -------------------------------------------- kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu] process_one_work worker_thread kthread
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and each invocation produces this splat, drowning the kernel ring buffer.
Fix it by collecting the per-IB BO references under the root PD's reservation, then releasing the root before reserving each IB BO individually. The walk over the VM mapping tree must remain under the root lock (mappings can be torn down without it), but the actual content copies do not need to nest inside it. Each per-IB reservation is now an independent top-level acquire, eliminating the nested ww_mutex.
The collect/release logic is factored out into two small helpers (amdgpu_devcoredump_collect_ib_refs / amdgpu_devcoredump_release_ib_refs) to keep the main function's indentation reasonable.
This also fixes a BO refcount leak in the original code: when amdgpu_bo_reserve() failed, control jumped to free_ib_content without running amdgpu_bo_unref(). In the new structure the per-IB BO refs are released unconditionally in the cleanup helper.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Cc: stable@vger.kernel.org # 7.1 Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 147 +++++++++++++----- 1 file changed, 110 insertions(+), 37 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..f6bb968de756 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -207,6 +207,72 @@ static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, } }
+struct amdgpu_devcoredump_ib_ref { + struct amdgpu_bo *bo; + u64 offset; +}; + +/* + * Walk the VM's mapping tree under the root PD's reservation to obtain the BO + * that backs each IB and pin it with a refcount. The root PD reservation is + * dropped before this function returns; the caller can then reserve each IB + * BO individually without nesting ww_mutex acquires on + * reservation_ww_class_mutex. + * + * Returns an array of num_ibs entries (each ib_refs[i].bo may be NULL if its + * mapping was not found), or NULL on allocation failure / VM lookup failure. + * The caller must release the BO refs and free the array. + */ +static struct amdgpu_devcoredump_ib_ref * +amdgpu_devcoredump_collect_ib_refs(struct amdgpu_device *adev, + struct amdgpu_coredump_info *coredump) +{ + struct amdgpu_devcoredump_ib_ref *ib_refs; + struct amdgpu_bo_va_mapping *mapping; + struct amdgpu_bo *root; + struct amdgpu_vm *vm; + u64 va_start; + + ib_refs = kcalloc(coredump->num_ibs, sizeof(*ib_refs), GFP_KERNEL); + if (!ib_refs) + return NULL; + + vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!vm) { + kfree(ib_refs); + return NULL; + } + + for (int i = 0; i < coredump->num_ibs; i++) { + va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; + mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); + if (!mapping) + continue; + + ib_refs[i].bo = amdgpu_bo_ref(mapping->bo_va->base.bo); + ib_refs[i].offset = va_start - + mapping->start * AMDGPU_GPU_PAGE_SIZE; + } + + amdgpu_bo_unreserve(root); + amdgpu_bo_unref(&root); + + return ib_refs; +} + +static void +amdgpu_devcoredump_release_ib_refs(struct amdgpu_devcoredump_ib_ref *ib_refs, + int num_ibs) +{ + if (!ib_refs) + return; + + for (int i = 0; i < num_ibs; i++) + if (ib_refs[i].bo) + amdgpu_bo_unref(&ib_refs[i].bo); + kfree(ib_refs); +} + static ssize_t amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump) { @@ -214,13 +280,11 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; - struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor; - struct amdgpu_bo *abo, *root; - uint64_t va_start, offset; + struct amdgpu_bo *abo; + uint64_t offset; struct amdgpu_ring *ring; - struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r; @@ -343,43 +407,52 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->num_ibs) { - /* Don't try to lookup the VM or map the BOs when calculating the - * size required to store the devcoredump. + struct amdgpu_devcoredump_ib_ref *ib_refs = NULL; + + /* + * Snapshot per-IB BO references under the root PD's reservation, + * then release the root before reserving each IB BO individually + * to copy its contents. + * + * Reserving an IB BO while the root PD is still reserved would + * be a nested ww_mutex acquire on reservation_ww_class_mutex + * without a ww_acquire_ctx, which trips lockdep's recursive- + * locking check and self-deadlocks for IB BOs that share their + * dma_resv with the root PD (always-valid BOs). + * + * Skip lookup/reservation entirely on the sizing pass: it does + * not write IB content, and the size estimate doesn't depend on + * whether the BOs are reachable. */ - if (sizing_pass) - vm = NULL; - else - vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!sizing_pass) + ib_refs = amdgpu_devcoredump_collect_ib_refs(adev, coredump);
- for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { + for (int i = 0; i < coredump->num_ibs; i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
- /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the - * drm_printf() calls (ib_content doesn't need to be initialized - * as its content won't be written anywhere). - */ - if (!vm) + if (sizing_pass) goto output_ib_content;
- va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; - mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); - if (!mapping) - goto free_ib_content; + if (!ib_refs || !ib_refs[i].bo) + goto output_ib_content; + + abo = ib_refs[i].bo; + offset = ib_refs[i].offset;
- offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); - abo = amdgpu_bo_ref(mapping->bo_va->base.bo); r = amdgpu_bo_reserve(abo, false); if (r) - goto free_ib_content; + goto output_ib_content;
if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0;
- if (abo->tbo.resource->mem_type != TTM_PL_VRAM) - goto unreserve_abo; + if (abo->tbo.resource->mem_type != TTM_PL_VRAM) { + amdgpu_bo_unreserve(abo); + goto output_ib_content; + }
amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -395,8 +468,10 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); - if (r) - goto unreserve_abo; + if (r) { + amdgpu_bo_unreserve(abo); + goto output_ib_content; + }
kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -406,21 +481,19 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf amdgpu_bo_kunmap(abo); }
+ amdgpu_bo_unreserve(abo); + output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); - for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) - drm_printf(&p, "0x%08x\n", ib_content[j]); -unreserve_abo: - if (vm) - amdgpu_bo_unreserve(abo); -free_ib_content: + if (!sizing_pass && ib_refs && ib_refs[i].bo) { + for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) + drm_printf(&p, "0x%08x\n", ib_content[j]); + } kvfree(ib_content); } - if (vm) { - amdgpu_bo_unreserve(root); - amdgpu_bo_unref(&root); - } + + amdgpu_devcoredump_release_ib_refs(ib_refs, coredump->num_ibs); }
return count - iter.remain;
On Wed, Apr 29, 2026 at 7:37 PM Mikhail Gavrilov mikhail.v.gavrilov@gmail.com wrote:
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquires the VM root PD's reservation lock via amdgpu_vm_lock_by_pasid() and then, for each IB referenced by the job, calls amdgpu_bo_reserve() on the BO that backs the IB. Both reservations are taken on reservation_ww_class_mutex objects but neither uses a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected
kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu] process_one_work worker_thread kthread
Friendly ping. Pierre-Eric, Christian, Alex — any thoughts on this fix?
Happy to spin a v2 with any review feedback. One thing I'm aware of: the `Cc: stable@vger.kernel.org # 7.1` tag is probably unnecessary since the regression only landed in 7.1-rc1 and the fix will reach 7.1 final naturally via drm-fixes; I can drop it in v2 if preferred.
On 5/19/26 15:05, Mikhail Gavrilov wrote:
On Wed, Apr 29, 2026 at 7:37 PM Mikhail Gavrilov mikhail.v.gavrilov@gmail.com wrote:
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquires the VM root PD's reservation lock via amdgpu_vm_lock_by_pasid() and then, for each IB referenced by the job, calls amdgpu_bo_reserve() on the BO that backs the IB. Both reservations are taken on reservation_ww_class_mutex objects but neither uses a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected
kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu] process_one_work worker_thread kthread
Friendly ping. Pierre-Eric, Christian, Alex — any thoughts on this fix?
Happy to spin a v2 with any review feedback. One thing I'm aware of: the `Cc: stable@vger.kernel.org # 7.1` tag is probably unnecessary since the regression only landed in 7.1-rc1 and the fix will reach 7.1 final naturally via drm-fixes; I can drop it in v2 if preferred.
Good catch, but the fix is complete overkill.
You can lock multiple BOs at the same time, something like that here should do it:
drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 2); drm_exec_until_all_locked(&exec) { ret = amdgpu_vm_lock_pd(vm, &exec, 1); drm_exec_retry_on_contention(&exec); if (unlikely(ret)) goto fail_lock;
mapping = amdgpu_vm_bo_lookup_mapping(vm, ib_addr >> PAGE_SHIFT); if (!wptr_mapping) { ret = -EINVAL; goto fail_lock; }
obj = mapping->bo_va->base.bo; ret = drm_exec_lock_obj(&exec, &obj->tbo.base); drm_exec_retry_on_contention(&exec); if (unlikely(ret)) goto fail_lock; }
@Pierre-Eric can you take a look at that as well?
Thanks in advance, Christian.
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquires the VM root PD's reservation lock via amdgpu_vm_lock_by_pasid() and then, for each IB referenced by the job, calls amdgpu_bo_reserve() on the BO that backs the IB. Both reservations are taken on reservation_ww_class_mutex objects but neither uses a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected -------------------------------------------- kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
Fix it in two steps:
1. Collect per-IB BO references under the root PD's reservation, then release the root before locking the IB BOs. The walk over the VM mapping tree must remain under the root lock (mappings can be torn down without it), but the actual content copies do not.
2. Lock all the IB BOs together using drm_exec(9) with a single ww_acquire_ctx. DRM_EXEC_IGNORE_DUPLICATES handles the case where IB BOs share a dma_resv (e.g. always-valid BOs). Each lock attempt is now a top-level acquire under one ticket, with retry-on- contention handled by drm_exec; the recursive ww_mutex condition is gone.
The collect/lock/release logic is factored out into three small helpers (amdgpu_devcoredump_{collect,lock,release}_ib_refs) to keep the main function readable and within the kernel coding style indentation guideline.
This also fixes a BO refcount leak in the original code: when amdgpu_bo_reserve() failed, control jumped to free_ib_content without running amdgpu_bo_unref(). In the new structure the per-IB BO refs are released unconditionally in the cleanup helper.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation.
v2: switch from per-IB amdgpu_bo_reserve() to drm_exec for the IB BO locking as suggested by Christian König; the snapshot approach for collecting BO references under the root PD's reservation is retained.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 187 ++++++++++++++---- 1 file changed, 148 insertions(+), 39 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..9ac958cf09fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@
#include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h"
@@ -207,6 +208,108 @@ static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, } }
+struct amdgpu_devcoredump_ib_ref { + struct amdgpu_bo *bo; + u64 offset; +}; + +/* + * Walk the VM's mapping tree under the root PD's reservation to obtain the BO + * that backs each IB and pin it with a refcount. The root PD reservation is + * dropped before this function returns; the caller can then lock each IB BO + * via drm_exec without nesting reservations on reservation_ww_class_mutex. + * + * Returns an array of num_ibs entries (each ib_refs[i].bo may be NULL if its + * mapping was not found), or NULL on allocation failure / VM lookup failure. + * The caller must release the BO refs and free the array via + * amdgpu_devcoredump_release_ib_refs(). + */ +static struct amdgpu_devcoredump_ib_ref * +amdgpu_devcoredump_collect_ib_refs(struct amdgpu_device *adev, + struct amdgpu_coredump_info *coredump) +{ + struct amdgpu_devcoredump_ib_ref *ib_refs; + struct amdgpu_bo_va_mapping *mapping; + struct amdgpu_bo *root; + struct amdgpu_vm *vm; + u64 va_start; + + ib_refs = kcalloc(coredump->num_ibs, sizeof(*ib_refs), GFP_KERNEL); + if (!ib_refs) + return NULL; + + vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!vm) { + kfree(ib_refs); + return NULL; + } + + for (int i = 0; i < coredump->num_ibs; i++) { + va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; + mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); + if (!mapping) + continue; + + ib_refs[i].bo = amdgpu_bo_ref(mapping->bo_va->base.bo); + ib_refs[i].offset = va_start - + mapping->start * AMDGPU_GPU_PAGE_SIZE; + } + + amdgpu_bo_unreserve(root); + amdgpu_bo_unref(&root); + + return ib_refs; +} + +static void +amdgpu_devcoredump_release_ib_refs(struct amdgpu_devcoredump_ib_ref *ib_refs, + int num_ibs) +{ + if (!ib_refs) + return; + + for (int i = 0; i < num_ibs; i++) + if (ib_refs[i].bo) + amdgpu_bo_unref(&ib_refs[i].bo); + kfree(ib_refs); +} + +/* + * Lock all collected IB BOs together using a single drm_exec ticket. This + * eliminates the nested ww_mutex acquire that lockdep flags as recursive + * locking (and that becomes a real self-deadlock for IB BOs sharing their + * dma_resv with the root PD). + * + * Returns 0 if drm_exec was initialised and the BOs are locked; the caller + * must call drm_exec_fini() on success. Returns non-zero on failure, in which + * case drm_exec is already torn down. + */ +static int +amdgpu_devcoredump_lock_ib_refs(struct drm_exec *exec, + struct amdgpu_devcoredump_ib_ref *ib_refs, + int num_ibs) +{ + int r = 0; + + drm_exec_init(exec, DRM_EXEC_IGNORE_DUPLICATES, num_ibs); + drm_exec_until_all_locked(exec) { + r = 0; + for (int i = 0; i < num_ibs; i++) { + if (!ib_refs[i].bo) + continue; + r = drm_exec_lock_obj(exec, &ib_refs[i].bo->tbo.base); + drm_exec_retry_on_contention(exec); + if (r) + break; + } + if (r) + break; + } + if (r) + drm_exec_fini(exec); + return r; +} + static ssize_t amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump) { @@ -214,13 +317,9 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; - struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor; - struct amdgpu_bo *abo, *root; - uint64_t va_start, offset; struct amdgpu_ring *ring; - struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r; @@ -343,43 +442,52 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->num_ibs) { - /* Don't try to lookup the VM or map the BOs when calculating the - * size required to store the devcoredump. + struct amdgpu_devcoredump_ib_ref *ib_refs = NULL; + struct drm_exec exec; + bool ibs_locked = false; + + /* + * Collect the BO that backs each IB under the root PD's + * reservation, drop the root reservation, then lock all the + * IB BOs together in one drm_exec ticket. This avoids nesting + * amdgpu_bo_reserve() inside the root PD's reservation, which + * would be a recursive reservation_ww_class_mutex acquire + * without a ww_acquire_ctx (lockdep splat, and a real + * self-deadlock for always-valid BOs that share their dma_resv + * with the root PD). + * + * Skip lookup/locking entirely on the sizing pass: it does not + * write IB content, and the size estimate doesn't depend on + * whether the BOs are reachable. */ - if (sizing_pass) - vm = NULL; - else - vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!sizing_pass) { + ib_refs = amdgpu_devcoredump_collect_ib_refs(adev, coredump); + if (ib_refs) { + r = amdgpu_devcoredump_lock_ib_refs(&exec, ib_refs, + coredump->num_ibs); + if (!r) + ibs_locked = true; + } + } + + for (int i = 0; i < coredump->num_ibs; i++) { + struct amdgpu_bo *abo = ibs_locked ? ib_refs[i].bo : NULL; + u64 offset = ibs_locked ? ib_refs[i].offset : 0; + bool emit_content = sizing_pass;
- for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
- /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the - * drm_printf() calls (ib_content doesn't need to be initialized - * as its content won't be written anywhere). - */ - if (!vm) + if (!abo) goto output_ib_content;
- va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; - mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); - if (!mapping) - goto free_ib_content; - - offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); - abo = amdgpu_bo_ref(mapping->bo_va->base.bo); - r = amdgpu_bo_reserve(abo, false); - if (r) - goto free_ib_content; - if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0;
if (abo->tbo.resource->mem_type != TTM_PL_VRAM) - goto unreserve_abo; + goto output_ib_content;
amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -391,12 +499,13 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf off += cursor.size; amdgpu_res_next(&cursor, cursor.size); } + emit_content = true; } else { r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); if (r) - goto unreserve_abo; + goto output_ib_content;
kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -404,23 +513,23 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf coredump->ibs[i].ib_size_dw * 4);
amdgpu_bo_kunmap(abo); + emit_content = true; }
output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); - for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) - drm_printf(&p, "0x%08x\n", ib_content[j]); -unreserve_abo: - if (vm) - amdgpu_bo_unreserve(abo); -free_ib_content: + if (emit_content) { + for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) + drm_printf(&p, "0x%08x\n", ib_content[j]); + } kvfree(ib_content); } - if (vm) { - amdgpu_bo_unreserve(root); - amdgpu_bo_unref(&root); - } + + if (ibs_locked) + drm_exec_fini(&exec); + + amdgpu_devcoredump_release_ib_refs(ib_refs, coredump->num_ibs); }
return count - iter.remain;
On 5/19/26 18:15, Mikhail Gavrilov wrote:
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquires the VM root PD's reservation lock via amdgpu_vm_lock_by_pasid() and then, for each IB referenced by the job, calls amdgpu_bo_reserve() on the BO that backs the IB. Both reservations are taken on reservation_ww_class_mutex objects but neither uses a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected
kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
Fix it in two steps:
Collect per-IB BO references under the root PD's reservation, then release the root before locking the IB BOs. The walk over the VM mapping tree must remain under the root lock (mappings can be torn down without it), but the actual content copies do not.
Lock all the IB BOs together using drm_exec(9) with a single ww_acquire_ctx. DRM_EXEC_IGNORE_DUPLICATES handles the case where IB BOs share a dma_resv (e.g. always-valid BOs). Each lock attempt is now a top-level acquire under one ticket, with retry-on- contention handled by drm_exec; the recursive ww_mutex condition is gone.
The collect/lock/release logic is factored out into three small helpers (amdgpu_devcoredump_{collect,lock,release}_ib_refs) to keep the main function readable and within the kernel coding style indentation guideline.
This also fixes a BO refcount leak in the original code: when amdgpu_bo_reserve() failed, control jumped to free_ib_content without running amdgpu_bo_unref(). In the new structure the per-IB BO refs are released unconditionally in the cleanup helper.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation.
v2: switch from per-IB amdgpu_bo_reserve() to drm_exec for the IB BO locking as suggested by Christian König; the snapshot approach for collecting BO references under the root PD's reservation is retained.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 187 ++++++++++++++---- 1 file changed, 148 insertions(+), 39 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..9ac958cf09fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@ #include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h" @@ -207,6 +208,108 @@ static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, } } +struct amdgpu_devcoredump_ib_ref {
- struct amdgpu_bo *bo;
- u64 offset;
+};
+/*
- Walk the VM's mapping tree under the root PD's reservation to obtain the BO
- that backs each IB and pin it with a refcount. The root PD reservation is
- dropped before this function returns; the caller can then lock each IB BO
- via drm_exec without nesting reservations on reservation_ww_class_mutex.
- Returns an array of num_ibs entries (each ib_refs[i].bo may be NULL if its
- mapping was not found), or NULL on allocation failure / VM lookup failure.
- The caller must release the BO refs and free the array via
- amdgpu_devcoredump_release_ib_refs().
- */
+static struct amdgpu_devcoredump_ib_ref * +amdgpu_devcoredump_collect_ib_refs(struct amdgpu_device *adev,
struct amdgpu_coredump_info *coredump)+{
- struct amdgpu_devcoredump_ib_ref *ib_refs;
- struct amdgpu_bo_va_mapping *mapping;
- struct amdgpu_bo *root;
- struct amdgpu_vm *vm;
- u64 va_start;
- ib_refs = kcalloc(coredump->num_ibs, sizeof(*ib_refs), GFP_KERNEL);
- if (!ib_refs)
return NULL;- vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);
- if (!vm) {
kfree(ib_refs);return NULL;- }
- for (int i = 0; i < coredump->num_ibs; i++) {
va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK;mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE);if (!mapping)continue;ib_refs[i].bo = amdgpu_bo_ref(mapping->bo_va->base.bo);ib_refs[i].offset = va_start -mapping->start * AMDGPU_GPU_PAGE_SIZE;- }
- amdgpu_bo_unreserve(root);
- amdgpu_bo_unref(&root);
- return ib_refs;
+}
That whole infrastructure is superflous. You just need to modify amdgpu_vm_lock_by_pasid() to take a drm_exec object to lock the root BO.
Regards, Christian.
+static void +amdgpu_devcoredump_release_ib_refs(struct amdgpu_devcoredump_ib_ref *ib_refs,
int num_ibs)+{
- if (!ib_refs)
return;- for (int i = 0; i < num_ibs; i++)
if (ib_refs[i].bo)amdgpu_bo_unref(&ib_refs[i].bo);- kfree(ib_refs);
+}
+/*
- Lock all collected IB BOs together using a single drm_exec ticket. This
- eliminates the nested ww_mutex acquire that lockdep flags as recursive
- locking (and that becomes a real self-deadlock for IB BOs sharing their
- dma_resv with the root PD).
- Returns 0 if drm_exec was initialised and the BOs are locked; the caller
- must call drm_exec_fini() on success. Returns non-zero on failure, in which
- case drm_exec is already torn down.
- */
+static int +amdgpu_devcoredump_lock_ib_refs(struct drm_exec *exec,
struct amdgpu_devcoredump_ib_ref *ib_refs,int num_ibs)+{
- int r = 0;
- drm_exec_init(exec, DRM_EXEC_IGNORE_DUPLICATES, num_ibs);
- drm_exec_until_all_locked(exec) {
r = 0;for (int i = 0; i < num_ibs; i++) {if (!ib_refs[i].bo)continue;r = drm_exec_lock_obj(exec, &ib_refs[i].bo->tbo.base);drm_exec_retry_on_contention(exec);if (r)break;}if (r)break;- }
- if (r)
drm_exec_fini(exec);- return r;
+}
static ssize_t amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump) { @@ -214,13 +317,9 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info;
- struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor;
- struct amdgpu_bo *abo, *root;
- uint64_t va_start, offset; struct amdgpu_ring *ring;
- struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r;
@@ -343,43 +442,52 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n"); if (coredump->num_ibs) {
/* Don't try to lookup the VM or map the BOs when calculating the* size required to store the devcoredump.
struct amdgpu_devcoredump_ib_ref *ib_refs = NULL;struct drm_exec exec;bool ibs_locked = false;/** Collect the BO that backs each IB under the root PD's* reservation, drop the root reservation, then lock all the* IB BOs together in one drm_exec ticket. This avoids nesting* amdgpu_bo_reserve() inside the root PD's reservation, which* would be a recursive reservation_ww_class_mutex acquire* without a ww_acquire_ctx (lockdep splat, and a real* self-deadlock for always-valid BOs that share their dma_resv* with the root PD).** Skip lookup/locking entirely on the sizing pass: it does not* write IB content, and the size estimate doesn't depend on */* whether the BOs are reachable.
if (sizing_pass)vm = NULL;elsevm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);
if (!sizing_pass) {ib_refs = amdgpu_devcoredump_collect_ib_refs(adev, coredump);if (ib_refs) {r = amdgpu_devcoredump_lock_ib_refs(&exec, ib_refs,coredump->num_ibs);if (!r)ibs_locked = true;}}for (int i = 0; i < coredump->num_ibs; i++) {struct amdgpu_bo *abo = ibs_locked ? ib_refs[i].bo : NULL;u64 offset = ibs_locked ? ib_refs[i].offset : 0;bool emit_content = sizing_pass;
for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
/* vm=NULL can only happen when 'sizing_pass' is true. Skip to the* drm_printf() calls (ib_content doesn't need to be initialized* as its content won't be written anywhere).*/if (!vm)
if (!abo) goto output_ib_content;
va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK;mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE);if (!mapping)goto free_ib_content;offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE);abo = amdgpu_bo_ref(mapping->bo_va->base.bo);r = amdgpu_bo_reserve(abo, false);if (r)goto free_ib_content;if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0;if (abo->tbo.resource->mem_type != TTM_PL_VRAM)
goto unreserve_abo;
goto output_ib_content;amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -391,12 +499,13 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf off += cursor.size; amdgpu_res_next(&cursor, cursor.size); }
emit_content = true; } else { r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); if (r)
goto unreserve_abo;
goto output_ib_content;kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -404,23 +513,23 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf coredump->ibs[i].ib_size_dw * 4); amdgpu_bo_kunmap(abo);
emit_content = true; }output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw);
for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)drm_printf(&p, "0x%08x\n", ib_content[j]);-unreserve_abo:
if (vm)amdgpu_bo_unreserve(abo);-free_ib_content:
if (emit_content) {for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)drm_printf(&p, "0x%08x\n", ib_content[j]); }} kvfree(ib_content);
if (vm) {amdgpu_bo_unreserve(root);amdgpu_bo_unref(&root);}
if (ibs_locked)drm_exec_fini(&exec); }amdgpu_devcoredump_release_ib_refs(ib_refs, coredump->num_ibs);return count - iter.remain;
On Wed, May 20, 2026 at 12:08 PM Christian König christian.koenig@amd.com wrote:
That whole infrastructure is superflous. You just need to modify amdgpu_vm_lock_by_pasid() to take a drm_exec object to lock the root BO.
Christian, modifying amdgpu_vm_lock_by_pasid() to take a drm_exec turns out to also require converting its other caller, amdgpu_vm_handle_fault(), to drm_exec — most of the diff is that conversion, not the helper itself.
I can: (a) convert both in a 2-patch series (handle_fault becomes drm_exec_init + drm_exec_until_all_locked + drm_exec_fini, ~30 lines), or (b) keep the loop inside amdgpu_vm_lock_by_pasid() so handle_fault stays a one-liner — but then the devcoredump caller can't add the IB BOs to the same ticket, which is the whole point.
(a) seems unavoidable if we want one helper. Is that what you had in mind, or did you intend something lighter — e.g. a separate amdgpu_vm_lock_by_pasid_exec() leaving handle_fault untouched?
On 5/20/26 10:07, Mikhail Gavrilov wrote:
On Wed, May 20, 2026 at 12:08 PM Christian König christian.koenig@amd.com wrote:
That whole infrastructure is superflous. You just need to modify amdgpu_vm_lock_by_pasid() to take a drm_exec object to lock the root BO.
Christian, modifying amdgpu_vm_lock_by_pasid() to take a drm_exec turns out to also require converting its other caller, amdgpu_vm_handle_fault(), to drm_exec — most of the diff is that conversion, not the helper itself.
I can: (a) convert both in a 2-patch series (handle_fault becomes drm_exec_init + drm_exec_until_all_locked + drm_exec_fini, ~30 lines), or (b) keep the loop inside amdgpu_vm_lock_by_pasid() so handle_fault stays a one-liner — but then the devcoredump caller can't add the IB BOs to the same ticket, which is the whole point.
(a) seems unavoidable if we want one helper. Is that what you had in mind, or did you intend something lighter — e.g. a separate amdgpu_vm_lock_by_pasid_exec() leaving handle_fault untouched?
Just make that two patches, first switching over amdgpu_vm_lock_by_pasid() to using drm_exec() on both use cases.
And then changing the one for device core dumping to lock all BOs at once.
Thanks, Christian.
This series fixes a lockdep "possible recursive locking" splat in amdgpu_devcoredump_format() that fires on every GPU timeout once a job with a PASID context is involved. With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s, so the splat repeats until it drowns the kernel ring buffer. It is also a real self-deadlock for IB BOs that share their dma_resv with the root PD (the always-valid case).
The root cause: amdgpu_devcoredump_format() holds the VM root PD's reservation and then reserves each IB BO on top of it, nesting two reservation_ww_class_mutex acquires without a ww_acquire_ctx.
v1 fixed this with a snapshot helper that collected BO references under the root reservation and reserved them one by one afterwards. Christian pointed out that drm_exec already solves exactly this — lock everything in one ww ticket — and suggested teaching amdgpu_vm_lock_by_pasid() to take a drm_exec context. This v3 follows that approach.
Because amdgpu_vm_lock_by_pasid() has a second caller in the page-fault path, the series is split so each patch builds and works on its own:
1/2 Convert amdgpu_vm_lock_by_pasid() to take a drm_exec context and lock the root PD via amdgpu_vm_lock_pd(). Updates the existing caller, amdgpu_vm_handle_fault(). Pure refactor, no functional change to the page-fault path.
2/2 Use the new signature in amdgpu_devcoredump_format(): lock the root PD and every IB BO together in one drm_exec ticket. The per-IB amdgpu_bo_reserve() nesting is gone, along with a BO refcount leak on the old reserve-failure path. This is the actual bug fix and carries the Fixes: tag.
Tested on Linux 7.1-rc4 + this series, Radeon RX 7900 XTX (gfx1100), KASAN + PROVE_LOCKING enabled, using a small libdrm_amdgpu reproducer that submits a GFX IB chained at GPU VA 0 and waits for the hang. Before the series the splat fires on every TDR; after it the dmesg is clean across repeated timeouts and the devcoredump output is unchanged.
v1: https://lore.kernel.org/amd-gfx/20260429143743.50743-1-mikhail.v.gavrilov@gm... v2: https://lore.kernel.org/amd-gfx/20260519161541.19994-1-mikhail.v.gavrilov@gm...
Changes since v2: - Reworked along the lines Christian suggested: instead of a private snapshot helper and a separate drm_exec pass, amdgpu_vm_lock_by_pasid() now takes a drm_exec context directly (patch 1), and the devcoredump code locks the root PD and all IB BOs in a single ticket (patch 2). - Dropped the amdgpu_devcoredump_ib_ref struct and the three collect/lock/release helpers from v2 entirely.
Changes since v1: - Switched from per-IB amdgpu_bo_reserve() to drm_exec. - Dropped the Cc: stable tag: the regression only landed in 7.1-rc1, so the fix reaches 7.1 via drm-fixes without a stable backport.
Mikhail Gavrilov (2): drm/amdgpu: convert amdgpu_vm_lock_by_pasid() to drm_exec drm/amdgpu: fix recursive ww_mutex acquire in amdgpu_devcoredump_format
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 103 ++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 72 ++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 +- 3 files changed, 122 insertions(+), 56 deletions(-)
amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to the caller. A caller that then needs to reserve further BOs (for example the devcoredump IB dump) ends up nesting reservation_ww_class_mutex acquires without a ww_acquire_ctx, which lockdep flags as recursive locking.
Convert the helper to take a drm_exec context and lock the root PD via amdgpu_vm_lock_pd() instead. Callers now run it inside a drm_exec_until_all_locked() loop and can lock additional BOs in the same ww ticket, so there is no nested ww_mutex acquire.
The only existing caller, amdgpu_vm_handle_fault(), is updated accordingly. Its is_compute_context path, which previously dropped the root reservation around svm_range_restore_pages() and re-took it, now finalises the drm_exec context and re-initialises a fresh one; behaviour is otherwise unchanged.
No functional change intended for the page-fault path.
Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 72 ++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 +- 2 files changed, 51 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9ba9de16a27a..3a22670b733f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2950,14 +2950,22 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) }
/** - * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible. + * amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD * @adev: amdgpu device pointer - * @root: root BO of the VM + * @root: out: reference to the VM's root BO, dropped by the caller * @pasid: PASID of the VM - * The caller needs to unreserve and unref the root bo on success. + * @exec: drm_exec context to lock the root PD in + * + * Must be called from within a drm_exec_until_all_locked() loop; the caller + * runs drm_exec_retry_on_contention() afterwards and drops the *root + * reference once the drm_exec context is finalised. + * + * Return: the VM on success, or NULL if the PASID has no VM, the VM is being + * torn down, or locking the root PD failed. */ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid) + struct amdgpu_bo **root, u32 pasid, + struct drm_exec *exec) { unsigned long irqflags; struct amdgpu_vm *vm; @@ -2971,9 +2979,11 @@ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, if (!*root) return NULL;
- r = amdgpu_bo_reserve(*root, true); - if (r) - goto error_unref; + r = amdgpu_vm_lock_pd(vm, exec, 0); + if (r) { + amdgpu_bo_unref(root); + return NULL; + }
/* Double check that the VM still exists */ xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); @@ -2981,16 +2991,12 @@ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, if (vm && vm->root.bo != *root) vm = NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); - if (!vm) - goto error_unlock; + if (!vm) { + amdgpu_bo_unref(root); + return NULL; + }
return vm; -error_unlock: - amdgpu_bo_unreserve(*root); - -error_unref: - amdgpu_bo_unref(root); - return NULL; }
/** @@ -3013,20 +3019,32 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, { bool is_compute_context = false; struct amdgpu_bo *root; + struct drm_exec exec; uint64_t value, flags; struct amdgpu_vm *vm; int r;
- vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + }
is_compute_context = vm->is_compute_context;
if (is_compute_context) { - /* Unreserve root since svm_range_restore_pages might try to reserve it. */ - /* TODO: rework svm_range_restore_pages so that this isn't necessary. */ - amdgpu_bo_unreserve(root); + /* Release the root PD lock since svm_range_restore_pages + * might try to take it. + * TODO: rework svm_range_restore_pages so that this isn't + * necessary. + */ + drm_exec_fini(&exec);
if (!svm_range_restore_pages(adev, pasid, vmid, node_id, addr >> PAGE_SHIFT, ts, write_fault)) { @@ -3036,9 +3054,17 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, amdgpu_bo_unref(&root);
/* Re-acquire the VM lock, could be that the VM was freed in between. */ - vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + } }
addr /= AMDGPU_GPU_PAGE_SIZE; @@ -3076,7 +3102,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true);
error_unlock: - amdgpu_bo_unreserve(root); + drm_exec_fini(&exec); if (r < 0) dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d083d7aab75c..af292c2fc521 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -593,7 +593,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool write_fault);
struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid); + struct amdgpu_bo **root, u32 pasid, + struct drm_exec *exec);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
On 5/20/26 17:17, Mikhail Gavrilov wrote:
amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to the caller. A caller that then needs to reserve further BOs (for example the devcoredump IB dump) ends up nesting reservation_ww_class_mutex acquires without a ww_acquire_ctx, which lockdep flags as recursive locking.
Convert the helper to take a drm_exec context and lock the root PD via amdgpu_vm_lock_pd() instead. Callers now run it inside a drm_exec_until_all_locked() loop and can lock additional BOs in the same ww ticket, so there is no nested ww_mutex acquire.
The only existing caller, amdgpu_vm_handle_fault(), is updated accordingly. Its is_compute_context path, which previously dropped the root reservation around svm_range_restore_pages() and re-took it, now finalises the drm_exec context and re-initialises a fresh one; behaviour is otherwise unchanged.
No functional change intended for the page-fault path.
Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 72 ++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 +- 2 files changed, 51 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9ba9de16a27a..3a22670b733f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2950,14 +2950,22 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) } /**
- amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible.
- amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD
- @adev: amdgpu device pointer
- @root: root BO of the VM
- @root: out: reference to the VM's root BO, dropped by the caller
- @pasid: PASID of the VM
- The caller needs to unreserve and unref the root bo on success.
- @exec: drm_exec context to lock the root PD in
- Must be called from within a drm_exec_until_all_locked() loop; the caller
- runs drm_exec_retry_on_contention() afterwards and drops the *root
- reference once the drm_exec context is finalised.
- Return: the VM on success, or NULL if the PASID has no VM, the VM is being
*/
- torn down, or locking the root PD failed.
struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev,
struct amdgpu_bo **root, u32 pasid)
struct amdgpu_bo **root, u32 pasid,
I think we can drop the root parameter now, the exec reference should be sufficient.
struct drm_exec *exec){ unsigned long irqflags; struct amdgpu_vm *vm; @@ -2971,9 +2979,11 @@ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, if (!*root) return NULL;
- r = amdgpu_bo_reserve(*root, true);
- if (r)
goto error_unref;
- r = amdgpu_vm_lock_pd(vm, exec, 0);
amdgpu_vm_lock_pd() can't be used here since we can't gurantee that the VM pointer wouldn't go away.
Just do:
r = drm_exec_lock_obj(exec, root->tbo.base);
- if (r) {
amdgpu_bo_unref(root);return NULL;- }
/* Double check that the VM still exists */ xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); @@ -2981,16 +2991,12 @@ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, if (vm && vm->root.bo != *root) vm = NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
- if (!vm)
goto error_unlock;
- if (!vm) {
We should cleanup with drm_exec_unlock_obj() here, same as it was before.
amdgpu_bo_unref(root);return NULL;- }
return vm;
We can drop the extra reference on the root BO before returning the VM now since the drm_exec object holds one as well.
Apart from that this looks like a really nice cleanup to me.
Thanks, Christian.
-error_unlock:
- amdgpu_bo_unreserve(*root);
-error_unref:
- amdgpu_bo_unref(root);
- return NULL;
} /** @@ -3013,20 +3019,32 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, { bool is_compute_context = false; struct amdgpu_bo *root;
- struct drm_exec exec; uint64_t value, flags; struct amdgpu_vm *vm; int r;
- vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid);
- if (!vm)
- drm_exec_init(&exec, 0, 0);
- drm_exec_until_all_locked(&exec) {
vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid, &exec);drm_exec_retry_on_contention(&exec);if (!vm)break;- }
- if (!vm) {
return false;drm_exec_fini(&exec);- }
is_compute_context = vm->is_compute_context; if (is_compute_context) {
/* Unreserve root since svm_range_restore_pages might try to reserve it. *//* TODO: rework svm_range_restore_pages so that this isn't necessary. */amdgpu_bo_unreserve(root);
/* Release the root PD lock since svm_range_restore_pages* might try to take it.* TODO: rework svm_range_restore_pages so that this isn't* necessary.*/drm_exec_fini(&exec);if (!svm_range_restore_pages(adev, pasid, vmid, node_id, addr >> PAGE_SHIFT, ts, write_fault)) { @@ -3036,9 +3054,17 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, amdgpu_bo_unref(&root); /* Re-acquire the VM lock, could be that the VM was freed in between. */
vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid);if (!vm)
drm_exec_init(&exec, 0, 0);drm_exec_until_all_locked(&exec) {vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid, &exec);drm_exec_retry_on_contention(&exec);if (!vm)break;}if (!vm) {drm_exec_fini(&exec); return false; }}addr /= AMDGPU_GPU_PAGE_SIZE; @@ -3076,7 +3102,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true); error_unlock:
- amdgpu_bo_unreserve(root);
- drm_exec_fini(&exec); if (r < 0) dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d083d7aab75c..af292c2fc521 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -593,7 +593,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool write_fault); struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev,
struct amdgpu_bo **root, u32 pasid);
struct amdgpu_bo **root, u32 pasid,struct drm_exec *exec);void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquired the VM root PD's reservation via amdgpu_vm_lock_by_pasid() and then, for each IB, called amdgpu_bo_reserve() on the BO backing the IB. Both reservations are reservation_ww_class_mutex objects and neither used a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected -------------------------------------------- kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and each invocation produces this splat, drowning the kernel ring buffer.
Now that amdgpu_vm_lock_by_pasid() takes a drm_exec context, lock the root PD and every IB BO together in a single drm_exec ticket. DRM_EXEC_IGNORE_DUPLICATES handles IB BOs that share a dma_resv (e.g. always-valid BOs, or two IBs backed by the same BO). Every lock is now a top-level acquire under one ww_acquire_ctx, so the recursive ww_mutex condition is gone, and the per-IB amdgpu_bo_reserve()/amdgpu_bo_unref() dance -- including a BO refcount leak on the amdgpu_bo_reserve() failure path -- is removed.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation; with this change applied the splat is gone.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 103 ++++++++++++------ 1 file changed, 71 insertions(+), 32 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..a9d8e03fad83 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@
#include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h"
@@ -214,13 +215,9 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; - struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor; - struct amdgpu_bo *abo, *root; - uint64_t va_start, offset; struct amdgpu_ring *ring; - struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r; @@ -343,43 +340,84 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->num_ibs) { - /* Don't try to lookup the VM or map the BOs when calculating the - * size required to store the devcoredump. + struct amdgpu_bo_va_mapping *mapping; + struct amdgpu_bo *root, *abo; + struct drm_exec exec; + struct amdgpu_vm *vm; + u64 va_start, offset; + bool locked = false; + + /* + * Lock the VM root PD and every IB BO together in a single + * drm_exec ticket. Reserving the IB BOs one by one while the + * root PD is held would be a recursive reservation_ww_class_mutex + * acquire without a ww_acquire_ctx, which trips lockdep and + * self-deadlocks for IB BOs that share their dma_resv with the + * root PD (always-valid BOs). + * + * Skip locking entirely on the sizing pass: it does not write + * IB content, so the size estimate doesn't depend on whether + * the BOs are reachable. */ - if (sizing_pass) - vm = NULL; - else - vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!sizing_pass) { + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, + 1 + coredump->num_ibs); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, &root, + coredump->pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + + for (int i = 0; i < coredump->num_ibs; i++) { + u64 pfn; + + va_start = coredump->ibs[i].gpu_addr & + AMDGPU_GMC_HOLE_MASK; + pfn = va_start / AMDGPU_GPU_PAGE_SIZE; + mapping = amdgpu_vm_bo_lookup_mapping(vm, pfn); + if (!mapping) + continue; + + abo = mapping->bo_va->base.bo; + r = drm_exec_lock_obj(&exec, &abo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (r) + break; + } + if (r) + break; + } + if (vm && !r) + locked = true; + else + drm_exec_fini(&exec); + } + + for (int i = 0; i < coredump->num_ibs; i++) { + bool emit_content = sizing_pass;
- for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
- /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the - * drm_printf() calls (ib_content doesn't need to be initialized - * as its content won't be written anywhere). - */ - if (!vm) + if (!locked) goto output_ib_content;
va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); if (!mapping) - goto free_ib_content; + goto output_ib_content;
- offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); - abo = amdgpu_bo_ref(mapping->bo_va->base.bo); - r = amdgpu_bo_reserve(abo, false); - if (r) - goto free_ib_content; + abo = mapping->bo_va->base.bo; + offset = va_start - mapping->start * AMDGPU_GPU_PAGE_SIZE;
if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0;
if (abo->tbo.resource->mem_type != TTM_PL_VRAM) - goto unreserve_abo; + goto output_ib_content;
amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -391,12 +429,13 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf off += cursor.size; amdgpu_res_next(&cursor, cursor.size); } + emit_content = true; } else { r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); if (r) - goto unreserve_abo; + goto output_ib_content;
kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -404,21 +443,21 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf coredump->ibs[i].ib_size_dw * 4);
amdgpu_bo_kunmap(abo); + emit_content = true; }
output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); - for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) - drm_printf(&p, "0x%08x\n", ib_content[j]); -unreserve_abo: - if (vm) - amdgpu_bo_unreserve(abo); -free_ib_content: + if (emit_content) { + for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) + drm_printf(&p, "0x%08x\n", ib_content[j]); + } kvfree(ib_content); } - if (vm) { - amdgpu_bo_unreserve(root); + + if (locked) { + drm_exec_fini(&exec); amdgpu_bo_unref(&root); } }
Hi Mikhail,
kernel test robot noticed the following build warnings:
[auto build test WARNING on drm-misc/drm-misc-next] [also build test WARNING on drm/drm-next drm-i915/for-linux-next drm-i915/for-linux-next-fixes drm-tip/drm-tip next-20260520] [cannot apply to linus/master v6.16-rc1] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Mikhail-Gavrilov/drm-amdgpu-c... base: https://gitlab.freedesktop.org/drm/misc/kernel.git drm-misc-next patch link: https://lore.kernel.org/r/20260520151741.50575-3-mikhail.v.gavrilov%40gmail.... patch subject: [PATCH v3 2/2] drm/amdgpu: fix recursive ww_mutex acquire in amdgpu_devcoredump_format compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261) docutils: docutils (Docutils 0.21.2, Python 3.13.5, on linux) reproduce: (https://download.01.org/0day-ci/archive/20260521/202605211112.xlUYDZeM-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot lkp@intel.com | Closes: https://lore.kernel.org/oe-kbuild-all/202605211112.xlUYDZeM-lkp@intel.com/
All warnings (new ones prefixed by >>):
AMD plane color pipeline ------------------------ [docutils]
Documentation/gpu/amdgpu/driver-core:225: ./drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:2958: WARNING: Inline emphasis start-string without end-string. [docutils]
Documentation/gpu/driver-uapi:31: ./include/uapi/drm/xe_drm.h:2538: ERROR: Unexpected section title.
-- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
This series fixes a lockdep "possible recursive locking" splat in amdgpu_devcoredump_format() that fires on every GPU timeout once a job with a PASID context is involved. With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s, so the splat repeats until it drowns the kernel ring buffer. It is also a real self-deadlock for IB BOs that share their dma_resv with the root PD (the always-valid case).
The root cause: amdgpu_devcoredump_format() holds the VM root PD's reservation and then reserves each IB BO on top of it, nesting two reservation_ww_class_mutex acquires without a ww_acquire_ctx.
The fix teaches amdgpu_vm_lock_by_pasid() to lock the root PD in a drm_exec context, so the devcoredump path can lock the root PD and all the IB BOs together in one ww ticket. Because amdgpu_vm_lock_by_pasid() has a second caller in the page-fault path, the series is split so each patch builds and works on its own:
1/2 Convert amdgpu_vm_lock_by_pasid() to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). The drm_exec context holds the root BO reference, so the root output parameter is dropped. Updates the existing caller, amdgpu_vm_handle_fault(). Pure refactor, no functional change to the page-fault path.
2/2 Use the new signature in amdgpu_devcoredump_format(): lock the root PD and every IB BO together in one drm_exec ticket. The per-IB amdgpu_bo_reserve() nesting is gone, along with a BO refcount leak on the old reserve-failure path. This is the actual bug fix and carries the Fixes: tag.
Tested on Linux 7.1-rc4 + this series, Radeon RX 7900 XTX (gfx1100), KASAN + PROVE_LOCKING enabled, using a small libdrm_amdgpu reproducer that submits a GFX IB chained at GPU VA 0 and waits for the hang. Before the series the splat fires on every TDR; after it the dmesg is clean across repeated timeouts and the devcoredump output is unchanged.
v1: https://lore.kernel.org/amd-gfx/20260429143743.50743-1-mikhail.v.gavrilov@gm... v2: https://lore.kernel.org/amd-gfx/20260519161541.19994-1-mikhail.v.gavrilov@gm... v3: https://lore.kernel.org/amd-gfx/20260520151741.50575-1-mikhail.v.gavrilov@gm...
Changes since v3: - Lock the root PD with drm_exec_lock_obj() instead of amdgpu_vm_lock_pd(): the latter dereferences the VM pointer, which is not yet re-validated at that point (Christian). - Drop the root output parameter of amdgpu_vm_lock_by_pasid() entirely; the drm_exec context already holds a reference on the locked root BO, so the extra reference and the parameter are unnecessary (Christian). - Unlock the root BO with drm_exec_unlock_obj() on the VM-recheck-failed path (Christian). - amdgpu_vm_handle_fault() and amdgpu_devcoredump_format() updated for the simplified signature; both lose their root variable. - Drops the v3 kernel-doc "*root" reference, which also resolves the docutils "Inline emphasis start-string without end-string" warning the kernel test robot reported against v3.
Changes since v2: - Reworked along the lines Christian suggested: amdgpu_vm_lock_by_pasid() takes a drm_exec context directly (patch 1), and the devcoredump code locks the root PD and all IB BOs in a single ticket (patch 2). The amdgpu_devcoredump_ib_ref struct and the three collect/lock/release helpers from v2 are gone.
Changes since v1: - Switched from per-IB amdgpu_bo_reserve() to drm_exec. - Dropped the Cc: stable tag: the regression only landed in 7.1-rc1, so the fix reaches 7.1 via drm-fixes without a stable backport.
Mikhail Gavrilov (2): drm/amdgpu: convert amdgpu_vm_lock_by_pasid() to drm_exec drm/amdgpu: fix recursive ww_mutex acquire in amdgpu_devcoredump_format
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 105 ++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 +++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 3 files changed, 129 insertions(+), 69 deletions(-)
amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to the caller. A caller that then needs to reserve further BOs (for example the devcoredump IB dump) ends up nesting reservation_ww_class_mutex acquires without a ww_acquire_ctx, which lockdep flags as recursive locking.
Convert the helper to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). Callers now run it inside a drm_exec_until_all_locked() loop and can lock additional BOs in the same ww ticket, so there is no nested ww_mutex acquire.
The drm_exec context holds its own reference on the locked root BO, so the helper no longer hands a root reference back to the caller: the root output parameter is dropped, and the transient reference taken across the PASID lookup is released before returning.
The only existing caller, amdgpu_vm_handle_fault(), is updated accordingly. Its is_compute_context path, which previously dropped the root reservation around svm_range_restore_pages() and re-took it, now finalises the drm_exec context and re-initialises a fresh one; behaviour is otherwise unchanged.
No functional change intended for the page-fault path.
Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 ++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 2 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9ba9de16a27a..591980907211 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2950,47 +2950,56 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) }
/** - * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible. + * amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD * @adev: amdgpu device pointer - * @root: root BO of the VM * @pasid: PASID of the VM - * The caller needs to unreserve and unref the root bo on success. + * @exec: drm_exec context to lock the root PD in + * + * Must be called from within a drm_exec_until_all_locked() loop; the caller + * runs drm_exec_retry_on_contention() afterwards. The drm_exec context holds + * a reference on the root BO until it is finalised. + * + * Return: the VM on success, or NULL if the PASID has no VM, the VM is being + * torn down, or locking the root PD failed. */ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid) + u32 pasid, struct drm_exec *exec) { unsigned long irqflags; + struct amdgpu_bo *root; struct amdgpu_vm *vm; int r;
xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid); - *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; + root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
- if (!*root) + if (!root) return NULL;
- r = amdgpu_bo_reserve(*root, true); - if (r) - goto error_unref; + r = drm_exec_lock_obj(exec, &root->tbo.base); + if (r) { + amdgpu_bo_unref(&root); + return NULL; + }
/* Double check that the VM still exists */ xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid); - if (vm && vm->root.bo != *root) + if (vm && vm->root.bo != root) vm = NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); - if (!vm) - goto error_unlock; + if (!vm) { + drm_exec_unlock_obj(exec, &root->tbo.base); + amdgpu_bo_unref(&root); + return NULL; + }
- return vm; -error_unlock: - amdgpu_bo_unreserve(*root); + /* The drm_exec context holds its own reference on the root BO. */ + amdgpu_bo_unref(&root);
-error_unref: - amdgpu_bo_unref(root); - return NULL; + return vm; }
/** @@ -3012,33 +3021,49 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, uint64_t ts, bool write_fault) { bool is_compute_context = false; - struct amdgpu_bo *root; + struct drm_exec exec; uint64_t value, flags; struct amdgpu_vm *vm; int r;
- vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + }
is_compute_context = vm->is_compute_context;
if (is_compute_context) { - /* Unreserve root since svm_range_restore_pages might try to reserve it. */ - /* TODO: rework svm_range_restore_pages so that this isn't necessary. */ - amdgpu_bo_unreserve(root); + /* Release the root PD lock since svm_range_restore_pages + * might try to take it. + * TODO: rework svm_range_restore_pages so that this isn't + * necessary. + */ + drm_exec_fini(&exec);
if (!svm_range_restore_pages(adev, pasid, vmid, - node_id, addr >> PAGE_SHIFT, ts, write_fault)) { - amdgpu_bo_unref(&root); + node_id, addr >> PAGE_SHIFT, ts, write_fault)) return true; - } - amdgpu_bo_unref(&root);
/* Re-acquire the VM lock, could be that the VM was freed in between. */ - vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + } }
addr /= AMDGPU_GPU_PAGE_SIZE; @@ -3062,7 +3087,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, value = 0; }
- r = dma_resv_reserve_fences(root->tbo.base.resv, 1); + r = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1); if (r) { pr_debug("failed %d to reserve fence slot\n", r); goto error_unlock; @@ -3076,12 +3101,10 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true);
error_unlock: - amdgpu_bo_unreserve(root); + drm_exec_fini(&exec); if (r < 0) dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
- amdgpu_bo_unref(&root); - return false; }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d083d7aab75c..0c6e3e0368c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -593,7 +593,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool write_fault);
struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid); + u32 pasid, struct drm_exec *exec);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
On 5/21/26 12:43, Mikhail Gavrilov wrote:
amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to the caller. A caller that then needs to reserve further BOs (for example the devcoredump IB dump) ends up nesting reservation_ww_class_mutex acquires without a ww_acquire_ctx, which lockdep flags as recursive locking.
Convert the helper to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). Callers now run it inside a drm_exec_until_all_locked() loop and can lock additional BOs in the same ww ticket, so there is no nested ww_mutex acquire.
The drm_exec context holds its own reference on the locked root BO, so the helper no longer hands a root reference back to the caller: the root output parameter is dropped, and the transient reference taken across the PASID lookup is released before returning.
The only existing caller, amdgpu_vm_handle_fault(), is updated accordingly. Its is_compute_context path, which previously dropped the root reservation around svm_range_restore_pages() and re-took it, now finalises the drm_exec context and re-initialises a fresh one; behaviour is otherwise unchanged.
No functional change intended for the page-fault path.
Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 ++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 2 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9ba9de16a27a..591980907211 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2950,47 +2950,56 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) } /**
- amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible.
- amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD
- @adev: amdgpu device pointer
- @root: root BO of the VM
- @pasid: PASID of the VM
- The caller needs to unreserve and unref the root bo on success.
- @exec: drm_exec context to lock the root PD in
- Must be called from within a drm_exec_until_all_locked() loop; the caller
- runs drm_exec_retry_on_contention() afterwards. The drm_exec context holds
- a reference on the root BO until it is finalised.
- Return: the VM on success, or NULL if the PASID has no VM, the VM is being
*/
- torn down, or locking the root PD failed.
struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev,
struct amdgpu_bo **root, u32 pasid)
u32 pasid, struct drm_exec *exec){ unsigned long irqflags;
- struct amdgpu_bo *root; struct amdgpu_vm *vm; int r;
xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid);
- *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL;
- root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
- if (!*root)
- if (!root) return NULL;
- r = amdgpu_bo_reserve(*root, true);
- if (r)
goto error_unref;
- r = drm_exec_lock_obj(exec, &root->tbo.base);
- if (r) {
amdgpu_bo_unref(&root);return NULL;- }
/* Double check that the VM still exists */ xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid);
- if (vm && vm->root.bo != *root)
- if (vm && vm->root.bo != root) vm = NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
- if (!vm)
goto error_unlock;
- if (!vm) {
drm_exec_unlock_obj(exec, &root->tbo.base);amdgpu_bo_unref(&root);return NULL;- }
- return vm;
-error_unlock:
- amdgpu_bo_unreserve(*root);
- /* The drm_exec context holds its own reference on the root BO. */
- amdgpu_bo_unref(&root);
-error_unref:
- amdgpu_bo_unref(root);
- return NULL;
- return vm;
} /** @@ -3012,33 +3021,49 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, uint64_t ts, bool write_fault) { bool is_compute_context = false;
- struct amdgpu_bo *root;
- struct drm_exec exec; uint64_t value, flags; struct amdgpu_vm *vm; int r;
- vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid);
- if (!vm)
- drm_exec_init(&exec, 0, 0);
Make the last parameter 1 here since we are expecting to lock 1 object.
Not a must have, it will work without but it is just a little bit more optimal.
Apart from that Reviewed-by: Christian König christian.koenig@amd.com.
Thanks, Christian.
- drm_exec_until_all_locked(&exec) {
vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec);drm_exec_retry_on_contention(&exec);if (!vm)break;- }
- if (!vm) {
return false;drm_exec_fini(&exec);- }
is_compute_context = vm->is_compute_context; if (is_compute_context) {
/* Unreserve root since svm_range_restore_pages might try to reserve it. *//* TODO: rework svm_range_restore_pages so that this isn't necessary. */amdgpu_bo_unreserve(root);
/* Release the root PD lock since svm_range_restore_pages* might try to take it.* TODO: rework svm_range_restore_pages so that this isn't* necessary.*/drm_exec_fini(&exec);if (!svm_range_restore_pages(adev, pasid, vmid,
node_id, addr >> PAGE_SHIFT, ts, write_fault)) {amdgpu_bo_unref(&root);
node_id, addr >> PAGE_SHIFT, ts, write_fault)) return true;
}amdgpu_bo_unref(&root);/* Re-acquire the VM lock, could be that the VM was freed in between. */
vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid);if (!vm)
drm_exec_init(&exec, 0, 0);drm_exec_until_all_locked(&exec) {vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec);drm_exec_retry_on_contention(&exec);if (!vm)break;}if (!vm) {drm_exec_fini(&exec); return false; }}addr /= AMDGPU_GPU_PAGE_SIZE; @@ -3062,7 +3087,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, value = 0; }
- r = dma_resv_reserve_fences(root->tbo.base.resv, 1);
- r = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1); if (r) { pr_debug("failed %d to reserve fence slot\n", r); goto error_unlock;
@@ -3076,12 +3101,10 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true); error_unlock:
- amdgpu_bo_unreserve(root);
- drm_exec_fini(&exec); if (r < 0) dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
- amdgpu_bo_unref(&root);
- return false;
} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d083d7aab75c..0c6e3e0368c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -593,7 +593,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool write_fault); struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev,
struct amdgpu_bo **root, u32 pasid);
u32 pasid, struct drm_exec *exec);void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquired the VM root PD's reservation via amdgpu_vm_lock_by_pasid() and then, for each IB, called amdgpu_bo_reserve() on the BO backing the IB. Both reservations are reservation_ww_class_mutex objects and neither used a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected -------------------------------------------- kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and each invocation produces this splat, drowning the kernel ring buffer.
Now that amdgpu_vm_lock_by_pasid() takes a drm_exec context, lock the root PD and every IB BO together in a single drm_exec ticket. DRM_EXEC_IGNORE_DUPLICATES handles IB BOs that share a dma_resv (e.g. always-valid BOs, or two IBs backed by the same BO). Every lock is now a top-level acquire under one ww_acquire_ctx, so the recursive ww_mutex condition is gone, and the per-IB amdgpu_bo_reserve()/amdgpu_bo_unref() dance -- including a BO refcount leak on the amdgpu_bo_reserve() failure path -- is removed.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation; with this change applied the splat is gone.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 105 ++++++++++++------ 1 file changed, 71 insertions(+), 34 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..456ea9911d48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@
#include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h"
@@ -214,13 +215,9 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; - struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor; - struct amdgpu_bo *abo, *root; - uint64_t va_start, offset; struct amdgpu_ring *ring; - struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r; @@ -343,43 +340,84 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->num_ibs) { - /* Don't try to lookup the VM or map the BOs when calculating the - * size required to store the devcoredump. + struct amdgpu_bo_va_mapping *mapping; + struct amdgpu_bo *abo; + struct drm_exec exec; + struct amdgpu_vm *vm; + u64 va_start, offset; + bool locked = false; + + /* + * Lock the VM root PD and every IB BO together in a single + * drm_exec ticket. Reserving the IB BOs one by one while the + * root PD is held would be a recursive reservation_ww_class_mutex + * acquire without a ww_acquire_ctx, which trips lockdep and + * self-deadlocks for IB BOs that share their dma_resv with the + * root PD (always-valid BOs). + * + * Skip locking entirely on the sizing pass: it does not write + * IB content, so the size estimate doesn't depend on whether + * the BOs are reachable. */ - if (sizing_pass) - vm = NULL; - else - vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!sizing_pass) { + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, + 1 + coredump->num_ibs); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, coredump->pasid, + &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + + for (int i = 0; i < coredump->num_ibs; i++) { + u64 pfn; + + va_start = coredump->ibs[i].gpu_addr & + AMDGPU_GMC_HOLE_MASK; + pfn = va_start / AMDGPU_GPU_PAGE_SIZE; + mapping = amdgpu_vm_bo_lookup_mapping(vm, pfn); + if (!mapping) + continue; + + abo = mapping->bo_va->base.bo; + r = drm_exec_lock_obj(&exec, &abo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (r) + break; + } + if (r) + break; + } + if (vm && !r) + locked = true; + else + drm_exec_fini(&exec); + } + + for (int i = 0; i < coredump->num_ibs; i++) { + bool emit_content = sizing_pass;
- for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
- /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the - * drm_printf() calls (ib_content doesn't need to be initialized - * as its content won't be written anywhere). - */ - if (!vm) + if (!locked) goto output_ib_content;
va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); if (!mapping) - goto free_ib_content; + goto output_ib_content;
- offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); - abo = amdgpu_bo_ref(mapping->bo_va->base.bo); - r = amdgpu_bo_reserve(abo, false); - if (r) - goto free_ib_content; + abo = mapping->bo_va->base.bo; + offset = va_start - mapping->start * AMDGPU_GPU_PAGE_SIZE;
if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0;
if (abo->tbo.resource->mem_type != TTM_PL_VRAM) - goto unreserve_abo; + goto output_ib_content;
amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -391,12 +429,13 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf off += cursor.size; amdgpu_res_next(&cursor, cursor.size); } + emit_content = true; } else { r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); if (r) - goto unreserve_abo; + goto output_ib_content;
kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -404,23 +443,21 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf coredump->ibs[i].ib_size_dw * 4);
amdgpu_bo_kunmap(abo); + emit_content = true; }
output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); - for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) - drm_printf(&p, "0x%08x\n", ib_content[j]); -unreserve_abo: - if (vm) - amdgpu_bo_unreserve(abo); -free_ib_content: + if (emit_content) { + for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) + drm_printf(&p, "0x%08x\n", ib_content[j]); + } kvfree(ib_content); } - if (vm) { - amdgpu_bo_unreserve(root); - amdgpu_bo_unref(&root); - } + + if (locked) + drm_exec_fini(&exec); }
return count - iter.remain;
This series fixes a lockdep "possible recursive locking" splat in amdgpu_devcoredump_format() that fires on every GPU timeout once a job with a PASID context is involved. With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s, so the splat repeats until it drowns the kernel ring buffer. It is also a real self-deadlock for IB BOs that share their dma_resv with the root PD (the always-valid case).
The root cause: amdgpu_devcoredump_format() holds the VM root PD's reservation and then reserves each IB BO on top of it, nesting two reservation_ww_class_mutex acquires without a ww_acquire_ctx.
The fix teaches amdgpu_vm_lock_by_pasid() to lock the root PD in a drm_exec context, so the devcoredump path can lock the root PD and all the IB BOs together in one ww ticket. Because amdgpu_vm_lock_by_pasid() has a second caller in the page-fault path, the series is split so each patch builds and works on its own:
1/2 Convert amdgpu_vm_lock_by_pasid() to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). The drm_exec context holds the root BO reference, so the root output parameter is dropped. Updates the existing caller, amdgpu_vm_handle_fault(). Pure refactor, no functional change to the page-fault path.
2/2 Use the new signature in amdgpu_devcoredump_format(): lock the root PD and every IB BO together in one drm_exec ticket. The per-IB amdgpu_bo_reserve() nesting is gone, along with a BO refcount leak on the old reserve-failure path. This is the actual bug fix and carries the Fixes: tag.
Tested on Linux 7.1-rc4 + this series, Radeon RX 7900 XTX (gfx1100), KASAN + PROVE_LOCKING enabled, using a small libdrm_amdgpu reproducer that submits a GFX IB chained at GPU VA 0 and waits for the hang. Before the series the splat fires on every TDR; after it the dmesg is clean across repeated timeouts and the devcoredump IB dump is produced correctly.
v1: https://lore.kernel.org/amd-gfx/20260429143743.50743-1-mikhail.v.gavrilov@gm... v2: https://lore.kernel.org/amd-gfx/20260519161541.19994-1-mikhail.v.gavrilov@gm... v3: https://lore.kernel.org/amd-gfx/20260520151741.50575-1-mikhail.v.gavrilov@gm... v4: https://lore.kernel.org/amd-gfx/20260521104335.28978-1-mikhail.v.gavrilov@gm...
Changes since v4: - Pass nr=1 to drm_exec_init() in amdgpu_vm_handle_fault(), since exactly one object (the root PD) is locked there (Christian). - Picked up Christian's Reviewed-by on patch 1.
Changes since v3: - Lock the root PD with drm_exec_lock_obj() instead of amdgpu_vm_lock_pd(): the latter dereferences the VM pointer, which is not yet re-validated at that point (Christian). - Drop the root output parameter of amdgpu_vm_lock_by_pasid() entirely; the drm_exec context already holds a reference on the locked root BO, so the extra reference and the parameter are unnecessary (Christian). - Unlock the root BO with drm_exec_unlock_obj() on the VM-recheck-failed path (Christian). - amdgpu_vm_handle_fault() and amdgpu_devcoredump_format() updated for the simplified signature; both lose their root variable. - Drops the v3 kernel-doc "*root" reference, which also resolves the docutils "Inline emphasis start-string without end-string" warning the kernel test robot reported against v3.
Changes since v2: - Reworked along the lines Christian suggested: amdgpu_vm_lock_by_pasid() takes a drm_exec context directly (patch 1), and the devcoredump code locks the root PD and all IB BOs in a single ticket (patch 2). The amdgpu_devcoredump_ib_ref struct and the three collect/lock/release helpers from v2 are gone.
Changes since v1: - Switched from per-IB amdgpu_bo_reserve() to drm_exec. - Dropped the Cc: stable tag: the regression only landed in 7.1-rc1, so the fix reaches 7.1 via drm-fixes without a stable backport.
Mikhail Gavrilov (2): drm/amdgpu: convert amdgpu_vm_lock_by_pasid() to drm_exec drm/amdgpu: fix recursive ww_mutex acquire in amdgpu_devcoredump_format
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 105 ++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 +++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 3 files changed, 129 insertions(+), 69 deletions(-)
amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to the caller. A caller that then needs to reserve further BOs (for example the devcoredump IB dump) ends up nesting reservation_ww_class_mutex acquires without a ww_acquire_ctx, which lockdep flags as recursive locking.
Convert the helper to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). Callers now run it inside a drm_exec_until_all_locked() loop and can lock additional BOs in the same ww ticket, so there is no nested ww_mutex acquire.
The drm_exec context holds its own reference on the locked root BO, so the helper no longer hands a root reference back to the caller: the root output parameter is dropped, and the transient reference taken across the PASID lookup is released before returning.
The only existing caller, amdgpu_vm_handle_fault(), is updated accordingly. Its is_compute_context path, which previously dropped the root reservation around svm_range_restore_pages() and re-took it, now finalises the drm_exec context and re-initialises a fresh one; behaviour is otherwise unchanged.
No functional change intended for the page-fault path.
Reviewed-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 ++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 2 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9ba9de16a27a..d734d8c0de6e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2950,47 +2950,56 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) }
/** - * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible. + * amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD * @adev: amdgpu device pointer - * @root: root BO of the VM * @pasid: PASID of the VM - * The caller needs to unreserve and unref the root bo on success. + * @exec: drm_exec context to lock the root PD in + * + * Must be called from within a drm_exec_until_all_locked() loop; the caller + * runs drm_exec_retry_on_contention() afterwards. The drm_exec context holds + * a reference on the root BO until it is finalised. + * + * Return: the VM on success, or NULL if the PASID has no VM, the VM is being + * torn down, or locking the root PD failed. */ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid) + u32 pasid, struct drm_exec *exec) { unsigned long irqflags; + struct amdgpu_bo *root; struct amdgpu_vm *vm; int r;
xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid); - *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; + root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
- if (!*root) + if (!root) return NULL;
- r = amdgpu_bo_reserve(*root, true); - if (r) - goto error_unref; + r = drm_exec_lock_obj(exec, &root->tbo.base); + if (r) { + amdgpu_bo_unref(&root); + return NULL; + }
/* Double check that the VM still exists */ xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid); - if (vm && vm->root.bo != *root) + if (vm && vm->root.bo != root) vm = NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); - if (!vm) - goto error_unlock; + if (!vm) { + drm_exec_unlock_obj(exec, &root->tbo.base); + amdgpu_bo_unref(&root); + return NULL; + }
- return vm; -error_unlock: - amdgpu_bo_unreserve(*root); + /* The drm_exec context holds its own reference on the root BO. */ + amdgpu_bo_unref(&root);
-error_unref: - amdgpu_bo_unref(root); - return NULL; + return vm; }
/** @@ -3012,33 +3021,49 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, uint64_t ts, bool write_fault) { bool is_compute_context = false; - struct amdgpu_bo *root; + struct drm_exec exec; uint64_t value, flags; struct amdgpu_vm *vm; int r;
- vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 1); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + }
is_compute_context = vm->is_compute_context;
if (is_compute_context) { - /* Unreserve root since svm_range_restore_pages might try to reserve it. */ - /* TODO: rework svm_range_restore_pages so that this isn't necessary. */ - amdgpu_bo_unreserve(root); + /* Release the root PD lock since svm_range_restore_pages + * might try to take it. + * TODO: rework svm_range_restore_pages so that this isn't + * necessary. + */ + drm_exec_fini(&exec);
if (!svm_range_restore_pages(adev, pasid, vmid, - node_id, addr >> PAGE_SHIFT, ts, write_fault)) { - amdgpu_bo_unref(&root); + node_id, addr >> PAGE_SHIFT, ts, write_fault)) return true; - } - amdgpu_bo_unref(&root);
/* Re-acquire the VM lock, could be that the VM was freed in between. */ - vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 1); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + } }
addr /= AMDGPU_GPU_PAGE_SIZE; @@ -3062,7 +3087,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, value = 0; }
- r = dma_resv_reserve_fences(root->tbo.base.resv, 1); + r = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1); if (r) { pr_debug("failed %d to reserve fence slot\n", r); goto error_unlock; @@ -3076,12 +3101,10 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true);
error_unlock: - amdgpu_bo_unreserve(root); + drm_exec_fini(&exec); if (r < 0) dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
- amdgpu_bo_unref(&root); - return false; }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d083d7aab75c..0c6e3e0368c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -593,7 +593,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool write_fault);
struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid); + u32 pasid, struct drm_exec *exec);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquired the VM root PD's reservation via amdgpu_vm_lock_by_pasid() and then, for each IB, called amdgpu_bo_reserve() on the BO backing the IB. Both reservations are reservation_ww_class_mutex objects and neither used a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected -------------------------------------------- kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and each invocation produces this splat, drowning the kernel ring buffer.
Now that amdgpu_vm_lock_by_pasid() takes a drm_exec context, lock the root PD and every IB BO together in a single drm_exec ticket. DRM_EXEC_IGNORE_DUPLICATES handles IB BOs that share a dma_resv (e.g. always-valid BOs, or two IBs backed by the same BO). Every lock is now a top-level acquire under one ww_acquire_ctx, so the recursive ww_mutex condition is gone, and the per-IB amdgpu_bo_reserve()/amdgpu_bo_unref() dance -- including a BO refcount leak on the amdgpu_bo_reserve() failure path -- is removed.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation; with this change applied the splat is gone.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 105 ++++++++++++------ 1 file changed, 71 insertions(+), 34 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..456ea9911d48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@
#include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h"
@@ -214,13 +215,9 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; - struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor; - struct amdgpu_bo *abo, *root; - uint64_t va_start, offset; struct amdgpu_ring *ring; - struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r; @@ -343,43 +340,84 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->num_ibs) { - /* Don't try to lookup the VM or map the BOs when calculating the - * size required to store the devcoredump. + struct amdgpu_bo_va_mapping *mapping; + struct amdgpu_bo *abo; + struct drm_exec exec; + struct amdgpu_vm *vm; + u64 va_start, offset; + bool locked = false; + + /* + * Lock the VM root PD and every IB BO together in a single + * drm_exec ticket. Reserving the IB BOs one by one while the + * root PD is held would be a recursive reservation_ww_class_mutex + * acquire without a ww_acquire_ctx, which trips lockdep and + * self-deadlocks for IB BOs that share their dma_resv with the + * root PD (always-valid BOs). + * + * Skip locking entirely on the sizing pass: it does not write + * IB content, so the size estimate doesn't depend on whether + * the BOs are reachable. */ - if (sizing_pass) - vm = NULL; - else - vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); + if (!sizing_pass) { + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, + 1 + coredump->num_ibs); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, coredump->pasid, + &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + + for (int i = 0; i < coredump->num_ibs; i++) { + u64 pfn; + + va_start = coredump->ibs[i].gpu_addr & + AMDGPU_GMC_HOLE_MASK; + pfn = va_start / AMDGPU_GPU_PAGE_SIZE; + mapping = amdgpu_vm_bo_lookup_mapping(vm, pfn); + if (!mapping) + continue; + + abo = mapping->bo_va->base.bo; + r = drm_exec_lock_obj(&exec, &abo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (r) + break; + } + if (r) + break; + } + if (vm && !r) + locked = true; + else + drm_exec_fini(&exec); + } + + for (int i = 0; i < coredump->num_ibs; i++) { + bool emit_content = sizing_pass;
- for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
- /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the - * drm_printf() calls (ib_content doesn't need to be initialized - * as its content won't be written anywhere). - */ - if (!vm) + if (!locked) goto output_ib_content;
va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); if (!mapping) - goto free_ib_content; + goto output_ib_content;
- offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); - abo = amdgpu_bo_ref(mapping->bo_va->base.bo); - r = amdgpu_bo_reserve(abo, false); - if (r) - goto free_ib_content; + abo = mapping->bo_va->base.bo; + offset = va_start - mapping->start * AMDGPU_GPU_PAGE_SIZE;
if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0;
if (abo->tbo.resource->mem_type != TTM_PL_VRAM) - goto unreserve_abo; + goto output_ib_content;
amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -391,12 +429,13 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf off += cursor.size; amdgpu_res_next(&cursor, cursor.size); } + emit_content = true; } else { r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); if (r) - goto unreserve_abo; + goto output_ib_content;
kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -404,23 +443,21 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf coredump->ibs[i].ib_size_dw * 4);
amdgpu_bo_kunmap(abo); + emit_content = true; }
output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); - for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) - drm_printf(&p, "0x%08x\n", ib_content[j]); -unreserve_abo: - if (vm) - amdgpu_bo_unreserve(abo); -free_ib_content: + if (emit_content) { + for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) + drm_printf(&p, "0x%08x\n", ib_content[j]); + } kvfree(ib_content); } - if (vm) { - amdgpu_bo_unreserve(root); - amdgpu_bo_unref(&root); - } + + if (locked) + drm_exec_fini(&exec); }
return count - iter.remain;
On 5/21/26 17:08, Mikhail Gavrilov wrote:
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquired the VM root PD's reservation via amdgpu_vm_lock_by_pasid() and then, for each IB, called amdgpu_bo_reserve() on the BO backing the IB. Both reservations are reservation_ww_class_mutex objects and neither used a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected
kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever.
With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and each invocation produces this splat, drowning the kernel ring buffer.
Now that amdgpu_vm_lock_by_pasid() takes a drm_exec context, lock the root PD and every IB BO together in a single drm_exec ticket. DRM_EXEC_IGNORE_DUPLICATES handles IB BOs that share a dma_resv (e.g. always-valid BOs, or two IBs backed by the same BO). Every lock is now a top-level acquire under one ww_acquire_ctx, so the recursive ww_mutex condition is gone, and the per-IB amdgpu_bo_reserve()/amdgpu_bo_unref() dance -- including a BO refcount leak on the amdgpu_bo_reserve() failure path -- is removed.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence. The TDR fires within ~10 s and the deferred coredump worker produces the splat above on every invocation; with this change applied the splat is gone.
That commit message is a bit to long. It should describe the problem and the solution and not necessary how to reproduce it.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 105 ++++++++++++------ 1 file changed, 71 insertions(+), 34 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..456ea9911d48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@ #include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h" @@ -214,13 +215,9 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info;
- struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; struct amdgpu_res_cursor cursor;
- struct amdgpu_bo *abo, *root;
- uint64_t va_start, offset; struct amdgpu_ring *ring;
- struct amdgpu_vm *vm; u32 *ib_content; uint8_t *kptr; int ver, i, j, r;
@@ -343,43 +340,84 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf drm_printf(&p, "VRAM is lost due to GPU reset!\n"); if (coredump->num_ibs) {
/* Don't try to lookup the VM or map the BOs when calculating the* size required to store the devcoredump.
struct amdgpu_bo_va_mapping *mapping;struct amdgpu_bo *abo;struct drm_exec exec;struct amdgpu_vm *vm;u64 va_start, offset;
It's probably a good idea to put the IB dumping into a separate function.
bool locked = false;
Drop that variable and handling, it is superflous.
/** Lock the VM root PD and every IB BO together in a single* drm_exec ticket. Reserving the IB BOs one by one while the* root PD is held would be a recursive reservation_ww_class_mutex* acquire without a ww_acquire_ctx, which trips lockdep and* self-deadlocks for IB BOs that share their dma_resv with the* root PD (always-valid BOs).** Skip locking entirely on the sizing pass: it does not write* IB content, so the size estimate doesn't depend on whether */* the BOs are reachable.
if (sizing_pass)vm = NULL;elsevm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);
if (!sizing_pass) {drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES,1 + coredump->num_ibs);drm_exec_until_all_locked(&exec) {vm = amdgpu_vm_lock_by_pasid(adev, coredump->pasid,&exec);drm_exec_retry_on_contention(&exec);if (!vm)break;
This should use goto error handling, when we can't find the VM we should abort here.
for (int i = 0; i < coredump->num_ibs; i++) {u64 pfn;va_start = coredump->ibs[i].gpu_addr &AMDGPU_GMC_HOLE_MASK;pfn = va_start / AMDGPU_GPU_PAGE_SIZE;mapping = amdgpu_vm_bo_lookup_mapping(vm, pfn);if (!mapping)continue;
That's also an error, it could be that we just want to print the IB start address in that case.
abo = mapping->bo_va->base.bo;r = drm_exec_lock_obj(&exec, &abo->tbo.base);drm_exec_retry_on_contention(&exec);if (r)break;
Dito
}if (r)break;
And here as well.
}if (vm && !r)locked = true;elsedrm_exec_fini(&exec);
Don't call drm_exec_fini() here.
Regards, Christian.
}for (int i = 0; i < coredump->num_ibs; i++) {bool emit_content = sizing_pass;
for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, GFP_KERNEL); if (!ib_content) continue;
/* vm=NULL can only happen when 'sizing_pass' is true. Skip to the* drm_printf() calls (ib_content doesn't need to be initialized* as its content won't be written anywhere).*/if (!vm)
if (!locked) goto output_ib_content;va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); if (!mapping)
goto free_ib_content;
goto output_ib_content;
offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE);abo = amdgpu_bo_ref(mapping->bo_va->base.bo);r = amdgpu_bo_reserve(abo, false);if (r)goto free_ib_content;
abo = mapping->bo_va->base.bo;offset = va_start - mapping->start * AMDGPU_GPU_PAGE_SIZE;if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { off = 0; if (abo->tbo.resource->mem_type != TTM_PL_VRAM)
goto unreserve_abo;
goto output_ib_content;amdgpu_res_first(abo->tbo.resource, offset, coredump->ibs[i].ib_size_dw * 4, @@ -391,12 +429,13 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf off += cursor.size; amdgpu_res_next(&cursor, cursor.size); }
emit_content = true; } else { r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), &abo->kmap); if (r)
goto unreserve_abo;
goto output_ib_content;kptr = amdgpu_bo_kptr(abo); kptr += offset; @@ -404,23 +443,21 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf coredump->ibs[i].ib_size_dw * 4); amdgpu_bo_kunmap(abo);
emit_content = true; }output_ib_content: drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw);
for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)drm_printf(&p, "0x%08x\n", ib_content[j]);-unreserve_abo:
if (vm)amdgpu_bo_unreserve(abo);-free_ib_content:
if (emit_content) {for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)drm_printf(&p, "0x%08x\n", ib_content[j]); }} kvfree(ib_content);
if (vm) {amdgpu_bo_unreserve(root);amdgpu_bo_unref(&root);}
if (locked) }drm_exec_fini(&exec);return count - iter.remain;
Thanks for the review. v6 will:
- trim the commit message: drop the reproducer paragraph, keep just the problem description and the solution - move the IB dumping into its own function - replace the break-based flow inside drm_exec_until_all_locked() with goto error handling, and drop the now-superfluous `locked` variable - not call drm_exec_fini() in the locking helper on the error path
One thing I'd like to confirm before respinning — the !mapping case in the locking loop:
mapping = amdgpu_vm_bo_lookup_mapping(vm, pfn); if (!mapping) continue;
You commented "That's also an error, it could be that we just want to print the IB start address in that case."
My reading: a missing mapping is not fatal to the whole dump. For that IB there is simply nothing to lock, so the locking loop should move on to the next IB, and the content loop then still emits the "IB #N 0x<addr> <dw>" header with no body (it already does this via goto output_ib_content). The dump continues for the remaining IBs.
So in the locking loop I'd keep `continue` for !mapping, and reserve goto-abort only for real errors (drm_exec_lock_obj() failure, VM not found). Is that what you intended, or should a missing mapping abort the whole IB dump?
This series fixes a lockdep "possible recursive locking" splat in amdgpu_devcoredump_format() that fires on every GPU timeout once a job with a PASID context is involved. With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s, so the splat repeats until it drowns the kernel ring buffer. It is also a real self-deadlock for IB BOs that share their dma_resv with the root PD (the always-valid case).
The root cause: amdgpu_devcoredump_format() holds the VM root PD's reservation and then reserves each IB BO on top of it, nesting two reservation_ww_class_mutex acquires without a ww_acquire_ctx.
The fix teaches amdgpu_vm_lock_by_pasid() to lock the root PD in a drm_exec context, so the devcoredump path can lock the root PD and all the IB BOs together in one ww ticket. Because amdgpu_vm_lock_by_pasid() has a second caller in the page-fault path, the series is split so each patch builds and works on its own:
1/2 Convert amdgpu_vm_lock_by_pasid() to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). The drm_exec context holds the root BO reference, so the root output parameter is dropped. Updates the existing caller, amdgpu_vm_handle_fault(). Pure refactor, no functional change to the page-fault path. (Reviewed-by Christian on v5.)
2/2 Move the IB dumping into a separate helper that locks the root PD and every IB BO together in one drm_exec ticket. The per-IB amdgpu_bo_reserve() nesting is gone, along with a BO refcount leak on the old reserve-failure path. This is the actual bug fix and carries the Fixes: tag.
Tested on Linux 7.1-rc4 + this series, Radeon RX 7900 XTX (gfx1100), KASAN + PROVE_LOCKING enabled, using a small libdrm_amdgpu reproducer that submits a GFX IB chained at GPU VA 0 and waits for the hang. Before the series the splat fires on every TDR; after it the dmesg is clean across repeated timeouts and the devcoredump IB dump is produced correctly.
v1: https://lore.kernel.org/amd-gfx/20260429143743.50743-1-mikhail.v.gavrilov@gm... v2: https://lore.kernel.org/amd-gfx/20260519161541.19994-1-mikhail.v.gavrilov@gm... v3: https://lore.kernel.org/amd-gfx/20260520151741.50575-1-mikhail.v.gavrilov@gm... v4: https://lore.kernel.org/amd-gfx/20260521104335.28978-1-mikhail.v.gavrilov@gm... v5: https://lore.kernel.org/amd-gfx/20260521150841.20625-1-mikhail.v.gavrilov@gm...
Changes since v5 (all in patch 2, per Christian's review): - Trim the commit message: drop the reproducer paragraph, keep the problem description and the solution. - Move the IB dumping out of amdgpu_devcoredump_format() into a separate amdgpu_devcoredump_print_ibs() helper. - Use goto error handling inside drm_exec_until_all_locked() instead of break, and drop the now-superfluous `locked` variable. drm_exec_fini() is called once at the end of the helper, not in the locking path. - Patch 1 is unchanged from v5 and keeps Christian's Reviewed-by.
A note on one review point I couldn't fully confirm before respinning (asked on the v5 thread [1], no reply yet): in the locking loop, when amdgpu_vm_bo_lookup_mapping() returns no mapping for an IB, this version treats it as non-fatal -- there is simply nothing to lock for that IB, so the loop continues, and the content loop still emits the "IB #N <addr>" header without a body. goto-abort is reserved for real errors (VM not found, drm_exec_lock_obj() failure). If a missing mapping should instead abort the whole dump, I'll change it.
[1] https://lore.kernel.org/amd-gfx/CABXGCsPPY3qX7Ad-a7==nmA5R7aejCTCrmWYpn-9OQQ...
Changes since v4: - Pass nr=1 to drm_exec_init() in amdgpu_vm_handle_fault() (Christian). - Picked up Christian's Reviewed-by on patch 1.
Changes since v3: - Lock the root PD with drm_exec_lock_obj() instead of amdgpu_vm_lock_pd(); drop the root output parameter; unlock with drm_exec_unlock_obj() on the VM-recheck-failed path (Christian). - Resolves the docutils warning the kernel test robot reported on v3.
Changes since v2: - Reworked along the lines Christian suggested: amdgpu_vm_lock_by_pasid() takes a drm_exec context directly (patch 1), devcoredump locks the root PD and all IB BOs in one ticket (patch 2). The v2 helper struct and the three collect/lock/release helpers are gone.
Changes since v1: - Switched from per-IB amdgpu_bo_reserve() to drm_exec. - Dropped the Cc: stable tag: the regression only landed in 7.1-rc1, so the fix reaches 7.1 via drm-fixes without a stable backport.
Mikhail Gavrilov (2): drm/amdgpu: convert amdgpu_vm_lock_by_pasid() to drm_exec drm/amdgpu: fix recursive ww_mutex acquire in amdgpu_devcoredump_format
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 215 ++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 +++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 3 files changed, 184 insertions(+), 124 deletions(-)
amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to the caller. A caller that then needs to reserve further BOs (for example the devcoredump IB dump) ends up nesting reservation_ww_class_mutex acquires without a ww_acquire_ctx, which lockdep flags as recursive locking.
Convert the helper to take a drm_exec context and lock the root PD with drm_exec_lock_obj(). Callers now run it inside a drm_exec_until_all_locked() loop and can lock additional BOs in the same ww ticket, so there is no nested ww_mutex acquire.
The drm_exec context holds its own reference on the locked root BO, so the helper no longer hands a root reference back to the caller: the root output parameter is dropped, and the transient reference taken across the PASID lookup is released before returning.
The only existing caller, amdgpu_vm_handle_fault(), is updated accordingly. Its is_compute_context path, which previously dropped the root reservation around svm_range_restore_pages() and re-took it, now finalises the drm_exec context and re-initialises a fresh one; behaviour is otherwise unchanged.
No functional change intended for the page-fault path.
Reviewed-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 ++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 2 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index fccd758b6699..b9d93b664daf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2950,47 +2950,56 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) }
/** - * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible. + * amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD * @adev: amdgpu device pointer - * @root: root BO of the VM * @pasid: PASID of the VM - * The caller needs to unreserve and unref the root bo on success. + * @exec: drm_exec context to lock the root PD in + * + * Must be called from within a drm_exec_until_all_locked() loop; the caller + * runs drm_exec_retry_on_contention() afterwards. The drm_exec context holds + * a reference on the root BO until it is finalised. + * + * Return: the VM on success, or NULL if the PASID has no VM, the VM is being + * torn down, or locking the root PD failed. */ struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid) + u32 pasid, struct drm_exec *exec) { unsigned long irqflags; + struct amdgpu_bo *root; struct amdgpu_vm *vm; int r;
xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid); - *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; + root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
- if (!*root) + if (!root) return NULL;
- r = amdgpu_bo_reserve(*root, true); - if (r) - goto error_unref; + r = drm_exec_lock_obj(exec, &root->tbo.base); + if (r) { + amdgpu_bo_unref(&root); + return NULL; + }
/* Double check that the VM still exists */ xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); vm = xa_load(&adev->vm_manager.pasids, pasid); - if (vm && vm->root.bo != *root) + if (vm && vm->root.bo != root) vm = NULL; xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); - if (!vm) - goto error_unlock; + if (!vm) { + drm_exec_unlock_obj(exec, &root->tbo.base); + amdgpu_bo_unref(&root); + return NULL; + }
- return vm; -error_unlock: - amdgpu_bo_unreserve(*root); + /* The drm_exec context holds its own reference on the root BO. */ + amdgpu_bo_unref(&root);
-error_unref: - amdgpu_bo_unref(root); - return NULL; + return vm; }
/** @@ -3012,33 +3021,49 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, uint64_t ts, bool write_fault) { bool is_compute_context = false; - struct amdgpu_bo *root; + struct drm_exec exec; uint64_t value, flags; struct amdgpu_vm *vm; int r;
- vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 1); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + }
is_compute_context = vm->is_compute_context;
if (is_compute_context) { - /* Unreserve root since svm_range_restore_pages might try to reserve it. */ - /* TODO: rework svm_range_restore_pages so that this isn't necessary. */ - amdgpu_bo_unreserve(root); + /* Release the root PD lock since svm_range_restore_pages + * might try to take it. + * TODO: rework svm_range_restore_pages so that this isn't + * necessary. + */ + drm_exec_fini(&exec);
if (!svm_range_restore_pages(adev, pasid, vmid, - node_id, addr >> PAGE_SHIFT, ts, write_fault)) { - amdgpu_bo_unref(&root); + node_id, addr >> PAGE_SHIFT, ts, write_fault)) return true; - } - amdgpu_bo_unref(&root);
/* Re-acquire the VM lock, could be that the VM was freed in between. */ - vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); - if (!vm) + drm_exec_init(&exec, 0, 1); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); + drm_exec_retry_on_contention(&exec); + if (!vm) + break; + } + if (!vm) { + drm_exec_fini(&exec); return false; + } }
addr /= AMDGPU_GPU_PAGE_SIZE; @@ -3062,7 +3087,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, value = 0; }
- r = dma_resv_reserve_fences(root->tbo.base.resv, 1); + r = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1); if (r) { pr_debug("failed %d to reserve fence slot\n", r); goto error_unlock; @@ -3076,12 +3101,10 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true);
error_unlock: - amdgpu_bo_unreserve(root); + drm_exec_fini(&exec); if (r < 0) dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
- amdgpu_bo_unref(&root); - return false; }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d083d7aab75c..0c6e3e0368c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -593,7 +593,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool write_fault);
struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, - struct amdgpu_bo **root, u32 pasid); + u32 pasid, struct drm_exec *exec);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
When dumping IB contents from a hung job, amdgpu_devcoredump_format() acquired the VM root PD's reservation via amdgpu_vm_lock_by_pasid() and then, for each IB, called amdgpu_bo_reserve() on the BO backing the IB. Both reservations are reservation_ww_class_mutex objects and neither used a ww_acquire_ctx, which trips lockdep:
WARNING: possible recursive locking detected -------------------------------------------- kworker/u128:0 is trying to acquire lock: ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock: ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4}, at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario: CPU0 ---- lock(reservation_ww_class_mutex); lock(reservation_ww_class_mutex);
*** DEADLOCK *** May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu] Call Trace: __ww_mutex_lock.constprop.0 ww_mutex_lock amdgpu_bo_reserve amdgpu_devcoredump_format+0x1594 [amdgpu] amdgpu_devcoredump_deferred_work+0xea [amdgpu]
The two reservations are on different BOs in the captured trace, so the splat is a lockdep-correctness warning, not an observed deadlock. It becomes a real self-deadlock whenever the IB BO shares its dma_resv with the root PD (the always-valid case, see amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the same ww_mutex without a ticket and blocks forever. With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and each invocation produces this splat, drowning the kernel ring buffer.
Now that amdgpu_vm_lock_by_pasid() takes a drm_exec context, move the IB dumping into a separate helper that locks the root PD and every IB BO together in a single drm_exec ticket. DRM_EXEC_IGNORE_DUPLICATES handles IB BOs that share a dma_resv (e.g. always-valid BOs, or two IBs backed by the same BO). Every lock is now a top-level acquire under one ww_acquire_ctx, so the recursive ww_mutex condition is gone, and the per-IB amdgpu_bo_reserve()/amdgpu_bo_unref() dance -- including a BO refcount leak on the amdgpu_bo_reserve() failure path -- is removed.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump") Suggested-by: Christian König christian.koenig@amd.com Signed-off-by: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com --- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 215 ++++++++++-------- 1 file changed, 126 insertions(+), 89 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index d386bc775d03..c7d43796d603 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -24,6 +24,7 @@
#include <generated/utsrelease.h> #include <linux/devcoredump.h> +#include <drm/drm_exec.h> #include "amdgpu_dev_coredump.h" #include "atom.h"
@@ -207,23 +208,137 @@ static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, } }
+static void +amdgpu_devcoredump_print_ibs(struct drm_printer *p, + struct amdgpu_coredump_info *coredump, + bool sizing_pass) +{ + struct amdgpu_device *adev = coredump->adev; + struct amdgpu_bo_va_mapping *mapping; + struct amdgpu_bo *abo; + struct drm_exec exec; + struct amdgpu_vm *vm; + u32 *ib_content; + u64 va_start, offset; + u8 *kptr; + u32 off; + int r; + + /* + * On the sizing pass there is no VM to look up and no BO to lock; the + * size estimate doesn't depend on whether the IB BOs are reachable. + * Just emit the per-IB headers (the content is not written anywhere). + */ + if (sizing_pass) { + for (int i = 0; i < coredump->num_ibs; i++) { + drm_printf(p, "\nIB #%d 0x%llx %d dw\n", i, + coredump->ibs[i].gpu_addr, + coredump->ibs[i].ib_size_dw); + } + return; + } + + /* + * Lock the VM root PD and every IB BO together in a single drm_exec + * ticket. Reserving the IB BOs one by one while the root PD is held + * would be a recursive reservation_ww_class_mutex acquire without a + * ww_acquire_ctx, which trips lockdep and self-deadlocks for IB BOs + * that share their dma_resv with the root PD (always-valid BOs). + */ + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 1 + coredump->num_ibs); + drm_exec_until_all_locked(&exec) { + vm = amdgpu_vm_lock_by_pasid(adev, coredump->pasid, &exec); + if (!vm) + goto unlock; + + for (int i = 0; i < coredump->num_ibs; i++) { + u64 pfn = (coredump->ibs[i].gpu_addr & + AMDGPU_GMC_HOLE_MASK) / AMDGPU_GPU_PAGE_SIZE; + + mapping = amdgpu_vm_bo_lookup_mapping(vm, pfn); + if (!mapping) + continue; + + abo = mapping->bo_va->base.bo; + r = drm_exec_lock_obj(&exec, &abo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (r) + goto unlock; + } + } + + for (int i = 0; i < coredump->num_ibs; i++) { + bool emit_content = false; + + ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, + GFP_KERNEL); + if (!ib_content) + continue; + + va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; + mapping = amdgpu_vm_bo_lookup_mapping(vm, + va_start / AMDGPU_GPU_PAGE_SIZE); + if (!mapping) + goto output_ib_content; + + abo = mapping->bo_va->base.bo; + offset = va_start - mapping->start * AMDGPU_GPU_PAGE_SIZE; + + if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { + struct amdgpu_res_cursor cursor; + + off = 0; + + if (abo->tbo.resource->mem_type != TTM_PL_VRAM) + goto output_ib_content; + + amdgpu_res_first(abo->tbo.resource, offset, + coredump->ibs[i].ib_size_dw * 4, &cursor); + while (cursor.remaining) { + amdgpu_device_mm_access(adev, cursor.start / 4, + &ib_content[off], cursor.size / 4, + false); + off += cursor.size; + amdgpu_res_next(&cursor, cursor.size); + } + emit_content = true; + } else { + r = ttm_bo_kmap(&abo->tbo, 0, PFN_UP(abo->tbo.base.size), + &abo->kmap); + if (r) + goto output_ib_content; + + kptr = amdgpu_bo_kptr(abo); + kptr += offset; + memcpy(ib_content, kptr, coredump->ibs[i].ib_size_dw * 4); + + amdgpu_bo_kunmap(abo); + emit_content = true; + } + +output_ib_content: + drm_printf(p, "\nIB #%d 0x%llx %d dw\n", i, + coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); + if (emit_content) { + for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) + drm_printf(p, "0x%08x\n", ib_content[j]); + } + kvfree(ib_content); + } + +unlock: + drm_exec_fini(&exec); +} + static ssize_t amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump) { - struct amdgpu_device *adev = coredump->adev; struct drm_printer p; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; - struct amdgpu_bo_va_mapping *mapping; struct amdgpu_ip_block *ip_block; - struct amdgpu_res_cursor cursor; - struct amdgpu_bo *abo, *root; - uint64_t va_start, offset; struct amdgpu_ring *ring; - struct amdgpu_vm *vm; - u32 *ib_content; - uint8_t *kptr; - int ver, i, j, r; + int ver, i, j; u32 ring_idx, off; bool sizing_pass;
@@ -342,86 +457,8 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf else if (coredump->reset_vram_lost) drm_printf(&p, "VRAM is lost due to GPU reset!\n");
- if (coredump->num_ibs) { - /* Don't try to lookup the VM or map the BOs when calculating the - * size required to store the devcoredump. - */ - if (sizing_pass) - vm = NULL; - else - vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); - - for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { - ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, - GFP_KERNEL); - if (!ib_content) - continue; - - /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the - * drm_printf() calls (ib_content doesn't need to be initialized - * as its content won't be written anywhere). - */ - if (!vm) - goto output_ib_content; - - va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; - mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); - if (!mapping) - goto free_ib_content; - - offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); - abo = amdgpu_bo_ref(mapping->bo_va->base.bo); - r = amdgpu_bo_reserve(abo, false); - if (r) - goto free_ib_content; - - if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { - off = 0; - - if (abo->tbo.resource->mem_type != TTM_PL_VRAM) - goto unreserve_abo; - - amdgpu_res_first(abo->tbo.resource, offset, - coredump->ibs[i].ib_size_dw * 4, - &cursor); - while (cursor.remaining) { - amdgpu_device_mm_access(adev, cursor.start / 4, - &ib_content[off], cursor.size / 4, - false); - off += cursor.size; - amdgpu_res_next(&cursor, cursor.size); - } - } else { - r = ttm_bo_kmap(&abo->tbo, 0, - PFN_UP(abo->tbo.base.size), - &abo->kmap); - if (r) - goto unreserve_abo; - - kptr = amdgpu_bo_kptr(abo); - kptr += offset; - memcpy(ib_content, kptr, - coredump->ibs[i].ib_size_dw * 4); - - amdgpu_bo_kunmap(abo); - } - -output_ib_content: - drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", - i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); - for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) - drm_printf(&p, "0x%08x\n", ib_content[j]); -unreserve_abo: - if (vm) - amdgpu_bo_unreserve(abo); -free_ib_content: - kvfree(ib_content); - } - if (vm) { - amdgpu_bo_unreserve(root); - amdgpu_bo_unref(&root); - } - } + if (coredump->num_ibs) + amdgpu_devcoredump_print_ibs(&p, coredump, sizing_pass);
return count - iter.remain; }
linaro-mm-sig@lists.linaro.org