When dumping IB contents from a hung job, amdgpu_devcoredump_format()
acquires the VM root PD's reservation lock via amdgpu_vm_lock_by_pasid()
and then, for each IB referenced by the job, calls amdgpu_bo_reserve()
on the BO that backs the IB. Both reservations are taken on
reservation_ww_class_mutex objects but neither uses a ww_acquire_ctx,
which trips lockdep:
WARNING: possible recursive locking detected
--------------------------------------------
kworker/u128:0 is trying to acquire lock:
ffff88838b16e1f0 (reservation_ww_class_mutex){+.+.}-{4:4},
at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
but task is already holding lock:
ffff8882f82681f0 (reservation_ww_class_mutex){+.+.}-{4:4},
at: amdgpu_devcoredump_format+0x1594/0x23f0 [amdgpu]
Possible unsafe locking scenario:
CPU0
----
lock(reservation_ww_class_mutex);
lock(reservation_ww_class_mutex);
*** DEADLOCK ***
May be due to missing lock nesting notation
Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu]
Call Trace:
__ww_mutex_lock.constprop.0
ww_mutex_lock
amdgpu_bo_reserve
amdgpu_devcoredump_format+0x1594 [amdgpu]
amdgpu_devcoredump_deferred_work+0xea [amdgpu]
process_one_work
worker_thread
kthread
The two reservations are on different BOs in the captured trace, so the
splat is a lockdep-correctness warning, not an observed deadlock. It
becomes a real self-deadlock whenever the IB BO shares its dma_resv
with the root PD (the always-valid case, see
amdgpu_vm_is_bo_always_valid()): amdgpu_bo_reserve(abo) re-acquires the
same ww_mutex without a ticket and blocks forever.
With amdgpu.gpu_recovery=0 the timeout handler refires every ~2 s and
each invocation produces this splat, drowning the kernel ring buffer.
Fix it by collecting the per-IB BO references under the root PD's
reservation, then releasing the root before reserving each IB BO
individually. The walk over the VM mapping tree must remain under the
root lock (mappings can be torn down without it), but the actual
content copies do not need to nest inside it. Each per-IB reservation
is now an independent top-level acquire, eliminating the nested
ww_mutex.
The collect/release logic is factored out into two small helpers
(amdgpu_devcoredump_collect_ib_refs / amdgpu_devcoredump_release_ib_refs)
to keep the main function's indentation reasonable.
This also fixes a BO refcount leak in the original code: when
amdgpu_bo_reserve() failed, control jumped to free_ib_content without
running amdgpu_bo_unref(). In the new structure the per-IB BO refs
are released unconditionally in the cleanup helper.
Reproducer (~150 LoC libdrm_amdgpu): submit a single GFX IB containing
PACKET3_INDIRECT_BUFFER chained at GPU VA 0 and wait for the fence.
The TDR fires within ~10 s and the deferred coredump worker produces
the splat above on every invocation.
Fixes: 7b15fc2d1f1a ("drm/amdgpu: dump job ibs in the devcoredump")
Cc: stable(a)vger.kernel.org # 7.1
Signed-off-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 147 +++++++++++++-----
1 file changed, 110 insertions(+), 37 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index d386bc775d03..f6bb968de756 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -207,6 +207,72 @@ static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
}
}
+struct amdgpu_devcoredump_ib_ref {
+ struct amdgpu_bo *bo;
+ u64 offset;
+};
+
+/*
+ * Walk the VM's mapping tree under the root PD's reservation to obtain the BO
+ * that backs each IB and pin it with a refcount. The root PD reservation is
+ * dropped before this function returns; the caller can then reserve each IB
+ * BO individually without nesting ww_mutex acquires on
+ * reservation_ww_class_mutex.
+ *
+ * Returns an array of num_ibs entries (each ib_refs[i].bo may be NULL if its
+ * mapping was not found), or NULL on allocation failure / VM lookup failure.
+ * The caller must release the BO refs and free the array.
+ */
+static struct amdgpu_devcoredump_ib_ref *
+amdgpu_devcoredump_collect_ib_refs(struct amdgpu_device *adev,
+ struct amdgpu_coredump_info *coredump)
+{
+ struct amdgpu_devcoredump_ib_ref *ib_refs;
+ struct amdgpu_bo_va_mapping *mapping;
+ struct amdgpu_bo *root;
+ struct amdgpu_vm *vm;
+ u64 va_start;
+
+ ib_refs = kcalloc(coredump->num_ibs, sizeof(*ib_refs), GFP_KERNEL);
+ if (!ib_refs)
+ return NULL;
+
+ vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);
+ if (!vm) {
+ kfree(ib_refs);
+ return NULL;
+ }
+
+ for (int i = 0; i < coredump->num_ibs; i++) {
+ va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK;
+ mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE);
+ if (!mapping)
+ continue;
+
+ ib_refs[i].bo = amdgpu_bo_ref(mapping->bo_va->base.bo);
+ ib_refs[i].offset = va_start -
+ mapping->start * AMDGPU_GPU_PAGE_SIZE;
+ }
+
+ amdgpu_bo_unreserve(root);
+ amdgpu_bo_unref(&root);
+
+ return ib_refs;
+}
+
+static void
+amdgpu_devcoredump_release_ib_refs(struct amdgpu_devcoredump_ib_ref *ib_refs,
+ int num_ibs)
+{
+ if (!ib_refs)
+ return;
+
+ for (int i = 0; i < num_ibs; i++)
+ if (ib_refs[i].bo)
+ amdgpu_bo_unref(&ib_refs[i].bo);
+ kfree(ib_refs);
+}
+
static ssize_t
amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump)
{
@@ -214,13 +280,11 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf
struct drm_printer p;
struct drm_print_iterator iter;
struct amdgpu_vm_fault_info *fault_info;
- struct amdgpu_bo_va_mapping *mapping;
struct amdgpu_ip_block *ip_block;
struct amdgpu_res_cursor cursor;
- struct amdgpu_bo *abo, *root;
- uint64_t va_start, offset;
+ struct amdgpu_bo *abo;
+ uint64_t offset;
struct amdgpu_ring *ring;
- struct amdgpu_vm *vm;
u32 *ib_content;
uint8_t *kptr;
int ver, i, j, r;
@@ -343,43 +407,52 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->num_ibs) {
- /* Don't try to lookup the VM or map the BOs when calculating the
- * size required to store the devcoredump.
+ struct amdgpu_devcoredump_ib_ref *ib_refs = NULL;
+
+ /*
+ * Snapshot per-IB BO references under the root PD's reservation,
+ * then release the root before reserving each IB BO individually
+ * to copy its contents.
+ *
+ * Reserving an IB BO while the root PD is still reserved would
+ * be a nested ww_mutex acquire on reservation_ww_class_mutex
+ * without a ww_acquire_ctx, which trips lockdep's recursive-
+ * locking check and self-deadlocks for IB BOs that share their
+ * dma_resv with the root PD (always-valid BOs).
+ *
+ * Skip lookup/reservation entirely on the sizing pass: it does
+ * not write IB content, and the size estimate doesn't depend on
+ * whether the BOs are reachable.
*/
- if (sizing_pass)
- vm = NULL;
- else
- vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);
+ if (!sizing_pass)
+ ib_refs = amdgpu_devcoredump_collect_ib_refs(adev, coredump);
- for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) {
+ for (int i = 0; i < coredump->num_ibs; i++) {
ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4,
GFP_KERNEL);
if (!ib_content)
continue;
- /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the
- * drm_printf() calls (ib_content doesn't need to be initialized
- * as its content won't be written anywhere).
- */
- if (!vm)
+ if (sizing_pass)
goto output_ib_content;
- va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK;
- mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE);
- if (!mapping)
- goto free_ib_content;
+ if (!ib_refs || !ib_refs[i].bo)
+ goto output_ib_content;
+
+ abo = ib_refs[i].bo;
+ offset = ib_refs[i].offset;
- offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE);
- abo = amdgpu_bo_ref(mapping->bo_va->base.bo);
r = amdgpu_bo_reserve(abo, false);
if (r)
- goto free_ib_content;
+ goto output_ib_content;
if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) {
off = 0;
- if (abo->tbo.resource->mem_type != TTM_PL_VRAM)
- goto unreserve_abo;
+ if (abo->tbo.resource->mem_type != TTM_PL_VRAM) {
+ amdgpu_bo_unreserve(abo);
+ goto output_ib_content;
+ }
amdgpu_res_first(abo->tbo.resource, offset,
coredump->ibs[i].ib_size_dw * 4,
@@ -395,8 +468,10 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf
r = ttm_bo_kmap(&abo->tbo, 0,
PFN_UP(abo->tbo.base.size),
&abo->kmap);
- if (r)
- goto unreserve_abo;
+ if (r) {
+ amdgpu_bo_unreserve(abo);
+ goto output_ib_content;
+ }
kptr = amdgpu_bo_kptr(abo);
kptr += offset;
@@ -406,21 +481,19 @@ amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_inf
amdgpu_bo_kunmap(abo);
}
+ amdgpu_bo_unreserve(abo);
+
output_ib_content:
drm_printf(&p, "\nIB #%d 0x%llx %d dw\n",
i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw);
- for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)
- drm_printf(&p, "0x%08x\n", ib_content[j]);
-unreserve_abo:
- if (vm)
- amdgpu_bo_unreserve(abo);
-free_ib_content:
+ if (!sizing_pass && ib_refs && ib_refs[i].bo) {
+ for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)
+ drm_printf(&p, "0x%08x\n", ib_content[j]);
+ }
kvfree(ib_content);
}
- if (vm) {
- amdgpu_bo_unreserve(root);
- amdgpu_bo_unref(&root);
- }
+
+ amdgpu_devcoredump_release_ib_refs(ib_refs, coredump->num_ibs);
}
return count - iter.remain;
--
2.54.0
When sharing a dma-buf between components of different trust levels, the
allocator may need to hand a consumer a read-only view of a buffer it
holds with read-write access. An example is a camera pipeline where the
capture component writes frames into a buffer and needs to pass a
read-only handle to a downstream processing component that should not be
able to modify the data.
However, no such mechanism exists today. The access mode of a dma-buf
file descriptor is fixed at export time, and the standard POSIX
interfaces for duplicating or changing file descriptors (i.e., dup(2),
dup3(2), and fcntl(F_SETFL)) cannot alter the read/write access mode of
the copy.
One natural candidate would be reopening via /proc/self/fd/<N> with
O_RDONLY, which works for regular files. For dma-buf this would fail
(that is, if we were to add a new handler for open f_op) with ENXIO
because the dmabuf pseudo-filesystem carries SB_NOUSER, which prevents
the VFS from opening its files through path-based resolution from
userspace.
Alternatively, exporting the buffer twice would produce two independent
dma_buf instances, which breaks fence synchronization.
Therefore we add a new DMA_BUF_IOCTL_DERIVE ioctl, which produces a new
file descriptor for an existing dma-buf with a caller-specified subset
of the original permissions:
```
struct dma_buf_derive { __u32 flags; __s32 fd; };
struct dma_buf_derive req = { .flags = O_RDONLY | O_CLOEXEC };
ioctl(rw_fd, DMA_BUF_IOCTL_DERIVE, &req);
/* req.fd is now a read-only alias of the same buffer */
```
Permission escalation is rejected with -EACCES. The new fd aliases the
same struct dma_buf as the original, same dma_resv, same exporter ops,
same underlying memory; so importers attaching to either fd see the same
fence timeline and operate on the same object. Access control for which
components may receive or pass on restricted descriptors can be layered on
top via SELinux file:read and file:write permissions.
A shared writable mapping (PROT_WRITE | MAP_SHARED) on the read-only fd is
rejected with -EACCES in dma_buf_mmap_internal().
Two small internal adjustments accompany the ioctl:
- __dma_buf_list_del() is moved to dma_buf_release() so it fires exactly
once on dentry destruction rather than on every file close.
- dma_buf_file_release() is updated to call dma_buf_put() only for
files that are not the primary dma-buf file.
This may not be the best approach, but after considering different
options and alternatives (as described above), we decided to raise the
discussion upstream. Thus, we welcome any alternative proposal or ideas.
The series is structured as:
- Patch 1 adds the new ioctl implementation.
- Patch 2 adds selftests covering the new ioctl.
Signed-off-by: Albert Esteve <aesteve(a)redhat.com>
---
Albert Esteve (2):
dma-buf: add DMA_BUF_IOCTL_DERIVE for reduced-permission aliases
selftests: dma-buf: add DERIVE ioctl tests
drivers/dma-buf/dma-buf.c | 58 ++++++++++-
include/uapi/linux/dma-buf.h | 28 +++++
tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c | 114 ++++++++++++++++++++-
3 files changed, 198 insertions(+), 2 deletions(-)
---
base-commit: ab5fce87a778cb780a05984a2ca448f2b41aafbf
change-id: 20260520-dmabuf-limit-access-73261353841a
Best regards,
--
Albert Esteve <aesteve(a)redhat.com>
Turn the mixed bag of manual locks and guards into something
more consistent.
This patchset takes care of locks that already have guards
available, but also adds new guards for resv, drm_dev_enter/exit
and the custom panthor_device_resume_and_get() helper we have
around runtime PM.
I've intentionally placed the patch transition all locks with
readily available guards first so we can merge it even if the
new guards face some controversy.
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
---
Boris Brezillon (6):
drm/panthor: Driver-wide xxx_[un]lock -> [scoped_]guard replacement
dma-resv: Define guards for context-less dma_resv locks
drm: Define a conditional guard for drm_dev_{enter,exit}()
drm/panthor: Use guards for resv locking
drm/panthor: Use the drm_dev_access guard
drm/panthor: Add a new guard for our custom resume_and_get() PM helper
drivers/gpu/drm/panthor/panthor_devfreq.c | 29 +-
drivers/gpu/drm/panthor/panthor_device.c | 163 +++++-----
drivers/gpu/drm/panthor/panthor_device.h | 10 +-
drivers/gpu/drm/panthor/panthor_drv.c | 62 ++--
drivers/gpu/drm/panthor/panthor_gem.c | 102 +++----
drivers/gpu/drm/panthor/panthor_gpu.c | 40 +--
drivers/gpu/drm/panthor/panthor_heap.c | 139 ++++-----
drivers/gpu/drm/panthor/panthor_mmu.c | 480 ++++++++++++++----------------
drivers/gpu/drm/panthor/panthor_pwr.c | 8 +-
drivers/gpu/drm/panthor/panthor_sched.c | 254 ++++++++--------
include/drm/drm_drv.h | 9 +
include/linux/dma-resv.h | 5 +
12 files changed, 589 insertions(+), 712 deletions(-)
---
base-commit: ac5ac0acf11df04295eb1811066097b7022d6c7f
change-id: 20260512-panthor-guard-refactor-f1c6bc30c321
prerequisite-message-id: 20260512-panthor-signal-from-irq-v2-0-95c614a739cb(a)collabora.com
prerequisite-patch-id: e3cfd6399b2dc5439687932c6e961d845369562a
prerequisite-patch-id: 79820e6740c0c456efc1dfa273de04e495515a1c
prerequisite-patch-id: a3611a7c9551c606aaf87125782e6d18b6a6549e
prerequisite-patch-id: 6e9dc83a60e53e7b0d84030727ad9b1921e4b2ca
prerequisite-patch-id: eabd36064a01418a6ada3176b996a4038a314c21
prerequisite-patch-id: ca3a30182b71bf66c51ed2b6411d7ed8dc761c8e
prerequisite-patch-id: 6e549dd0ee9e3e0c8866da72dcabc82209d88360
prerequisite-patch-id: 5217700df7026ef533a2f273ea2535f9fc1274ac
prerequisite-patch-id: 8d57abec9f92bcbb21108d3005805b7c155a48f6
prerequisite-patch-id: 0bf98de955fce577ff8d4fb82c02dc04684beca6
prerequisite-patch-id: a9e0d90a64dfd5950a69b857af3867404be1ab45
Best regards,
--
Boris Brezillon <boris.brezillon(a)collabora.com>
On Wed, May 20, 2026 at 02:43:50PM -0700, John Hubbard wrote:
> The dma-buf pseudo filesystem dispenses S_ANON_INODE inodes via
> alloc_anon_inode() but never sets SB_I_NOEXEC on its superblock.
> Since commit 1e7ab6f67824 ("anon_inode: rework assertions") in 6.17,
> path_noexec() warns on exactly that combination, so an mmap() on any
> dma-buf fd trips the warning:
>
> WARNING: CPU: 11 PID: 121813 at fs/exec.c:118 path_noexec+0x47/0x50
> do_mmap+0x2b5/0x680
> vm_mmap_pgoff+0x129/0x210
> ksys_mmap_pgoff+0x177/0x240
> __x64_sys_mmap+0x33/0x70
>
> dma-bufs have no business being executable, which is the invariant
> that the new assertion is enforcing. Set SB_I_NOEXEC on the dmabuf
> superblock.
>
> Reproducer on a CONFIG_DEBUG_VFS=y kernel:
>
> make -C tools/testing/selftests/dmabuf-heaps
> sudo ./tools/testing/selftests/dmabuf-heaps/dmabuf-heap -t system
>
> The selftest allocates from /dev/dma_heap/system and mmaps the
> returned fd, which trips the warning without this patch.
>
> Fixes: 1e7ab6f67824 ("anon_inode: rework assertions")
> Cc: stable(a)vger.kernel.org
> Signed-off-by: John Hubbard <jhubbard(a)nvidia.com>
> ---
Perfect, the asserts are paying off. Thanks!
Reviewed-by: Christian Brauner (Amutable) <brauner(a)kernel.org>
> drivers/dma-buf/dma-buf.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index 71f37544a5c6..d86a99d7b8dc 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -216,6 +216,7 @@ static int dma_buf_fs_init_context(struct fs_context *fc)
> if (!ctx)
> return -ENOMEM;
> ctx->dops = &dma_buf_dentry_ops;
> + fc->s_iflags |= SB_I_NOEXEC;
While you're at it, also raise SB_I_NODEV. You're not creating any
device nodes and this is additional hardening.
4mmc best quality for sale WhatsApp +33605825265
3mmc best quality for sale WhatsApp +33605825265
We offer a wide range of products from the best laboratories in the world. All our products are 100% original and delivered worldwide at affordable prices.
Contact details.
WhatsApp +33605825265
E-mail: cryvojaserty(a)gmail.com
after registering their products, customers can track their shipment online and know its current location,
until it reaches the addressee. Our products are characterized by purity from 98 to 99%.
For questions about our products, please contact us.
Contact details.
WhatsApp +33605825265
Email .. cryvojaserty(a)gmail.com
Buy 4MMC Online|4MMC For Sale|Order 4MMC Online.. WhatsApp +33605825265
Buy 6apb, 4mmc, 5mapb, MDMA, Coke, Ketamine ... WhatsApp +33605825265
buy crystal meth online,
buy meth online,
how to buy cheap meth online,
buy methamphetamine for personal use,
buy MDPV online,
buy cheap meth online,
buy cheap methylone online,
buy amphetamine tablets online,
buy methylone online,
buy MDMA online,
how to buy cheap amphetamines online,
concert for sale online,
cheap concerts online
buy meth online,
methamphetamine for sale,
lisdexamfetamine used to treat ADHD for sale online
Contact details.
WhatsApp +33605825265
Email .. cryvojaserty(a)gmail.com
Each product has its own unique properties. We import directly from factory, so we can guarantee high quality product with purity over
99.9%. We are one of the safest and most trusted sellers of mephedrone, methylone, amphetamines, MDAI, 4-
Buy weight loss medications: Mounjaro, Ozempic, Wegovy, Saxenda
Buy weight loss medications without a prescription
Buy Mounjaro,
Buy Ozempic,
What is the most effective weight loss remedy available over the counter?
What is the most effective weight loss drug?
Ketamine HCl 200 mg/ml, solution for injection, 10 ml
Where to buy ketamine HCL 200 mg/ml
Contact details.
WhatsApp +33605825265
Email .. cryvojaserty(a)gmail.com
Ketamine Sale Online ..WhatsApp +33605825265
Ketamine hydrochloride injection 50 mg
Buy ketamine online with delivery via PayPal
Ketamine hydrochloride injection 50 mg, 10 ml vial
BUY COKE, XANAX, ADDERALL, ESTACY, METH, AMPHETAMINE SPEED, OXY 80 MG, COCAINE, MDMA WhatsApp +33605825265
Buy Ketamine, Nembutal, Adderall Online: WhatsApp +33605825265
,Buy Xanax 2 mg Online, Buy Painkillers, Buy Ozempic, Buy Sexanda Online, Buy Abortion Pills Online, Buy Xanax, Buy Wegovy | MDMA, 2CB, A-PVP, 3CMC | OZEMPIC | STEROIDS (ECT/Nembutal) | Heroin, cocaine. Buy Xanax Online, Buy Painkillers Online, Buy Nembutal Online, Buy Anxiety Pills Online
Contact details.
WhatsApp +33605825265
Email .. cryvojaserty(a)gmail.com
virtio_gpu_cursor_plane_update() and virtio_gpu_resource_flush() lock
the framebuffer BO's dma_resv via virtio_gpu_array_lock_resv() and
ignore its return value. The function can fail with -EINTR from
dma_resv_lock_interruptible() (signal during lock wait) or with
-ENOMEM from dma_resv_reserve_fences() (fence slot allocation),
leaving the resv lock not held. The queue path then walks the object
array and calls dma_resv_add_fence(), which requires the lock held;
with lockdep enabled this trips dma_resv_assert_held():
WARNING: drivers/dma-buf/dma-resv.c:296 at dma_resv_add_fence+0x71e/0x840
Call Trace:
virtio_gpu_array_add_fence
virtio_gpu_queue_ctrl_sgs
virtio_gpu_queue_fenced_ctrl_buffer
virtio_gpu_cursor_plane_update
drm_atomic_helper_commit_planes
drm_atomic_helper_commit_tail
commit_tail
drm_atomic_helper_commit
drm_atomic_commit
drm_atomic_helper_update_plane
__setplane_atomic
drm_mode_cursor_universal
drm_mode_cursor_common
drm_mode_cursor_ioctl
drm_ioctl
__x64_sys_ioctl
Beyond the WARN, mutating the dma_resv fence list without the lock
races with concurrent readers/writers and can corrupt the list.
Both call sites run inside the .atomic_update plane callback, which
DRM atomic helpers do not allow to fail (by the time it runs, the
commit has been signed off to userspace and there is no clean
rollback path). Moving the lock acquisition to .prepare_fb was
rejected because the broader lock scope deadlocks against other BO
locking paths in the same atomic commit.
Introduce virtio_gpu_lock_one_resv_uninterruptible() that uses
dma_resv_lock() instead of dma_resv_lock_interruptible(). This
eliminates the -EINTR failure mode -- the realistic syzbot trigger
-- without extending the lock hold across the commit. The helper
locks a single BO and rejects nents > 1 with -EINVAL; both fix
sites lock exactly one BO.
Use it from virtio_gpu_cursor_plane_update() and
virtio_gpu_resource_flush(); check the return value to handle the
remaining -ENOMEM case from dma_resv_reserve_fences() by freeing
the objs and skipping the plane update for that frame. The
framebuffer BOs touched here are not shared with other contexts
and lock contention is expected to be brief, so the loss of
signal-interruptibility is acceptable.
Other callers of virtio_gpu_array_lock_resv() (the ioctl paths)
continue to use the interruptible variant.
The bug was reported by syzbot, triggered via fault injection
(fail_nth) on the DRM_IOCTL_MODE_CURSOR path, which forces the
-ENOMEM branch in dma_resv_reserve_fences().
Reported-by: syzbot+72bd3dd3a5d5f39a0271(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=72bd3dd3a5d5f39a0271
Fixes: 5cfd31c5b3a3 ("drm/virtio: fix virtio_gpu_cursor_plane_update().")
Cc: stable(a)vger.kernel.org
Signed-off-by: Deepanshu Kartikey <kartikey406(a)gmail.com>
---
v4: Rename the helper to virtio_gpu_lock_one_resv_uninterruptible()
and reject objs->nents > 1 with -EINVAL. The v3 helper's
multi-object branch used drm_gem_lock_reservations(), which is
interruptible, contradicting the "uninterruptible" name; both
fix sites lock a single BO so the multi-object path is dropped.
(Dmitry Osipenko)
v3: Drop the prepare_fb/cleanup_fb approach from v2 (it deadlocked
against virtio_gpu_resource_flush(), which also locks the BO in
the same atomic commit). Instead add an uninterruptible variant
of the resv lock helper and use it in both
virtio_gpu_cursor_plane_update() and virtio_gpu_resource_flush().
(Dmitry Osipenko)
v2: Move resv lock acquisition from .atomic_update (which must not
fail) to .prepare_fb (which may), per maintainer review of v1.
The v1 approach of silently skipping the cursor update on lock
failure violated the atomic-commit contract with userspace.
---
drivers/gpu/drm/virtio/virtgpu_drv.h | 1 +
drivers/gpu/drm/virtio/virtgpu_gem.c | 17 +++++++++++++++++
drivers/gpu/drm/virtio/virtgpu_plane.c | 10 ++++++++--
3 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h
index f17660a71a3e..2f3531950aa4 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.h
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.h
@@ -317,6 +317,7 @@ virtio_gpu_array_from_handles(struct drm_file *drm_file, u32 *handles, u32 nents
void virtio_gpu_array_add_obj(struct virtio_gpu_object_array *objs,
struct drm_gem_object *obj);
int virtio_gpu_array_lock_resv(struct virtio_gpu_object_array *objs);
+int virtio_gpu_lock_one_resv_uninterruptible(struct virtio_gpu_object_array *objs);
void virtio_gpu_array_unlock_resv(struct virtio_gpu_object_array *objs);
void virtio_gpu_array_add_fence(struct virtio_gpu_object_array *objs,
struct dma_fence *fence);
diff --git a/drivers/gpu/drm/virtio/virtgpu_gem.c b/drivers/gpu/drm/virtio/virtgpu_gem.c
index f22dc5c21cd4..435d37d36034 100644
--- a/drivers/gpu/drm/virtio/virtgpu_gem.c
+++ b/drivers/gpu/drm/virtio/virtgpu_gem.c
@@ -238,6 +238,23 @@ int virtio_gpu_array_lock_resv(struct virtio_gpu_object_array *objs)
return ret;
}
+int virtio_gpu_lock_one_resv_uninterruptible(struct virtio_gpu_object_array *objs)
+{
+ int ret;
+
+ if (objs->nents != 1)
+ return -EINVAL;
+
+ dma_resv_lock(objs->objs[0]->resv, NULL);
+
+ ret = dma_resv_reserve_fences(objs->objs[0]->resv, 1);
+ if (ret) {
+ virtio_gpu_array_unlock_resv(objs);
+ return ret;
+ }
+ return 0;
+}
+
void virtio_gpu_array_unlock_resv(struct virtio_gpu_object_array *objs)
{
if (objs->nents == 1) {
diff --git a/drivers/gpu/drm/virtio/virtgpu_plane.c b/drivers/gpu/drm/virtio/virtgpu_plane.c
index a126d1b25f46..652352424744 100644
--- a/drivers/gpu/drm/virtio/virtgpu_plane.c
+++ b/drivers/gpu/drm/virtio/virtgpu_plane.c
@@ -215,7 +215,10 @@ static void virtio_gpu_resource_flush(struct drm_plane *plane,
if (!objs)
return;
virtio_gpu_array_add_obj(objs, vgfb->base.obj[0]);
- virtio_gpu_array_lock_resv(objs);
+ if (virtio_gpu_lock_one_resv_uninterruptible(objs)) {
+ virtio_gpu_array_put_free(objs);
+ return;
+ }
virtio_gpu_cmd_resource_flush(vgdev, bo->hw_res_handle, x, y,
width, height, objs,
vgplane_st->fence);
@@ -459,7 +462,10 @@ static void virtio_gpu_cursor_plane_update(struct drm_plane *plane,
if (!objs)
return;
virtio_gpu_array_add_obj(objs, vgfb->base.obj[0]);
- virtio_gpu_array_lock_resv(objs);
+ if (virtio_gpu_lock_one_resv_uninterruptible(objs)) {
+ virtio_gpu_array_put_free(objs);
+ return;
+ }
virtio_gpu_cmd_transfer_to_host_2d
(vgdev, 0,
plane->state->crtc_w,
--
2.43.0
This patch series introduces the Qualcomm DSP Accelerator (QDA) driver,
a DRM-based accelerator driver for Qualcomm DSPs. The driver provides a
standardized interface for offloading computational tasks to DSPs found
on Qualcomm SoCs, supporting all DSP domains.
The QDA driver implements the FastRPC protocol over the DRM accel
subsystem. It uses the same device-tree node structure as the existing
fastrpc driver in drivers/misc/. The approach for binding the QDA driver
to device-tree nodes while coexisting with the fastrpc driver is an open
item described below.
RFC thread: https://lore.kernel.org/dri-devel/20260224-qda-firstpost-v1-0-fe46a9c1a046@…
User-space staging branch
=========================
https://github.com/qualcomm/fastrpc/tree/accel/staging
Key Features
============
* Standard DRM accelerator interface via /dev/accel/accelN
* GEM-based buffer management with DMA-BUF import/export (PRIME)
* IOMMU-based memory isolation using per-process context banks
* FastRPC protocol implementation for DSP communication
* RPMsg transport layer for reliable message passing
* Support for all DSP domains (ADSP, CDSP, SDSP, GDSP)
* DRM IOCTL interface for DSP session management, buffer allocation,
and remote procedure invocation
Architecture
============
1. DRM Accelerator Framework Integration
The driver registers as a DRM accel device, exposing a standard
/dev/accel/accelN character device node. This provides established
DRM infrastructure for device management, file operations, and
IOCTL dispatch.
2. Memory Management
Buffers are managed as GEM objects with full PRIME support for
DMA-BUF import/export. This enables seamless buffer sharing with
other DRM drivers (GPU, camera, video) using standard kernel
mechanisms.
3. IOMMU Context Bank Management
IOMMU context banks (CBs) are represented as proper struct device
instances on a custom virtual bus (qda-compute-cb). Each CB device
is registered with the IOMMU subsystem and receives its own IOMMU
domain, enabling per-session address space isolation. The custom
bus was introduced because IOMMU context banks are synthetic
constructs — not real platform devices — and to ensure CB device
lifetime is strictly subordinate to the parent QDA device.
See also: https://lore.kernel.org/all/245d602f-3037-4ae3-9af9-d98f37258aae@oss.qualco…
4. Memory Manager Architecture
A pluggable memory manager coordinates IOMMU device assignment and
buffer allocation. The current implementation uses a DMA-coherent
backend with SID-prefixed DMA addresses for DSP firmware
compatibility.
5. Transport Layer
RPMsg communication is handled in a dedicated transport layer
(qda_rpmsg.c), separate from the core DRM driver logic.
6. Code Organization
The driver is organized across multiple files (~4600 lines total):
* qda_drv.c: Core driver and DRM integration
* qda_rpmsg.c: RPMsg transport layer
* qda_cb.c: Context bank device management
* qda_compute_bus.c: Custom virtual bus for CB devices
* qda_gem.c: GEM object management
* qda_prime.c: DMA-BUF import (PRIME)
* qda_memory_manager.c: IOMMU device registry and allocation
* qda_memory_dma.c: DMA-coherent allocation backend
* qda_fastrpc.c: FastRPC protocol implementation
* qda_ioctl.c: IOCTL dispatch
7. UAPI Design
The driver exposes DRM-style IOCTLs defined in
include/uapi/drm/qda_accel.h, following DRM UAPI conventions
(__u32/__u64 types, C++ guard, GPL-2.0-only WITH Linux-syscall-note).
Patch Series Organization
==========================
Patch 01: MAINTAINERS entry
Patch 02: Driver documentation (Documentation/accel/qda/)
Patches 03-04: Core driver skeleton and compute bus
Patch 05: iommu: Register qda-compute-cb bus with IOMMU subsystem
Patches 06-07: CB device enumeration and memory manager
Patch 08: QUERY IOCTL and UAPI header
Patches 09-11: GEM buffer management and PRIME import
Patches 12-15: FastRPC protocol (invoke, session create/release,
map/unmap)
Open Items
===========
1. Device-Tree Compatible String
The QDA driver uses the same device-tree node structure and
properties as the existing fastrpc driver in drivers/misc/. A
mechanism is needed to allow the QDA driver to bind to its device
node independently of the fastrpc driver.
The intended coexistence model is: platforms that require the
complete fastrpc feature set continue to use "qcom,fastrpc"; new
platforms where a feature available only in QDA takes priority, or
where QDA's current feature set is sufficient, use a QDA-specific
compatible string. New feature development is directed toward QDA
rather than the existing fastrpc driver. As QDA matures toward
feature parity with fastrpc, platforms can adopt the QDA-specific
compatible string exclusively.
The options under consideration are:
a) Add a new "qcom,qda" compatible string to the existing
qcom,fastrpc.yaml binding, since the DT node structure and
properties are identical. This avoids a separate binding file
but adds a QDA-specific string to a fastrpc binding.
b) Introduce a separate qcom,qda.yaml binding that references or
inherits the fastrpc binding properties.
Seeking guidance from DT binding maintainers on the preferred
approach.
2. Privilege Level Management
Currently, daemon processes and user processes have the same access
level as both use the same accel device node. This needs to be
addressed as daemons attach to privileged DSP protection domains
and require higher privilege levels for system-level operations.
Seeking guidance on the best approach: separate device nodes,
capability-based checks, or DRM master/authentication mechanisms.
3. UAPI Compatibility Layer
A compatibility layer is needed to facilitate migration of client
applications from the existing FastRPC UAPI to the new QDA UAPI,
ensuring a smooth transition for existing userspace code. Seeking
guidance on the preferred implementation approach: in-kernel
translation layer, userspace wrapper library, or hybrid solution.
An initial evaluation of an in-kernel translation shim was
performed, where legacy FastRPC device nodes (/dev/fastrpc-*) are
exposed and requests are internally routed to the QDA accel driver.
The goal was to keep the compatibility layer minimal, reuse existing
QDA helper paths (attach, buffer allocation, mapping, etc.), and
avoid duplication of GEM and buffer management logic.
However, the following challenges were identified:
a) Dependency on drm_file for QDA helpers
QDA relies on GEM-backed allocations and per-client handle
namespaces, which require a valid struct drm_file. Since GEM
handles are scoped per drm_file, the compatibility layer cannot
directly reuse QDA helper paths without establishing a proper
drm_file context for each client.
b) Lack of public API for drm_file creation
Creating a drm_file directly (similar to mock_drm_getfile()-style
approaches) is not feasible, as the required helpers
(drm_file_alloc(), drm_file_free(), etc.) are internal to the DRM
core and not exported. This prevents external drivers from safely
constructing and managing drm_file instances.
c) VFS-based open is not a viable solution
Opening the underlying accel device (/dev/accel/accelN) from the
compatibility driver via filp_open() does provide a valid
drm_file, but introduces reliance on userspace-visible device
paths, lack of stability in containerized or chroot environments,
and no clean mapping between legacy device nodes and accel
devices.
d) Userspace proxy limitations (CUSE)
A CUSE-based userspace proxy was evaluated. However, DMA-buf file
descriptors passed by legacy applications cannot be directly
reused in the CUSE daemon (file descriptors are process-specific),
which breaks buffer sharing semantics.
e) drm_client-based approaches do not match requirements
drm_client APIs (used for fbdev emulation) rely on a shared
drm_file and do not provide the per-client isolation required by
FastRPC semantics.
Due to the above constraints, it is currently unclear how to
implement an in-kernel compatibility layer that correctly handles
per-client drm_file contexts without relying on VFS paths or
non-exported DRM internals.
4. Documentation Improvements
Add detailed IOCTL usage examples, document DSP firmware interface
requirements, and create a migration guide from the existing FastRPC
driver.
5. Per-Session Memory Allocation
Develop a userspace API to support memory allocation on a per-session
basis, enabling session-specific memory management.
6. Audio and Sensors PD Support
The current series does not handle Audio PD and Sensors PD
functionalities. These specialized protection domains require
additional support for real-time constraints and power management.
Interface Compatibility
========================
The QDA driver uses the same device-tree node structure and child node
layout (including "qcom,fastrpc-compute-cb" child nodes) as the
existing fastrpc driver. The underlying FastRPC protocol and DSP
firmware interface are compatible with the existing fastrpc driver,
ensuring that DSP firmware and libraries continue to work without
modification.
References
==========
Previous discussions on this migration:
- https://lkml.org/lkml/2024/6/24/479
- https://lkml.org/lkml/2024/6/21/1252
Testing
=======
The driver has been tested on Qualcomm platforms with:
- Basic FastRPC attach/release operations
- DSP process creation and initialization
- Memory mapping/unmapping operations
- Dynamic invocation with various buffer types
- GEM buffer allocation and mmap
- PRIME buffer import from other subsystems
Signed-off-by: Ekansh Gupta <ekansh.gupta(a)oss.qualcomm.com>
---
Ekansh Gupta (15):
MAINTAINERS: Add entry for Qualcomm DSP Accelerator (QDA) driver
accel/qda: Add QDA driver documentation
accel/qda: Add initial QDA DRM accelerator driver
accel/qda: Add compute bus for QDA context banks
iommu: Add QDA compute context bank bus to iommu_buses
accel/qda: Create compute context bank devices on QDA compute bus
accel/qda: Add memory manager for CB devices
accel/qda: Add QUERY IOCTL and QDA UAPI header
accel/qda: Add DMA-backed GEM objects and memory manager integration
accel/qda: Add GEM_CREATE and GEM_MMAP_OFFSET IOCTLs
accel/qda: Add PRIME DMA-BUF import support
accel/qda: Add FastRPC invocation support
accel/qda: Add DSP process creation and release
accel/qda: Add remote memory mapping to DSP address space
accel/qda: Add remote memory unmap from DSP address space
Documentation/accel/index.rst | 1 +
Documentation/accel/qda/index.rst | 13 +
Documentation/accel/qda/qda.rst | 146 +++++
MAINTAINERS | 9 +
drivers/accel/Kconfig | 1 +
drivers/accel/Makefile | 2 +
drivers/accel/qda/Kconfig | 34 +
drivers/accel/qda/Makefile | 19 +
drivers/accel/qda/qda_cb.c | 146 +++++
drivers/accel/qda/qda_cb.h | 32 +
drivers/accel/qda/qda_compute_bus.c | 68 ++
drivers/accel/qda/qda_drv.c | 192 ++++++
drivers/accel/qda/qda_drv.h | 91 +++
drivers/accel/qda/qda_fastrpc.c | 1058 ++++++++++++++++++++++++++++++++
drivers/accel/qda/qda_fastrpc.h | 390 ++++++++++++
drivers/accel/qda/qda_gem.c | 177 ++++++
drivers/accel/qda/qda_gem.h | 62 ++
drivers/accel/qda/qda_ioctl.c | 296 +++++++++
drivers/accel/qda/qda_ioctl.h | 19 +
drivers/accel/qda/qda_memory_dma.c | 110 ++++
drivers/accel/qda/qda_memory_dma.h | 17 +
drivers/accel/qda/qda_memory_manager.c | 380 ++++++++++++
drivers/accel/qda/qda_memory_manager.h | 75 +++
drivers/accel/qda/qda_prime.c | 184 ++++++
drivers/accel/qda/qda_prime.h | 18 +
drivers/accel/qda/qda_rpmsg.c | 248 ++++++++
drivers/accel/qda/qda_rpmsg.h | 30 +
drivers/iommu/iommu.c | 4 +
include/linux/qda_compute_bus.h | 32 +
include/uapi/drm/qda_accel.h | 229 +++++++
30 files changed, 4083 insertions(+)
---
base-commit: 80dd246accce631c328ea43294e53b2b2dd2aa32
change-id: 20260519-qda-series-78c2bf0ed78b
Best regards,
--
Ekansh Gupta <ekansh.gupta(a)oss.qualcomm.com>
On 5/20/26 14:33, Xaver Hugl wrote:
> Am Mi., 20. Mai 2026 um 10:08 Uhr schrieb Christian König
> <christian.koenig(a)amd.com>:
>> Well I would say the other way around is a pretty common use case.
>>
>> In other words the compositors uses the internal GPU for composing and displaying the picture. And the client uses the external GPU for fast rendering.
> Sure, but that's not what I'm talking about.
Yeah sorry for that, I wasn't sure if I misunderstood your use case because it's usually the other way around.
>>> - the buffers from the client stay valid
>>
>> Buffers from the hot plugged GPU don't stay valid. Accessing CPU mappings either result in a SIGBUS or are redirected to a dummy page.
> Again, not what I wrote about. The buffers are on the integrated GPU.
General rule of thumb is that as long as the exporter stays around the buffers stay around as well.
>>> - the syncobj stays valid on the client side
>>> - the syncobj becomes invalid on the compositor side
>>
>> Nope that's not correct. The syncobj itself stays valid even if you completely hot plug the device.
>>
>> It can just be that the fences inside the syncobj are terminated with an error.
> What about eventfd created for a point on the syncobj?
The eventfd unfortunately doesn't has error handling as far as I know, so when a fence signals with an error condition then the eventfd you only sees that it is signaled.
> Another (future) problem with hotplugs will be if the sync file hasn't
> materialized for the timeline point when the device is hotunplugged,
> since there can't be an error on the fence if there isn't one. Or
> could userspace somehow set an 'artificial' fence with an error in
> that case?
In general the answer is yes, userspace needs to take care of inserting fences when wait before signal is used and the work can not be submitted to the HW for some reason.
Currently we only have an IOCTL to insert the signaled dummy fence at some timeline sequence, but it should be trivial as well to insert a signaled fence with an error code.
But the compositor needs to be able to handle that case anyway, because it can be that a malicious or just buggy client just never inserts the fence.
So that a device is hot plugged is not different to just a client not inserting the fence in the first place.
>>> "invalid" there means either
>>> - the acquire point of the client is marked as signaled, before
>>> rendering on the client side is completed
>>> - the acquire point of the client is never signaled. Since the
>>> compositor waits for the acquire point, the Wayland surface is stuck
>>> forever
>>
>> Both of those would be a *massive* violation of documented kernel rules for hot-plugging which could lead to random data corruption and/or deadlocks.
>>
>> If you see any HW driver showing behavior like that please open up a bug report and ping the relevant maintainers immediately.
> If there are no error codes with syncobj yet, then to userspace, the
> latter behavior is exactly what we get, isn't it?
No, from userspace side you just see a signaled fence. It's just that you need to export the timeline point of the syncobj to a syncfile and then you can call the QUERY IOCTL on the syncfile to see the error code.
>> When a hotplug happens all operations of the device should return an -ENODEV error, even when exposed to other devices/application through syncobj or syncfile.
> Okay, that at least gives us a way to fail imports somewhat
> gracefully. Normally, failing to import a syncobj is a fatal error in
> the Wayland protocol.
So the task at hand would be to avoid importing the syncobj into a driver. That should be relatively trivial.
The only real problem I see is if you want to create a syncobj without having any device whatsoever.
>> One problem is that only syncfile allows for querying such error codes at the moment, we have patches pending to add that to syncobj as well but we lack a compositor with support for that as userspace client.
> As long as the error case can be detected with an eventfd,
Yeah that's the problem. The eventfd only tells you if the operation is completed (or at least has materialized).
To query the error you would need to ask the underlying syncobj or syncfile directly.
> implementing that in KWin shouldn't be a challenge.
>
>> Well the question here is if the device the compositor is using or the client is using is gone?
>>
>> If the client device is hot removed the compositor should be perfectly capable to import the syncobj.
>>
>> If the compositor device is gone then you don't have a device to display anything any more, so generating the next frame doesn't seem to make sense either.
>>
>> What could be is that you want the compositor to be kept alive even when the display device is gone to switch over to vkms or whatever so that a VNC session or other remote desktop still works.
> There are two GPUs in the example I gave. The compositor can use both
> for rendering (in cosmic-comp's case) or switch between them (what I'm
> trying to do with KWin), or use one device for rendering, and another
> for importing the syncobj.
Ah! I think I got the problem now. You basically want to avoid importing the syncobj because when the wrong device goes away you are busted.
The reason we didn't considered having the IOCTLs on the FD is because if you don't import them and instead keep them around you can run out file descriptors quite quickly.
When you have an use case where you receive an FD from the client and do a one shot conversion to an eventfd that will probably work, but for keeping them in the long run you need some kind of container for the syncobjs, don't you?
>>>>>>> 3. It removes the need to translate between syncobjs fds and handles.
>>>>>>
>>>>>> That's a pretty big no-go as well. The differentiation between FDs and handles is completely intentional.
>>>>> Could you expand on why it's needed? For compositors, the handle is
>>>>> just an intermediary thing when translating between file descriptors.
>>>>
>>>> Well what we could do is to add an IOCTL to directly attach an syncobj file descriptor to an eventfd.
>>> That would be nice.
>>
>> Take a look at drm_syncobj_file_fops and how drm_syncobj_add_eventfd() is used. Adding that functionality shouldn't be more than a typing exercise.
> Yeah, this patchset already adds that functionality (on the new device).
>
>> Do I see it right that this would already solve most problems in the compositor side?
> Skipping the syncobj handle step would only reduce the amounts of
> ioctls the compositor does, but afaict it wouldn't solve any
> compositor problems. At least not as long as it's still tied to a drm
> device.
Yeah, you need something like a syncobj container or dummy DRM device.
> For device hotplugs, the only new thing we need for correctly handling
> syncobj is a way to receive errors on the eventfd.
I need to look into the eventfd code, could be that this is somehow possible but it's clearly not something I used before.
> A device-independent way to create and use syncobj would still be
> useful to us though, both to simplify the compositor and to improve
> the software rendering use cases.
Yeah not sure how to cleanly do that. We could have a dummy /dev/dri/rendersync or something like that, but that would be quite a hack.
At least I understand the requirement now.
Thanks,
Christian.
>
> - Xaver
On 5/19/26 17:31, Xaver Hugl wrote:
> Am Di., 19. Mai 2026 um 15:29 Uhr schrieb Christian König
> <christian.koenig(a)amd.com>:
>>> 1. This series makes the ability to manipulate syncobjs available
>>> independently of attached hardware.
>>> 2. It makes it available under a consistent path /dev/syncobj.
>>
>> Exactly that is a big no-go. This has to be under /dev/dri.
> FWIW udmabuf is also under /dev directly, but I don't think any
> compositor developer would complain about a different path.
> What are the rules for that? Could this simply be put in /dev/dri/syncobj?
The syncobj are actually the DRM specific way of doing things. The general kernel wide way is to use sync files (see drivers/dma-buf/sync_file.c).
But there has already been tons of problems with those sync files. E.g. they doesn't support your use case at all since they don't have wait before submit behavior.
So there are already ways to do this, but the Linux kernel so far told everybody that this is forbidden. The DRM syncobj wait before signal functionality is much better, but then basically the second try to do this.
> The part where we get this independent of attached hardware is quite
> important for us though, since we can't just ignore explicit sync once
> the device we previously imported the syncobj into is disconnected.
Can you elaborate more on this?
> Buffers can be from any device or allocated in system memory and
> access should be synchronized properly in all cases.
>
> How exactly it's made available isn't all that critical.
>
>>> 3. It removes the need to translate between syncobjs fds and handles.
>>
>> That's a pretty big no-go as well. The differentiation between FDs and handles is completely intentional.
> Could you expand on why it's needed? For compositors, the handle is
> just an intermediary thing when translating between file descriptors.
Well what we could do is to add an IOCTL to directly attach an syncobj file descriptor to an eventfd.
> FTR for me at least, this part would be merely nice to have, since it
> slightly reduces the amount of ioctls a compositor needs to call, but
> it's not important.
>
>>>> What about using VGEM for this?
>>>
>>> If the vgem render node were made available unconditionally under,
>>
>> Software rendering is a complete corner case, I don't think that this will be enabled by default.
> That simply makes vgem unsuitable for solving the problems we face in
> compositors.
Thinking more about it vgem also has the same issues as sync file mentioned above. So that is really also not doable.
Maybe Simona or David have another idea.
Regards,
Christian.
>
> - Xaver
The patch set allows to register a dmabuf to an io_uring instance for
a specified file and use it with io_uring read / write requests. The
infrastructure is not tied to io_uring and there could be more users
in the future. A similar idea was attempted some years ago by Keith [1],
from where I borrowed a good number of changes, and later was brough up
by Tushar and Vishal from Intel.
It's an opt-in feature for files, and they need to implement a new
file operation to use it. Only NVMe block devices are supported in this
series. The user API is built on top of io_uring's "registered buffers",
where a dmabuf is registered in a special way, but after it can be used
as any other "registered buffer" with IORING_OP_{READ,WRITE}_FIXED
requests. It's created via a new file operation and the resulted map is
then passed through the I/O stack in a new iterator type. There is some
additional infrastructure to bind it all, which also counts requests
using a dmabuf map and managing lifetimes, which is used to implement
map invalidation.
It was tested for GPU <-> NVMe transfers. Also, as it maintains a
long-term dma mapping, it helps with the IOMMU cost. The numbers
below are for udmabuf reads previously run by Anuj for different
IOMMU modes:
- STRICT: before = 570 KIOPS, after = 5.01 MIOPS
- LAZY: before = 1.93 MIOPS, after = 5.01 MIOPS
- PASSTHROUGH: before = 5.01 MIOPS, after = 5.01 MIOPS
There are some liburing tests that can serve as an example:
git: https://github.com/isilence/liburing.git rw-dmabuf-tests-v3
url: https://github.com/isilence/liburing/tree/rw-dmabuf-tests-v3
[1] https://lore.kernel.org/io-uring/20220805162444.3985535-1-kbusch@fb.com/
v3: - Rework io_uring registration
- Move token/map infrastructure code out of blk-mq
- Simplify callbacks: remove a separate blk-mq table, which was
mostly just forwarding calls (to nvme).
- Don't skip dma sync depending on request direction
- Fix a couple of hangs
- Rename s/dma/dmabuf/
- Other small changes
v2: - Don't pass raw dma addresses, wrap it into a driver specific object
- Split into two objects: token and map
- Implement move_notify
Pavel Begunkov (10):
file: add callback for creating long-term dmabuf maps
iov_iter: add iterator type for dmabuf maps
block: move bvec init into __bio_clone
block: introduce dma map backed bio type
lib: add dmabuf token infrastructure
block: forward create_dmabuf_token to drivers
nvme-pci: implement dma_token backed requests
io_uring/rsrc: introduce buf registration structure
io_uring/rsrc: extend buffer update
io_uring/rsrc: add dmabuf backed registered buffers
block/bio.c | 28 +++-
block/blk-merge.c | 14 ++
block/blk.h | 3 +-
block/fops.c | 16 ++
drivers/nvme/host/pci.c | 282 ++++++++++++++++++++++++++++++++
include/linux/bio.h | 19 ++-
include/linux/blk-mq.h | 9 +
include/linux/blk_types.h | 8 +-
include/linux/fs.h | 2 +
include/linux/io_dmabuf_token.h | 92 +++++++++++
include/linux/io_uring_types.h | 5 +
include/linux/uio.h | 11 ++
include/uapi/linux/io_uring.h | 31 +++-
io_uring/io_uring.c | 3 +-
io_uring/rsrc.c | 266 +++++++++++++++++++++++++-----
io_uring/rsrc.h | 30 +++-
io_uring/rw.c | 4 +-
lib/Kconfig | 4 +
lib/Makefile | 2 +
lib/io_dmabuf_token.c | 272 ++++++++++++++++++++++++++++++
lib/iov_iter.c | 29 +++-
21 files changed, 1071 insertions(+), 59 deletions(-)
create mode 100644 include/linux/io_dmabuf_token.h
create mode 100644 lib/io_dmabuf_token.c
--
2.53.0