Linaro-mm-sig June 2020

linaro-mm-sig@lists.linaro.org

25 participants
34 discussions

Re: [Linaro-mm-sig] [PATCH v4 1/3] virtio: add dma-buf support for exported objects

by Guennadi Liakhovetski

Hi Michael, On Thu, Jun 04, 2020 at 03:05:23PM -0400, Michael S. Tsirkin wrote: > On Tue, May 26, 2020 at 07:58:09PM +0900, David Stevens wrote: > > This change adds a new flavor of dma-bufs that can be used by virtio > > drivers to share exported objects. A virtio dma-buf can be queried by > > virtio drivers to obtain the UUID which identifies the underlying > > exported object. > > > > Signed-off-by: David Stevens <stevensd(a)chromium.org> > > Is this just for graphics? If yes I'd rather we put it in the graphics > driver. We can always move it later ... Wouldn't this be the API that audio virtualisation will have to use to share buffers between the host and any guests? Thanks Guennadi > > --- > > drivers/virtio/Makefile | 2 +- > > drivers/virtio/virtio.c | 6 +++ > > drivers/virtio/virtio_dma_buf.c | 89 +++++++++++++++++++++++++++++++++ > > include/linux/virtio.h | 1 + > > include/linux/virtio_dma_buf.h | 58 +++++++++++++++++++++ > > 5 files changed, 155 insertions(+), 1 deletion(-) > > create mode 100644 drivers/virtio/virtio_dma_buf.c > > create mode 100644 include/linux/virtio_dma_buf.h > > > > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile > > index 29a1386ecc03..ecdae5b596de 100644 > > --- a/drivers/virtio/Makefile > > +++ b/drivers/virtio/Makefile > > @@ -1,5 +1,5 @@ > > # SPDX-License-Identifier: GPL-2.0 > > -obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o > > +obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o virtio_dma_buf.o > > obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o > > obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o > > virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c > > index a977e32a88f2..5d46f0ded92d 100644 > > --- a/drivers/virtio/virtio.c > > +++ b/drivers/virtio/virtio.c > > @@ -357,6 +357,12 @@ int register_virtio_device(struct virtio_device *dev) > > } > > EXPORT_SYMBOL_GPL(register_virtio_device); > > > > +bool is_virtio_device(struct device *dev) > > +{ > > + 
return dev->bus == &virtio_bus; > > +} > > +EXPORT_SYMBOL_GPL(is_virtio_device); > > + > > void unregister_virtio_device(struct virtio_device *dev) > > { > > int index = dev->index; /* save for after device release */ > > diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c > > new file mode 100644 > > index 000000000000..23e3399b11ed > > --- /dev/null > > +++ b/drivers/virtio/virtio_dma_buf.c > > @@ -0,0 +1,89 @@ > > +// SPDX-License-Identifier: GPL-2.0-or-later > > +/* > > + * dma-bufs for virtio exported objects > > + * > > + * Copyright (C) 2020 Google, Inc. > > + */ > > + > > +#include <linux/virtio_dma_buf.h> > > + > > +/** > > + * virtio_dma_buf_export - Creates a new dma-buf for a virtio exported object > > + * > > + * This wraps dma_buf_export() to allow virtio drivers to create a dma-buf > > + * for an virtio exported object that can be queried by other virtio drivers > > + * for the object's UUID. > > + */ > > +struct dma_buf *virtio_dma_buf_export( > > + const struct virtio_dma_buf_export_info *virtio_exp_info) > > +{ > > + struct dma_buf_export_info exp_info; > > + > > + if (!virtio_exp_info->ops > > + || virtio_exp_info->ops->ops.attach != &virtio_dma_buf_attach > > + || !virtio_exp_info->ops->get_uuid) { > > + return ERR_PTR(-EINVAL); > > + } > > + > > + exp_info.exp_name = virtio_exp_info->exp_name; > > + exp_info.owner = virtio_exp_info->owner; > > + exp_info.ops = &virtio_exp_info->ops->ops; > > + exp_info.size = virtio_exp_info->size; > > + exp_info.flags = virtio_exp_info->flags; > > + exp_info.resv = virtio_exp_info->resv; > > + exp_info.priv = virtio_exp_info->priv; > > + BUILD_BUG_ON(sizeof(struct virtio_dma_buf_export_info) > > + != sizeof(struct dma_buf_export_info)); > > This is the only part that gives me pause. Why do we need this hack? > What's wrong with just using dma_buf_export_info directly, > and if you want the virtio ops, just using container_off? 
> > > > > + > > + return dma_buf_export(&exp_info); > > +} > > +EXPORT_SYMBOL(virtio_dma_buf_export); > > + > > +/** > > + * virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs > > + */ > > +int virtio_dma_buf_attach(struct dma_buf *dma_buf, > > + struct dma_buf_attachment *attach) > > +{ > > + int ret; > > + const struct virtio_dma_buf_ops *ops = container_of( > > + dma_buf->ops, const struct virtio_dma_buf_ops, ops); > > + > > + if (ops->device_attach) { > > + ret = ops->device_attach(dma_buf, attach); > > + if (ret) > > + return ret; > > + } > > + return 0; > > +} > > +EXPORT_SYMBOL(virtio_dma_buf_attach); > > + > > +/** > > + * is_virtio_dma_buf - returns true if the given dma-buf is a virtio dma-buf > > + * @dma_buf: buffer to query > > + */ > > +bool is_virtio_dma_buf(struct dma_buf *dma_buf) > > +{ > > + return dma_buf->ops->attach == &virtio_dma_buf_attach; > > +} > > +EXPORT_SYMBOL(is_virtio_dma_buf); > > + > > +/** > > + * virtio_dma_buf_get_uuid - gets the uuid of the virtio dma-buf's exported object > > + * @dma_buf: [in] buffer to query > > + * @uuid: [out] the uuid > > + * > > + * Returns: 0 on success, negative on failure. 
> > + */ > > +int virtio_dma_buf_get_uuid(struct dma_buf *dma_buf, > > + uuid_t *uuid) > > +{ > > + const struct virtio_dma_buf_ops *ops = container_of( > > + dma_buf->ops, const struct virtio_dma_buf_ops, ops); > > + > > + if (!is_virtio_dma_buf(dma_buf)) > > + return -EINVAL; > > + > > + return ops->get_uuid(dma_buf, uuid); > > +} > > +EXPORT_SYMBOL(virtio_dma_buf_get_uuid); > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > > index 15f906e4a748..9397e25616c4 100644 > > --- a/include/linux/virtio.h > > +++ b/include/linux/virtio.h > > @@ -128,6 +128,7 @@ static inline struct virtio_device *dev_to_virtio(struct device *_dev) > > void virtio_add_status(struct virtio_device *dev, unsigned int status); > > int register_virtio_device(struct virtio_device *dev); > > void unregister_virtio_device(struct virtio_device *dev); > > +bool is_virtio_device(struct device *dev); > > > > void virtio_break_device(struct virtio_device *dev); > > > > diff --git a/include/linux/virtio_dma_buf.h b/include/linux/virtio_dma_buf.h > > new file mode 100644 > > index 000000000000..29fee167afbd > > --- /dev/null > > +++ b/include/linux/virtio_dma_buf.h > > @@ -0,0 +1,58 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +/* > > + * dma-bufs for virtio exported objects > > + * > > + * Copyright (C) 2020 Google, Inc. > > + */ > > + > > +#ifndef _LINUX_VIRTIO_DMA_BUF_H > > +#define _LINUX_VIRTIO_DMA_BUF_H > > + > > +#include <linux/dma-buf.h> > > +#include <linux/uuid.h> > > +#include <linux/virtio.h> > > + > > +/** > > + * struct virtio_dma_buf_ops - operations possible on exported object dma-buf > > + * @ops: the base dma_buf_ops. ops.attach MUST be virtio_dma_buf_attach. > > + * @device_attach: [optional] callback invoked by virtio_dma_buf_attach during > > + * all attach operations. > > + * @get_uid: [required] callback to get the uuid of the exported object. 
> > + */ > > +struct virtio_dma_buf_ops { > > + struct dma_buf_ops ops; > > + int (*device_attach)(struct dma_buf *dma_buf, > > + struct dma_buf_attachment *attach); > > + int (*get_uuid)(struct dma_buf *dma_buf, uuid_t *uuid); > > +}; > > + > > +/** > > + * struct virtio_dma_buf_export_info - see struct dma_buf_export_info > > + */ > > +struct virtio_dma_buf_export_info { > > + const char *exp_name; > > + struct module *owner; > > + const struct virtio_dma_buf_ops *ops; > > + size_t size; > > + int flags; > > + struct dma_resv *resv; > > + void *priv; > > +}; > > + > > +/** > > + * DEFINE_VIRTIO_DMA_BUF_EXPORT_INFO - helper macro for exporters > > + */ > > +#define DEFINE_VIRTIO_DMA_BUF_EXPORT_INFO(name) \ > > + struct virtio_dma_buf_export_info name = { \ > > + .exp_name = KBUILD_MODNAME, \ > > + .owner = THIS_MODULE } > > + > > +int virtio_dma_buf_attach(struct dma_buf *dma_buf, > > + struct dma_buf_attachment *attach); > > + > > +struct dma_buf *virtio_dma_buf_export( > > + const struct virtio_dma_buf_export_info *virtio_exp_info); > > +bool is_virtio_dma_buf(struct dma_buf *dma_buf); > > +int virtio_dma_buf_get_uuid(struct dma_buf *dma_buf, uuid_t *uuid); > > + > > +#endif /* _LINUX_VIRTIO_DMA_BUF_H */ > > -- > > 2.27.0.rc0.183.gde8f92d652-goog > > _______________________________________________ > Virtualization mailing list > Virtualization(a)lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/virtualization

5 years, 3 months

[PATCH] dmabuf: use spinlock to access dmabuf->name

by Charan Teja Kalla

There exists a sleep-while-atomic bug while accessing the dmabuf->name under mutex in the dmabuffs_dname(). This is caused from the SELinux permissions checks on a process where it tries to validate the inherited files from fork() by traversing them through iterate_fd() (which traverse files under spin_lock) and call match_file(security/selinux/hooks.c) where the permission checks happen. This audit information is logged using dump_common_audit_data() where it calls d_path() to get the file path name. If the file check happen on the dmabuf's fd, then it ends up in ->dmabuffs_dname() and use mutex to access dmabuf->name. The flow will be like below: flush_unauthorized_files() iterate_fd() spin_lock() --> Start of the atomic section. match_file() file_has_perm() avc_has_perm() avc_audit() slow_avc_audit() common_lsm_audit() dump_common_audit_data() audit_log_d_path() d_path() dmabuffs_dname() mutex_lock()--> Sleep while atomic. Call trace captured (on 4.19 kernels) is below: ___might_sleep+0x204/0x208 __might_sleep+0x50/0x88 __mutex_lock_common+0x5c/0x1068 __mutex_lock_common+0x5c/0x1068 mutex_lock_nested+0x40/0x50 dmabuffs_dname+0xa0/0x170 d_path+0x84/0x290 audit_log_d_path+0x74/0x130 common_lsm_audit+0x334/0x6e8 slow_avc_audit+0xb8/0xf8 avc_has_perm+0x154/0x218 file_has_perm+0x70/0x180 match_file+0x60/0x78 iterate_fd+0x128/0x168 selinux_bprm_committing_creds+0x178/0x248 security_bprm_committing_creds+0x30/0x48 install_exec_creds+0x1c/0x68 load_elf_binary+0x3a4/0x14e0 search_binary_handler+0xb0/0x1e0 So, use spinlock to access dmabuf->name to avoid sleep-while-atomic. 
Cc: <stable(a)vger.kernel.org> [5.3+] Signed-off-by: Charan Teja Reddy <charante(a)codeaurora.org> --- drivers/dma-buf/dma-buf.c | 13 +++++++------ include/linux/dma-buf.h | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 01ce125..2e0456c 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -45,10 +45,10 @@ static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) size_t ret = 0; dmabuf = dentry->d_fsdata; - dma_resv_lock(dmabuf->resv, NULL); + spin_lock(&dmabuf->name_lock); if (dmabuf->name) ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN); - dma_resv_unlock(dmabuf->resv); + spin_unlock(&dmabuf->name_lock); return dynamic_dname(dentry, buffer, buflen, "/%s:%s", dentry->d_name.name, ret > 0 ? name : ""); @@ -335,7 +335,7 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) if (IS_ERR(name)) return PTR_ERR(name); - dma_resv_lock(dmabuf->resv, NULL); + spin_lock(&dmabuf->name_lock); if (!list_empty(&dmabuf->attachments)) { ret = -EBUSY; kfree(name); @@ -345,7 +345,7 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) dmabuf->name = name; out_unlock: - dma_resv_unlock(dmabuf->resv); + spin_unlock(&dmabuf->name_lock); return ret; } @@ -405,10 +405,10 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) /* Don't count the temporary reference taken inside procfs seq_show */ seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1); seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name); - dma_resv_lock(dmabuf->resv, NULL); + spin_lock(&dmabuf->name_lock); if (dmabuf->name) seq_printf(m, "name:\t%s\n", dmabuf->name); - dma_resv_unlock(dmabuf->resv); + spin_unlock(&dmabuf->name_lock); } static const struct file_operations dma_buf_fops = { @@ -546,6 +546,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) dmabuf->size = exp_info->size; dmabuf->exp_name = 
exp_info->exp_name; dmabuf->owner = exp_info->owner; + spin_lock_init(&dmabuf->name_lock); init_waitqueue_head(&dmabuf->poll); dmabuf->cb_excl.poll = dmabuf->cb_shared.poll = &dmabuf->poll; dmabuf->cb_excl.active = dmabuf->cb_shared.active = 0; diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index ab0c156..93108fd 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -311,6 +311,7 @@ struct dma_buf { void *vmap_ptr; const char *exp_name; const char *name; + spinlock_t name_lock; struct module *owner; struct list_head list_node; void *priv; -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project

5 years, 4 months

[PATCH] drm/ttm: Fix dma_fence refcnt leak when adding move fence

by Xiyu Yang

ttm_bo_add_move_fence() invokes dma_fence_get(), which returns a reference of the specified dma_fence object to "fence" with increased refcnt. When ttm_bo_add_move_fence() returns, local variable "fence" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of ttm_bo_add_move_fence(). When the no_wait_gpu flag is true, the function forgets to decrease the refcnt increased by dma_fence_get(), causing a refcnt leak. Fix this issue by calling dma_fence_put() when the no_wait_gpu flag is true. Signed-off-by: Xiyu Yang <xiyuyang19(a)fudan.edu.cn> Signed-off-by: Xin Tan <tanxin.ctf(a)gmail.com> --- drivers/gpu/drm/ttm/ttm_bo.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index f73b81c2576e..0f20e14a4cfd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -883,8 +883,10 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo, if (!fence) return 0; - if (no_wait_gpu) + if (no_wait_gpu) { + dma_fence_put(fence); return -EBUSY; + } dma_resv_add_shared_fence(bo->base.resv, fence); -- 2.7.4

5 years, 4 months

[PATCH] drm/ttm: Fix dma_fence refcnt leak in ttm_bo_vm_fault_reserved

by Xiyu Yang

ttm_bo_vm_fault_reserved() invokes dma_fence_get(), which returns a reference of the specified dma_fence object to "moving" with increased refcnt. When ttm_bo_vm_fault_reserved() returns, local variable "moving" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in several exception handling paths of ttm_bo_vm_fault_reserved(). When those error scenarios occur, such as "err" being equal to -EBUSY, the function forgets to decrease the refcnt increased by dma_fence_get(), causing a refcnt leak. Fix this issue by calling dma_fence_put() on those error paths before returning. Signed-off-by: Xiyu Yang <xiyuyang19(a)fudan.edu.cn> Signed-off-by: Xin Tan <tanxin.ctf(a)gmail.com> --- drivers/gpu/drm/ttm/ttm_bo_vm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index a43aa7275f12..fa03fab02076 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -300,8 +300,10 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf, break; case -EBUSY: case -ERESTARTSYS: + dma_fence_put(moving); return VM_FAULT_NOPAGE; default: + dma_fence_put(moving); return VM_FAULT_SIGBUS; } -- 2.7.4

5 years, 4 months

[PATCH] dma-buf: support to walk the list of dmabuf for debug

by Hyesoo Yu

Let's support debugging function to show exporter detail information. The exporter don't need to manage the lists for debugging because all dmabuf list are managed on dmabuf framework. That supports to walk the dmabuf list and show the detailed information for exporter by passed function implemented from exporter. That helps to show exporter detail information. For example, ION may show the buffer flag, heap name, or the name of process to request allocation. Change-Id: I670f04dda4a0870081e1b0fd96b9185b48b9dd15 Signed-off-by: Hyesoo Yu <hyesoo.yu(a)samsung.com> --- drivers/dma-buf/dma-buf.c | 30 ++++++++++++++++++++++++++++++ include/linux/dma-buf.h | 2 ++ 2 files changed, 32 insertions(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 01ce125f8e8d..002bd3ac636e 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1254,6 +1254,36 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) } EXPORT_SYMBOL_GPL(dma_buf_vunmap); +int dma_buf_exp_show(struct seq_file *s, + int (*it)(struct seq_file *s, struct dma_buf *dmabuf)) +{ + int ret; + struct dma_buf *buf_obj; + + ret = mutex_lock_interruptible(&db_list.lock); + if (ret) + return ret; + + list_for_each_entry(buf_obj, &db_list.head, list_node) { + ret = mutex_lock_interruptible(&buf_obj->lock); + if (ret) { + seq_puts(s, + "\tERROR locking buffer object: skipping\n"); + continue; + } + + ret = it(s, buf_obj); + mutex_unlock(&buf_obj->lock); + if (ret) + break; + } + mutex_unlock(&db_list.lock); + + return 0; + +} +EXPORT_SYMBOL_GPL(dma_buf_exp_show); + #ifdef CONFIG_DEBUG_FS static int dma_buf_debug_show(struct seq_file *s, void *unused) { diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index ab0c156abee6..b5c0a10b4eb3 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -502,4 +502,6 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); void *dma_buf_vmap(struct dma_buf *); void dma_buf_vunmap(struct dma_buf 
*, void *vaddr); +int dma_buf_exp_show(struct seq_file *s, + int (*it)(struct seq_file *s, struct dma_buf *dmabuf)); #endif /* __DMA_BUF_H__ */ -- 2.27.0

5 years, 4 months

[PATCH 04/18] dma-fence: prime lockdep annotations

by Daniel Vetter

Two in one go: - it is allowed to call dma_fence_wait() while holding a dma_resv_lock(). This is fundamental to how eviction works with ttm, so required. - it is allowed to call dma_fence_wait() from memory reclaim contexts, specifically from shrinker callbacks (which i915 does), and from mmu notifier callbacks (which amdgpu does, and which i915 sometimes also does, and probably always should, but that's kinda a debate). Also for stuff like HMM we really need to be able to do this, or things get real dicey. Consequence is that any critical path necessary to get to a dma_fence_signal for a fence must never a) call dma_resv_lock nor b) allocate memory with GFP_KERNEL. Also by implication of dma_resv_lock(), no userspace faulting allowed. That's some supremely obnoxious limitations, which is why we need to sprinkle the right annotations to all relevant paths. The one big locking context we're leaving out here is mmu notifiers, added in commit 23b68395c7c78a764e8963fc15a7cfd318bf187f Author: Daniel Vetter <daniel.vetter(a)ffwll.ch> Date: Mon Aug 26 22:14:21 2019 +0200 mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end that one covers a lot of other callsites, and it's also allowed to wait on dma-fences from mmu notifiers. But there's no ready-made functions exposed to prime this, so I've left it out for now. v2: Also track against mmu notifier context. v3: kerneldoc to spec the cross-driver contract. Note that currently i915 throws in a hard-coded 10s timeout on foreign fences (not sure why that was done, but it's there), which is why that rule is worded with SHOULD instead of MUST. Also some of the mmu_notifier/shrinker rules might surprise SoC drivers, I haven't fully audited them all. Which is infeasible anyway, we'll need to run them with lockdep and dma-fence annotations and see what goes boom. 
v4: A spelling fix from Mika Cc: Mika Kuoppala <mika.kuoppala(a)intel.com> Cc: Thomas Hellstrom <thomas.hellstrom(a)intel.com> Cc: linux-media(a)vger.kernel.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: linux-rdma(a)vger.kernel.org Cc: amd-gfx(a)lists.freedesktop.org Cc: intel-gfx(a)lists.freedesktop.org Cc: Chris Wilson <chris(a)chris-wilson.co.uk> Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com> Cc: Christian König <christian.koenig(a)amd.com> Signed-off-by: Daniel Vetter <daniel.vetter(a)intel.com> --- Documentation/driver-api/dma-buf.rst | 6 ++++ drivers/dma-buf/dma-fence.c | 41 ++++++++++++++++++++++++++++ drivers/dma-buf/dma-resv.c | 4 +++ include/linux/dma-fence.h | 1 + 4 files changed, 52 insertions(+) diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index 05d856131140..f8f6decde359 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -133,6 +133,12 @@ DMA Fences .. kernel-doc:: drivers/dma-buf/dma-fence.c :doc: DMA fences overview +DMA Fence Cross-Driver Contract +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: drivers/dma-buf/dma-fence.c + :doc: fence cross-driver contract + DMA Fence Signalling Annotations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 0005bc002529..754e6fb84fb7 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -64,6 +64,47 @@ static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(1); * &dma_buf.resv pointer. */ +/** + * DOC: fence cross-driver contract + * + * Since &dma_fence provide a cross driver contract, all drivers must follow the + * same rules: + * + * * Fences must complete in a reasonable time. Fences which represent kernels + * and shaders submitted by userspace, which could run forever, must be backed + * up by timeout and gpu hang recovery code. 
Minimally that code must prevent + * further command submission and force complete all in-flight fences, e.g. + * when the driver or hardware do not support gpu reset, or if the gpu reset + * failed for some reason. Ideally the driver supports gpu recovery which only + * affects the offending userspace context, and no other userspace + * submissions. + * + * * Drivers may have different ideas of what completion within a reasonable + * time means. Some hang recovery code uses a fixed timeout, others a mix + * between observing forward progress and increasingly strict timeouts. + * Drivers should not try to second guess timeout handling of fences from + * other drivers. + * + * * To ensure there's no deadlocks of dma_fence_wait() against other locks + * drivers should annotate all code required to reach dma_fence_signal(), + * which completes the fences, with dma_fence_begin_signalling() and + * dma_fence_end_signalling(). + * + * * Drivers are allowed to call dma_fence_wait() while holding dma_resv_lock(). + * This means any code required for fence completion cannot acquire a + * &dma_resv lock. Note that this also pulls in the entire established + * locking hierarchy around dma_resv_lock() and dma_resv_unlock(). + * + * * Drivers are allowed to call dma_fence_wait() from their &shrinker + * callbacks. This means any code required for fence completion cannot + * allocate memory with GFP_KERNEL. + * + * * Drivers are allowed to call dma_fence_wait() from their &mmu_notifier + * respectively &mmu_interval_notifier callbacks. This means any code required + * for fence completion cannot allocate memory with GFP_NOFS or GFP_NOIO. + * Only GFP_ATOMIC is permissible, which might fail.
+ */ + static const char *dma_fence_stub_get_name(struct dma_fence *fence) { return "stub"; diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index 99c0a33c918d..c223f32425c4 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -35,6 +35,7 @@ #include <linux/dma-resv.h> #include <linux/export.h> #include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> /** * DOC: Reservation Object Overview @@ -115,6 +116,9 @@ static int __init dma_resv_lockdep(void) if (ret == -EDEADLK) dma_resv_lock_slow(&obj, &ctx); fs_reclaim_acquire(GFP_KERNEL); + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + __dma_fence_might_wait(); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); fs_reclaim_release(GFP_KERNEL); ww_mutex_unlock(&obj.lock); ww_acquire_fini(&ctx); diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 3f288f7db2ef..09e23adb351d 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -360,6 +360,7 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) #ifdef CONFIG_LOCKDEP bool dma_fence_begin_signalling(void); void dma_fence_end_signalling(bool cookie); +void __dma_fence_might_wait(void); #else static inline bool dma_fence_begin_signalling(void) { -- 2.26.2

5 years, 4 months

[PATCH 03/18] dma-fence: basic lockdep annotations

by Daniel Vetter

Design is similar to the lockdep annotations for workers, but with some twists: - We use a read-lock for the execution/worker/completion side, so that this explicit annotation can be more liberally sprinkled around. With read locks lockdep isn't going to complain if the read-side isn't nested the same way under all circumstances, so ABBA deadlocks are ok. Which they are, since this is an annotation only. - We're using non-recursive lockdep read lock mode, since in recursive read lock mode lockdep does not catch read side hazards. And we _very_ much want read side hazards to be caught. For full details of this limitation see commit e91498589746065e3ae95d9a00b068e525eec34f Author: Peter Zijlstra <peterz(a)infradead.org> Date: Wed Aug 23 13:13:11 2017 +0200 locking/lockdep/selftests: Add mixed read-write ABBA tests - To allow nesting of the read-side explicit annotations we explicitly keep track of the nesting. lock_is_held() allows us to do that. - The wait-side annotation is a write lock, and entirely done within dma_fence_wait() for everyone by default. - To be able to freely annotate helper functions I want to make it ok to call dma_fence_begin/end_signalling from soft/hardirq context. First attempt was using the hardirq locking context for the write side in lockdep, but this forces all normal spinlocks nested within dma_fence_begin/end_signalling to be spinlocks. That's bollocks. The approach now is to simply check in_atomic(), and for these cases entirely rely on the might_sleep() check in dma_fence_wait(). That will catch any wrong nesting against spinlocks from soft/hardirq contexts. The idea here is that every code path that's critical for eventually signalling a dma_fence should be annotated with dma_fence_begin/end_signalling.
The annotation ideally starts right after a dma_fence is published (added to a dma_resv, exposed as a sync_file fd, attached to a drm_syncobj fd, or anything else that makes the dma_fence visible to other kernel threads), up to and including the dma_fence_wait(). Examples are irq handlers, the scheduler rt threads, the tail of execbuf (after the corresponding fences are visible), any workers that end up signalling dma_fences and really anything else. Not annotated should be code paths that only complete fences opportunistically as the gpu progresses, like e.g. shrinker/eviction code. The main class of deadlocks this is supposed to catch is: Thread A: mutex_lock(A); mutex_unlock(A); dma_fence_signal(); Thread B: mutex_lock(A); dma_fence_wait(); mutex_unlock(A); Thread B is blocked on A signalling the fence, but A never gets around to that because it cannot acquire the lock A. Note that dma_fence_wait() is allowed to be nested within dma_fence_begin/end_signalling sections. To allow this to happen the read lock needs to be upgraded to a write lock, which means that if any other lock is acquired between the dma_fence_begin_signalling() call and the call to dma_fence_wait(), and still held, this will result in an immediate lockdep complaint. The only other option would be to not annotate such calls, defeating the point. Therefore these annotations cannot be sprinkled over the code entirely mindlessly to avoid false positives. v2: handle soft/hardirq ctx better against write side and don't forget EXPORT_SYMBOL, drivers can't use this otherwise. v3: Kerneldoc.
v4: Some spelling fixes from Mika Cc: Mika Kuoppala <mika.kuoppala(a)intel.com> Cc: Thomas Hellstrom <thomas.hellstrom(a)intel.com> Cc: linux-media(a)vger.kernel.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: linux-rdma(a)vger.kernel.org Cc: amd-gfx(a)lists.freedesktop.org Cc: intel-gfx(a)lists.freedesktop.org Cc: Chris Wilson <chris(a)chris-wilson.co.uk> Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com> Cc: Christian König <christian.koenig(a)amd.com> Signed-off-by: Daniel Vetter <daniel.vetter(a)intel.com> --- Documentation/driver-api/dma-buf.rst | 12 +- drivers/dma-buf/dma-fence.c | 161 +++++++++++++++++++++++++++ include/linux/dma-fence.h | 12 ++ 3 files changed, 182 insertions(+), 3 deletions(-) diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index 63dec76d1d8d..05d856131140 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -100,11 +100,11 @@ CPU Access to DMA Buffer Objects .. kernel-doc:: drivers/dma-buf/dma-buf.c :doc: cpu access -Fence Poll Support -~~~~~~~~~~~~~~~~~~ +Implicit Fence Poll Support +~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. kernel-doc:: drivers/dma-buf/dma-buf.c - :doc: fence polling + :doc: implicit fence polling Kernel Functions and Structures Reference ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -133,6 +133,12 @@ DMA Fences .. kernel-doc:: drivers/dma-buf/dma-fence.c :doc: DMA fences overview +DMA Fence Signalling Annotations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
kernel-doc:: drivers/dma-buf/dma-fence.c + :doc: fence signalling annotation + DMA Fences Functions Reference ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 656e9ac2d028..0005bc002529 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -110,6 +110,160 @@ u64 dma_fence_context_alloc(unsigned num) } EXPORT_SYMBOL(dma_fence_context_alloc); +/** + * DOC: fence signalling annotation + * + * Proving correctness of all the kernel code around &dma_fence through code + * review and testing is tricky for a few reasons: + * + * * It is a cross-driver contract, and therefore all drivers must follow the + * same rules for lock nesting order, calling contexts for various functions + * and anything else significant for in-kernel interfaces. But it is also + * impossible to test all drivers in a single machine, hence brute-force N vs. + * N testing of all combinations is impossible. Even just limiting to the + * possible combinations is infeasible. + * + * * There is an enormous amount of driver code involved. For render drivers + * there's the tail of command submission, after fences are published, + * scheduler code, interrupt and workers to process job completion, + * and timeout, gpu reset and gpu hang recovery code. Plus for integration + * with core mm with have &mmu_notifier, respectively &mmu_interval_notifier, + * and &shrinker. For modesetting drivers there's the commit tail functions + * between when fences for an atomic modeset are published, and when the + * corresponding vblank completes, including any interrupt processing and + * related workers. Auditing all that code, across all drivers, is not + * feasible. + * + * * Due to how many other subsystems are involved and the locking hierarchies + * this pulls in there is extremely thin wiggle-room for driver-specific + * differences. 
&dma_fence interacts with almost all of the core memory + * handling through page fault handlers via &dma_resv, dma_resv_lock() and + * dma_resv_unlock(). On the other side it also interacts through all + * allocation sites through &mmu_notifier and &shrinker. + * + * Furthermore lockdep does not handle cross-release dependencies, which means + * any deadlocks between dma_fence_wait() and dma_fence_signal() can't be caught + * at runtime with some quick testing. The simplest example is one thread + * waiting on a &dma_fence while holding a lock:: + * + * lock(A); + * dma_fence_wait(B); + * unlock(A); + * + * while the other thread is stuck trying to acquire the same lock, which + * prevents it from signalling the fence the previous thread is stuck waiting + * on:: + * + * lock(A); + * unlock(A); + * dma_fence_signal(B); + * + * By manually annotating all code relevant to signalling a &dma_fence we can + * teach lockdep about these dependencies, which also helps with the validation + * headache since now lockdep can check all the rules for us:: + * + * cookie = dma_fence_begin_signalling(); + * lock(A); + * unlock(A); + * dma_fence_signal(B); + * dma_fence_end_signalling(cookie); + * + * For using dma_fence_begin_signalling() and dma_fence_end_signalling() to + * annotate critical sections the following rules need to be observed: + * + * * All code necessary to complete a &dma_fence must be annotated, from the + * point where a fence is accessible to other threads, to the point where + * dma_fence_signal() is called. Un-annotated code can contain deadlock issues, + * and due to the very strict rules and many corner cases it is infeasible to + * catch these just with review or normal stress testing. + * + * * &struct dma_resv deserves a special note, since the readers are only + * protected by rcu. This means the signalling critical section starts as soon + * as the new fences are installed, even before dma_resv_unlock() is called. 
+ * + * * The only exception are fast paths and opportunistic signalling code, which + * calls dma_fence_signal() purely as an optimization, but is not required to + * guarantee completion of a &dma_fence. The usual example is a wait IOCTL + * which calls dma_fence_signal(), while the mandatory completion path goes + * through a hardware interrupt and possible job completion worker. + * + * * To aid composability of code, the annotations can be freely nested, as long + * as the overall locking hierarchy is consistent. The annotations also work + * both in interrupt and process context. Due to implementation details this + * requires that callers pass an opaque cookie from + * dma_fence_begin_signalling() to dma_fence_end_signalling(). + * + * * Validation against the cross driver contract is implemented by priming + * lockdep with the relevant hierarchy at boot-up. This means even just + * testing with a single device is enough to validate a driver, at least as + * far as deadlocks with dma_fence_wait() against dma_fence_signal() are + * concerned. + */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map dma_fence_lockdep_map = { + .name = "dma_fence_map" +}; + +/** + * dma_fence_begin_signalling - begin a critical DMA fence signalling section + * + * Drivers should use this to annotate the beginning of any code section + * required to eventually complete &dma_fence by calling dma_fence_signal(). + * + * The end of these critical sections are annotated with + * dma_fence_end_signalling(). + * + * Returns: + * + * Opaque cookie needed by the implementation, which needs to be passed to + * dma_fence_end_signalling(). + */ +bool dma_fence_begin_signalling(void) +{ + /* explicitly nesting ... */ + if (lock_is_held_type(&dma_fence_lockdep_map, 1)) + return true; + + /* rely on might_sleep check for soft/hardirq locks */ + if (in_atomic()) + return true; + + /* ... 
and non-recursive readlock */ + lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _RET_IP_); + + return false; +} +EXPORT_SYMBOL(dma_fence_begin_signalling); + +/** + * dma_fence_end_signalling - end a critical DMA fence signalling section + * + * Closes a critical section annotation opened by dma_fence_begin_signalling(). + */ +void dma_fence_end_signalling(bool cookie) +{ + if (cookie) + return; + + lock_release(&dma_fence_lockdep_map, _RET_IP_); +} +EXPORT_SYMBOL(dma_fence_end_signalling); + +void __dma_fence_might_wait(void) +{ + bool tmp; + + tmp = lock_is_held_type(&dma_fence_lockdep_map, 1); + if (tmp) + lock_release(&dma_fence_lockdep_map, _THIS_IP_); + lock_map_acquire(&dma_fence_lockdep_map); + lock_map_release(&dma_fence_lockdep_map); + if (tmp) + lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _THIS_IP_); +} +#endif + + /** * dma_fence_signal_locked - signal completion of a fence * @fence: the fence to signal @@ -170,14 +324,19 @@ int dma_fence_signal(struct dma_fence *fence) { unsigned long flags; int ret; + bool tmp; if (!fence) return -EINVAL; + tmp = dma_fence_begin_signalling(); + spin_lock_irqsave(fence->lock, flags); ret = dma_fence_signal_locked(fence); spin_unlock_irqrestore(fence->lock, flags); + dma_fence_end_signalling(tmp); + return ret; } EXPORT_SYMBOL(dma_fence_signal); @@ -210,6 +369,8 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) might_sleep(); + __dma_fence_might_wait(); + trace_dma_fence_wait_start(fence); if (fence->ops->wait) ret = fence->ops->wait(fence, intr, timeout); diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 3347c54f3a87..3f288f7db2ef 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -357,6 +357,18 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) } while (1); } +#ifdef CONFIG_LOCKDEP +bool dma_fence_begin_signalling(void); +void dma_fence_end_signalling(bool cookie); +#else +static inline bool 
dma_fence_begin_signalling(void) +{ + return true; +} +static inline void dma_fence_end_signalling(bool cookie) {} +static inline void __dma_fence_might_wait(void) {} +#endif + int dma_fence_signal(struct dma_fence *fence); int dma_fence_signal_locked(struct dma_fence *fence); signed long dma_fence_default_wait(struct dma_fence *fence, -- 2.26.2

5 years, 4 months

[PATCH] dma-buf: Move dma_buf_release() from fops to dentry_ops

by Sumit Semwal

Charan Teja reported a 'use-after-free' in dmabuffs_dname [1], which happens if the dma_buf_release() is called while the userspace is accessing the dma_buf pseudo fs's dmabuffs_dname() in another process, and dma_buf_release() releases the dmabuf object when the last reference to the struct file goes away. I discussed with Arnd Bergmann, and he suggested that rather than tying the dma_buf_release() to the file_operations' release(), we can tie it to the dentry_operations' d_release(), which will be called when the last ref to the dentry is removed. The path exercised by __fput() calls f_op->release() first, and then calls dput, which eventually calls d_op->d_release(). In the 'normal' case, when no userspace access is happening via dma_buf pseudo fs, there should be exactly one fd, file, dentry and inode, so closing the fd will kill off everything right away. In the presented case, the dentry's d_release() will be called only when the dentry's last ref is released. Therefore, let's move dma_buf_release() from fops->release() to d_ops->d_release(). Many thanks to Arnd for his FS insights :) [1]: https://lore.kernel.org/patchwork/patch/1238278/ Fixes: bb2bb9030425 ("dma-buf: add DMA_BUF_SET_NAME ioctls") Reported-by: syzbot+3643a18836bce555bff6(a)syzkaller.appspotmail.com Cc: <stable(a)vger.kernel.org> [5.3+] Cc: Arnd Bergmann <arnd(a)arndb.de> Reported-by: Charan Teja Reddy <charante(a)codeaurora.org> Signed-off-by: Sumit Semwal <sumit.semwal(a)linaro.org> --- drivers/dma-buf/dma-buf.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 01ce125f8e8d..92ba4b6ef3e7 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -54,8 +54,11 @@ static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) dentry->d_name.name, ret > 0 ? 
name : ""); } +static void dma_buf_release(struct dentry *dentry); + static const struct dentry_operations dma_buf_dentry_ops = { .d_dname = dmabuffs_dname, + .d_release = dma_buf_release, }; static struct vfsmount *dma_buf_mnt; @@ -77,14 +80,14 @@ static struct file_system_type dma_buf_fs_type = { .kill_sb = kill_anon_super, }; -static int dma_buf_release(struct inode *inode, struct file *file) +static void dma_buf_release(struct dentry *dentry) { struct dma_buf *dmabuf; - if (!is_dma_buf_file(file)) - return -EINVAL; + if (dentry->d_op != &dma_buf_dentry_ops) + return; - dmabuf = file->private_data; + dmabuf = dentry->d_fsdata; BUG_ON(dmabuf->vmapping_counter); @@ -110,7 +113,6 @@ static int dma_buf_release(struct inode *inode, struct file *file) module_put(dmabuf->owner); kfree(dmabuf->name); kfree(dmabuf); - return 0; } static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma) @@ -412,7 +414,6 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) } static const struct file_operations dma_buf_fops = { - .release = dma_buf_release, .mmap = dma_buf_mmap_internal, .llseek = dma_buf_llseek, .poll = dma_buf_poll, -- 2.27.0

5 years, 4 months

Re: [Linaro-mm-sig] [PATCH 13/18] drm/amdgpu/dc: Stop dma_resv_lock inversion in commit_tail

by Daniel Vetter

On Fri, Jun 5, 2020 at 10:30 AM Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer(a)amd.com> wrote: > > Hi Daniel, > > On 04/06/2020 10:12, Daniel Vetter wrote: > [...] > > @@ -6910,7 +6910,11 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, > > * explicitly on fences instead > > * and in general should be called for > > * blocking commit to as per framework helpers > > + * > > + * Yes, this deadlocks, since you're calling dma_resv_lock in a > > + * path that leads to a dma_fence_signal(). Don't do that. > > */ > > +#if 0 > > r = amdgpu_bo_reserve(abo, true); > > if (unlikely(r != 0)) > > DRM_ERROR("failed to reserve buffer before flip\n"); > > @@ -6920,6 +6924,12 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, > > tmz_surface = amdgpu_bo_encrypted(abo); > > > > amdgpu_bo_unreserve(abo); > > +#endif > > + /* > > + * this races anyway, so READ_ONCE isn't any better or worse > > + * than the stuff above. Except the stuff above can deadlock. > > + */ > > + tiling_flags = READ_ONCE(abo->tiling_flags); > > With this change "tmz_surface" won't be initialized properly. > Adding the following line should fix it: > > tmz_surface = READ_ONCE(abo->flags) & AMDGPU_GEM_CREATE_ENCRYPTED; So to make this clear, I'm not really proposing to fix up all the drivers in detail. There's a lot more bugs in all the other drivers, I'm pretty sure. The driver fixups really are just quick hacks to illustrate the problem, and at least in some cases, maybe illustrate a possible solution. For the real fixes I think this needs driver teams working on this, and make sure it's all solid. I can help a bit with review (especially for placing the annotations, e.g. the one I put in cs_submit() annotates a bit too much), but that's it. 
Also I think the patch is from before tmz landed, and I just blindly rebased over it :-) -Daniel > > > Pierre-Eric > > > > > > fill_dc_plane_info_and_addr( > > dm->adev, new_plane_state, tiling_flags, > > -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch

5 years, 4 months

Re: [Linaro-mm-sig] [PATCH 03/18] dma-fence: basic lockdep annotations

by Daniel Vetter

On Thu, Jun 4, 2020 at 10:57 AM Thomas Hellström (Intel) <thomas_os(a)shipmail.org> wrote: > > > On 6/4/20 10:12 AM, Daniel Vetter wrote: > ... > > Thread A: > > > > mutex_lock(A); > > mutex_unlock(A); > > > > dma_fence_signal(); > > > > Thread B: > > > > mutex_lock(A); > > dma_fence_wait(); > > mutex_unlock(A); > > > > Thread B is blocked on A signalling the fence, but A never gets around > > to that because it cannot acquire the lock A. > > > > Note that dma_fence_wait() is allowed to be nested within > > dma_fence_begin/end_signalling sections. To allow this to happen the > > read lock needs to be upgraded to a write lock, which means that any > > other lock is acquired between the dma_fence_begin_signalling() call and > > the call to dma_fence_wait(), and still held, this will result in an > > immediate lockdep complaint. The only other option would be to not > > annotate such calls, defeating the point. Therefore these annotations > > cannot be sprinkled over the code entirely mindless to avoid false > > positives. > > Just realized, isn't that example actually a true positive, or at least > a great candidate for a true positive, since if another thread reenters > that signaling path, it will block on that mutex, and the fence would > never be signaled unless there is another signaling path? Not sure I understand fully, but I think the answer is "it's complicated". dma_fence are meant to be a DAG (directed acyclic graph). Now it would be nice to enforce that, and i915 has some attempts to that effect, but these annotations here don't try to pull off that miracle. I'm assuming that all the dependencies between dma_fence don't create a loop, and instead I'm only focusing on deadlocks between dma_fences and other locks. Usually an async work looks like this: 1. wait for a bunch of dma_fence that we have as dependencies 2. do work (e.g. atomic commit) 3. signal the dma_fence that represents our work This can happen on the cpu in a kthread or worker, or on the gpu. 
Now for reasons you might want to have a per-work mutex or something and hold that while going through all this, and this is the false positive I'm thinking of. Of course, if your fences aren't a DAG, or if you're holding a mutex that's shared with some other work which is part of your dependency chain, then this goes boom. But it doesn't have to. I think in general it's best to purely rely on ordering, and remove as much locking as possible. This is the design behind the atomic modeset commit code, which does not take any mutexes in the commit path, at least not in the helpers. Drivers can still do stuff of course. Then the only locks you're left with are spinlocks (maybe irq safe ones) to coordinate with interrupt handlers, workers, handle the wait/wake queues, manage work/scheduler run queues and all that stuff, and no mutexes. Now for the case where you have something like the below: thread 1: dma_fence_begin_signalling() mutex_lock(a); dma_fence_wait(b1); mutex_unlock(a); dma_fence_signal(b2); dma_fence_end_signalling(); That's indeed a bit problematic, assuming you're annotating stuff correctly, and the locking is actually required. I've seen a few of these, and annotating them properly needs care: - often the mutex_lock/unlock is not needed, and just gets in the way. This was the case for the original atomic modeset commit work patches, which again locked all the modeset locks. But strict ordering of commit work was all that was needed to make this work, plus making sure data structure lifetimes are handled correctly too. I think the tendency to abuse locking to handle lifetime and ordering problems is fairly common, but it can lead to lots of trouble. Ime all async work items with the above problematic pattern can be fixed like this. - another frequent case is that the dma_fence_begin_signalling() can and should be pushed down past the mutex_lock, and maybe even past the dma_fence_wait, depending upon when/how the dma_fence is published. 
The fence signalling critical section can still extend past the mutex_unlock, lockdep and semantics are fine with that (I think at least). This is more the case for execbuf tails, where you take locks, set up some async work, publish the fences and then begin to process these fences (which could just be pushing the work to the job scheduler, but could also involve running it directly in the userspace process thread context, but with locks already dropped). So I wouldn't go out and say these are true positives, just maybe unnecessary locking and over-eager annotations, without any real bugs in the code. Or am I completely off the track and you're thinking of something else? > Although I agree the conclusion is sound: These annotations cannot be > sprinkled mindlessly over the code. Yup, that much is for sure. -Daniel > > /Thomas > > > > > > > > > > v2: handle soft/hardirq ctx better against write side and dont forget > > EXPORT_SYMBOL, drivers can't use this otherwise. > > > > v3: Kerneldoc. 
> > > > v4: Some spelling fixes from Mika > > > > Cc: Mika Kuoppala <mika.kuoppala(a)intel.com> > > Cc: Thomas Hellstrom <thomas.hellstrom(a)intel.com> > > Cc: linux-media(a)vger.kernel.org > > Cc: linaro-mm-sig(a)lists.linaro.org > > Cc: linux-rdma(a)vger.kernel.org > > Cc: amd-gfx(a)lists.freedesktop.org > > Cc: intel-gfx(a)lists.freedesktop.org > > Cc: Chris Wilson <chris(a)chris-wilson.co.uk> > > Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com> > > Cc: Christian König <christian.koenig(a)amd.com> > > Signed-off-by: Daniel Vetter <daniel.vetter(a)intel.com> > > --- > > Documentation/driver-api/dma-buf.rst | 12 +- > > drivers/dma-buf/dma-fence.c | 161 +++++++++++++++++++++++++++ > > include/linux/dma-fence.h | 12 ++ > > 3 files changed, 182 insertions(+), 3 deletions(-) > > > > diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst > > index 63dec76d1d8d..05d856131140 100644 > > --- a/Documentation/driver-api/dma-buf.rst > > +++ b/Documentation/driver-api/dma-buf.rst > > @@ -100,11 +100,11 @@ CPU Access to DMA Buffer Objects > > .. kernel-doc:: drivers/dma-buf/dma-buf.c > > :doc: cpu access > > > > -Fence Poll Support > > -~~~~~~~~~~~~~~~~~~ > > +Implicit Fence Poll Support > > +~~~~~~~~~~~~~~~~~~~~~~~~~~~ > > > > .. kernel-doc:: drivers/dma-buf/dma-buf.c > > - :doc: fence polling > > + :doc: implicit fence polling > > > > Kernel Functions and Structures Reference > > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > > @@ -133,6 +133,12 @@ DMA Fences > > .. kernel-doc:: drivers/dma-buf/dma-fence.c > > :doc: DMA fences overview > > > > +DMA Fence Signalling Annotations > > +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > > + > > +.. 
kernel-doc:: drivers/dma-buf/dma-fence.c > > + :doc: fence signalling annotation > > + > > DMA Fences Functions Reference > > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > > > > diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c > > index 656e9ac2d028..0005bc002529 100644 > > --- a/drivers/dma-buf/dma-fence.c > > +++ b/drivers/dma-buf/dma-fence.c > > @@ -110,6 +110,160 @@ u64 dma_fence_context_alloc(unsigned num) > > } > > EXPORT_SYMBOL(dma_fence_context_alloc); > > > > +/** > > + * DOC: fence signalling annotation > > + * > > + * Proving correctness of all the kernel code around &dma_fence through code > > + * review and testing is tricky for a few reasons: > > + * > > + * * It is a cross-driver contract, and therefore all drivers must follow the > > + * same rules for lock nesting order, calling contexts for various functions > > + * and anything else significant for in-kernel interfaces. But it is also > > + * impossible to test all drivers in a single machine, hence brute-force N vs. > > + * N testing of all combinations is impossible. Even just limiting to the > > + * possible combinations is infeasible. > > + * > > + * * There is an enormous amount of driver code involved. For render drivers > > + * there's the tail of command submission, after fences are published, > > + * scheduler code, interrupt and workers to process job completion, > > + * and timeout, gpu reset and gpu hang recovery code. Plus for integration > > + * with core mm with have &mmu_notifier, respectively &mmu_interval_notifier, > > + * and &shrinker. For modesetting drivers there's the commit tail functions > > + * between when fences for an atomic modeset are published, and when the > > + * corresponding vblank completes, including any interrupt processing and > > + * related workers. Auditing all that code, across all drivers, is not > > + * feasible. 
> > + * > > + * * Due to how many other subsystems are involved and the locking hierarchies > > + * this pulls in there is extremely thin wiggle-room for driver-specific > > + * differences. &dma_fence interacts with almost all of the core memory > > + * handling through page fault handlers via &dma_resv, dma_resv_lock() and > > + * dma_resv_unlock(). On the other side it also interacts through all > > + * allocation sites through &mmu_notifier and &shrinker. > > + * > > + * Furthermore lockdep does not handle cross-release dependencies, which means > > + * any deadlocks between dma_fence_wait() and dma_fence_signal() can't be caught > > + * at runtime with some quick testing. The simplest example is one thread > > + * waiting on a &dma_fence while holding a lock:: > > + * > > + * lock(A); > > + * dma_fence_wait(B); > > + * unlock(A); > > + * > > + * while the other thread is stuck trying to acquire the same lock, which > > + * prevents it from signalling the fence the previous thread is stuck waiting > > + * on:: > > + * > > + * lock(A); > > + * unlock(A); > > + * dma_fence_signal(B); > > + * > > + * By manually annotating all code relevant to signalling a &dma_fence we can > > + * teach lockdep about these dependencies, which also helps with the validation > > + * headache since now lockdep can check all the rules for us:: > > + * > > + * cookie = dma_fence_begin_signalling(); > > + * lock(A); > > + * unlock(A); > > + * dma_fence_signal(B); > > + * dma_fence_end_signalling(cookie); > > + * > > + * For using dma_fence_begin_signalling() and dma_fence_end_signalling() to > > + * annotate critical sections the following rules need to be observed: > > + * > > + * * All code necessary to complete a &dma_fence must be annotated, from the > > + * point where a fence is accessible to other threads, to the point where > > + * dma_fence_signal() is called. 
Un-annotated code can contain deadlock issues, > > + * and due to the very strict rules and many corner cases it is infeasible to > > + * catch these just with review or normal stress testing. > > + * > > + * * &struct dma_resv deserves a special note, since the readers are only > > + * protected by rcu. This means the signalling critical section starts as soon > > + * as the new fences are installed, even before dma_resv_unlock() is called. > > + * > > + * * The only exception are fast paths and opportunistic signalling code, which > > + * calls dma_fence_signal() purely as an optimization, but is not required to > > + * guarantee completion of a &dma_fence. The usual example is a wait IOCTL > > + * which calls dma_fence_signal(), while the mandatory completion path goes > > + * through a hardware interrupt and possible job completion worker. > > + * > > + * * To aid composability of code, the annotations can be freely nested, as long > > + * as the overall locking hierarchy is consistent. The annotations also work > > + * both in interrupt and process context. Due to implementation details this > > + * requires that callers pass an opaque cookie from > > + * dma_fence_begin_signalling() to dma_fence_end_signalling(). > > + * > > + * * Validation against the cross driver contract is implemented by priming > > + * lockdep with the relevant hierarchy at boot-up. This means even just > > + * testing with a single device is enough to validate a driver, at least as > > + * far as deadlocks with dma_fence_wait() against dma_fence_signal() are > > + * concerned. > > + */ > > +#ifdef CONFIG_LOCKDEP > > +struct lockdep_map dma_fence_lockdep_map = { > > + .name = "dma_fence_map" > > +}; > > + > > +/** > > + * dma_fence_begin_signalling - begin a critical DMA fence signalling section > > + * > > + * Drivers should use this to annotate the beginning of any code section > > + * required to eventually complete &dma_fence by calling dma_fence_signal(). 
> > + * > > + * The end of these critical sections are annotated with > > + * dma_fence_end_signalling(). > > + * > > + * Returns: > > + * > > + * Opaque cookie needed by the implementation, which needs to be passed to > > + * dma_fence_end_signalling(). > > + */ > > +bool dma_fence_begin_signalling(void) > > +{ > > + /* explicitly nesting ... */ > > + if (lock_is_held_type(&dma_fence_lockdep_map, 1)) > > + return true; > > + > > + /* rely on might_sleep check for soft/hardirq locks */ > > + if (in_atomic()) > > + return true; > > + > > + /* ... and non-recursive readlock */ > > + lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _RET_IP_); > > + > > + return false; > > +} > > +EXPORT_SYMBOL(dma_fence_begin_signalling); > > + > > +/** > > + * dma_fence_end_signalling - end a critical DMA fence signalling section > > + * > > + * Closes a critical section annotation opened by dma_fence_begin_signalling(). > > + */ > > +void dma_fence_end_signalling(bool cookie) > > +{ > > + if (cookie) > > + return; > > + > > + lock_release(&dma_fence_lockdep_map, _RET_IP_); > > +} > > +EXPORT_SYMBOL(dma_fence_end_signalling); > > + > > +void __dma_fence_might_wait(void) > > +{ > > + bool tmp; > > + > > + tmp = lock_is_held_type(&dma_fence_lockdep_map, 1); > > + if (tmp) > > + lock_release(&dma_fence_lockdep_map, _THIS_IP_); > > + lock_map_acquire(&dma_fence_lockdep_map); > > + lock_map_release(&dma_fence_lockdep_map); > > + if (tmp) > > + lock_acquire(&dma_fence_lockdep_map, 0, 0, 1, 1, NULL, _THIS_IP_); > > +} > > +#endif > > + > > + > > /** > > * dma_fence_signal_locked - signal completion of a fence > > * @fence: the fence to signal > > @@ -170,14 +324,19 @@ int dma_fence_signal(struct dma_fence *fence) > > { > > unsigned long flags; > > int ret; > > + bool tmp; > > > > if (!fence) > > return -EINVAL; > > > > + tmp = dma_fence_begin_signalling(); > > + > > spin_lock_irqsave(fence->lock, flags); > > ret = dma_fence_signal_locked(fence); > > spin_unlock_irqrestore(fence->lock, 
flags); > > > > + dma_fence_end_signalling(tmp); > > + > > return ret; > > } > > EXPORT_SYMBOL(dma_fence_signal); > > @@ -210,6 +369,8 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) > > > > might_sleep(); > > > > + __dma_fence_might_wait(); > > + > > trace_dma_fence_wait_start(fence); > > if (fence->ops->wait) > > ret = fence->ops->wait(fence, intr, timeout); > > diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h > > index 3347c54f3a87..3f288f7db2ef 100644 > > --- a/include/linux/dma-fence.h > > +++ b/include/linux/dma-fence.h > > @@ -357,6 +357,18 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) > > } while (1); > > } > > > > +#ifdef CONFIG_LOCKDEP > > +bool dma_fence_begin_signalling(void); > > +void dma_fence_end_signalling(bool cookie); > > +#else > > +static inline bool dma_fence_begin_signalling(void) > > +{ > > + return true; > > +} > > +static inline void dma_fence_end_signalling(bool cookie) {} > > +static inline void __dma_fence_might_wait(void) {} > > +#endif > > + > > int dma_fence_signal(struct dma_fence *fence); > > int dma_fence_signal_locked(struct dma_fence *fence); > > signed long dma_fence_default_wait(struct dma_fence *fence, -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch

5 years, 4 months

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

Linaro-mm-sig June 2020