Expand the VFIO DMABUF revocation state to three states: Not revoked, temporarily revoked, and permanently revoked.
The first two are for existing transient revocation, e.g. across a function reset, and the DMABUF is put into the last in response to a new VFIO feature VFIO_DEVICE_FEATURE_DMA_BUF_REVOKE.
VFIO_DEVICE_FEATURE_DMA_BUF_REVOKE passes a DMABUF by fd and requests that the DMABUF is permanently revoked. On success, it's guaranteed that the buffer can never be imported/attached/mmap()ed in future, that dynamic imports have been cleanly detached, and that all mappings have been made inaccessible/PTEs zapped.
This is useful for lifecycle management, to reclaim VFIO PCI BAR ranges previously delegated to a subordinate client process: by revoking, the driver process can ensure that the loaned resources are made inaccessible when the client is deemed "done". The original DMABUF is defunct, and BAR resources can then be safely re-exported for use by new clients.
Refactor the revocation code out of vfio_pci_dma_buf_move() to a function common to move and the new feature request path. Note: this now only calls dma_buf_invalidate_mappings()/dma_resv_wait_timeout() on the revoke path, whereas vfio_pci_dma_buf_move() originally called them for both revoke and (unnecessarily) un-revoke.
Signed-off-by: Matt Evans matt@ozlabs.org --- drivers/vfio/pci/vfio_pci_core.c | 6 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 164 ++++++++++++++++++++++------- drivers/vfio/pci/vfio_pci_priv.h | 19 +++- include/uapi/linux/vfio.h | 20 ++++ 4 files changed, 168 insertions(+), 41 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index cd15b934912b..ebdf9ec30517 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1577,6 +1577,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_feature_token(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_DMA_BUF: return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF_REVOKE: + return vfio_pci_core_feature_dma_buf_revoke(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -1788,7 +1790,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
dma_resv_lock(priv->dmabuf->resv, NULL);
- if (priv->revoked) { + if (priv->status != VFIO_PCI_DMABUF_OK) { pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n", __func__, vmf->address, vma->vm_pgoff); dma_resv_unlock(priv->dmabuf->resv); @@ -1813,7 +1815,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
scoped_guard(rwsem_read, &vdev->memory_lock) { /* Revocation status must be re-read, under memory_lock */ - if (!priv->revoked) { + if (priv->status == VFIO_PCI_DMABUF_OK) { int pres = vfio_pci_dma_buf_find_pfn(priv, vma, vmf->address, order, &pfn); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 50b713249341..cfca820b767a 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -19,7 +19,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, if (!attachment->peer2peer) return -EOPNOTSUPP;
- if (priv->revoked) + if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV;
if (!dma_buf_attach_revocable(attachment)) @@ -44,7 +44,7 @@ static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct * * still safe because the fault handler ultimately prevents * access to a revoked buffer if it isn't caught here. */ - if (READ_ONCE(priv->revoked)) + if (READ_ONCE(priv->status) != VFIO_PCI_DMABUF_OK) return -ENODEV; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; @@ -79,7 +79,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
dma_resv_assert_held(priv->dmabuf->resv);
- if (priv->revoked) + if (priv->status != VFIO_PCI_DMABUF_OK) return ERR_PTR(-ENODEV);
ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, @@ -305,7 +305,8 @@ static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev, INIT_LIST_HEAD(&priv->dmabufs_elm); down_write(&vdev->memory_lock); dma_resv_lock(priv->dmabuf->resv, NULL); - priv->revoked = !__vfio_pci_memory_enabled(vdev); + priv->status = __vfio_pci_memory_enabled(vdev) ? VFIO_PCI_DMABUF_OK : + VFIO_PCI_DMABUF_TEMP_REVOKED; list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); dma_resv_unlock(priv->dmabuf->resv); up_write(&vdev->memory_lock); @@ -336,7 +337,7 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, return -EOPNOTSUPP;
priv = attachment->dmabuf->priv; - if (priv->revoked) + if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV;
/* More than one range to iommufd will require proper DMABUF support */ @@ -600,6 +601,59 @@ int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev, return ret; }
+/* Set the DMABUF's revocation status (OK or temporarily/permanently revoked) */ +static void vfio_pci_dma_buf_set_status(struct vfio_pci_dma_buf *priv, + enum vfio_pci_dma_buf_status new_status) +{ + bool was_revoked; + + lockdep_assert_held_write(&priv->vdev->memory_lock); + + if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED || + priv->status == new_status) + return; + + dma_resv_lock(priv->dmabuf->resv, NULL); + was_revoked = (priv->status == VFIO_PCI_DMABUF_TEMP_REVOKED); + + if (new_status != VFIO_PCI_DMABUF_OK) { + priv->status = new_status; /* Temp or permanently revoked */ + + if (was_revoked) { + /* + * TEMP_REVOKED is being upgraded to + * PERM_REVOKED. The buffer is already gone, + * don't wait on it again. + */ + dma_resv_unlock(priv->dmabuf->resv); + return; + } + dma_buf_invalidate_mappings(priv->dmabuf); + dma_resv_wait_timeout(priv->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); + dma_resv_unlock(priv->dmabuf->resv); + kref_put(&priv->kref, vfio_pci_dma_buf_done); + wait_for_completion(&priv->comp); + unmap_mapping_range(priv->dmabuf->file->f_mapping, + 0, priv->size, true); + /* + * Re-arm the registered kref reference and the + * completion so the post-revoke state matches the + * post-creation state. An un-revoke followed by a + * new mapping needs the kref to be non-zero before + * kref_get(), and vfio_pci_dma_buf_cleanup() + * delegates its drain back through this revoke + * path on a possibly-already-revoked dma-buf. + */ + kref_init(&priv->kref); + reinit_completion(&priv->comp); + } else { + priv->status = VFIO_PCI_DMABUF_OK; + dma_resv_unlock(priv->dmabuf->resv); + } +} + void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) { struct vfio_pci_dma_buf *priv; @@ -607,45 +661,16 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
/* * Holding memory_lock ensures a racing VMA fault observes - * priv->revoked properly. + * priv->status properly. */ lockdep_assert_held_write(&vdev->memory_lock);
list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { if (!get_file_active(&priv->dmabuf->file)) continue; - - if (priv->revoked != revoked) { - dma_resv_lock(priv->dmabuf->resv, NULL); - if (revoked) - priv->revoked = true; - dma_buf_invalidate_mappings(priv->dmabuf); - dma_resv_wait_timeout(priv->dmabuf->resv, - DMA_RESV_USAGE_BOOKKEEP, false, - MAX_SCHEDULE_TIMEOUT); - dma_resv_unlock(priv->dmabuf->resv); - if (revoked) { - kref_put(&priv->kref, vfio_pci_dma_buf_done); - wait_for_completion(&priv->comp); - unmap_mapping_range(priv->dmabuf->file->f_mapping, - 0, priv->size, true); - /* - * Re-arm the registered kref reference and the - * completion so the post-revoke state matches the - * post-creation state. An un-revoke followed by a - * new mapping needs the kref to be non-zero before - * kref_get(), and vfio_pci_dma_buf_cleanup() - * delegates its drain back through this revoke - * path on a possibly-already-revoked dma-buf. - */ - kref_init(&priv->kref); - reinit_completion(&priv->comp); - } else { - dma_resv_lock(priv->dmabuf->resv, NULL); - priv->revoked = false; - dma_resv_unlock(priv->dmabuf->resv); - } - } + vfio_pci_dma_buf_set_status(priv, revoked ? + VFIO_PCI_DMABUF_TEMP_REVOKED : + VFIO_PCI_DMABUF_OK); fput(priv->dmabuf->file); } } @@ -677,3 +702,66 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) } up_write(&vdev->memory_lock); } + +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf_revoke( + struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf_revoke __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf_revoke db_revoke; + struct vfio_pci_dma_buf *priv; + struct dma_buf *dmabuf; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET, + sizeof(db_revoke)); + if (ret != 1) + return ret; + + if (copy_from_user(&db_revoke, arg, sizeof(db_revoke))) + return -EFAULT; + + dmabuf = dma_buf_get(db_revoke.dmabuf_fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + priv = dmabuf->priv; + /* + * Sanity-check the DMABUF is really a vfio_pci_dma_buf _and_ + * relates to the VFIO device it was provided with. + * + * If the DMABUF relates to this vdev then priv->vdev is + * stable because this open fd prevents cleanup. + * + * If it relates to a different vdev, reading priv->vdev might + * race with a concurrent cleanup on that device. But if so, + * it points to a non-matching vdev or NULL and is unusable + * either way. + */ + if (dmabuf->ops != &vfio_pci_dmabuf_ops || + READ_ONCE(priv->vdev) != vdev) { + ret = -ENODEV; + goto out_put_buf; + } + + scoped_guard(rwsem_write, &vdev->memory_lock) { + if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED) { + ret = -EBADFD; + } else { + vfio_pci_dma_buf_set_status(priv, + VFIO_PCI_DMABUF_PERM_REVOKED); + ret = 0; + } + } + +out_put_buf: + dma_buf_put(dmabuf); + + return ret; +} +#endif /* CONFIG_VFIO_PCI_DMABUF */ diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index c3fa35381679..8741abd04461 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,6 +23,12 @@ struct vfio_pci_ioeventfd { bool test_mem; };
+enum vfio_pci_dma_buf_status { + VFIO_PCI_DMABUF_OK = 0, + VFIO_PCI_DMABUF_TEMP_REVOKED = 1, + VFIO_PCI_DMABUF_PERM_REVOKED = 2, +}; + struct vfio_pci_dma_buf { struct dma_buf *dmabuf; struct vfio_pci_core_device *vdev; @@ -35,7 +41,7 @@ struct vfio_pci_dma_buf { struct kref kref; struct completion comp; unsigned long vma_pgoff_adjust; - u8 revoked : 1; + enum vfio_pci_dma_buf_status status; };
bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); @@ -147,6 +153,10 @@ void vfio_pci_set_vma_ops(struct vm_area_struct *vma); int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_feature_dma_buf __user *arg, size_t argsz); +int vfio_pci_core_feature_dma_buf_revoke( + struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf_revoke __user *arg, + size_t argsz); #else static inline int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, @@ -155,6 +165,13 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, { return -ENOTTY; } +static inline int vfio_pci_core_feature_dma_buf_revoke( + struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf_revoke __user *arg, + size_t argsz) +{ + return -ENOTTY; +} #endif
#endif diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5de618a3a5ee..697c0bb4b9bc 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1534,6 +1534,26 @@ struct vfio_device_feature_dma_buf { */ #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
+/** + * Given a dma_buf fd previously created by + * VFIO_DEVICE_FEATURE_DMA_BUF, a SET of this feature requests that + * access to the corresponding DMABUF is immediately and permanently + * revoked. On successful return, the buffer is not accessible + * through any mmap() or dma-buf import. The buffer is permanently + * disabled, and VFIO refuses all map, mmap, attach, etc. requests. + * + * Return: 0 on success, -1 and errno is set on failure: + * + * EBADF, EINVAL: dmabuf_fd is not a DMABUF fd. + * ENODEV: The dmabuf_fd does not match this VFIO device. + * EBADFD: The DMABUF is already revoked. + */ +#define VFIO_DEVICE_FEATURE_DMA_BUF_REVOKE 13 + +struct vfio_device_feature_dma_buf_revoke { + __s32 dmabuf_fd; +}; + /* -------- API for Type1 VFIO IOMMU -------- */
/**