On Wed, 27 May 2026 03:23:11 -0700 Matt Evans mattev@meta.com wrote:
Expand the VFIO DMABUF revocation state to three states: Not revoked, temporarily revoked, and permanently revoked.
The first two are for existing transient revocation, e.g. across a function reset, and the DMABUF is put into the last in response to a new ioctl(VFIO_DEVICE_PCI_DMABUF_REVOKE) request.
The DMABUF is created via a VFIO_DEVICE_FEATURE ioctl and you next patch is setting attributes via another VFIO_DEVICE_FEATURE, why would the REVOKE operation not also be a VFIO_DEVICE_FEATURE?
This VFIO device fd ioctl() passes a DMABUF by fd and requests that the DMABUF is permanently revoked. On success, it's guaranteed that the buffer can never be imported/attached/mmap()ed in future, that dynamic imports have been cleanly detached, and that all mappings have been made inaccessible/PTEs zapped.
This is useful for lifecycle management, to reclaim VFIO PCI BAR ranges previously delegated to a subordinate client process: The driver process can ensure that the loaned resources are revoked when the client is deemed "done", and exported ranges can be safely re-used elsewhere.
Refactor the revocation code out of vfio_pci_dma_buf_move() to a function common to move and the new ioctl path.
Signed-off-by: Matt Evans mattev@meta.com
drivers/vfio/pci/vfio_pci_core.c | 21 ++++- drivers/vfio/pci/vfio_pci_dmabuf.c | 146 +++++++++++++++++++++-------- drivers/vfio/pci/vfio_pci_priv.h | 14 ++- include/uapi/linux/vfio.h | 30 ++++++ 4 files changed, 170 insertions(+), 41 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 41e049fa9a8a..5184b3cac160 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1500,6 +1500,21 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, ioeventfd.fd); } +static int vfio_pci_ioctl_dmabuf_revoke(struct vfio_pci_core_device *vdev,
struct vfio_pci_dmabuf_revoke __user *arg)+{
- unsigned long minsz = offsetofend(struct vfio_pci_dmabuf_revoke, dmabuf_fd);
- struct vfio_pci_dmabuf_revoke revoke;
- if (copy_from_user(&revoke, arg, minsz))
return -EFAULT;- if (revoke.argsz < minsz)
return -EINVAL;- return vfio_pci_dma_buf_revoke(vdev, revoke.dmabuf_fd);
+}
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -1522,6 +1537,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_reset(vdev, uarg); case VFIO_DEVICE_SET_IRQS: return vfio_pci_ioctl_set_irqs(vdev, uarg);
- case VFIO_DEVICE_PCI_DMABUF_REVOKE:
default: return -ENOTTY; }return vfio_pci_ioctl_dmabuf_revoke(vdev, uarg);@@ -1792,7 +1809,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, dma_resv_lock(priv->dmabuf->resv, NULL); vdev = READ_ONCE(priv->vdev);
- if (priv->revoked || !vdev) {
- if (priv->status != VFIO_PCI_DMABUF_OK || !vdev) { pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n", __func__, vmf->address, vma->vm_pgoff); dma_resv_unlock(priv->dmabuf->resv);
@@ -1815,7 +1832,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, scoped_guard(rwsem_read, &vdev->memory_lock) { /* Revocation status must be re-read, under memory_lock */
if (!priv->revoked) {
if (priv->status == VFIO_PCI_DMABUF_OK) { int pres = vfio_pci_dma_buf_find_pfn(priv, vma, vmf->address, order, &pfn);diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 4b3b15655f1d..3fa14760898f 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -19,7 +19,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, if (!attachment->peer2peer) return -EOPNOTSUPP;
- if (priv->revoked)
- if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV;
if (!dma_buf_attach_revocable(attachment)) @@ -32,7 +32,7 @@ static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct * { struct vfio_pci_dma_buf *priv = dmabuf->priv;
- if (priv->revoked)
- if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL;
@@ -72,7 +72,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, dma_resv_assert_held(priv->dmabuf->resv);
- if (priv->revoked)
- if (priv->status != VFIO_PCI_DMABUF_OK) return ERR_PTR(-ENODEV);
ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, @@ -287,7 +287,8 @@ static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev, INIT_LIST_HEAD(&priv->dmabufs_elm); down_write(&vdev->memory_lock); dma_resv_lock(priv->dmabuf->resv, NULL);
- priv->revoked = !__vfio_pci_memory_enabled(vdev);
- priv->status = __vfio_pci_memory_enabled(vdev) ? VFIO_PCI_DMABUF_OK :
list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); dma_resv_unlock(priv->dmabuf->resv); up_write(&vdev->memory_lock);VFIO_PCI_DMABUF_TEMP_REVOKED;@@ -318,7 +319,7 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, return -EOPNOTSUPP; priv = attachment->dmabuf->priv;
- if (priv->revoked)
- if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV;
/* More than one range to iommufd will require proper DMABUF support */ @@ -585,6 +586,63 @@ int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev, return ret; } +static void __vfio_pci_dma_buf_revoke(struct vfio_pci_dma_buf *priv, bool revoked,
bool permanently)
If the underscore prefix is mean to imply the lock semantics, that's explicit with the annotation below and can be dropped.
The double bool args are not very intuitive to use and the [false, true] combination is rather invalid. Why not an enum:
enum vfio_pci_dma_buf_revoke_action { VFIO_PCI_DMABUF_REVOKE_RESTORE, VFIO_PCI_DMABUF_REVOKE_TEMPORARY, VFIO_PCI_DMABUF_REVOKE_PERMANENT, };
+{
- bool was_revoked;
- lockdep_assert_held_write(&priv->vdev->memory_lock);
- if ((priv->status == VFIO_PCI_DMABUF_PERM_REVOKED) ||
(priv->status == VFIO_PCI_DMABUF_OK && !revoked) ||(priv->status == VFIO_PCI_DMABUF_TEMP_REVOKED && revoked && !permanently)) {return;- }
- dma_resv_lock(priv->dmabuf->resv, NULL);
- was_revoked = priv->status != VFIO_PCI_DMABUF_OK;
- if (revoked)
priv->status = permanently ? VFIO_PCI_DMABUF_PERM_REVOKED :VFIO_PCI_DMABUF_TEMP_REVOKED;- /*
* If TEMP_REVOKED is being upgraded to PERM_REVOKED, the* buffer is already gone. Don't wait on it again.*/- if (was_revoked && revoked) {
dma_resv_unlock(priv->dmabuf->resv);return;- }
- dma_buf_invalidate_mappings(priv->dmabuf);
- dma_resv_wait_timeout(priv->dmabuf->resv,
DMA_RESV_USAGE_BOOKKEEP, false,MAX_SCHEDULE_TIMEOUT);- dma_resv_unlock(priv->dmabuf->resv);
- if (revoked) {
kref_put(&priv->kref, vfio_pci_dma_buf_done);wait_for_completion(&priv->comp);unmap_mapping_range(priv->dmabuf->file->f_mapping,0, priv->size, 1);/** Re-arm the registered kref reference and the* completion so the post-revoke state matches the* post-creation state. An un-revoke followed by a* new mapping needs the kref to be non-zero before* kref_get(), and vfio_pci_dma_buf_cleanup()* delegates its drain back through this revoke* path on a possibly-already-revoked dma-buf.*/kref_init(&priv->kref);reinit_completion(&priv->comp);- } else {
dma_resv_lock(priv->dmabuf->resv, NULL);priv->status = VFIO_PCI_DMABUF_OK;dma_resv_unlock(priv->dmabuf->resv);- }
+}
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) { struct vfio_pci_dma_buf *priv; @@ -593,44 +651,13 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) lockdep_assert_held_write(&vdev->memory_lock); /* * Holding memory_lock ensures a racing VMA fault observes
* priv->revoked properly.
*/* priv->status properly.list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { if (!get_file_active(&priv->dmabuf->file)) continue;
if (priv->revoked != revoked) {dma_resv_lock(priv->dmabuf->resv, NULL);if (revoked)priv->revoked = true;dma_buf_invalidate_mappings(priv->dmabuf);dma_resv_wait_timeout(priv->dmabuf->resv,DMA_RESV_USAGE_BOOKKEEP, false,MAX_SCHEDULE_TIMEOUT);dma_resv_unlock(priv->dmabuf->resv);if (revoked) {kref_put(&priv->kref, vfio_pci_dma_buf_done);wait_for_completion(&priv->comp);unmap_mapping_range(priv->dmabuf->file->f_mapping,0, priv->size, 1);/** Re-arm the registered kref reference and the* completion so the post-revoke state matches the* post-creation state. An un-revoke followed by a* new mapping needs the kref to be non-zero before* kref_get(), and vfio_pci_dma_buf_cleanup()* delegates its drain back through this revoke* path on a possibly-already-revoked dma-buf.*/kref_init(&priv->kref);reinit_completion(&priv->comp);} else {dma_resv_lock(priv->dmabuf->resv, NULL);priv->revoked = false;dma_resv_unlock(priv->dmabuf->resv);}}
fput(priv->dmabuf->file); }__vfio_pci_dma_buf_revoke(priv, revoked, false);} @@ -662,3 +689,46 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) } up_write(&vdev->memory_lock); }
+#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev, int dmabuf_fd) +{
- struct vfio_pci_dma_buf *priv;
- struct dma_buf *dmabuf;
- int ret = 0;
- dmabuf = dma_buf_get(dmabuf_fd);
- if (IS_ERR(dmabuf))
return PTR_ERR(dmabuf);- priv = dmabuf->priv;
- /*
* Sanity-check the DMABUF is really a vfio_pci_dma_buf _and_* relates to the VFIO device it was provided with.** If the DMABUF relates to this vdev then priv->vdev is* stable because this open fd prevents cleanup.** If it relates to a different vdev, reading priv->vdev might* race with a concurrent cleanup on that device. But if so,* it points to a non-matching vdev or NULL and is unusable* either way.*/- if (dmabuf->ops != &vfio_pci_dmabuf_ops || priv->vdev != vdev) {
ret = -ENODEV;goto out_put_buf;- }
- scoped_guard(rwsem_write, &vdev->memory_lock) {
if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED)ret = -EBADFD;else__vfio_pci_dma_buf_revoke(priv, true, true);- }
- out_put_buf:
- dma_buf_put(dmabuf);
- return ret;
+} +#endif /* CONFIG_VFIO_PCI_DMABUF */ diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index db2e2aeae88f..a1e0f4fcb1dc 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,6 +23,12 @@ struct vfio_pci_ioeventfd { bool test_mem; }; +enum vfio_pci_dma_buf_status {
- VFIO_PCI_DMABUF_OK = 0,
- VFIO_PCI_DMABUF_TEMP_REVOKED = 1,
- VFIO_PCI_DMABUF_PERM_REVOKED = 2,
+};
struct vfio_pci_dma_buf { struct dma_buf *dmabuf; struct vfio_pci_core_device *vdev; @@ -35,7 +41,7 @@ struct vfio_pci_dma_buf { struct kref kref; struct completion comp; unsigned long vma_pgoff_adjust;
- u8 revoked : 1;
- enum vfio_pci_dma_buf_status status;
}; extern const struct vm_operations_struct vfio_pci_mmap_ops; @@ -148,6 +154,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_feature_dma_buf __user *arg, size_t argsz); +int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev, int dmabuf_fd); #else static inline int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, @@ -156,6 +163,11 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, { return -ENOTTY; } +static inline int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev,
int dmabuf_fd)+{
- return -ENODEV;
+} #endif #endif diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5de618a3a5ee..02366e9f8e16 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1321,6 +1321,36 @@ struct vfio_precopy_info { #define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) +/**
- VFIO_DEVICE_PCI_DMABUF_REVOKE - _IO(VFIO_TYPE, VFIO_BASE + 22)
- This ioctl is used on the device FD, and requests that access to
- the buffer corresponding to the DMABUF FD parameter is immediately
- and permanently revoked. On successful return, the buffer is not
- accessible through any mmap() or dma-buf import. The request fails
- if the buffer is pinned; otherwise, the exporter marks the buffer
- as inaccessible and uses the move_notify callback to inform
- importers of the change. The buffer is permanently disabled, and
- VFIO refuses all map, mmap, attach, etc. requests.
- Returns:
- Return: 0 on success, -1 and errno set on failure:
- ENODEV if the associated dmabuf FD no longer exists/is closed,
These actually seem to map to EBADF/EINVAL. Thanks,
Alex
or is not a DMABUF created for this device.
- EINVAL if the dmabuf_fd parameter isn't a DMABUF.
- EBADF if the dmabuf_fd parameter isn't a valid file number.
- EBADFD if the buffer has already been revoked.
- */
+struct vfio_pci_dmabuf_revoke {
- __u32 argsz;
- __s32 dmabuf_fd;
+};
+#define VFIO_DEVICE_PCI_DMABUF_REVOKE _IO(VFIO_TYPE, VFIO_BASE + 22)
/*
- Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power
- state with the platform-based power management. Device use of lower power