On Tue, Oct 25, 2022 at 03:12:24PM -0300, Jason Gunthorpe wrote:
+static void iommufd_test_access_unmap(void *data, unsigned long iova,
unsigned long length)
+{
- unsigned long iova_last = iova + length - 1;
- struct selftest_access *staccess = data;
- struct selftest_access_item *item;
- struct selftest_access_item *tmp;
- spin_lock(&staccess->lock);
- list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) {
if (iova > item->iova_end || iova_last < item->iova)
continue;
list_del(&item->items_elm);
spin_unlock(&staccess->lock);
iommufd_access_unpin_pages(staccess->access, item->iova,
item->length);
kfree(item);
spin_lock(&staccess->lock);
- }
- spin_unlock(&staccess->lock);
+}
+static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
unsigned int access_id, unsigned long iova,
size_t length, void __user *uptr,
u32 flags)
+{
- struct iommu_test_cmd *cmd = ucmd->cmd;
- struct selftest_access_item *item;
- struct selftest_access *staccess;
- struct page **pages;
- size_t npages;
- int rc;
- if (flags & ~MOCK_FLAGS_ACCESS_WRITE)
return -EOPNOTSUPP;
- staccess = iommufd_access_get(access_id);
- if (IS_ERR(staccess))
return PTR_ERR(staccess);
- npages = (ALIGN(iova + length, PAGE_SIZE) -
ALIGN_DOWN(iova, PAGE_SIZE)) /
PAGE_SIZE;
- pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT);
- if (!pages) {
rc = -ENOMEM;
goto out_put;
- }
- rc = iommufd_access_pin_pages(staccess->access, iova, length, pages,
flags & MOCK_FLAGS_ACCESS_WRITE);
- if (rc)
goto out_free_pages;
- rc = iommufd_test_check_pages(
uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages, npages);
- if (rc)
goto out_unaccess;
- item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT);
- if (!item) {
rc = -ENOMEM;
goto out_unaccess;
- }
- item->iova = iova;
- item->length = length;
- spin_lock(&staccess->lock);
- item->id = staccess->next_id++;
- list_add_tail(&item->items_elm, &staccess->items);
- spin_unlock(&staccess->lock);
I haven't been remarking on the bugs that syzkaller finds in the test suite itself (sigh), but this one is surprising and complicated enough to deserve some wider attention.
VFIO has a protocol, which has been mapped into iommufd, that allows an external driver to convert an IOVA to a struct page *. iommufd natively represents this as the sequence:
    access = iommufd_access_create(ops)
    iommufd_access_pin_pages(access, iova, length, pages)
    iommufd_access_unpin_pages(access, iova, length)
One of the quirks of the VFIO design is that if userspace does an unmap then the unmap shall succeed, but like in a HW iommu, the above pin_pages is revoked and the external driver must stop accessing that memory. iommufd achieves this by calling a callback:
static const struct iommufd_access_ops selftest_access_ops = { .unmap = iommufd_test_access_unmap, };
Which has the invariant that upon return the unpin_pages must be completed.
This all sounds simple enough, but when you throw syzkaller at this it generates all kinds of races, including something like this:
    CPU1                          CPU2                            CPU3
    iommufd_access_create()
    iommufd_access_pin_pages()
                                  unmap_all()
                                    iommufd_test_access_unmap()
                                                                  unmap_all()
                                                                    iommufd_test_access_unmap()
    spin_lock(&staccess->lock);
    list_add_tail(&item->items_elm, &staccess->items);
And of course since the list_add_tail is in the wrong order it means iommufd_test_access_unmap() doesn't see it and doesn't undo it, triggering a WARN_ON.
The only way I can see to solve this is to hold a serializing lock across iommufd_access_pin_pages() so that no concurrent iommufd_test_access_unmap() call can progress until both the pin is completed and the record of the pin is stored.
Fortunately, in the iommufd design we can hold a lock like this across these calls, and in the op callback, without deadlocking. I can't recall if vfio can do the same; I suspect not, since I had in my mind that I needed to avoid that kind of locking for deadlock reasons.
I doubt any mdev drivers do this properly, so this will be some oddball bugs. Thankfully it doesn't harm kernel integrity, but it does leave a mess for a userspace vIOMMU which is tracking a guest command to unmap an IOVA range and the kernel chucked out a WARN_ON and told it EDEADLOCK. I guess sleep and retry?
Anyhow, the below seems to have fixed it. And this is the last open syzkaller bug, the rest were dups of the prior one. Now we wait for it to find something else.
Jason
@@ -420,7 +420,7 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, struct selftest_access { struct iommufd_access *access; struct file *file; - spinlock_t lock; + struct mutex lock; struct list_head items; unsigned int next_id; bool destroying; @@ -458,19 +458,17 @@ static void iommufd_test_access_unmap(void *data, unsigned long iova, struct selftest_access_item *item; struct selftest_access_item *tmp;
- spin_lock(&staccess->lock); + mutex_lock(&staccess->lock); list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) { if (iova > item->iova + item->length - 1 || iova_last < item->iova) continue; list_del(&item->items_elm); - spin_unlock(&staccess->lock); iommufd_access_unpin_pages(staccess->access, item->iova, item->length); kfree(item); - spin_lock(&staccess->lock); } - spin_unlock(&staccess->lock); + mutex_unlock(&staccess->lock); }
static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd, @@ -484,19 +482,19 @@ static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd, if (IS_ERR(staccess)) return PTR_ERR(staccess);
- spin_lock(&staccess->lock); + mutex_lock(&staccess->lock); list_for_each_entry(item, &staccess->items, items_elm) { if (item->id == item_id) { list_del(&item->items_elm); - spin_unlock(&staccess->lock); iommufd_access_unpin_pages(staccess->access, item->iova, item->length); + mutex_unlock(&staccess->lock); kfree(item); fput(staccess->file); return 0; } } - spin_unlock(&staccess->lock); + mutex_unlock(&staccess->lock); fput(staccess->file); return -ENOENT; } @@ -510,6 +508,7 @@ static int iommufd_test_staccess_release(struct inode *inode, iommufd_test_access_unmap(staccess, 0, ULONG_MAX); iommufd_access_destroy(staccess->access); } + mutex_destroy(&staccess->lock); kfree(staccess); return 0; } @@ -536,7 +535,7 @@ static struct selftest_access *iommufd_test_alloc_access(void) if (!staccess) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&staccess->items); - spin_lock_init(&staccess->lock); + mutex_init(&staccess->lock);
filep = anon_inode_getfile("[iommufd_test_staccess]", &iommfd_test_staccess_fops, staccess, @@ -662,10 +661,20 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, goto out_put; }
+ /* + * Drivers will need to think very carefully about this locking. The + * core code can do multiple unmaps instantaneously after + * iommufd_access_pin_pages() and *all* the unmaps must not return until + * the range is unpinned. This simple implementation puts a global lock + * around the pin, which may not suit drivers that want this to be a + * performance path. drivers that get this wrong will trigger WARN_ON + * races and cause EDEADLOCK failures to userspace. + */ + mutex_lock(&staccess->lock); rc = iommufd_access_pin_pages(staccess->access, iova, length, pages, flags & MOCK_FLAGS_ACCESS_WRITE); if (rc) - goto out_free_pages; + goto out_unlock;
/* For syzkaller allow uptr to be NULL to skip this check */ if (uptr) { @@ -684,25 +693,22 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
item->iova = iova; item->length = length; - spin_lock(&staccess->lock); item->id = staccess->next_id++; list_add_tail(&item->items_elm, &staccess->items); - spin_unlock(&staccess->lock);
cmd->access_pages.out_access_pages_id = item->id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_free_item; - goto out_free_pages; + goto out_unlock;
out_free_item: - spin_lock(&staccess->lock); list_del(&item->items_elm); - spin_unlock(&staccess->lock); kfree(item); out_unaccess: iommufd_access_unpin_pages(staccess->access, iova, length); -out_free_pages: +out_unlock: + mutex_unlock(&staccess->lock); kvfree(pages); out_put: fput(staccess->file);