In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. The x86 architecture maps the kernel's virtual address space into the upper portion of every process's page table. Consequently, in an SVA context, the IOMMU hardware can walk and cache kernel page table entries.
The Linux kernel currently lacks a notification mechanism for kernel page table changes, specifically when page table pages are freed and reused. The IOMMU driver is only notified of changes to user virtual address mappings. This can cause the IOMMU's internal caches to retain stale entries for kernel VA.
A Use-After-Free (UAF) and Write-After-Free (WAF) condition arises when kernel page table pages are freed and later reallocated. The IOMMU could misinterpret the new data as valid page table entries. The IOMMU might then walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation. This is also a Write-After-Free issue, as the IOMMU will potentially continue to write Accessed and Dirty bits to the freed memory while attempting to walk the stale page tables.
Currently, SVA contexts are unprivileged and cannot access kernel mappings. However, the IOMMU will still walk kernel-only page tables all the way down to the leaf entries, where it realizes the mapping is for the kernel and errors out. This means the IOMMU still caches these intermediate page table entries, making the described vulnerability a real concern.
To mitigate this, a new IOMMU interface is introduced to flush IOTLB entries for the kernel address space. This interface is invoked from the x86 architecture code that manages combined user and kernel page tables, specifically when a kernel page table update requires a CPU TLB flush.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices") Cc: stable@vger.kernel.org Suggested-by: Jann Horn jannh@google.com Co-developed-by: Jason Gunthorpe jgg@nvidia.com Signed-off-by: Jason Gunthorpe jgg@nvidia.com Signed-off-by: Lu Baolu baolu.lu@linux.intel.com Reviewed-by: Jason Gunthorpe jgg@nvidia.com Reviewed-by: Vasant Hegde vasant.hegde@amd.com Reviewed-by: Kevin Tian kevin.tian@intel.com --- drivers/iommu/iommu-sva.c | 29 ++++++++++++++++++++++++++++- include/linux/iommu.h | 4 ++++ mm/pgtable-generic.c | 2 ++ 3 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 1a51cfd82808..d236aef80a8d 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,8 @@ #include "iommu-priv.h"
static DEFINE_MUTEX(iommu_sva_lock); +static bool iommu_sva_present; +static LIST_HEAD(iommu_sva_mms); static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm);
@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; + iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); /* * Make sure the write to mm->iommu_mm is not reordered in front of @@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; - list_add(&domain->next, &mm->iommu_mm->sva_domains);
+ if (list_empty(&iommu_mm->sva_domains)) { + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = true; + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); + } + list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); @@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); } + + if (list_empty(&iommu_mm->sva_domains)) { + list_del(&iommu_mm->mm_list_elm); + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = false; + } + mutex_unlock(&iommu_sva_lock); kfree(handle); } @@ -312,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
return domain; } + +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{ + struct iommu_mm_data *iommu_mm; + + guard(mutex)(&iommu_sva_lock); + if (!iommu_sva_present) + return; + + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); +} diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..66e4abb2df0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1134,7 +1134,9 @@ struct iommu_sva {
struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; };
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) }
static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */
#ifdef CONFIG_IOMMU_IOPF diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index cf884e8300ca..4c81ca8b196f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> +#include <linux/iommu.h> #include <asm/pgalloc.h> #include <asm/tlb.h>
@@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(struct work_struct *work) list_splice_tail_init(&kernel_pgtable_work.list, &page_list); spin_unlock(&kernel_pgtable_work.lock);
+ iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) { list_del(&pt->pt_list); __pagetable_free(pt);
linux-stable-mirror@lists.linaro.org