On 8/8/2025 1:15 PM, Baolu Lu wrote:
On 8/7/25 23:31, Dave Hansen wrote:
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	struct page *page = virt_to_page(pte);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_add(&page->lru, &kernel_pte_work.list);
+	schedule_work(&kernel_pte_work.work);
+}
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..716ebab67636 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 #define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif
 
+#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
 /**
  * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
@@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	pagetable_dtor_free(virt_to_ptdesc(pte));
 }
+#endif
 
 /**
  * __pte_alloc_one - allocate memory for a PTE-level user page table
I'd much rather the arch-generic code looked like this:
#ifdef CONFIG_ASYNC_PGTABLE_FREE
	// code and struct here, or dump them over in some
	// other file and do this in a header
#else
static void pte_free_kernel_async(struct page *page) {}
#endif
void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
	struct page *page = virt_to_page(pte);

	if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
		pte_free_kernel_async(page);
	else
		pagetable_dtor_free(page_ptdesc(page));
}
Then in Kconfig, you end up with something like:
config ASYNC_PGTABLE_FREE
	def_bool y
	depends on INTEL_IOMMU_WHATEVER
That tells much more of the whole story in code. It also gives the x86 folks who compile out the IOMMU the exact same code as the arch-generic folks. It _also_ makes it dirt simple and obvious for the x86 folks to optimize out the async behavior in the future, if they don't like it, by replacing the compile-time IOMMU check with a runtime one.
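For illustration, a runtime variant of that check could look something like the sketch below. This is only a sketch: kernel_pte_free_is_async() is a hypothetical helper, not an existing kernel API, standing in for "an SVA-capable IOMMU is actually active":

void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
	struct page *page = virt_to_page(pte);

	/* Hypothetical runtime check in place of the compile-time IS_ENABLED() */
	if (kernel_pte_free_is_async())
		pte_free_kernel_async(page);
	else
		pagetable_dtor_free(page_ptdesc(page));
}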
Also, if another crazy IOMMU implementation comes along that happens to do what the x86 IOMMUs do, then they have a single Kconfig switch to flip. If they follow what this patch tries to do, they'll start by copying and pasting the x86 implementation.
I'll do it like this. Does that look good to you?
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..6f1113e024fa 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -160,6 +160,7 @@ config IOMMU_DMA
 # Shared Virtual Addressing
 config IOMMU_SVA
 	select IOMMU_MM_DATA
+	select ASYNC_PGTABLE_FREE if X86
 	bool
 
 config IOMMU_IOPF
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..dbddacdca2ce 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,19 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 #define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif
 
+#ifdef CONFIG_ASYNC_PGTABLE_FREE
+struct pgtable_free_work {
+	struct list_head list;
+	spinlock_t lock;
+	struct work_struct work;
+};
+extern struct pgtable_free_work kernel_pte_work;
+
+void pte_free_kernel_async(struct ptdesc *ptdesc);
+#else
+static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {}
+#endif
+
 /**
  * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
@@ -53,7 +66,12 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
  */
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-	pagetable_dtor_free(virt_to_ptdesc(pte));
+	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
+
+	if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
+		pte_free_kernel_async(ptdesc);
+	else
+		pagetable_dtor_free(ptdesc);
 }
 
 /**
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..528550cfa7fe 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1346,6 +1346,13 @@ config LOCK_MM_AND_FIND_VMA
 config IOMMU_MM_DATA
 	bool
 
+config ASYNC_PGTABLE_FREE
+	bool "Asynchronous kernel page table freeing"
+	help
+	  Perform kernel page table freeing asynchronously. This is required
+	  for systems with IOMMU Shared Virtual Address (SVA) to flush IOTLB
+	  paging structure caches.
+
 config EXECMEM
 	bool
 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..6639ee6641d4 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mm_inline.h>
+#include <linux/iommu.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
@@ -406,3 +407,32 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
 	pte_unmap_unlock(pte, ptl);
 	goto again;
 }
+
+#ifdef CONFIG_ASYNC_PGTABLE_FREE
+static void kernel_pte_work_func(struct work_struct *work);
+struct pgtable_free_work kernel_pte_work = {
+	.list = LIST_HEAD_INIT(kernel_pte_work.list),
+	.lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
+	.work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
+};
+
+static void kernel_pte_work_func(struct work_struct *work)
+{
+	struct ptdesc *ptdesc, *next;
+
+	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list, pt_list) {
+		list_del_init(&ptdesc->pt_list);
+		pagetable_dtor_free(ptdesc);
+	}
+}
+
+void pte_free_kernel_async(struct ptdesc *ptdesc)
+{
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_add(&ptdesc->pt_list, &kernel_pte_work.list);
+	schedule_work(&kernel_pte_work.work);
+}
kernel_pte_work.list is a global shared variable, so it forces the producer, pte_free_kernel(), and the consumer, kernel_pte_work_func(), to serialize on a single lock. On a large system, I don't think you designed this deliberately :)
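One way to avoid most of that serialization (a rough, untested sketch against the same structures as above) would be to splice the pending entries onto a private list under the lock and do the actual freeing outside of it:

static void kernel_pte_work_func(struct work_struct *work)
{
	struct ptdesc *ptdesc, *next;
	LIST_HEAD(free_list);

	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);

	/* Hold the lock only long enough to detach the pending entries. */
	spin_lock(&kernel_pte_work.lock);
	list_splice_init(&kernel_pte_work.list, &free_list);
	spin_unlock(&kernel_pte_work.lock);

	/* Producers can queue new PTE pages while these are freed. */
	list_for_each_entry_safe(ptdesc, next, &free_list, pt_list) {
		list_del_init(&ptdesc->pt_list);
		pagetable_dtor_free(ptdesc);
	}
}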
Thanks,
Ethan
+#endif