On 8/7/2025 12:34 AM, Dave Hansen wrote:
On 8/6/25 09:09, Jason Gunthorpe wrote:
You can't do this approach without also pushing the pages to be freed onto a list and deferring the free until the work runs. This is broadly what the normal mm user flow is doing..
FWIW, I think the simplest way to do this is to plop an unconditional schedule_work() in pte_free_kernel(). The work function will invalidate the IOTLBs and then free the page.
Keep the schedule_work() unconditional to keep it simple. The schedule_work() is way cheaper than all the system-wide TLB invalidation IPIs that have to get sent as well. No need to add complexity to optimize out something that's in the noise already.
That works also, but now you have to allocate memory or you are dead. Is that OK these days, and safe in this code, which seems a little bit tied to memory management?
The MM side avoided this by putting the list and the rcu_head in the struct page.
I don't think you need to allocate memory. A little static structure that uses the page->list and has a lock should do. Logically something like this:
struct kernel_pgtable_work {
	struct list_head list;
	spinlock_t lock;
	struct work_struct work;
} kernel_pte_work;

pte_free_kernel()
{
	struct page *page = ptdesc_magic();

	guard(spinlock)(&kernel_pte_work.lock);
	list_add(&page->list, &kernel_pte_work.list);
	schedule_work(&kernel_pte_work.work);
}

work_func()
{
	iommu_sva_invalidate_kva();

	guard(spinlock)(&kernel_pte_work.lock);
	list_for_each_safe() {
		page = container_of(...);
		free_whatever(page);
	}
}
The only wrinkle is that pte_free_kernel() itself still has a pte and 'ptdesc', not a 'struct page'. But there is ptdesc->pt_list, which should be unused at this point, especially for non-pgd pages on x86.
So, either go over to the 'struct page' earlier (maybe by open-coding pagetable_dtor_free()?), or just use the ptdesc.
I refactored the code above as follows. It compiles but hasn't been tested yet. Does it look good to you?
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c88691b15f3c..d9307dd09f67 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -10,9 +10,11 @@
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #define __HAVE_ARCH_PGD_FREE
+#define __HAVE_ARCH_PTE_FREE_KERNEL
 #include <asm-generic/pgalloc.h>
 
 static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
 
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ddf248c3ee7d..f9f6738dd3cc 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <linux/hugetlb.h>
+#include <linux/iommu.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
@@ -844,3 +845,42 @@ void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
 	/* See note in arch_check_zapped_pte() */
 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
 }
+
+static void kernel_pte_work_func(struct work_struct *work);
+
+static struct {
+	struct list_head list;
+	spinlock_t lock;
+	struct work_struct work;
+} kernel_pte_work = {
+	.list = LIST_HEAD_INIT(kernel_pte_work.list),
+	.lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
+	.work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
+};
+
+static void kernel_pte_work_func(struct work_struct *work)
+{
+	struct page *page, *next;
+
+	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_for_each_entry_safe(page, next, &kernel_pte_work.list, lru) {
+		list_del_init(&page->lru);
+		pagetable_dtor_free(page_ptdesc(page));
+	}
+}
+
+/**
+ * pte_free_kernel - free PTE-level kernel page table memory
+ * @mm: the mm_struct of the current context
+ * @pte: pointer to the memory containing the page table
+ */
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	struct page *page = virt_to_page(pte);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_add(&page->lru, &kernel_pte_work.list);
+	schedule_work(&kernel_pte_work.work);
+}
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..716ebab67636 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 #define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif
 
+#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
 /**
  * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
@@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	pagetable_dtor_free(virt_to_ptdesc(pte));
 }
+#endif
 
 /**
  * __pte_alloc_one - allocate memory for a PTE-level user page table