Hi All,
Changes since v2:
- page allocation moved out of the atomic context

Changes since v1:
- Fixes: and -stable tags added to the patch description
Thanks!
Alexander Gordeev (1):
  kasan: Avoid sleepable page allocation from atomic context

 mm/kasan/shadow.c | 65 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 16 deletions(-)
apply_to_pte_range() enters the lazy MMU mode and then invokes the kasan_populate_vmalloc_pte() callback on each page table walk iteration. However, the callback can sleep when trying to allocate a single page, e.g. if an architecture disables preemption on lazy MMU mode enter.
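The problematic flow, heavily simplified from apply_to_pte_range() in mm/memory.c, looks roughly like this:

	arch_enter_lazy_mmu_mode();	/* may imply preempt_disable() */
	do {
		...
		err = fn(pte++, addr, data);	/* kasan_populate_vmalloc_pte() */
		...
	} while (addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();

so any GFP_KERNEL allocation made by the callback may end up sleeping while preemption is disabled.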
On s390, if arch_enter_lazy_mmu_mode() is made to call preempt_disable() and arch_leave_lazy_mmu_mode() to call preempt_enable(), the following crash occurs:
[ 553.332108] preempt_count: 1, expected: 0
[ 553.332117] no locks held by multipathd/2116.
[ 553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
[ 553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
[ 553.332146] Call Trace:
[ 553.332152]  [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
[ 553.332167]  [<0000000013e10d12>] __might_resched+0x57a/0x5e8
[ 553.332178]  [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
[ 553.332189]  [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
[ 553.332198]  [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
[ 553.332207]  [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
[ 553.332218]  [<000000001448125a>] apply_to_pmd_range+0xda/0x318
[ 553.332226]  [<000000001448181c>] __apply_to_page_range+0x384/0x768
[ 553.332233]  [<0000000014481c28>] apply_to_page_range+0x28/0x38
[ 553.332241]  [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
[ 553.332249]  [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
[ 553.332257]  [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
[ 553.332265]  [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
[ 553.332274]  [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
[ 553.332284]  [<0000000013d62726>] dup_task_struct+0x66/0x430
[ 553.332293]  [<0000000013d63962>] copy_process+0x432/0x4b80
[ 553.332302]  [<0000000013d68300>] kernel_clone+0xf0/0x7d0
[ 553.332311]  [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
[ 553.332400]  [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
[ 553.332410]  [<0000000013c9d34c>] do_syscall+0x22c/0x328
[ 553.332419]  [<00000000158e7366>] __do_syscall+0xce/0xf0
[ 553.332428]  [<0000000015913260>] system_call+0x70/0x98
Instead of allocating single pages per-PTE, bulk-allocate the shadow memory prior to applying the kasan_populate_vmalloc_pte() callback to a page range.
Suggested-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: stable@vger.kernel.org
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
 mm/kasan/shadow.c | 65 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 16 deletions(-)
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..ea9a06715a81 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,30 +292,65 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
 {
 }
 
+struct vmalloc_populate_data {
+	unsigned long start;
+	struct page **pages;
+};
+
 static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
-				      void *unused)
+				      void *_data)
 {
-	unsigned long page;
+	struct vmalloc_populate_data *data = _data;
+	struct page *page;
+	unsigned long pfn;
 	pte_t pte;
 
 	if (likely(!pte_none(ptep_get(ptep))))
 		return 0;
 
-	page = __get_free_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
-	__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
-	pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+	page = data->pages[PFN_DOWN(addr - data->start)];
+	pfn = page_to_pfn(page);
+	__memset(pfn_to_virt(pfn), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+	pte = pfn_pte(pfn, PAGE_KERNEL);
 
 	spin_lock(&init_mm.page_table_lock);
-	if (likely(pte_none(ptep_get(ptep)))) {
+	if (likely(pte_none(ptep_get(ptep))))
 		set_pte_at(&init_mm, addr, ptep, pte);
-		page = 0;
-	}
 	spin_unlock(&init_mm.page_table_lock);
-	if (page)
-		free_page(page);
+
+	return 0;
+}
+
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+	unsigned long nr_pages, nr_total = PFN_UP(end - start);
+	struct vmalloc_populate_data data;
+	int ret;
+
+	data.pages = (struct page **)__get_free_page(GFP_KERNEL);
+	if (!data.pages)
+		return -ENOMEM;
+
+	while (nr_total) {
+		nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+		__memset(data.pages, 0, nr_pages * sizeof(data.pages[0]));
+		if (nr_pages != alloc_pages_bulk(GFP_KERNEL, nr_pages, data.pages)) {
+			free_page((unsigned long)data.pages);
+			return -ENOMEM;
+		}
+
+		data.start = start;
+		ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+					  kasan_populate_vmalloc_pte, &data);
+		if (ret)
+			return ret;
+
+		start += nr_pages * PAGE_SIZE;
+		nr_total -= nr_pages;
+	}
+
+	free_page((unsigned long)data.pages);
+
 	return 0;
 }
@@ -348,9 +383,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
 	shadow_start = PAGE_ALIGN_DOWN(shadow_start);
 	shadow_end = PAGE_ALIGN(shadow_end);
 
-	ret = apply_to_page_range(&init_mm, shadow_start,
-				  shadow_end - shadow_start,
-				  kasan_populate_vmalloc_pte, NULL);
+	ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
 	if (ret)
 		return ret;
On Tue, Apr 29, 2025 at 06:08:41PM +0200, Alexander Gordeev wrote:
> apply_to_pte_range() enters the lazy MMU mode and then invokes the
> kasan_populate_vmalloc_pte() callback on each page table walk iteration.
> However, the callback can sleep when trying to allocate a single page,
> e.g. if an architecture disables preemption on lazy MMU mode enter.
Should we add a comment that pte_fn_t must not sleep in apply_to_pte_range()?
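Maybe something like this next to the pte_fn_t typedef (just a sketch, wording to be decided):

	/*
	 * The callback is invoked between arch_enter_lazy_mmu_mode() and
	 * arch_leave_lazy_mmu_mode(), which may imply disabled preemption
	 * on some architectures, so it must not sleep or perform sleepable
	 * allocations.
	 */
	typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);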
> On s390, if arch_enter_lazy_mmu_mode() is made to call preempt_disable()
> and arch_leave_lazy_mmu_mode() to call preempt_enable(), the following
> crash occurs:
> [ 553.332108] preempt_count: 1, expected: 0
> [ 553.332117] no locks held by multipathd/2116.
> [ 553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
> [ 553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
> [ 553.332146] Call Trace:
> [ 553.332152]  [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
> [ 553.332167]  [<0000000013e10d12>] __might_resched+0x57a/0x5e8
> [ 553.332178]  [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
> [ 553.332189]  [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
> [ 553.332198]  [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
> [ 553.332207]  [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
> [ 553.332218]  [<000000001448125a>] apply_to_pmd_range+0xda/0x318
> [ 553.332226]  [<000000001448181c>] __apply_to_page_range+0x384/0x768
> [ 553.332233]  [<0000000014481c28>] apply_to_page_range+0x28/0x38
> [ 553.332241]  [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
> [ 553.332249]  [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
> [ 553.332257]  [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
> [ 553.332265]  [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
> [ 553.332274]  [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
> [ 553.332284]  [<0000000013d62726>] dup_task_struct+0x66/0x430
> [ 553.332293]  [<0000000013d63962>] copy_process+0x432/0x4b80
> [ 553.332302]  [<0000000013d68300>] kernel_clone+0xf0/0x7d0
> [ 553.332311]  [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
> [ 553.332400]  [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
> [ 553.332410]  [<0000000013c9d34c>] do_syscall+0x22c/0x328
> [ 553.332419]  [<00000000158e7366>] __do_syscall+0xce/0xf0
> [ 553.332428]  [<0000000015913260>] system_call+0x70/0x98
> Instead of allocating single pages per-PTE, bulk-allocate the shadow
> memory prior to applying the kasan_populate_vmalloc_pte() callback to a
> page range.
> Suggested-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
> Cc: stable@vger.kernel.org
> Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
> ---
>  mm/kasan/shadow.c | 65 +++++++++++++++++++++++++++++++++++------------
>  1 file changed, 49 insertions(+), 16 deletions(-)
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index 88d1c9dcb507..ea9a06715a81 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -292,30 +292,65 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
>  {
>  }
>  
> +struct vmalloc_populate_data {
> +	unsigned long start;
> +	struct page **pages;
> +};
> +
>  static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
> -				      void *unused)
> +				      void *_data)
>  {
> -	unsigned long page;
> +	struct vmalloc_populate_data *data = _data;
> +	struct page *page;
> +	unsigned long pfn;
>  	pte_t pte;
>  
>  	if (likely(!pte_none(ptep_get(ptep))))
>  		return 0;
>  
> -	page = __get_free_page(GFP_KERNEL);
> -	if (!page)
> -		return -ENOMEM;
> -
> -	__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
> -	pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
> +	page = data->pages[PFN_DOWN(addr - data->start)];
> +	pfn = page_to_pfn(page);
> +	__memset(pfn_to_virt(pfn), KASAN_VMALLOC_INVALID, PAGE_SIZE);
> +	pte = pfn_pte(pfn, PAGE_KERNEL);
>  
>  	spin_lock(&init_mm.page_table_lock);
> -	if (likely(pte_none(ptep_get(ptep)))) {
> +	if (likely(pte_none(ptep_get(ptep))))
>  		set_pte_at(&init_mm, addr, ptep, pte);
> -		page = 0;
With this patch, if the pte is already set, the corresponding page is now leaked, right?
Should we set data->pages[PFN_DOWN(addr - data->start)] = NULL and free non-null elements later in __kasan_populate_vmalloc()?
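Something along these lines, perhaps (untested):

	spin_lock(&init_mm.page_table_lock);
	if (likely(pte_none(ptep_get(ptep)))) {
		set_pte_at(&init_mm, addr, ptep, pte);
		/* mark the page as consumed so it is not freed later */
		data->pages[PFN_DOWN(addr - data->start)] = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);

and then let __kasan_populate_vmalloc() free whatever entries are still non-NULL once apply_to_page_range() returns.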
> -	}
>  	spin_unlock(&init_mm.page_table_lock);
> -	if (page)
> -		free_page(page);
> +
> +	return 0;
> +}
> +
> +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
> +{
> +	unsigned long nr_pages, nr_total = PFN_UP(end - start);
> +	struct vmalloc_populate_data data;
> +	int ret;
> +
> +	data.pages = (struct page **)__get_free_page(GFP_KERNEL);
> +	if (!data.pages)
> +		return -ENOMEM;
> +
> +	while (nr_total) {
> +		nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
> +		__memset(data.pages, 0, nr_pages * sizeof(data.pages[0]));
> +		if (nr_pages != alloc_pages_bulk(GFP_KERNEL, nr_pages, data.pages)) {
When the return value of alloc_pages_bulk() is less than nr_pages, you still need to free the pages already placed in the array (unless the returned count is zero).
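I.e. something like this (untested, assumes extra nr_allocated/i locals):

		nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_pages, data.pages);
		if (nr_allocated != nr_pages) {
			/* release the pages that were allocated before bailing out */
			for (i = 0; i < nr_allocated; i++)
				__free_page(data.pages[i]);
			free_page((unsigned long)data.pages);
			return -ENOMEM;
		}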
> +			free_page((unsigned long)data.pages);
> +			return -ENOMEM;
> +		}
> +
> +		data.start = start;
> +		ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
> +					  kasan_populate_vmalloc_pte, &data);
> +		if (ret)
> +			return ret;
> +
> +		start += nr_pages * PAGE_SIZE;
> +		nr_total -= nr_pages;
> +	}
> +
> +	free_page((unsigned long)data.pages);
> +
>  	return 0;
>  }
> 
> @@ -348,9 +383,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
>  	shadow_start = PAGE_ALIGN_DOWN(shadow_start);
>  	shadow_end = PAGE_ALIGN(shadow_end);
>  
> -	ret = apply_to_page_range(&init_mm, shadow_start,
> -				  shadow_end - shadow_start,
> -				  kasan_populate_vmalloc_pte, NULL);
> +	ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
>  	if (ret)
>  		return ret;
> 2.45.2
On Tue, 29 Apr 2025 18:08:41 +0200 Alexander Gordeev <agordeev@linux.ibm.com> wrote:
> +	__memset(pfn_to_virt(pfn), KASAN_VMALLOC_INVALID, PAGE_SIZE);
pfn_to_virt() exists only in riscv, s390, m68k, loongarch and microblaze.
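A more portable alternative might be to go through the struct page directly, e.g. (untested):

	__memset(page_address(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);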