The KVM_ASYNC_PF_SEND_ALWAYS ABI is broken and we cannot support
it properly. Turn it off. If this causes a meaningful performance
regression for someone, KVM can introduce an improved ABI that is
supportable.
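A rough sketch of the kind of interleaving that goes wrong (an
illustrative trace, not taken from a real machine):

  guest: takes a synchronous #PF on address A    -> CR2 = A
  guest: early #PF entry code (or an NMI hitting it)
         touches memory the host has paged out
  host:  injects an async #PF for that access    -> CR2 = B, apf_reason set
  guest: the original #PF handler resumes and reads CR2 = B
         plus a reason field belonging to the nested fault

At that point there is no way for the guest to recover the
original CR2/reason pair.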
Cc: stable(a)vger.kernel.org
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
---
arch/x86/kernel/kvm.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 93ab0cbd304e..71f9f39f93da 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -318,11 +318,16 @@ static void kvm_guest_cpu_init(void)
pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
-#ifdef CONFIG_PREEMPTION
- pa |= KVM_ASYNC_PF_SEND_ALWAYS;
-#endif
pa |= KVM_ASYNC_PF_ENABLED;
+ /*
+ * We do not set KVM_ASYNC_PF_SEND_ALWAYS. With the current
+ * KVM paravirt ABI, if an async page fault occurs on an early
+ * memory access in the normal (sync) #PF path or in an NMI
+ * that happens early in the #PF code, the combination of CR2
+ * and the APF reason field will be corrupted.
+ */
+
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
--
2.24.1
In the commit making the qcom/msm pin controller hierarchical,
some callbacks were careful to check that d->parent_data on the
irq_data was valid before calling the parent function, but
irq_chip_eoi_parent() was called unconditionally, which does not
work on older Qualcomm platforms such as APQ8060.
When the drivers/mfd/qcom-pm8xxx.c driver calls
chained_irq_exit(), that call in turn calls chip->irq_eoi(),
which on a hierarchical IRQ chip defaults to
irq_chip_eoi_parent(). The parent here is pinctrl-msm.c, which
in turn unconditionally calls irq_chip_eoi_parent() again, but
its parent is invalid, so we get the following crash:
Unable to handle kernel NULL pointer dereference at
virtual address 00000010
pgd = (ptrval)
[00000010] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
(...)
PC is at irq_chip_eoi_parent+0x4/0x10
LR is at pm8xxx_irq_handler+0x1b4/0x2d8
Implement a local stub that avoids calling down to
irq_chip_eoi_parent() if d->parent_data is not set.
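For reference, irq_chip_eoi_parent() chases the parent pointer
and invokes its ->irq_eoi() with no NULL check, roughly like this
(a sketch of the helper in kernel/irq/chip.c; the exact code may
differ between kernel versions):

  void irq_chip_eoi_parent(struct irq_data *data)
  {
          data = data->parent_data;   /* NULL when there is no parent domain */
          data->chip->irq_eoi(data);  /* NULL pointer dereference */
  }

so guarding on d->parent_data before calling it is sufficient.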
Cc: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Cc: Lina Iyer <ilina(a)codeaurora.org>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Stephen Boyd <swboyd(a)chromium.org>
Cc: stable(a)vger.kernel.org
Fixes: e35a6ae0eb3a ("pinctrl/msm: Setup GPIO chip in hierarchy")
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
---
drivers/pinctrl/qcom/pinctrl-msm.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
index 9a8daa256a32..511f596cf2c3 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -828,6 +828,12 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
msm_gpio_irq_clear_unmask(d, false);
}
+static void msm_gpio_irq_eoi(struct irq_data *d)
+{
+ if (d->parent_data)
+ irq_chip_eoi_parent(d);
+}
+
static void msm_gpio_irq_ack(struct irq_data *d)
{
struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
@@ -1104,7 +1110,7 @@ static int msm_gpio_init(struct msm_pinctrl *pctrl)
pctrl->irq_chip.irq_mask = msm_gpio_irq_mask;
pctrl->irq_chip.irq_unmask = msm_gpio_irq_unmask;
pctrl->irq_chip.irq_ack = msm_gpio_irq_ack;
- pctrl->irq_chip.irq_eoi = irq_chip_eoi_parent;
+ pctrl->irq_chip.irq_eoi = msm_gpio_irq_eoi;
pctrl->irq_chip.irq_set_type = msm_gpio_irq_set_type;
pctrl->irq_chip.irq_set_wake = msm_gpio_irq_set_wake;
pctrl->irq_chip.irq_request_resources = msm_gpio_irq_reqres;
--
2.24.1
Prohibit probing on rcu_nmi_exit() and ist_exit() which
are called from do_int3()'s kprobe path after kprobe_int3_handler().
Commit c13324a505c7 ("x86/kprobes: Prohibit probing on
functions before kprobe_int3_handler()") tried to fix a similar
issue, but it only marked the functions called before
kprobe_int3_handler() in do_int3().
If we put a kprobe on rcu_nmi_exit() or ist_exit(), kprobes will
detect the reentrance; however, that only skips the kprobe
handler, and the exit path of the nested do_int3() hits
ist_exit() and rcu_nmi_exit() again. Thus another int3 exception
is raised, and eventually we get a kernel panic with the
"Unrecoverable kprobe detected." error message.
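The resulting recursion looks roughly like this (an illustrative
trace based on the description above):

  do_int3()
    kprobe_int3_handler()      /* handles the original probe */
    ist_exit()                 /* probed -> triggers a new int3 */
      do_int3()
        kprobe_int3_handler()  /* detects reentrance, skips handler */
        ist_exit()             /* still probed -> yet another int3 */
          ...                  /* until "Unrecoverable kprobe detected." */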
This is reproducible with the following commands:
/ # echo 0 > /proc/sys/debug/kprobes-optimization
/ # echo p vfs_read > /sys/kernel/debug/tracing/kprobe_events
/ # echo p rcu_nmi_exit >> /sys/kernel/debug/tracing/kprobe_events
/ # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable
Fixes: c13324a505c7 ("x86/kprobes: Prohibit probing on functions before kprobe_int3_handler()")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
arch/x86/kernel/traps.c | 1 +
kernel/rcu/tree.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6ef00eb6fbb9..c63fb7697794 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -115,6 +115,7 @@ void ist_exit(struct pt_regs *regs)
if (!user_mode(regs))
rcu_nmi_exit();
}
+NOKPROBE_SYMBOL(ist_exit);
/**
* ist_begin_non_atomic() - begin a non-atomic section in an IST exception
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d91c9156fab2..c49ea0e919f9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -670,6 +670,7 @@ void rcu_nmi_exit(void)
{
rcu_nmi_exit_common(false);
}
+NOKPROBE_SYMBOL(rcu_nmi_exit);
/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
The patch titled
Subject: mm, hotplug: fix page online with DEBUG_PAGEALLOC compiled but not enabled
has been removed from the -mm tree. Its filename was
mm-hotplug-fix-page-online-with-debug_pagealloc-compiled-but-not-enabled.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, hotplug: fix page online with DEBUG_PAGEALLOC compiled but not enabled
Commit cd02cf1aceea ("mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC")
fixed memory hotplug with debug_pagealloc enabled, where onlining a page
goes through page freeing, which removes the direct mapping. Some arches
don't like when the page is not mapped in the first place, so
generic_online_page() maps it first. This is somewhat wasteful, but
better than special casing page freeing fast paths.
However, the commit missed that having DEBUG_PAGEALLOC
configured does not mean it is actually enabled. One has to test
debug_pagealloc_enabled() since commit 031bc5743f15
("mm/debug-pagealloc: make debug-pagealloc boottime
configurable"), or alternatively debug_pagealloc_enabled_static()
since commit 8e57f8acbbd1 ("mm, debug_pagealloc: don't rely on
static keys too early"), but this was not done.
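For context, with CONFIG_DEBUG_PAGEALLOC=y the feature still has
to be activated at boot (unless the default-on config option is
set), e.g.:

  debug_pagealloc=on

Otherwise debug_pagealloc_enabled() and
debug_pagealloc_enabled_static() report false, and as the crash
below shows, some arches cannot tolerate __kernel_map_pages()
being called in that state.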
As a result, an s390 kernel with DEBUG_PAGEALLOC configured but
not enabled will crash:
Unable to handle kernel pointer dereference in virtual kernel address space
Failing address: 0000000000000000 TEID: 0000000000000483
Fault in home space mode while using kernel ASCE.
AS:0000001ece13400b R2:000003fff7fd000b R3:000003fff7fcc007 S:000003fff7fd7000 P:000000000000013d
Oops: 0004 ilc:2 [#1] SMP
CPU: 1 PID: 26015 Comm: chmem Kdump: loaded Tainted: GX 5.3.18-5-default #1 SLE15-SP2 (unreleased)
Krnl PSW : 0704e00180000000 0000001ecd281b9e (__kernel_map_pages+0x166/0x188)
R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
Krnl GPRS: 0000000000000000 0000000000000800 0000400b00000000 0000000000000100
0000000000000001 0000000000000000 0000000000000002 0000000000000100
0000001ece139230 0000001ecdd98d40 0000400b00000100 0000000000000000
000003ffa17e4000 001fffe0114f7d08 0000001ecd4d93ea 001fffe0114f7b20
Krnl Code: 0000001ecd281b8e: ec17ffff00d8 ahik %r1,%r7,-1
0000001ecd281b94: ec111dbc0355 risbg %r1,%r1,29,188,3
>0000001ecd281b9e: 94fb5006 ni 6(%r5),251
0000001ecd281ba2: 41505008 la %r5,8(%r5)
0000001ecd281ba6: ec51fffc6064 cgrj %r5,%r1,6,1ecd281b9e
0000001ecd281bac: 1a07 ar %r0,%r7
0000001ecd281bae: ec03ff584076 crj %r0,%r3,4,1ecd281a5e
Call Trace:
[<0000001ecd281b9e>] __kernel_map_pages+0x166/0x188
[<0000001ecd4d9516>] online_pages_range+0xf6/0x128
[<0000001ecd2a8186>] walk_system_ram_range+0x7e/0xd8
[<0000001ecda28aae>] online_pages+0x2fe/0x3f0
[<0000001ecd7d02a6>] memory_subsys_online+0x8e/0xc0
[<0000001ecd7add42>] device_online+0x5a/0xc8
[<0000001ecd7d0430>] state_store+0x88/0x118
[<0000001ecd5b9f62>] kernfs_fop_write+0xc2/0x200
[<0000001ecd5064b6>] vfs_write+0x176/0x1e0
[<0000001ecd50676a>] ksys_write+0xa2/0x100
[<0000001ecda315d4>] system_call+0xd8/0x2c8
Fix this by checking debug_pagealloc_enabled_static() before calling
kernel_map_pages(). Backports for kernels before 5.5 should use
debug_pagealloc_enabled() instead. Also add comments.
Link: http://lkml.kernel.org/r/20200224094651.18257-1-vbabka@suse.cz
Fixes: cd02cf1aceea ("mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Gerald Schaefer <gerald.schaefer(a)de.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Qian Cai <cai(a)lca.pw>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 4 ++++
mm/memory_hotplug.c | 8 +++++++-
2 files changed, 11 insertions(+), 1 deletion(-)
--- a/include/linux/mm.h~mm-hotplug-fix-page-online-with-debug_pagealloc-compiled-but-not-enabled
+++ a/include/linux/mm.h
@@ -2715,6 +2715,10 @@ static inline bool debug_pagealloc_enabl
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
+/*
+ * When called in DEBUG_PAGEALLOC context, the call should most likely be
+ * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
+ */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
--- a/mm/memory_hotplug.c~mm-hotplug-fix-page-online-with-debug_pagealloc-compiled-but-not-enabled
+++ a/mm/memory_hotplug.c
@@ -574,7 +574,13 @@ EXPORT_SYMBOL_GPL(restore_online_page_ca
void generic_online_page(struct page *page, unsigned int order)
{
- kernel_map_pages(page, 1 << order, 1);
+ /*
+ * Freeing the page with debug_pagealloc enabled will try to unmap it,
+ * so we should map it first. This is better than introducing a special
+ * case in page freeing fast path.
+ */
+ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 1);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-compaction-fully-assume-capture-is-not-null-in-compact_zone_order.patch
The patch titled
Subject: fat: fix uninit-memory access for partial initialized inode
has been removed from the -mm tree. Its filename was
fat-fix-uninit-memory-access-for-partial-initialized-inode.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
Subject: fat: fix uninit-memory access for partial initialized inode
When we get an error in the middle of reading an inode, some
fields in the inode might still be uninitialized, and the
evict_inode path may then access those fields via iput().
To fix this, make sure the inode fields are initialized at
allocation time.
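The problematic path is roughly the following (a sketch of the
call chain inferred from the description; the exact frames may
vary):

  fat_fill_super()
    ...fails partway through setting up an inode...
    iput(inode)
      evict() -> fat_evict_inode()
        reads ei->mmu_private, ei->i_start, ...  /* never initialized */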
Link: http://lkml.kernel.org/r/871rqnreqx.fsf@mail.parknet.co.jp
Reported-by: syzbot+9d82b8de2992579da5d0(a)syzkaller.appspotmail.com
Signed-off-by: OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/fat/inode.c | 19 +++++++------------
1 file changed, 7 insertions(+), 12 deletions(-)
--- a/fs/fat/inode.c~fat-fix-uninit-memory-access-for-partial-initialized-inode
+++ a/fs/fat/inode.c
@@ -750,6 +750,13 @@ static struct inode *fat_alloc_inode(str
return NULL;
init_rwsem(&ei->truncate_lock);
+ /* Zeroing to allow iput() even on a partially initialized inode. */
+ ei->mmu_private = 0;
+ ei->i_start = 0;
+ ei->i_logstart = 0;
+ ei->i_attrs = 0;
+ ei->i_pos = 0;
+
return &ei->vfs_inode;
}
@@ -1374,16 +1381,6 @@ out:
return 0;
}
-static void fat_dummy_inode_init(struct inode *inode)
-{
- /* Initialize this dummy inode to work as no-op. */
- MSDOS_I(inode)->mmu_private = 0;
- MSDOS_I(inode)->i_start = 0;
- MSDOS_I(inode)->i_logstart = 0;
- MSDOS_I(inode)->i_attrs = 0;
- MSDOS_I(inode)->i_pos = 0;
-}
-
static int fat_read_root(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1844,13 +1841,11 @@ int fat_fill_super(struct super_block *s
fat_inode = new_inode(sb);
if (!fat_inode)
goto out_fail;
- fat_dummy_inode_init(fat_inode);
sbi->fat_inode = fat_inode;
fsinfo_inode = new_inode(sb);
if (!fsinfo_inode)
goto out_fail;
- fat_dummy_inode_init(fsinfo_inode);
fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
sbi->fsinfo_inode = fsinfo_inode;
insert_inode_hash(fsinfo_inode);
_
Patches currently in -mm which might be from hirofumi(a)mail.parknet.co.jp are
The patch titled
Subject: mm: avoid data corruption on CoW fault into PFN-mapped VMA
has been removed from the -mm tree. Its filename was
mm-avoid-data-corruption-on-cow-fault-into-pfn-mapped-vma.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill(a)shutemov.name>
Subject: mm: avoid data corruption on CoW fault into PFN-mapped VMA
Jeff Moyer has reported that one of the xfstests triggers a
warning when run on a DAX-enabled filesystem:
WARNING: CPU: 76 PID: 51024 at mm/memory.c:2317 wp_page_copy+0xc40/0xd50
...
wp_page_copy+0x98c/0xd50 (unreliable)
do_wp_page+0xd8/0xad0
__handle_mm_fault+0x748/0x1b90
handle_mm_fault+0x120/0x1f0
__do_page_fault+0x240/0xd70
do_page_fault+0x38/0xd0
handle_page_fault+0x10/0x30
The warning happens on a failed __copy_from_user_inatomic() that
tries to copy data into a CoW page. This happens because of a
race between MADV_DONTNEED and the CoW page fault:
  CPU0                                 CPU1
  handle_mm_fault()
    do_wp_page()
      wp_page_copy()
        do_wp_page()
                                       madvise(MADV_DONTNEED)
                                         zap_page_range()
                                           zap_pte_range()
                                             ptep_get_and_clear_full()
                                             <TLB flush>
        __copy_from_user_inatomic()
        sees empty PTE and fails
        WARN_ON_ONCE(1)
        clear_page()
The solution is to retry __copy_from_user_inatomic() under the
PTL after checking that the PTE matches orig_pte. The second copy
attempt can still fail, e.g. due to a non-readable PTE, but there
is nothing reasonable we can do about that except clearing the
CoW page.
Link: http://lkml.kernel.org/r/20200218154151.13349-1-kirill.shutemov@linux.intel…
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reported-by: Jeff Moyer <jmoyer(a)redhat.com>
Tested-by: Jeff Moyer <jmoyer(a)redhat.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Justin He <Justin.He(a)arm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 35 +++++++++++++++++++++++++++--------
1 file changed, 27 insertions(+), 8 deletions(-)
--- a/mm/memory.c~mm-avoid-data-corruption-on-cow-fault-into-pfn-mapped-vma
+++ a/mm/memory.c
@@ -2257,7 +2257,7 @@ static inline bool cow_user_page(struct
bool ret;
void *kaddr;
void __user *uaddr;
- bool force_mkyoung;
+ bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
@@ -2282,11 +2282,11 @@ static inline bool cow_user_page(struct
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte);
- if (force_mkyoung) {
+ if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/*
* Other thread has already handled the fault
@@ -2310,18 +2310,37 @@ static inline bool cow_user_page(struct
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ if (locked)
+ goto warn;
+
+ /* Re-validate under PTL if the page is still mapped */
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /* The PTE changed under us. Retry page fault. */
+ ret = false;
+ goto pte_unlock;
+ }
+
/*
- * Give a warn in case there can be some obscure
- * use-case
+ * The same page can be mapped back since the last copy attempt.
+ * Try to copy again under PTL.
*/
- WARN_ON_ONCE(1);
- clear_page(kaddr);
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ /*
+ * Give a warn in case there can be some obscure
+ * use-case
+ */
+warn:
+ WARN_ON_ONCE(1);
+ clear_page(kaddr);
+ }
}
ret = true;
pte_unlock:
- if (force_mkyoung)
+ if (locked)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
_
Patches currently in -mm which might be from kirill(a)shutemov.name are
mm-filemap-fix-a-data-race-in-filemap_fault.patch
The patch titled
Subject: mm: fix possible PMD dirty bit lost in set_pmd_migration_entry()
has been removed from the -mm tree. Its filename was
mm-fix-possible-pmd-dirty-bit-lost-in-set_pmd_migration_entry.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Huang Ying <ying.huang(a)intel.com>
Subject: mm: fix possible PMD dirty bit lost in set_pmd_migration_entry()
In set_pmd_migration_entry(), pmdp_invalidate() is used to change
the PMD atomically. But the PMD is read before that with an
ordinary memory read. If the THP (transparent huge page) is
written to between the PMD read and pmdp_invalidate(), the PMD
dirty bit may be lost, causing data corruption. The race window
is quite small, but still possible in theory, so it needs to be
fixed.
The race is fixed by using the return value of pmdp_invalidate()
to get the original content of the PMD, which is an atomic
read/modify/write operation. So no THP write can occur in
between.
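Spelled out, the lost-dirty-bit window looks roughly like this
(an illustrative interleaving):

  CPU0 (set_pmd_migration_entry)     CPU1 (writer)
  pmdval = *pvmw->pmd                /* reads a clean PMD */
                                     writes to the THP; hardware
                                     sets the PMD dirty bit
  pmdp_invalidate(...)               /* clears the PMD; the dirty
                                        bit is never seen by CPU0 */
  pmd_dirty(pmdval) == false         /* set_page_dirty() skipped,
                                        the write is lost */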
The race was introduced when THP migration support was added in
commit 616b8371539a ("mm: thp: enable thp migration in generic
path"). But this fix depends on commit d52605d7cb30 ("mm: do not
lose dirty and accessed bits in pmdp_invalidate()"), so it is
only easy to backport to kernels after v4.16. And since the race
window is really small, it may be fine not to backport the fix at
all.
Link: http://lkml.kernel.org/r/20200220075220.2327056-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Reviewed-by: William Kucharski <william.kucharski(a)oracle.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/mm/huge_memory.c~mm-fix-possible-pmd-dirty-bit-lost-in-set_pmd_migration_entry
+++ a/mm/huge_memory.c
@@ -3043,8 +3043,7 @@ void set_pmd_migration_entry(struct page
return;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = *pvmw->pmd;
- pmdp_invalidate(vma, address, pvmw->pmd);
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
if (pmd_dirty(pmdval))
set_page_dirty(page);
entry = make_migration_entry(page, pmd_write(pmdval));
_
Patches currently in -mm which might be from ying.huang(a)intel.com are
The patch titled
Subject: mm, numa: fix bad pmd by atomically check for pmd_trans_huge when marking page tables prot_numa
has been removed from the -mm tree. Its filename was
mm-numa-fix-bad-pmd-by-atomically-check-for-pmd_trans_huge-when-marking-page-tables-prot_numa.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Mel Gorman <mgorman(a)techsingularity.net>
Subject: mm, numa: fix bad pmd by atomically check for pmd_trans_huge when marking page tables prot_numa
: A user reported a bug against a distribution kernel while running a
: proprietary workload described as "memory intensive that is not swapping"
: that is expected to apply to mainline kernels. The workload is
: read/write/modifying ranges of memory and checking the contents. They
: reported that within a few hours a bad PMD would be reported, followed
: by memory corruption where expected data was all zeros. A partial
: report of the bad PMD looked like
:
: [ 5195.338482] ../mm/pgtable-generic.c:33: bad pmd ffff8888157ba008(000002e0396009e2)
: [ 5195.341184] ------------[ cut here ]------------
: [ 5195.356880] kernel BUG at ../mm/pgtable-generic.c:35!
: ....
: [ 5195.410033] Call Trace:
: [ 5195.410471] [<ffffffff811bc75d>] change_protection_range+0x7dd/0x930
: [ 5195.410716] [<ffffffff811d4be8>] change_prot_numa+0x18/0x30
: [ 5195.410918] [<ffffffff810adefe>] task_numa_work+0x1fe/0x310
: [ 5195.411200] [<ffffffff81098322>] task_work_run+0x72/0x90
: [ 5195.411246] [<ffffffff81077139>] exit_to_usermode_loop+0x91/0xc2
: [ 5195.411494] [<ffffffff81003a51>] prepare_exit_to_usermode+0x31/0x40
: [ 5195.411739] [<ffffffff815e56af>] retint_user+0x8/0x10
:
: Decoding revealed that the PMD was a valid prot_numa PMD and the bad PMD
: was a false detection. The bug does not trigger if automatic NUMA
: balancing or transparent huge pages is disabled.
:
: The bug is due to a race in change_pmd_range between a pmd_trans_huge
: and a pmd_none_or_clear_bad check without any locks held. During the
: pmd_trans_huge check, a parallel protection update under lock can have
: cleared the PMD and filled it with a prot_numa entry between the transhuge
: check and the pmd_none_or_clear_bad check.
:
: While this could be fixed with heavy locking, it's only necessary to make
: a copy of the PMD on the stack during change_pmd_range and avoid races. A
: new helper is created for this as the check is quite subtle and the
: existing similar helper is not suitable. This passed 154 hours of
: testing (usually triggers between 20 minutes and 24 hours) without
: detecting bad PMDs or corruption. A basic test of an autonuma-intensive
: workload showed no significant change in behaviour.
Although Mel withdrew the patch in the face of the LKML comment
at https://lkml.org/lkml/2017/4/10/922, the aforementioned race
window is still open, and we have reports of the Linpack test
reporting bad residuals after the bad PMD warning is observed.
In addition, bad rss-counter and non-zero pgtables assertions are
triggered on mm teardown for the task hitting the bad PMD.
host kernel: mm/pgtable-generic.c:40: bad pmd 00000000b3152f68(8000000d2d2008e7)
....
host kernel: BUG: Bad rss-counter state mm:00000000b583043d idx:1 val:512
host kernel: BUG: non-zero pgtables_bytes on freeing mm: 4096
The issue is observed on a v4.18-based distribution kernel, but the race
window is expected to be applicable to mainline kernels, as well.
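Spelled out, the race looks roughly like this (an illustrative
interleaving):

  CPU0 (change_pmd_range)            CPU1 (parallel update under lock)
  pmd_trans_huge(*pmd) == false
                                     clears the PMD and installs a
                                     huge prot_numa entry
  pmd_none_or_clear_bad(*pmd)        /* the huge PMD fails pmd_bad(),
                                        so a valid PMD is reported
                                        "bad" and cleared */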
[akpm(a)linux-foundation.org: fix comment typo, per Rafael]
Link: http://lkml.kernel.org/r/20200216191800.22423-1-aquini@redhat.com
Signed-off-by: Mel Gorman <mgorman(a)techsingularity.net>
Signed-off-by: Rafael Aquini <aquini(a)redhat.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Zi Yan <zi.yan(a)cs.rutgers.edu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mprotect.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
--- a/mm/mprotect.c~mm-numa-fix-bad-pmd-by-atomically-check-for-pmd_trans_huge-when-marking-page-tables-prot_numa
+++ a/mm/mprotect.c
@@ -161,6 +161,31 @@ static unsigned long change_pte_range(st
return pages;
}
+/*
+ * Used when setting automatic NUMA hinting protection where it is
+ * critical that a numa hinting PMD is not confused with a bad PMD.
+ */
+static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+{
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ barrier();
+#endif
+
+ if (pmd_none(pmdval))
+ return 1;
+ if (pmd_trans_huge(pmdval))
+ return 0;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -178,8 +203,17 @@ static inline unsigned long change_pmd_r
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
- && pmd_none_or_clear_bad(pmd))
+
+ /*
+ * Automatic NUMA balancing walks the tables with mmap_sem
+ * held for read. It's possible for a parallel update to occur
+ * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+ * check leading to a false positive and clearing.
+ * Hence, it's necessary to atomically read the PMD value
+ * for all the checks.
+ */
+ if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+ pmd_none_or_clear_bad_unless_trans_huge(pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
_
Patches currently in -mm which might be from mgorman(a)techsingularity.net are