The quilt patch titled
Subject: crash: let architecture decide crash memory export to iomem_resource
has been removed from the -mm tree. Its filename was
crash-let-architecture-decide-crash-memory-export-to-iomem_resource.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Sourabh Jain <sourabhjain(a)linux.ibm.com>
Subject: crash: let architecture decide crash memory export to iomem_resource
Date: Thu, 16 Oct 2025 19:58:31 +0530
With the generic crashkernel reservation, the kernel emits the following
warning on powerpc:
WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY
Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries
NIP: c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000
REGS: c000000127cef8a0 TRAP: 0700 Not tainted (6.17.0-auto-12607-g5472d60c129f)
MSR: 8000000002029033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 84000840 XER: 20040010
CFAR: c00000000017eed0 IRQMASK: 0
GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001
GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000
GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff
GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808
GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950
GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710
NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180
LR [c00000000201de34] add_system_ram_resources+0xf4/0x180
Call Trace:
add_system_ram_resources+0xf4/0x180 (unreliable)
do_one_initcall+0x60/0x36c
do_initcalls+0x120/0x220
kernel_init_freeable+0x23c/0x390
kernel_init+0x34/0x26c
ret_from_kernel_user_thread+0x14/0x1c
This warning occurs due to a conflict between crashkernel and System RAM
iomem resources.
The generic crashkernel reservation adds the crashkernel memory range to
/proc/iomem during early initialization. Later, all memblock ranges are
added to /proc/iomem as System RAM. If the crashkernel region overlaps
with any memblock range, it causes a conflict while adding those memblock
regions as iomem resources, triggering the above warning. The conflicting
memblock regions are then omitted from /proc/iomem.
For example, if the following crashkernel region is added to /proc/iomem:
20000000-11fffffff : Crash kernel
then the following memblock regions System RAM regions fail to be inserted:
00000000-7fffffff : System RAM
80000000-257fffffff : System RAM
Fix this by not adding the crashkernel memory to /proc/iomem on powerpc.
Introduce an architecture hook to let each architecture decide whether to
export the crashkernel region to /proc/iomem.
For more info checkout commit c40dd2f766440 ("powerpc: Add System RAM
to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM
resource to prevent crashkernel conflict")
Note: Before switching to the generic crashkernel reservation, powerpc
never exported the crashkernel region to /proc/iomem.
Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com
Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation").
Signed-off-by: Sourabh Jain <sourabhjain(a)linux.ibm.com>
Reported-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.…
Tested-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Cc: Baoquan he <bhe(a)redhat.com>
Cc: Hari Bathini <hbathini(a)linux.ibm.com>
Cc: Madhavan Srinivasan <maddy(a)linux.ibm.com>
Cc: Mahesh Salgaonkar <mahesh(a)linux.ibm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Cc: Vivek Goyal <vgoyal(a)redhat.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/powerpc/include/asm/crash_reserve.h | 8 ++++++++
include/linux/crash_reserve.h | 6 ++++++
kernel/crash_reserve.c | 3 +++
3 files changed, 17 insertions(+)
--- a/arch/powerpc/include/asm/crash_reserve.h~crash-let-architecture-decide-crash-memory-export-to-iomem_resource
+++ a/arch/powerpc/include/asm/crash_reserve.h
@@ -5,4 +5,12 @@
/* crash kernel regions are Page size agliged */
#define CRASH_ALIGN PAGE_SIZE
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static inline bool arch_add_crash_res_to_iomem(void)
+{
+ return false;
+}
+#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem
+#endif
+
#endif /* _ASM_POWERPC_CRASH_RESERVE_H */
--- a/include/linux/crash_reserve.h~crash-let-architecture-decide-crash-memory-export-to-iomem_resource
+++ a/include/linux/crash_reserve.h
@@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdli
void __init reserve_crashkernel_cma(unsigned long long cma_size);
#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef arch_add_crash_res_to_iomem
+static inline bool arch_add_crash_res_to_iomem(void)
+{
+ return true;
+}
+#endif
#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
#endif
--- a/kernel/crash_reserve.c~crash-let-architecture-decide-crash-memory-export-to-iomem_resource
+++ a/kernel/crash_reserve.c
@@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsi
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
+ if (!arch_add_crash_res_to_iomem())
+ return 0;
+
if (crashk_res.start < crashk_res.end)
insert_resource(&iomem_resource, &crashk_res);
_
Patches currently in -mm which might be from sourabhjain(a)linux.ibm.com are
crash-fix-crashkernel-resource-shrink.patch
The quilt patch titled
Subject: scs: fix a wrong parameter in __scs_magic
has been removed from the -mm tree. Its filename was
scs-fix-a-wrong-parameter-in-__scs_magic.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zhichi Lin <zhichi.lin(a)vivo.com>
Subject: scs: fix a wrong parameter in __scs_magic
Date: Sat, 11 Oct 2025 16:22:22 +0800
__scs_magic() needs a 'void *' variable, but a 'struct task_struct *' is
given. 'task_scs(tsk)' is the starting address of the task's shadow call
stack, and '__scs_magic(task_scs(tsk))' is the end address of the task's
shadow call stack. Here should be '__scs_magic(task_scs(tsk))'.
The user-visible effect of this bug is that when CONFIG_DEBUG_STACK_USAGE
is enabled, the shadow call stack usage checking function
(scs_check_usage) would scan an incorrect memory range. This could lead
to:
1. **Inaccurate stack usage reporting**: The function would calculate
wrong usage statistics for the shadow call stack, potentially showing
incorrect value in kmsg.
2. **Potential kernel crash**: If the value of __scs_magic(tsk)is
greater than that of __scs_magic(task_scs(tsk)), the for loop may
access unmapped memory, potentially causing a kernel panic. However,
this scenario is unlikely because task_struct is allocated via the slab
allocator (which typically returns lower addresses), while the shadow
call stack returned by task_scs(tsk) is allocated via vmalloc(which
typically returns higher addresses).
However, since this is purely a debugging feature
(CONFIG_DEBUG_STACK_USAGE), normal production systems should be not
unaffected. The bug only impacts developers and testers who are actively
debugging stack usage with this configuration enabled.
Link: https://lkml.kernel.org/r/20251011082222.12965-1-zhichi.lin@vivo.com
Fixes: 5bbaf9d1fcb9 ("scs: Add support for stack usage debugging")
Signed-off-by: Jiyuan Xie <xiejiyuan(a)vivo.com>
Signed-off-by: Zhichi Lin <zhichi.lin(a)vivo.com>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: Andrey Konovalov <andreyknvl(a)gmail.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Marco Elver <elver(a)google.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Yee Lee <yee.lee(a)mediatek.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/scs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/scs.c~scs-fix-a-wrong-parameter-in-__scs_magic
+++ a/kernel/scs.c
@@ -135,7 +135,7 @@ static void scs_check_usage(struct task_
if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE))
return;
- for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) {
+ for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) {
if (!READ_ONCE_NOCHECK(*p))
break;
used += sizeof(*p);
_
Patches currently in -mm which might be from zhichi.lin(a)vivo.com are
F2FS can mount filesystems with corrupted directory depth values that
get runtime-clamped to MAX_DIR_HASH_DEPTH. When RENAME_WHITEOUT
operations are performed on such directories, f2fs_rename performs
directory modifications (updating target entry and deleting source
entry) before attempting to add the whiteout entry via f2fs_add_link.
If f2fs_add_link fails due to the corrupted directory structure, the
function returns an error to VFS, but the partial directory
modifications have already been committed to disk. VFS assumes the
entire rename operation failed and does not update the dentry cache,
leaving stale mappings.
In the error path, VFS does not call d_move() to update the dentry
cache. This results in new_dentry still pointing to the old inode
(new_inode) which has already had its i_nlink decremented to zero.
The stale cache causes subsequent operations to incorrectly reference
the freed inode.
This causes subsequent operations to use cached dentry information that
no longer matches the on-disk state. When a second rename targets the
same entry, VFS attempts to decrement i_nlink on the stale inode, which
may already have i_nlink=0, triggering a WARNING in drop_nlink().
Example sequence:
1. First rename (RENAME_WHITEOUT): file2 → file1
- f2fs updates file1 entry on disk (points to inode 8)
- f2fs deletes file2 entry on disk
- f2fs_add_link(whiteout) fails (corrupted directory)
- Returns error to VFS
- VFS does not call d_move() due to error
- VFS cache still has: file1 → inode 7 (stale!)
- inode 7 has i_nlink=0 (already decremented)
2. Second rename: file3 → file1
- VFS uses stale cache: file1 → inode 7
- Tries to drop_nlink on inode 7 (i_nlink already 0)
- WARNING in drop_nlink()
Fix this by explicitly invalidating old_dentry and new_dentry when
f2fs_add_link fails during whiteout creation. This forces VFS to
refresh from disk on subsequent operations, ensuring cache consistency
even when the rename partially succeeds.
Reproducer:
1. Mount F2FS image with corrupted i_current_depth
2. renameat2(file2, file1, RENAME_WHITEOUT)
3. renameat2(file3, file1, 0)
4. System triggers WARNING in drop_nlink()
Fixes: 7e01e7ad746b ("f2fs: support RENAME_WHITEOUT")
Reported-by: syzbot+632cf32276a9a564188d(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=632cf32276a9a564188d
Suggested-by: Chao Yu <chao(a)kernel.org>
Link: https://lore.kernel.org/all/20251022233349.102728-1-kartikey406@gmail.com/ [v1]
Cc: stable(a)vger.kernel.org
Signed-off-by: Deepanshu Kartikey <kartikey406(a)gmail.com>
---
Changes in v2:
- Added detailed explanation about VFS not calling d_move() in error path,
resulting in new_dentry still pointing to inode with zeroed i_nlink
(suggested by Chao Yu)
- Added Fixes tag pointing to commit 7e01e7ad746b
- Added Cc: stable(a)vger.kernel.org for backporting to stable kernels
---
fs/f2fs/namei.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b882771e4699..712479b7b93d 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1053,9 +1053,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (whiteout) {
set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
- if (err)
+ if (err) {
+ d_invalidate(old_dentry);
+ d_invalidate(new_dentry);
goto put_out_dir;
-
+ }
spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
spin_unlock(&whiteout->i_lock);
--
2.43.0
svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on
nested transitions where LBRV is used. It checks whether LBRV enablement
needs to be changed in the current VMCB, and if it does, it also
recalculate intercepts to LBR MSRs.
However, there are cases where intercepts need to be updated even when
LBRV enablement doesn't. Example scenario:
- L1 has MSR_IA32_DEBUGCTLMSR cleared.
- L1 runs L2 without LBR_CTL_ENABLE (no LBRV).
- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv()
sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs.
- L2 exits to L1, svm_update_lbrv() is not called on this transition.
- L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that
LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing.
- Intercepts remain disabled, L1 reads to LBR MSRs read the host MSRs.
Fix it by always recalculating intercepts in svm_update_lbrv().
Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed(a)linux.dev>
---
arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d25c56b30b4e2..26ab75ecf1c67 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -806,25 +806,29 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}
-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ __svm_enable_lbrv(vcpu);
+ svm_recalc_lbr_msr_intercepts(vcpu);
+}
+
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
@@ -853,13 +857,18 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu)
(is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
- if (enable_lbrv == current_enable_lbrv)
- return;
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
- if (enable_lbrv)
- svm_enable_lbrv(vcpu);
- else
- svm_disable_lbrv(vcpu);
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
--
2.51.2.1041.gc1ab5b90ca-goog
On Fri, 11 Oct 2024 22:38:53 +0800 Zheng Yejian <zhengyejian(a)huaweicloud.com> wrote:
>
> Currently when the length of a symbol is longer than 0x7f characters,
> its type shown in /proc/kallsyms can be incorrect.
>
> I found this issue when reading the code, but it can be reproduced by
> following steps:
>
> 1. Define a function which symbol length is 130 characters:
>
> #define X13(x) x##x##x##x##x##x##x##x##x##x##x##x##x
> static noinline void X13(x123456789)(void)
> {
> printk("hello world\n");
> }
>
> 2. The type in vmlinux is 't':
>
> $ nm vmlinux | grep x123456
> ffffffff816290f0 t x123456789x123456789x123456789x12[...]
>
> 3. Then boot the kernel, the type shown in /proc/kallsyms becomes 'g'
> instead of the expected 't':
>
> # cat /proc/kallsyms | grep x123456
> ffffffff816290f0 g x123456789x123456789x123456789x12[...]
>
> The root cause is that, after commit 73bbb94466fd ("kallsyms: support
> "big" kernel symbols"), ULEB128 was used to encode symbol name length.
> That is, for "big" kernel symbols of which name length is longer than
> 0x7f characters, the length info is encoded into 2 bytes.
>
> kallsyms_get_symbol_type() expects to read the first char of the
> symbol name which indicates the symbol type. However, due to the
> "big" symbol case not being handled, the symbol type read from
> /proc/kallsyms may be wrong, so handle it properly.
>
> Cc: stable(a)vger.kernel.org
> Fixes: 73bbb94466fd ("kallsyms: support "big" kernel symbols")
> Signed-off-by: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Gary made me aware of this thread (thanks!) -- we are coming from:
https://lore.kernel.org/all/aQjua6zkEHYNVN3X@x1/
For which I sent this patch without knowing about this one:
https://lore.kernel.org/rust-for-linux/20251107050414.511648-1-ojeda@kernel…
This has been seen now by Arnaldo (Cc'ing) in a real system, so I think
we should take this one since it was first, with:
Cc: stable(a)vger.kernel.org
Thanks!
Cheers,
Miguel
css_task_iter_next() pins and returns a task, but the task can do whatever
between that and cgroup_procs_show() being called, including dying and
losing its PID. When that happens, task_pid_vnr() returns 0.
d245698d727a ("cgroup: Defer task cgroup unlink until after the task is
done switching out") makes this more likely as tasks now stay iterable with
css_task_iter_next() until the last schedule is complete, which can be
after the task has lost its PID.
Showing "0" in cgroup.procs or cgroup.threads is confusing and can lead to
surprising outcomes. For example, if a user tries to kill PID 0, it kills
all processes in the current process group.
Skip entries with PID 0 by returning SEQ_SKIP.
Cc: stable(a)vger.kernel.org
Signed-off-by: Tejun Heo <tj(a)kernel.org>
---
kernel/cgroup/cgroup.c | 11 +++++++++++
1 file changed, 11 insertions(+)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5287,6 +5287,17 @@ static void *cgroup_procs_start(struct s
static int cgroup_procs_show(struct seq_file *s, void *v)
{
+ pid_t pid = task_pid_vnr(v);
+
+ /*
+ * css_task_iter_next() could have visited a task which has already lost
+ * its PID but is not dead yet or the task could have been unhashed
+ * since css_task_iter_next(). In such cases, $pid would be 0 here.
+ * Don't confuse userspace with it.
+ */
+ if (unlikely(!pid))
+ return SEQ_SKIP;
+
seq_printf(s, "%d\n", task_pid_vnr(v));
return 0;
}
The patch titled
Subject: mm, swap: fix potential UAF issue for VMA readahead
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-swap-fix-potential-uaf-issue-for-vma-readahead.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Kairui Song <kasong(a)tencent.com>
Subject: mm, swap: fix potential UAF issue for VMA readahead
Date: Tue, 11 Nov 2025 21:36:08 +0800
Since commit 78524b05f1a3 ("mm, swap: avoid redundant swap device
pinning"), the common helper for allocating and preparing a folio in the
swap cache layer no longer tries to get a swap device reference
internally, because all callers of __read_swap_cache_async are already
holding a swap entry reference. The repeated swap device pinning isn't
needed on the same swap device.
Caller of VMA readahead is also holding a reference to the target entry's
swap device, but VMA readahead walks the page table, so it might encounter
swap entries from other devices, and call __read_swap_cache_async on
another device without holding a reference to it.
So it is possible to cause a UAF when swapoff of device A raced with
swapin on device B, and VMA readahead tries to read swap entries from
device A. It's not easy to trigger, but in theory, it could cause real
issues.
Make VMA readahead try to get the device reference first if the swap
device is a different one from the target entry.
Link: https://lkml.kernel.org/r/20251111-swap-fix-vma-uaf-v1-1-41c660e58562@tence…
Fixes: 78524b05f1a3 ("mm, swap: avoid redundant swap device pinning")
Suggested-by: Huang Ying <ying.huang(a)linux.alibaba.com>
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Kemeng Shi <shikemeng(a)huaweicloud.com>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swap_state.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
--- a/mm/swap_state.c~mm-swap-fix-potential-uaf-issue-for-vma-readahead
+++ a/mm/swap_state.c
@@ -748,6 +748,8 @@ static struct folio *swap_vma_readahead(
blk_start_plug(&plug);
for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
+ struct swap_info_struct *si = NULL;
+
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -761,8 +763,19 @@ static struct folio *swap_vma_readahead(
continue;
pte_unmap(pte);
pte = NULL;
+ /*
+ * Readahead entry may come from a device that we are not
+ * holding a reference to, try to grab a reference, or skip.
+ */
+ if (swp_type(entry) != swp_type(targ_entry)) {
+ si = get_swap_device(entry);
+ if (!si)
+ continue;
+ }
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
+ if (si)
+ put_swap_device(si);
if (!folio)
continue;
if (page_allocated) {
_
Patches currently in -mm which might be from kasong(a)tencent.com are
mm-swap-fix-potential-uaf-issue-for-vma-readahead.patch
mm-swap-do-not-perform-synchronous-discard-during-allocation.patch
mm-swap-rename-helper-for-setup-bad-slots.patch
mm-swap-cleanup-swap-entry-allocation-parameter.patch
mm-migrate-swap-drop-usage-of-folio_index.patch
mm-swap-remove-redundant-argument-for-isolating-a-cluster.patch
revert-mm-swap-avoid-redundant-swap-device-pinning.patch
The sysfs buffer passed to alarms_store() is allocated with 'size + 1'
bytes and a NUL terminator is appended. However, the 'size' argument
does not account for this extra byte. The original code then allocated
'size' bytes and used strcpy() to copy 'buf', which always writes one
byte past the allocated buffer since strcpy() copies until the NUL
terminator at index 'size'.
Fix this by parsing the 'buf' parameter directly using simple_strtoll()
without allocating any intermediate memory or string copying. This
removes the overflow while simplifying the code.
Cc: stable(a)vger.kernel.org
Fixes: e2c94d6f5720 ("w1_therm: adding alarm sysfs entry")
Signed-off-by: Thorsten Blum <thorsten.blum(a)linux.dev>
---
Compile-tested only.
Changes in v4:
- Use simple_strtoll because kstrtoint also parses long long internally
- Return -ERANGE in addition to -EINVAL to match kstrtoint's behavior
- Remove any changes unrelated to fixing the buffer overflow (Krzysztof)
while maintaining the same behavior and return values as before
- Link to v3: https://lore.kernel.org/lkml/20251030155614.447905-1-thorsten.blum@linux.de…
Changes in v3:
- Add integer range check for 'temp' to match kstrtoint() behavior
- Explicitly cast 'temp' to int when calling int_to_short()
- Link to v2: https://lore.kernel.org/lkml/20251029130045.70127-2-thorsten.blum@linux.dev/
Changes in v2:
- Fix buffer overflow instead of truncating the copy using strscpy()
- Parse buffer directly using simple_strtol() as suggested by David
- Update patch subject and description
- Link to v1: https://lore.kernel.org/lkml/20251017170047.114224-2-thorsten.blum@linux.de…
---
drivers/w1/slaves/w1_therm.c | 64 ++++++++++++------------------------
1 file changed, 21 insertions(+), 43 deletions(-)
diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c
index 9ccedb3264fb..5707fa34e804 100644
--- a/drivers/w1/slaves/w1_therm.c
+++ b/drivers/w1/slaves/w1_therm.c
@@ -1836,55 +1836,36 @@ static ssize_t alarms_store(struct device *device,
struct w1_slave *sl = dev_to_w1_slave(device);
struct therm_info info;
u8 new_config_register[3]; /* array of data to be written */
- int temp, ret;
- char *token = NULL;
+ long long temp;
+ int ret = 0;
s8 tl, th; /* 1 byte per value + temp ring order */
- char *p_args, *orig;
-
- p_args = orig = kmalloc(size, GFP_KERNEL);
- /* Safe string copys as buf is const */
- if (!p_args) {
- dev_warn(device,
- "%s: error unable to allocate memory %d\n",
- __func__, -ENOMEM);
- return size;
- }
- strcpy(p_args, buf);
-
- /* Split string using space char */
- token = strsep(&p_args, " ");
-
- if (!token) {
- dev_info(device,
- "%s: error parsing args %d\n", __func__, -EINVAL);
- goto free_m;
- }
-
- /* Convert 1st entry to int */
- ret = kstrtoint (token, 10, &temp);
+ const char *p = buf;
+ char *endp;
+
+ temp = simple_strtoll(p, &endp, 10);
+ if (p == endp || *endp != ' ')
+ ret = -EINVAL;
+ else if (temp < INT_MIN || temp > INT_MAX)
+ ret = -ERANGE;
if (ret) {
dev_info(device,
"%s: error parsing args %d\n", __func__, ret);
- goto free_m;
+ goto err;
}
tl = int_to_short(temp);
- /* Split string using space char */
- token = strsep(&p_args, " ");
- if (!token) {
- dev_info(device,
- "%s: error parsing args %d\n", __func__, -EINVAL);
- goto free_m;
- }
- /* Convert 2nd entry to int */
- ret = kstrtoint (token, 10, &temp);
+ p = endp + 1;
+ temp = simple_strtoll(p, &endp, 10);
+ if (p == endp)
+ ret = -EINVAL;
+ else if (temp < INT_MIN || temp > INT_MAX)
+ ret = -ERANGE;
if (ret) {
dev_info(device,
"%s: error parsing args %d\n", __func__, ret);
- goto free_m;
+ goto err;
}
-
/* Prepare to cast to short by eliminating out of range values */
th = int_to_short(temp);
@@ -1905,7 +1886,7 @@ static ssize_t alarms_store(struct device *device,
dev_info(device,
"%s: error reading from the slave device %d\n",
__func__, ret);
- goto free_m;
+ goto err;
}
/* Write data in the device RAM */
@@ -1913,7 +1894,7 @@ static ssize_t alarms_store(struct device *device,
dev_info(device,
"%s: Device not supported by the driver %d\n",
__func__, -ENODEV);
- goto free_m;
+ goto err;
}
ret = SLAVE_SPECIFIC_FUNC(sl)->write_data(sl, new_config_register);
@@ -1922,10 +1903,7 @@ static ssize_t alarms_store(struct device *device,
"%s: error writing to the slave device %d\n",
__func__, ret);
-free_m:
- /* free allocated memory */
- kfree(orig);
-
+err:
return size;
}
--
2.51.1